6 files changed, 436 insertions, 419 deletions
diff --git a/src/core/NEON/kernels/assembly/depthwise.hpp b/src/core/NEON/kernels/assembly/depthwise.hpp
index dbd47ccfa9..13c2d314e4 100644
--- a/src/core/NEON/kernels/assembly/depthwise.hpp
+++ b/src/core/NEON/kernels/assembly/depthwise.hpp
@@ -38,9 +38,8 @@ struct DepthwiseConfig
     DepthwiseMethod method = DepthwiseMethod::DEFAULT;
     std::string     filter = "";
 
-    DepthwiseConfig(DepthwiseMethod method)
-        : method(method) {};
-    DepthwiseConfig() {};
+    DepthwiseConfig(DepthwiseMethod method) : method(method){};
+    DepthwiseConfig(){};
 };
 
 struct DepthwiseArgs
@@ -63,18 +62,24 @@ struct DepthwiseArgs
 
     bool fast_mode = false;
 
-    DepthwiseArgs(
-        const CPUInfo *cpu_info,
-        unsigned int kernel_rows, unsigned int kernel_cols,
-        unsigned int stride_rows, unsigned int stride_cols,
-        unsigned int dilation_rows, unsigned int dilation_cols,
-        unsigned int n_batches, unsigned int input_rows, unsigned int input_cols,
-        unsigned int input_channels,
-        unsigned int output_rows, unsigned int output_cols,
-        unsigned int  channel_multiplier,
-        PaddingValues padding, arm_gemm::Activation activation,
-
-        const DepthwiseConfig *config)
+    DepthwiseArgs(const CPUInfo       *cpu_info,
+                  unsigned int         kernel_rows,
+                  unsigned int         kernel_cols,
+                  unsigned int         stride_rows,
+                  unsigned int         stride_cols,
+                  unsigned int         dilation_rows,
+                  unsigned int         dilation_cols,
+                  unsigned int         n_batches,
+                  unsigned int         input_rows,
+                  unsigned int         input_cols,
+                  unsigned int         input_channels,
+                  unsigned int         output_rows,
+                  unsigned int         output_cols,
+                  unsigned int         channel_multiplier,
+                  PaddingValues        padding,
+                  arm_gemm::Activation activation,
+
+                  const DepthwiseConfig *config)
         : cpu_info(cpu_info),
           kernel_rows(kernel_rows),
           kernel_cols(kernel_cols),
@@ -95,20 +100,38 @@ struct DepthwiseArgs
     {
     }
 
-    DepthwiseArgs(
-        const CPUInfo *cpu_info,
-        unsigned int kernel_rows, unsigned int kernel_cols,
-        unsigned int stride_rows, unsigned int stride_cols,
-        unsigned int n_batches, unsigned int input_rows, unsigned int input_cols,
-        unsigned int input_channels,
-        unsigned int output_rows, unsigned int output_cols,
-        unsigned int  channel_multiplier,
-        PaddingValues padding, arm_gemm::Activation activation,
-        const DepthwiseConfig *config)
-        : DepthwiseArgs(cpu_info, kernel_rows, kernel_cols, stride_rows,
-                        stride_cols, 1, 1, n_batches, input_rows, input_cols,
-                        input_channels, output_rows, output_cols,
-                        channel_multiplier, padding, activation, config)
+    DepthwiseArgs(const CPUInfo         *cpu_info,
+                  unsigned int           kernel_rows,
+                  unsigned int           kernel_cols,
+                  unsigned int           stride_rows,
+                  unsigned int           stride_cols,
+                  unsigned int           n_batches,
+                  unsigned int           input_rows,
+                  unsigned int           input_cols,
+                  unsigned int           input_channels,
+                  unsigned int           output_rows,
+                  unsigned int           output_cols,
+                  unsigned int           channel_multiplier,
+                  PaddingValues          padding,
+                  arm_gemm::Activation   activation,
+                  const DepthwiseConfig *config)
+        : DepthwiseArgs(cpu_info,
+                        kernel_rows,
+                        kernel_cols,
+                        stride_rows,
+                        stride_cols,
+                        1,
+                        1,
+                        n_batches,
+                        input_rows,
+                        input_cols,
+                        input_channels,
+                        output_rows,
+                        output_cols,
+                        channel_multiplier,
+                        padding,
+                        activation,
+                        config)
     {
     }
 };
@@ -127,17 +150,18 @@ struct Tile
     {
     }
 
-    Tile()
-        : Tile(nullptr, 0, 0, 0)
+    Tile() : Tile(nullptr, 0, 0, 0)
     {
     }
 
-    void load_from(
-        const TInput      *input,
-        const unsigned int ld_row, const unsigned int ld_col,
-        const unsigned int n_rows, const unsigned int n_cols,
-        const int input_i, const int input_j,
-        const unsigned int channel_multiplier) const
+    void load_from(const TInput      *input,
+                   const unsigned int ld_row,
+                   const unsigned int ld_col,
+                   const unsigned int n_rows,
+                   const unsigned int n_cols,
+                   const int          input_i,
+                   const int          input_j,
+                   const unsigned int channel_multiplier) const
     {
         const auto pad_top  = input_i < 0 ? -input_i : 0;
         const auto pad_left = input_j < 0 ? -input_j : 0;
@@ -145,18 +169,15 @@ struct Tile
         const auto padded_rows = std::min(n_rows - input_i, tile_rows) - pad_top;
         const auto padded_cols = std::min(n_cols - input_j, tile_cols) - pad_left;
 
-        if(padded_rows < tile_rows || padded_cols < tile_cols)
+        if (padded_rows < tile_rows || padded_cols < tile_cols)
         {
             memset(array, 0, tile_rows * tile_cols * tile_channels * sizeof(TInput));
         }
 
-        do_premultiply<TInput>(
-            (TInput *)input + std::max(input_i, 0) * ld_row + std::max(input_j, 0) * ld_col,
-            ld_row, ld_col,
-            array + pad_top * tile_cols * tile_channels + pad_left * tile_channels,
-            tile_cols * tile_channels, tile_channels,
-            padded_rows, padded_cols, tile_channels / channel_multiplier,
-            channel_multiplier);
+        do_premultiply<TInput>((TInput *)input + std::max(input_i, 0) * ld_row + std::max(input_j, 0) * ld_col, ld_row,
+                               ld_col, array + pad_top * tile_cols * tile_channels + pad_left * tile_channels,
+                               tile_cols * tile_channels, tile_channels, padded_rows, padded_cols,
+                               tile_channels / channel_multiplier, channel_multiplier);
     }
 };
 
@@ -168,9 +189,8 @@ protected:
     std::string         m_name{};
 
 public:
-    DepthwiseCommon(const DepthwiseArgs &args)
-        : m_args(args) {};
-    DepthwiseCommon(DepthwiseCommon &) = delete;
+    DepthwiseCommon(const DepthwiseArgs &args) : m_args(args){};
+    DepthwiseCommon(DepthwiseCommon &)            = delete;
     DepthwiseCommon &operator=(DepthwiseCommon &) = delete;
 
     std::string name() const override
@@ -181,19 +201,18 @@ public:
     void set_name(std::string name)
     {
         // Only allow the name to be set once
-        if(m_name.empty())
+        if (m_name.empty())
         {
             m_name = name;
         }
     }
 
-    void execute(
-        const void *const  input,
-        const void *const  parameters,
-        void *const        output,
-        void *const        working_space,
-        const unsigned int thread_id,
-        const unsigned int n_threads) const override final
+    void execute(const void *const  input,
+                 const void *const  parameters,
+                 void *const        output,
+                 void *const        working_space,
+                 const unsigned int thread_id,
+                 const unsigned int n_threads) const override final
     {
         const size_t ld_input_col    = m_args.input_channels;
         const size_t ld_input_row    = ld_input_col * m_args.input_cols;
@@ -202,56 +221,47 @@ public:
         const size_t ld_output_row   = ld_output_col * m_args.output_cols;
         const size_t ld_output_batch = ld_output_row * m_args.output_rows;
 
-        execute(
-            input, ld_input_col, ld_input_row, ld_input_batch,
-            parameters, output, ld_output_col, ld_output_row, ld_output_batch,
-            working_space, thread_id, n_threads);
+        execute(input, ld_input_col, ld_input_row, ld_input_batch, parameters, output, ld_output_col, ld_output_row,
+                ld_output_batch, working_space, thread_id, n_threads);
     }
 
-    void execute(
-        const void *const  input,
-        size_t             ld_input_col,
-        size_t             ld_input_row,
-        size_t             ld_input_batch,
-        const void *const  parameters,
-        void *const        output,
-        size_t             ld_output_col,
-        size_t             ld_output_row,
-        size_t             ld_output_batch,
-        void *const        working_space,
-        const unsigned int thread_id,
-        const unsigned int n_threads) const override final
+    void execute(const void *const  input,
+                 size_t             ld_input_col,
+                 size_t             ld_input_row,
+                 size_t             ld_input_batch,
+                 const void *const  parameters,
+                 void *const        output,
+                 size_t             ld_output_col,
+                 size_t             ld_output_row,
+                 size_t             ld_output_batch,
+                 void *const        working_space,
+                 const unsigned int thread_id,
+                 const unsigned int n_threads) const override final
     {
-        execute(
-            m_args.n_batches, m_args.input_rows, m_args.input_cols,
-            m_args.input_channels, m_args.padding,
-            input, ld_input_col, ld_input_row, ld_input_batch,
-            parameters,
-            m_args.output_rows, m_args.output_cols,
-            output, ld_output_col, ld_output_row, ld_output_batch,
-            working_space, thread_id, n_threads);
+        execute(m_args.n_batches, m_args.input_rows, m_args.input_cols, m_args.input_channels, m_args.padding, input,
+                ld_input_col, ld_input_row, ld_input_batch, parameters, m_args.output_rows, m_args.output_cols, output,
+                ld_output_col, ld_output_row, ld_output_batch, working_space, thread_id, n_threads);
     }
 
-    void execute(
-        unsigned int         batches,
-        unsigned int         input_height,
-        unsigned int         input_width,
-        unsigned int         channels,
-        const PaddingValues &padding,
-        const void          *input,
-        size_t               ld_input_col,
-        size_t               ld_input_row,
-        size_t               ld_input_batch,
-        const void          *parameters,
-        unsigned int         output_height,
-        unsigned int         output_width,
-        void                *output,
-        size_t               ld_output_col,
-        size_t               ld_output_row,
-        size_t               ld_output_batch,
-        void                *working_space,
-        unsigned int         thread_id,
-        unsigned int         n_threads) const override final
+    void execute(unsigned int         batches,
+                 unsigned int         input_height,
+                 unsigned int         input_width,
+                 unsigned int         channels,
+                 const PaddingValues &padding,
+                 const void          *input,
+                 size_t               ld_input_col,
+                 size_t               ld_input_row,
+                 size_t               ld_input_batch,
+                 const void          *parameters,
+                 unsigned int         output_height,
+                 unsigned int         output_width,
+                 void                *output,
+                 size_t               ld_output_col,
+                 size_t               ld_output_row,
+                 size_t               ld_output_batch,
+                 void                *working_space,
+                 unsigned int         thread_id,
+                 unsigned int         n_threads) const override final
     {
         // Construct a new set of arguments to reflect that we might have been
         // passed different input/output tensors. Dilation is handled at this
@@ -271,38 +281,33 @@ public:
         auto ld_output_col_d = ld_output_col * m_args.dilation_cols;
         auto ld_output_row_d = ld_output_row * m_args.dilation_rows;
 
-        for(size_t drow = 0; drow < m_args.dilation_rows; drow++)
+        for (size_t drow = 0; drow < m_args.dilation_rows; drow++)
         {
             size_t start_i;
-            std::tie(args.output_rows, args.input_rows, start_i,
-                     args.padding.top, args.padding.bottom) =
-                         get_reduced_view_for_dilation(
-                             output_height, input_height, drow, m_args.dilation_rows,
-                             m_args.kernel_rows, m_args.stride_rows, padding.top);
+            std::tie(args.output_rows, args.input_rows, start_i, args.padding.top, args.padding.bottom) =
+                get_reduced_view_for_dilation(output_height, input_height, drow, m_args.dilation_rows,
+                                              m_args.kernel_rows, m_args.stride_rows, padding.top);
 
             auto input_row  = static_cast<const TInput *>(input) + start_i * ld_input_row;
             auto output_row = static_cast<TOutput *>(output) + drow * ld_output_row;
 
-            if(args.output_rows)
+            if (args.output_rows)
             {
-                for(size_t dcol = 0; dcol < m_args.dilation_cols; dcol++)
+                for (size_t dcol = 0; dcol < m_args.dilation_cols; dcol++)
                 {
                     size_t start_j;
-                    std::tie(args.output_cols, args.input_cols, start_j,
-                             args.padding.left, args.padding.right) =
-                                 get_reduced_view_for_dilation(
-                                     output_width, input_width, dcol, m_args.dilation_cols,
-                                     m_args.kernel_cols, m_args.stride_cols, padding.left);
+                    std::tie(args.output_cols, args.input_cols, start_j, args.padding.left, args.padding.right) =
+                        get_reduced_view_for_dilation(output_width, input_width, dcol, m_args.dilation_cols,
+                                                      m_args.kernel_cols, m_args.stride_cols, padding.left);
 
                     const TInput *input_col  = input_row + start_j * ld_input_col;
                     TOutput      *output_col = output_row + dcol * ld_output_col;
 
-                    if(args.output_cols)
+                    if (args.output_cols)
                     {
-                        this->execute_internal(
-                            args, input_col, ld_input_col_d, ld_input_row_d, ld_input_batch, parameters,
-                            output_col, ld_output_col_d, ld_output_row_d, ld_output_batch,
-                            working_space, thread_id, n_threads);
+                        this->execute_internal(args, input_col, ld_input_col_d, ld_input_row_d, ld_input_batch,
+                                               parameters, output_col, ld_output_col_d, ld_output_row_d,
+                                               ld_output_batch, working_space, thread_id, n_threads);
                     }
                 }
             }
@@ -310,20 +315,19 @@ public:
     }
 
 protected:
-    virtual void execute_internal(
-        const DepthwiseArgs &instance_args,
-        const void          *input,
-        size_t               ld_input_col,
-        size_t               ld_input_row,
-        size_t               ld_input_batch,
-        const void          *parameters,
-        void                *output,
-        size_t               ld_output_col,
-        size_t               ld_output_row,
-        size_t               ld_output_batch,
-        void                *working_space,
-        unsigned int         thread_id,
-        unsigned int         n_threads) const = 0;
+    virtual void execute_internal(const DepthwiseArgs &instance_args,
+                                  const void          *input,
+                                  size_t               ld_input_col,
+                                  size_t               ld_input_row,
+                                  size_t               ld_input_batch,
+                                  const void          *parameters,
+                                  void                *output,
+                                  size_t               ld_output_col,
+                                  size_t               ld_output_row,
+                                  size_t               ld_output_batch,
+                                  void                *working_space,
+                                  unsigned int         thread_id,
+                                  unsigned int         n_threads) const = 0;
 
     virtual bool uses_premultiply() const
     {
diff --git a/src/core/NEON/kernels/assembly/depthwise_common.hpp b/src/core/NEON/kernels/assembly/depthwise_common.hpp
index a5db793b3d..5ff848e281 100644
--- a/src/core/NEON/kernels/assembly/depthwise_common.hpp
+++ b/src/core/NEON/kernels/assembly/depthwise_common.hpp
@@ -49,11 +49,7 @@ struct KernelDescription
     bool            is_default     = false;
     uint64_t        cycle_estimate = 0;
 
-    KernelDescription(
-        DepthwiseMethod method,
-        std::string     name,
-        bool            is_default,
-        uint64_t        cycle_estimate)
+    KernelDescription(DepthwiseMethod method, std::string name, bool is_default, uint64_t cycle_estimate)
         : method(method), name(name), is_default(is_default), cycle_estimate(cycle_estimate)
     {
     }
@@ -78,58 +74,51 @@ public:
     // pointer the bias vector (which may be nullptr in the case of no bias) and
     // a pointer to the array of weights (stored in HWIO order).
     virtual void pack_parameters(
-        void       *buffer,
-        const void *biases,
-        const void *weights,
-        size_t      ld_weight_col = 0,
-        size_t      ld_weight_row = 0) = 0;
+        void *buffer, const void *biases, const void *weights, size_t ld_weight_col = 0, size_t ld_weight_row = 0) = 0;
 
     // Determine the amount of working space required
     virtual size_t get_working_size(unsigned int n_threads) const = 0;
 
     // Execute the convolution over the specified area of memory.
-    virtual void execute(
-        const void *input,       // Pointer to input tensor
-        const void *parameters,  // Packed parameters buffer
-        void        *output,
-        void        *working_space,
-        unsigned int thread_id,
-        unsigned int n_threads) const = 0;
-
-    virtual void execute(
-        const void *input,
-        size_t       ld_input_col,
-        size_t       ld_input_row,
-        size_t       ld_input_batch,
-        const void *parameters,
-        void        *output,
-        size_t       ld_output_col,
-        size_t       ld_output_row,
-        size_t       ld_output_batch,
-        void        *working_space,
-        unsigned int thread_id,
-        unsigned int n_threads) const = 0;
-
-    virtual void execute(
-        unsigned int batches,
-        unsigned int input_height,
-        unsigned int input_width,
-        unsigned int channels,
-        const PaddingValues &,
-        const void *input,
-        size_t       ld_input_col,
-        size_t       ld_input_row,
-        size_t       ld_input_batch,
-        const void *parameters,
-        unsigned int output_height,
-        unsigned int output_width,
-        void        *output,
-        size_t       ld_output_col,
-        size_t       ld_output_row,
-        size_t       ld_output_batch,
-        void        *working_space,
-        unsigned int thread_id,
-        unsigned int n_threads) const = 0;
+    virtual void execute(const void  *input,      // Pointer to input tensor
+                         const void  *parameters, // Packed parameters buffer
+                         void        *output,
+                         void        *working_space,
+                         unsigned int thread_id,
+                         unsigned int n_threads) const = 0;
+
+    virtual void execute(const void  *input,
+                         size_t       ld_input_col,
+                         size_t       ld_input_row,
+                         size_t       ld_input_batch,
+                         const void  *parameters,
+                         void        *output,
+                         size_t       ld_output_col,
+                         size_t       ld_output_row,
+                         size_t       ld_output_batch,
+                         void        *working_space,
+                         unsigned int thread_id,
+                         unsigned int n_threads) const = 0;
+
+    virtual void execute(unsigned int batches,
+                         unsigned int input_height,
+                         unsigned int input_width,
+                         unsigned int channels,
+                         const PaddingValues &,
+                         const void  *input,
+                         size_t       ld_input_col,
+                         size_t       ld_input_row,
+                         size_t       ld_input_batch,
+                         const void  *parameters,
+                         unsigned int output_height,
+                         unsigned int output_width,
+                         void        *output,
+                         size_t       ld_output_col,
+                         size_t       ld_output_row,
+                         size_t       ld_output_batch,
+                         void        *working_space,
+                         unsigned int thread_id,
+                         unsigned int n_threads) const = 0;
 };
 
 // To handle a dilation factor of D execute the kernel once for each d in
@@ -145,12 +134,13 @@ public:
 // - Number of valid input pixels corresponding to `d`
 // - Offset of the first pixel corresponding to `d`
 // - Amount of padding in the view for `d`
-std::tuple<size_t, size_t, size_t, size_t, size_t>
-get_reduced_view_for_dilation(
-    size_t out_size, size_t in_size,
-    size_t d, size_t dilation_factor,
-    size_t kernel_size, size_t stride,
-    size_t pad_before);
+std::tuple<size_t, size_t, size_t, size_t, size_t> get_reduced_view_for_dilation(size_t out_size,
+                                                                                 size_t in_size,
+                                                                                 size_t d,
+                                                                                 size_t dilation_factor,
+                                                                                 size_t kernel_size,
+                                                                                 size_t stride,
+                                                                                 size_t pad_before);
 
 } // namespace depthwise
 } // namespace arm_conv
diff --git a/src/core/NEON/kernels/assembly/pool_common.hpp b/src/core/NEON/kernels/assembly/pool_common.hpp
index f1f70cf1d6..045f9f95d3 100644
--- a/src/core/NEON/kernels/assembly/pool_common.hpp
+++ b/src/core/NEON/kernels/assembly/pool_common.hpp
@@ -68,45 +68,42 @@ public:
     virtual size_t get_working_size(unsigned int num_threads) const = 0;
 
     // Execute pooling over the specified area of memory.
-    virtual void execute(
-        const void *const input,
-        void *const       output,
-        void             *working_space,
-        unsigned int      thread_id,
-        unsigned int      num_threads) const = 0;
+    virtual void execute(const void *const input,
+                         void *const       output,
+                         void             *working_space,
+                         unsigned int      thread_id,
+                         unsigned int      num_threads) const = 0;
 
-    virtual void execute(
-        const void *const input,
-        size_t            ld_input_col,
-        size_t            ld_input_row,
-        size_t            ld_input_batch,
-        void *const       output,
-        size_t            ld_output_col,
-        size_t            ld_output_row,
-        size_t            ld_output_batch,
-        void             *working_space,
-        unsigned int      thread_id,
-        unsigned int      num_threads) const = 0;
+    virtual void execute(const void *const input,
+                         size_t            ld_input_col,
+                         size_t            ld_input_row,
+                         size_t            ld_input_batch,
+                         void *const       output,
+                         size_t            ld_output_col,
+                         size_t            ld_output_row,
+                         size_t            ld_output_batch,
+                         void             *working_space,
+                         unsigned int      thread_id,
+                         unsigned int      num_threads) const = 0;
 
-    virtual void execute(
-        unsigned int      batches,
-        unsigned int      height,
-        unsigned int      width,
-        unsigned int      channels,
-        const void *const input,
-        size_t            ld_input_col,
-        size_t            ld_input_row,
-        size_t            ld_input_batch,
-        const PaddingValues &,
-        unsigned int output_height,
-        unsigned int output_width,
-        void *const  output,
-        size_t       ld_output_col,
-        size_t       ld_output_row,
-        size_t       ld_output_batch,
-        void        *working_space,
-        unsigned int thread_id,
-        unsigned int num_threads) const = 0;
+    virtual void execute(unsigned int      batches,
+                         unsigned int      height,
+                         unsigned int      width,
+                         unsigned int      channels,
+                         const void *const input,
+                         size_t            ld_input_col,
+                         size_t            ld_input_row,
+                         size_t            ld_input_batch,
+                         const PaddingValues &,
+                         unsigned int output_height,
+                         unsigned int output_width,
+                         void *const  output,
+                         size_t       ld_output_col,
+                         size_t       ld_output_row,
+                         size_t       ld_output_batch,
+                         void        *working_space,
+                         unsigned int thread_id,
+                         unsigned int num_threads) const = 0;
 };
 
 } // namespace pooling
diff --git a/src/core/NEON/kernels/assembly/pooling.hpp b/src/core/NEON/kernels/assembly/pooling.hpp
index e8db35c593..89d594298e 100644
--- a/src/core/NEON/kernels/assembly/pooling.hpp
+++ b/src/core/NEON/kernels/assembly/pooling.hpp
@@ -36,9 +36,8 @@ struct PoolingConfig
     PoolingMethod method = PoolingMethod::DEFAULT;
     std::string   filter = "";
 
-    PoolingConfig(PoolingMethod method)
-        : method(method) {};
-    PoolingConfig() {};
+    PoolingConfig(PoolingMethod method) : method(method){};
+    PoolingConfig(){};
 };
 
 struct PoolingArgs
@@ -57,30 +56,40 @@ struct PoolingArgs
 
     const PoolingConfig *config;
 
-    PoolingArgs(
-        const CPUInfo       *cpu_info,
-        PoolingType          pool_type,
-        const PoolingWindow &window,
-        const PoolingStride &stride,
-        bool                 exclude_padding,
-        unsigned int         n_batches,
-        unsigned int         input_rows,
-        unsigned int         input_cols,
-        unsigned int         n_channels,
-        unsigned int         output_rows,
-        unsigned int         output_cols,
-        const PaddingValues &padding,
-        const PoolingConfig *cfg)
-        : cpu_info(cpu_info), pool_type(pool_type), pool_window(window), pool_stride(stride), exclude_padding(exclude_padding), n_batches(n_batches), input_rows(input_rows), input_cols(input_cols),
-          n_channels(n_channels), output_rows(output_rows), output_cols(output_cols), padding(padding), config(cfg)
+    PoolingArgs(const CPUInfo       *cpu_info,
+                PoolingType          pool_type,
+                const PoolingWindow &window,
+                const PoolingStride &stride,
+                bool                 exclude_padding,
+                unsigned int         n_batches,
+                unsigned int         input_rows,
+                unsigned int         input_cols,
+                unsigned int         n_channels,
+                unsigned int         output_rows,
+                unsigned int         output_cols,
+                const PaddingValues &padding,
+                const PoolingConfig *cfg)
+        : cpu_info(cpu_info),
+          pool_type(pool_type),
+          pool_window(window),
+          pool_stride(stride),
+          exclude_padding(exclude_padding),
+          n_batches(n_batches),
+          input_rows(input_rows),
+          input_cols(input_cols),
+          n_channels(n_channels),
+          output_rows(output_rows),
+          output_cols(output_cols),
+          padding(padding),
+          config(cfg)
     {
         // If either of the pooling window dimensions are set to zero, meaning
         // "pool everything", then replace with the corresponding input dimension.
-        if(pool_window.rows == 0)
+        if (pool_window.rows == 0)
         {
             pool_window.rows = input_rows;
         }
-        if(pool_window.cols == 0)
+        if (pool_window.cols == 0)
         {
             pool_window.cols = input_cols;
         }
@@ -100,10 +109,16 @@ struct Requantize32
     int32_t per_layer_right_shift = 0;
     int32_t per_layer_mul         = 0;
 
-    Requantize32(int32_t input_offset, int32_t output_offset,
-                 int32_t per_layer_left_shift, int32_t per_layer_right_shift,
+    Requantize32(int32_t input_offset,
+                 int32_t output_offset,
+                 int32_t per_layer_left_shift,
+                 int32_t per_layer_right_shift,
                  int32_t per_layer_mul)
-        : input_offset(input_offset), output_offset(output_offset), per_layer_left_shift(per_layer_left_shift), per_layer_right_shift(per_layer_right_shift), per_layer_mul(per_layer_mul)
+        : input_offset(input_offset),
+          output_offset(output_offset),
+          per_layer_left_shift(per_layer_left_shift),
+          per_layer_right_shift(per_layer_right_shift),
+          per_layer_mul(per_layer_mul)
     {
     }
 };
@@ -115,105 +130,88 @@ protected:
     const PoolingArgs m_args;
 
 public:
-    PoolingCommon(const PoolingArgs &args)
-        : m_args(args)
+    PoolingCommon(const PoolingArgs &args) : m_args(args)
     {
     }
-    PoolingCommon(PoolingCommon &) = delete;
+    PoolingCommon(PoolingCommon &)            = delete;
     PoolingCommon &operator=(PoolingCommon &) = delete;
 
     size_t get_working_size(unsigned int) const override = 0;
 
     // Execute pooling over the specified area of memory.
-    void execute(
-        const void *const input,
-        void *const       output,
-        void             *working_space,
-        unsigned int      thread_id,
-        unsigned int      num_threads) const override
+    void execute(const void *const input,
+                 void *const       output,
+                 void             *working_space,
+                 unsigned int      thread_id,
+                 unsigned int      num_threads) const override
     {
-        this->execute(
-            input,
-            m_args.n_channels,
-            m_args.n_channels * m_args.input_cols,
-            m_args.n_channels * m_args.input_cols * m_args.input_rows,
-            output,
-            m_args.n_channels,
-            m_args.n_channels * m_args.output_cols,
-            m_args.n_channels * m_args.output_cols * m_args.output_rows,
-            working_space,
-            thread_id, num_threads);
+        this->execute(input, m_args.n_channels, m_args.n_channels * m_args.input_cols,
+                      m_args.n_channels * m_args.input_cols * m_args.input_rows, output, m_args.n_channels,
+                      m_args.n_channels * m_args.output_cols,
+                      m_args.n_channels * m_args.output_cols * m_args.output_rows, working_space, thread_id,
+                      num_threads);
     }
 
-    void execute(
-        const void *const input,
-        size_t            ld_input_col,
-        size_t            ld_input_row,
-        size_t            ld_input_batch,
-        void *const       output,
-        size_t            ld_output_col,
-        size_t            ld_output_row,
-        size_t            ld_output_batch,
-        void             *working_space,
-        unsigned int      thread_id,
-        unsigned int      num_threads) const override
+    void execute(const void *const input,
+                 size_t            ld_input_col,
+                 size_t            ld_input_row,
+                 size_t            ld_input_batch,
+                 void *const       output,
+                 size_t            ld_output_col,
+                 size_t            ld_output_row,
+                 size_t            ld_output_batch,
+                 void             *working_space,
+                 unsigned int      thread_id,
+                 unsigned int      num_threads) const override
     {
-        this->execute(
-            m_args.n_batches, m_args.input_rows, m_args.input_cols, m_args.n_channels,
-            input, ld_input_col, ld_input_row, ld_input_batch,
-            m_args.padding, m_args.output_rows, m_args.output_cols,
-            output, ld_output_col, ld_output_row, ld_output_batch,
-            working_space, thread_id, num_threads);
+        this->execute(m_args.n_batches, m_args.input_rows, m_args.input_cols, m_args.n_channels, input, ld_input_col,
+                      ld_input_row, ld_input_batch, m_args.padding, m_args.output_rows, m_args.output_cols, output,
+                      ld_output_col, ld_output_row, ld_output_batch, working_space, thread_id, num_threads);
     }
 
-    void execute(
-        unsigned int         batches,
-        unsigned int         height,
-        unsigned int         width,
-        unsigned int         channels,
-        const void *const    input,
-        size_t               ld_input_col,
-        size_t               ld_input_row,
-        size_t               ld_input_batch,
-        const PaddingValues &padding,
-        unsigned int         output_height,
-        unsigned int         output_width,
-        void *const          output,
-        size_t               ld_output_col,
-        size_t               ld_output_row,
-        size_t               ld_output_batch,
-        void                *working_space,
-        unsigned int         thread_id,
-        unsigned int         num_threads) const override
+    void execute(unsigned int         batches,
+                 unsigned int         height,
+                 unsigned int         width,
+                 unsigned int         channels,
+                 const void *const    input,
+                 size_t               ld_input_col,
+                 size_t               ld_input_row,
+                 size_t               ld_input_batch,
+                 const PaddingValues &padding,
+                 unsigned int         output_height,
+                 unsigned int         output_width,
+                 void *const          output,
+                 size_t               ld_output_col,
+                 size_t               ld_output_row,
+                 size_t               ld_output_batch,
+                 void                *working_space,
+                 unsigned int         thread_id,
+                 unsigned int         num_threads) const override
     {
-        this->execute_internal(
-            batches, height, width, channels, padding,
-            input, ld_input_col, ld_input_row, ld_input_batch,
-            output_height, output_width,
-            output, ld_output_col, ld_output_row, ld_output_batch,
-            working_space, thread_id, num_threads);
+        this->execute_internal(batches, height, width, channels, padding, input, ld_input_col, ld_input_row,
+                               ld_input_batch, output_height, output_width, output, ld_output_col, ld_output_row,
+                               ld_output_batch, working_space, thread_id, num_threads);
     }
 
 protected:
-    virtual void execute_internal(
-        unsigned int batches,
-        unsigned int height,
-        unsigned int width,
-        unsigned int channels,
-        const PaddingValues &,
-        const void *const input,
-        size_t            ld_input_col,
-        size_t            ld_input_row,
-        size_t            ld_input_batch,
-        unsigned int      output_height,
-        unsigned int      output_width,
-        void *const       output,
-        size_t            ld_output_col,
-        size_t            ld_output_row,
-        size_t            ld_output_batch,
-        void             *working_space,
-        unsigned int      thread_id,
-        unsigned int      num_threads) const = 0;
+    virtual void execute_internal(unsigned int batches,
+                                  unsigned int height,
+                                  unsigned int width,
+                                  unsigned int channels,
+                                  const PaddingValues &,
+                                  const void *const input,
+                                  size_t            ld_input_col,
+                                  size_t            ld_input_row,
+                                  size_t            ld_input_batch,
+                                  unsigned int      output_height,
+                                  unsigned int      output_width,
+                                  void *const       output,
+                                  size_t            ld_output_col,
+                                  size_t            ld_output_row,
+                                  size_t            ld_output_batch,
+                                  void             *working_space,
+                                  unsigned int      thread_id,
+                                  unsigned int      num_threads) const = 0;
 };
 
 template <typename TInput, typename TOutput>
diff --git a/src/core/NEON/kernels/assembly/premultiply.hpp b/src/core/NEON/kernels/assembly/premultiply.hpp
index 16f26de38a..fb97cf8baf 100644
--- a/src/core/NEON/kernels/assembly/premultiply.hpp
+++ b/src/core/NEON/kernels/assembly/premultiply.hpp
@@ -44,30 +44,27 @@ void do_premultiply(const T           *in_ptr,
                     const unsigned     input_channels,
                     const unsigned int channel_multiplier)
 {
-    if(sizeof(T) == 4 && channel_multiplier == 6)
+    if (sizeof(T) == 4 && channel_multiplier == 6)
     {
-        do_premultiply_float_6(
-            (const float *)in_ptr, ld_row, ld_col,
-            (float *)out_ptr, out_ld_row, out_ld_col,
-            tile_rows, tile_cols,
-            input_channels);
+        do_premultiply_float_6((const float *)in_ptr, ld_row, ld_col, (float *)out_ptr, out_ld_row, out_ld_col,
+                               tile_rows, tile_cols, input_channels);
     }
     else
     {
-        for(unsigned int i = 0; i < tile_rows; i++)
+        for (unsigned int i = 0; i < tile_rows; i++)
         {
             const T *ip2 = in_ptr + i * ld_row;
             T       *op2 = out_ptr + i * out_ld_row;
-            for(unsigned int j = 0; j < tile_cols; j++)
+            for (unsigned int j = 0; j < tile_cols; j++)
             {
                 const T *ip = ip2;
                 T       *op = op2;
-                for(unsigned int c = 0; c < input_channels; c++)
+                for (unsigned int c = 0; c < input_channels; c++)
                 {
                     T val = *ip;
                     ip++;
 
-                    for(unsigned int r = 0; r < channel_multiplier; r++)
+                    for (unsigned int r = 0; r < channel_multiplier; r++)
                     {
                         op[r] = val;
                     }
diff --git a/src/core/NEON/kernels/assembly/winograd.hpp b/src/core/NEON/kernels/assembly/winograd.hpp
index 50290757ec..dbf95d23cd 100644
--- a/src/core/NEON/kernels/assembly/winograd.hpp
+++ b/src/core/NEON/kernels/assembly/winograd.hpp
@@ -45,17 +45,24 @@ struct ConvolutionArgs
     Shape2D              kernel_shape;
     arm_gemm::Activation activation;
 
-    ConvolutionArgs(
-        unsigned int   n_batches,
-        const Shape2D &input_shape,
-        unsigned int   n_input_channels,
-        unsigned int pad_top, unsigned int pad_left,
-        const Shape2D              &output_shape,
-        unsigned int                n_output_channels,
-        const Shape2D               kernel_shape,
-        const arm_gemm::Activation &activation = {})
-        : n_batches(n_batches), input_shape(input_shape), n_input_channels(n_input_channels), pad_top(pad_top), pad_left(pad_left), output_shape(output_shape), n_output_channels(n_output_channels),
-          kernel_shape(kernel_shape), activation(activation)
+    ConvolutionArgs(unsigned int                n_batches,
+                    const Shape2D              &input_shape,
+                    unsigned int                n_input_channels,
+                    unsigned int                pad_top,
+                    unsigned int                pad_left,
+                    const Shape2D              &output_shape,
+                    unsigned int                n_output_channels,
+                    const Shape2D               kernel_shape,
+                    const arm_gemm::Activation &activation = {})
+        : n_batches(n_batches),
+          input_shape(input_shape),
+          n_input_channels(n_input_channels),
+          pad_top(pad_top),
+          pad_left(pad_left),
+          output_shape(output_shape),
+          n_output_channels(n_output_channels),
+          kernel_shape(kernel_shape),
+          activation(activation)
     {
     }
 };
@@ -105,23 +112,30 @@ public:
     virtual unsigned int get_transformed_tile_rows(void) const = 0;
     virtual unsigned int get_transformed_tile_cols(void) const = 0;
 
-    void execute(
-        const ConvolutionArgs &args,
-        const void *inptr, size_t ld_in_row, size_t ld_in_col, size_t ld_input_channel,
-        void *outptr, const WinogradDomainSpec &wds,
-        unsigned int thread_id, unsigned int n_threads) const
+    void execute(const ConvolutionArgs    &args,
+                 const void               *inptr,
+                 size_t                    ld_in_row,
+                 size_t                    ld_in_col,
+                 size_t                    ld_input_channel,
+                 void                     *outptr,
+                 const WinogradDomainSpec &wds,
+                 unsigned int              thread_id,
+                 unsigned int              n_threads) const
     {
-        this->execute(
-            args, inptr, ld_in_row, ld_in_col, ld_input_channel,
-            outptr, wds.weight_ld_matrix, wds.weight_ld_row,
-            thread_id, n_threads);
+        this->execute(args, inptr, ld_in_row, ld_in_col, ld_input_channel, outptr, wds.weight_ld_matrix,
+                      wds.weight_ld_row, thread_id, n_threads);
     }
 
-    virtual void execute(
-        const ConvolutionArgs &args,
-        const void *inptr, size_t ld_in_row, size_t ld_in_col, size_t ld_input_channel,
-        void *outptr, size_t ld_out_matrix, size_t ld_out_row,
-        unsigned int thread_id, unsigned int n_threads) const = 0;
+    virtual void execute(const ConvolutionArgs &args,
+                         const void            *inptr,
+                         size_t                 ld_in_row,
+                         size_t                 ld_in_col,
+                         size_t                 ld_input_channel,
+                         void                  *outptr,
+                         size_t                 ld_out_matrix,
+                         size_t                 ld_out_row,
+                         unsigned int           thread_id,
+                         unsigned int           n_threads) const = 0;
 };
 
 } // namespace weight_transform
@@ -136,27 +150,35 @@ public:
     virtual unsigned int get_input_rows(void) const = 0;
     virtual unsigned int get_input_cols(void) const = 0;
 
-    virtual size_t get_working_space_size(
-        const ConvolutionArgs &args,
-        unsigned int           n_threads) const = 0;
-
-    void execute(
-        const ConvolutionArgs &args,
-        const void *inptr, size_t ld_in_batch, size_t ld_in_row, size_t ld_in_col,
-        void *outptr, const WinogradDomainSpec &wds,
-        void *working_space, unsigned int thread_id, unsigned int n_threads) const
+    virtual size_t get_working_space_size(const ConvolutionArgs &args, unsigned int n_threads) const = 0;
+
+    void execute(const ConvolutionArgs    &args,
+                 const void               *inptr,
+                 size_t                    ld_in_batch,
+                 size_t                    ld_in_row,
+                 size_t                    ld_in_col,
+                 void                     *outptr,
+                 const WinogradDomainSpec &wds,
+                 void                     *working_space,
+                 unsigned int              thread_id,
+                 unsigned int              n_threads) const
     {
-        this->execute(
-            args, inptr, ld_in_batch, ld_in_row, ld_in_col,
-            outptr, wds.input_ld_batch, wds.input_ld_matrix, wds.input_ld_row,
-            working_space, thread_id, n_threads);
+        this->execute(args, inptr, ld_in_batch, ld_in_row, ld_in_col, outptr, wds.input_ld_batch, wds.input_ld_matrix,
+                      wds.input_ld_row, working_space, thread_id, n_threads);
     }
 
-    virtual void execute(
-        const ConvolutionArgs &args,
-        const void *inptr, size_t ld_in_batch, size_t ld_in_row, size_t ld_in_col,
-        void *outptr, size_t ld_out_batch, size_t ld_out_matrix, size_t ld_out_row,
-        void *working_space, unsigned int thread_id, unsigned int n_threads) const = 0;
+    virtual void execute(const ConvolutionArgs &args,
+                         const void            *inptr,
+                         size_t                 ld_in_batch,
+                         size_t                 ld_in_row,
+                         size_t                 ld_in_col,
+                         void                  *outptr,
+                         size_t                 ld_out_batch,
+                         size_t                 ld_out_matrix,
+                         size_t                 ld_out_row,
+                         void                  *working_space,
+                         unsigned int           thread_id,
+                         unsigned int           n_threads) const = 0;
 };
 
 } // namespace input_transform
@@ -177,31 +199,37 @@ public:
     virtual unsigned int get_kernel_rows(void) const = 0;
     virtual unsigned int get_kernel_cols(void) const = 0;
 
-    virtual size_t get_working_space_size(
-        const ConvolutionArgs &args,
-        unsigned int           n_threads) const = 0;
-
-    void execute(
-        const ConvolutionArgs &args,
-        const void *inptr, const WinogradDomainSpec &wds,
-        const void *bias,
-        void *outptr, size_t ld_out_batch, size_t ld_out_row, size_t ld_out_col,
-        void *working_space, unsigned int thread_id, unsigned int n_threads) const
+    virtual size_t get_working_space_size(const ConvolutionArgs &args, unsigned int n_threads) const = 0;
+
+    void execute(const ConvolutionArgs    &args,
+                 const void               *inptr,
+                 const WinogradDomainSpec &wds,
+                 const void               *bias,
+                 void                     *outptr,
+                 size_t                    ld_out_batch,
+                 size_t                    ld_out_row,
+                 size_t                    ld_out_col,
+                 void                     *working_space,
+                 unsigned int              thread_id,
+                 unsigned int              n_threads) const
     {
-        this->execute(
-            args,
-            inptr, wds.output_ld_batch, wds.output_ld_matrix, wds.output_ld_row,
-            bias,
-            outptr, ld_out_batch, ld_out_row, ld_out_col,
-            working_space, thread_id, n_threads);
+        this->execute(args, inptr, wds.output_ld_batch, wds.output_ld_matrix, wds.output_ld_row, bias, outptr,
+                      ld_out_batch, ld_out_row, ld_out_col, working_space, thread_id, n_threads);
     }
 
-    virtual void execute(
-        const ConvolutionArgs &args,
-        const void *inptr, size_t ld_in_batch, size_t ld_in_matrix, size_t ld_in_row,
-        const void *bias,
-        void *outptr, size_t ld_out_batch, size_t ld_out_row, size_t ld_out_col,
-        void *working_space, unsigned int thread_id, unsigned int n_threads) const = 0;
+    virtual void execute(const ConvolutionArgs &args,
+                         const void            *inptr,
+                         size_t                 ld_in_batch,
+                         size_t                 ld_in_matrix,
+                         size_t                 ld_in_row,
+                         const void            *bias,
+                         void                  *outptr,
+                         size_t                 ld_out_batch,
+                         size_t                 ld_out_row,
+                         size_t                 ld_out_col,
+                         void                  *working_space,
+                         unsigned int           thread_id,
+                         unsigned int           n_threads) const = 0;
 };
 
 } // namespace output_transform
@@ -210,7 +238,7 @@ struct WinogradImpl
 {
     const output_transform::ITransform *output_transform = nullptr;
     const weight_transform::ITransform *weight_transform = nullptr;
-    const input_transform::ITransform *input_transform  = nullptr;
+    const input_transform::ITransform  *input_transform  = nullptr;
     std::unique_ptr<arm_gemm::GemmArgs> gemm_args;
     WinogradDomainSpec                  winograd_spec;
 };
@@ -220,15 +248,18 @@ struct WinogradImpl
  * Assigns to the pointers in the `dest` struct and returns true or false to
  * indicate whether the given problem can be executed or not.
  */
-template <typename TIn, typename TWeight = TIn, typename TOut = TIn, typename TWinogradIn = TIn, typename TWinogradOut = TOut>
-bool get_implementation(
-    WinogradImpl &dest, // Destination for the selected implementation
-    const CPUInfo *,
-    const ConvolutionArgs &,
-    int  max_threads,
-    bool fast_mode,
-    const WinogradConfig *,
-    const arm_gemm::GemmConfig *);
+template <typename TIn,
+          typename TWeight      = TIn,
+          typename TOut         = TIn,
+          typename TWinogradIn  = TIn,
+          typename TWinogradOut = TOut>
+bool get_implementation(WinogradImpl &dest, // Destination for the selected implementation
+                        const CPUInfo *,
+                        const ConvolutionArgs &,
+                        int  max_threads,
+                        bool fast_mode,
+                        const WinogradConfig *,
+                        const arm_gemm::GemmConfig *);
 
 } // namespace winograd
 } // namespace arm_conv