Diffstat (limited to 'src/cpu/kernels/CpuIm2ColKernel.cpp')
-rw-r--r--  src/cpu/kernels/CpuIm2ColKernel.cpp  288
1 file changed, 149 insertions(+), 139 deletions(-)
diff --git a/src/cpu/kernels/CpuIm2ColKernel.cpp b/src/cpu/kernels/CpuIm2ColKernel.cpp
index 9ac291549b..55ac7c5192 100644
--- a/src/cpu/kernels/CpuIm2ColKernel.cpp
+++ b/src/cpu/kernels/CpuIm2ColKernel.cpp
@@ -29,13 +29,13 @@
#include "arm_compute/core/Size2D.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/core/Validate.h"
+
#include "src/core/CPP/Validate.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-
#include <arm_neon.h>
#include <cstddef>
#include <cstdint>
@@ -51,26 +51,34 @@ namespace kernels
{
namespace
{
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info,
- bool has_bias, const Size2D &dilation, unsigned int num_groups, unsigned int input_pad_right)
+Status validate_arguments(const ITensorInfo *input,
+ const ITensorInfo *output,
+ const Size2D &kernel_dims,
+ const PadStrideInfo &conv_info,
+ bool has_bias,
+ const Size2D &dilation,
+ unsigned int num_groups,
+ unsigned int input_pad_right)
{
ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input);
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(output);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::BFLOAT16, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
+ DataType::BFLOAT16, DataType::F16, DataType::F32);
ARM_COMPUTE_RETURN_ERROR_ON(is_data_type_quantized(input->data_type()) && has_bias);
ARM_COMPUTE_RETURN_ERROR_ON((dilation.x() < 1) || (dilation.y() < 1));
ARM_COMPUTE_RETURN_ERROR_ON_MSG(num_groups > 1, "Number of groups greater than one are not supported on Neon");
// Since there's no implicit padding added, check the total input spatial dimensions (with conv paddings) are big enough for the kernel dimensions
- const unsigned int width_idx = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::WIDTH);
- const unsigned int height_idx = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::HEIGHT);
- const unsigned total_width = input->dimension(width_idx) + conv_info.pad_left() + conv_info.pad_right();
+ const unsigned int width_idx = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::WIDTH);
+ const unsigned int height_idx = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::HEIGHT);
+ const unsigned total_width = input->dimension(width_idx) + conv_info.pad_left() + conv_info.pad_right();
const unsigned total_height = input->dimension(height_idx) + conv_info.pad_top() + conv_info.pad_bottom();
ARM_COMPUTE_RETURN_ERROR_ON((total_width < kernel_dims.width) || (total_height < kernel_dims.height));
- if(output->total_size() > 0)
+ if (output->total_size() > 0)
{
- TensorInfo expected_output = output->clone()->set_tensor_shape(compute_im2col_conv_shape(input, kernel_dims, conv_info, has_bias, dilation, false, num_groups, input_pad_right));
+ TensorInfo expected_output = output->clone()->set_tensor_shape(compute_im2col_conv_shape(
+ input, kernel_dims, conv_info, has_bias, dilation, false, num_groups, input_pad_right));
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&expected_output, output);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output);
@@ -106,14 +114,14 @@ inline void linearize_volume_nchw(const uint8_t *const in_ptr,
// This for loop linearize a volume with 3 slices. This allows:
// 1) to reduce the iterations of the outer for loop "d"
// 2) to have an optimized im2col for the first convolution layer where usually we have 3 IFMs
- for(; d <= (kernel_depth - 3); d += 3)
+ for (; d <= (kernel_depth - 3); d += 3)
{
- for(int y = top_left_y; y < y_e; y += dilation_y)
+ for (int y = top_left_y; y < y_e; y += dilation_y)
{
- if((y < 0 || y >= input_h) && has_pads)
+ if ((y < 0 || y >= input_h) && has_pads)
{
// All the values will be the offset (will be zeros when not quantized)
- for(int x = top_left_x; x < x_e; x += dilation_x, ++out_ptr)
+ for (int x = top_left_x; x < x_e; x += dilation_x, ++out_ptr)
{
*(out_ptr + 0 * kernel_size2) = pad_value;
*(out_ptr + 1 * kernel_size2) = pad_value;
@@ -122,9 +130,9 @@ inline void linearize_volume_nchw(const uint8_t *const in_ptr,
}
else
{
- for(int x = top_left_x; x < x_e; x += dilation_x, ++out_ptr)
+ for (int x = top_left_x; x < x_e; x += dilation_x, ++out_ptr)
{
- if((x < 0 || x >= input_w) && has_pads)
+ if ((x < 0 || x >= input_w) && has_pads)
{
*(out_ptr + 0 * kernel_size2) = pad_value;
*(out_ptr + 1 * kernel_size2) = pad_value;
@@ -132,9 +140,12 @@ inline void linearize_volume_nchw(const uint8_t *const in_ptr,
}
else
{
- *(out_ptr + 0 * kernel_size2) = *(reinterpret_cast<const T *>(in_ptr + ((d + 0) * input_stride_z + y * input_stride_y + x * input_stride_x)));
- *(out_ptr + 1 * kernel_size2) = *(reinterpret_cast<const T *>(in_ptr + ((d + 1) * input_stride_z + y * input_stride_y + x * input_stride_x)));
- *(out_ptr + 2 * kernel_size2) = *(reinterpret_cast<const T *>(in_ptr + ((d + 2) * input_stride_z + y * input_stride_y + x * input_stride_x)));
+ *(out_ptr + 0 * kernel_size2) = *(reinterpret_cast<const T *>(
+ in_ptr + ((d + 0) * input_stride_z + y * input_stride_y + x * input_stride_x)));
+ *(out_ptr + 1 * kernel_size2) = *(reinterpret_cast<const T *>(
+ in_ptr + ((d + 1) * input_stride_z + y * input_stride_y + x * input_stride_x)));
+ *(out_ptr + 2 * kernel_size2) = *(reinterpret_cast<const T *>(
+ in_ptr + ((d + 2) * input_stride_z + y * input_stride_y + x * input_stride_x)));
}
}
}
@@ -143,11 +154,11 @@ inline void linearize_volume_nchw(const uint8_t *const in_ptr,
}
// Left over
- for(; d < kernel_depth; d++)
+ for (; d < kernel_depth; d++)
{
- for(int y = top_left_y; y < y_e; y += dilation_y)
+ for (int y = top_left_y; y < y_e; y += dilation_y)
{
- if((y < 0 || y >= input_h) && has_pads)
+ if ((y < 0 || y >= input_h) && has_pads)
{
// All the values will be the offset (will be zeros when not quantized)
memset(static_cast<void *>(out_ptr), pad_value, kernel_width * sizeof(T));
@@ -155,15 +166,16 @@ inline void linearize_volume_nchw(const uint8_t *const in_ptr,
}
else
{
- for(int x = top_left_x; x < x_e; x += dilation_x, ++out_ptr)
+ for (int x = top_left_x; x < x_e; x += dilation_x, ++out_ptr)
{
- if((x < 0 || x >= input_w) && has_pads)
+ if ((x < 0 || x >= input_w) && has_pads)
{
*out_ptr = pad_value;
}
else
{
- *out_ptr = *(reinterpret_cast<const T *>(in_ptr + (d * input_stride_z + y * input_stride_y + x * input_stride_x)));
+ *out_ptr = *(reinterpret_cast<const T *>(
+ in_ptr + (d * input_stride_z + y * input_stride_y + x * input_stride_x)));
}
}
}
@@ -171,7 +183,7 @@ inline void linearize_volume_nchw(const uint8_t *const in_ptr,
}
// Append 1 if the convolution layer has biases
- if(has_bias)
+ if (has_bias)
{
*out_ptr = static_cast<T>(1);
}
@@ -198,36 +210,39 @@ inline void linearize_volume_nhwc(const uint8_t *const in_ptr,
const int end_y = start_y + kernel_height * dilation_y;
const int pad_quant = kernel_width * input_c;
const int element_size = static_cast<int>(sizeof(T));
- if((start_y >= 0) && (end_y < input_h) && (start_x >= 0) && (end_x < input_w) && (dilation_x == 1) && (input_stride_y == input_c * element_size))
+ if ((start_y >= 0) && (end_y < input_h) && (start_x >= 0) && (end_x < input_w) && (dilation_x == 1) &&
+ (input_stride_y == input_c * element_size))
{
- for(int y = start_y; y < end_y; y += dilation_y)
+ for (int y = start_y; y < end_y; y += dilation_y)
{
//optimized for no dilation and no boundary pixels
- memcpy(out_ptr, reinterpret_cast<const T *>(in_ptr + (y * input_stride_z + start_x * input_stride_y)), input_c * kernel_width * element_size);
+ memcpy(out_ptr, reinterpret_cast<const T *>(in_ptr + (y * input_stride_z + start_x * input_stride_y)),
+ input_c * kernel_width * element_size);
out_ptr += input_c * kernel_width;
}
}
else
{
- for(int y = start_y; y < end_y; y += dilation_y)
+ for (int y = start_y; y < end_y; y += dilation_y)
{
- if(y < 0 || y >= input_h)
+ if (y < 0 || y >= input_h)
{
memset(static_cast<void *>(out_ptr), pad_value, pad_quant * element_size);
out_ptr += pad_quant;
}
- else if(dilation_x > 1 || start_x < 0 || end_x >= input_w || input_stride_y != input_c * element_size)
+ else if (dilation_x > 1 || start_x < 0 || end_x >= input_w || input_stride_y != input_c * element_size)
{
- for(int x = start_x; x < end_x; x += dilation_x)
+ for (int x = start_x; x < end_x; x += dilation_x)
{
- if(x < 0 || x >= input_w)
+ if (x < 0 || x >= input_w)
{
memset(static_cast<void *>(out_ptr), pad_value, input_c * element_size);
out_ptr += input_c;
}
else
{
- memcpy(out_ptr, reinterpret_cast<const T *>(in_ptr + (y * input_stride_z + x * input_stride_y)), input_c * element_size);
+ memcpy(out_ptr, reinterpret_cast<const T *>(in_ptr + (y * input_stride_z + x * input_stride_y)),
+ input_c * element_size);
out_ptr += input_c;
}
}
@@ -235,13 +250,14 @@ inline void linearize_volume_nhwc(const uint8_t *const in_ptr,
else
{
//optimized for no dilation and no boundary pixels
- memcpy(out_ptr, reinterpret_cast<const T *>(in_ptr + (y * input_stride_z + start_x * input_stride_y)), input_c * kernel_width * element_size);
+ memcpy(out_ptr, reinterpret_cast<const T *>(in_ptr + (y * input_stride_z + start_x * input_stride_y)),
+ input_c * kernel_width * element_size);
out_ptr += input_c * kernel_width;
}
}
}
// Append 1 if the convolution layer has biases
- if(has_bias)
+ if (has_bias)
{
*out_ptr = static_cast<T>(1);
}
@@ -271,12 +287,13 @@ inline void linearize_volume_nhwc(const uint8_t *const in_ptr,
const int element_size = static_cast<int>(sizeof(T));
const int channel_chunk_size = input_c * element_size;
- if((start_y >= 0) && (end_y < input_h) && (start_x >= 0) && (end_x < input_w) && (dilation_x == 1) && (input_stride_y == channel_chunk_size))
+ if ((start_y >= 0) && (end_y < input_h) && (start_x >= 0) && (end_x < input_w) && (dilation_x == 1) &&
+ (input_stride_y == channel_chunk_size))
{
- for(int y = start_y; y < end_y; y += dilation_y)
+ for (int y = start_y; y < end_y; y += dilation_y)
{
const uint8_t *offset_ptr = in_ptr + (y * input_stride_z + start_x * input_stride_y);
- for(int e = 0; e < kernel_width; e++)
+ for (int e = 0; e < kernel_width; e++)
{
memcpy(out_ptr, reinterpret_cast<const T *>(offset_ptr + e * channel_chunk_size), channel_chunk_size);
out_ptr += input_c + pad_right;
@@ -285,25 +302,26 @@ inline void linearize_volume_nhwc(const uint8_t *const in_ptr,
}
else
{
- for(int y = start_y; y < end_y; y += dilation_y)
+ for (int y = start_y; y < end_y; y += dilation_y)
{
- if(y < 0 || y >= input_h)
+ if (y < 0 || y >= input_h)
{
memset(static_cast<void *>(out_ptr), pad_value, pad_quant * element_size);
out_ptr += pad_quant;
}
- else if(dilation_x > 1 || start_x < 0 || end_x >= input_w || input_stride_y != channel_chunk_size)
+ else if (dilation_x > 1 || start_x < 0 || end_x >= input_w || input_stride_y != channel_chunk_size)
{
- for(int x = start_x; x < end_x; x += dilation_x)
+ for (int x = start_x; x < end_x; x += dilation_x)
{
- if(x < 0 || x >= input_w)
+ if (x < 0 || x >= input_w)
{
memset(static_cast<void *>(out_ptr), pad_value, (input_c + pad_right) * element_size);
out_ptr += input_c + pad_right;
}
else
{
- memcpy(out_ptr, reinterpret_cast<const T *>(in_ptr + (y * input_stride_z + x * input_stride_y)), channel_chunk_size);
+ memcpy(out_ptr, reinterpret_cast<const T *>(in_ptr + (y * input_stride_z + x * input_stride_y)),
+ channel_chunk_size);
out_ptr += input_c + pad_right;
}
}
@@ -311,16 +329,17 @@ inline void linearize_volume_nhwc(const uint8_t *const in_ptr,
else
{
const uint8_t *offset_ptr = in_ptr + (y * input_stride_z + start_x * input_stride_y);
- for(int e = 0; e < kernel_width; e++)
+ for (int e = 0; e < kernel_width; e++)
{
- memcpy(out_ptr, reinterpret_cast<const T *>(offset_ptr + e * channel_chunk_size), channel_chunk_size);
+ memcpy(out_ptr, reinterpret_cast<const T *>(offset_ptr + e * channel_chunk_size),
+ channel_chunk_size);
out_ptr += input_c + pad_right;
}
}
}
}
// Append 1 if the convolution layer has biases
- if(has_bias)
+ if (has_bias)
{
*out_ptr = static_cast<T>(1);
}
@@ -348,7 +367,8 @@ void CpuIm2ColKernel::run_im2col(const ITensor *src, ITensor *dst, const Window
const int pad_top = _conv_info.pad_top();
const int stride_x = _conv_info.stride().first;
const int stride_y = _conv_info.stride().second;
- const int pad_value = is_data_type_quantized(src->info()->data_type()) ? src->info()->quantization_info().uniform().offset : 0;
+ const int pad_value =
+ is_data_type_quantized(src->info()->data_type()) ? src->info()->quantization_info().uniform().offset : 0;
Window window_in_out(window);
// The first three dimensions of the input and output are increased by the inner loops
@@ -361,84 +381,57 @@ void CpuIm2ColKernel::run_im2col(const ITensor *src, ITensor *dst, const Window
Iterator out(dst, window_in_out);
execute_window_loop(
- window, [&](const Coordinates & id)
- {
- const int start_w = id[width_idx] * stride_x - pad_left;
- const int start_h = id[height_idx] * stride_y - pad_top;
+ window,
+ [&](const Coordinates &id)
+ {
+ const int start_w = id[width_idx] * stride_x - pad_left;
+ const int start_h = id[height_idx] * stride_y - pad_top;
- // Get pointers
- const uint8_t *const input_ptr = in.ptr();
- auto output_ptr = reinterpret_cast<T *>(out.ptr() + (id[width_idx] + id[height_idx] * _convolved_dims.first) * dst->info()->strides_in_bytes().y());
+ // Get pointers
+ const uint8_t *const input_ptr = in.ptr();
+ auto output_ptr =
+ reinterpret_cast<T *>(out.ptr() + (id[width_idx] + id[height_idx] * _convolved_dims.first) *
+ dst->info()->strides_in_bytes().y());
- // Linearize volume
- if(is_nchw)
- {
- linearize_volume_nchw<T, has_pads>(input_ptr,
- output_ptr,
- _has_bias,
- start_w,
- start_h,
- _kernel_width,
- _kernel_height,
- input_c,
- input_w,
- input_h,
- input_stride_x,
- input_stride_y,
- input_stride_z,
- pad_value,
- _dilation.x(),
- _dilation.y());
- }
- else
- {
- if(_input_pad_right > 0)
+ // Linearize volume
+ if (is_nchw)
{
- linearize_volume_nhwc<T, has_pads>(input_ptr,
- output_ptr,
- _has_bias,
- start_w,
- start_h,
- _kernel_width,
- _kernel_height,
- input_w,
- input_h,
- input_c,
- input_stride_y,
- input_stride_z,
- pad_value,
- _dilation.x(),
- _dilation.y(),
- _input_pad_right);
+ linearize_volume_nchw<T, has_pads>(
+ input_ptr, output_ptr, _has_bias, start_w, start_h, _kernel_width, _kernel_height, input_c, input_w,
+ input_h, input_stride_x, input_stride_y, input_stride_z, pad_value, _dilation.x(), _dilation.y());
}
else
{
- linearize_volume_nhwc<T, has_pads>(input_ptr,
- output_ptr,
- _has_bias,
- start_w,
- start_h,
- _kernel_width,
- _kernel_height,
- input_w,
- input_h,
- input_c,
- input_stride_y,
- input_stride_z,
- pad_value,
- _dilation.x(),
- _dilation.y());
+ if (_input_pad_right > 0)
+ {
+ linearize_volume_nhwc<T, has_pads>(input_ptr, output_ptr, _has_bias, start_w, start_h,
+ _kernel_width, _kernel_height, input_w, input_h, input_c,
+ input_stride_y, input_stride_z, pad_value, _dilation.x(),
+ _dilation.y(), _input_pad_right);
+ }
+ else
+ {
+ linearize_volume_nhwc<T, has_pads>(
+ input_ptr, output_ptr, _has_bias, start_w, start_h, _kernel_width, _kernel_height, input_w,
+ input_h, input_c, input_stride_y, input_stride_z, pad_value, _dilation.x(), _dilation.y());
+ }
}
- }
- },
- in, out);
+ },
+ in, out);
}
-void CpuIm2ColKernel::configure(const ITensorInfo *src, ITensorInfo *dst, const Size2D &kernel_dims, const PadStrideInfo &conv_info,
- bool has_bias, const Size2D &dilation, unsigned int num_groups, unsigned int input_pad_right)
+void CpuIm2ColKernel::configure(const ITensorInfo *src,
+ ITensorInfo *dst,
+ const Size2D &kernel_dims,
+ const PadStrideInfo &conv_info,
+ bool has_bias,
+ const Size2D &dilation,
+ unsigned int num_groups,
+ unsigned int input_pad_right)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst, kernel_dims, conv_info, has_bias, dilation, num_groups, input_pad_right));
+ ARM_COMPUTE_ERROR_THROW_ON(
+ validate_arguments(src, dst, kernel_dims, conv_info, has_bias, dilation, num_groups, input_pad_right));
ARM_COMPUTE_UNUSED(num_groups);
_data_layout = src->data_layout();
@@ -451,31 +444,34 @@ void CpuIm2ColKernel::configure(const ITensorInfo *src, ITensorInfo *dst, const
_kernel_height = kernel_dims.height;
_input_pad_right = input_pad_right;
_dilation = dilation;
- _convolved_dims = scaled_dimensions(src->dimension(width_idx), dst->dimension(height_idx),
- _kernel_width, _kernel_height,
- _conv_info, _dilation);
+ _convolved_dims = scaled_dimensions(src->dimension(width_idx), dst->dimension(height_idx), _kernel_width,
+ _kernel_height, _conv_info, _dilation);
_has_bias = has_bias;
- if(_data_layout == DataLayout::NCHW)
+ if (_data_layout == DataLayout::NCHW)
{
- switch(src->data_type())
+ switch (src->data_type())
{
case DataType::F32:
- _func = (!conv_info.has_padding()) ? &CpuIm2ColKernel::run_im2col<float, false, true> : &CpuIm2ColKernel::run_im2col<float, true, true>;
+ _func = (!conv_info.has_padding()) ? &CpuIm2ColKernel::run_im2col<float, false, true>
+ : &CpuIm2ColKernel::run_im2col<float, true, true>;
break;
#if defined(ARM_COMPUTE_ENABLE_BF16)
case DataType::BFLOAT16:
- _func = (!conv_info.has_padding()) ? &CpuIm2ColKernel::run_im2col<bfloat16, false, true> : &CpuIm2ColKernel::run_im2col<bfloat16, true, true>;
+ _func = (!conv_info.has_padding()) ? &CpuIm2ColKernel::run_im2col<bfloat16, false, true>
+ : &CpuIm2ColKernel::run_im2col<bfloat16, true, true>;
break;
#endif /* defined(ARM_COMPUTE_ENABLE_BF16) */
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
case DataType::F16:
- _func = (!conv_info.has_padding()) ? &CpuIm2ColKernel::run_im2col<float16_t, false, true> : &CpuIm2ColKernel::run_im2col<float16_t, true, true>;
+ _func = (!conv_info.has_padding()) ? &CpuIm2ColKernel::run_im2col<float16_t, false, true>
+ : &CpuIm2ColKernel::run_im2col<float16_t, true, true>;
break;
#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
case DataType::QASYMM8_SIGNED:
case DataType::QASYMM8:
- _func = (!conv_info.has_padding()) ? &CpuIm2ColKernel::run_im2col<qasymm8_t, false, true> : &CpuIm2ColKernel::run_im2col<qasymm8_t, true, true>;
+ _func = (!conv_info.has_padding()) ? &CpuIm2ColKernel::run_im2col<qasymm8_t, false, true>
+ : &CpuIm2ColKernel::run_im2col<qasymm8_t, true, true>;
break;
default:
ARM_COMPUTE_ERROR("Data type not supported");
@@ -484,26 +480,31 @@ void CpuIm2ColKernel::configure(const ITensorInfo *src, ITensorInfo *dst, const
}
else
{
- switch(src->data_type())
+ switch (src->data_type())
{
case DataType::F32:
- _func = (!conv_info.has_padding()) ? &CpuIm2ColKernel::run_im2col<float, false, false> : &CpuIm2ColKernel::run_im2col<float, true, false>;
+ _func = (!conv_info.has_padding()) ? &CpuIm2ColKernel::run_im2col<float, false, false>
+ : &CpuIm2ColKernel::run_im2col<float, true, false>;
break;
#if defined(ARM_COMPUTE_ENABLE_BF16)
case DataType::BFLOAT16:
- _func = (!conv_info.has_padding()) ? &CpuIm2ColKernel::run_im2col<bfloat16, false, false> : &CpuIm2ColKernel::run_im2col<bfloat16, true, false>;
+ _func = (!conv_info.has_padding()) ? &CpuIm2ColKernel::run_im2col<bfloat16, false, false>
+ : &CpuIm2ColKernel::run_im2col<bfloat16, true, false>;
break;
#endif /* defined(ARM_COMPUTE_ENABLE_BF16) */
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
case DataType::F16:
- _func = (!conv_info.has_padding()) ? &CpuIm2ColKernel::run_im2col<float16_t, false, false> : &CpuIm2ColKernel::run_im2col<float16_t, true, false>;
+ _func = (!conv_info.has_padding()) ? &CpuIm2ColKernel::run_im2col<float16_t, false, false>
+ : &CpuIm2ColKernel::run_im2col<float16_t, true, false>;
break;
#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
case DataType::QASYMM8:
- _func = (!conv_info.has_padding()) ? &CpuIm2ColKernel::run_im2col<uint8_t, false, false> : &CpuIm2ColKernel::run_im2col<qasymm8_t, true, false>;
+ _func = (!conv_info.has_padding()) ? &CpuIm2ColKernel::run_im2col<uint8_t, false, false>
+ : &CpuIm2ColKernel::run_im2col<qasymm8_t, true, false>;
break;
case DataType::QASYMM8_SIGNED:
- _func = (!conv_info.has_padding()) ? &CpuIm2ColKernel::run_im2col<int8_t, false, false> : &CpuIm2ColKernel::run_im2col<qasymm8_t, true, false>;
+ _func = (!conv_info.has_padding()) ? &CpuIm2ColKernel::run_im2col<int8_t, false, false>
+ : &CpuIm2ColKernel::run_im2col<qasymm8_t, true, false>;
break;
default:
ARM_COMPUTE_ERROR("Data type not supported");
@@ -512,11 +513,13 @@ void CpuIm2ColKernel::configure(const ITensorInfo *src, ITensorInfo *dst, const
}
// Output tensor auto initialization if not yet initialized
- auto_init_if_empty(*dst, src->clone()->set_tensor_shape(compute_im2col_conv_shape(src, kernel_dims, conv_info, has_bias, dilation, false, num_groups, _input_pad_right)));
+ auto_init_if_empty(
+ *dst, src->clone()->set_tensor_shape(compute_im2col_conv_shape(src, kernel_dims, conv_info, has_bias, dilation,
+ false, num_groups, _input_pad_right)));
- std::pair<unsigned int, unsigned int> convolved_dims = scaled_dimensions(src->dimension(width_idx), src->dimension(height_idx),
- kernel_dims.width, kernel_dims.height,
- conv_info, dilation);
+ std::pair<unsigned int, unsigned int> convolved_dims =
+ scaled_dimensions(src->dimension(width_idx), src->dimension(height_idx), kernel_dims.width, kernel_dims.height,
+ conv_info, dilation);
Window win = calculate_max_window(*src, Steps());
win.set(width_idx, Window::Dimension(0, convolved_dims.first, 1));
@@ -526,10 +529,17 @@ void CpuIm2ColKernel::configure(const ITensorInfo *src, ITensorInfo *dst, const
ICpuKernel::configure(win);
}
-Status CpuIm2ColKernel::validate(const ITensorInfo *src, const ITensorInfo *dst, const Size2D &kernel_dims, const PadStrideInfo &conv_info,
- bool has_bias, const Size2D &dilation, unsigned int num_groups, unsigned int input_pad_right)
+Status CpuIm2ColKernel::validate(const ITensorInfo *src,
+ const ITensorInfo *dst,
+ const Size2D &kernel_dims,
+ const PadStrideInfo &conv_info,
+ bool has_bias,
+ const Size2D &dilation,
+ unsigned int num_groups,
+ unsigned int input_pad_right)
{
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, dst, kernel_dims, conv_info, has_bias, dilation, num_groups, input_pad_right));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ validate_arguments(src, dst, kernel_dims, conv_info, has_bias, dilation, num_groups, input_pad_right));
return Status{};
}
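
For context, a minimal sketch of the im2col linearization that CpuIm2ColKernel performs (assumptions: this is not part of the patch above; single NCHW image, unit stride, no dilation, zero pad value, and the helper name is hypothetical). Each convolution window is copied out as one contiguous patch of channels x kernel_h x kernel_w elements, with out-of-bounds positions filled with the pad value, which is what linearize_volume_nchw does per window in the hunks above.

// Illustrative sketch only, not taken from the patch; layout and helper name are assumptions.
#include <cstddef>
#include <vector>

std::vector<float> im2col_nchw_sketch(const std::vector<float> &in,
                                      int channels, int in_h, int in_w,
                                      int kernel_h, int kernel_w,
                                      int pad_h, int pad_w)
{
    // Output spatial size for stride 1 and no dilation.
    const int out_h      = in_h + 2 * pad_h - kernel_h + 1;
    const int out_w      = in_w + 2 * pad_w - kernel_w + 1;
    const int patch_size = channels * kernel_h * kernel_w;
    std::vector<float> out(static_cast<std::size_t>(out_h) * out_w * patch_size, 0.f); // 0 == pad value

    std::size_t window = 0;
    for (int oy = 0; oy < out_h; ++oy)
    {
        for (int ox = 0; ox < out_w; ++ox, ++window)
        {
            std::size_t elem = 0;
            for (int c = 0; c < channels; ++c)
            {
                for (int ky = 0; ky < kernel_h; ++ky)
                {
                    for (int kx = 0; kx < kernel_w; ++kx, ++elem)
                    {
                        const int y = oy - pad_h + ky;
                        const int x = ox - pad_w + kx;
                        if (y >= 0 && y < in_h && x >= 0 && x < in_w)
                        {
                            // Copy the input element for this window position.
                            out[window * patch_size + elem] =
                                in[(static_cast<std::size_t>(c) * in_h + y) * in_w + x];
                        }
                        // Out-of-bounds positions keep the pad value (zero here).
                    }
                }
            }
        }
    }
    return out;
}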