From afd38f0c617d6f89b2b4532c6c44f116617e2b6f Mon Sep 17 00:00:00 2001 From: Felix Thomasmathibalan Date: Wed, 27 Sep 2023 17:46:17 +0100 Subject: Apply clang-format on repository Code is formatted as per a revised clang format configuration file(not part of this delivery). Version 14.0.6 is used. Exclusion List: - files with .cl extension - files that are not strictly C/C++ (e.g. Android.bp, Sconscript ...) And the following directories - compute_kernel_writer/validation/ - tests/ - include/ - src/core/NEON/kernels/convolution/ - src/core/NEON/kernels/arm_gemm/ - src/core/NEON/kernels/arm_conv/ - data/ There will be a follow up for formatting of .cl files and the files under tests/ and compute_kernel_writer/validation/. Signed-off-by: Felix Thomasmathibalan Change-Id: Ib7eb1fcf4e7537b9feaefcfc15098a804a3fde0a Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/10391 Benchmark: Arm Jenkins Tested-by: Arm Jenkins Reviewed-by: Gunes Bayir --- src/cpu/kernels/pool3d/neon/impl.h | 417 +++++++++++++++++--------------- src/cpu/kernels/pool3d/neon/quantized.h | 403 +++++++++++++++--------------- 2 files changed, 428 insertions(+), 392 deletions(-) (limited to 'src/cpu/kernels/pool3d/neon') diff --git a/src/cpu/kernels/pool3d/neon/impl.h b/src/cpu/kernels/pool3d/neon/impl.h index 013e25537c..ce89199b5d 100644 --- a/src/cpu/kernels/pool3d/neon/impl.h +++ b/src/cpu/kernels/pool3d/neon/impl.h @@ -25,9 +25,10 @@ #define SRC_CORE_POOLING_3D_LAYER_IMPL_H #include "arm_compute/core/Helpers.h" -#include "src/core/NEON/wrapper/intrinsics/intrinsics.h" + #include "src/core/helpers/PoolingHelpers.h" #include "src/core/helpers/WindowHelpers.h" +#include "src/core/NEON/wrapper/intrinsics/intrinsics.h" #include "src/cpu/kernels/pool3d/neon/quantized.h" namespace arm_compute @@ -37,8 +38,13 @@ namespace cpu namespace { template -void max_poolingMxNxD_fp_neon_ndhwc(const ITensor *src, ITensor *dst0, Pooling3dLayerInfo &pool_info, const Window &window_out, - const int window_start_x, const int window_end_x, const int window_step_x) +void max_poolingMxNxD_fp_neon_ndhwc(const ITensor *src, + ITensor *dst0, + Pooling3dLayerInfo &pool_info, + const Window &window_out, + const int window_start_x, + const int window_end_x, + const int window_step_x) { using vtype = wrapper::traits::neon_bitvector; @@ -71,80 +77,87 @@ void max_poolingMxNxD_fp_neon_ndhwc(const ITensor *src, ITensor *dst0, Pooling3d Iterator out(dst0, window_out); vector_type vres; - execute_window_loop(window_out, [&](const Coordinates & id) - { - // Computing the theoretical input starting/ending points - const int in_idx_width = static_cast(id.y()) * pool_stride_x - pool_pad_left; - const int in_idx_height = static_cast(id.z()) * pool_stride_y - pool_pad_top; - const int in_idx_depth = static_cast(id[3]) * pool_stride_z - pool_pad_front; + execute_window_loop( + window_out, + [&](const Coordinates &id) + { + // Computing the theoretical input starting/ending points + const int in_idx_width = static_cast(id.y()) * pool_stride_x - pool_pad_left; + const int in_idx_height = static_cast(id.z()) * pool_stride_y - pool_pad_top; + const int in_idx_depth = static_cast(id[3]) * pool_stride_z - pool_pad_front; - const int pool_start_x = std::max(0, -in_idx_width); - const int pool_end_x_t = std::min(input_dim_w + pool_pad_left - in_idx_width, pool_size_x); - const int pool_start_y = std::max(0, -in_idx_height); - const int pool_end_y_t = std::min(input_dim_h + pool_pad_top - in_idx_height, pool_size_y); + const int pool_start_x = std::max(0, 
-in_idx_width); + const int pool_end_x_t = std::min(input_dim_w + pool_pad_left - in_idx_width, pool_size_x); + const int pool_start_y = std::max(0, -in_idx_height); + const int pool_end_y_t = std::min(input_dim_h + pool_pad_top - in_idx_height, pool_size_y); - const int pool_start_z = std::max(0, -in_idx_depth); - const int pool_end_z_t = std::min(input_dim_d + pool_pad_front - in_idx_depth, pool_size_z); + const int pool_start_z = std::max(0, -in_idx_depth); + const int pool_end_z_t = std::min(input_dim_d + pool_pad_front - in_idx_depth, pool_size_z); - // The end of width to consider in calculation should exclude PAD_X, PAD_Y and PAD_Z - const int pool_end_x = std::min(pool_end_x_t, input_dim_w - in_idx_width); - const int pool_end_y = std::min(pool_end_y_t, input_dim_h - in_idx_height); - const int pool_end_z = std::min(pool_end_z_t, input_dim_d - in_idx_depth); + // The end of width to consider in calculation should exclude PAD_X, PAD_Y and PAD_Z + const int pool_end_x = std::min(pool_end_x_t, input_dim_w - in_idx_width); + const int pool_end_y = std::min(pool_end_y_t, input_dim_h - in_idx_height); + const int pool_end_z = std::min(pool_end_z_t, input_dim_d - in_idx_depth); - const uint8_t *in_ptr_n = in_ptr_start + id[4] * n_stride; + const uint8_t *in_ptr_n = in_ptr_start + id[4] * n_stride; - int x_off = window_start_x; + int x_off = window_start_x; - for(; x_off <= (window_end_x - window_step_x); x_off += window_step_x) // C - { - vres = wrapper::vdup_n(static_cast(-std::numeric_limits::infinity()), tag_type()); - for(int z = pool_start_z; z < pool_end_z; ++z) + for (; x_off <= (window_end_x - window_step_x); x_off += window_step_x) // C { - const uint8_t *in_ptr_z = in_ptr_n + (z + in_idx_depth) * w_stride; - for(int y = pool_start_y; y < pool_end_y; ++y) + vres = wrapper::vdup_n(static_cast(-std::numeric_limits::infinity()), tag_type()); + for (int z = pool_start_z; z < pool_end_z; ++z) { - const uint8_t *in_ptr_y = in_ptr_z + (y + in_idx_height) * z_stride; - for(int x = pool_start_x; x < pool_end_x; ++x) + const uint8_t *in_ptr_z = in_ptr_n + (z + in_idx_depth) * w_stride; + for (int y = pool_start_y; y < pool_end_y; ++y) { - const uint8_t *in_ptr_x = in_ptr_y + (x + in_idx_width) * y_stride; - const vector_type data = wrapper::vloadq(reinterpret_cast(in_ptr_x) + x_off); - vres = wrapper::vmax(vres, data); + const uint8_t *in_ptr_y = in_ptr_z + (y + in_idx_height) * z_stride; + for (int x = pool_start_x; x < pool_end_x; ++x) + { + const uint8_t *in_ptr_x = in_ptr_y + (x + in_idx_width) * y_stride; + const vector_type data = wrapper::vloadq(reinterpret_cast(in_ptr_x) + x_off); + vres = wrapper::vmax(vres, data); + } } } + // Store result + wrapper::vstore(reinterpret_cast(out.ptr()) + x_off, vres); } - // Store result - wrapper::vstore(reinterpret_cast(out.ptr()) + x_off, vres); - } - // Left-overs loop - for(; x_off < window_end_x; ++x_off) - { - T res(0); - res = -std::numeric_limits::infinity(); - for(int z = pool_start_z; z < pool_end_z; ++z) + // Left-overs loop + for (; x_off < window_end_x; ++x_off) { - const uint8_t *in_ptr_z = in_ptr_n + (z + in_idx_depth) * w_stride; - for(int y = pool_start_y; y < pool_end_y; ++y) + T res(0); + res = -std::numeric_limits::infinity(); + for (int z = pool_start_z; z < pool_end_z; ++z) { - const uint8_t *in_ptr_y = in_ptr_z + (y + in_idx_height) * z_stride; - for(int x = pool_start_x; x < pool_end_x; ++x) + const uint8_t *in_ptr_z = in_ptr_n + (z + in_idx_depth) * w_stride; + for (int y = pool_start_y; y < pool_end_y; ++y) { - const 
uint8_t *in_ptr_x = in_ptr_y + (x + in_idx_width) * y_stride; - const T data = *(reinterpret_cast(in_ptr_x) + x_off); - res = std::max(res, data); + const uint8_t *in_ptr_y = in_ptr_z + (y + in_idx_height) * z_stride; + for (int x = pool_start_x; x < pool_end_x; ++x) + { + const uint8_t *in_ptr_x = in_ptr_y + (x + in_idx_width) * y_stride; + const T data = *(reinterpret_cast(in_ptr_x) + x_off); + res = std::max(res, data); + } } } + // Store result + *(reinterpret_cast(out.ptr()) + x_off) = res; } - // Store result - *(reinterpret_cast(out.ptr()) + x_off) = res; - } - }, - out); + }, + out); } template -void avg_poolingMxNxD_fp_neon_ndhwc(const ITensor *src, ITensor *dst0, Pooling3dLayerInfo &pool_info, - const Window &window_out, const int window_start_x, const int window_end_x, const int window_step_x) +void avg_poolingMxNxD_fp_neon_ndhwc(const ITensor *src, + ITensor *dst0, + Pooling3dLayerInfo &pool_info, + const Window &window_out, + const int window_start_x, + const int window_end_x, + const int window_step_x) { using vtype = wrapper::traits::neon_bitvector; using vector_type = typename vtype::type; @@ -183,95 +196,103 @@ void avg_poolingMxNxD_fp_neon_ndhwc(const ITensor *src, ITensor *dst0, Pooling3d Iterator out(dst0, window_out); vector_type vres; - execute_window_loop(window_out, [&](const Coordinates & id) - { - // Computing the theoretical input starting/ending points - const int in_idx_width = static_cast(id.y()) * pool_stride_x - pool_pad_left; - const int in_idx_height = static_cast(id.z()) * pool_stride_y - pool_pad_top; - const int in_idx_depth = static_cast(id[3]) * pool_stride_z - pool_pad_front; - - const int pool_start_x = std::max(0, -in_idx_width); - const int pool_end_x_t = std::min(input_dim_w + pool_pad_left - in_idx_width, pool_size_x); - const int pool_start_y = std::max(0, -in_idx_height); - const int pool_end_y_t = std::min(input_dim_h + pool_pad_top - in_idx_height, pool_size_y); - - const int pool_start_z = std::max(0, -in_idx_depth); - const int pool_end_z_t = std::min(input_dim_d + pool_pad_front - in_idx_depth, pool_size_z); - - // The end of width to consider in calculation should exclude PAD_X, PAD_Y and PAD_Z - const int pool_end_x = std::min(pool_end_x_t, input_dim_w - in_idx_width); - const int pool_end_y = std::min(pool_end_y_t, input_dim_h - in_idx_height); - const int pool_end_z = std::min(pool_end_z_t, input_dim_d - in_idx_depth); - - const uint8_t *in_ptr_n = in_ptr_start + id[4] * n_stride; - - // Calculate scale - const float scale = calculate_avg_scale_pool3d(pool_info.exclude_padding, id, pool_size_x, pool_size_y, pool_size_z, upper_bound_w, upper_bound_h, upper_bound_d, pool_pad_left, - pool_pad_top, pool_pad_front, pool_stride_x, - pool_stride_y, pool_stride_z); - const vector_type scale_v = wrapper::vdup_n(static_cast(scale), tag_type()); + execute_window_loop( + window_out, + [&](const Coordinates &id) + { + // Computing the theoretical input starting/ending points + const int in_idx_width = static_cast(id.y()) * pool_stride_x - pool_pad_left; + const int in_idx_height = static_cast(id.z()) * pool_stride_y - pool_pad_top; + const int in_idx_depth = static_cast(id[3]) * pool_stride_z - pool_pad_front; + + const int pool_start_x = std::max(0, -in_idx_width); + const int pool_end_x_t = std::min(input_dim_w + pool_pad_left - in_idx_width, pool_size_x); + const int pool_start_y = std::max(0, -in_idx_height); + const int pool_end_y_t = std::min(input_dim_h + pool_pad_top - in_idx_height, pool_size_y); + + const int pool_start_z = std::max(0, 
-in_idx_depth); + const int pool_end_z_t = std::min(input_dim_d + pool_pad_front - in_idx_depth, pool_size_z); + + // The end of width to consider in calculation should exclude PAD_X, PAD_Y and PAD_Z + const int pool_end_x = std::min(pool_end_x_t, input_dim_w - in_idx_width); + const int pool_end_y = std::min(pool_end_y_t, input_dim_h - in_idx_height); + const int pool_end_z = std::min(pool_end_z_t, input_dim_d - in_idx_depth); + + const uint8_t *in_ptr_n = in_ptr_start + id[4] * n_stride; + + // Calculate scale + const float scale = + calculate_avg_scale_pool3d(pool_info.exclude_padding, id, pool_size_x, pool_size_y, pool_size_z, + upper_bound_w, upper_bound_h, upper_bound_d, pool_pad_left, pool_pad_top, + pool_pad_front, pool_stride_x, pool_stride_y, pool_stride_z); + const vector_type scale_v = wrapper::vdup_n(static_cast(scale), tag_type()); - int x_off = window_start_x; + int x_off = window_start_x; - for(; x_off <= (window_end_x - window_step_x); x_off += window_step_x) // C - { - // Perform pooling - vres = wrapper::vdup_n(static_cast(0.0f), tag_type()); - for(int z = pool_start_z; z < pool_end_z; ++z) + for (; x_off <= (window_end_x - window_step_x); x_off += window_step_x) // C { - const uint8_t *in_ptr_z = in_ptr_n + (z + in_idx_depth) * w_stride; - for(int y = pool_start_y; y < pool_end_y; ++y) + // Perform pooling + vres = wrapper::vdup_n(static_cast(0.0f), tag_type()); + for (int z = pool_start_z; z < pool_end_z; ++z) { - const uint8_t *in_ptr_y = in_ptr_z + (y + in_idx_height) * z_stride; - for(int x = pool_start_x; x < pool_end_x; ++x) + const uint8_t *in_ptr_z = in_ptr_n + (z + in_idx_depth) * w_stride; + for (int y = pool_start_y; y < pool_end_y; ++y) { - const uint8_t *in_ptr_x = in_ptr_y + (x + in_idx_width) * y_stride; - const vector_type data = wrapper::vloadq(reinterpret_cast(in_ptr_x) + x_off); - vres = wrapper::vadd(vres, data); + const uint8_t *in_ptr_y = in_ptr_z + (y + in_idx_height) * z_stride; + for (int x = pool_start_x; x < pool_end_x; ++x) + { + const uint8_t *in_ptr_x = in_ptr_y + (x + in_idx_width) * y_stride; + const vector_type data = wrapper::vloadq(reinterpret_cast(in_ptr_x) + x_off); + vres = wrapper::vadd(vres, data); + } } } - } - // Divide by scale - vres = wrapper::vmul(vres, scale_v); + // Divide by scale + vres = wrapper::vmul(vres, scale_v); - // Store result - wrapper::vstore(reinterpret_cast(out.ptr()) + x_off, vres); - } - - // Left-overs loop - for(; x_off < window_end_x; ++x_off) - { - T res(0); + // Store result + wrapper::vstore(reinterpret_cast(out.ptr()) + x_off, vres); + } - for(int z = pool_start_z; z < pool_end_z; ++z) + // Left-overs loop + for (; x_off < window_end_x; ++x_off) { - const uint8_t *in_ptr_z = in_ptr_n + (z + in_idx_depth) * w_stride; - for(int y = pool_start_y; y < pool_end_y; ++y) + T res(0); + + for (int z = pool_start_z; z < pool_end_z; ++z) { - const uint8_t *in_ptr_y = in_ptr_z + (y + in_idx_height) * z_stride; - for(int x = pool_start_x; x < pool_end_x; ++x) + const uint8_t *in_ptr_z = in_ptr_n + (z + in_idx_depth) * w_stride; + for (int y = pool_start_y; y < pool_end_y; ++y) { - const uint8_t *in_ptr_x = in_ptr_y + (x + in_idx_width) * y_stride; - const T data = *(reinterpret_cast(in_ptr_x) + x_off); - res += data; + const uint8_t *in_ptr_y = in_ptr_z + (y + in_idx_height) * z_stride; + for (int x = pool_start_x; x < pool_end_x; ++x) + { + const uint8_t *in_ptr_x = in_ptr_y + (x + in_idx_width) * y_stride; + const T data = *(reinterpret_cast(in_ptr_x) + x_off); + res += data; + } } } - } - // Divide by scale - 
res *= scale; + // Divide by scale + res *= scale; - // Store result - *(reinterpret_cast(out.ptr()) + x_off) = res; - } - }, - out); + // Store result + *(reinterpret_cast(out.ptr()) + x_off) = res; + } + }, + out); } template -void l2_poolingMxNxD_fp_neon_ndhwc(const ITensor *src, ITensor *dst0, Pooling3dLayerInfo &pool_info, - const Window &window_out, const int window_start_x, const int window_end_x, const int window_step_x) +void l2_poolingMxNxD_fp_neon_ndhwc(const ITensor *src, + ITensor *dst0, + Pooling3dLayerInfo &pool_info, + const Window &window_out, + const int window_start_x, + const int window_end_x, + const int window_step_x) { using vtype = wrapper::traits::neon_bitvector; using vector_type = typename vtype::type; @@ -310,97 +331,100 @@ void l2_poolingMxNxD_fp_neon_ndhwc(const ITensor *src, ITensor *dst0, Pooling3dL Iterator out(dst0, window_out); vector_type vres; - execute_window_loop(window_out, [&](const Coordinates & id) - { - // Computing the theoretical input starting/ending points - const int in_idx_width = static_cast(id.y()) * pool_stride_x - pool_pad_left; - const int in_idx_height = static_cast(id.z()) * pool_stride_y - pool_pad_top; - const int in_idx_depth = static_cast(id[3]) * pool_stride_z - pool_pad_front; + execute_window_loop( + window_out, + [&](const Coordinates &id) + { + // Computing the theoretical input starting/ending points + const int in_idx_width = static_cast(id.y()) * pool_stride_x - pool_pad_left; + const int in_idx_height = static_cast(id.z()) * pool_stride_y - pool_pad_top; + const int in_idx_depth = static_cast(id[3]) * pool_stride_z - pool_pad_front; - const int pool_start_x = std::max(0, -in_idx_width); - const int pool_end_x_t = std::min(input_dim_w + pool_pad_left - in_idx_width, pool_size_x); - const int pool_start_y = std::max(0, -in_idx_height); - const int pool_end_y_t = std::min(input_dim_h + pool_pad_top - in_idx_height, pool_size_y); + const int pool_start_x = std::max(0, -in_idx_width); + const int pool_end_x_t = std::min(input_dim_w + pool_pad_left - in_idx_width, pool_size_x); + const int pool_start_y = std::max(0, -in_idx_height); + const int pool_end_y_t = std::min(input_dim_h + pool_pad_top - in_idx_height, pool_size_y); - const int pool_start_z = std::max(0, -in_idx_depth); - const int pool_end_z_t = std::min(input_dim_d + pool_pad_front - in_idx_depth, pool_size_z); + const int pool_start_z = std::max(0, -in_idx_depth); + const int pool_end_z_t = std::min(input_dim_d + pool_pad_front - in_idx_depth, pool_size_z); - // The end of width to consider in calculation should exclude PAD_X, PAD_Y and PAD_Z - const int pool_end_x = std::min(pool_end_x_t, input_dim_w - in_idx_width); - const int pool_end_y = std::min(pool_end_y_t, input_dim_h - in_idx_height); - const int pool_end_z = std::min(pool_end_z_t, input_dim_d - in_idx_depth); + // The end of width to consider in calculation should exclude PAD_X, PAD_Y and PAD_Z + const int pool_end_x = std::min(pool_end_x_t, input_dim_w - in_idx_width); + const int pool_end_y = std::min(pool_end_y_t, input_dim_h - in_idx_height); + const int pool_end_z = std::min(pool_end_z_t, input_dim_d - in_idx_depth); - const uint8_t *in_ptr_n = in_ptr_start + id[4] * n_stride; + const uint8_t *in_ptr_n = in_ptr_start + id[4] * n_stride; - // Calculate scale - const float scale = calculate_avg_scale_pool3d(pool_info.exclude_padding, id, pool_size_x, pool_size_y, pool_size_z, upper_bound_w, upper_bound_h, upper_bound_d, pool_pad_left, - pool_pad_top, pool_pad_front, pool_stride_x, - pool_stride_y, 
pool_stride_z); + // Calculate scale + const float scale = + calculate_avg_scale_pool3d(pool_info.exclude_padding, id, pool_size_x, pool_size_y, pool_size_z, + upper_bound_w, upper_bound_h, upper_bound_d, pool_pad_left, pool_pad_top, + pool_pad_front, pool_stride_x, pool_stride_y, pool_stride_z); - int x_off = window_start_x; + int x_off = window_start_x; - for(; x_off <= (window_end_x - window_step_x); x_off += window_step_x) // C - { - // Perform pooling - vres = wrapper::vdup_n(static_cast(0.0f), tag_type()); - for(int z = pool_start_z; z < pool_end_z; ++z) + for (; x_off <= (window_end_x - window_step_x); x_off += window_step_x) // C { - const uint8_t *in_ptr_z = in_ptr_n + (z + in_idx_depth) * w_stride; - for(int y = pool_start_y; y < pool_end_y; ++y) + // Perform pooling + vres = wrapper::vdup_n(static_cast(0.0f), tag_type()); + for (int z = pool_start_z; z < pool_end_z; ++z) { - const uint8_t *in_ptr_y = in_ptr_z + (y + in_idx_height) * z_stride; - for(int x = pool_start_x; x < pool_end_x; ++x) + const uint8_t *in_ptr_z = in_ptr_n + (z + in_idx_depth) * w_stride; + for (int y = pool_start_y; y < pool_end_y; ++y) { - const uint8_t *in_ptr_x = in_ptr_y + (x + in_idx_width) * y_stride; - const vector_type data = wrapper::vloadq(reinterpret_cast(in_ptr_x) + x_off); - vres = wrapper::vmla(vres, data, data); + const uint8_t *in_ptr_y = in_ptr_z + (y + in_idx_height) * z_stride; + for (int x = pool_start_x; x < pool_end_x; ++x) + { + const uint8_t *in_ptr_x = in_ptr_y + (x + in_idx_width) * y_stride; + const vector_type data = wrapper::vloadq(reinterpret_cast(in_ptr_x) + x_off); + vres = wrapper::vmla(vres, data, data); + } } } - } - - const vector_type scale_v = wrapper::vdup_n(static_cast(scale), tag_type()); - // Divide by scale - vres = wrapper::vmul(vres, scale_v); + const vector_type scale_v = wrapper::vdup_n(static_cast(scale), tag_type()); - // Calculate square-root - vres = wrapper::vinv(wrapper::vinvsqrt(vres)); + // Divide by scale + vres = wrapper::vmul(vres, scale_v); - // Store result - wrapper::vstore(reinterpret_cast(out.ptr()) + x_off, vres); - } + // Calculate square-root + vres = wrapper::vinv(wrapper::vinvsqrt(vres)); - // Left-overs loop - for(; x_off < window_end_x; ++x_off) - { - T res(0); + // Store result + wrapper::vstore(reinterpret_cast(out.ptr()) + x_off, vres); + } - for(int z = pool_start_z; z < pool_end_z; ++z) + // Left-overs loop + for (; x_off < window_end_x; ++x_off) { - const uint8_t *in_ptr_z = in_ptr_n + (z + in_idx_depth) * w_stride; - for(int y = pool_start_y; y < pool_end_y; ++y) + T res(0); + + for (int z = pool_start_z; z < pool_end_z; ++z) { - const uint8_t *in_ptr_y = in_ptr_z + (y + in_idx_height) * z_stride; - for(int x = pool_start_x; x < pool_end_x; ++x) + const uint8_t *in_ptr_z = in_ptr_n + (z + in_idx_depth) * w_stride; + for (int y = pool_start_y; y < pool_end_y; ++y) { - const uint8_t *in_ptr_x = in_ptr_y + (x + in_idx_width) * y_stride; - const T data = *(reinterpret_cast(in_ptr_x) + x_off); - res += data * data; + const uint8_t *in_ptr_y = in_ptr_z + (y + in_idx_height) * z_stride; + for (int x = pool_start_x; x < pool_end_x; ++x) + { + const uint8_t *in_ptr_x = in_ptr_y + (x + in_idx_width) * y_stride; + const T data = *(reinterpret_cast(in_ptr_x) + x_off); + res += data * data; + } } } - } - // Divide by scale - res *= scale; + // Divide by scale + res *= scale; - // Square root - res = std::sqrt(res); + // Square root + res = std::sqrt(res); - // Store result - *(reinterpret_cast(out.ptr()) + x_off) = res; - } - }, - out); + // 
Store result + *(reinterpret_cast(out.ptr()) + x_off) = res; + } + }, + out); } } // namespace @@ -415,16 +439,19 @@ void poolingMxNxD_fp_neon_ndhwc(const ITensor *src, ITensor *dst0, Pooling3dLaye // Needed to handle loop left-over window_out.set(Window::DimX, Window::Dimension(0, 1, 1)); - switch(pool_info.pool_type) + switch (pool_info.pool_type) { case PoolingType::MAX: - max_poolingMxNxD_fp_neon_ndhwc(src, dst0, pool_info, window_out, window_start_x, window_end_x, window_step_x); + max_poolingMxNxD_fp_neon_ndhwc(src, dst0, pool_info, window_out, window_start_x, window_end_x, + window_step_x); break; case PoolingType::AVG: - avg_poolingMxNxD_fp_neon_ndhwc(src, dst0, pool_info, window_out, window_start_x, window_end_x, window_step_x); + avg_poolingMxNxD_fp_neon_ndhwc(src, dst0, pool_info, window_out, window_start_x, window_end_x, + window_step_x); break; case PoolingType::L2: - l2_poolingMxNxD_fp_neon_ndhwc(src, dst0, pool_info, window_out, window_start_x, window_end_x, window_step_x); + l2_poolingMxNxD_fp_neon_ndhwc(src, dst0, pool_info, window_out, window_start_x, window_end_x, + window_step_x); break; default: ARM_COMPUTE_ERROR("Pool operation not supported"); @@ -440,7 +467,7 @@ void poolingMxNxD_q8_neon_ndhwc(const ITensor *src, ITensor *dst0, Pooling3dLaye // Needed to handle loop left-over window_out.set(Window::DimX, Window::Dimension(0, 1, 1)); - switch(pool_info.pool_type) + switch (pool_info.pool_type) { case PoolingType::MAX: max_poolingMxNxD_q8_neon_ndhwc(src, dst0, pool_info, window_out, window_step_x); diff --git a/src/cpu/kernels/pool3d/neon/quantized.h b/src/cpu/kernels/pool3d/neon/quantized.h index ac14f5eafa..8819907901 100644 --- a/src/cpu/kernels/pool3d/neon/quantized.h +++ b/src/cpu/kernels/pool3d/neon/quantized.h @@ -26,17 +26,18 @@ #include "arm_compute/core/ITensor.h" #include "arm_compute/core/Types.h" -#include "src/core/NEON/wrapper/wrapper.h" + #include "src/core/helpers/PoolingHelpers.h" #include "src/core/helpers/WindowHelpers.h" +#include "src/core/NEON/wrapper/wrapper.h" namespace arm_compute { namespace cpu { template -void avg_poolingMxNxD_q8_neon_ndhwc(const ITensor *src, ITensor *dst0, Pooling3dLayerInfo &pool_info, const Window &window_out, - const int window_step_x) +void avg_poolingMxNxD_q8_neon_ndhwc( + const ITensor *src, ITensor *dst0, Pooling3dLayerInfo &pool_info, const Window &window_out, const int window_step_x) { using q8x8_t = typename wrapper::traits::neon_vector::type; @@ -89,144 +90,147 @@ void avg_poolingMxNxD_q8_neon_ndhwc(const ITensor *src, ITensor *dst0, Pooling3d const float quant_rescale = dst_qinfo.scale / src_qinfo.scale; // "new_offset" doesn't have to consider the "half_scale_v" in its computation // With a requantization performed in a single step there won't be uncertainties introduced - const int32_t new_offset = dst_qinfo.offset - static_cast(static_cast(src_qinfo.offset) / quant_rescale); - - execute_window_loop(window_out, [&](const Coordinates & id) - { - // Computing the theoretical input starting/ending points - const int in_idx_width = static_cast(id.y()) * pool_stride_x - pool_pad_left; - const int in_idx_height = static_cast(id.z()) * pool_stride_y - pool_pad_top; - const int in_idx_depth = static_cast(id[3]) * pool_stride_z - pool_pad_front; + const int32_t new_offset = + dst_qinfo.offset - static_cast(static_cast(src_qinfo.offset) / quant_rescale); - const int pool_start_x = std::max(0, -in_idx_width); - const int pool_end_x_t = std::min(input_dim_w + pool_pad_left - in_idx_width, pool_size_x); - const int 
pool_start_y = std::max(0, -in_idx_height); - const int pool_end_y_t = std::min(input_dim_h + pool_pad_top - in_idx_height, pool_size_y); + execute_window_loop( + window_out, + [&](const Coordinates &id) + { + // Computing the theoretical input starting/ending points + const int in_idx_width = static_cast(id.y()) * pool_stride_x - pool_pad_left; + const int in_idx_height = static_cast(id.z()) * pool_stride_y - pool_pad_top; + const int in_idx_depth = static_cast(id[3]) * pool_stride_z - pool_pad_front; - const int pool_start_z = std::max(0, -in_idx_depth); - const int pool_end_z_t = std::min(input_dim_d + pool_pad_front - in_idx_depth, pool_size_z); + const int pool_start_x = std::max(0, -in_idx_width); + const int pool_end_x_t = std::min(input_dim_w + pool_pad_left - in_idx_width, pool_size_x); + const int pool_start_y = std::max(0, -in_idx_height); + const int pool_end_y_t = std::min(input_dim_h + pool_pad_top - in_idx_height, pool_size_y); - // The end of width to consider in calculation should exclude PAD_X, PAD_Y and PAD_Z - const int pool_end_x = std::min(pool_end_x_t, input_dim_w - in_idx_width); - const int pool_end_y = std::min(pool_end_y_t, input_dim_h - in_idx_height); - const int pool_end_z = std::min(pool_end_z_t, input_dim_d - in_idx_depth); + const int pool_start_z = std::max(0, -in_idx_depth); + const int pool_end_z_t = std::min(input_dim_d + pool_pad_front - in_idx_depth, pool_size_z); - // Calculate scale - const float scale = calculate_avg_scale_pool3d(pool_info.exclude_padding, id, pool_size_x, pool_size_y, pool_size_z, upper_bound_w, upper_bound_h, upper_bound_d, pool_pad_left, - pool_pad_top, pool_pad_front, pool_stride_x, pool_stride_y, pool_stride_z); + // The end of width to consider in calculation should exclude PAD_X, PAD_Y and PAD_Z + const int pool_end_x = std::min(pool_end_x_t, input_dim_w - in_idx_width); + const int pool_end_y = std::min(pool_end_y_t, input_dim_h - in_idx_height); + const int pool_end_z = std::min(pool_end_z_t, input_dim_d - in_idx_depth); - const uint8_t *in_ptr_n = in_ptr_start + id[4] * n_stride; + // Calculate scale + const float scale = + calculate_avg_scale_pool3d(pool_info.exclude_padding, id, pool_size_x, pool_size_y, pool_size_z, + upper_bound_w, upper_bound_h, upper_bound_d, pool_pad_left, pool_pad_top, + pool_pad_front, pool_stride_x, pool_stride_y, pool_stride_z); - int x_off = window_start_x; + const uint8_t *in_ptr_n = in_ptr_start + id[4] * n_stride; - for(; x_off <= (window_end_x - window_step_x); x_off += window_step_x) // C - { - q32x4_t vres1 = wrapper::vdup_n(static_cast(0.f), wrapper::traits::vector_128_tag{}); - q32x4_t vres2 = wrapper::vdup_n(static_cast(0.f), wrapper::traits::vector_128_tag{}); - q32x4_t vres3 = wrapper::vdup_n(static_cast(0.f), wrapper::traits::vector_128_tag{}); - q32x4_t vres4 = wrapper::vdup_n(static_cast(0.f), wrapper::traits::vector_128_tag{}); + int x_off = window_start_x; - // Perform pooling - for(int z = pool_start_z; z < pool_end_z; ++z) + for (; x_off <= (window_end_x - window_step_x); x_off += window_step_x) // C { - const uint8_t *in_ptr_z = in_ptr_n + (z + in_idx_depth) * w_stride; - for(int y = pool_start_y; y < pool_end_y; ++y) + q32x4_t vres1 = wrapper::vdup_n(static_cast(0.f), wrapper::traits::vector_128_tag{}); + q32x4_t vres2 = wrapper::vdup_n(static_cast(0.f), wrapper::traits::vector_128_tag{}); + q32x4_t vres3 = wrapper::vdup_n(static_cast(0.f), wrapper::traits::vector_128_tag{}); + q32x4_t vres4 = wrapper::vdup_n(static_cast(0.f), wrapper::traits::vector_128_tag{}); + + // 
Perform pooling + for (int z = pool_start_z; z < pool_end_z; ++z) { - const uint8_t *in_ptr_y = in_ptr_z + (y + in_idx_height) * z_stride; - for(int x = pool_start_x; x < pool_end_x; ++x) + const uint8_t *in_ptr_z = in_ptr_n + (z + in_idx_depth) * w_stride; + for (int y = pool_start_y; y < pool_end_y; ++y) { - const uint8_t *in_ptr_x = in_ptr_y + (x + in_idx_width) * y_stride; - const q8x16_t data = wrapper::vloadq(reinterpret_cast(in_ptr_x) + x_off); - - const q16x8_t data_q16 = wrapper::vmovl(wrapper::vgetlow(data)); - const q16x8_t data2_q16 = wrapper::vmovl(wrapper::vgethigh(data)); - vres1 = wrapper::vadd(vres1, wrapper::vmovl(wrapper::vgetlow(data_q16))); - vres2 = wrapper::vadd(vres2, wrapper::vmovl(wrapper::vgethigh(data_q16))); - vres3 = wrapper::vadd(vres3, wrapper::vmovl(wrapper::vgetlow(data2_q16))); - vres4 = wrapper::vadd(vres4, wrapper::vmovl(wrapper::vgethigh(data2_q16))); + const uint8_t *in_ptr_y = in_ptr_z + (y + in_idx_height) * z_stride; + for (int x = pool_start_x; x < pool_end_x; ++x) + { + const uint8_t *in_ptr_x = in_ptr_y + (x + in_idx_width) * y_stride; + const q8x16_t data = wrapper::vloadq(reinterpret_cast(in_ptr_x) + x_off); + + const q16x8_t data_q16 = wrapper::vmovl(wrapper::vgetlow(data)); + const q16x8_t data2_q16 = wrapper::vmovl(wrapper::vgethigh(data)); + vres1 = wrapper::vadd(vres1, wrapper::vmovl(wrapper::vgetlow(data_q16))); + vres2 = wrapper::vadd(vres2, wrapper::vmovl(wrapper::vgethigh(data_q16))); + vres3 = wrapper::vadd(vres3, wrapper::vmovl(wrapper::vgetlow(data2_q16))); + vres4 = wrapper::vadd(vres4, wrapper::vmovl(wrapper::vgethigh(data2_q16))); + } } } - } - if(src_qinfo != dst_qinfo) - { - const float32x4x4_t vres = + if (src_qinfo != dst_qinfo) { - { + const float32x4x4_t vres = {{ vcvtq_f32_q32(vres1), vcvtq_f32_q32(vres2), vcvtq_f32_q32(vres3), vcvtq_f32_q32(vres4), - } - }; - const auto requantized_dst = vrequantize_pooling_with_scale(vres, quant_rescale, scale, new_offset); - // Store result - wrapper::vstore(reinterpret_cast(out.ptr()) + x_off, wrapper::vgetlow(requantized_dst)); - wrapper::vstore(reinterpret_cast(out.ptr()) + x_off + 8, wrapper::vgethigh(requantized_dst)); - } - else - { - const float32x4_t scale_v = vdupq_n_f32(scale); - // Divide by scale and add 0.5f to round to nearest instead of rounding towards zero - vres1 = vcvtq_q32_f32(wrapper::vmla(half_scale_v, vcvtq_f32_q32(vres1), scale_v)); - vres2 = vcvtq_q32_f32(wrapper::vmla(half_scale_v, vcvtq_f32_q32(vres2), scale_v)); - vres3 = vcvtq_q32_f32(wrapper::vmla(half_scale_v, vcvtq_f32_q32(vres3), scale_v)); - vres4 = vcvtq_q32_f32(wrapper::vmla(half_scale_v, vcvtq_f32_q32(vres4), scale_v)); - - const q8x8_t res1 = wrapper::vmovn(wrapper::vcombine(wrapper::vmovn(vres1), wrapper::vmovn(vres2))); - const q8x8_t res2 = wrapper::vmovn(wrapper::vcombine(wrapper::vmovn(vres3), wrapper::vmovn(vres4))); - // Store result - wrapper::vstore(reinterpret_cast(out.ptr()) + x_off, res1); - wrapper::vstore(reinterpret_cast(out.ptr()) + x_off + 8, res2); + }}; + const auto requantized_dst = + vrequantize_pooling_with_scale(vres, quant_rescale, scale, new_offset); + // Store result + wrapper::vstore(reinterpret_cast(out.ptr()) + x_off, wrapper::vgetlow(requantized_dst)); + wrapper::vstore(reinterpret_cast(out.ptr()) + x_off + 8, wrapper::vgethigh(requantized_dst)); + } + else + { + const float32x4_t scale_v = vdupq_n_f32(scale); + // Divide by scale and add 0.5f to round to nearest instead of rounding towards zero + vres1 = vcvtq_q32_f32(wrapper::vmla(half_scale_v, vcvtq_f32_q32(vres1), 
scale_v)); + vres2 = vcvtq_q32_f32(wrapper::vmla(half_scale_v, vcvtq_f32_q32(vres2), scale_v)); + vres3 = vcvtq_q32_f32(wrapper::vmla(half_scale_v, vcvtq_f32_q32(vres3), scale_v)); + vres4 = vcvtq_q32_f32(wrapper::vmla(half_scale_v, vcvtq_f32_q32(vres4), scale_v)); + + const q8x8_t res1 = wrapper::vmovn(wrapper::vcombine(wrapper::vmovn(vres1), wrapper::vmovn(vres2))); + const q8x8_t res2 = wrapper::vmovn(wrapper::vcombine(wrapper::vmovn(vres3), wrapper::vmovn(vres4))); + // Store result + wrapper::vstore(reinterpret_cast(out.ptr()) + x_off, res1); + wrapper::vstore(reinterpret_cast(out.ptr()) + x_off + 8, res2); + } } - } - - // Left-overs loop - for(; x_off < window_end_x; ++x_off) - { - q32_t res = static_cast(0.f); - // Perform pooling - for(int z = pool_start_z; z < pool_end_z; ++z) + // Left-overs loop + for (; x_off < window_end_x; ++x_off) { - const uint8_t *in_ptr_z = in_ptr_n + (z + in_idx_depth) * w_stride; - for(int y = pool_start_y; y < pool_end_y; ++y) + q32_t res = static_cast(0.f); + + // Perform pooling + for (int z = pool_start_z; z < pool_end_z; ++z) { - const uint8_t *in_ptr_y = in_ptr_z + (y + in_idx_height) * z_stride; - for(int x = pool_start_x; x < pool_end_x; ++x) + const uint8_t *in_ptr_z = in_ptr_n + (z + in_idx_depth) * w_stride; + for (int y = pool_start_y; y < pool_end_y; ++y) { - const uint8_t *in_ptr_x = in_ptr_y + (x + in_idx_width) * y_stride; - const T data = *(reinterpret_cast(in_ptr_x) + x_off); - res += data; + const uint8_t *in_ptr_y = in_ptr_z + (y + in_idx_height) * z_stride; + for (int x = pool_start_x; x < pool_end_x; ++x) + { + const uint8_t *in_ptr_x = in_ptr_y + (x + in_idx_width) * y_stride; + const T data = *(reinterpret_cast(in_ptr_x) + x_off); + res += data; + } } } - } - if(src_qinfo != dst_qinfo) - { - const float res_f = static_cast(res); - const float new_scale = quant_rescale / scale; - const auto requantized_dst = quantize(res_f, UniformQuantizationInfo(new_scale, new_offset)); + if (src_qinfo != dst_qinfo) + { + const float res_f = static_cast(res); + const float new_scale = quant_rescale / scale; + const auto requantized_dst = quantize(res_f, UniformQuantizationInfo(new_scale, new_offset)); - // Store result - *(reinterpret_cast(out.ptr()) + x_off) = requantized_dst; - } - else - { - // Divide by scale and add 0.5f to round to nearest instead of rounding towards zero - res = static_cast(0.5f + static_cast(res) * scale); + // Store result + *(reinterpret_cast(out.ptr()) + x_off) = requantized_dst; + } + else + { + // Divide by scale and add 0.5f to round to nearest instead of rounding towards zero + res = static_cast(0.5f + static_cast(res) * scale); - // Store result - *(reinterpret_cast(out.ptr()) + x_off) = res; + // Store result + *(reinterpret_cast(out.ptr()) + x_off) = res; + } } - } - }, - out); + }, + out); } template -void max_poolingMxNxD_q8_neon_ndhwc(const ITensor *src, ITensor *dst0, Pooling3dLayerInfo &pool_info, const Window &window_out, - const int window_step_x) +void max_poolingMxNxD_q8_neon_ndhwc( + const ITensor *src, ITensor *dst0, Pooling3dLayerInfo &pool_info, const Window &window_out, const int window_step_x) { using q8x8_t = typename wrapper::traits::neon_vector::type; @@ -266,125 +270,130 @@ void max_poolingMxNxD_q8_neon_ndhwc(const ITensor *src, ITensor *dst0, Pooling3d const UniformQuantizationInfo src_qinfo = src->info()->quantization_info().uniform(); const UniformQuantizationInfo dst_qinfo = dst0->info()->quantization_info().uniform(); - const float requant_scale = dst_qinfo.scale / src_qinfo.scale; - 
const int32_t requant_offset = dst_qinfo.offset - static_cast(static_cast(src_qinfo.offset) / requant_scale); - const UniformQuantizationInfo requant_qinfo = UniformQuantizationInfo(requant_scale, requant_offset); - - execute_window_loop(window_out, [&](const Coordinates & id) - { - // Computing the theoretical input starting/ending points - const int in_idx_width = static_cast(id.y()) * pool_stride_x - pool_pad_left; - const int in_idx_height = static_cast(id.z()) * pool_stride_y - pool_pad_top; - const int in_idx_depth = static_cast(id[3]) * pool_stride_z - pool_pad_front; + const float requant_scale = dst_qinfo.scale / src_qinfo.scale; + const int32_t requant_offset = + dst_qinfo.offset - static_cast(static_cast(src_qinfo.offset) / requant_scale); + const UniformQuantizationInfo requant_qinfo = UniformQuantizationInfo(requant_scale, requant_offset); - const int pool_start_x = std::max(0, -in_idx_width); - const int pool_end_x_t = std::min(input_dim_w + pool_pad_left - in_idx_width, pool_size_x); - const int pool_start_y = std::max(0, -in_idx_height); - const int pool_end_y_t = std::min(input_dim_h + pool_pad_top - in_idx_height, pool_size_y); + execute_window_loop( + window_out, + [&](const Coordinates &id) + { + // Computing the theoretical input starting/ending points + const int in_idx_width = static_cast(id.y()) * pool_stride_x - pool_pad_left; + const int in_idx_height = static_cast(id.z()) * pool_stride_y - pool_pad_top; + const int in_idx_depth = static_cast(id[3]) * pool_stride_z - pool_pad_front; - const int pool_start_z = std::max(0, -in_idx_depth); - const int pool_end_z_t = std::min(input_dim_d + pool_pad_front - in_idx_depth, pool_size_z); + const int pool_start_x = std::max(0, -in_idx_width); + const int pool_end_x_t = std::min(input_dim_w + pool_pad_left - in_idx_width, pool_size_x); + const int pool_start_y = std::max(0, -in_idx_height); + const int pool_end_y_t = std::min(input_dim_h + pool_pad_top - in_idx_height, pool_size_y); - // The end of width to consider in calculation should exclude PAD_X, PAD_Y and PAD_Z - const int pool_end_x = std::min(pool_end_x_t, input_dim_w - in_idx_width); - const int pool_end_y = std::min(pool_end_y_t, input_dim_h - in_idx_height); - const int pool_end_z = std::min(pool_end_z_t, input_dim_d - in_idx_depth); + const int pool_start_z = std::max(0, -in_idx_depth); + const int pool_end_z_t = std::min(input_dim_d + pool_pad_front - in_idx_depth, pool_size_z); - const uint8_t *in_ptr_n = in_ptr_start + id[4] * n_stride; + // The end of width to consider in calculation should exclude PAD_X, PAD_Y and PAD_Z + const int pool_end_x = std::min(pool_end_x_t, input_dim_w - in_idx_width); + const int pool_end_y = std::min(pool_end_y_t, input_dim_h - in_idx_height); + const int pool_end_z = std::min(pool_end_z_t, input_dim_d - in_idx_depth); - int x_off = window_start_x; + const uint8_t *in_ptr_n = in_ptr_start + id[4] * n_stride; - for(; x_off <= (window_end_x - window_step_x); x_off += window_step_x) // C - { - q8x16_t vres = wrapper::vdup_n(std::numeric_limits::min(), wrapper::traits::vector_128_tag{}); + int x_off = window_start_x; - // Perform pooling - for(int z = pool_start_z; z < pool_end_z; ++z) + for (; x_off <= (window_end_x - window_step_x); x_off += window_step_x) // C { - const uint8_t *in_ptr_z = in_ptr_n + (z + in_idx_depth) * w_stride; - for(int y = pool_start_y; y < pool_end_y; ++y) + q8x16_t vres = wrapper::vdup_n(std::numeric_limits::min(), wrapper::traits::vector_128_tag{}); + + // Perform pooling + for (int z = pool_start_z; z < 
pool_end_z; ++z) { - const uint8_t *in_ptr_y = in_ptr_z + (y + in_idx_height) * z_stride; - for(int x = pool_start_x; x < pool_end_x; ++x) + const uint8_t *in_ptr_z = in_ptr_n + (z + in_idx_depth) * w_stride; + for (int y = pool_start_y; y < pool_end_y; ++y) { - const uint8_t *in_ptr_x = in_ptr_y + (x + in_idx_width) * y_stride; - const q8x16_t data = wrapper::vloadq(reinterpret_cast(in_ptr_x) + x_off); - - vres = wrapper::vmax(vres, data); + const uint8_t *in_ptr_y = in_ptr_z + (y + in_idx_height) * z_stride; + for (int x = pool_start_x; x < pool_end_x; ++x) + { + const uint8_t *in_ptr_x = in_ptr_y + (x + in_idx_width) * y_stride; + const q8x16_t data = wrapper::vloadq(reinterpret_cast(in_ptr_x) + x_off); + + vres = wrapper::vmax(vres, data); + } } } - } - - // Store result - wrapper::vstore(reinterpret_cast(out.ptr()) + x_off, (src_qinfo != dst_qinfo) ? vrequantize_pooling(wrapper::vgetlow(vres), wrapper::vgethigh(vres), - requant_qinfo) : - vres); - } - // Leftovers using half the window step - for(; x_off <= (window_end_x - window_half_step_x); x_off += window_half_step_x) - { - q8x8_t vres = wrapper::vdup_n(std::numeric_limits::min(), wrapper::traits::vector_64_tag{}); + // Store result + wrapper::vstore(reinterpret_cast(out.ptr()) + x_off, + (src_qinfo != dst_qinfo) + ? vrequantize_pooling(wrapper::vgetlow(vres), + wrapper::vgethigh(vres), requant_qinfo) + : vres); + } - // Perform pooling - for(int z = pool_start_z; z < pool_end_z; ++z) + // Leftovers using half the window step + for (; x_off <= (window_end_x - window_half_step_x); x_off += window_half_step_x) { - const uint8_t *in_ptr_z = in_ptr_n + (z + in_idx_depth) * w_stride; - for(int y = pool_start_y; y < pool_end_y; ++y) + q8x8_t vres = wrapper::vdup_n(std::numeric_limits::min(), wrapper::traits::vector_64_tag{}); + + // Perform pooling + for (int z = pool_start_z; z < pool_end_z; ++z) { - const uint8_t *in_ptr_y = in_ptr_z + (y + in_idx_height) * z_stride; - for(int x = pool_start_x; x < pool_end_x; ++x) + const uint8_t *in_ptr_z = in_ptr_n + (z + in_idx_depth) * w_stride; + for (int y = pool_start_y; y < pool_end_y; ++y) { - const uint8_t *in_ptr_x = in_ptr_y + (x + in_idx_width) * y_stride; - const q8x8_t data = wrapper::vload(reinterpret_cast(in_ptr_x) + x_off); - - vres = wrapper::vmax(vres, data); + const uint8_t *in_ptr_y = in_ptr_z + (y + in_idx_height) * z_stride; + for (int x = pool_start_x; x < pool_end_x; ++x) + { + const uint8_t *in_ptr_x = in_ptr_y + (x + in_idx_width) * y_stride; + const q8x8_t data = wrapper::vload(reinterpret_cast(in_ptr_x) + x_off); + + vres = wrapper::vmax(vres, data); + } } } - } - - // Store result - wrapper::vstore(reinterpret_cast(out.ptr()) + x_off, - (src_qinfo != dst_qinfo) ? vrequantize_pooling(vres, requant_qinfo) : vres); - } - // Left-overs loop - for(; x_off < window_end_x; ++x_off) - { - T res = std::numeric_limits::min(); + // Store result + wrapper::vstore(reinterpret_cast(out.ptr()) + x_off, + (src_qinfo != dst_qinfo) ? 
vrequantize_pooling(vres, requant_qinfo) : vres); + } - for(int z = pool_start_z; z < pool_end_z; ++z) + // Left-overs loop + for (; x_off < window_end_x; ++x_off) { - const uint8_t *in_ptr_z = in_ptr_n + (z + in_idx_depth) * w_stride; - for(int y = pool_start_y; y < pool_end_y; ++y) + T res = std::numeric_limits::min(); + + for (int z = pool_start_z; z < pool_end_z; ++z) { - const uint8_t *in_ptr_y = in_ptr_z + (y + in_idx_height) * z_stride; - for(int x = pool_start_x; x < pool_end_x; ++x) + const uint8_t *in_ptr_z = in_ptr_n + (z + in_idx_depth) * w_stride; + for (int y = pool_start_y; y < pool_end_y; ++y) { - const uint8_t *in_ptr_x = in_ptr_y + (x + in_idx_width) * y_stride; - const T data = *(reinterpret_cast(in_ptr_x) + x_off); - - res = std::max(res, data); + const uint8_t *in_ptr_y = in_ptr_z + (y + in_idx_height) * z_stride; + for (int x = pool_start_x; x < pool_end_x; ++x) + { + const uint8_t *in_ptr_x = in_ptr_y + (x + in_idx_width) * y_stride; + const T data = *(reinterpret_cast(in_ptr_x) + x_off); + + res = std::max(res, data); + } } } - } - // Store result - if(src_qinfo != dst_qinfo) - { - const float res_f = static_cast(res); - *(reinterpret_cast(out.ptr()) + x_off) = quantize(res_f, requant_qinfo); - } - else - { - *(reinterpret_cast(out.ptr()) + x_off) = res; + // Store result + if (src_qinfo != dst_qinfo) + { + const float res_f = static_cast(res); + *(reinterpret_cast(out.ptr()) + x_off) = quantize(res_f, requant_qinfo); + } + else + { + *(reinterpret_cast(out.ptr()) + x_off) = res; + } } - } - }, - out); + }, + out); } } // namespace cpu } // namespace arm_compute -#endif // SRC_CORE_NEON_KERNELS_POOL3D_QUANTIZED_H \ No newline at end of file +#endif // SRC_CORE_NEON_KERNELS_POOL3D_QUANTIZED_H -- cgit v1.2.1
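
Note (not part of the patch): the revised .clang-format configuration referenced in the commit message is not included in this delivery. The sketch below is only an illustration of the kind of clang-format 14 settings that could produce formatting similar to the diff above (Allman braces, a space after control-statement keywords, a roughly 120-column limit with long argument lists wrapped one per line, case-insensitive include sorting, aligned consecutive declarations). Every option value here is an assumption, not the actual Arm Compute Library configuration.

    # Hypothetical .clang-format sketch -- all values are guesses, not the real config
    Language:                     Cpp
    BasedOnStyle:                 LLVM
    ColumnLimit:                  120            # long call argument lists wrap, one argument per line
    BreakBeforeBraces:            Allman         # braces on their own line, as in the reformatted loops
    SpaceBeforeParens:            ControlStatements  # "for (", "if (", "switch (" instead of "for("
    IncludeBlocks:                Regroup        # include blocks merged and re-sorted
    SortIncludes:                 CaseInsensitive  # orders NEON/ headers after helpers/ headers
    AlignConsecutiveDeclarations: Consecutive    # aligns the "const int ..." declaration columns
    AlignConsecutiveAssignments:  Consecutive
    PointerAlignment:             Right

With a configuration like this in the repository root, a single header could be reformatted in place with, for example:

    clang-format-14 -i --style=file src/cpu/kernels/pool3d/neon/impl.h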