1 files changed, 222 insertions, 195 deletions
diff --git a/src/cpu/kernels/pool3d/neon/impl.h b/src/cpu/kernels/pool3d/neon/impl.h
index 013e25537c..ce89199b5d 100644
--- a/src/cpu/kernels/pool3d/neon/impl.h
+++ b/src/cpu/kernels/pool3d/neon/impl.h
@@ -25,9 +25,10 @@
 #define SRC_CORE_POOLING_3D_LAYER_IMPL_H
 
 #include "arm_compute/core/Helpers.h"
-#include "src/core/NEON/wrapper/intrinsics/intrinsics.h"
+
 #include "src/core/helpers/PoolingHelpers.h"
 #include "src/core/helpers/WindowHelpers.h"
+#include "src/core/NEON/wrapper/intrinsics/intrinsics.h"
 #include "src/cpu/kernels/pool3d/neon/quantized.h"
 
 namespace arm_compute
@@ -37,8 +38,13 @@ namespace cpu
 namespace
 {
 template <typename T>
-void max_poolingMxNxD_fp_neon_ndhwc(const ITensor *src, ITensor *dst0, Pooling3dLayerInfo &pool_info, const Window &window_out,
-                                    const int window_start_x, const int window_end_x, const int window_step_x)
+void max_poolingMxNxD_fp_neon_ndhwc(const ITensor      *src,
+                                    ITensor            *dst0,
+                                    Pooling3dLayerInfo &pool_info,
+                                    const Window       &window_out,
+                                    const int           window_start_x,
+                                    const int           window_end_x,
+                                    const int           window_step_x)
 
 {
     using vtype       = wrapper::traits::neon_bitvector<T, wrapper::traits::BitWidth::W128>;
@@ -71,80 +77,87 @@ void max_poolingMxNxD_fp_neon_ndhwc(const ITensor *src, ITensor *dst0, Pooling3d
     Iterator out(dst0, window_out);
 
     vector_type vres;
-    execute_window_loop(window_out, [&](const Coordinates & id)
-    {
-        // Computing the theoretical input starting/ending points
-        const int in_idx_width  = static_cast<int>(id.y()) * pool_stride_x - pool_pad_left;
-        const int in_idx_height = static_cast<int>(id.z()) * pool_stride_y - pool_pad_top;
-        const int in_idx_depth  = static_cast<int>(id[3]) * pool_stride_z - pool_pad_front;
+    execute_window_loop(
+        window_out,
+        [&](const Coordinates &id)
+        {
+            // Computing the theoretical input starting/ending points
+            const int in_idx_width  = static_cast<int>(id.y()) * pool_stride_x - pool_pad_left;
+            const int in_idx_height = static_cast<int>(id.z()) * pool_stride_y - pool_pad_top;
+            const int in_idx_depth  = static_cast<int>(id[3]) * pool_stride_z - pool_pad_front;
 
-        const int pool_start_x = std::max(0, -in_idx_width);
-        const int pool_end_x_t = std::min(input_dim_w + pool_pad_left - in_idx_width, pool_size_x);
-        const int pool_start_y = std::max(0, -in_idx_height);
-        const int pool_end_y_t = std::min(input_dim_h + pool_pad_top - in_idx_height, pool_size_y);
+            const int pool_start_x = std::max(0, -in_idx_width);
+            const int pool_end_x_t = std::min(input_dim_w + pool_pad_left - in_idx_width, pool_size_x);
+            const int pool_start_y = std::max(0, -in_idx_height);
+            const int pool_end_y_t = std::min(input_dim_h + pool_pad_top - in_idx_height, pool_size_y);
 
-        const int pool_start_z = std::max(0, -in_idx_depth);
-        const int pool_end_z_t = std::min(input_dim_d + pool_pad_front - in_idx_depth, pool_size_z);
+            const int pool_start_z = std::max(0, -in_idx_depth);
+            const int pool_end_z_t = std::min(input_dim_d + pool_pad_front - in_idx_depth, pool_size_z);
 
-        // The end of width to consider in calculation should exclude PAD_X, PAD_Y and PAD_Z
-        const int pool_end_x = std::min(pool_end_x_t, input_dim_w - in_idx_width);
-        const int pool_end_y = std::min(pool_end_y_t, input_dim_h - in_idx_height);
-        const int pool_end_z = std::min(pool_end_z_t, input_dim_d - in_idx_depth);
+            // The end of width to consider in calculation should exclude PAD_X, PAD_Y and PAD_Z
+            const int pool_end_x = std::min(pool_end_x_t, input_dim_w - in_idx_width);
+            const int pool_end_y = std::min(pool_end_y_t, input_dim_h - in_idx_height);
+            const int pool_end_z = std::min(pool_end_z_t, input_dim_d - in_idx_depth);
 
-        const uint8_t *in_ptr_n = in_ptr_start + id[4] * n_stride;
+            const uint8_t *in_ptr_n = in_ptr_start + id[4] * n_stride;
 
-        int x_off = window_start_x;
+            int x_off = window_start_x;
 
-        for(; x_off <= (window_end_x - window_step_x); x_off += window_step_x) // C
-        {
-            vres = wrapper::vdup_n(static_cast<T>(-std::numeric_limits<float>::infinity()), tag_type());
-            for(int z = pool_start_z; z < pool_end_z; ++z)
+            for (; x_off <= (window_end_x - window_step_x); x_off += window_step_x) // C
             {
-                const uint8_t *in_ptr_z = in_ptr_n + (z + in_idx_depth) * w_stride;
-                for(int y = pool_start_y; y < pool_end_y; ++y)
+                vres = wrapper::vdup_n(static_cast<T>(-std::numeric_limits<float>::infinity()), tag_type());
+                for (int z = pool_start_z; z < pool_end_z; ++z)
                 {
-                    const uint8_t *in_ptr_y = in_ptr_z + (y + in_idx_height) * z_stride;
-                    for(int x = pool_start_x; x < pool_end_x; ++x)
+                    const uint8_t *in_ptr_z = in_ptr_n + (z + in_idx_depth) * w_stride;
+                    for (int y = pool_start_y; y < pool_end_y; ++y)
                     {
-                        const uint8_t    *in_ptr_x = in_ptr_y + (x + in_idx_width) * y_stride;
-                        const vector_type data     = wrapper::vloadq(reinterpret_cast<const T *>(in_ptr_x) + x_off);
-                        vres                       = wrapper::vmax(vres, data);
+                        const uint8_t *in_ptr_y = in_ptr_z + (y + in_idx_height) * z_stride;
+                        for (int x = pool_start_x; x < pool_end_x; ++x)
+                        {
+                            const uint8_t    *in_ptr_x = in_ptr_y + (x + in_idx_width) * y_stride;
+                            const vector_type data     = wrapper::vloadq(reinterpret_cast<const T *>(in_ptr_x) + x_off);
+                            vres                       = wrapper::vmax(vres, data);
+                        }
                     }
                 }
+                // Store result
+                wrapper::vstore(reinterpret_cast<T *>(out.ptr()) + x_off, vres);
             }
-            // Store result
-            wrapper::vstore(reinterpret_cast<T *>(out.ptr()) + x_off, vres);
-        }
 
-        // Left-overs loop
-        for(; x_off < window_end_x; ++x_off)
-        {
-            T res(0);
-            res = -std::numeric_limits<float>::infinity();
-            for(int z = pool_start_z; z < pool_end_z; ++z)
+            // Left-overs loop
+            for (; x_off < window_end_x; ++x_off)
             {
-                const uint8_t *in_ptr_z = in_ptr_n + (z + in_idx_depth) * w_stride;
-                for(int y = pool_start_y; y < pool_end_y; ++y)
+                T res(0);
+                res = -std::numeric_limits<float>::infinity();
+                for (int z = pool_start_z; z < pool_end_z; ++z)
                 {
-                    const uint8_t *in_ptr_y = in_ptr_z + (y + in_idx_height) * z_stride;
-                    for(int x = pool_start_x; x < pool_end_x; ++x)
+                    const uint8_t *in_ptr_z = in_ptr_n + (z + in_idx_depth) * w_stride;
+                    for (int y = pool_start_y; y < pool_end_y; ++y)
                     {
-                        const uint8_t *in_ptr_x = in_ptr_y + (x + in_idx_width) * y_stride;
-                        const T        data     = *(reinterpret_cast<const T *>(in_ptr_x) + x_off);
-                        res                     = std::max(res, data);
+                        const uint8_t *in_ptr_y = in_ptr_z + (y + in_idx_height) * z_stride;
+                        for (int x = pool_start_x; x < pool_end_x; ++x)
+                        {
+                            const uint8_t *in_ptr_x = in_ptr_y + (x + in_idx_width) * y_stride;
+                            const T        data     = *(reinterpret_cast<const T *>(in_ptr_x) + x_off);
+                            res                     = std::max(res, data);
+                        }
                     }
                 }
+                // Store result
+                *(reinterpret_cast<T *>(out.ptr()) + x_off) = res;
             }
-            // Store result
-            *(reinterpret_cast<T *>(out.ptr()) + x_off) = res;
-        }
-    },
-    out);
+        },
+        out);
 }
 
 template <typename T>
-void avg_poolingMxNxD_fp_neon_ndhwc(const ITensor *src, ITensor *dst0, Pooling3dLayerInfo &pool_info,
-                                    const Window &window_out, const int window_start_x, const int window_end_x, const int window_step_x)
+void avg_poolingMxNxD_fp_neon_ndhwc(const ITensor      *src,
+                                    ITensor            *dst0,
+                                    Pooling3dLayerInfo &pool_info,
+                                    const Window       &window_out,
+                                    const int           window_start_x,
+                                    const int           window_end_x,
+                                    const int           window_step_x)
 {
     using vtype       = wrapper::traits::neon_bitvector<T, wrapper::traits::BitWidth::W128>;
     using vector_type = typename vtype::type;
@@ -183,95 +196,103 @@ void avg_poolingMxNxD_fp_neon_ndhwc(const ITensor *src, ITensor *dst0, Pooling3d
     Iterator out(dst0, window_out);
 
     vector_type vres;
-    execute_window_loop(window_out, [&](const Coordinates & id)
-    {
-        // Computing the theoretical input starting/ending points
-        const int in_idx_width  = static_cast<int>(id.y()) * pool_stride_x - pool_pad_left;
-        const int in_idx_height = static_cast<int>(id.z()) * pool_stride_y - pool_pad_top;
-        const int in_idx_depth  = static_cast<int>(id[3]) * pool_stride_z - pool_pad_front;
-
-        const int pool_start_x = std::max(0, -in_idx_width);
-        const int pool_end_x_t = std::min(input_dim_w + pool_pad_left - in_idx_width, pool_size_x);
-        const int pool_start_y = std::max(0, -in_idx_height);
-        const int pool_end_y_t = std::min(input_dim_h + pool_pad_top - in_idx_height, pool_size_y);
-
-        const int pool_start_z = std::max(0, -in_idx_depth);
-        const int pool_end_z_t = std::min(input_dim_d + pool_pad_front - in_idx_depth, pool_size_z);
-
-        // The end of width to consider in calculation should exclude PAD_X, PAD_Y and PAD_Z
-        const int pool_end_x = std::min(pool_end_x_t, input_dim_w - in_idx_width);
-        const int pool_end_y = std::min(pool_end_y_t, input_dim_h - in_idx_height);
-        const int pool_end_z = std::min(pool_end_z_t, input_dim_d - in_idx_depth);
-
-        const uint8_t *in_ptr_n = in_ptr_start + id[4] * n_stride;
-
-        // Calculate scale
-        const float scale = calculate_avg_scale_pool3d(pool_info.exclude_padding, id, pool_size_x, pool_size_y, pool_size_z, upper_bound_w, upper_bound_h, upper_bound_d, pool_pad_left,
-                                                       pool_pad_top, pool_pad_front, pool_stride_x,
-                                                       pool_stride_y, pool_stride_z);
-        const vector_type scale_v = wrapper::vdup_n(static_cast<T>(scale), tag_type());
+    execute_window_loop(
+        window_out,
+        [&](const Coordinates &id)
+        {
+            // Computing the theoretical input starting/ending points
+            const int in_idx_width  = static_cast<int>(id.y()) * pool_stride_x - pool_pad_left;
+            const int in_idx_height = static_cast<int>(id.z()) * pool_stride_y - pool_pad_top;
+            const int in_idx_depth  = static_cast<int>(id[3]) * pool_stride_z - pool_pad_front;
+
+            const int pool_start_x = std::max(0, -in_idx_width);
+            const int pool_end_x_t = std::min(input_dim_w + pool_pad_left - in_idx_width, pool_size_x);
+            const int pool_start_y = std::max(0, -in_idx_height);
+            const int pool_end_y_t = std::min(input_dim_h + pool_pad_top - in_idx_height, pool_size_y);
+
+            const int pool_start_z = std::max(0, -in_idx_depth);
+            const int pool_end_z_t = std::min(input_dim_d + pool_pad_front - in_idx_depth, pool_size_z);
+
+            // The end of width to consider in calculation should exclude PAD_X, PAD_Y and PAD_Z
+            const int pool_end_x = std::min(pool_end_x_t, input_dim_w - in_idx_width);
+            const int pool_end_y = std::min(pool_end_y_t, input_dim_h - in_idx_height);
+            const int pool_end_z = std::min(pool_end_z_t, input_dim_d - in_idx_depth);
+
+            const uint8_t *in_ptr_n = in_ptr_start + id[4] * n_stride;
+
+            // Calculate scale
+            const float scale =
+                calculate_avg_scale_pool3d(pool_info.exclude_padding, id, pool_size_x, pool_size_y, pool_size_z,
+                                           upper_bound_w, upper_bound_h, upper_bound_d, pool_pad_left, pool_pad_top,
+                                           pool_pad_front, pool_stride_x, pool_stride_y, pool_stride_z);
+            const vector_type scale_v = wrapper::vdup_n(static_cast<T>(scale), tag_type());
 
-        int x_off = window_start_x;
+            int x_off = window_start_x;
 
-        for(; x_off <= (window_end_x - window_step_x); x_off += window_step_x) // C
-        {
-            // Perform pooling
-            vres = wrapper::vdup_n(static_cast<T>(0.0f), tag_type());
-            for(int z = pool_start_z; z < pool_end_z; ++z)
+            for (; x_off <= (window_end_x - window_step_x); x_off += window_step_x) // C
             {
-                const uint8_t *in_ptr_z = in_ptr_n + (z + in_idx_depth) * w_stride;
-                for(int y = pool_start_y; y < pool_end_y; ++y)
+                // Perform pooling
+                vres = wrapper::vdup_n(static_cast<T>(0.0f), tag_type());
+                for (int z = pool_start_z; z < pool_end_z; ++z)
                 {
-                    const uint8_t *in_ptr_y = in_ptr_z + (y + in_idx_height) * z_stride;
-                    for(int x = pool_start_x; x < pool_end_x; ++x)
+                    const uint8_t *in_ptr_z = in_ptr_n + (z + in_idx_depth) * w_stride;
+                    for (int y = pool_start_y; y < pool_end_y; ++y)
                     {
-                        const uint8_t    *in_ptr_x = in_ptr_y + (x + in_idx_width) * y_stride;
-                        const vector_type data     = wrapper::vloadq(reinterpret_cast<const T *>(in_ptr_x) + x_off);
-                        vres                       = wrapper::vadd(vres, data);
+                        const uint8_t *in_ptr_y = in_ptr_z + (y + in_idx_height) * z_stride;
+                        for (int x = pool_start_x; x < pool_end_x; ++x)
+                        {
+                            const uint8_t    *in_ptr_x = in_ptr_y + (x + in_idx_width) * y_stride;
+                            const vector_type data     = wrapper::vloadq(reinterpret_cast<const T *>(in_ptr_x) + x_off);
+                            vres                       = wrapper::vadd(vres, data);
+                        }
                     }
                 }
-            }
 
-            // Divide by scale
-            vres = wrapper::vmul(vres, scale_v);
+                // Divide by scale
+                vres = wrapper::vmul(vres, scale_v);
 
-            // Store result
-            wrapper::vstore(reinterpret_cast<T *>(out.ptr()) + x_off, vres);
-        }
-
-        // Left-overs loop
-        for(; x_off < window_end_x; ++x_off)
-        {
-            T res(0);
+                // Store result
+                wrapper::vstore(reinterpret_cast<T *>(out.ptr()) + x_off, vres);
+            }
 
-            for(int z = pool_start_z; z < pool_end_z; ++z)
+            // Left-overs loop
+            for (; x_off < window_end_x; ++x_off)
             {
-                const uint8_t *in_ptr_z = in_ptr_n + (z + in_idx_depth) * w_stride;
-                for(int y = pool_start_y; y < pool_end_y; ++y)
+                T res(0);
+
+                for (int z = pool_start_z; z < pool_end_z; ++z)
                 {
-                    const uint8_t *in_ptr_y = in_ptr_z + (y + in_idx_height) * z_stride;
-                    for(int x = pool_start_x; x < pool_end_x; ++x)
+                    const uint8_t *in_ptr_z = in_ptr_n + (z + in_idx_depth) * w_stride;
+                    for (int y = pool_start_y; y < pool_end_y; ++y)
                     {
-                        const uint8_t *in_ptr_x = in_ptr_y + (x + in_idx_width) * y_stride;
-                        const T        data     = *(reinterpret_cast<const T *>(in_ptr_x) + x_off);
-                        res += data;
+                        const uint8_t *in_ptr_y = in_ptr_z + (y + in_idx_height) * z_stride;
+                        for (int x = pool_start_x; x < pool_end_x; ++x)
+                        {
+                            const uint8_t *in_ptr_x = in_ptr_y + (x + in_idx_width) * y_stride;
+                            const T        data     = *(reinterpret_cast<const T *>(in_ptr_x) + x_off);
+                            res += data;
+                        }
                     }
                 }
-            }
 
-            // Divide by scale
-            res *= scale;
+                // Divide by scale
+                res *= scale;
 
-            // Store result
-            *(reinterpret_cast<T *>(out.ptr()) + x_off) = res;
-        }
-    },
-    out);
+                // Store result
+                *(reinterpret_cast<T *>(out.ptr()) + x_off) = res;
+            }
+        },
+        out);
 }
 
 template <typename T>
-void l2_poolingMxNxD_fp_neon_ndhwc(const ITensor *src, ITensor *dst0, Pooling3dLayerInfo &pool_info,
-                                   const Window &window_out, const int window_start_x, const int window_end_x, const int window_step_x)
+void l2_poolingMxNxD_fp_neon_ndhwc(const ITensor      *src,
+                                   ITensor            *dst0,
+                                   Pooling3dLayerInfo &pool_info,
+                                   const Window       &window_out,
+                                   const int           window_start_x,
+                                   const int           window_end_x,
+                                   const int           window_step_x)
 {
     using vtype       = wrapper::traits::neon_bitvector<T, wrapper::traits::BitWidth::W128>;
     using vector_type = typename vtype::type;
@@ -310,97 +331,100 @@ void l2_poolingMxNxD_fp_neon_ndhwc(const ITensor *src, ITensor *dst0, Pooling3dL
     Iterator out(dst0, window_out);
 
     vector_type vres;
-    execute_window_loop(window_out, [&](const Coordinates & id)
-    {
-        // Computing the theoretical input starting/ending points
-        const int in_idx_width  = static_cast<int>(id.y()) * pool_stride_x - pool_pad_left;
-        const int in_idx_height = static_cast<int>(id.z()) * pool_stride_y - pool_pad_top;
-        const int in_idx_depth  = static_cast<int>(id[3]) * pool_stride_z - pool_pad_front;
+    execute_window_loop(
+        window_out,
+        [&](const Coordinates &id)
+        {
+            // Computing the theoretical input starting/ending points
+            const int in_idx_width  = static_cast<int>(id.y()) * pool_stride_x - pool_pad_left;
+            const int in_idx_height = static_cast<int>(id.z()) * pool_stride_y - pool_pad_top;
+            const int in_idx_depth  = static_cast<int>(id[3]) * pool_stride_z - pool_pad_front;
 
-        const int pool_start_x = std::max(0, -in_idx_width);
-        const int pool_end_x_t = std::min(input_dim_w + pool_pad_left - in_idx_width, pool_size_x);
-        const int pool_start_y = std::max(0, -in_idx_height);
-        const int pool_end_y_t = std::min(input_dim_h + pool_pad_top - in_idx_height, pool_size_y);
+            const int pool_start_x = std::max(0, -in_idx_width);
+            const int pool_end_x_t = std::min(input_dim_w + pool_pad_left - in_idx_width, pool_size_x);
+            const int pool_start_y = std::max(0, -in_idx_height);
+            const int pool_end_y_t = std::min(input_dim_h + pool_pad_top - in_idx_height, pool_size_y);
 
-        const int pool_start_z = std::max(0, -in_idx_depth);
-        const int pool_end_z_t = std::min(input_dim_d + pool_pad_front - in_idx_depth, pool_size_z);
+            const int pool_start_z = std::max(0, -in_idx_depth);
+            const int pool_end_z_t = std::min(input_dim_d + pool_pad_front - in_idx_depth, pool_size_z);
 
-        // The end of width to consider in calculation should exclude PAD_X, PAD_Y and PAD_Z
-        const int pool_end_x = std::min(pool_end_x_t, input_dim_w - in_idx_width);
-        const int pool_end_y = std::min(pool_end_y_t, input_dim_h - in_idx_height);
-        const int pool_end_z = std::min(pool_end_z_t, input_dim_d - in_idx_depth);
+            // The end of width to consider in calculation should exclude PAD_X, PAD_Y and PAD_Z
+            const int pool_end_x = std::min(pool_end_x_t, input_dim_w - in_idx_width);
+            const int pool_end_y = std::min(pool_end_y_t, input_dim_h - in_idx_height);
+            const int pool_end_z = std::min(pool_end_z_t, input_dim_d - in_idx_depth);
 
-        const uint8_t *in_ptr_n = in_ptr_start + id[4] * n_stride;
+            const uint8_t *in_ptr_n = in_ptr_start + id[4] * n_stride;
 
-        // Calculate scale
-        const float scale = calculate_avg_scale_pool3d(pool_info.exclude_padding, id, pool_size_x, pool_size_y, pool_size_z, upper_bound_w, upper_bound_h, upper_bound_d, pool_pad_left,
-                                                       pool_pad_top, pool_pad_front, pool_stride_x,
-                                                       pool_stride_y, pool_stride_z);
+            // Calculate scale
+            const float scale =
+                calculate_avg_scale_pool3d(pool_info.exclude_padding, id, pool_size_x, pool_size_y, pool_size_z,
+                                           upper_bound_w, upper_bound_h, upper_bound_d, pool_pad_left, pool_pad_top,
+                                           pool_pad_front, pool_stride_x, pool_stride_y, pool_stride_z);
 
-        int x_off = window_start_x;
+            int x_off = window_start_x;
 
-        for(; x_off <= (window_end_x - window_step_x); x_off += window_step_x) // C
-        {
-            // Perform pooling
-            vres = wrapper::vdup_n(static_cast<T>(0.0f), tag_type());
-            for(int z = pool_start_z; z < pool_end_z; ++z)
+            for (; x_off <= (window_end_x - window_step_x); x_off += window_step_x) // C
             {
-                const uint8_t *in_ptr_z = in_ptr_n + (z + in_idx_depth) * w_stride;
-                for(int y = pool_start_y; y < pool_end_y; ++y)
+                // Perform pooling
+                vres = wrapper::vdup_n(static_cast<T>(0.0f), tag_type());
+                for (int z = pool_start_z; z < pool_end_z; ++z)
                 {
-                    const uint8_t *in_ptr_y = in_ptr_z + (y + in_idx_height) * z_stride;
-                    for(int x = pool_start_x; x < pool_end_x; ++x)
+                    const uint8_t *in_ptr_z = in_ptr_n + (z + in_idx_depth) * w_stride;
+                    for (int y = pool_start_y; y < pool_end_y; ++y)
                     {
-                        const uint8_t    *in_ptr_x = in_ptr_y + (x + in_idx_width) * y_stride;
-                        const vector_type data     = wrapper::vloadq(reinterpret_cast<const T *>(in_ptr_x) + x_off);
-                        vres                       = wrapper::vmla(vres, data, data);
+                        const uint8_t *in_ptr_y = in_ptr_z + (y + in_idx_height) * z_stride;
+                        for (int x = pool_start_x; x < pool_end_x; ++x)
+                        {
+                            const uint8_t    *in_ptr_x = in_ptr_y + (x + in_idx_width) * y_stride;
+                            const vector_type data     = wrapper::vloadq(reinterpret_cast<const T *>(in_ptr_x) + x_off);
+                            vres                       = wrapper::vmla(vres, data, data);
+                        }
                     }
                 }
-            }
-
-            const vector_type scale_v = wrapper::vdup_n(static_cast<T>(scale), tag_type());
 
-            // Divide by scale
-            vres = wrapper::vmul(vres, scale_v);
+                const vector_type scale_v = wrapper::vdup_n(static_cast<T>(scale), tag_type());
 
-            // Calculate square-root
-            vres = wrapper::vinv(wrapper::vinvsqrt(vres));
+                // Divide by scale
+                vres = wrapper::vmul(vres, scale_v);
 
-            // Store result
-            wrapper::vstore(reinterpret_cast<T *>(out.ptr()) + x_off, vres);
-        }
+                // Calculate square-root
+                vres = wrapper::vinv(wrapper::vinvsqrt(vres));
 
-        // Left-overs loop
-        for(; x_off < window_end_x; ++x_off)
-        {
-            T res(0);
+                // Store result
+                wrapper::vstore(reinterpret_cast<T *>(out.ptr()) + x_off, vres);
+            }
 
-            for(int z = pool_start_z; z < pool_end_z; ++z)
+            // Left-overs loop
+            for (; x_off < window_end_x; ++x_off)
             {
-                const uint8_t *in_ptr_z = in_ptr_n + (z + in_idx_depth) * w_stride;
-                for(int y = pool_start_y; y < pool_end_y; ++y)
+                T res(0);
+
+                for (int z = pool_start_z; z < pool_end_z; ++z)
                 {
-                    const uint8_t *in_ptr_y = in_ptr_z + (y + in_idx_height) * z_stride;
-                    for(int x = pool_start_x; x < pool_end_x; ++x)
+                    const uint8_t *in_ptr_z = in_ptr_n + (z + in_idx_depth) * w_stride;
+                    for (int y = pool_start_y; y < pool_end_y; ++y)
                     {
-                        const uint8_t *in_ptr_x = in_ptr_y + (x + in_idx_width) * y_stride;
-                        const T        data     = *(reinterpret_cast<const T *>(in_ptr_x) + x_off);
-                        res += data * data;
+                        const uint8_t *in_ptr_y = in_ptr_z + (y + in_idx_height) * z_stride;
+                        for (int x = pool_start_x; x < pool_end_x; ++x)
+                        {
+                            const uint8_t *in_ptr_x = in_ptr_y + (x + in_idx_width) * y_stride;
+                            const T        data     = *(reinterpret_cast<const T *>(in_ptr_x) + x_off);
+                            res += data * data;
+                        }
                     }
                 }
-            }
 
-            // Divide by scale
-            res *= scale;
+                // Divide by scale
+                res *= scale;
 
-            // Square root
-            res = std::sqrt(res);
+                // Square root
+                res = std::sqrt(res);
 
-            // Store result
-            *(reinterpret_cast<T *>(out.ptr()) + x_off) = res;
-        }
-    },
-    out);
+                // Store result
+                *(reinterpret_cast<T *>(out.ptr()) + x_off) = res;
+            }
+        },
+        out);
 }
 } // namespace
 
@@ -415,16 +439,19 @@ void poolingMxNxD_fp_neon_ndhwc(const ITensor *src, ITensor *dst0, Pooling3dLaye
     // Needed to handle loop left-over
     window_out.set(Window::DimX, Window::Dimension(0, 1, 1));
 
-    switch(pool_info.pool_type)
+    switch (pool_info.pool_type)
     {
         case PoolingType::MAX:
-            max_poolingMxNxD_fp_neon_ndhwc<T>(src, dst0, pool_info, window_out, window_start_x, window_end_x, window_step_x);
+            max_poolingMxNxD_fp_neon_ndhwc<T>(src, dst0, pool_info, window_out, window_start_x, window_end_x,
+                                              window_step_x);
             break;
         case PoolingType::AVG:
-            avg_poolingMxNxD_fp_neon_ndhwc<T>(src, dst0, pool_info, window_out, window_start_x, window_end_x, window_step_x);
+            avg_poolingMxNxD_fp_neon_ndhwc<T>(src, dst0, pool_info, window_out, window_start_x, window_end_x,
+                                              window_step_x);
             break;
         case PoolingType::L2:
-            l2_poolingMxNxD_fp_neon_ndhwc<T>(src, dst0, pool_info, window_out, window_start_x, window_end_x, window_step_x);
+            l2_poolingMxNxD_fp_neon_ndhwc<T>(src, dst0, pool_info, window_out, window_start_x, window_end_x,
+                                             window_step_x);
             break;
         default:
             ARM_COMPUTE_ERROR("Pool operation not supported");
@@ -440,7 +467,7 @@ void poolingMxNxD_q8_neon_ndhwc(const ITensor *src, ITensor *dst0, Pooling3dLaye
     // Needed to handle loop left-over
     window_out.set(Window::DimX, Window::Dimension(0, 1, 1));
 
-    switch(pool_info.pool_type)
+    switch (pool_info.pool_type)
     {
         case PoolingType::MAX:
             max_poolingMxNxD_q8_neon_ndhwc<T>(src, dst0, pool_info, window_out, window_step_x);