From 77e6c558963abfd36a632f6fe3235921f71a7a77 Mon Sep 17 00:00:00 2001 From: Pablo Tello Date: Tue, 4 Dec 2018 15:33:49 +0000 Subject: COMPMID-1634: Cleaned up NEPoolingLayer. Reduced the binary size of NEPoolingLayerKernel.o form 266k to 95K Change-Id: Ia1e79849430a5f34f5c1fa3fb15f23a61555a7f0 Reviewed-on: https://review.mlplatform.org/344 Tested-by: Arm Jenkins Reviewed-by: Michalis Spyrou Reviewed-by: Georgios Pinitas --- .../core/NEON/kernels/NEPoolingLayerKernel.h | 147 +-- src/core/NEON/kernels/NEPoolingLayerKernel.cpp | 1033 +++++++------------- 2 files changed, 454 insertions(+), 726 deletions(-) diff --git a/arm_compute/core/NEON/kernels/NEPoolingLayerKernel.h b/arm_compute/core/NEON/kernels/NEPoolingLayerKernel.h index 6c4c1db289..5f45a90cef 100644 --- a/arm_compute/core/NEON/kernels/NEPoolingLayerKernel.h +++ b/arm_compute/core/NEON/kernels/NEPoolingLayerKernel.h @@ -78,101 +78,116 @@ public: private: /** Function to perform 2x2 pooling. * - * @param[in] window_input Input region on which to execute the kernel. - * @param[in] window Output region on which to execute the kernel. + * @param[in] window_input Input region on which to execute the kernel. + * @param[in] window Output region on which to execute the kernel. + * @param[in] pooling_type Pooling operation to be computed. + * @param[in] exclude_padding Flag to specify exclusion of padding from the operation. */ - template - void pooling2_f32_nchw(const Window &window_input, const Window &window); - /** Function to perform 2x2 pooling for float16_t. + void pooling2_f32_nchw(const Window &window_input, const Window &window, PoolingType pooling_type, bool exclude_padding = false); + /** Function to perform MxN pooling for 32-bit floating point values. * - * @param[in] window_input Input region on which to execute the kernel. - * @param[in] window Output region on which to execute the kernel. + * @param[in] window_input Input region on which to execute the kernel. + * @param[in] window Output region on which to execute the kernel. + * @param[in] pooling_type Pooling operation to be computed. + * @param[in] exclude_padding Flag to specify exclusion of padding from the operation. */ - template - void pooling2_f16_nchw(const Window &window_input, const Window &window); - /** Function to perform 2x2 pooling for 8bit asymmetric fixed point. + void poolingMxN_f32_nchw(const Window &window_input, const Window &window, PoolingType pooling_type, bool exclude_padding = false); + /** Function to perform MxN pooling for 32-bit floating point values (NHWC). * - * @param[in] window_input Input region on which to execute the kernel. - * @param[in] window Output region on which to execute the kernel. + * @param[in] window_input Input region on which to execute the kernel. + * @param[in] window Output region on which to execute the kernel. + * @param[in] pooling_type Pooling operation to be computed. + * @param[in] exclude_padding Flag to specify exclusion of padding from the operation. */ - template - void pooling2_qasymm8_nchw(const Window &window_input, const Window &window); - /** Function to perform 3x3 pooling. + void poolingMxN_f32_nhwc(const Window &window_input, const Window &window, PoolingType pooling_type, bool exclude_padding = false); + /** Function to perform 7x7 pooling. * - * @param[in] window_input Input region on which to execute the kernel. - * @param[in] window Output region on which to execute the kernel. + * @param[in] window_input Input region on which to execute the kernel. 
+ * @param[in] window Output region on which to execute the kernel. + * @param[in] pooling_type Pooling operation to be computed. + * @param[in] exclude_padding Flag to specify exclusion of padding from the operation. */ - template - void pooling3_f32_nchw(const Window &window_input, const Window &window); + void pooling7_f32_nchw(const Window &window_input, const Window &window, PoolingType pooling_type, bool exclude_padding = false); /** Function to perform 3x3 pooling. * - * @param[in] window_input Input region on which to execute the kernel. - * @param[in] window Output region on which to execute the kernel. + * @param[in] window_input Input region on which to execute the kernel. + * @param[in] window Output region on which to execute the kernel. + * @param[in] pooling_type Pooling operation to be computed. + * @param[in] exclude_padding Flag to specify exclusion of padding from the operation. */ - template - void pooling3_f16_nchw(const Window &window_input, const Window &window); - /** Function to perform 3x3 pooling for 8bit quantized fixed point. + void pooling3_f32_nchw(const Window &window_input, const Window &window, PoolingType pooling_type, bool exclude_padding = false); + /** Function to perform 2x2 pooling for float16_t. * - * @param[in] window_input Input region on which to execute the kernel. - * @param[in] window Output region on which to execute the kernel. + * @param[in] window_input Input region on which to execute the kernel. + * @param[in] window Output region on which to execute the kernel. + * @param[in] pooling_type Pooling operation to be computed. + * @param[in] exclude_padding Flag to specify exclusion of padding from the operation. */ - template - void pooling3_qasymm8_nchw(const Window &window_input, const Window &window); - /** Function to perform 7x7 pooling. + void pooling2_f16_nchw(const Window &window_input, const Window &window, PoolingType pooling_type, bool exclude_padding = false); + /** Function to perform 3x3 pooling. * - * @param[in] window_input Input region on which to execute the kernel. - * @param[in] window Output region on which to execute the kernel. + * @param[in] window_input Input region on which to execute the kernel. + * @param[in] window Output region on which to execute the kernel. + * @param[in] pooling_type Pooling operation to be computed. + * @param[in] exclude_padding Flag to specify exclusion of padding from the operation. */ - template - void pooling7_f32_nchw(const Window &window_input, const Window &window); - /** Function to perform MxN pooling for 8-bit quantized. + void pooling3_f16_nchw(const Window &window_input, const Window &window, PoolingType pooling_type, bool exclude_padding = false); + /** Function to perform MxN pooling for 16-bit floating point values. * - * @param[in] window_input Input region on which to execute the kernel. - * @param[in] window Output region on which to execute the kernel. + * @param[in] window_input Input region on which to execute the kernel. + * @param[in] window Output region on which to execute the kernel. + * @param[in] pooling_type Pooling operation to be computed. + * @param[in] exclude_padding Flag to specify exclusion of padding from the operation. */ - template - void poolingMxN_qasymm8_nchw(const Window &window_input, const Window &window); - /** Function to perform MxN pooling for 8-bit quantized. 
(NHWC) + void poolingMxN_f16_nchw(const Window &window_input, const Window &window, PoolingType pooling_type, bool exclude_padding = false); + /** Function to perform MxN pooling for 16-bit floating point values. (NHWC) * - * @param[in] window_input Input region on which to execute the kernel. - * @param[in] window Output region on which to execute the kernel. + * @param[in] window_input Input region on which to execute the kernel. + * @param[in] window Output region on which to execute the kernel. + * @param[in] pooling_type Pooling operation to be computed. + * @param[in] exclude_padding Flag to specify exclusion of padding from the operation. */ - template - void poolingMxN_qasymm8_nhwc(const Window &window_input, const Window &window); - /** Function to perform MxN pooling for 16-bit floating point values. + void poolingMxN_f16_nhwc(const Window &window_input, const Window &window, PoolingType pooling_type, bool exclude_padding = false); + /** Function to perform 2x2 pooling for 8bit asymmetric fixed point. * - * @param[in] window_input Input region on which to execute the kernel. - * @param[in] window Output region on which to execute the kernel. + * @param[in] window_input Input region on which to execute the kernel. + * @param[in] window Output region on which to execute the kernel. + * @param[in] pooling_type Pooling operation to be computed. + * @param[in] exclude_padding Flag to specify exclusion of padding from the operation. */ - template - void poolingMxN_f16_nchw(const Window &window_input, const Window &window); - /** Function to perform MxN pooling for 16-bit floating point values. (NHWC) + void pooling2_qasymm8_nchw(const Window &window_input, const Window &window, PoolingType pooling_type, bool exclude_padding = false); + /** Function to perform 3x3 pooling for 8bit quantized fixed point. * - * @param[in] window_input Input region on which to execute the kernel. - * @param[in] window Output region on which to execute the kernel. + * @param[in] window_input Input region on which to execute the kernel. + * @param[in] window Output region on which to execute the kernel. + * @param[in] pooling_type Pooling operation to be computed. + * @param[in] exclude_padding Flag to specify exclusion of padding from the operation. */ - template - void poolingMxN_f16_nhwc(const Window &window_input, const Window &window); - /** Function to perform MxN pooling for 32-bit floating point values. + void pooling3_qasymm8_nchw(const Window &window_input, const Window &window, PoolingType pooling_type, bool exclude_padding = false); + /** Function to perform MxN pooling for 8-bit quantized. * - * @param[in] window_input Input region on which to execute the kernel. - * @param[in] window Output region on which to execute the kernel. + * @param[in] window_input Input region on which to execute the kernel. + * @param[in] window Output region on which to execute the kernel. + * @param[in] pooling_type Pooling operation to be computed. + * @param[in] exclude_padding Flag to specify exclusion of padding from the operation. */ - template - void poolingMxN_f32_nchw(const Window &window_input, const Window &window); - /** Function to perform MxN pooling for 32-bit floating point values (NHWC). + void poolingMxN_qasymm8_nchw(const Window &window_input, const Window &window, PoolingType pooling_type, bool exclude_padding = false); + /** Function to perform MxN pooling for 8-bit quantized. (NHWC) * - * @param[in] window_input Input region on which to execute the kernel. 
- * @param[in] window Output region on which to execute the kernel. + * @param[in] window_input Input region on which to execute the kernel. + * @param[in] window Output region on which to execute the kernel. + * @param[in] pooling_type Pooling operation to be computed. + * @param[in] exclude_padding Flag to specify exclusion of padding from the operation. */ - template - void poolingMxN_f32_nhwc(const Window &window_input, const Window &window); + void poolingMxN_qasymm8_nhwc(const Window &window_input, const Window &window, PoolingType pooling_type, bool exclude_padding = false); /** Common signature for all the specialised Pooling functions * - * @param[in] window_input Input region on which to execute the kernel. - * @param[in] window Output region on which to execute the kernel. + * @param[in] window_input Input region on which to execute the kernel. + * @param[in] window Output region on which to execute the kernel. + * @param[in] pooling_type Pooling operation to be computed. + * @param[in] exclude_padding Flag to specify exclusion of padding from the operation. */ - using PoolingFunction = void (NEPoolingLayerKernel::*)(const Window &window_input, const Window &window); + using PoolingFunction = void (NEPoolingLayerKernel::*)(const Window &window_input, const Window &window, PoolingType pooling_type, bool exclude_padding); private: PoolingFunction _func; diff --git a/src/core/NEON/kernels/NEPoolingLayerKernel.cpp b/src/core/NEON/kernels/NEPoolingLayerKernel.cpp index 310560b48a..244741c947 100644 --- a/src/core/NEON/kernels/NEPoolingLayerKernel.cpp +++ b/src/core/NEON/kernels/NEPoolingLayerKernel.cpp @@ -52,8 +52,7 @@ using namespace misc::shape_calculator; namespace { -template -inline float calculate_avg_scale(const Coordinates &id, const int pool_size_x, const int pool_size_y, const int upper_bound_w, const int upper_bound_h, +inline float calculate_avg_scale(bool exclude_padding, DataLayout data_layout, const Coordinates &id, const int pool_size_x, const int pool_size_y, const int upper_bound_w, const int upper_bound_h, const int pad_x, const int pad_y, const int stride_x, const int stride_y) { const unsigned int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); @@ -72,8 +71,7 @@ inline float calculate_avg_scale(const Coordinates &id, const int pool_size_x, c return 1.f / ((end_y - start_y) * (end_x - start_x)); } -template -inline void scale_vector_s16x8(uint16x8_t &v, const Coordinates &id, int id_offset, int step, +inline void scale_vector_s16x8(bool exclude_padding, uint16x8_t &v, const Coordinates &id, int id_offset, int step, const int pool_size, const int upper_bound_w, const int upper_bound_h, const int pad_x, const int pad_y, const int stride_x, const int stride_y) { @@ -336,13 +334,9 @@ void NEPoolingLayerKernel::configure(const ITensor *input, ITensor *output, cons { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); - const PoolingType pool_type = pool_info.pool_type(); const PadStrideInfo pad_stride_info = pool_info.pad_stride_info(); - const bool exclude_padding = pool_info.exclude_padding(); const bool is_global_pooling = pool_info.is_global_pooling(); const int pool_stride_x = pad_stride_info.stride().first; - unsigned int pool_size_x = 0; - unsigned int pool_size_y = 0; // Get data layout const DataLayout data_layout = input->info()->data_layout(); @@ -350,18 +344,19 @@ void NEPoolingLayerKernel::configure(const ITensor *input, ITensor *output, cons const int idx_height = get_data_layout_dimension_index(data_layout, 
DataLayoutDimension::HEIGHT); // Update pool size in case of global pooling - pool_size_x = is_global_pooling ? input->info()->dimension(idx_width) : pool_info.pool_size().width; - pool_size_y = is_global_pooling ? input->info()->dimension(idx_height) : pool_info.pool_size().height; + const Size2D pool_size( + is_global_pooling ? input->info()->dimension(idx_width) : pool_info.pool_size().width, + is_global_pooling ? input->info()->dimension(idx_height) : pool_info.pool_size().height); // Validate pool info before calling scaled_dimensions - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_pool_info(pool_size_x, pool_size_y)); + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_pool_info(pool_size.x(), pool_size.y())); // Check output dimensions unsigned int pooled_w, pooled_h; std::tie(pooled_w, pooled_h) = scaled_dimensions(input->info()->dimension(idx_width), input->info()->dimension(idx_height), - pool_size_x, - pool_size_y, + pool_size.x(), + pool_size.y(), pad_stride_info); // Perform validation step @@ -371,7 +366,7 @@ void NEPoolingLayerKernel::configure(const ITensor *input, ITensor *output, cons _input = input; _output = output; _pool_info = pool_info; - _is_square = (pool_size_x == pool_size_y); + _is_square = (pool_size.x() == pool_size.y()); // Get data type const DataType data_type = input->info()->data_type(); @@ -379,88 +374,37 @@ void NEPoolingLayerKernel::configure(const ITensor *input, ITensor *output, cons if(data_type == DataType::QASYMM8) { - if(pool_size_x == 2 && pool_stride_x < 3 && _is_square) + if(pool_size.x() == 2 && pool_stride_x < 3 && _is_square) { - switch(pool_type) + if(is_nchw) { - case PoolingType::AVG: - if(is_nchw) - { - _func = (exclude_padding) ? &NEPoolingLayerKernel::pooling2_qasymm8_nchw : &NEPoolingLayerKernel::pooling2_qasymm8_nchw; - } - else - { - _func = (exclude_padding) ? &NEPoolingLayerKernel::poolingMxN_qasymm8_nhwc : &NEPoolingLayerKernel::poolingMxN_qasymm8_nhwc; - } - break; - case PoolingType::MAX: - if(is_nchw) - { - _func = &NEPoolingLayerKernel::pooling2_qasymm8_nchw; - } - else - { - _func = &NEPoolingLayerKernel::poolingMxN_qasymm8_nhwc; - } - break; - default: - ARM_COMPUTE_ERROR("Unsupported pooling type!"); + _func = &NEPoolingLayerKernel::pooling2_qasymm8_nchw; + } + else + { + _func = &NEPoolingLayerKernel::poolingMxN_qasymm8_nhwc; } } - else if(pool_size_x == 3 && pool_stride_x < 3 && _is_square) + else if(pool_size.x() == 3 && pool_stride_x < 3 && _is_square) { - switch(pool_type) + if(is_nchw) { - case PoolingType::AVG: - if(is_nchw) - { - _func = (exclude_padding) ? &NEPoolingLayerKernel::pooling3_qasymm8_nchw : &NEPoolingLayerKernel::pooling3_qasymm8_nchw; - } - else - { - _func = (exclude_padding) ? &NEPoolingLayerKernel::poolingMxN_qasymm8_nhwc : &NEPoolingLayerKernel::poolingMxN_qasymm8_nhwc; - } - break; - case PoolingType::MAX: - if(is_nchw) - { - _func = &NEPoolingLayerKernel::pooling3_qasymm8_nchw; - } - else - { - _func = &NEPoolingLayerKernel::poolingMxN_qasymm8_nhwc; - } - break; - default: - ARM_COMPUTE_ERROR("Unsupported pooling type!"); + _func = &NEPoolingLayerKernel::pooling3_qasymm8_nchw; + } + else + { + _func = &NEPoolingLayerKernel::poolingMxN_qasymm8_nhwc; } } else { - switch(pool_type) + if(is_nchw) { - case PoolingType::AVG: - if(is_nchw) - { - _func = (exclude_padding) ? &NEPoolingLayerKernel::poolingMxN_qasymm8_nchw : &NEPoolingLayerKernel::poolingMxN_qasymm8_nchw; - } - else - { - _func = (exclude_padding) ? 
&NEPoolingLayerKernel::poolingMxN_qasymm8_nhwc : &NEPoolingLayerKernel::poolingMxN_qasymm8_nhwc; - } - break; - case PoolingType::MAX: - if(is_nchw) - { - _func = &NEPoolingLayerKernel::poolingMxN_qasymm8_nchw; - } - else - { - _func = &NEPoolingLayerKernel::poolingMxN_qasymm8_nhwc; - } - break; - default: - ARM_COMPUTE_ERROR("Unsupported pooling type!"); + _func = &NEPoolingLayerKernel::poolingMxN_qasymm8_nchw; + } + else + { + _func = &NEPoolingLayerKernel::poolingMxN_qasymm8_nhwc; } } } @@ -468,157 +412,56 @@ void NEPoolingLayerKernel::configure(const ITensor *input, ITensor *output, cons { if(_is_square) { - switch(pool_size_x) + switch(pool_size.x()) { case 2: - switch(pool_type) - { - case PoolingType::AVG: - if(is_nchw) - { - _func = (exclude_padding) ? &NEPoolingLayerKernel::pooling2_f16_nchw : &NEPoolingLayerKernel::pooling2_f16_nchw; - } - else - { - _func = (exclude_padding) ? &NEPoolingLayerKernel::poolingMxN_f16_nhwc : &NEPoolingLayerKernel::poolingMxN_f16_nhwc; - } - break; - case PoolingType::L2: - if(is_nchw) - { - _func = (exclude_padding) ? &NEPoolingLayerKernel::pooling2_f16_nchw : &NEPoolingLayerKernel::pooling2_f16_nchw; - } - else - { - _func = (exclude_padding) ? &NEPoolingLayerKernel::poolingMxN_f16_nhwc : &NEPoolingLayerKernel::poolingMxN_f16_nhwc; - } - break; - case PoolingType::MAX: - if(is_nchw) - { - _func = &NEPoolingLayerKernel::pooling2_f16_nchw; - } - else - { - _func = &NEPoolingLayerKernel::poolingMxN_f16_nhwc; - } - break; - default: - ARM_COMPUTE_ERROR("Unsupported pooling type!"); - } - break; - case 3: - switch(pool_type) - { - case PoolingType::AVG: - if(is_nchw) - { - _func = (exclude_padding) ? &NEPoolingLayerKernel::pooling3_f16_nchw : &NEPoolingLayerKernel::pooling3_f16_nchw; - } - else - { - _func = (exclude_padding) ? &NEPoolingLayerKernel::poolingMxN_f16_nhwc : &NEPoolingLayerKernel::poolingMxN_f16_nhwc; - } - break; - case PoolingType::L2: - if(is_nchw) - { - _func = (exclude_padding) ? &NEPoolingLayerKernel::pooling3_f16_nchw : &NEPoolingLayerKernel::pooling3_f16_nchw; - } - else - { - _func = (exclude_padding) ? &NEPoolingLayerKernel::poolingMxN_f16_nhwc : &NEPoolingLayerKernel::poolingMxN_f16_nhwc; - } - break; - case PoolingType::MAX: - if(is_nchw) - { - _func = &NEPoolingLayerKernel::pooling3_f16_nchw; - } - else - { - _func = &NEPoolingLayerKernel::poolingMxN_f16_nhwc; - } - break; - default: - ARM_COMPUTE_ERROR("Unsupported pooling type!"); - } - break; - default: - switch(pool_type) - { - case PoolingType::AVG: - if(is_nchw) - { - _func = (exclude_padding) ? &NEPoolingLayerKernel::poolingMxN_f16_nchw : &NEPoolingLayerKernel::poolingMxN_f16_nchw; - } - else - { - _func = (exclude_padding) ? &NEPoolingLayerKernel::poolingMxN_f16_nhwc : &NEPoolingLayerKernel::poolingMxN_f16_nhwc; - } - break; - case PoolingType::L2: - if(is_nchw) - { - _func = (exclude_padding) ? &NEPoolingLayerKernel::poolingMxN_f16_nchw : &NEPoolingLayerKernel::poolingMxN_f16_nchw; - } - else - { - _func = (exclude_padding) ? &NEPoolingLayerKernel::poolingMxN_f16_nhwc : &NEPoolingLayerKernel::poolingMxN_f16_nhwc; - } - break; - case PoolingType::MAX: - if(is_nchw) - { - _func = &NEPoolingLayerKernel::poolingMxN_f16_nchw; - } - else - { - _func = &NEPoolingLayerKernel::poolingMxN_f16_nhwc; - } - break; - default: - ARM_COMPUTE_ERROR("Unsupported pooling type!"); - } - break; - } - } - else - { - switch(pool_type) - { - case PoolingType::AVG: + { if(is_nchw) { - _func = (exclude_padding) ? 
&NEPoolingLayerKernel::poolingMxN_f16_nchw : &NEPoolingLayerKernel::poolingMxN_f16_nchw; + _func = &NEPoolingLayerKernel::pooling2_f16_nchw; } else { - _func = (exclude_padding) ? &NEPoolingLayerKernel::poolingMxN_f16_nhwc : &NEPoolingLayerKernel::poolingMxN_f16_nhwc; + _func = &NEPoolingLayerKernel::poolingMxN_f16_nhwc; } - break; - case PoolingType::L2: + } + break; + case 3: + { if(is_nchw) { - _func = (exclude_padding) ? &NEPoolingLayerKernel::poolingMxN_f16_nchw : &NEPoolingLayerKernel::poolingMxN_f16_nchw; + _func = &NEPoolingLayerKernel::pooling3_f16_nchw; } else { - _func = (exclude_padding) ? &NEPoolingLayerKernel::poolingMxN_f16_nhwc : &NEPoolingLayerKernel::poolingMxN_f16_nhwc; + _func = &NEPoolingLayerKernel::poolingMxN_f16_nhwc; } - break; - case PoolingType::MAX: + } + break; + default: + { if(is_nchw) { - _func = &NEPoolingLayerKernel::poolingMxN_f16_nchw; + _func = &NEPoolingLayerKernel::poolingMxN_f16_nchw; } else { - _func = &NEPoolingLayerKernel::poolingMxN_f16_nhwc; + _func = &NEPoolingLayerKernel::poolingMxN_f16_nhwc; } break; - default: - ARM_COMPUTE_ERROR("Unsupported pooling type!"); + } + break; + } + } + else + { + if(is_nchw) + { + _func = &NEPoolingLayerKernel::poolingMxN_f16_nchw; + } + else + { + _func = &NEPoolingLayerKernel::poolingMxN_f16_nhwc; } } } @@ -626,206 +469,78 @@ void NEPoolingLayerKernel::configure(const ITensor *input, ITensor *output, cons { if(_is_square) { - switch(pool_size_x) + switch(pool_size.x()) { case 2: - switch(pool_type) - { - case PoolingType::AVG: - if(is_nchw) - { - _func = (exclude_padding) ? &NEPoolingLayerKernel::pooling2_f32_nchw : &NEPoolingLayerKernel::pooling2_f32_nchw; - } - else - { - _func = (exclude_padding) ? &NEPoolingLayerKernel::poolingMxN_f32_nhwc : &NEPoolingLayerKernel::poolingMxN_f32_nhwc; - } - break; - case PoolingType::L2: - if(is_nchw) - { - _func = (exclude_padding) ? &NEPoolingLayerKernel::pooling2_f32_nchw : &NEPoolingLayerKernel::pooling2_f32_nchw; - } - else - { - _func = (exclude_padding) ? &NEPoolingLayerKernel::poolingMxN_f32_nhwc : &NEPoolingLayerKernel::poolingMxN_f32_nhwc; - } - break; - case PoolingType::MAX: - if(is_nchw) - { - _func = &NEPoolingLayerKernel::pooling2_f32_nchw; - } - else - { - _func = &NEPoolingLayerKernel::poolingMxN_f32_nhwc; - } - break; - default: - ARM_COMPUTE_ERROR("Unsupported pooling type!"); - } - break; - case 3: - switch(pool_type) - { - case PoolingType::AVG: - if(is_nchw) - { - _func = (exclude_padding) ? &NEPoolingLayerKernel::pooling3_f32_nchw : &NEPoolingLayerKernel::pooling3_f32_nchw; - } - else - { - _func = (exclude_padding) ? &NEPoolingLayerKernel::poolingMxN_f32_nhwc : &NEPoolingLayerKernel::poolingMxN_f32_nhwc; - } - break; - case PoolingType::L2: - if(is_nchw) - { - _func = (exclude_padding) ? &NEPoolingLayerKernel::pooling3_f32_nchw : &NEPoolingLayerKernel::pooling3_f32_nchw; - } - else - { - _func = (exclude_padding) ? &NEPoolingLayerKernel::poolingMxN_f32_nhwc : &NEPoolingLayerKernel::poolingMxN_f32_nhwc; - } - break; - case PoolingType::MAX: - if(is_nchw) - { - _func = &NEPoolingLayerKernel::pooling3_f32_nchw; - } - else - { - _func = &NEPoolingLayerKernel::poolingMxN_f32_nhwc; - } - break; - default: - ARM_COMPUTE_ERROR("Unsupported pooling type!"); - } - break; - case 7: - switch(pool_type) + { + if(is_nchw) { - case PoolingType::AVG: - if(is_nchw) - { - _func = (exclude_padding) ? &NEPoolingLayerKernel::pooling7_f32_nchw : &NEPoolingLayerKernel::pooling7_f32_nchw; - } - else - { - _func = (exclude_padding) ? 
&NEPoolingLayerKernel::poolingMxN_f32_nhwc : &NEPoolingLayerKernel::poolingMxN_f32_nhwc; - } - break; - case PoolingType::L2: - if(is_nchw) - { - _func = (exclude_padding) ? &NEPoolingLayerKernel::pooling7_f32_nchw : &NEPoolingLayerKernel::pooling7_f32_nchw; - } - else - { - _func = (exclude_padding) ? &NEPoolingLayerKernel::poolingMxN_f32_nhwc : &NEPoolingLayerKernel::poolingMxN_f32_nhwc; - } - break; - case PoolingType::MAX: - if(is_nchw) - { - _func = &NEPoolingLayerKernel::pooling7_f32_nchw; - } - else - { - _func = &NEPoolingLayerKernel::poolingMxN_f32_nhwc; - } - break; - default: - ARM_COMPUTE_ERROR("Unsupported pooling type!"); + _func = &NEPoolingLayerKernel::pooling2_f32_nchw; } - break; - default: - switch(pool_type) + else { - case PoolingType::AVG: - if(is_nchw) - { - _func = (exclude_padding) ? &NEPoolingLayerKernel::poolingMxN_f32_nchw : &NEPoolingLayerKernel::poolingMxN_f32_nchw; - } - else - { - _func = (exclude_padding) ? &NEPoolingLayerKernel::poolingMxN_f32_nhwc : &NEPoolingLayerKernel::poolingMxN_f32_nhwc; - } - break; - case PoolingType::L2: - if(is_nchw) - { - _func = (exclude_padding) ? &NEPoolingLayerKernel::poolingMxN_f32_nchw : &NEPoolingLayerKernel::poolingMxN_f32_nchw; - } - else - { - _func = (exclude_padding) ? &NEPoolingLayerKernel::poolingMxN_f32_nhwc : &NEPoolingLayerKernel::poolingMxN_f32_nhwc; - } - break; - case PoolingType::MAX: - if(is_nchw) - { - _func = &NEPoolingLayerKernel::poolingMxN_f32_nchw; - } - else - { - _func = &NEPoolingLayerKernel::poolingMxN_f32_nhwc; - } - break; - default: - ARM_COMPUTE_ERROR("Unsupported pooling type!"); + _func = &NEPoolingLayerKernel::poolingMxN_f32_nhwc; } break; - } - } - else - { - switch(pool_type) - { - case PoolingType::AVG: + } + case 3: + { if(is_nchw) { - _func = (exclude_padding) ? &NEPoolingLayerKernel::poolingMxN_f32_nchw : &NEPoolingLayerKernel::poolingMxN_f32_nchw; + _func = &NEPoolingLayerKernel::pooling3_f32_nchw; } else { - _func = (exclude_padding) ? &NEPoolingLayerKernel::poolingMxN_f32_nhwc : &NEPoolingLayerKernel::poolingMxN_f32_nhwc; + _func = &NEPoolingLayerKernel::poolingMxN_f32_nhwc; } break; - case PoolingType::L2: + } + case 7: + { if(is_nchw) { - _func = (exclude_padding) ? &NEPoolingLayerKernel::poolingMxN_f32_nchw : &NEPoolingLayerKernel::poolingMxN_f32_nchw; + _func = &NEPoolingLayerKernel::pooling7_f32_nchw; } else { - _func = (exclude_padding) ? 
&NEPoolingLayerKernel::poolingMxN_f32_nhwc : &NEPoolingLayerKernel::poolingMxN_f32_nhwc; + _func = &NEPoolingLayerKernel::poolingMxN_f32_nhwc; } break; - case PoolingType::MAX: + } + default: + { if(is_nchw) { - _func = &NEPoolingLayerKernel::poolingMxN_f32_nchw; + _func = &NEPoolingLayerKernel::poolingMxN_f32_nchw; } else { - _func = &NEPoolingLayerKernel::poolingMxN_f32_nhwc; + _func = &NEPoolingLayerKernel::poolingMxN_f32_nhwc; } break; - default: - ARM_COMPUTE_ERROR("Unsupported pooling type!"); + } + } + } + else + { + if(is_nchw) + { + _func = &NEPoolingLayerKernel::poolingMxN_f32_nchw; + } + else + { + _func = &NEPoolingLayerKernel::poolingMxN_f32_nhwc; } } } // Configure kernel window - auto win_config = validate_and_configure_window(input->info(), output->info(), pool_info, _num_elems_processed_per_iteration, _border_size, pooled_w, pooled_h, pool_size_x, pool_size_y); + auto win_config = validate_and_configure_window(input->info(), output->info(), pool_info, _num_elems_processed_per_iteration, _border_size, pooled_w, pooled_h, pool_size.x(), pool_size.y()); ARM_COMPUTE_ERROR_THROW_ON(win_config.first); INEKernel::configure(win_config.second); } -template -void NEPoolingLayerKernel::pooling2_qasymm8_nchw(const Window &window_input, const Window &window) +void NEPoolingLayerKernel::pooling2_qasymm8_nchw(const Window &window_input, const Window &window, PoolingType pooling_type, bool exclude_padding) { Iterator input(_input, window_input); Iterator output(_output, window); @@ -879,9 +594,9 @@ void NEPoolingLayerKernel::pooling2_qasymm8_nchw(const Window &window_input, con uint16x8_t res_lower = vcombine_u16(vpsum.val[0], vpsum.val[1]); // Scale lower result - scale_vector_s16x8(res_lower, id, 0, scale_step_x, - pool_size, upper_bound_w, upper_bound_h, - pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y); + scale_vector_s16x8(exclude_padding, res_lower, id, 0, scale_step_x, + pool_size, upper_bound_w, upper_bound_h, + pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y); lower_res = vmovn_u16(res_lower); // Compute upper result for stride_x == 1 @@ -907,9 +622,9 @@ void NEPoolingLayerKernel::pooling2_qasymm8_nchw(const Window &window_input, con uint16x8_t res_upper = vcombine_u16(vpsum_shifted.val[0], vpsum_shifted.val[1]); // Scale lower result - scale_vector_s16x8(res_upper, id, 1, 2, - pool_size, upper_bound_w, upper_bound_h, - pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y); + scale_vector_s16x8(exclude_padding, res_upper, id, 1, 2, + pool_size, upper_bound_w, upper_bound_h, + pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y); upper_res = vmovn_u16(res_upper); } } @@ -938,9 +653,10 @@ void NEPoolingLayerKernel::pooling2_qasymm8_nchw(const Window &window_input, con input, output); } -template -void NEPoolingLayerKernel::pooling3_f16_nchw(const Window &window_input, const Window &window) +void NEPoolingLayerKernel::pooling3_f16_nchw(const Window &window_input, const Window &window, PoolingType pooling_type, bool exclude_padding) { + ARM_COMPUTE_UNUSED(pooling_type); + ARM_COMPUTE_UNUSED(exclude_padding); #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC Iterator input(_input, window_input); Iterator output(_output, window); @@ -978,7 +694,7 @@ void NEPoolingLayerKernel::pooling3_f16_nchw(const Window &window_input, const W if(pooling_type != PoolingType::MAX) { // Calculate scale - const float scale = calculate_avg_scale(id, pool_size, pool_size, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y); + const float scale 
= calculate_avg_scale(exclude_padding, DataLayout::NCHW, id, pool_size, pool_size, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y); const float16x4_t scale_v = vdup_n_f16(scale); // Perform pooling const float16x4_t sum_data = vadd_f16(vadd_f16(top_data, bottom_data), middle_data); @@ -1008,9 +724,10 @@ void NEPoolingLayerKernel::pooling3_f16_nchw(const Window &window_input, const W #endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ } -template -void NEPoolingLayerKernel::pooling2_f16_nchw(const Window &window_input, const Window &window) +void NEPoolingLayerKernel::pooling2_f16_nchw(const Window &window_input, const Window &window, PoolingType pooling_type, bool exclude_padding) { + ARM_COMPUTE_UNUSED(pooling_type); + ARM_COMPUTE_UNUSED(exclude_padding); #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC Iterator input(_input, window_input); Iterator output(_output, window); @@ -1042,7 +759,7 @@ void NEPoolingLayerKernel::pooling2_f16_nchw(const Window &window_input, const W if(pooling_type != PoolingType::MAX) { - const float scale = calculate_avg_scale(id, pool_size, pool_size, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y); + const float scale = calculate_avg_scale(exclude_padding, DataLayout::NCHW, id, pool_size, pool_size, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y); const float16x4_t scale_v = vdup_n_f16(scale); const float16x4_t sum_data = vadd_f16(top_data, bottom_data); @@ -1071,71 +788,7 @@ void NEPoolingLayerKernel::pooling2_f16_nchw(const Window &window_input, const W #endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ } -template -void NEPoolingLayerKernel::pooling2_f32_nchw(const Window &window_input, const Window &window) -{ - Iterator input(_input, window_input); - Iterator output(_output, window); - - constexpr int pool_size = 2; - const int pool_pad_right = _pool_info.pad_stride_info().pad_right(); - const int pool_pad_top = _pool_info.pad_stride_info().pad_top(); - const int pool_pad_left = _pool_info.pad_stride_info().pad_left(); - const int pool_pad_bottom = _pool_info.pad_stride_info().pad_bottom(); - int pool_stride_x = 0; - int pool_stride_y = 0; - std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info().stride(); - const int upper_bound_w = _input->info()->dimension(0) + (exclude_padding ? 0 : pool_pad_right); - const int upper_bound_h = _input->info()->dimension(1) + (exclude_padding ? 
0 : pool_pad_bottom); - - const uint8_t *const input_top_ptr = _input->ptr_to_element(Coordinates(-static_cast(pool_pad_left), -static_cast(pool_pad_top))); - const uint8_t *const input_bottom_ptr = _input->ptr_to_element(Coordinates(-static_cast(pool_pad_left), -static_cast(pool_pad_top) + 1)); - - execute_window_loop(window, [&](const Coordinates & id) - { - float32x2_t top_data = vld1_f32(reinterpret_cast(input_top_ptr + input.offset())); - float32x2_t bottom_data = vld1_f32(reinterpret_cast(input_bottom_ptr + input.offset())); - float32x2_t res = {}; - float final_res = 0; - - // Get power of 2 in case of l2 pooling - if(pooling_type == PoolingType::L2) - { - top_data = vmul_f32(top_data, top_data); - bottom_data = vmul_f32(bottom_data, bottom_data); - } - - if(pooling_type != PoolingType::MAX) - { - // Calculate scale - float scale = calculate_avg_scale(id, pool_size, pool_size, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y); - const float32x2_t scale_v = vdup_n_f32(scale); - - // Perform pooling - const float32x2_t sum_data = vadd_f32(top_data, bottom_data); - res = vmul_f32(vpadd_f32(sum_data, sum_data), scale_v); - } - else - { - const float32x2_t max_data = vmax_f32(top_data, bottom_data); - res = vpmax_f32(max_data, max_data); - } - final_res = vget_lane_f32(res, 0); - - // Calculate square-root in case of l2 pooling - if(pooling_type == PoolingType::L2) - { - final_res = sqrt(final_res); - } - - // Store result - *(reinterpret_cast(output.ptr())) = final_res; - }, - input, output); -} - -template -void NEPoolingLayerKernel::pooling3_qasymm8_nchw(const Window &window_input, const Window &window) +void NEPoolingLayerKernel::pooling3_qasymm8_nchw(const Window &window_input, const Window &window, PoolingType pooling_type, bool exclude_padding) { Iterator input(_input, window_input); Iterator output(_output, window); @@ -1212,202 +865,52 @@ void NEPoolingLayerKernel::pooling3_qasymm8_nchw(const Window &window_input, con vgetq_lane_u16(final_sum.val[1], 6), }; - scale_vector_s16x8(res, id, 0, 1, - pool_size, upper_bound_w, upper_bound_h, - pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y); - vst1_u8(reinterpret_cast(output.ptr()), vmovn_u16(res)); - } - else - { - // Scale lower result - scale_vector_s16x8(final_sum.val[0], id, 0, 1, - pool_size, upper_bound_w, upper_bound_h, - pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y); - // Scale lower result - scale_vector_s16x8(final_sum.val[1], id, 8, 1, - pool_size, upper_bound_w, upper_bound_h, - pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y); - const uint8x16_t res = vcombine_u8(vmovn_u16(final_sum.val[0]), vmovn_u16(final_sum.val[1])); - vst1q_u8(reinterpret_cast(output.ptr()), res); - } - } - else - { - const uint8x16_t max_data = vmaxq_u8(vmaxq_u8(top_data, bottom_data), middle_data); - const uint8x16_t max_data_shift1 = vextq_u8(max_data, max_data, 1); - const uint8x16_t max_data_shift2 = vextq_u8(max_data, max_data, 2); - const uint8x16_t final_max = vmaxq_u8(vmaxq_u8(max_data, max_data_shift1), max_data_shift2); - - if(pool_stride_x == 2) - { - const uint8x8x2_t table = { { vget_low_u8(final_max), vget_high_u8(final_max) } }; - static const uint8x8_t lookup_val = { 0, 2, 4, 6, 8, 10, 12, 14 }; - const uint8x8_t res = vtbl2_u8(table, lookup_val); - vst1_u8(reinterpret_cast(output.ptr()), res); - } - else - { - vst1q_u8(reinterpret_cast(output.ptr()), final_max); - } - } - }, - input, output); -} - -template -void NEPoolingLayerKernel::pooling3_f32_nchw(const 
Window &window_input, const Window &window) -{ - Iterator input(_input, window_input); - Iterator output(_output, window); - - constexpr const int pool_size = 3; - const int pool_pad_right = _pool_info.pad_stride_info().pad_right(); - const int pool_pad_top = _pool_info.pad_stride_info().pad_top(); - const int pool_pad_left = _pool_info.pad_stride_info().pad_left(); - const int pool_pad_bottom = _pool_info.pad_stride_info().pad_bottom(); - int pool_stride_x = 0; - int pool_stride_y = 0; - std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info().stride(); - const int upper_bound_w = _input->info()->dimension(0) + (exclude_padding ? 0 : pool_pad_right); - const int upper_bound_h = _input->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_bottom); - - const uint8_t *const input_top_ptr = _input->ptr_to_element(Coordinates(-static_cast(pool_pad_left), -static_cast(pool_pad_top))); - const uint8_t *const input_middle_ptr = _input->ptr_to_element(Coordinates(-static_cast(pool_pad_left), -static_cast(pool_pad_top) + 1)); - const uint8_t *const input_bottom_ptr = _input->ptr_to_element(Coordinates(-static_cast(pool_pad_left), -static_cast(pool_pad_top) + 2)); - - execute_window_loop(window, [&](const Coordinates & id) - { - float32x4_t top_data = vld1q_f32(reinterpret_cast(input_top_ptr + input.offset())); - float32x4_t middle_data = vld1q_f32(reinterpret_cast(input_middle_ptr + input.offset())); - float32x4_t bottom_data = vld1q_f32(reinterpret_cast(input_bottom_ptr + input.offset())); - float32x2_t res = {}; - float final_res = 0; - - // Get power of 2 in case of l2 pooling - if(pooling_type == PoolingType::L2) - { - top_data = vmulq_f32(top_data, top_data); - middle_data = vmulq_f32(middle_data, middle_data); - bottom_data = vmulq_f32(bottom_data, bottom_data); - } - - if(pooling_type != PoolingType::MAX) - { - // Calculate scale - float scale = calculate_avg_scale(id, pool_size, pool_size, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y); - const float32x2_t scale_v = vdup_n_f32(scale); - - // Perform pooling - const float32x4_t sum_data = vaddq_f32(vaddq_f32(top_data, bottom_data), middle_data); - res = vpadd_f32(vget_high_f32(vsetq_lane_f32(0.f, sum_data, 3)), vget_low_f32(sum_data)); - res = vmul_f32(vpadd_f32(res, res), scale_v); - } - else - { - const float32x4_t max_data = vmaxq_f32(vmaxq_f32(top_data, bottom_data), middle_data); - res = vpmax_f32(vget_high_f32(vsetq_lane_f32(-std::numeric_limits::max(), max_data, 3)), vget_low_f32(max_data)); - res = vpmax_f32(res, res); - } - final_res = vget_lane_f32(res, 0); - - // Calculate square-root in case of l2 pooling - if(pooling_type == PoolingType::L2) - { - final_res = sqrt(final_res); - } - - // Store result - *(reinterpret_cast(output.ptr())) = final_res; - }, - input, output); -} - -template -void NEPoolingLayerKernel::pooling7_f32_nchw(const Window &window_input, const Window &window) -{ - Iterator input(_input, window_input); - Iterator output(_output, window); - - constexpr const int pool_size = 7; - const int pool_pad_right = _pool_info.pad_stride_info().pad_right(); - const int pool_pad_top = _pool_info.pad_stride_info().pad_top(); - const int pool_pad_left = _pool_info.pad_stride_info().pad_left(); - const int pool_pad_bottom = _pool_info.pad_stride_info().pad_bottom(); - int pool_stride_x = 0; - int pool_stride_y = 0; - std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info().stride(); - const int upper_bound_w = _input->info()->dimension(0) + (exclude_padding 
? 0 : pool_pad_right); - const int upper_bound_h = _input->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_bottom); - - std::array input_ptrs{ {} }; - for(int i = 0; i < pool_size; ++i) - { - input_ptrs[i] = _input->ptr_to_element(Coordinates(-static_cast(pool_pad_left), -static_cast(pool_pad_top) + i)); - } - - execute_window_loop(window, [&](const Coordinates & id) - { - float32x2_t res = {}; - float final_res = 0.f; - if(pooling_type != PoolingType::MAX) - { - // Calculate scale - float scale = calculate_avg_scale(id, pool_size, pool_size, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y); - const float32x2_t scale_v = vdup_n_f32(scale); - - // Perform pooling - float32x4x2_t data = vld2q_f32(reinterpret_cast(input_ptrs[0] + input.offset())); - // Get power of 2 in case of l2 pooling - if(pooling_type == PoolingType::L2) - { - data.val[0] = vmulq_f32(data.val[0], data.val[0]); - data.val[1] = vmulq_f32(data.val[1], data.val[1]); + scale_vector_s16x8(exclude_padding, res, id, 0, 1, + pool_size, upper_bound_w, upper_bound_h, + pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y); + vst1_u8(reinterpret_cast(output.ptr()), vmovn_u16(res)); } - float32x4_t sum_data = vaddq_f32(data.val[0], vsetq_lane_f32(0.f, data.val[1], 3)); - for(int i = 1; i < pool_size; ++i) + else { - data = vld2q_f32(reinterpret_cast(input_ptrs[i] + input.offset())); - // Get power of 2 in case of l2 pooling - if(pooling_type == PoolingType::L2) - { - data.val[0] = vmulq_f32(data.val[0], data.val[0]); - data.val[1] = vmulq_f32(data.val[1], data.val[1]); - } - sum_data = vaddq_f32(sum_data, data.val[0]); - sum_data = vaddq_f32(sum_data, vsetq_lane_f32(0.f, data.val[1], 3)); + // Scale lower result + scale_vector_s16x8(exclude_padding, final_sum.val[0], id, 0, 1, + pool_size, upper_bound_w, upper_bound_h, + pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y); + // Scale lower result + scale_vector_s16x8(exclude_padding, final_sum.val[1], id, 8, 1, + pool_size, upper_bound_w, upper_bound_h, + pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y); + const uint8x16_t res = vcombine_u8(vmovn_u16(final_sum.val[0]), vmovn_u16(final_sum.val[1])); + vst1q_u8(reinterpret_cast(output.ptr()), res); } - res = vpadd_f32(vget_high_f32(sum_data), vget_low_f32(sum_data)); - res = vmul_f32(vpadd_f32(res, res), scale_v); } else { - float32x4x2_t max_data = vld2q_f32(reinterpret_cast(input_ptrs[0] + input.offset())); - for(int i = 1; i < pool_size; ++i) + const uint8x16_t max_data = vmaxq_u8(vmaxq_u8(top_data, bottom_data), middle_data); + const uint8x16_t max_data_shift1 = vextq_u8(max_data, max_data, 1); + const uint8x16_t max_data_shift2 = vextq_u8(max_data, max_data, 2); + const uint8x16_t final_max = vmaxq_u8(vmaxq_u8(max_data, max_data_shift1), max_data_shift2); + + if(pool_stride_x == 2) { - const float32x4x2_t data = vld2q_f32(reinterpret_cast(input_ptrs[i] + input.offset())); - max_data = vmax2q_f32(max_data, data); + const uint8x8x2_t table = { { vget_low_u8(final_max), vget_high_u8(final_max) } }; + static const uint8x8_t lookup_val = { 0, 2, 4, 6, 8, 10, 12, 14 }; + const uint8x8_t res = vtbl2_u8(table, lookup_val); + vst1_u8(reinterpret_cast(output.ptr()), res); + } + else + { + vst1q_u8(reinterpret_cast(output.ptr()), final_max); } - res = vpmax_f32(vget_high_f32(vsetq_lane_f32(-std::numeric_limits::max(), max_data.val[1], 3)), vget_low_f32(max_data.val[1])); - res = vpmax_f32(res, vpmax_f32(vget_high_f32(max_data.val[0]), vget_low_f32(max_data.val[0]))); - res 
= vpmax_f32(res, res); - } - final_res = vget_lane_f32(res, 0); - - // Calculate square-root in case of l2 pooling - if(pooling_type == PoolingType::L2) - { - final_res = sqrt(final_res); } - - // Store result - *(reinterpret_cast(output.ptr())) = final_res; }, input, output); } -template -void NEPoolingLayerKernel::poolingMxN_f16_nchw(const Window &window_input, const Window &window) +void NEPoolingLayerKernel::poolingMxN_f16_nchw(const Window &window_input, const Window &window, PoolingType pooling_type, bool exclude_padding) { + ARM_COMPUTE_UNUSED(pooling_type); + ARM_COMPUTE_UNUSED(exclude_padding); #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC Iterator input(_input, window_input); Iterator output(_output, window); @@ -1432,7 +935,7 @@ void NEPoolingLayerKernel::poolingMxN_f16_nchw(const Window &window_input, const if(pooling_type != PoolingType::MAX) { // Calculate scale - const float scale = calculate_avg_scale(id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y); + const float scale = calculate_avg_scale(exclude_padding, DataLayout::NCHW, id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y); // Perform pooling @@ -1528,9 +1031,10 @@ void NEPoolingLayerKernel::poolingMxN_f16_nchw(const Window &window_input, const #endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ } -template -void NEPoolingLayerKernel::poolingMxN_f16_nhwc(const Window &window_input, const Window &window) +void NEPoolingLayerKernel::poolingMxN_f16_nhwc(const Window &window_input, const Window &window, PoolingType pooling_type, bool exclude_padding) { + ARM_COMPUTE_UNUSED(pooling_type); + ARM_COMPUTE_UNUSED(exclude_padding); #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC Iterator input(_input, window_input); Iterator output(_output, window); @@ -1564,8 +1068,8 @@ void NEPoolingLayerKernel::poolingMxN_f16_nhwc(const Window &window_input, const if(pooling_type != PoolingType::MAX) { // Calculate scale - const float scale = calculate_avg_scale(id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, - pool_stride_y); + const float scale = calculate_avg_scale(exclude_padding, DataLayout::NHWC, id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, + pool_stride_y); const float16x8_t scale_v = vdupq_n_f16(scale); // Perform pooling @@ -1625,8 +1129,7 @@ void NEPoolingLayerKernel::poolingMxN_f16_nhwc(const Window &window_input, const #endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ } -template -void NEPoolingLayerKernel::poolingMxN_f32_nchw(const Window &window_input, const Window &window) +void NEPoolingLayerKernel::poolingMxN_f32_nchw(const Window &window_input, const Window &window, PoolingType pooling_type, bool exclude_padding) { Iterator input(_input, window_input); Iterator output(_output, window); @@ -1650,7 +1153,7 @@ void NEPoolingLayerKernel::poolingMxN_f32_nchw(const Window &window_input, const if(pooling_type != PoolingType::MAX) { // Calculate scale - const float scale = calculate_avg_scale(id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y); + const float scale = calculate_avg_scale(exclude_padding, DataLayout::NCHW, id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y); // Perform pooling float32x4_t vres = vdupq_n_f32(0.0f); @@ -1748,8 +1251,218 @@ void 
NEPoolingLayerKernel::poolingMxN_f32_nchw(const Window &window_input, const input, output); } -template -void NEPoolingLayerKernel::poolingMxN_f32_nhwc(const Window &window_input, const Window &window) +void NEPoolingLayerKernel::pooling2_f32_nchw(const Window &window_input, const Window &window, PoolingType pooling_type, bool exclude_padding) +{ + Iterator input(_input, window_input); + Iterator output(_output, window); + + constexpr int pool_size = 2; + const int pool_pad_right = _pool_info.pad_stride_info().pad_right(); + const int pool_pad_top = _pool_info.pad_stride_info().pad_top(); + const int pool_pad_left = _pool_info.pad_stride_info().pad_left(); + const int pool_pad_bottom = _pool_info.pad_stride_info().pad_bottom(); + int pool_stride_x = 0; + int pool_stride_y = 0; + std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info().stride(); + const int upper_bound_w = _input->info()->dimension(0) + (exclude_padding ? 0 : pool_pad_right); + const int upper_bound_h = _input->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_bottom); + + const uint8_t *const input_top_ptr = _input->ptr_to_element(Coordinates(-static_cast(pool_pad_left), -static_cast(pool_pad_top))); + const uint8_t *const input_bottom_ptr = _input->ptr_to_element(Coordinates(-static_cast(pool_pad_left), -static_cast(pool_pad_top) + 1)); + + execute_window_loop(window, [&](const Coordinates & id) + { + float32x2_t top_data = vld1_f32(reinterpret_cast(input_top_ptr + input.offset())); + float32x2_t bottom_data = vld1_f32(reinterpret_cast(input_bottom_ptr + input.offset())); + float32x2_t res = {}; + float final_res = 0; + + // Get power of 2 in case of l2 pooling + if(pooling_type == PoolingType::L2) + { + top_data = vmul_f32(top_data, top_data); + bottom_data = vmul_f32(bottom_data, bottom_data); + } + + if(pooling_type != PoolingType::MAX) + { + // Calculate scale + float scale = calculate_avg_scale(exclude_padding, DataLayout::NCHW, id, pool_size, pool_size, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y); + const float32x2_t scale_v = vdup_n_f32(scale); + + // Perform pooling + const float32x2_t sum_data = vadd_f32(top_data, bottom_data); + res = vmul_f32(vpadd_f32(sum_data, sum_data), scale_v); + } + else + { + const float32x2_t max_data = vmax_f32(top_data, bottom_data); + res = vpmax_f32(max_data, max_data); + } + final_res = vget_lane_f32(res, 0); + + // Calculate square-root in case of l2 pooling + if(pooling_type == PoolingType::L2) + { + final_res = sqrt(final_res); + } + + // Store result + *(reinterpret_cast(output.ptr())) = final_res; + }, + input, output); +} + +void NEPoolingLayerKernel::pooling3_f32_nchw(const Window &window_input, const Window &window, PoolingType pooling_type, bool exclude_padding) +{ + Iterator input(_input, window_input); + Iterator output(_output, window); + + constexpr const int pool_size = 3; + const int pool_pad_right = _pool_info.pad_stride_info().pad_right(); + const int pool_pad_top = _pool_info.pad_stride_info().pad_top(); + const int pool_pad_left = _pool_info.pad_stride_info().pad_left(); + const int pool_pad_bottom = _pool_info.pad_stride_info().pad_bottom(); + int pool_stride_x = 0; + int pool_stride_y = 0; + std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info().stride(); + const int upper_bound_w = _input->info()->dimension(0) + (exclude_padding ? 0 : pool_pad_right); + const int upper_bound_h = _input->info()->dimension(1) + (exclude_padding ? 
0 : pool_pad_bottom); + + const uint8_t *const input_top_ptr = _input->ptr_to_element(Coordinates(-static_cast(pool_pad_left), -static_cast(pool_pad_top))); + const uint8_t *const input_middle_ptr = _input->ptr_to_element(Coordinates(-static_cast(pool_pad_left), -static_cast(pool_pad_top) + 1)); + const uint8_t *const input_bottom_ptr = _input->ptr_to_element(Coordinates(-static_cast(pool_pad_left), -static_cast(pool_pad_top) + 2)); + + execute_window_loop(window, [&](const Coordinates & id) + { + float32x4_t top_data = vld1q_f32(reinterpret_cast(input_top_ptr + input.offset())); + float32x4_t middle_data = vld1q_f32(reinterpret_cast(input_middle_ptr + input.offset())); + float32x4_t bottom_data = vld1q_f32(reinterpret_cast(input_bottom_ptr + input.offset())); + float32x2_t res = {}; + float final_res = 0; + + // Get power of 2 in case of l2 pooling + if(pooling_type == PoolingType::L2) + { + top_data = vmulq_f32(top_data, top_data); + middle_data = vmulq_f32(middle_data, middle_data); + bottom_data = vmulq_f32(bottom_data, bottom_data); + } + + if(pooling_type != PoolingType::MAX) + { + // Calculate scale + float scale = calculate_avg_scale(exclude_padding, DataLayout::NCHW, id, pool_size, pool_size, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y); + const float32x2_t scale_v = vdup_n_f32(scale); + + // Perform pooling + const float32x4_t sum_data = vaddq_f32(vaddq_f32(top_data, bottom_data), middle_data); + res = vpadd_f32(vget_high_f32(vsetq_lane_f32(0.f, sum_data, 3)), vget_low_f32(sum_data)); + res = vmul_f32(vpadd_f32(res, res), scale_v); + } + else + { + const float32x4_t max_data = vmaxq_f32(vmaxq_f32(top_data, bottom_data), middle_data); + res = vpmax_f32(vget_high_f32(vsetq_lane_f32(-std::numeric_limits::max(), max_data, 3)), vget_low_f32(max_data)); + res = vpmax_f32(res, res); + } + final_res = vget_lane_f32(res, 0); + + // Calculate square-root in case of l2 pooling + if(pooling_type == PoolingType::L2) + { + final_res = sqrt(final_res); + } + + // Store result + *(reinterpret_cast(output.ptr())) = final_res; + }, + input, output); +} + +void NEPoolingLayerKernel::pooling7_f32_nchw(const Window &window_input, const Window &window, PoolingType pooling_type, bool exclude_padding) +{ + Iterator input(_input, window_input); + Iterator output(_output, window); + + constexpr const int pool_size = 7; + const int pool_pad_right = _pool_info.pad_stride_info().pad_right(); + const int pool_pad_top = _pool_info.pad_stride_info().pad_top(); + const int pool_pad_left = _pool_info.pad_stride_info().pad_left(); + const int pool_pad_bottom = _pool_info.pad_stride_info().pad_bottom(); + int pool_stride_x = 0; + int pool_stride_y = 0; + std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info().stride(); + const int upper_bound_w = _input->info()->dimension(0) + (exclude_padding ? 0 : pool_pad_right); + const int upper_bound_h = _input->info()->dimension(1) + (exclude_padding ? 
0 : pool_pad_bottom); + + std::array input_ptrs{ {} }; + for(int i = 0; i < pool_size; ++i) + { + input_ptrs[i] = _input->ptr_to_element(Coordinates(-static_cast(pool_pad_left), -static_cast(pool_pad_top) + i)); + } + + execute_window_loop(window, [&](const Coordinates & id) + { + float32x2_t res = {}; + float final_res = 0.f; + if(pooling_type != PoolingType::MAX) + { + // Calculate scale + float scale = calculate_avg_scale(exclude_padding, DataLayout::NCHW, id, pool_size, pool_size, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y); + const float32x2_t scale_v = vdup_n_f32(scale); + + // Perform pooling + float32x4x2_t data = vld2q_f32(reinterpret_cast(input_ptrs[0] + input.offset())); + // Get power of 2 in case of l2 pooling + if(pooling_type == PoolingType::L2) + { + data.val[0] = vmulq_f32(data.val[0], data.val[0]); + data.val[1] = vmulq_f32(data.val[1], data.val[1]); + } + float32x4_t sum_data = vaddq_f32(data.val[0], vsetq_lane_f32(0.f, data.val[1], 3)); + for(int i = 1; i < pool_size; ++i) + { + data = vld2q_f32(reinterpret_cast(input_ptrs[i] + input.offset())); + // Get power of 2 in case of l2 pooling + if(pooling_type == PoolingType::L2) + { + data.val[0] = vmulq_f32(data.val[0], data.val[0]); + data.val[1] = vmulq_f32(data.val[1], data.val[1]); + } + sum_data = vaddq_f32(sum_data, data.val[0]); + sum_data = vaddq_f32(sum_data, vsetq_lane_f32(0.f, data.val[1], 3)); + } + res = vpadd_f32(vget_high_f32(sum_data), vget_low_f32(sum_data)); + res = vmul_f32(vpadd_f32(res, res), scale_v); + } + else + { + float32x4x2_t max_data = vld2q_f32(reinterpret_cast(input_ptrs[0] + input.offset())); + for(int i = 1; i < pool_size; ++i) + { + const float32x4x2_t data = vld2q_f32(reinterpret_cast(input_ptrs[i] + input.offset())); + max_data = vmax2q_f32(max_data, data); + } + res = vpmax_f32(vget_high_f32(vsetq_lane_f32(-std::numeric_limits::max(), max_data.val[1], 3)), vget_low_f32(max_data.val[1])); + res = vpmax_f32(res, vpmax_f32(vget_high_f32(max_data.val[0]), vget_low_f32(max_data.val[0]))); + res = vpmax_f32(res, res); + } + final_res = vget_lane_f32(res, 0); + + // Calculate square-root in case of l2 pooling + if(pooling_type == PoolingType::L2) + { + final_res = sqrt(final_res); + } + + // Store result + *(reinterpret_cast(output.ptr())) = final_res; + }, + input, output); +} + +void NEPoolingLayerKernel::poolingMxN_f32_nhwc(const Window &window_input, const Window &window, PoolingType pooling_type, bool exclude_padding) { Iterator input(_input, window_input); Iterator output(_output, window); @@ -1783,8 +1496,8 @@ void NEPoolingLayerKernel::poolingMxN_f32_nhwc(const Window &window_input, const if(pooling_type != PoolingType::MAX) { // Calculate scale - const float scale = calculate_avg_scale(id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, - pool_stride_y); + const float scale = calculate_avg_scale(exclude_padding, DataLayout::NHWC, id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, + pool_stride_y); const float32x4_t scale_v = vdupq_n_f32(scale); // Perform pooling @@ -1837,8 +1550,7 @@ void NEPoolingLayerKernel::poolingMxN_f32_nhwc(const Window &window_input, const input, output); } -template -void NEPoolingLayerKernel::poolingMxN_qasymm8_nchw(const Window &window_input, const Window &window) +void NEPoolingLayerKernel::poolingMxN_qasymm8_nchw(const Window &window_input, const Window &window, PoolingType pooling_type, bool exclude_padding) 
{ Iterator input(_input, window_input); Iterator output(_output, window); @@ -1865,7 +1577,7 @@ void NEPoolingLayerKernel::poolingMxN_qasymm8_nchw(const Window &window_input, c uint32_t sres = 0; // Calculate scale - const float scale = calculate_avg_scale(id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y); + const float scale = calculate_avg_scale(exclude_padding, DataLayout::NCHW, id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y); // Perform pooling for(int y = 0; y < pool_size_y; ++y) @@ -1933,8 +1645,7 @@ void NEPoolingLayerKernel::poolingMxN_qasymm8_nchw(const Window &window_input, c input, output); } -template -void NEPoolingLayerKernel::poolingMxN_qasymm8_nhwc(const Window &window_input, const Window &window) +void NEPoolingLayerKernel::poolingMxN_qasymm8_nhwc(const Window &window_input, const Window &window, PoolingType pooling_type, bool exclude_padding) { Iterator input(_input, window_input); Iterator output(_output, window); @@ -1973,8 +1684,8 @@ void NEPoolingLayerKernel::poolingMxN_qasymm8_nhwc(const Window &window_input, c uint32x4_t vres4 = vdupq_n_u32(0); // Calculate scale - const float scale = calculate_avg_scale(id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, - pool_stride_y); + const float scale = calculate_avg_scale(exclude_padding, DataLayout::NHWC, id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, + pool_stride_y); const float32x4_t scale_v = vdupq_n_f32(scale); // Perform pooling @@ -2073,9 +1784,10 @@ void NEPoolingLayerKernel::run(const Window &window, const ThreadInfo &info) ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window); ARM_COMPUTE_ERROR_ON(_func == nullptr); - const unsigned int pool_stride_x = _pool_info.pad_stride_info().stride().first; - const unsigned int pool_stride_y = _pool_info.pad_stride_info().stride().second; - const unsigned int pool_size = _pool_info.pool_size().width; + const unsigned int pool_stride_x = _pool_info.pad_stride_info().stride().first; + const unsigned int pool_stride_y = _pool_info.pad_stride_info().stride().second; + const unsigned int pool_size = _pool_info.pool_size().width; + const bool exclude_padding = _pool_info.exclude_padding(); Window window_input(window); if(_input->info()->data_layout() == DataLayout::NCHW) @@ -2093,6 +1805,7 @@ void NEPoolingLayerKernel::run(const Window &window, const ThreadInfo &info) } break; } + case DataType::F16: case DataType::F32: { @@ -2115,5 +1828,5 @@ void NEPoolingLayerKernel::run(const Window &window, const ThreadInfo &info) } // Run function - (this->*_func)(window_input, window); + (this->*_func)(window_input, window, _pool_info.pool_type(), exclude_padding); } -- cgit v1.2.1
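
Why the object file shrinks: before this change, pooling_type and exclude_padding were template parameters of every specialised pooling routine, so NEPoolingLayerKernel.o carried a separate instantiation of each routine for each pooling-type / exclude_padding combination selected in configure(). After the change they are plain run-time arguments forwarded by run() through a single function pointer, so each routine is compiled exactly once. The short standalone C++ sketch below illustrates that pattern in isolation; it is not Arm Compute Library code, and the names pool2x1_templated and pool2x1_runtime are invented for the example.

    #include <algorithm>
    #include <cstdio>

    // Standalone toy type; the real kernel uses arm_compute::PoolingType and
    // operates on Window/Iterator objects rather than raw floats.
    enum class PoolingType
    {
        MAX,
        AVG
    };

    // Old style: behaviour baked in at compile time. Each distinct
    // <pooling_type, exclude_padding> pair reachable from configure() produces
    // its own copy of the function body in the object file.
    template <PoolingType pooling_type, bool exclude_padding>
    float pool2x1_templated(float a, float b)
    {
        if(pooling_type == PoolingType::MAX)
        {
            return std::max(a, b);
        }
        // In the real kernel exclude_padding only affects the averaging scale;
        // here it is unused and exists just to show the extra template axis.
        return 0.5f * (a + b);
    }

    // New style: the same selection passed as run-time arguments, so a single
    // body is compiled and one function pointer can serve every configuration.
    float pool2x1_runtime(float a, float b, PoolingType pooling_type, bool exclude_padding)
    {
        (void)exclude_padding; // consulted only for average scaling in the real kernel
        if(pooling_type == PoolingType::MAX)
        {
            return std::max(a, b);
        }
        return 0.5f * (a + b);
    }

    using PoolFn = float (*)(float, float, PoolingType, bool);

    int main()
    {
        // Mirrors how run() now forwards _pool_info.pool_type() and
        // exclude_padding to whichever routine _func points at.
        PoolFn fn = &pool2x1_runtime;
        std::printf("runtime   max: %.1f\n", fn(1.0f, 3.0f, PoolingType::MAX, false));
        std::printf("runtime   avg: %.1f\n", fn(1.0f, 3.0f, PoolingType::AVG, true));
        std::printf("templated avg: %.1f\n", pool2x1_templated<PoolingType::AVG, true>(1.0f, 3.0f));
        return 0;
    }

The trade-off is a few run-time branches on pooling_type inside the kernel loops, which the commit accepts in exchange for the roughly 170K reduction in object size reported in the commit message.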