From 7333e1f10f5da9dc67b511d326121a843771a107 Mon Sep 17 00:00:00 2001
From: Gian Marco Iodice
Date: Thu, 8 Oct 2020 10:25:49 +0100
Subject: COMPMID-3732: Remove OpenCL padding from CLPoolingLayer

- Refactor pooling layer kernels on OpenCL (F32/F16/QASYMM8) to avoid padding and improve performance
- Add test for checking zero padding requirement
- Fix issue with extracting the index. The issue was caused by the padding passed at compile time
- auto_init indices tensor in CLPoolingLayerKernel

Change-Id: I1ae5a2ef8c4ce787c80dcd73e35c17bb34623cb5
Signed-off-by: Gian Marco Iodice
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/4188
Reviewed-by: Michele Di Giorgio
Reviewed-by: Giorgio Arena
Tested-by: Arm Jenkins
Comments-Addressed: Arm Jenkins
---
 src/core/CL/CLKernelLibrary.cpp | 3 +-
 src/core/CL/cl_kernels/helpers.h | 2 +-
 src/core/CL/cl_kernels/pooling_layer.cl | 611 ++++++++++------------
 src/core/CL/cl_kernels/pooling_layer_quantized.cl | 134 ++---
 src/core/CL/kernels/CLPoolingLayerKernel.cpp | 134 ++---
 tests/validation/CL/PoolingLayer.cpp | 61 ++-
 6 files changed, 471 insertions(+), 474 deletions(-)

diff --git a/src/core/CL/CLKernelLibrary.cpp b/src/core/CL/CLKernelLibrary.cpp
index 0b59ec8a71..0d0b7f69cb 100644
--- a/src/core/CL/CLKernelLibrary.cpp
+++ b/src/core/CL/CLKernelLibrary.cpp
@@ -326,8 +326,7 @@ const std::map CLKernelLibrary::_kernel_program_map =
 { "pooling_layer_7", "pooling_layer.cl" },
 { "pooling_layer_MxN_nchw", "pooling_layer.cl" },
 { "pooling_layer_MxN_nhwc", "pooling_layer.cl" },
- { "pooling_layer_2_nhwc_indices_fp32", "pooling_layer.cl" },
- { "pooling_layer_2_nhwc_indices_fp16", "pooling_layer.cl" },
+ { "pooling_layer_2x2_nhwc", "pooling_layer.cl" },
 { "pooling_layer_2_nchw_indices_fp32", "pooling_layer.cl" },
 { "pooling_layer_2_nchw_indices_fp16", "pooling_layer.cl" },
 { "pooling_layer_MxN_quantized_nhwc", "pooling_layer_quantized.cl" },
diff --git a/src/core/CL/cl_kernels/helpers.h b/src/core/CL/cl_kernels/helpers.h
index 0b36a55895..0bdf16dab1 100644
--- a/src/core/CL/cl_kernels/helpers.h
+++ b/src/core/CL/cl_kernels/helpers.h
@@ -174,7 +174,7 @@
 */
 #define V_OFFS1(dt) (dt)(0)
 #define V_OFFS2(dt) (dt)(0, 1)
-#define V_OFFS3(dt) (dt)(0, 1, 3)
+#define V_OFFS3(dt) (dt)(0, 1, 2)
 #define V_OFFS4(dt) (dt)(0, 1, 2, 3)
 #define V_OFFS8(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7)
 #define V_OFFS16(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)
diff --git a/src/core/CL/cl_kernels/pooling_layer.cl b/src/core/CL/cl_kernels/pooling_layer.cl
index 9e6521b300..e69c3c35e9 100644
--- a/src/core/CL/cl_kernels/pooling_layer.cl
+++ b/src/core/CL/cl_kernels/pooling_layer.cl
@@ -22,6 +22,7 @@
 * SOFTWARE.
*/ #include "helpers.h" +#include "repeat.h" #if defined(POOL_AVG) || defined(POOL_L2) #define POOL_OP(x, y) ((x) + (y)) @@ -38,8 +39,6 @@ #define DIV_OP(x, y) (x * (1.f / y)) #define SQRT_OP(x) sqrt((x)) -#define DIV_OP_NHWC(x, y) (x * (VEC_DATA_TYPE(ACC_DATA_TYPE, 8))(1.f / y)) - #if STRIDE_X == 1 #define POOLING3x3(res, input, output) POOLING3x3_STRIDE1(res, input, output) #elif STRIDE_X == 2 /* STRIDE_X == 1 */ @@ -481,122 +480,6 @@ __kernel void pooling_layer_MxN_nchw( } #endif // defined(POOL_SIZE_X) && defined(POOL_SIZE_Y) -ACC_DATA_TYPE calculate_avg_scale_nhwc(const int pool_size_x, const int pool_size_y, int upper_bound_w, int upper_bound_h, - const int pad_x, const int pad_y, const int stride_x, const int stride_y) -{ - int start_x = get_global_id(1) * stride_x - pad_x; -#if defined(DST_DEPTH) - int start_y = (get_global_id(2) % DST_DEPTH) * stride_y - pad_y; -#else /* defined(DST_DEPTH) */ - int start_y = get_global_id(2) * stride_y - pad_y; -#endif /* defined(DST_DEPTH) */ - -#if !defined(EXCLUDE_PADDING) - upper_bound_w += pad_x; - upper_bound_h += pad_y; -#endif /* defined(EXCLUDE_PADDING) */ - const int end_x = min(start_x + pool_size_x, upper_bound_w); - const int end_y = min(start_y + pool_size_y, upper_bound_h); -#if defined(EXCLUDE_PADDING) - start_x = max(0, start_x); - start_y = max(0, start_y); -#endif /* defined(EXCLUDE_PADDING) */ - return ((end_y - start_y) * (end_x - start_x)); -} - -/** Performs a pooling function of pool size equal to N (NHWC) - * - * @note Datatype must be passed using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types are F16/F32 - * @note Pool sizes must be passed using -DPOOL_SIZE_X and -DPOOL_SIZE_Y e.g. -DPOOL_SIZE_X=13; - * @note Tensors width and height must be passed at compile time using -DMAX_WIDTH and -DMAX_HEIGHT - * @note Strides must be passed at compile time using -DSTRIDE_X and -DSTRIDE_Y which are the steps of the window along the x and y directions - * @note Pad values must be passed at compile time using -DPAD_X and -DPAD_Y which are the pooling paddings in x and y dimension - * @note In case of average pooling the following information must be passed at compile time: - * -DPOOL_AVG must be provided otherwise max pooling will be performed. - * @note The initial value for the pooling operation must be passed at compile time using -DINITIAL_VALUE e.g. -DINITIAL_VALUE=0 - * - * @param[in] input_ptr Pointer to the source tensor. Supported data types: F16/F32 - * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes) - * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes) - * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] input_stride_w Stride of the source tensor in W dimension (in bytes) - * @param[in] input_step_w input_stride_w * number of elements along W processed per workitem(in bytes) - * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor - * @param[out] output_ptr Pointer to the destination tensor. 
Supported data types: same as @p input_ptr - * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes) - * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes) - * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes) - * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] output_stride_w Stride of the destination tensor in W dimension (in bytes) - * @param[in] output_step_w output_stride_w * number of elements along W processed per workitem(in bytes) - * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor - */ -__kernel void pooling_layer_MxN_nhwc( - TENSOR4D_DECLARATION(input), - TENSOR4D_DECLARATION(output)) -{ - // Get pixels pointer -#if defined(DST_DEPTH) - Tensor4D input = CONVERT_TO_TENSOR4D_STRUCT(input, DST_DEPTH); - Tensor4D output = CONVERT_TO_TENSOR4D_STRUCT(output, DST_DEPTH); -#else /* defined(DST_DEPTH) */ - Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input); - Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output); -#endif /* defined(DST_DEPTH) */ - - VEC_DATA_TYPE(ACC_DATA_TYPE, 8) - vdata = INITIAL_VALUE; - - const int idx_width = get_global_id(1) * STRIDE_X; -#if defined(DST_DEPTH) - const int idx_height = (get_global_id(2) % DST_DEPTH) * STRIDE_Y; -#else /* defined(DST_DEPTH) */ - const int idx_height = get_global_id(2) * STRIDE_Y; -#endif /* defined(DST_DEPTH) */ - - for(int y = 0; y < POOL_SIZE_Y; ++y) - { - int y1 = select(y, PAD_Y - idx_height, y + idx_height - PAD_Y < 0 || y + idx_height - PAD_Y >= MAX_HEIGHT); - for(int x = 0; x < POOL_SIZE_X; ++x) - { - int x1 = select(x, PAD_X - idx_width - 1, x + idx_width - PAD_X < 0 || x + idx_width - PAD_X >= MAX_WIDTH); - x1 = select(x1, PAD_X - idx_width - 1, y != y1); - -#if defined(DST_DEPTH) - VEC_DATA_TYPE(ACC_DATA_TYPE, 8) - data0 = VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(8, 0, (__global DATA_TYPE *)tensor4D_offset(&input, 0, x1 - PAD_X, y1 - PAD_Y, 0)); -#else /* defined(DST_DEPTH) */ - VEC_DATA_TYPE(ACC_DATA_TYPE, 8) - data0 = VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(8, 0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, x1 - PAD_X, y1 - PAD_Y)); -#endif /* defined(DST_DEPTH) */ - -#if defined(POOL_L2) - // Raise to power of 2 for L2 Pooling - data0 *= data0; -#endif /* defined(POOL_L2) */ - vdata = POOL_OP(vdata, CONVERT(data0, VEC_DATA_TYPE(ACC_DATA_TYPE, 8))); - } - } - -#if defined(POOL_AVG) || defined(POOL_L2) - // Divide by pool region in case of average pooling - vdata = DIV_OP_NHWC(vdata, calculate_avg_scale_nhwc(POOL_SIZE_X, POOL_SIZE_Y, MAX_WIDTH, MAX_HEIGHT, PAD_X, PAD_Y, STRIDE_X, STRIDE_Y)); -#endif /* defined(POOL_AVG) || defined(POOL_L2) */ - -#if defined(POOL_L2) - // Take square root of the result in L2 pooling - vdata = SQRT_OP(vdata); -#endif /* defined(POOL_L2) */ - - // Store result - vstore8(CONVERT(vdata, VEC_DATA_TYPE(DATA_TYPE, 8)), 0, (__global DATA_TYPE *)output.ptr); -} - #if defined(PAD_TENSOR_LEFT) && defined(PAD_TENSOR_RIGHT) && defined(PAD_TENSOR_TOP) && defined(PAD_TENSOR_BOTTOM) inline void offset_no_padding_nchw(const Tensor3D *input, uint *offset_top, uint *offset_bottom) @@ -631,65 +514,6 @@ inline void offset_no_padding_nchw(const Tensor3D *input, uint *offset_top, uint 
return; } -inline void offset_no_padding_nhwc_3D(const Tensor3D *input, uint *offset_x0, uint *offset_x1, uint *offset_x2, uint *offset_x3) -{ - const int pad_horiz = PAD_TENSOR_LEFT + PAD_TENSOR_RIGHT; - - const int x = get_global_id(0); - const int y = get_global_id(1) * STRIDE_X; - const int z = get_global_id(2) * STRIDE_Y; - - //x axis: component, y axis: width, z axis: height - const uint padded_offset = input->offset_first_element_in_bytes - + x * 8 * input->stride_x - + y * input->stride_y - + z * input->stride_z; - - const uint offset_base = padded_offset - - (z + 1) * PAD_TENSOR_TOP * input->stride_y /* Top padding for each z plane */ - - y * pad_horiz * sizeof(DATA_TYPE) /* Horizontal padding for each row */ - - z * MAX_WIDTH * pad_horiz * sizeof(DATA_TYPE) /* Horizontal padding for each z plane */ - - PAD_TENSOR_LEFT * sizeof(DATA_TYPE); - - *offset_x0 = (uint)offset_base / sizeof(DATA_TYPE); - *offset_x1 = *offset_x0 + input->stride_y / sizeof(DATA_TYPE) - pad_horiz; - *offset_x2 = *offset_x0 + input->stride_z / sizeof(DATA_TYPE) - pad_horiz * MAX_WIDTH - PAD_TENSOR_TOP * input->stride_y / sizeof(DATA_TYPE); - *offset_x3 = *offset_x2 + input->stride_y / sizeof(DATA_TYPE) - pad_horiz; - - return; -} - -#if defined(DST_DEPTH) -inline void offset_no_padding_nhwc_4D(const Tensor4D *input, uint *offset_x0, uint *offset_x1, uint *offset_x2, uint *offset_x3) -{ - const int pad_horiz = PAD_TENSOR_LEFT + PAD_TENSOR_RIGHT; - const int z_max = get_global_size(2) / BATCH_SIZE; - - const int x = get_global_id(0); - const int y = get_global_id(1) * STRIDE_X; - const int z = (get_global_id(2) % z_max) * STRIDE_Y; - const int w = get_global_id(2) / z_max; - - const unsigned int padded_offset = input->offset_first_element_in_bytes - + x * 8 * input->stride_x - + y * input->stride_y - + z * input->stride_z; - - const unsigned int offset_base = padded_offset - - (z + 1) * PAD_TENSOR_TOP * input->stride_y /* Top padding for each z plane */ - - y * pad_horiz * sizeof(DATA_TYPE) /* Horizontal padding for each row */ - - z * MAX_WIDTH * pad_horiz * sizeof(DATA_TYPE) /* Horizontal padding for each z plane */ - - PAD_TENSOR_LEFT * sizeof(DATA_TYPE); - - *offset_x0 = (uint)offset_base / sizeof(DATA_TYPE); - *offset_x1 = *offset_x0 + input->stride_y / sizeof(DATA_TYPE) - pad_horiz; - *offset_x2 = *offset_x0 + input->stride_z / sizeof(DATA_TYPE) - pad_horiz * MAX_WIDTH - PAD_TENSOR_TOP * input->stride_y / sizeof(DATA_TYPE); - *offset_x3 = *offset_x2 + input->stride_y / sizeof(DATA_TYPE) - pad_horiz; - - return; -} -#endif //defined(DST_DEPTH) - #endif //defined(PAD_TENSOR_LEFT) && defined(PAD_TENSOR_RIGHT) && defined(PAD_TENSOR_TOP) && defined(PAD_TENSOR_BOTTOM) /** Performs a MAX pooling of pool size equal to 2, and record max value indices for NCHW. @@ -832,115 +656,154 @@ __kernel void pooling_layer_2_nchw_indices_fp16( #endif //defined(PAD_TENSOR_LEFT) && defined(PAD_TENSOR_RIGHT) && defined(PAD_TENSOR_TOP) && defined(PAD_TENSOR_BOTTOM) } -/** Performs a MAX pooling of pool size equal to 2, and record max value indices for NHWC. +#if defined(VEC_SIZE) && defined(VEC_SIZE_LEFTOVER) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(DST_CHANNELS) && defined(DST_HEIGHT) && defined(DST_BATCH_SIZE) && defined(ACC_DATA_TYPE) + +#if defined(POOL_SIZE_X) && defined(POOL_SIZE_Y) +/** Performs pooling layer of size equal to MxN. This OpenCL kernel can perform the following pooling types: + * -# max, -DPOOL_MAX must be passed at compile time + * -# average, -DPOOL_AVG must be passed at compile time. 
If padding has to be expluded, -DEXCLUDE_PADDING should be passed at compile time + * -# l2 normalisation, -DPOOL_L2 must be passed at compile time * - * @note Datatype must be passed using -DDATA_TYPE e.g. -DDATA_TYPE=half. Supported data types are F32 - * @note Pool sizes must be passed using -DPOOL_SIZE_X and -DPOOL_SIZE_Y e.g. -DPOOL_SIZE_X=13; - * @note Tensors width and height must be passed at compile time using -DMAX_WIDTH and -DMAX_HEIGHT + * @note Datatype must be passed at compile type using -DDATA_TYPE e.g. -DDATA_TYPE=half. Supported data types are F32/F16 + * @note Accumulation data type must be passed at compile time using -DACC_DATA_TYPE e.g. -DACC_DATA_TYPE=float + * @note If -DFP_MIXED_PRECISION is passed at compile time, the kernel will use F32 for the partial result + * @note Pool size must be passed at compile time using -DPOOL_SIZE_X and -DPOOL_SIZE_Y. e.g. -DPOOL_SIZE_X=4, -DPOOL_SIZE_Y=4 + * @note Input tensor width and height must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT + * @note Output tensor height, channels and batch size must be passed at compile time using -DDST_HEIGHT, -DDST_CHANNELS and -DDST_BATCH_SIZE * @note Pool strides must be passed at compile time using -DSTRIDE_X and -DSTRIDE_Y which are the steps of the window along the x and y directions - * @note Tensor padding values must be passed at compile time using PAD_TENSOR_LEFT, PAD_TENSOR_RIGHT, PAD_TENSOR_TOP and PAD_TENSOR_BOTTOM + * @note Pool pads must be passed at compile time using -DPAD_X and -DPAD_Y + * @note Vector size must be passed at compile time using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16 + * @note Leftover vector size must be passed at compile time using -DVEC_SIZE_LEFTOVER. e.g. -DVEC_SIZE_LEFTOVER=3. It is defined as the remainder between the input's first dimension and VEC_SIZE + * @note The initial value for the pooling operation must be passed at compile time using -DINITIAL_VALUE e.g. -DINITIAL_VALUE=0 * - * @param[in] input_ptr Pointer to the source tensor. Supported data types: F32 - * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes) - * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes) - * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] input_stride_w Stride of the source tensor in W dimension (in bytes) - * @param[in] input_step_w input_stride_w * number of elements along W processed per workitem(in bytes) - * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor - * @param[out] output_ptr Pointer to the destination tensor. 
Supported data types: same as @p input_ptr - * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes) - * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes) - * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes) - * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] output_stride_w Stride of the destination tensor in W dimension (in bytes) - * @param[in] output_step_w output_stride_w * number of elements along W processed per workitem(in bytes) - * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor - * @param[in] indices_ptr Pointer to the indices tensor. Supported data types: U32 - * @param[in] indices_stride_x Stride of the indices tensor in X dimension (in bytes) - * @param[in] indices_step_x indices_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] indices_stride_y Stride of the indices tensor in Y dimension (in bytes) - * @param[in] indices_step_y indices_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] indices_stride_z Stride of the indices tensor in Z dimension (in bytes) - * @param[in] indices_step_z indices_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] indices_stride_w Stride of the indices tensor in W dimension (in bytes) - * @param[in] indices_step_w indices_stride_w * number of elements along W processed per workitem(in bytes) - * @param[in] indices_offset_first_element_in_bytes The offset of the first element in the indices tensor + * @param[in] input_ptr Pointer to the source tensor. Supported data types: F32/F16 + * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] input_stride_w Stride of the source tensor in W dimension (in bytes) + * @param[in] input_step_w input_stride_w * number of elements along W processed per workitem(in bytes) + * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[out] output_ptr Pointer to the destination tensor. 
Supported data types: same as @p input_ptr + * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes) + * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] output_stride_w Stride of the destination tensor in W dimension (in bytes) + * @param[in] output_step_w output_stride_w * number of elements along W processed per workitem(in bytes) + * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor */ -__kernel void pooling_layer_2_nhwc_indices_fp32( +__kernel void pooling_layer_MxN_nhwc( TENSOR4D_DECLARATION(input), - TENSOR4D_DECLARATION(output), - TENSOR4D_DECLARATION(indices)) + TENSOR4D_DECLARATION(output)) { - // Get pixels pointer -#if defined(DST_DEPTH) - Tensor4D input = CONVERT_TO_TENSOR4D_STRUCT(input, DST_DEPTH); - Tensor4D output = CONVERT_TO_TENSOR4D_STRUCT(output, DST_DEPTH); - Tensor4D indices = CONVERT_TO_TENSOR4D_STRUCT(indices, DST_DEPTH); -#else /* defined(DST_DEPTH) */ - Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input); - Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output); - Tensor3D indices = CONVERT_TO_TENSOR3D_STRUCT(indices); -#endif /* defined(DST_DEPTH) */ - -#if defined(DST_DEPTH) - // Load data - float8 data_top0 = VLOAD(8)(0, (__global float *)tensor4D_offset(&input, 0, 0, 0, 0)); - float8 data_top1 = VLOAD(8)(0, (__global float *)tensor4D_offset(&input, 0, 1, 0, 0)); - float8 data_bottom0 = VLOAD(8)(0, (__global float *)tensor4D_offset(&input, 0, 0, 1, 0)); - float8 data_bottom1 = VLOAD(8)(0, (__global float *)tensor4D_offset(&input, 0, 1, 1, 0)); -#else /* defined(DST_DEPTH) */ - // Load data - float8 data_top0 = VLOAD(8)(0, (__global float *)tensor3D_offset(&input, 0, 0, 0)); - float8 data_top1 = VLOAD(8)(0, (__global float *)tensor3D_offset(&input, 0, 1, 0)); - float8 data_bottom0 = VLOAD(8)(0, (__global float *)tensor3D_offset(&input, 0, 0, 1)); - float8 data_bottom1 = VLOAD(8)(0, (__global float *)tensor3D_offset(&input, 0, 1, 1)); -#endif /* defined(DST_DEPTH) */ + // Note: If C is not multiple of VEC_SIZE, we shift back of VEC_SIZE_LEFTOVER elements to compute the leftover elements for get_global_id(0) == 0 + // Note: If C is less than VEC_SIZE, VEC_SIZE should be SHRINKED to the closest smaller VEC_SIZE. 
This operation is performed on the host side + int offset_c = max((int)(get_global_id(0) * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE), 0) * sizeof(DATA_TYPE); + int idx_out_w = get_global_id(1); +#if DST_BATCH_SIZE != 1 + // If batch size != 1, the batch size dimension is collapsed over the height dimension + int idx_out_h = get_global_id(2) % DST_HEIGHT; + int idx_out_n = get_global_id(2) / DST_HEIGHT; +#else //DST_BATCH_SIZE != 1 + int idx_out_h = get_global_id(2); + int idx_out_n = 0; +#endif // DST_BATCH_SIZE != 1 + + int idx_in_w = idx_out_w * STRIDE_X - PAD_X; + int idx_in_h = idx_out_h * STRIDE_Y - PAD_Y; + + int pool_x_s = max((int)0, -idx_in_w); + int pool_x_e = min((int)POOL_SIZE_X, (int)SRC_WIDTH - idx_in_w); + int pool_y_s = max((int)0, -idx_in_h); + int pool_y_e = min((int)POOL_SIZE_Y, (int)SRC_HEIGHT - idx_in_h); + + __global unsigned char *in_base_ptr = input_ptr + input_offset_first_element_in_bytes + + offset_c + + idx_out_n * input_stride_w; + + __global unsigned char *out_base_ptr = output_ptr + output_offset_first_element_in_bytes + + offset_c + + idx_out_w * output_stride_y + + idx_out_h * output_stride_z + + idx_out_n * output_stride_w; + +#if ((defined(POOL_AVG) || defined(POOL_L2))) +#if defined(EXCLUDE_PADDING) + int filter_size = 0; +#else // defined(EXCLUDE_PADDING) + int filter_size = POOL_SIZE_X * POOL_SIZE_Y; +#endif // defined(EXCLUDE_PADDING) +#endif // ((defined(POOL_AVG) || defined(POOL_L2))) - float8 data_top_max = POOL_OP(data_top0, data_top1); - float8 data_bottom_max = POOL_OP(data_bottom0, data_bottom1); - float8 data_max = POOL_OP(data_top_max, data_bottom_max); - vstore8(data_max, 0, (__global float *)output.ptr); + VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE) + res0 = INITIAL_VALUE; -#if defined(PAD_TENSOR_LEFT) && defined(PAD_TENSOR_RIGHT) && defined(PAD_TENSOR_TOP) && defined(PAD_TENSOR_BOTTOM) + for(int y = pool_y_s; y < pool_y_e; ++y) + { + for(int x = pool_x_s; x < pool_x_e; ++x) + { + VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE) data0; +#if defined(FP_MIXED_PRECISION) + // In case of FP_MIXED_PRECISION, ACC_DATA_TYPE is != DATA_TYPE + data0 = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(in_base_ptr + (x + idx_in_w) * input_stride_y + (y + idx_in_h) * input_stride_z)), VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE)); +#else // defined(FP_MIXED_PRECISION) + data0 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(in_base_ptr + (x + idx_in_w) * input_stride_y + (y + idx_in_h) * input_stride_z)); +#endif // defined(FP_MIXED_PRECISION) - uint offset_x0 = 0; - uint offset_x1 = 0; - uint offset_x2 = 0; - uint offset_x3 = 0; +#if defined(POOL_L2) + // Raise to power of 2 for L2 Pooling + data0 *= data0; +#endif // defined(POOL_L2) + res0 = POOL_OP(res0, data0); -#if defined(DST_DEPTH) - offset_no_padding_nhwc_4D(&input, &offset_x0, &offset_x1, &offset_x2, &offset_x3); -#else /* defined(DST_DEPTH) */ - offset_no_padding_nhwc_3D(&input, &offset_x0, &offset_x1, &offset_x2, &offset_x3); -#endif /* defined(DST_DEPTH) */ +#if ((defined(POOL_AVG) || defined(POOL_L2))) && defined(EXCLUDE_PADDING) + filter_size++; +#endif // ((defined(POOL_AVG) || defined(POOL_L2))) && defined(EXCLUDE_PADDING) + } + } - uint8 voffset_x0 = { offset_x0, offset_x0 + 1, offset_x0 + 2, offset_x0 + 3, offset_x0 + 4, offset_x0 + 5, offset_x0 + 6, offset_x0 + 7 }; - uint8 voffset_x1 = { offset_x1, offset_x1 + 1, offset_x1 + 2, offset_x1 + 3, offset_x1 + 4, offset_x1 + 5, offset_x1 + 6, offset_x1 + 7 }; - uint8 voffset_x2 = { offset_x2, offset_x2 + 1, offset_x2 + 2, offset_x2 + 3, offset_x2 + 4, 
offset_x2 + 5, offset_x2 + 6, offset_x2 + 7 }; - uint8 voffset_x3 = { offset_x3, offset_x3 + 1, offset_x3 + 2, offset_x3 + 3, offset_x3 + 4, offset_x3 + 5, offset_x3 + 6, offset_x3 + 7 }; +#if defined(POOL_AVG) || defined(POOL_L2) + res0 /= (VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE))filter_size; +#endif // defined(POOL_AVG) || defined(POOL_L2) - uint8 index0 = select(voffset_x1, voffset_x0, isgreaterequal(data_top0, data_top1)); - uint8 index1 = select(voffset_x3, voffset_x2, isgreaterequal(data_bottom0, data_bottom1)); - uint8 index = select(index1, index0, isgreaterequal(data_top_max, data_bottom_max)); - vstore8(index, 0, (__global uint *)indices.ptr); +#if defined(POOL_L2) + // Take square root of the result in L2 pooling + res0 = SQRT_OP(res0); +#endif // defined(POOL_L2) -#endif /* defined(PAD_TENSOR_LEFT) && defined(PAD_TENSOR_RIGHT) && defined(PAD_TENSOR_TOP) && defined(PAD_TENSOR_BOTTOM */ + // Store result +#if defined(FP_MIXED_PRECISION) + VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) res_converted0 = CONVERT(res0, VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)); + STORE_VECTOR_SELECT(res_converted, DATA_TYPE, out_base_ptr, VEC_SIZE, VEC_SIZE_LEFTOVER, (VEC_SIZE_LEFTOVER != 0) && get_global_id(0) == 0); +#else // defined(FP_MIXED_PRECISION) + STORE_VECTOR_SELECT(res, DATA_TYPE, out_base_ptr, VEC_SIZE, VEC_SIZE_LEFTOVER, (VEC_SIZE_LEFTOVER != 0) && get_global_id(0) == 0); +#endif // defined(FP_MIXED_PRECISION) } +#endif // defined(POOL_SIZE_X) && defined(POOL_SIZE_Y) -/** Performs a MAX pooling of pool size equal to 2, and record max value indices for NHWC. +/** Performs pooling layer of size equal to 2. This OpenCL kernel can perform the following pooling types: + * -# max, -DPOOL_MAX must be passed at compile time + * -# max extracting the max index, -DPOOL_MAX and -DEXTRACT_MAX_INDEX must be passed at compile time + * -# average, -DPOOL_AVG must be passed at compile time. If padding has to be expluded, -DEXCLUDE_PADDING should be passed at compile time + * -# l2 normalisation, -DPOOL_L2 must be passed at compile time * - * @note Datatype must be passed using -DDATA_TYPE e.g. -DDATA_TYPE=half. Supported data types are F16 - * @note Pool sizes must be passed using -DPOOL_SIZE_X and -DPOOL_SIZE_Y e.g. -DPOOL_SIZE_X=13; - * @note Tensors width and height must be passed at compile time using -DMAX_WIDTH and -DMAX_HEIGHT + * @note Datatype must be passed at compile type using -DDATA_TYPE e.g. -DDATA_TYPE=half. Supported data types are F32/F16 + * @note Accumulation data type must be passed at compile time using -DACC_DATA_TYPE e.g. -DACC_DATA_TYPE=float + * @note If -DFP_MIXED_PRECISION is passed at compile time, the kernel will use F32 for the partial result + * @note Input tensor width and height must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT + * @note Output tensor height, channels and batch size must be passed at compile time using -DDST_HEIGHT, -DDST_CHANNELS and -DDST_BATCH_SIZE * @note Pool strides must be passed at compile time using -DSTRIDE_X and -DSTRIDE_Y which are the steps of the window along the x and y directions - * @note Tensor padding values must be passed at compile time using PAD_TENSOR_LEFT, PAD_TENSOR_RIGHT, PAD_TENSOR_TOP and PAD_TENSOR_BOTTOM + * @note Pool pads must be passed at compile time using -DPAD_X and -DPAD_Y + * @note Vector size must be passed at compile time using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16 + * @note Leftover vector size must be passed at compile time using -DVEC_SIZE_LEFTOVER. e.g. -DVEC_SIZE_LEFTOVER=3. 
It is defined as the remainder between the input's first dimension and VEC_SIZE + * @note The initial value for the pooling operation must be passed at compile time using -DINITIAL_VALUE e.g. -DINITIAL_VALUE=0 * - * @param[in] input_ptr Pointer to the source tensor. Supported data types: F16 + * @param[in] input_ptr Pointer to the source tensor. Supported data types: F32/F16 * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes) * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes) @@ -960,79 +823,151 @@ __kernel void pooling_layer_2_nhwc_indices_fp32( * @param[in] output_stride_w Stride of the destination tensor in W dimension (in bytes) * @param[in] output_step_w output_stride_w * number of elements along W processed per workitem(in bytes) * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor - * @param[in] indices_ptr Pointer to the indices tensor. Supported data types: U32 - * @param[in] indices_stride_x Stride of the indices tensor in X dimension (in bytes) - * @param[in] indices_step_x indices_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] indices_stride_y Stride of the indices tensor in Y dimension (in bytes) - * @param[in] indices_step_y indices_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] indices_stride_z Stride of the indices tensor in Z dimension (in bytes) - * @param[in] indices_step_z indices_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] indices_stride_w Stride of the indices tensor in W dimension (in bytes) - * @param[in] indices_step_w indices_stride_w * number of elements along W processed per workitem(in bytes) - * @param[in] indices_offset_first_element_in_bytes The offset of the first element in the indices tensor + * @param[in] indices_ptr (Optional) Pointer to the indices tensor. 
Supported data types: U32 + * @param[in] indices_stride_x (Optional) Stride of the indices tensor in X dimension (in bytes) + * @param[in] indices_step_x (Optional) indices_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] indices_stride_y (Optional) Stride of the indices tensor in Y dimension (in bytes) + * @param[in] indices_step_y (Optional) indices_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] indices_stride_z (Optional) Stride of the indices tensor in Z dimension (in bytes) + * @param[in] indices_step_z (Optional) indices_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] indices_stride_w (Optional) Stride of the indices tensor in W dimension (in bytes) + * @param[in] indices_step_w (Optional) indices_stride_w * number of elements along W processed per workitem(in bytes) + * @param[in] indices_offset_first_element_in_bytes (Optional) The offset of the first element in the indices tensor */ -__kernel void pooling_layer_2_nhwc_indices_fp16( +__kernel void pooling_layer_2x2_nhwc( TENSOR4D_DECLARATION(input), - TENSOR4D_DECLARATION(output), - TENSOR4D_DECLARATION(indices)) + TENSOR4D_DECLARATION(output) +#if defined(EXTRACT_MAX_INDEX) && defined(POOL_MAX) + , + TENSOR4D_DECLARATION(indices) +#endif // defined(EXTRACT_MAX_INDEX) && defined(POOL_MAX) +) { - // Get pixels pointer -#if defined(DST_DEPTH) - Tensor4D input = CONVERT_TO_TENSOR4D_STRUCT(input, DST_DEPTH); - Tensor4D output = CONVERT_TO_TENSOR4D_STRUCT(output, DST_DEPTH); - Tensor4D indices = CONVERT_TO_TENSOR4D_STRUCT(indices, DST_DEPTH); -#else /* defined(DST_DEPTH) */ - Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input); - Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output); - Tensor3D indices = CONVERT_TO_TENSOR3D_STRUCT(indices); -#endif /* defined(DST_DEPTH) */ - -#if defined(DST_DEPTH) - // Load data - half8 data_top0 = VLOAD(8)(0, (__global half *)tensor4D_offset(&input, 0, 0, 0, 0)); - half8 data_top1 = VLOAD(8)(0, (__global half *)tensor4D_offset(&input, 0, 1, 0, 0)); - half8 data_bottom0 = VLOAD(8)(0, (__global half *)tensor4D_offset(&input, 0, 0, 1, 0)); - half8 data_bottom1 = VLOAD(8)(0, (__global half *)tensor4D_offset(&input, 0, 1, 1, 0)); -#else /* defined(DST_DEPTH) */ - // Load data - half8 data_top0 = VLOAD(8)(0, (__global half *)tensor3D_offset(&input, 0, 0, 0)); - half8 data_top1 = VLOAD(8)(0, (__global half *)tensor3D_offset(&input, 0, 1, 0)); - half8 data_bottom0 = VLOAD(8)(0, (__global half *)tensor3D_offset(&input, 0, 0, 1)); - half8 data_bottom1 = VLOAD(8)(0, (__global half *)tensor3D_offset(&input, 0, 1, 1)); -#endif /* defined(DST_DEPTH) */ + // Note: If C is not multiple of VEC_SIZE, we shift back of VEC_SIZE_LEFTOVER elements to compute the leftover elements for get_global_id(0) == 0 + // Note: If C is less than VEC_SIZE, VEC_SIZE should be SHRINKED to the closest smaller VEC_SIZE. 
This operation is performed on the host side + int idx_out_c = max((int)(get_global_id(0) * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE), 0); + int idx_out_w = get_global_id(1); +#if DST_BATCH_SIZE != 1 + // If batch size != 1, the batch size dimension is collapsed over the height dimension + int idx_out_h = get_global_id(2) % DST_HEIGHT; + int idx_out_n = get_global_id(2) / DST_HEIGHT; +#else //SRC_BATCH_SIZE != 1 + int idx_out_h = get_global_id(2); + int idx_out_n = 0; +#endif // SRC_BATCH_SIZE != 1 + + int idx_in_w = idx_out_w * STRIDE_X - PAD_X; + int idx_in_h = idx_out_h * STRIDE_Y - PAD_Y; + + __global unsigned char *in_base_ptr = input_ptr + input_offset_first_element_in_bytes + + idx_out_c * sizeof(DATA_TYPE) + + idx_out_n * input_stride_w; + + __global unsigned char *out_base_ptr = output_ptr + output_offset_first_element_in_bytes + + idx_out_c * sizeof(DATA_TYPE) + + idx_out_w * output_stride_y + + idx_out_h * output_stride_z + + idx_out_n * output_stride_w; + + int pool_x_s = max((int)0, -idx_in_w); + int pool_x_e = min((int)2, (int)SRC_WIDTH - idx_in_w); + int pool_y_s = max((int)0, -idx_in_h); + int pool_y_e = min((int)2, (int)SRC_HEIGHT - idx_in_h); + + int filter_size = (pool_x_e - pool_x_s) * (pool_y_e - pool_y_s); + + int x0 = pool_x_s + idx_in_w; + int y0 = pool_y_s + idx_in_h; + int x1 = pool_x_e - 1 + idx_in_w; + int y1 = pool_y_e - 1 + idx_in_h; + + REPEAT_VAR_INIT_TO_CONST(4, VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE), data, 0); - half8 data_top_max = POOL_OP(data_top0, data_top1); - half8 data_bottom_max = POOL_OP(data_bottom0, data_bottom1); - half8 data_max = POOL_OP(data_top_max, data_bottom_max); - vstore8(data_max, 0, (__global half *)output.ptr); +#if defined(FP_MIXED_PRECISION) + // In case of FP_MIXED_PRECISION, ACC_DATA_TYPE is != DATA_TYPE + data0 = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(in_base_ptr + x0 * input_stride_y + y0 * input_stride_z)), VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE)); + data1 = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(in_base_ptr + x1 * input_stride_y + y0 * input_stride_z)), VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE)); + data2 = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(in_base_ptr + x0 * input_stride_y + y1 * input_stride_z)), VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE)); + data3 = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(in_base_ptr + x1 * input_stride_y + y1 * input_stride_z)), VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE)); +#else // defined(FP_MIXED_PRECISION) + data0 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(in_base_ptr + x0 * input_stride_y + y0 * input_stride_z)); + data1 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(in_base_ptr + x1 * input_stride_y + y0 * input_stride_z)); + data2 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(in_base_ptr + x0 * input_stride_y + y1 * input_stride_z)); + data3 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(in_base_ptr + x1 * input_stride_y + y1 * input_stride_z)); +#endif // defined(FP_MIXED_PRECISION) + +#if !defined(POOL_MAX) + if(filter_size != 4) + { + // Make invalid the values loaded if the x or y coordinate was clamped (out-of-bound) + data1 = select(data1, (VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE))INITIAL_VALUE, (SELECT_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE))(pool_x_e == pool_x_s)); + data2 = select(data2, (VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE))INITIAL_VALUE, (SELECT_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE))(pool_y_e == pool_y_s)); + data3 = select(data3, (VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE))INITIAL_VALUE, (SELECT_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE))((pool_x_e == pool_x_s) || 
(pool_y_e == pool_y_s))); + } +#endif // !defined(POOL_MAX) -#if defined(PAD_TENSOR_LEFT) && defined(PAD_TENSOR_RIGHT) && defined(PAD_TENSOR_TOP) && defined(PAD_TENSOR_BOTTOM) +#if defined(POOL_L2) + // Raise to power of 2 for L2 Pooling + data0 *= data0; + data1 *= data1; + data2 *= data2; + data3 *= data3; +#endif /* defined(POOL_L2) */ + + VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE) + res0 = data0; + res0 = POOL_OP(res0, data1); + res0 = POOL_OP(res0, data2); + res0 = POOL_OP(res0, data3); - uint offset_x0_int = 0; - uint offset_x1_int = 0; - uint offset_x2_int = 0; - uint offset_x3_int = 0; - -#if defined(DST_DEPTH) - offset_no_padding_nhwc_4D(&input, &offset_x0_int, &offset_x1_int, &offset_x2_int, &offset_x3_int); -#else /* defined(DST_DEPTH) */ - offset_no_padding_nhwc_3D(&input, &offset_x0_int, &offset_x1_int, &offset_x2_int, &offset_x3_int); -#endif /* defined(DST_DEPTH) */ - - ushort offset_x0 = (ushort)offset_x0_int; - ushort offset_x1 = (ushort)offset_x1_int; - ushort offset_x2 = (ushort)offset_x2_int; - ushort offset_x3 = (ushort)offset_x3_int; - - ushort8 voffset_x0 = { offset_x0, offset_x0 + 1, offset_x0 + 2, offset_x0 + 3, offset_x0 + 4, offset_x0 + 5, offset_x0 + 6, offset_x0 + 7 }; - ushort8 voffset_x1 = { offset_x1, offset_x1 + 1, offset_x1 + 2, offset_x1 + 3, offset_x1 + 4, offset_x1 + 5, offset_x1 + 6, offset_x1 + 7 }; - ushort8 voffset_x2 = { offset_x2, offset_x2 + 1, offset_x2 + 2, offset_x2 + 3, offset_x2 + 4, offset_x2 + 5, offset_x2 + 6, offset_x2 + 7 }; - ushort8 voffset_x3 = { offset_x3, offset_x3 + 1, offset_x3 + 2, offset_x3 + 3, offset_x3 + 4, offset_x3 + 5, offset_x3 + 6, offset_x3 + 7 }; - - ushort8 index0 = select(voffset_x1, voffset_x0, isgreaterequal(data_top0, data_top1)); - ushort8 index1 = select(voffset_x3, voffset_x2, isgreaterequal(data_bottom0, data_bottom1)); - ushort8 index = select(index1, index0, isgreaterequal(data_top_max, data_bottom_max)); - vstore8(CONVERT(index, uint8), 0, (__global uint *)indices.ptr); - -#endif /* defined(PAD_TENSOR_LEFT) && defined(PAD_TENSOR_RIGHT) && defined(PAD_TENSOR_TOP) && defined(PAD_TENSOR_BOTTOM */ -} \ No newline at end of file +#if defined(POOL_AVG) || defined(POOL_L2) +#if defined(EXCLUDE_PADDING) + res0 /= (VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE))filter_size; +#else // !defined(EXCLUDE_PADDING) + res0 /= (VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE))4; +#endif // defined(EXCLUDE_PADDING) +#endif // defined(POOL_AVG) || defined(POOL_L2) + +#if defined(POOL_L2) + // Take square root of the result in L2 pooling + res0 = SQRT_OP(res0); +#endif // defined(POOL_L2) + + // Store result +#if defined(FP_MIXED_PRECISION) + VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) res_converted0 = CONVERT(res0, VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)); + STORE_VECTOR_SELECT(res_converted, DATA_TYPE, out_base_ptr, VEC_SIZE, VEC_SIZE_LEFTOVER, (VEC_SIZE_LEFTOVER != 0) && get_global_id(0) == 0); +#else // defined(FP_MIXED_PRECISION) + STORE_VECTOR_SELECT(res, DATA_TYPE, out_base_ptr, VEC_SIZE, VEC_SIZE_LEFTOVER, (VEC_SIZE_LEFTOVER != 0) && get_global_id(0) == 0); +#endif // defined(FP_MIXED_PRECISION) + +#if defined(EXTRACT_MAX_INDEX) && defined(POOL_MAX) + + // This part is used to return the index of the maximum value + // Note: DST_CHANNELS and DST_BATCH_SIZE can be used for either the input and output tensor + + // note: Batch dimension does not contribute in the offset contribution + VEC_DATA_TYPE(uint, VEC_SIZE) base_index = (uint)idx_out_c; + + base_index += VEC_OFFS(VEC_DATA_TYPE(uint, VEC_SIZE), VEC_SIZE); + + VEC_DATA_TYPE(uint, VEC_SIZE) index0 = 
base_index + (uint)x0 * DST_CHANNELS + (uint)y0 * (DST_CHANNELS * SRC_WIDTH); + VEC_DATA_TYPE(uint, VEC_SIZE) index1 = base_index + (uint)x1 * DST_CHANNELS + (uint)y0 * (DST_CHANNELS * SRC_WIDTH); + VEC_DATA_TYPE(uint, VEC_SIZE) index2 = base_index + (uint)x0 * DST_CHANNELS + (uint)y1 * (DST_CHANNELS * SRC_WIDTH); + VEC_DATA_TYPE(uint, VEC_SIZE) index3 = base_index + (uint)x1 * DST_CHANNELS + (uint)y1 * (DST_CHANNELS * SRC_WIDTH); + + index0 = select(index1, index0, CONVERT(isgreaterequal(data0, data1), VEC_DATA_TYPE(int, VEC_SIZE))); + index1 = select(index3, index2, CONVERT(isgreaterequal(data2, data3), VEC_DATA_TYPE(int, VEC_SIZE))); + index0 = select(index1, index0, CONVERT(isgreaterequal(max(data0, data1), max(data2, data3)), VEC_DATA_TYPE(int, VEC_SIZE))); + + __global unsigned char *idx_base_ptr = indices_ptr + indices_offset_first_element_in_bytes + + idx_out_c * sizeof(uint) + + idx_out_w * indices_stride_y + + idx_out_h * indices_stride_z + + idx_out_n * indices_stride_w; + + // Store result + STORE_VECTOR_SELECT(index, uint, idx_base_ptr, VEC_SIZE, VEC_SIZE_LEFTOVER, ((VEC_SIZE_LEFTOVER != 0) && get_global_id(0) == 0)); +#endif // defined(EXTRACT_MAX_INDEX) && defined(POOL_MAX) +} +#endif // defined(VEC_SIZE) && defined(VEC_SIZE_LEFTOVER) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(DST_CHANNELS) && defined(DST_HEIGHT) && defined(DST_BATCH_SIZE) && defined(SELECT_DATA_TYPE) && defined(ACC_DATA_TYPE) \ No newline at end of file diff --git a/src/core/CL/cl_kernels/pooling_layer_quantized.cl b/src/core/CL/cl_kernels/pooling_layer_quantized.cl index fe13464b1e..04fef98cd0 100644 --- a/src/core/CL/cl_kernels/pooling_layer_quantized.cl +++ b/src/core/CL/cl_kernels/pooling_layer_quantized.cl @@ -47,8 +47,6 @@ #define DIV_OP(x, y) (x * (1.f / y)) -#define DIV_OP_NHWC(x, y) (convert_float8(x) * (float8)(1.f / y)) - #if defined(POOL_L2) #error "L2 pooling is not supported" #endif /* defined(POOL_L2) */ @@ -155,34 +153,22 @@ __kernel void pooling_layer_MxN_quantized_nchw( *(__global DATA_TYPE *)output.ptr = result_q8; } -int calculate_avg_scale_nhwc(const int pool_size_x, const int pool_size_y, int upper_bound_w, int upper_bound_h, - const int pad_x, const int pad_y, const int stride_x, const int stride_y) -{ - int start_x = get_global_id(1) * stride_x - pad_x; -#if defined(DST_DEPTH) - int start_y = (get_global_id(2) % DST_DEPTH) * stride_y - pad_y; -#else /* defined(DST_DEPTH) */ - int start_y = get_global_id(2) * stride_y - pad_y; -#endif /* defined(DST_DEPTH) */ - - const int end_x = min(start_x + pool_size_x, upper_bound_w); - const int end_y = min(start_y + pool_size_y, upper_bound_h); - - start_x = max(0, start_x); - start_y = max(0, start_y); - - return ((end_y - start_y) * (end_x - start_x)); -} - -/** Performs a pooling function of pool size equal to N (NHWC) +#if defined(VEC_SIZE) && defined(VEC_SIZE_LEFTOVER) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(DST_CHANNELS) && defined(DST_HEIGHT) && defined(DST_BATCH_SIZE) && defined(ACC_DATA_TYPE) +/** Performs pooling layer of size equal to MxN. This OpenCL kernel can perform the following pooling types: + * -# max, -DPOOL_MAX must be passed at compile time + * -# average, -DPOOL_AVG must be passed at compile time. If padding has to be expluded, -DEXCLUDE_PADDING should be passed at compile time * - * @note Pool sizes must be passed using -DPOOL_SIZE_X and -DPOOL_SIZE_Y e.g. 
-DPOOL_SIZE_X=13; - * @note Tensors width and height must be passed at compile time using -DMAX_WIDTH and -DMAX_HEIGHT - * @note Strides must be passed at compile time using -DSTRIDE_X and -DSTRIDE_Y which are the steps of the window along the x and y directions - * @note Pad values must be passed at compile time using -DPAD_X and -DPAD_Y which are the pooling paddings in x and y dimension - * @note In case of average pooling the following information must be passed at compile time: - * -DPOOL_AVG must be provided otherwise max pooling will be performed. + * @note Datatype must be passed at compile type using -DDATA_TYPE e.g. -DDATA_TYPE=uchar. Supported data types are QASYMM8/QASYMM8_SIGNED + * @note Accumulation data type must be passed at compile time using -DACC_DATA_TYPE e.g. -DACC_DATA_TYPE=int + * @note Pool size must be passed at compile time using -DPOOL_SIZE_X and -DPOOL_SIZE_Y. e.g. -DPOOL_SIZE_X=4, -DPOOL_SIZE_Y=4 + * @note Input tensor width and height must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT + * @note Output tensor height, channels and batch size must be passed at compile time using -DDST_HEIGHT, -DDST_CHANNELS and -DDST_BATCH_SIZE + * @note Pool strides must be passed at compile time using -DSTRIDE_X and -DSTRIDE_Y which are the steps of the window along the x and y directions + * @note Pool pads must be passed at compile time using -DPAD_X and -DPAD_Y + * @note Vector size must be passed at compile time using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16 + * @note Leftover vector size must be passed at compile time using -DVEC_SIZE_LEFTOVER. e.g. -DVEC_SIZE_LEFTOVER=3. It is defined as the remainder between the input's first dimension and VEC_SIZE * @note The initial value for the pooling operation must be passed at compile time using -DINITIAL_VALUE e.g. -DINITIAL_VALUE=0 + * @note If the output has be requantized, -DOFFSET_IN1, -DOFFSET_OUT, -DSCALE_IN1 and -DSCALE_OUT muste be passed at compile time * * @param[in] input_ptr Pointer to the source image. Supported data types: QASYMM8/QASYMM8_SIGNED * @param[in] input_stride_x Stride of the source image in X dimension (in bytes) @@ -209,57 +195,75 @@ __kernel void pooling_layer_MxN_quantized_nhwc( TENSOR4D_DECLARATION(input), TENSOR4D_DECLARATION(output)) { - // Get pixels pointer -#if defined(DST_DEPTH) - Tensor4D input = CONVERT_TO_TENSOR4D_STRUCT(input, DST_DEPTH); - Tensor4D output = CONVERT_TO_TENSOR4D_STRUCT(output, DST_DEPTH); -#else /* defined(DST_DEPTH) */ - Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input); - Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output); -#endif /* defined(DST_DEPTH) */ + // Note: If C is not multiple of VEC_SIZE, we shift back of VEC_SIZE_LEFTOVER elements to compute the leftover elements for get_global_id(0) == 0 + // Note: If C is less than VEC_SIZE, VEC_SIZE should be SHRINKED to the closest smaller VEC_SIZE. 
This operation is performed on the host side + int offset_c = max((int)(get_global_id(0) * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE), 0) * sizeof(DATA_TYPE); + int idx_out_w = get_global_id(1); +#if DST_BATCH_SIZE != 1 + // If batch size != 1, the batch size dimension is collapsed over the height dimension + int idx_out_h = get_global_id(2) % DST_HEIGHT; + int idx_out_n = get_global_id(2) / DST_HEIGHT; +#else //DST_BATCH_SIZE != 1 + int idx_out_h = get_global_id(2); + int idx_out_n = 0; +#endif // DST_BATCH_SIZE != 1 - int8 vdata = INITIAL_VALUE; + int idx_in_w = idx_out_w * STRIDE_X - PAD_X; + int idx_in_h = idx_out_h * STRIDE_Y - PAD_Y; + + __global unsigned char *in_base_ptr = input_ptr + input_offset_first_element_in_bytes + + offset_c + + idx_out_n * input_stride_w; + + __global unsigned char *out_base_ptr = output_ptr + output_offset_first_element_in_bytes + + offset_c + + idx_out_w * output_stride_y + + idx_out_h * output_stride_z + + idx_out_n * output_stride_w; + + int pool_x_s = max((int)0, -idx_in_w); + int pool_x_e = min((int)POOL_SIZE_X, (int)SRC_WIDTH - idx_in_w); + int pool_y_s = max((int)0, -idx_in_h); + int pool_y_e = min((int)POOL_SIZE_Y, (int)SRC_HEIGHT - idx_in_h); + +#if defined(POOL_AVG) && defined(EXCLUDE_PADDING) + int filter_size = 0; +#elif defined(POOL_AVG) && !defined(EXCLUDE_PADDING) // defined(POOL_AVG) && defined(EXCLUDE_PADDING) + int filter_size = POOL_SIZE_X * POOL_SIZE_Y; +#endif // defined(POOL_AVG) && !defined(EXCLUDE_PADDING) - const int idx_width = get_global_id(1) * STRIDE_X; -#if defined(DST_DEPTH) - const int idx_height = (get_global_id(2) % DST_DEPTH) * STRIDE_Y; -#else /* defined(DST_DEPTH) */ - const int idx_height = get_global_id(2) * STRIDE_Y; -#endif /* defined(DST_DEPTH) */ + VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE) + res0 = INITIAL_VALUE; - for(int y = 0; y < POOL_SIZE_Y; ++y) + for(int y = pool_y_s; y < pool_y_e; ++y) { - int y1 = select(y, PAD_Y - idx_height, y + idx_height - PAD_Y < 0 || y + idx_height - PAD_Y >= MAX_HEIGHT); - for(int x = 0; x < POOL_SIZE_X; ++x) + for(int x = pool_x_s; x < pool_x_e; ++x) { - int x1 = select(x, PAD_X - idx_width - 1, x + idx_width - PAD_X < 0 || x + idx_width - PAD_X >= MAX_WIDTH); - x1 = select(x1, PAD_X - idx_width - 1, y != y1); + VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) data; + VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE) data0; -#if defined(DST_DEPTH) - VEC_TYPE(8) - data = vload8(0, (__global DATA_TYPE *)tensor4D_offset(&input, 0, x1 - PAD_X, y1 - PAD_Y, 0)); -#else /* defined(DST_DEPTH) */ - VEC_TYPE(8) - data = vload8(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, x1 - PAD_X, y1 - PAD_Y)); -#endif /* defined(DST_DEPTH) */ + data = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(in_base_ptr + (x + idx_in_w) * input_stride_y + (y + idx_in_h) * input_stride_z)); + data0 = CONVERT(data, VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE)); - int8 data0 = convert_int8(data); - vdata = POOL_OP(vdata, data0); + res0 = POOL_OP(res0, data0); + +#if defined(POOL_AVG) && defined(EXCLUDE_PADDING) + filter_size++; +#endif // defined(POOL_AVG) && defined(EXCLUDE_PADDING) } } #if defined(POOL_AVG) - // Divide by pool region in case of average pooling - vdata = convert_int8(round(DIV_OP_NHWC(vdata, calculate_avg_scale_nhwc(POOL_SIZE_X, POOL_SIZE_Y, MAX_WIDTH, MAX_HEIGHT, PAD_X, PAD_Y, STRIDE_X, STRIDE_Y)))); -#endif /* defined(POOL_AVG) */ + res0 = (res0 + (VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE))(filter_size >> 1)) / filter_size; +#endif // defined(POOL_AVG) - VEC_TYPE(8) - out_q8 = CONVERT(vdata, VEC_TYPE(8)); + 
VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) out_q0 = CONVERT(res0, VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)); #if defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT) - REQUANTIZE(8, out_q8, OFFSET_IN1, OFFSET_OUT, SCALE_IN1, SCALE_OUT, out_q8); + REQUANTIZE(VEC_SIZE, out_q0, OFFSET_IN1, OFFSET_OUT, SCALE_IN1, SCALE_OUT, out_q0); #endif /* defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT) */ // Store result - vstore8(out_q8, 0, (__global DATA_TYPE *)output.ptr); + STORE_VECTOR_SELECT(out_q, DATA_TYPE, out_base_ptr, VEC_SIZE, VEC_SIZE_LEFTOVER, ((VEC_SIZE_LEFTOVER != 0) && get_global_id(0) == 0)); } -#endif /* defined(DATA_TYPE) && defined(INITIAL_VALUE) */ +#endif // defined(VEC_SIZE) && defined(VEC_SIZE_LEFTOVER) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(DST_CHANNELS) && defined(DST_HEIGHT) && defined(DST_BATCH_SIZE) && defined(SELECT_DATA_TYPE) && defined(ACC_DATA_TYPE) +#endif // defined(DATA_TYPE) && defined(INITIAL_VALUE) \ No newline at end of file diff --git a/src/core/CL/kernels/CLPoolingLayerKernel.cpp b/src/core/CL/kernels/CLPoolingLayerKernel.cpp index d60e196b7f..1771834aac 100644 --- a/src/core/CL/kernels/CLPoolingLayerKernel.cpp +++ b/src/core/CL/kernels/CLPoolingLayerKernel.cpp @@ -50,10 +50,14 @@ namespace // Internal window config info using CLPoolingConfig = std::pair; //num_elems_processed_per_iteration, border_size -void auto_init(const ITensorInfo *input, ITensorInfo *output, PoolingLayerInfo pool_info) +void auto_init(const ITensorInfo *input, ITensorInfo *output, ITensorInfo *indices, PoolingLayerInfo pool_info) { TensorShape out_shape = compute_pool_shape(*input, pool_info); auto_init_if_empty(*output, input->clone()->set_tensor_shape(out_shape)); + if(indices) + { + auto_init_if_empty(*indices, input->clone()->set_tensor_shape(out_shape).set_data_type(DataType::U32)); + } } Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const PoolingLayerInfo &pool_info, const ITensorInfo *indices) @@ -63,16 +67,19 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, c ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON_MSG((is_data_type_quantized_asymmetric(input->data_type()) && pool_info.pool_type == PoolingType::L2), "Unsupported combination of parameters!"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(is_data_type_quantized(input->data_type()) && !pool_info.exclude_padding && (pool_info.pool_type == PoolingType::AVG) && pool_info.pad_stride_info.has_padding() - && (input->data_layout() == DataLayout::NHWC), - "exclude_padding equal false is not supported for AVG Pooling with padding on quantized types"); + // Check indices if(indices) { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(indices, 1, DataType::U32); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON_MSG(pool_info.pool_type != PoolingType::MAX, "Pooling indices only supported for MAX pooling method"); ARM_COMPUTE_RETURN_ERROR_ON_MSG((pool_info.pool_size != Size2D(2, 2)), "Pooling indices only supported for pool size 2x2"); + + if(indices->total_size() != 0) + { + TensorInfo idx_info(TensorInfo(compute_pool_shape(*input, pool_info), 1, DataType::U32)); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(indices, &idx_info); + } } // Checks performed when output is configured @@ -108,9 +115,9 @@ std::tuple 
diff --git a/src/core/CL/kernels/CLPoolingLayerKernel.cpp b/src/core/CL/kernels/CLPoolingLayerKernel.cpp
index d60e196b7f..1771834aac 100644
--- a/src/core/CL/kernels/CLPoolingLayerKernel.cpp
+++ b/src/core/CL/kernels/CLPoolingLayerKernel.cpp
@@ -50,10 +50,14 @@ namespace
 // Internal window config info
 using CLPoolingConfig = std::pair<unsigned int, BorderSize>; //num_elems_processed_per_iteration, border_size

-void auto_init(const ITensorInfo *input, ITensorInfo *output, PoolingLayerInfo pool_info)
+void auto_init(const ITensorInfo *input, ITensorInfo *output, ITensorInfo *indices, PoolingLayerInfo pool_info)
 {
     TensorShape out_shape = compute_pool_shape(*input, pool_info);
     auto_init_if_empty(*output, input->clone()->set_tensor_shape(out_shape));
+    if(indices)
+    {
+        auto_init_if_empty(*indices, input->clone()->set_tensor_shape(out_shape).set_data_type(DataType::U32));
+    }
 }

 Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const PoolingLayerInfo &pool_info, const ITensorInfo *indices)
@@ -63,16 +67,19 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, c
     ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32);
     ARM_COMPUTE_RETURN_ERROR_ON_MSG((is_data_type_quantized_asymmetric(input->data_type()) && pool_info.pool_type == PoolingType::L2),
                                     "Unsupported combination of parameters!");
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(is_data_type_quantized(input->data_type()) && !pool_info.exclude_padding && (pool_info.pool_type == PoolingType::AVG) && pool_info.pad_stride_info.has_padding()
-                                    && (input->data_layout() == DataLayout::NHWC),
-                                    "exclude_padding equal false is not supported for AVG Pooling with padding on quantized types");

+    // Check indices
     if(indices)
     {
-        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(indices, 1, DataType::U32);
         ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
         ARM_COMPUTE_RETURN_ERROR_ON_MSG(pool_info.pool_type != PoolingType::MAX, "Pooling indices only supported for MAX pooling method");
         ARM_COMPUTE_RETURN_ERROR_ON_MSG((pool_info.pool_size != Size2D(2, 2)), "Pooling indices only supported for pool size 2x2");
+
+        if(indices->total_size() != 0)
+        {
+            TensorInfo idx_info(TensorInfo(compute_pool_shape(*input, pool_info), 1, DataType::U32));
+            ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(indices, &idx_info);
+        }
     }

     // Checks performed when output is configured
@@ -108,9 +115,9 @@ std::tuple<Status, Window, CLPoolingConfig> validate_and_configure_window(ITenso
     const int pool_pad_top    = pad_stride_info.pad_top();
     const int pool_pad_left   = pad_stride_info.pad_left();
     const int pool_pad_bottom = pad_stride_info.pad_bottom();
-    BorderSize border_size    = BorderSize(pool_pad_top, pool_pad_right, pool_pad_bottom, pool_pad_left);
+    BorderSize border_size    = BorderSize();

-    auto_init(input, output, pool_info);
+    auto_init(input, output, indices, pool_info);
     pooled_w = output->tensor_shape()[idx_width];
     pooled_h = output->tensor_shape()[idx_height];
@@ -126,6 +133,8 @@ std::tuple<Status, Window, CLPoolingConfig> validate_and_configure_window(ITenso
     {
         case DataLayout::NCHW:
         {
+            // Initialize border size
+            border_size = BorderSize(pool_pad_top, pool_pad_right, pool_pad_bottom, pool_pad_left);
             // Change the number of elements processed per iteration
             // for pooling 3x3 with stride less equal than 3
             const bool can_optimize = (pool_size_x == 3) && (pool_size_y == 3) && (pool_stride_x <= 3) && !is_data_type_quantized(data_type);
@@ -165,27 +174,17 @@ std::tuple<Status, Window, CLPoolingConfig> validate_and_configure_window(ITenso
         }
         case DataLayout::NHWC:
         {
-            num_elems_processed_per_iteration = 8;
+            // Initialize border size
+            border_size                       = BorderSize();
+            num_elems_processed_per_iteration = adjust_vec_size(4, output->dimension(0));
             win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration));
-            AccessWindowStatic input_access(input,
-                                            0, -1,
-                                            ceil_to_multiple(input->dimension(0), num_elems_processed_per_iteration), input->dimension(1));
-            AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
-
-            // Update indices window
-            if(indices)
+            if(indices != nullptr)
             {
-                AccessWindowHorizontal indices_access(indices, 0, num_elems_processed_per_iteration);
-                window_changed = update_window_and_padding(win, input_access, output_access, indices_access);
-                indices_access.set_valid_region(win, ValidRegion(Coordinates(), indices->tensor_shape()));
-            }
-            else
-            {
-                window_changed = update_window_and_padding(win, input_access, output_access);
+                indices->set_valid_region(ValidRegion(Coordinates(), indices->tensor_shape()));
             }
-            output_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape()));
+            output->set_valid_region(ValidRegion(Coordinates(), output->tensor_shape()));
             break;
         }
         default:
@@ -228,6 +227,7 @@ void CLPoolingLayerKernel::configure(const CLCompileContext &compile_context, co
     const int idx_width      = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH);
     const int idx_height     = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT);
     const int idx_channel    = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::CHANNEL);
+    const int idx_batch_size = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::BATCHES);
     const int pool_size_x    = pool_info.is_global_pooling ? input->info()->dimension(idx_width) : pool_info.pool_size.width;
     const int pool_size_y    = pool_info.is_global_pooling ? input->info()->dimension(idx_height) : pool_info.pool_size.height;
     const PadStrideInfo pad_stride_info = pool_info.pad_stride_info;
@@ -246,17 +246,11 @@ void CLPoolingLayerKernel::configure(const CLCompileContext &compile_context, co
     ARM_COMPUTE_ERROR_THROW_ON(std::get<0>(win_config));
     ICLKernel::configure_internal(std::get<1>(win_config));

-    if(_data_layout == DataLayout::NCHW)
-    {
-        CLPoolingConfig pooling_config     = std::get<2>(win_config);
-        _num_elems_processed_per_iteration = pooling_config.first;
-        _border_size                       = pooling_config.second;
-    }
-    else
-    {
-        _border_size                       = BorderSize(1, 0, 0, 0);
-        _num_elems_processed_per_iteration = 8;
-    }
+    CLPoolingConfig pooling_config     = std::get<2>(win_config);
+    _num_elems_processed_per_iteration = pooling_config.first;
+    _border_size                       = pooling_config.second;
+
+    build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(_num_elems_processed_per_iteration));

     // Tensor paddings are used to calculate the indicies for MAX pooling
     if(pool_info.pool_size == Size2D(2, 2) && pool_type == PoolingType::MAX && _indices && is_data_type_float(data_type))
@@ -282,7 +276,8 @@ void CLPoolingLayerKernel::configure(const CLCompileContext &compile_context, co
     }

     // Check output dimensions
-    auto_init(input->info(), output->info(), pool_info);
+    auto_init(input->info(), output->info(), indices ? indices->info() : nullptr, pool_info);
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), pool_info, (indices) ? indices->info() : nullptr));

     build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(data_type));
@@ -314,19 +309,20 @@ void CLPoolingLayerKernel::configure(const CLCompileContext &compile_context, co
         build_opts.add_option("-DINITIAL_VALUE=0");
     }

-    const auto use_fp_mixed_precision = (data_type == DataType::F16) && pool_info.fp_mixed_precision;
-    const auto use_wider_accumulator  = use_fp_mixed_precision && (pool_type != PoolingType::MAX);
-    const auto acc_data_type          = get_cl_type_from_data_type(use_wider_accumulator ? DataType::F32 : data_type);
-    build_opts.add_option("-DACC_DATA_TYPE=" + acc_data_type);
-    build_opts.add_option_if(use_wider_accumulator, "-DFP_MIXED_PRECISION");
+    build_opts.add_option("-DMAX_WIDTH=" + support::cpp11::to_string(input->info()->dimension(idx_width) + (exclude_padding ? 0 : pool_pad_left)));
+    build_opts.add_option("-DMAX_HEIGHT=" + support::cpp11::to_string(input->info()->dimension(idx_height) + (exclude_padding ? 0 : pool_pad_top)));

     // Create kernel
     switch(_data_layout)
     {
         case DataLayout::NCHW:
         {
-            build_opts.add_option("-DMAX_WIDTH=" + support::cpp11::to_string(input->info()->dimension(idx_width) + (exclude_padding ? 0 : pool_pad_left)));
-            build_opts.add_option("-DMAX_HEIGHT=" + support::cpp11::to_string(input->info()->dimension(idx_height) + (exclude_padding ? 0 : pool_pad_top)));
+            const auto use_fp_mixed_precision = (data_type == DataType::F16) && pool_info.fp_mixed_precision;
+            const auto use_wider_accumulator  = use_fp_mixed_precision && (pool_type != PoolingType::MAX);
+            const auto acc_data_type          = get_cl_type_from_data_type(use_wider_accumulator ? DataType::F32 : data_type);
+            build_opts.add_option("-DACC_DATA_TYPE=" + acc_data_type);
+            build_opts.add_option_if(use_wider_accumulator, "-DFP_MIXED_PRECISION");
+
             if(pool_type != PoolingType::MAX)
             {
                 build_opts.add_option_if(exclude_padding, "-DEXCLUDE_PADDING");
@@ -365,26 +361,38 @@ void CLPoolingLayerKernel::configure(const CLCompileContext &compile_context, co
         }
         case DataLayout::NHWC:
         {
+            // Floating point mixed precision is supported on F16 only
+            const auto use_fp_mixed_precision = (data_type == DataType::F16) && pool_info.fp_mixed_precision && pool_type != PoolingType::MAX;
+
+            // Wider accumulation is required to avoid accuracy loss
+            // Case 1: Floating point mixed precision (fp16 input data and fp32 accumulation)
+            // Case 2: Quantized (int8/uint8 input data and int32 accumulation)
+            DataType acc_data_type = data_type;
+
+            if(use_fp_mixed_precision)
+            {
+                acc_data_type = DataType::F32;
+            }
+            else if(is_data_type_quantized(data_type) && pool_type != PoolingType::MAX)
+            {
+                acc_data_type = DataType::S32;
+            }
+
+            build_opts.add_option("-DACC_DATA_TYPE=" + get_cl_type_from_data_type(acc_data_type));
+            build_opts.add_option_if(use_fp_mixed_precision, "-DFP_MIXED_PRECISION");
             build_opts.add_option_if(exclude_padding, "-DEXCLUDE_PADDING");
-            build_opts.add_option("-DMAX_WIDTH=" + support::cpp11::to_string(input->info()->dimension(idx_width)));
-            build_opts.add_option("-DMAX_HEIGHT=" + support::cpp11::to_string(input->info()->dimension(idx_height)));
-            build_opts.add_option_if(output->info()->tensor_shape().total_size_upper(3) > 1,
-                                     "-DDST_DEPTH=" + support::cpp11::to_string(output->info()->dimension(idx_height)));
-            build_opts.add_option_if(output->info()->tensor_shape().total_size_upper(3) > 1,
-                                     "-DBATCH_SIZE=" + support::cpp11::to_string(output->info()->tensor_shape().total_size_upper(3)));
-
-            if(pool_info.pool_size == Size2D(2, 2) && pool_type == PoolingType::MAX && _indices && is_data_type_float(data_type))
+            build_opts.add_option("-DSRC_WIDTH=" + support::cpp11::to_string(input->info()->dimension(idx_width)));
+            build_opts.add_option("-DSRC_HEIGHT=" + support::cpp11::to_string(input->info()->dimension(idx_height)));
+            build_opts.add_option("-DDST_HEIGHT=" + support::cpp11::to_string(output->info()->dimension(idx_height)));
+            build_opts.add_option("-DDST_CHANNELS=" + support::cpp11::to_string(output->info()->dimension(idx_channel)));
+            build_opts.add_option("-DDST_BATCH_SIZE=" + support::cpp11::to_string(output->info()->dimension(idx_batch_size)));
+            build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(input->info()->dimension(0) % _num_elems_processed_per_iteration));
+            if(pool_info.pool_size == Size2D(2, 2) && is_data_type_float(data_type))
             {
-                if(data_type == DataType::F32)
-                {
-                    std::string kernel_name = "pooling_layer_2_nhwc_indices_fp32";
-                    _kernel                 = create_kernel(compile_context, kernel_name, build_opts.options());
-                }
-                else if(data_type == DataType::F16)
-                {
-                    std::string kernel_name = "pooling_layer_2_nhwc_indices_fp16";
-                    _kernel                 = create_kernel(compile_context, kernel_name, build_opts.options());
-                }
+                build_opts.add_option_if(_indices != nullptr && pool_type == PoolingType::MAX, "-DEXTRACT_MAX_INDEX");
+
+                std::string kernel_name = "pooling_layer_2x2_nhwc";
+                _kernel                 = create_kernel(compile_context, kernel_name, build_opts.options());
             }
             else
             {
@@ -452,7 +460,7 @@ void CLPoolingLayerKernel::run(const Window &window, cl::CommandQueue &queue)
             unsigned int idx = 0;
             add_3D_tensor_argument(idx, _input, in_slice);
             add_3D_tensor_argument(idx, _output, slice);
-            if(_indices && is_data_type_float(_input->info()->data_type()) && (_pool_info.pool_type == PoolingType::MAX) && (_pool_info.pool_size == Size2D(2, 2)))
+            if(_indices && is_data_type_float(_input->info()->data_type()) && (_pool_info.pool_size == Size2D(2, 2)))
             {
                 add_3D_tensor_argument(idx, _indices, slice);
             }
@@ -463,14 +471,14 @@ void CLPoolingLayerKernel::run(const Window &window, cl::CommandQueue &queue)
         }
         case DataLayout::NHWC:
         {
-            const size_t total_batches = _output->info()->tensor_shape().total_size_upper(3);
+            const size_t batch_size = _output->info()->tensor_shape().total_size_upper(3);

             Window slice    = window_collapsed.first_slice_window_4D();
             Window in_slice = window_collapsed.first_slice_window_4D();
             in_slice.set(Window::DimX, Window::Dimension(0, _input->info()->dimension(0), _num_elems_processed_per_iteration));
             in_slice.set(Window::DimY, Window::Dimension(0, _input->info()->dimension(1), pool_stride_x));
             in_slice.set(Window::DimZ, Window::Dimension(0, _input->info()->dimension(2), pool_stride_y));
-            in_slice.set(3, Window::Dimension(0, total_batches, 1));
+            in_slice.set(3, Window::Dimension(0, batch_size, 1));
             do
             {
                 // Set inputs
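The longest new block above is the NHWC accumulator selection. The sketch below is a condensed restatement of that decision table as a standalone helper, for readers who want the rule without the build-option plumbing. It is not an ACL API: the enum and function names are made up, and the logic is copied from the diff above (F16 with mixed precision and non-MAX pooling accumulates in F32; quantized non-MAX pooling accumulates in S32; everything else, including MAX pooling, keeps the input type).

// Sketch only, not library code.
enum class Type { F16, F32, QASYMM8, QASYMM8_SIGNED, S32 };

static bool is_quantized(Type t)
{
    return t == Type::QASYMM8 || t == Type::QASYMM8_SIGNED;
}

// mixed_precision mirrors pool_info.fp_mixed_precision;
// is_max mirrors pool_type == PoolingType::MAX.
static Type accumulator_type(Type input, bool mixed_precision, bool is_max)
{
    if(input == Type::F16 && mixed_precision && !is_max)
    {
        return Type::F32; // fp16 data accumulated in fp32
    }
    if(is_quantized(input) && !is_max)
    {
        return Type::S32; // int8/uint8 data accumulated in int32
    }
    return input;         // MAX pooling never needs a wider accumulator
}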
diff --git a/tests/validation/CL/PoolingLayer.cpp b/tests/validation/CL/PoolingLayer.cpp
index eefad4ab2c..071b58323c 100644
--- a/tests/validation/CL/PoolingLayer.cpp
+++ b/tests/validation/CL/PoolingLayer.cpp
@@ -85,6 +85,39 @@ const auto pool_data_layout_dataset = framework::datas
 const auto pool_fp_mixed_precision_dataset = framework::dataset::make("FpMixedPrecision", { true, false });

+/** Zero padding test */
+bool validate_zero_padding(unsigned int width, DataType data_type)
+{
+    const PoolingLayerInfo pool_info(PoolingType::MAX, Size2D(2U, 2U), DataLayout::NHWC);
+
+    TensorShape shape(width, 23, 11, 1);
+
+    // Create tensors
+    CLTensor src = create_tensor<CLTensor>(shape, data_type);
+    CLTensor idx;
+    CLTensor dst;
+
+    src.info()->set_quantization_info(QuantizationInfo(1.f / 256.f, 0));
+    dst.info()->set_quantization_info(QuantizationInfo(1.f / 256.f, 0));
+
+    CLPoolingLayer pool;
+
+    if(is_data_type_quantized(data_type))
+    {
+        pool.configure(&src, &dst, pool_info, nullptr);
+
+        // Check that configuration added no padding to any tensor
+        return src.info()->padding().empty() && dst.info()->padding().empty();
+    }
+    else
+    {
+        pool.configure(&src, &dst, pool_info, &idx);
+
+        // Check that configuration added no padding to any tensor
+        return src.info()->padding().empty() && dst.info()->padding().empty() && idx.info()->padding().empty();
+    }
+}
+
 } // namespace

 TEST_SUITE(CL)
@@ -94,17 +127,15 @@ TEST_SUITE(PoolingLayer)
 // clang-format off
 DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(
               framework::dataset::make("InputInfo", { TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::F32),     // Mismatching data type
-                                                      TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::F32),     // Window shrink
                                                       TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::F32),     // Invalid pad/size combination
                                                       TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::F32),     // Invalid pad/size combination
                                                       TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::QASYMM8), // Invalid parameters
                                                       TensorInfo(TensorShape(15U, 13U, 5U), 1, DataType::F32),     // Non-rectangular Global Pooling
                                                       TensorInfo(TensorShape(13U, 13U, 5U), 1, DataType::F32),     // Invalid output Global Pooling
-                                                      TensorInfo(TensorShape(13U, 13U, 5U), 1, DataType::QASYMM8), // Invalid exclude_padding = false with quantized type, no actual padding and NHWC
+                                                      TensorInfo(TensorShape(13U, 13U, 5U), 1, DataType::QASYMM8),
                                                       TensorInfo(TensorShape(13U, 13U, 5U), 1, DataType::F32),
                                                     }),
               framework::dataset::make("OutputInfo",{ TensorInfo(TensorShape(25U, 11U, 2U), 1, DataType::F16),
-                                                      TensorInfo(TensorShape(25U, 11U, 2U), 1, DataType::F32),
                                                       TensorInfo(TensorShape(30U, 11U, 2U), 1, DataType::F32),
                                                       TensorInfo(TensorShape(25U, 16U, 2U), 1, DataType::F32),
                                                       TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::QASYMM8),
@@ -114,7 +145,6 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(
                                                       TensorInfo(TensorShape(1U, 1U, 5U), 1, DataType::F32),
                                                     })),
               framework::dataset::make("PoolInfo",  { PoolingLayerInfo(PoolingType::AVG, 3, DataLayout::NCHW, PadStrideInfo(1, 1, 0, 0)),
-                                                      PoolingLayerInfo(PoolingType::AVG, 3, DataLayout::NCHW, PadStrideInfo(1, 1, 0, 0)),
                                                       PoolingLayerInfo(PoolingType::AVG, 2, DataLayout::NCHW, PadStrideInfo(1, 1, 2, 0)),
                                                       PoolingLayerInfo(PoolingType::AVG, 2, DataLayout::NCHW, PadStrideInfo(1, 1, 0, 2)),
                                                       PoolingLayerInfo(PoolingType::L2, 3, DataLayout::NCHW, PadStrideInfo(1, 1, 0, 0)),
@@ -123,11 +153,32 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(
                                                       PoolingLayerInfo(PoolingType::AVG, 2, DataLayout::NHWC, PadStrideInfo(), false),
                                                       PoolingLayerInfo(PoolingType::AVG, DataLayout::NCHW),
                                                     })),
-              framework::dataset::make("Expected", { false, false, false, false, false, true, false, false, true })),
+              framework::dataset::make("Expected", { false, false, false, false, true, false, true, true })),
               input_info, output_info, pool_info, expected)
 {
     ARM_COMPUTE_EXPECT(bool(CLPoolingLayer::validate(&input_info.clone()->set_is_resizable(false), &output_info.clone()->set_is_resizable(false), pool_info)) == expected, framework::LogLevel::ERRORS);
 }
+
+/** Validate zero padding tests
+ *
+ * A series of validation tests to check that no padding is added as part of configuration for 5 different scenarios.
+ *
+ * Checks performed in order:
+ *     - First dimension multiple of 16
+ *     - First dimension non-multiple of 16
+ *     - First dimension less than 16 (vec_size for qasymm8) but multiple
+ *     - First dimension less than 16 (vec_size for qasymm8) non-multiple
+ *     - Tensor with only one element
+ */
+DATA_TEST_CASE(ValidateZeroPadding, framework::DatasetMode::ALL, zip(
+framework::dataset::make("Width",    { 32U, 37U, 12U, 13U, 1U }),
+framework::dataset::make("DataType", { DataType::F32, DataType::QASYMM8 })),
+width, data_type)
+{
+    bool status = validate_zero_padding(width, data_type);
+    ARM_COMPUTE_EXPECT(status, framework::LogLevel::ERRORS);
+}
+
 // clang-format on
 // *INDENT-ON*
-- 
cgit v1.2.1