From 40471d12a19088df4af6ad80e5c0437d724dd8fa Mon Sep 17 00:00:00 2001
From: Gian Marco Iodice
Date: Mon, 26 Apr 2021 08:39:28 +0100
Subject: Add optimization for global pooling in pooling_layer.cl

- Simplify the implementation when the pooling size has the same spatial
  dimensions as the input tensor
- Rework the heuristic for F32/F16
- Add tests for validating the global pooling path
- Fix compare_dimensions in validation. The validation failed because
  NCHW and NHWC shapes can have a different number of dimensions
  (e.g. 1,1,2,1 (NCHW) -> 2,1,1,1 (NHWC))

Change-Id: Iba680cb30bf2a5d0952265a4cc9794f368549ca5
Signed-off-by: Gian Marco Iodice
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/5510
Reviewed-by: Michele Di Giorgio
Reviewed-by: Georgios Pinitas
Comments-Addressed: Arm Jenkins
Tested-by: Arm Jenkins
---
 src/core/CL/cl_kernels/pooling_layer.cl     | 157 ++++++++++++++++------------
 src/core/gpu/cl/kernels/ClPoolingKernel.cpp |   4 +-
 tests/validation/CL/PoolingLayer.cpp        | 121 +++++++++++++++++----
 tests/validation/Validation.h               |  18 ++--
 4 files changed, 197 insertions(+), 103 deletions(-)
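Note (illustrative commentary in the notes area git ignores, not part of the patch): when POOL_SIZE_X/POOL_SIZE_Y match SRC_WIDTH/SRC_HEIGHT, every pooling window starts at (0, 0) and covers the whole plane, so the per-window clamping (pool_x_s/pool_x_e/pool_y_s/pool_y_e) drops out and each (batch, channel) pair reduces W*H values in one strided walk. That is what the new global-pooling branch in pooling_layer_MxN_nhwc exploits. A minimal host-side C++ sketch of the same computation for the average case; global_avg_pool_nhwc is a hypothetical name, not a Compute Library API:

    #include <cstddef>
    #include <iostream>
    #include <vector>

    // Global average pooling over an NHWC float tensor: output is N x 1 x 1 x C.
    // Because the window equals the full W x H plane, there is no boundary
    // clamping and the divisor is always w * h, so EXCLUDE_PADDING is moot.
    std::vector<float> global_avg_pool_nhwc(const std::vector<float> &src,
                                            size_t n, size_t h, size_t w, size_t c)
    {
        std::vector<float> dst(n * c, 0.0f);
        for(size_t b = 0; b < n; ++b)
        {
            for(size_t y = 0; y < h; ++y)
            {
                for(size_t x = 0; x < w; ++x)
                {
                    for(size_t ch = 0; ch < c; ++ch)
                    {
                        // Accumulate, mirroring res0 = POOL_OP(res0, data0) in the kernel
                        dst[b * c + ch] += src[((b * h + y) * w + x) * c + ch];
                    }
                }
            }
        }
        const float filter_size = static_cast<float>(h * w); // POOL_SIZE_X * POOL_SIZE_Y
        for(float &v : dst)
        {
            v /= filter_size;
        }
        return dst;
    }

    int main()
    {
        // One 2x2 single-channel image: values 1..4 average to 2.5
        std::vector<float> src = { 1.0f, 2.0f, 3.0f, 4.0f };
        std::cout << global_avg_pool_nhwc(src, 1, 2, 2, 1)[0] << "\n"; // prints 2.5
    }

The kernel below additionally vectorises over VEC_SIZE channels per work-item and advances through the plane via input_stride_y, but the reduction is the same.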
diff --git a/src/core/CL/cl_kernels/pooling_layer.cl b/src/core/CL/cl_kernels/pooling_layer.cl
index 00250a08a5..b30145b11e 100644
--- a/src/core/CL/cl_kernels/pooling_layer.cl
+++ b/src/core/CL/cl_kernels/pooling_layer.cl
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 Arm Limited.
+ * Copyright (c) 2017-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -23,6 +23,7 @@
  */
 #include "helpers.h"
 #include "repeat.h"
+#include "tile_helpers.h"
 
 #if defined(POOL_AVG) || defined(POOL_L2)
 #define POOL_OP(x, y) ((x) + (y))
@@ -506,7 +507,7 @@ inline void offset_no_padding_nchw(const Tensor3D *input, uint *offset_top, uint
 #if defined(TENSOR_CHANNEL) && defined(TENSOR_WIDTH) && defined(TENSOR_HEIGHT)
     *offset_top = (uint)((offset_base / sizeof(DATA_TYPE)) % (TENSOR_CHANNEL * TENSOR_WIDTH * TENSOR_HEIGHT));
 #else /* defined(TENSOR_CHANNEL) && defined(TENSOR_WIDTH) && defined(TENSOR_HEIGHT) */
-    *offset_top = (uint)(offset_base / sizeof(DATA_TYPE));
+    *offset_top    = (uint)(offset_base / sizeof(DATA_TYPE));
 #endif /* defined(TENSOR_CHANNEL) && defined(TENSOR_WIDTH) && defined(TENSOR_HEIGHT) */
 
     *offset_bottom = *offset_top + input->stride_y / sizeof(DATA_TYPE) - pad_horiz;
@@ -703,56 +704,78 @@ __kernel void pooling_layer_MxN_nhwc(
 {
     // Note: If C is not multiple of VEC_SIZE, we shift back of VEC_SIZE_LEFTOVER elements to compute the leftover elements for get_global_id(0) == 0
     // Note: If C is less than VEC_SIZE, VEC_SIZE should be SHRINKED to the closest smaller VEC_SIZE. This operation is performed on the host side
-    int offset_c  = max((int)(get_global_id(0) * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE), 0) * sizeof(DATA_TYPE);
-    int idx_out_w = get_global_id(1);
+    int idx_out_c = GET_SPATIAL_IDX(0, VEC_SIZE, VEC_SIZE_LEFTOVER);
+    int idx_out_w = GET_SPATIAL_IDX(1, 1, 0);
 #if DST_BATCH_SIZE != 1
     // If batch size != 1, the batch size dimension is collapsed over the height dimension
-    int idx_out_h = get_global_id(2) % DST_HEIGHT;
-    int idx_out_n = get_global_id(2) / DST_HEIGHT;
-#else //DST_BATCH_SIZE != 1
-    int idx_out_h = get_global_id(2);
+    int idx_out_h = GET_SPATIAL_IDX(2, 1, 0) % DST_HEIGHT;
+    int idx_out_n = GET_SPATIAL_IDX(2, 1, 0) / DST_HEIGHT;
+#else // DST_BATCH_SIZE != 1
+    int idx_out_h = GET_SPATIAL_IDX(2, 1, 0);
     int idx_out_n = 0;
 #endif // DST_BATCH_SIZE != 1
 
-    int idx_in_w = idx_out_w * STRIDE_X - PAD_X;
-    int idx_in_h = idx_out_h * STRIDE_Y - PAD_Y;
+    __global unsigned char *in_base_ptr = input_ptr + input_offset_first_element_in_bytes + idx_out_c * sizeof(DATA_TYPE) + idx_out_n * input_stride_w;
 
-    int pool_x_s = max((int)0, -idx_in_w);
-    int pool_x_e = min((int)POOL_SIZE_X, (int)SRC_WIDTH - idx_in_w);
-    int pool_y_s = max((int)0, -idx_in_h);
-    int pool_y_e = min((int)POOL_SIZE_Y, (int)SRC_HEIGHT - idx_in_h);
+    __global unsigned char *out_base_ptr = output_ptr + output_offset_first_element_in_bytes + idx_out_c * sizeof(DATA_TYPE) + idx_out_w * output_stride_y + idx_out_h * output_stride_z + idx_out_n *
+                                           output_stride_w;
+
+    VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE)
+    res0 = INITIAL_VALUE;
+
+#if POOL_SIZE_X == SRC_WIDTH && POOL_SIZE_Y == SRC_HEIGHT
+    // Global pooling path
+
+    int filter_size = POOL_SIZE_X * POOL_SIZE_Y;
+
+#pragma unroll 8
+    for(int y = 0; y < POOL_SIZE_X * POOL_SIZE_Y; ++y)
+    {
+        VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE)
+        data0;
+#if defined(FP_MIXED_PRECISION)
+        // In case of FP_MIXED_PRECISION, ACC_DATA_TYPE is != DATA_TYPE
+        data0 = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(in_base_ptr)), VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE));
+#else // defined(FP_MIXED_PRECISION)
+        data0 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(in_base_ptr));
+#endif // defined(FP_MIXED_PRECISION)
+
+#if defined(POOL_L2)
+        // Raise to power of 2 for L2 Pooling
+        data0 *= data0;
+#endif // defined(POOL_L2)
+
+        res0 = POOL_OP(res0, data0);
+
+        in_base_ptr += input_stride_y;
+    }
+#else // POOL_SIZE_X == SRC_WIDTH && POOL_SIZE_Y == SRC_HEIGHT
 
-    __global unsigned char *in_base_ptr = input_ptr + input_offset_first_element_in_bytes +
-                                          offset_c +
-                                          idx_out_n * input_stride_w;
+    int idx_in_w = idx_out_w * STRIDE_X - PAD_X;
+    int idx_in_h = idx_out_h * STRIDE_Y - PAD_Y;
 
-    __global unsigned char *out_base_ptr = output_ptr + output_offset_first_element_in_bytes +
-                                           offset_c +
-                                           idx_out_w * output_stride_y +
-                                           idx_out_h * output_stride_z +
-                                           idx_out_n * output_stride_w;
+    int pool_x_s = max((int)0, -idx_in_w);
+    int pool_x_e = min((int)POOL_SIZE_X, (int)SRC_WIDTH - idx_in_w);
+    int pool_y_s = max((int)0, -idx_in_h);
+    int pool_y_e = min((int)POOL_SIZE_Y, (int)SRC_HEIGHT - idx_in_h);
 
-#if ((defined(POOL_AVG) || defined(POOL_L2)))
 #if defined(EXCLUDE_PADDING)
-    int filter_size = 0;
-#else  // defined(EXCLUDE_PADDING)
+    int filter_size = (pool_y_e - pool_y_s) * (pool_x_e - pool_x_s);
+#else // defined(EXCLUDE_PADDING)
     int filter_size = POOL_SIZE_X * POOL_SIZE_Y;
 #endif // defined(EXCLUDE_PADDING)
-#endif // ((defined(POOL_AVG) || defined(POOL_L2)))
-
-    VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE)
-    res0 = INITIAL_VALUE;
 
     for(int y = pool_y_s; y <
pool_y_e; ++y) { for(int x = pool_x_s; x < pool_x_e; ++x) { - VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE) data0; + VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE) + data0; #if defined(FP_MIXED_PRECISION) // In case of FP_MIXED_PRECISION, ACC_DATA_TYPE is != DATA_TYPE data0 = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(in_base_ptr + (x + idx_in_w) * input_stride_y + (y + idx_in_h) * input_stride_z)), VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE)); -#else // defined(FP_MIXED_PRECISION) - data0 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(in_base_ptr + (x + idx_in_w) * input_stride_y + (y + idx_in_h) * input_stride_z)); +#else // defined(FP_MIXED_PRECISION) + data0 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(in_base_ptr + (x + idx_in_w) * input_stride_y + (y + idx_in_h) * input_stride_z)); #endif // defined(FP_MIXED_PRECISION) #if defined(POOL_L2) @@ -760,15 +784,13 @@ __kernel void pooling_layer_MxN_nhwc( data0 *= data0; #endif // defined(POOL_L2) res0 = POOL_OP(res0, data0); - -#if ((defined(POOL_AVG) || defined(POOL_L2))) && defined(EXCLUDE_PADDING) - filter_size++; -#endif // ((defined(POOL_AVG) || defined(POOL_L2))) && defined(EXCLUDE_PADDING) } } +#endif // POOL_SIZE_X == SRC_WIDTH && POOL_SIZE_Y == SRC_HEIGHT + #if defined(POOL_AVG) || defined(POOL_L2) - res0 /= (VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE))filter_size; + res0 /= (VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE))filter_size; #endif // defined(POOL_AVG) || defined(POOL_L2) #if defined(POOL_L2) @@ -778,9 +800,10 @@ __kernel void pooling_layer_MxN_nhwc( // Store result #if defined(FP_MIXED_PRECISION) - VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) res_converted0 = CONVERT(res0, VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)); + VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) + res_converted0 = CONVERT(res0, VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)); STORE_VECTOR_SELECT(res_converted, DATA_TYPE, out_base_ptr, VEC_SIZE, VEC_SIZE_LEFTOVER, (VEC_SIZE_LEFTOVER != 0) && get_global_id(0) == 0); -#else // defined(FP_MIXED_PRECISION) +#else // defined(FP_MIXED_PRECISION) STORE_VECTOR_SELECT(res, DATA_TYPE, out_base_ptr, VEC_SIZE, VEC_SIZE_LEFTOVER, (VEC_SIZE_LEFTOVER != 0) && get_global_id(0) == 0); #endif // defined(FP_MIXED_PRECISION) } @@ -853,23 +876,18 @@ __kernel void pooling_layer_2x2_nhwc( // If batch size != 1, the batch size dimension is collapsed over the height dimension int idx_out_h = get_global_id(2) % DST_HEIGHT; int idx_out_n = get_global_id(2) / DST_HEIGHT; -#else //SRC_BATCH_SIZE != 1 +#else //SRC_BATCH_SIZE != 1 int idx_out_h = get_global_id(2); int idx_out_n = 0; #endif // SRC_BATCH_SIZE != 1 - int idx_in_w = idx_out_w * STRIDE_X - PAD_X; - int idx_in_h = idx_out_h * STRIDE_Y - PAD_Y; + int idx_in_w = idx_out_w * STRIDE_X - PAD_X; + int idx_in_h = idx_out_h * STRIDE_Y - PAD_Y; - __global unsigned char *in_base_ptr = input_ptr + input_offset_first_element_in_bytes + - idx_out_c * sizeof(DATA_TYPE) + - idx_out_n * input_stride_w; + __global unsigned char *in_base_ptr = input_ptr + input_offset_first_element_in_bytes + idx_out_c * sizeof(DATA_TYPE) + idx_out_n * input_stride_w; - __global unsigned char *out_base_ptr = output_ptr + output_offset_first_element_in_bytes + - idx_out_c * sizeof(DATA_TYPE) + - idx_out_w * output_stride_y + - idx_out_h * output_stride_z + - idx_out_n * output_stride_w; + __global unsigned char *out_base_ptr = output_ptr + output_offset_first_element_in_bytes + idx_out_c * sizeof(DATA_TYPE) + idx_out_w * output_stride_y + idx_out_h * output_stride_z + idx_out_n * + output_stride_w; int pool_x_s = max((int)0, -idx_in_w); int pool_x_e = min((int)2, 
(int)SRC_WIDTH - idx_in_w); @@ -891,11 +909,11 @@ __kernel void pooling_layer_2x2_nhwc( data1 = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(in_base_ptr + x1 * input_stride_y + y0 * input_stride_z)), VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE)); data2 = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(in_base_ptr + x0 * input_stride_y + y1 * input_stride_z)), VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE)); data3 = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(in_base_ptr + x1 * input_stride_y + y1 * input_stride_z)), VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE)); -#else // defined(FP_MIXED_PRECISION) - data0 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(in_base_ptr + x0 * input_stride_y + y0 * input_stride_z)); - data1 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(in_base_ptr + x1 * input_stride_y + y0 * input_stride_z)); - data2 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(in_base_ptr + x0 * input_stride_y + y1 * input_stride_z)); - data3 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(in_base_ptr + x1 * input_stride_y + y1 * input_stride_z)); +#else // defined(FP_MIXED_PRECISION) + data0 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(in_base_ptr + x0 * input_stride_y + y0 * input_stride_z)); + data1 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(in_base_ptr + x1 * input_stride_y + y0 * input_stride_z)); + data2 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(in_base_ptr + x0 * input_stride_y + y1 * input_stride_z)); + data3 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(in_base_ptr + x1 * input_stride_y + y1 * input_stride_z)); #endif // defined(FP_MIXED_PRECISION) #if !defined(POOL_MAX) @@ -931,7 +949,7 @@ __kernel void pooling_layer_2x2_nhwc( #if defined(POOL_AVG) || defined(POOL_L2) #if defined(EXCLUDE_PADDING) res0 /= (VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE))filter_size; -#else // !defined(EXCLUDE_PADDING) +#else // !defined(EXCLUDE_PADDING) res0 /= (VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE))4; #endif // defined(EXCLUDE_PADDING) #endif // defined(POOL_AVG) || defined(POOL_L2) @@ -943,9 +961,10 @@ __kernel void pooling_layer_2x2_nhwc( // Store result #if defined(FP_MIXED_PRECISION) - VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) res_converted0 = CONVERT(res0, VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)); + VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) + res_converted0 = CONVERT(res0, VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)); STORE_VECTOR_SELECT(res_converted, DATA_TYPE, out_base_ptr, VEC_SIZE, VEC_SIZE_LEFTOVER, (VEC_SIZE_LEFTOVER != 0) && get_global_id(0) == 0); -#else // defined(FP_MIXED_PRECISION) +#else // defined(FP_MIXED_PRECISION) STORE_VECTOR_SELECT(res, DATA_TYPE, out_base_ptr, VEC_SIZE, VEC_SIZE_LEFTOVER, (VEC_SIZE_LEFTOVER != 0) && get_global_id(0) == 0); #endif // defined(FP_MIXED_PRECISION) @@ -955,24 +974,26 @@ __kernel void pooling_layer_2x2_nhwc( // Note: DST_CHANNELS and DST_BATCH_SIZE can be used for either the input and output tensor // note: Batch dimension does not contribute in the offset contribution - VEC_DATA_TYPE(uint, VEC_SIZE) base_index = (uint)idx_out_c; + VEC_DATA_TYPE(uint, VEC_SIZE) + base_index = (uint)idx_out_c; base_index += VEC_OFFS(uint, VEC_SIZE); - VEC_DATA_TYPE(uint, VEC_SIZE) index0 = base_index + (uint)x0 * DST_CHANNELS + (uint)y0 * (DST_CHANNELS * SRC_WIDTH); - VEC_DATA_TYPE(uint, VEC_SIZE) index1 = base_index + (uint)x1 * DST_CHANNELS + (uint)y0 * (DST_CHANNELS * SRC_WIDTH); - VEC_DATA_TYPE(uint, VEC_SIZE) index2 = base_index + (uint)x0 * DST_CHANNELS + (uint)y1 * (DST_CHANNELS * SRC_WIDTH); - VEC_DATA_TYPE(uint, VEC_SIZE) index3 = base_index + (uint)x1 * DST_CHANNELS + (uint)y1 * (DST_CHANNELS * SRC_WIDTH); 
+    VEC_DATA_TYPE(uint, VEC_SIZE)
+    index0 = base_index + (uint)x0 * DST_CHANNELS + (uint)y0 * (DST_CHANNELS * SRC_WIDTH);
+    VEC_DATA_TYPE(uint, VEC_SIZE)
+    index1 = base_index + (uint)x1 * DST_CHANNELS + (uint)y0 * (DST_CHANNELS * SRC_WIDTH);
+    VEC_DATA_TYPE(uint, VEC_SIZE)
+    index2 = base_index + (uint)x0 * DST_CHANNELS + (uint)y1 * (DST_CHANNELS * SRC_WIDTH);
+    VEC_DATA_TYPE(uint, VEC_SIZE)
+    index3 = base_index + (uint)x1 * DST_CHANNELS + (uint)y1 * (DST_CHANNELS * SRC_WIDTH);
 
     index0 = select(index1, index0, CONVERT(isgreaterequal(data0, data1), VEC_DATA_TYPE(int, VEC_SIZE)));
     index1 = select(index3, index2, CONVERT(isgreaterequal(data2, data3), VEC_DATA_TYPE(int, VEC_SIZE)));
     index0 = select(index1, index0, CONVERT(isgreaterequal(max(data0, data1), max(data2, data3)), VEC_DATA_TYPE(int, VEC_SIZE)));
 
-    __global unsigned char *idx_base_ptr = indices_ptr + indices_offset_first_element_in_bytes +
-                                           idx_out_c * sizeof(uint) +
-                                           idx_out_w * indices_stride_y +
-                                           idx_out_h * indices_stride_z +
-                                           idx_out_n * indices_stride_w;
+    __global unsigned char *idx_base_ptr = indices_ptr + indices_offset_first_element_in_bytes + idx_out_c * sizeof(uint) + idx_out_w * indices_stride_y + idx_out_h * indices_stride_z + idx_out_n *
+                                           indices_stride_w;
 
     // Store result
     STORE_VECTOR_SELECT(index, uint, idx_base_ptr, VEC_SIZE, VEC_SIZE_LEFTOVER, ((VEC_SIZE_LEFTOVER != 0) && get_global_id(0) == 0));
diff --git a/src/core/gpu/cl/kernels/ClPoolingKernel.cpp b/src/core/gpu/cl/kernels/ClPoolingKernel.cpp
index 78243402bf..a432877a1d 100644
--- a/src/core/gpu/cl/kernels/ClPoolingKernel.cpp
+++ b/src/core/gpu/cl/kernels/ClPoolingKernel.cpp
@@ -173,9 +173,11 @@ std::tuple<Status, Window> validate_and_configure_window(ITenso
         }
         case DataLayout::NHWC:
         {
+            const size_t vec_size = dst->data_type() == DataType::F32 ? 2 : 4;
+
             // Initialize border size
             border_size                       = BorderSize();
-            num_elems_processed_per_iteration = adjust_vec_size(4, dst->dimension(0));
+            num_elems_processed_per_iteration = adjust_vec_size(vec_size, dst->dimension(0));
             win                               = calculate_max_window(*dst, Steps(num_elems_processed_per_iteration));
             break;
         }
diff --git a/tests/validation/CL/PoolingLayer.cpp b/tests/validation/CL/PoolingLayer.cpp
index f42c187f8f..0153e659ae 100644
--- a/tests/validation/CL/PoolingLayer.cpp
+++ b/tests/validation/CL/PoolingLayer.cpp
@@ -159,12 +159,12 @@ FIXTURE_DATA_TEST_CASE(RunSmall, CLPoolingLayerFixture<float>, framework::Datase
     validate(CLAccessor(_target), _reference, tolerance_f32);
 }
 FIXTURE_DATA_TEST_CASE(RunMixedDataLayout, CLPoolingLayerMixedDataLayoutFixture<float>, framework::DatasetMode::PRECOMMIT, combine(combine(datasets::SmallShapes(),
-                       combine(combine(combine(combine(datasets::PoolingTypes(),
-                       framework::dataset::make("PoolingSize", { Size2D(2, 2) })),
-                       framework::dataset::make("PadStride", { PadStrideInfo(2, 1, 0, 0) })),
-                       framework::dataset::make("ExcludePadding", { false })),
-                       framework::dataset::make("DataType", DataType::F32))),
-                       pool_data_layout_dataset))
+                                                       combine(combine(combine(combine(datasets::PoolingTypes(),
+                                                                                       framework::dataset::make("PoolingSize", { Size2D(2, 2) })),
+                                                                       framework::dataset::make("PadStride", { PadStrideInfo(2, 1, 0, 0) })),
+                                                               framework::dataset::make("ExcludePadding", { false })),
+                                                       framework::dataset::make("DataType", DataType::F32))),
+                                                       pool_data_layout_dataset))
 {
     // Validate output
     validate(CLAccessor(_target), _reference, tolerance_f32);
@@ -188,6 +188,44 @@ FIXTURE_DATA_TEST_CASE(RunSmallIndices, CLPoolingLayerIndicesFixture<float>, fra
     validate(CLAccessor(_target_indices), _ref_indices);
 }
 
+TEST_SUITE(GlobalPooling)
+// *INDENT-OFF*
+// clang-format off
+FIXTURE_DATA_TEST_CASE(RunSmall, CLPoolingLayerFixture<float>, framework::DatasetMode::ALL,
+                       combine(combine(combine(combine(combine(combine(
+                               framework::dataset::make("InputShape", { TensorShape(27U, 13U, 2U),
+                                                                        TensorShape(27U, 13U, 2U, 4U)
+                                                                      }),
+                               framework::dataset::make("PoolingType", { PoolingType::AVG, PoolingType::L2, PoolingType::MAX })),
+                               framework::dataset::make("PoolingSize", { Size2D(27, 13) })),
+                               framework::dataset::make("PadStride", PadStrideInfo(1, 1, 0, 0))),
+                               framework::dataset::make("ExcludePadding", false)),
+                               framework::dataset::make("DataType", DataType::F32)),
+                               framework::dataset::make("DataLayout", DataLayout::NHWC)))
+{
+    // Validate output
+    validate(CLAccessor(_target), _reference, tolerance_f32);
+}
+
+FIXTURE_DATA_TEST_CASE(RunLarge, CLPoolingLayerFixture<float>, framework::DatasetMode::NIGHTLY,
+                       combine(combine(combine(combine(combine(combine(
+                               framework::dataset::make("InputShape", { TensorShape(79U, 37U, 11U),
+                                                                        TensorShape(79U, 37U, 11U, 4U)
+                                                                      }),
+                               framework::dataset::make("PoolingType", { PoolingType::AVG, PoolingType::L2, PoolingType::MAX })),
+                               framework::dataset::make("PoolingSize", { Size2D(79, 37) })),
+                               framework::dataset::make("PadStride", PadStrideInfo(1, 1, 0, 0))),
+                               framework::dataset::make("ExcludePadding", false)),
+                               framework::dataset::make("DataType", DataType::F32)),
+                               framework::dataset::make("DataLayout", DataLayout::NHWC)))
+{
+    // Validate output
+    validate(CLAccessor(_target), _reference, tolerance_f32);
+}
+// clang-format on
+// *INDENT-ON*
+TEST_SUITE_END() // GlobalPooling
+
 TEST_SUITE_END() // FP32
 
 TEST_SUITE(FP16)
@@ -216,6 +254,45 @@ FIXTURE_DATA_TEST_CASE(RunSmallIndices, CLPoolingLayerIndicesFixture<half>, fram
     validate(CLAccessor(_target), _reference, tolerance_f32);
     validate(CLAccessor(_target_indices), _ref_indices);
 }
+
+TEST_SUITE(GlobalPooling)
+// *INDENT-OFF*
+// clang-format off
+FIXTURE_DATA_TEST_CASE(RunSmall, CLPoolingLayerFixture<half>, framework::DatasetMode::ALL,
+                       combine(combine(combine(combine(combine(combine(
+                               framework::dataset::make("InputShape", { TensorShape(27U, 13U, 2U),
+                                                                        TensorShape(27U, 13U, 2U, 4U)
+                                                                      }),
+                               framework::dataset::make("PoolingType", { PoolingType::AVG, PoolingType::L2, PoolingType::MAX })),
+                               framework::dataset::make("PoolingSize", { Size2D(27, 13) })),
+                               framework::dataset::make("PadStride", PadStrideInfo(1, 1, 0, 0))),
+                               framework::dataset::make("ExcludePadding", false)),
+                               framework::dataset::make("DataType", DataType::F16)),
+                               framework::dataset::make("DataLayout", DataLayout::NHWC)))
+{
+    // Validate output
+    validate(CLAccessor(_target), _reference, tolerance_f16);
+}
+
+FIXTURE_DATA_TEST_CASE(RunLarge, CLPoolingLayerFixture<half>, framework::DatasetMode::NIGHTLY,
+                       combine(combine(combine(combine(combine(combine(
+                               framework::dataset::make("InputShape", { TensorShape(79U, 37U, 11U),
+                                                                        TensorShape(79U, 37U, 11U, 4U)
+                                                                      }),
+                               framework::dataset::make("PoolingType", { PoolingType::AVG, PoolingType::L2, PoolingType::MAX })),
+                               framework::dataset::make("PoolingSize", { Size2D(79, 37) })),
+                               framework::dataset::make("PadStride", PadStrideInfo(1, 1, 0, 0))),
+                               framework::dataset::make("ExcludePadding", false)),
+                               framework::dataset::make("DataType", DataType::F16)),
+                               framework::dataset::make("DataLayout", DataLayout::NHWC)))
+{
+    // Validate output
+    validate(CLAccessor(_target), _reference, tolerance_f16);
+}
+// clang-format on
+// *INDENT-ON*
+TEST_SUITE_END() // GlobalPooling
+
 TEST_SUITE_END() // FP16
 
 TEST_SUITE_END() // Float
@@ -238,14 +315,14 @@ FIXTURE_DATA_TEST_CASE(RunSmall, CLPoolingLayerQuantizedFixture<uint8_t>, framew
     validate(CLAccessor(_target), _reference, tolerance_qasymm8);
 }
 FIXTURE_DATA_TEST_CASE(RunMixedDataLayout, CLPoolingLayerQuantizedMixedDataLayoutFixture<uint8_t>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(datasets::SmallShapes(),
-                       combine(combine(combine(combine(framework::dataset::make("PoolingType", { PoolingType::MAX, PoolingType::AVG }),
-                       framework::dataset::make("PoolingSize", { Size2D(2, 2) })),
-                       framework::dataset::make("PadStride", { PadStrideInfo(1, 2, 1, 1) })),
-                       framework::dataset::make("ExcludePadding", { true })),
-                       framework::dataset::make("DataType", DataType::QASYMM8))),
-                       framework::dataset::make("DataLayout", { DataLayout::NHWC, DataLayout::NCHW })),
-                       framework::dataset::make("InputQuantInfo", { QuantizationInfo(1.f / 255.f, 10) })),
-                       framework::dataset::make("OutputQuantInfo", { QuantizationInfo(1.f / 255.f, 5) })))
+                                                       combine(combine(combine(combine(framework::dataset::make("PoolingType", { PoolingType::MAX, PoolingType::AVG }),
+                                                                                       framework::dataset::make("PoolingSize", { Size2D(2, 2) })),
+                                                                       framework::dataset::make("PadStride", { PadStrideInfo(1, 2, 1, 1) })),
+                                                               framework::dataset::make("ExcludePadding", { true })),
+                                                       framework::dataset::make("DataType", DataType::QASYMM8))),
+                       framework::dataset::make("DataLayout", { DataLayout::NHWC, DataLayout::NCHW })),
+                       framework::dataset::make("InputQuantInfo", { QuantizationInfo(1.f / 255.f, 10) })),
+                       framework::dataset::make("OutputQuantInfo", { QuantizationInfo(1.f / 255.f, 5) })))
 {
     // Validate output
     validate(CLAccessor(_target), _reference, tolerance_qasymm8);
@@ -264,14 +341,14 @@ FIXTURE_DATA_TEST_CASE(RunSmall, CLPoolingLayerQuantizedFixture<int8_t>, framewo
     validate(CLAccessor(_target), _reference, tolerance_qasymm8_s);
 }
 FIXTURE_DATA_TEST_CASE(RunMixedDataLayout, CLPoolingLayerQuantizedMixedDataLayoutFixture<int8_t>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(datasets::SmallShapes(),
-                       combine(combine(combine(combine(framework::dataset::make("PoolingType", { PoolingType::MAX, PoolingType::AVG }),
-                       framework::dataset::make("PoolingSize", { Size2D(2, 2) })),
-                       framework::dataset::make("PadStride", { PadStrideInfo(1, 2, 1, 1) })),
-                       framework::dataset::make("ExcludePadding", { true })),
-                       framework::dataset::make("DataType", DataType::QASYMM8_SIGNED))),
-                       framework::dataset::make("DataLayout", { DataLayout::NHWC, DataLayout::NCHW })),
-                       framework::dataset::make("InputQuantInfo", { QuantizationInfo(1.f / 127.f, -10) })),
-                       framework::dataset::make("OutputQuantInfo", { QuantizationInfo(1.f / 127.f, -10) })))
+                                                       combine(combine(combine(combine(framework::dataset::make("PoolingType", { PoolingType::MAX, PoolingType::AVG }),
+                                                                                       framework::dataset::make("PoolingSize", { Size2D(2, 2) })),
+                                                                       framework::dataset::make("PadStride", { PadStrideInfo(1, 2, 1, 1) })),
+                                                               framework::dataset::make("ExcludePadding", { true })),
+                                                       framework::dataset::make("DataType", DataType::QASYMM8_SIGNED))),
+                       framework::dataset::make("DataLayout", { DataLayout::NHWC, DataLayout::NCHW })),
+                       framework::dataset::make("InputQuantInfo", { QuantizationInfo(1.f / 127.f, -10) })),
+                       framework::dataset::make("OutputQuantInfo", { QuantizationInfo(1.f / 127.f, -10) })))
 {
     // Validate output
     validate(CLAccessor(_target), _reference, tolerance_qasymm8_s);
diff --git a/tests/validation/Validation.h b/tests/validation/Validation.h
index a75562bac2..f1ce0fecc7 100644
--- a/tests/validation/Validation.h
+++ b/tests/validation/Validation.h
@@ -159,11 +159,13 @@ bool compare_dimensions(const Dimensions<T> &dimensions1, const Dimensions<T> &d
     {
         // In case a 1D/2D shape becomes 3D after permutation, the permuted tensor will have two/one dimension(s) more and the first (two) value(s) will be 1
         // clang-format off
-        if((dimensions1.num_dimensions() != dimensions2.num_dimensions()) &&
-           ((dimensions1.num_dimensions() != (dimensions2.num_dimensions() + 1)) || (dimensions1.x() != 1)) &&
-           ((dimensions1.num_dimensions() != (dimensions2.num_dimensions() + 2)) || (dimensions1.x() != 1) || (dimensions1.y() != 1)))
+        const auto max_dims = std::max(dimensions1.num_dimensions(), dimensions2.num_dimensions());
+        for(unsigned int i = 3; i < max_dims; ++i)
         {
-            return false;
+            if(dimensions1[i] != dimensions2[i])
+            {
+                return false;
+            }
         }
         // clang-format on
 
@@ -171,14 +173,6 @@
         {
            return false;
        }
-
-        for(unsigned int i = 3; i < dimensions1.num_dimensions(); ++i)
-        {
-            if(dimensions1[i] != dimensions2[i])
-            {
-                return false;
-            }
-        }
     }
 
     return true;
--
cgit v1.2.1
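Note (illustrative, not part of the patch): the compare_dimensions rework above replaces the rank-equality check with a comparison of dimensions above index 2 across the larger of the two ranks, because permuting a shape between layouts can change how many dimensions it reports (1,1,2,1 in NCHW becomes 2,1,1,1 in NHWC, which collapses to rank 1). A minimal self-contained C++ sketch, assuming a Dimensions-like container whose slots beyond the reported rank read as 1, as TensorShape's do; Dims and compare_dims_above_2 are hypothetical names, not library APIs:

    #include <algorithm>
    #include <array>
    #include <cstddef>
    #include <iostream>

    // Stand-in for the fixed-capacity Dimensions container: slots beyond the
    // reported rank are stored as 1, so they always compare equal.
    struct Dims
    {
        std::array<std::size_t, 6> v;   // dimension values, trailing slots set to 1
        std::size_t                num; // rank, as num_dimensions() would report
        std::size_t operator[](std::size_t i) const { return v[i]; }
    };

    // Mirrors the shape of the new loop: compare only dimensions 3 and above,
    // over the larger rank, instead of requiring both shapes to share a rank.
    bool compare_dims_above_2(const Dims &a, const Dims &b)
    {
        const std::size_t max_dims = std::max(a.num, b.num);
        for(std::size_t i = 3; i < max_dims; ++i)
        {
            if(a[i] != b[i])
            {
                return false;
            }
        }
        return true;
    }

    int main()
    {
        const Dims nchw = { { { 1, 1, 2, 1, 1, 1 } }, 4 }; // 1,1,2,1 (NCHW)
        const Dims nhwc = { { { 2, 1, 1, 1, 1, 1 } }, 1 }; // 2,1,1,1 collapses to rank 1 (NHWC)
        std::cout << std::boolalpha << compare_dims_above_2(nchw, nhwc) << "\n"; // true
    }

Dimensions 0-2 are still validated separately against the layout permutation (the surviving if block in the hunk above); only the trailing-dimension handling changed.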