From 72b56875b9bb30a9ed1d2ad38ec51fc88e435c35 Mon Sep 17 00:00:00 2001
From: Gian Marco Iodice <gianmarco.iodice@arm.com>
Date: Tue, 29 Jun 2021 10:08:46 +0100
Subject: Enable global pooling optimization on OpenCL

- Add loop unrolling on X and use POOL_X and POOL_Y defines for the for
  loop

Resolves COMPMID-4573

Change-Id: I33cb825cfb55912ccb0ab9d03bd33a3dab4c8b44
Signed-off-by: Gian Marco Iodice <gianmarco.iodice@arm.com>
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/5872
Reviewed-by: Georgios Pinitas <georgios.pinitas@arm.com>
Reviewed-by: Giorgio Arena <giorgio.arena@arm.com>
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
---
 src/core/CL/cl_kernels/pooling_layer.cl | 28 +++++++++++++++++++---------
 1 file changed, 19 insertions(+), 9 deletions(-)

(limited to 'src/core/CL/cl_kernels/pooling_layer.cl')

diff --git a/src/core/CL/cl_kernels/pooling_layer.cl b/src/core/CL/cl_kernels/pooling_layer.cl
index 8944c9b1ac..d63a2e51e8 100644
--- a/src/core/CL/cl_kernels/pooling_layer.cl
+++ b/src/core/CL/cl_kernels/pooling_layer.cl
@@ -711,8 +711,8 @@ __kernel void pooling_layer_MxN_nhwc(
     int idx_out_h = GET_SPATIAL_IDX(2, 1, 0) % DST_HEIGHT;
     int idx_out_n = GET_SPATIAL_IDX(2, 1, 0) / DST_HEIGHT;
 #else  //DST_BATCH_SIZE != 1
-    int idx_out_h = GET_SPATIAL_IDX(2, 1, 0);
-    int idx_out_n = 0;
+    int idx_out_h   = GET_SPATIAL_IDX(2, 1, 0);
+    int idx_out_n   = 0;
 #endif // DST_BATCH_SIZE != 1
 
     __global unsigned char *in_base_ptr = input_ptr + input_offset_first_element_in_bytes + idx_out_c * sizeof(DATA_TYPE) + idx_out_n * input_stride_w;
@@ -726,10 +726,10 @@ __kernel void pooling_layer_MxN_nhwc(
     int idx_in_w = idx_out_w * STRIDE_X - PAD_X;
     int idx_in_h = idx_out_h * STRIDE_Y - PAD_Y;
 
-    int pool_x_s    = max((int)0, -idx_in_w);
-    int pool_x_e    = min((int)POOL_SIZE_X, (int)SRC_WIDTH - idx_in_w);
-    int pool_y_s    = max((int)0, -idx_in_h);
-    int pool_y_e    = min((int)POOL_SIZE_Y, (int)SRC_HEIGHT - idx_in_h);
+    int pool_x_s = max((int)0, -idx_in_w);
+    int pool_x_e = min((int)POOL_SIZE_X, (int)SRC_WIDTH - idx_in_w);
+    int pool_y_s = max((int)0, -idx_in_h);
+    int pool_y_e = min((int)POOL_SIZE_Y, (int)SRC_HEIGHT - idx_in_h);
 
 #if defined(EXCLUDE_PADDING)
     int filter_size = (pool_y_e - pool_y_s) * (pool_x_e - pool_x_s);
@@ -737,17 +737,27 @@ __kernel void pooling_layer_MxN_nhwc(
     int filter_size = POOL_SIZE_X * POOL_SIZE_Y;
 #endif // defined(EXCLUDE_PADDING)
 
+#if POOL_SIZE_X == SRC_WIDTH && POOL_SIZE_Y == SRC_HEIGHT && PAD_X == 0 && PAD_Y == 0
+    // Global pooling path
+    for(int y = 0; y < POOL_SIZE_Y; ++y)
+    {
+#pragma unroll 8
+        for(int x = 0; x < POOL_SIZE_X; ++x)
+        {
+#else // POOL_SIZE_X == SRC_WIDTH && POOL_SIZE_Y == SRC_HEIGHT && PAD_X == 0 && PAD_Y == 0
     for(int y = pool_y_s; y < pool_y_e; ++y)
     {
+#pragma unroll 8
         for(int x = pool_x_s; x < pool_x_e; ++x)
         {
+#endif // POOL_SIZE_X == SRC_WIDTH && POOL_SIZE_Y == SRC_HEIGHT && PAD_X == 0 && PAD_Y == 0
             VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE)
             data0;
 #if defined(FP_MIXED_PRECISION)
             // In case of FP_MIXED_PRECISION, ACC_DATA_TYPE is != DATA_TYPE
             data0 = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(in_base_ptr + (x + idx_in_w) * input_stride_y + (y + idx_in_h) * input_stride_z)), VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE));
 #else  // defined(FP_MIXED_PRECISION)
-            data0   = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(in_base_ptr + (x + idx_in_w) * input_stride_y + (y + idx_in_h) * input_stride_z));
+            data0 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(in_base_ptr + (x + idx_in_w) * input_stride_y + (y + idx_in_h) * input_stride_z));
 #endif // defined(FP_MIXED_PRECISION)
 
 #if defined(POOL_L2)
@@ -757,9 +767,9 @@ __kernel void pooling_layer_MxN_nhwc(
             res0 = POOL_OP(res0, data0);
         }
     }
-    
+
 #if defined(POOL_AVG) || defined(POOL_L2)
-            res0 /= (VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE))filter_size;
+    res0 /= (VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE))filter_size;
 #endif // defined(POOL_AVG) || defined(POOL_L2)
 
 #if defined(POOL_L2)
-- 
cgit v1.2.1