about summary refs log tree commit diff
diff options
context:
space:
mode:
authorGian Marco Iodice <gianmarco.iodice@arm.com>2021-06-29 10:08:46 +0100
committerGian Marco Iodice <gianmarco.iodice@arm.com>2021-06-29 14:09:52 +0000
commit72b56875b9bb30a9ed1d2ad38ec51fc88e435c35 (patch)
tree0ce37505da5cfd8a4b75000432d66e75a2b48a32
parent93b75e0c072c3cc5654fcdf6aed1068b40012081 (diff)
downloadComputeLibrary-72b56875b9bb30a9ed1d2ad38ec51fc88e435c35.tar.gz
Enable global pooling optimization on OpenCL
- Add loop unrolling on X and use POOL_X and POOL_Y defines for the for loop Resolves COMPMID-4573 Change-Id: I33cb825cfb55912ccb0ab9d03bd33a3dab4c8b44 Signed-off-by: Gian Marco Iodice <gianmarco.iodice@arm.com> Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/5872 Reviewed-by: Georgios Pinitas <georgios.pinitas@arm.com> Reviewed-by: Giorgio Arena <giorgio.arena@arm.com> Tested-by: Arm Jenkins <bsgcomp@arm.com> Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
-rw-r--r--src/core/CL/cl_kernels/pooling_layer.cl28
-rw-r--r--tests/validation/Helpers.cpp24
-rw-r--r--tests/validation/Helpers.h10
3 files changed, 53 insertions, 9 deletions
diff --git a/src/core/CL/cl_kernels/pooling_layer.cl b/src/core/CL/cl_kernels/pooling_layer.cl
index 8944c9b1ac..d63a2e51e8 100644
--- a/src/core/CL/cl_kernels/pooling_layer.cl
+++ b/src/core/CL/cl_kernels/pooling_layer.cl
@@ -711,8 +711,8 @@ __kernel void pooling_layer_MxN_nhwc(
int idx_out_h = GET_SPATIAL_IDX(2, 1, 0) % DST_HEIGHT;
int idx_out_n = GET_SPATIAL_IDX(2, 1, 0) / DST_HEIGHT;
#else //DST_BATCH_SIZE != 1
- int idx_out_h = GET_SPATIAL_IDX(2, 1, 0);
- int idx_out_n = 0;
+ int idx_out_h = GET_SPATIAL_IDX(2, 1, 0);
+ int idx_out_n = 0;
#endif // DST_BATCH_SIZE != 1
__global unsigned char *in_base_ptr = input_ptr + input_offset_first_element_in_bytes + idx_out_c * sizeof(DATA_TYPE) + idx_out_n * input_stride_w;
@@ -726,10 +726,10 @@ __kernel void pooling_layer_MxN_nhwc(
int idx_in_w = idx_out_w * STRIDE_X - PAD_X;
int idx_in_h = idx_out_h * STRIDE_Y - PAD_Y;
- int pool_x_s = max((int)0, -idx_in_w);
- int pool_x_e = min((int)POOL_SIZE_X, (int)SRC_WIDTH - idx_in_w);
- int pool_y_s = max((int)0, -idx_in_h);
- int pool_y_e = min((int)POOL_SIZE_Y, (int)SRC_HEIGHT - idx_in_h);
+ int pool_x_s = max((int)0, -idx_in_w);
+ int pool_x_e = min((int)POOL_SIZE_X, (int)SRC_WIDTH - idx_in_w);
+ int pool_y_s = max((int)0, -idx_in_h);
+ int pool_y_e = min((int)POOL_SIZE_Y, (int)SRC_HEIGHT - idx_in_h);
#if defined(EXCLUDE_PADDING)
int filter_size = (pool_y_e - pool_y_s) * (pool_x_e - pool_x_s);
@@ -737,17 +737,27 @@ __kernel void pooling_layer_MxN_nhwc(
int filter_size = POOL_SIZE_X * POOL_SIZE_Y;
#endif // defined(EXCLUDE_PADDING)
+#if POOL_SIZE_X == SRC_WIDTH && POOL_SIZE_Y == SRC_HEIGHT && PAD_X == 0 && PAD_Y == 0
+ // Global pooling path
+ for(int y = 0; y < POOL_SIZE_Y; ++y)
+ {
+#pragma unroll 8
+ for(int x = 0; x < POOL_SIZE_X; ++x)
+ {
+#else // POOL_SIZE_X == SRC_WIDTH && POOL_SIZE_Y == SRC_HEIGHT && PAD_X == 0 && PAD_Y == 0
for(int y = pool_y_s; y < pool_y_e; ++y)
{
+#pragma unroll 8
for(int x = pool_x_s; x < pool_x_e; ++x)
{
+#endif // POOL_SIZE_X == SRC_WIDTH && POOL_SIZE_Y == SRC_HEIGHT && PAD_X == 0 && PAD_Y == 0
VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE)
data0;
#if defined(FP_MIXED_PRECISION)
// In case of FP_MIXED_PRECISION, ACC_DATA_TYPE is != DATA_TYPE
data0 = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(in_base_ptr + (x + idx_in_w) * input_stride_y + (y + idx_in_h) * input_stride_z)), VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE));
#else // defined(FP_MIXED_PRECISION)
- data0 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(in_base_ptr + (x + idx_in_w) * input_stride_y + (y + idx_in_h) * input_stride_z));
+ data0 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(in_base_ptr + (x + idx_in_w) * input_stride_y + (y + idx_in_h) * input_stride_z));
#endif // defined(FP_MIXED_PRECISION)
#if defined(POOL_L2)
@@ -757,9 +767,9 @@ __kernel void pooling_layer_MxN_nhwc(
res0 = POOL_OP(res0, data0);
}
}
-
+
#if defined(POOL_AVG) || defined(POOL_L2)
- res0 /= (VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE))filter_size;
+ res0 /= (VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE))filter_size;
#endif // defined(POOL_AVG) || defined(POOL_L2)
#if defined(POOL_L2)
diff --git a/tests/validation/Helpers.cpp b/tests/validation/Helpers.cpp
index 0f5d5c5101..237a5a517c 100644
--- a/tests/validation/Helpers.cpp
+++ b/tests/validation/Helpers.cpp
@@ -349,6 +349,30 @@ void add_padding_x(std::initializer_list<ITensor *> tensors, const DataLayout &d
}
}
+void add_padding_y(std::initializer_list<ITensor *> tensors, const DataLayout &data_layout)
+{
+ if(data_layout == DataLayout::NHWC)
+ {
+ constexpr unsigned int lower = 1U;
+ constexpr unsigned int upper = 4U;
+
+ std::uniform_int_distribution<unsigned int> distribution(lower, upper);
+ size_t seed_offset = 0;
+
+ for(ITensor *tensor : tensors)
+ {
+ ARM_COMPUTE_ERROR_ON(!tensor->info()->is_resizable());
+
+ std::mt19937 gen(library->seed() + seed_offset++);
+
+ const unsigned int top = distribution(gen);
+ const unsigned int bottom = distribution(gen);
+
+ tensor->info()->extend_padding(PaddingSize(top, 0U, bottom, 0U));
+ }
+ }
+}
+
template void get_tile(const SimpleTensor<float> &in, SimpleTensor<float> &roi, const Coordinates &coord);
template void get_tile(const SimpleTensor<half> &in, SimpleTensor<half> &roi, const Coordinates &coord);
template void get_tile(const SimpleTensor<int> &in, SimpleTensor<int> &roi, const Coordinates &coord);
diff --git a/tests/validation/Helpers.h b/tests/validation/Helpers.h
index 00e588e7b7..a8804ad7e7 100644
--- a/tests/validation/Helpers.h
+++ b/tests/validation/Helpers.h
@@ -240,6 +240,16 @@ std::pair<int, int> get_symm_quantized_per_channel_bounds(const QuantizationInfo
* @note This function adds padding to the input tensors only if data_layout == DataLayout::NHWC
*/
void add_padding_x(std::initializer_list<ITensor *> tensors, const DataLayout &data_layout = DataLayout::NHWC, bool only_right_pad = false);
+
+/** Add random padding along the Y axis (between 1 and 4 rows per side) to all the input tensors.
+ * This is used in our validation suite in order to simulate implicit padding addition after configuring, but before allocating.
+ *
+ * @param[in] tensors List of tensors to add padding to
+ * @param[in] data_layout (Optional) Data layout of the operator
+ *
+ * @note This function adds padding to the input tensors only if data_layout == DataLayout::NHWC
+ */
+void add_padding_y(std::initializer_list<ITensor *> tensors, const DataLayout &data_layout = DataLayout::NHWC);
} // namespace validation
} // namespace test
} // namespace arm_compute