From 61ef5bf586606d6282526641cf2244121d07c6fd Mon Sep 17 00:00:00 2001
From: Diego Lopez Recas
Date: Mon, 11 Dec 2017 12:36:55 +0000
Subject: IVGCVSW-847 Fix {NEON/CL}PoolingLayerKernel config

Also, add validation test that hits the discovered failure for CL.

Change-Id: I5573e0a3f169b85d5fb7299e7c48d74be7165208
Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/112717
Tested-by: Jenkins
Reviewed-by: Anthony Barbier
---
 src/core/CL/kernels/CLPoolingLayerKernel.cpp   | 51 +++++++++++---------------
 src/core/NEON/kernels/NEPoolingLayerKernel.cpp | 23 +++++-------
 tests/framework/datasets/JoinDataset.h         | 12 +++---
 tests/validation/CL/PoolingLayer.cpp           | 17 +++++++--
 4 files changed, 52 insertions(+), 51 deletions(-)

diff --git a/src/core/CL/kernels/CLPoolingLayerKernel.cpp b/src/core/CL/kernels/CLPoolingLayerKernel.cpp
index ac368c77ef..860cc92266 100644
--- a/src/core/CL/kernels/CLPoolingLayerKernel.cpp
+++ b/src/core/CL/kernels/CLPoolingLayerKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -123,39 +123,26 @@ std::tuple validate_and_configure_window(ITenso
     const int input_width  = input->dimension(0);
     const int input_height = input->dimension(1);

-    unsigned int num_elems_processed_per_iteration = 1;
+    // Change the number of elements processed per iteration
+    // for pooling 3x3 with stride less equal than 3
+    const bool         can_optimize                      = (pool_size == 3) && (pool_stride_x <= 3) && !is_data_type_quantized(data_type);
+    const unsigned int num_elems_processed_per_iteration = can_optimize ? 4 : 1;
+    const int          num_elems_read_per_iteration      = (num_elems_processed_per_iteration - 1) * pool_stride_x + pool_size;

-    if((pool_size == 3) && !is_data_type_quantized_asymmetric(data_type))
-    {
-        const bool is_pool3x3_stride_le3 = (pool_size == 3) && (pool_stride_x <= 3) && !is_data_type_fixed_point(data_type);
-
-        int num_elems_read_per_iteration = pool_size;
-        if(is_pool3x3_stride_le3)
-        {
-            // Change the number of elements processed and the number of elements read per iteration
-            // for pooling 3x3 with stride less equal than 3
-            num_elems_processed_per_iteration = 4;
-            num_elems_read_per_iteration      = pool_size * (pool_stride_x + 1);
-        }
+    // Number of iterations in X dimension
+    const int num_iterations_x = (pooled_w + num_elems_processed_per_iteration - 1) / num_elems_processed_per_iteration;

-        const int upper_bound_w = ((pooled_w - 1) * pool_stride_x - pool_pad_x + num_elems_read_per_iteration) - input_width;
-        const int upper_bound_h = ((pooled_h - 1) * pool_stride_y - pool_pad_y + pool_size) - input_height;
+    // Upper limit for the number of right/bottom border elements that are accessed
+    const int upper_bound_w = ((num_iterations_x - 1) * num_elems_processed_per_iteration * pool_stride_x - pool_pad_x + num_elems_read_per_iteration) - input_width;
+    const int upper_bound_h = ((pooled_h - 1) * pool_stride_y - pool_pad_y + pool_size) - input_height;

-        border_size.right  = std::max(upper_bound_w, pool_pad_x);
-        border_size.bottom = std::max(upper_bound_h, pool_pad_y);
-    }
-    else
-    {
-        const int upper_bound_w = ((pooled_w - 1) * pool_stride_x - pool_pad_x + pool_size) - input_width;
-        const int upper_bound_h = ((pooled_h - 1) * pool_stride_y - pool_pad_y + pool_size) - input_height;
-
-        border_size.right  = std::max(upper_bound_w, pool_pad_x);
-        border_size.bottom = std::max(upper_bound_h, pool_pad_y);
-    }
+    border_size.right  = std::max(upper_bound_w, pool_pad_x);
+    border_size.bottom = std::max(upper_bound_h, pool_pad_y);
     Window win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration));

-    AccessWindowRectangle input_access(input, -pool_pad_x, -pool_pad_y, input_width + border_size.right, input_height + border_size.bottom);
+    AccessWindowRectangle input_access(input, -pool_pad_x, -pool_pad_y, num_elems_read_per_iteration, pool_size,
+                                       pool_stride_x * num_elems_processed_per_iteration, pool_stride_y);
     AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
     bool window_changed = update_window_and_padding(win, input_access, output_access);
     output_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape()));
@@ -305,8 +292,12 @@ void CLPoolingLayerKernel::run(const Window &window, cl::CommandQueue &queue)
     {
         // Upsample input by pool size
         Window in_slice(slice);
-        in_slice.set(Window::DimX, Window::Dimension(in_slice.x().start() - pool_pad_x, in_slice.x().end() * pool_stride_x, pool_stride_x * _num_elems_processed_per_iteration));
-        in_slice.set(Window::DimY, Window::Dimension(in_slice.y().start() - pool_pad_y, in_slice.y().end() * pool_stride_y, pool_stride_y));
+        in_slice.set(Window::DimX, Window::Dimension(in_slice.x().start() - pool_pad_x,
+                                                     (in_slice.x().end() - pool_pad_x) * pool_stride_x,
+                                                     pool_stride_x * _num_elems_processed_per_iteration));
+        in_slice.set(Window::DimY, Window::Dimension(in_slice.y().start() - pool_pad_y,
+                                                     (in_slice.y().end() - pool_pad_y) * pool_stride_y,
+                                                     pool_stride_y));

         // Set inputs
         unsigned int idx = 0;
diff --git a/src/core/NEON/kernels/NEPoolingLayerKernel.cpp b/src/core/NEON/kernels/NEPoolingLayerKernel.cpp
index ac183d2f30..ff4802c5e0 100644
--- a/src/core/NEON/kernels/NEPoolingLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEPoolingLayerKernel.cpp
@@ -317,7 +317,11 @@ std::pair validate_and_configure_window(ITensorInfo *input, ITen
             break;
     }

-    const int upper_bound_w = ((pooled_w - 1) * pool_stride_x - pool_pad_x + num_elems_read_per_iteration) - input_width;
+    // Number of iterations in X dimension
+    const int num_iterations_x = (pooled_w + num_elems_processed_per_iteration - 1) / num_elems_processed_per_iteration;
+
+    // Upper limit for the number of right/bottom border elements that are accessed
+    const int upper_bound_w = ((num_iterations_x - 1) * num_elems_processed_per_iteration * pool_stride_x - pool_pad_x + num_elems_read_per_iteration) - input_width;
     const int upper_bound_h = ((pooled_h - 1) * pool_stride_y - pool_pad_y + pool_size) - input_height;

     border_size = BorderSize(pool_pad_y, pool_pad_x);
@@ -363,32 +367,25 @@ void NEPoolingLayerKernel::configure(const ITensor *input, ITensor *output, cons
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);

-    int           pool_pad_x    = 0;
-    int           pool_pad_y    = 0;
-    int           pool_stride_x = 0;
-    int           pool_stride_y = 0;
-    unsigned int  pooled_w      = 0;
-    unsigned int  pooled_h      = 0;
-    PoolingType   pool_type     = pool_info.pool_type();
-    int           pool_size     = pool_info.pool_size();
+    const PoolingType   pool_type         = pool_info.pool_type();
     const PadStrideInfo pad_stride_info   = pool_info.pad_stride_info();
     const bool          exclude_padding   = pool_info.exclude_padding();
     const bool          is_global_pooling = pool_info.is_global_pooling();
-    std::tie(pool_pad_x, pool_pad_y)       = pad_stride_info.pad();
-    std::tie(pool_stride_x, pool_stride_y) = pad_stride_info.stride();
+    const int           pool_stride_x     = pad_stride_info.stride().first;

     // Update pool size in case of global pooling
-    pool_size = is_global_pooling ? input->info()->dimension(0) : pool_size;
+    const int pool_size = is_global_pooling ? input->info()->dimension(0) : pool_info.pool_size();

     // Validate pool info before calling scaled_dimensions
     ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_pool_info(input->info(), pool_info, pool_size));

     // Check output dimensions
+    unsigned int pooled_w, pooled_h;
     std::tie(pooled_w, pooled_h) = scaled_dimensions(input->info()->dimension(0), input->info()->dimension(1), pool_size, pool_size,
-                                                     pool_info.pad_stride_info());
+                                                     pad_stride_info);

     // Output auto initialization if not yet initialized
     auto_init(input->info(), output->info(), pooled_w, pooled_h);
diff --git a/tests/framework/datasets/JoinDataset.h b/tests/framework/datasets/JoinDataset.h
index eded6e0259..d682c19d6b 100644
--- a/tests/framework/datasets/JoinDataset.h
+++ b/tests/framework/datasets/JoinDataset.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -47,8 +47,10 @@
 template <typename T, typename U>
 class JoinDataset : public Dataset
 {
 private:
-    using iter1_type = typename T::iterator;
-    using iter2_type = typename U::iterator;
+    using T_noref    = typename std::remove_reference<T>::type;
+    using U_noref    = typename std::remove_reference<U>::type;
+    using iter1_type = typename T_noref::iterator;
+    using iter2_type = typename U_noref::iterator;

 public:
     /** Construct dataset from the given datasets.
@@ -65,12 +67,12 @@ public:
     JoinDataset(JoinDataset &&) = default;

     /** Type of the dataset. */
-    using type = typename T::type;
+    using type = typename T_noref::type;

     /** Iterator for the dataset. */
     struct iterator
     {
-        iterator(const T *dataset1, const U *dataset2)
+        iterator(const T_noref *dataset1, const U_noref *dataset2)
             : _iter1{ dataset1->begin() }, _iter2{ dataset2->begin() }, _first_size{ dataset1->size() }
         {
         }
diff --git a/tests/validation/CL/PoolingLayer.cpp b/tests/validation/CL/PoolingLayer.cpp
index ee639376c5..4e5e5aa2e7 100644
--- a/tests/validation/CL/PoolingLayer.cpp
+++ b/tests/validation/CL/PoolingLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -43,6 +43,12 @@ namespace validation
 {
 namespace
 {
+/** Failing data set */
+const auto PoolingLayerDatasetSpecial = ((((framework::dataset::make("Shape", TensorShape{ 60U, 52U, 3U, 5U })
+                                            * framework::dataset::make("PoolType", PoolingType::AVG))
+                                           * framework::dataset::make("PoolingSize", 100))
+                                          * framework::dataset::make("PadStride", PadStrideInfo(5, 5, 50, 50)))
+                                         * framework::dataset::make("ExcludePadding", true));
 /** Input data set for floating-point data types */
 const auto PoolingLayerDatasetFP = combine(combine(combine(datasets::PoolingTypes(), framework::dataset::make("PoolingSize", { 2, 3, 4, 7, 9 })),
                                            framework::dataset::make("PadStride", { PadStrideInfo(1, 1, 0, 0), PadStrideInfo(2, 1, 0, 0), PadStrideInfo(1, 2, 1, 1), PadStrideInfo(2, 2, 1, 0) })),
@@ -74,7 +80,7 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(
               framework::dataset::make("InputInfo", { TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::F32, 0),     // Mismatching data type
                                                       TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::F32, 0),     // Window shrink
                                                       TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::QS8, 4),     // Mismatching fixed point position
-                                                      TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::QS16, 11),   // Window shrink
+                                                      TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::QS16, 11),
                                                       TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::F32, 0),     // Invalid pad/size combination
                                                       TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::F32, 0),     // Invalid pad/size combination
                                                       TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::QASYMM8, 0), // Invalid parameters
@@ -104,7 +110,7 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(
                                                       PoolingLayerInfo(PoolingType::MAX),
                                                       PoolingLayerInfo(PoolingType::AVG),
                                                      })),
-               framework::dataset::make("Expected", { false, false, false, false, false, false, false, false, false, true })),
+               framework::dataset::make("Expected", { false, false, false, true, false, false, false, false, false, true })),
               input_info, output_info, pool_info, expected)
 {
     ARM_COMPUTE_EXPECT(bool(CLPoolingLayer::validate(&input_info.clone()->set_is_resizable(false), &output_info.clone()->set_is_resizable(false), pool_info)) == expected, framework::LogLevel::ERRORS);
 }
@@ -117,6 +123,11 @@ using CLPoolingLayerFixture = PoolingLayerValidationFixture
+FIXTURE_DATA_TEST_CASE(RunSpecial, CLPoolingLayerFixture<float>, framework::DatasetMode::ALL, PoolingLayerDatasetSpecial * framework::dataset::make("DataType", DataType::F32))
+{
+    // Validate output
+    validate(CLAccessor(_target), _reference, tolerance_f32);
+}
 FIXTURE_DATA_TEST_CASE(RunSmall, CLPoolingLayerFixture<float>, framework::DatasetMode::ALL, combine(datasets::SmallShapes(), combine(PoolingLayerDatasetFP, framework::dataset::make("DataType", DataType::F32))))
 {
-- 
cgit v1.2.1
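
Note on the border computation above: both kernels now derive the right border from the number of vectorized steps that will actually run, ceil(pooled_w / num_elems_processed_per_iteration), instead of from the index of the last output element, and the per-step read width becomes (num_elems_processed_per_iteration - 1) * stride + pool_size. The sketch below restates that arithmetic outside the library; the Border struct, compute_border() and the example values are illustrative assumptions, and the 13x11 output size assumes the usual floor((in + 2*pad - pool)/stride) + 1 pooling formula.

    #include <algorithm>
    #include <cstdio>

    // Hypothetical, standalone restatement of the border arithmetic from the
    // validate_and_configure_window() hunks above; struct and function names
    // are illustrative and not part of the library.
    struct Border
    {
        int right;
        int bottom;
    };

    Border compute_border(int input_w, int input_h, int pooled_w, int pooled_h,
                          int pool_size, int stride_x, int stride_y,
                          int pad_x, int pad_y, int elems_per_iteration)
    {
        // Elements read by one vectorized step along X
        const int elems_read = (elems_per_iteration - 1) * stride_x + pool_size;

        // The kernel executes ceil(pooled_w / elems_per_iteration) steps along X,
        // so the rightmost access is governed by the last step, not by the index
        // of the last output element.
        const int num_iterations_x = (pooled_w + elems_per_iteration - 1) / elems_per_iteration;

        const int upper_bound_w = ((num_iterations_x - 1) * elems_per_iteration * stride_x - pad_x + elems_read) - input_w;
        const int upper_bound_h = ((pooled_h - 1) * stride_y - pad_y + pool_size) - input_h;

        return { std::max(upper_bound_w, pad_x), std::max(upper_bound_h, pad_y) };
    }

    int main()
    {
        // Shape from the new RunSpecial dataset: a 60x52 plane pooled with a
        // 100x100 window, stride 5 and padding 50; with floor rounding that
        // gives a 13x11 output.
        const Border b = compute_border(60, 52, 13, 11, 100, 5, 5, 50, 50, 1);
        std::printf("right border = %d, bottom border = %d\n", b.right, b.bottom);
        return 0;
    }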
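
Note on the run() change in CLPoolingLayerKernel.cpp: the end of the input slice is now computed as (end - pool_pad_x) * pool_stride_x, i.e. the padding offset is removed before scaling by the stride, while the start is only translated by the padding. The sketch below simply restates those start/end/step expressions on plain integers; Range and map_output_to_input() are hypothetical names, not the library's Window API.

    #include <cstdio>

    // Illustrative restatement of the corrected input-slice expressions in
    // CLPoolingLayerKernel::run(); not the library's Window API.
    struct Range
    {
        int start;
        int end;
        int step;
    };

    Range map_output_to_input(int out_start, int out_end, int pad, int stride, int elems_per_iteration)
    {
        return {
            out_start - pad,             // start: translate by the padding
            (out_end - pad) * stride,    // end: remove the pad offset before scaling by the stride
            stride * elems_per_iteration // step: one stride per element produced in an iteration
        };
    }

    int main()
    {
        // Example with the RunSpecial numbers: 13 output elements, pad 50, stride 5.
        const Range x = map_output_to_input(0, 13, 50, 5, 1);
        std::printf("start = %d, end = %d, step = %d\n", x.start, x.end, x.step);
        return 0;
    }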
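
Note on the JoinDataset.h change: when joined datasets are passed around as lvalues, T and U can be deduced as reference types, and typename T::iterator is then ill-formed because a nested type cannot be named through a reference type; the same applies to the const T * constructor parameters. Applying std::remove_reference first, as the patch does, makes those declarations well-formed again. A minimal sketch of the failure mode and the fix, independent of the test framework (Holder is a hypothetical stand-in, not JoinDataset):

    #include <type_traits>
    #include <vector>

    // Made-up stand-in for the JoinDataset situation: the template parameter
    // may be deduced as a reference type, in which case "typename T::iterator"
    // would be ill-formed. Stripping the reference first restores access to
    // the nested typedef.
    template <typename T>
    struct Holder
    {
        using T_noref = typename std::remove_reference<T>::type;
        using iter    = typename T_noref::iterator;
    };

    int main()
    {
        // Instantiated with a reference type, as can happen when lvalue
        // datasets are forwarded through a joining helper (illustrative only).
        static_assert(std::is_same<Holder<std::vector<int> &>::iter, std::vector<int>::iterator>::value,
                      "iterator type is recoverable after stripping the reference");
        return 0;
    }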