From 5e281814c5110724d99fe8ee64bdf42ef2c31bce Mon Sep 17 00:00:00 2001
From: Gian Marco Iodice <gianmarco.iodice@arm.com>
Date: Tue, 6 Jul 2021 13:19:41 +0100
Subject: Fix manual LOOP_UNROLLING

The issue is caused by the number of iterations passed to
LOOP_UNROLLING. When we use the manual LOOP_UNROLLING, the number of
iterations must be less than or equal to 128.
To overcome this problem, we create a utility function to check if
any of the critical iterations (kernel dimensions) are beyond that
limit. If so, the utility function disables the manual loop unrolling.

Resolves COMPMID-4609

Change-Id: I7221c967609e462a5abd1cbb74e2a120f344fcb3
Signed-off-by: Gian Marco Iodice <gianmarco.iodice@arm.com>
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/5913
Reviewed-by: Georgios Pinitas <georgios.pinitas@arm.com>
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
---
 arm_compute/core/CL/CLHelpers.h                    | 13 +++++++++++
 src/core/CL/CLHelpers.cpp                          | 12 ++++++++++
 .../CLDepthwiseConvolutionLayerNativeKernel.cpp    | 11 +++++++--
 tests/datasets/DepthwiseConvolutionLayerDataset.h  | 10 +++++++++
 tests/validation/CL/DepthwiseConvolutionLayer.cpp  | 13 +++++++++++
 .../fixtures/DepthwiseConvolutionLayerFixture.h    | 26 ++++++++++++----------
 6 files changed, 71 insertions(+), 14 deletions(-)
diff --git a/arm_compute/core/CL/CLHelpers.h b/arm_compute/core/CL/CLHelpers.h
index 180211c558..ce7e675b56 100644
--- a/arm_compute/core/CL/CLHelpers.h
+++ b/arm_compute/core/CL/CLHelpers.h
@@ -46,6 +46,9 @@ enum class DataType;
 /** Max vector width of an OpenCL vector */
 static constexpr unsigned int max_cl_vector_width = 16;
 
+/** Max number of iterations supported by the manual loop unrolling */
+static constexpr int max_manual_loop_unrolling = 128;
+
 /** Translates a tensor data type to the appropriate OpenCL type.
  *
  * @param[in] dt @ref DataType to be translated to OpenCL type.
@@ -244,5 +247,15 @@ void set_wbsm(cl::Kernel &kernel, cl_int wbsm_hint);
  */
 bool export_weights_to_cl_image(const ITensorInfo *tensor);
 
+/** Helper function to force unroll with pragma when any of the input values (iterations) is greater than @ref max_manual_loop_unrolling
+ *
+ * This function adds -DUNROLL_WITH_PRAGMA to the build options when any of the input values is greater than @ref max_manual_loop_unrolling
+ *
+ * @param[in, out] built_opts OpenCL kernel build options
+ * @param[in]      values     Input values (iterations)
+ *
+ */
+void set_unroll_with_pragma(CLBuildOptions &built_opts, std::initializer_list<int> values);
+
 } // namespace arm_compute
 #endif /* ARM_COMPUTE_CLHELPERS_H */
diff --git a/src/core/CL/CLHelpers.cpp b/src/core/CL/CLHelpers.cpp
index 3323929742..5c53455eeb 100644
--- a/src/core/CL/CLHelpers.cpp
+++ b/src/core/CL/CLHelpers.cpp
@@ -466,4 +466,16 @@ bool export_weights_to_cl_image(const ITensorInfo *tensor)
     return true;
 }
 
+void set_unroll_with_pragma(CLBuildOptions &built_opts, std::initializer_list<int> values)
+{
+    for(const int value : values) // Inspect each iteration count supplied by the caller
+    {
+        if(value > max_manual_loop_unrolling) // Too many iterations for the manual LOOP_UNROLLING
+        {
+            built_opts.add_option("-DUNROLL_WITH_PRAGMA");
+            return; // One value above the limit is enough, so the option is added at most once per call
+        }
+    }
+}
+
 } // namespace arm_compute
diff --git a/src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.cpp b/src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.cpp
index 4bde303f1e..1437b5bebb 100644
--- a/src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.cpp
+++ b/src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.cpp
@@ -83,7 +83,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *weights,
     }
 
     const ConvolutionInfo info{ conv_info.pad_stride_info, conv_info.depth_multiplier, ActivationLayerInfo(), conv_info.dilation };
-    const TensorShape output_shape = arm_compute::misc::shape_calculator::compute_depthwise_convolution_shape(*input, *weights, conv_info);
+    const TensorShape     output_shape = arm_compute::misc::shape_calculator::compute_depthwise_convolution_shape(*input, *weights, conv_info);
 
     const bool is_quantized = is_data_type_quantized(input->data_type());
 
@@ -237,9 +237,16 @@ void CLDepthwiseConvolutionLayerNativeKernel::configure(const CLCompileContext &
     build_opts.add_option("-DDILATION_Y=" + support::cpp11::to_string(conv_info.dilation.y()));
     build_opts.add_option("-DN0=" + support::cpp11::to_string(n0));
     build_opts.add_option("-DM0=" + support::cpp11::to_string(m0));
-    build_opts.add_option("-DM0_A=" + support::cpp11::to_string(weights->info()->dimension(1) + m0 - 1));
+    build_opts.add_option("-DM0_A=" + support::cpp11::to_string(_weights->info()->dimension(1) + m0 - 1));
     build_opts.add_option("-DPARTIAL_N0=" + support::cpp11::to_string(_input->info()->dimension(0) % n0));
     build_opts.add_option_if(_input->info()->num_dimensions() > 3, "-DBATCHED_EXECUTION");
+
+    // Force unroll with pragma when any of the following values exceeds the maximum number of iterations supported by manual unrolling
+    set_unroll_with_pragma(build_opts, { static_cast<int>(_weights->info()->dimension(1) + m0 - 1),
+                                         static_cast<int>(_weights->info()->dimension(1)),
+                                         static_cast<int>(_weights->info()->dimension(2))
+                                       });
+
     if(biases != nullptr)
     {
         build_opts.add_option(std::string("-DHAS_BIAS"));
diff --git a/tests/datasets/DepthwiseConvolutionLayerDataset.h b/tests/datasets/DepthwiseConvolutionLayerDataset.h
index 3b17910eac..82ea40ff52 100644
--- a/tests/datasets/DepthwiseConvolutionLayerDataset.h
+++ b/tests/datasets/DepthwiseConvolutionLayerDataset.h
@@ -155,6 +155,16 @@ public:
     }
 };
 
+/** Dataset containing a large kernel size for the generic depthwise convolution (exercises the UNROLL_WITH_PRAGMA path). */
+class LargeKernelSizeDepthwiseConvolutionLayerNHWCDataset final : public DepthwiseConvolutionLayerDataset
+{
+public:
+    LargeKernelSizeDepthwiseConvolutionLayerNHWCDataset()
+    {
+        add_config(TensorShape(6U, 210U, 8U), Size2D(4U, 194U), PadStrideInfo(1, 1, 0, 0)); // 194 is above the manual unrolling limit of 128; presumably the kernel height - confirm Size2D argument order
+    }
+};
+
 /** Dataset containing small, 3x3 depthwise convolution shapes. */
 class SmallDepthwiseConvolutionLayerDataset3x3 final : public DepthwiseConvolutionLayerDataset
 {
diff --git a/tests/validation/CL/DepthwiseConvolutionLayer.cpp b/tests/validation/CL/DepthwiseConvolutionLayer.cpp
index 79a2678b44..b2cff2b792 100644
--- a/tests/validation/CL/DepthwiseConvolutionLayer.cpp
+++ b/tests/validation/CL/DepthwiseConvolutionLayer.cpp
@@ -382,6 +382,7 @@ FIXTURE_DATA_TEST_CASE_NEW(RunLarge, CLDepthwiseConvolutionLayerFixture<float>,
 {
     validate(CLAccessor(_target), _reference, tolerance_f32);
 }
+
 TEST_SUITE(Dilation)
 
 FIXTURE_DATA_TEST_CASE_NEW(RunSmall, CLDepthwiseConvolutionLayerFixture<float>, framework::DatasetMode::ALL,
@@ -418,6 +419,7 @@ FIXTURE_DATA_TEST_CASE_NEW(RunSmall, CLDepthwiseConvolutionLayerFixture<float>,
 {
     validate(CLAccessor(_target), _reference, tolerance_f32);
 }
+
 FIXTURE_DATA_TEST_CASE_NEW(RunLarge, CLDepthwiseConvolutionLayerFixture<float>, framework::DatasetMode::NIGHTLY, combine(combine(combine(combine(datasets::LargeDepthwiseConvolutionLayerDataset(),
                            large_depth_multipliers),
                            framework::dataset::make("DataType",
@@ -428,6 +430,17 @@ FIXTURE_DATA_TEST_CASE_NEW(RunLarge, CLDepthwiseConvolutionLayerFixture<float>,
     validate(CLAccessor(_target), _reference, tolerance_f32);
 }
 
+FIXTURE_DATA_TEST_CASE_NEW(RunLargeKernelSize, CLDepthwiseConvolutionLayerFixture<float>, framework::DatasetMode::ALL,
+                           combine(combine(combine(combine(datasets::LargeKernelSizeDepthwiseConvolutionLayerNHWCDataset(),
+                                                           framework::dataset::make("DepthMultiplier", { 1 })),
+                                                   framework::dataset::make("DataType",
+                                                                            DataType::F32)),
+                                           framework::dataset::make("DataLayout", { DataLayout::NHWC })),
+                                   ActivationFunctionsDataset))
+{
+    validate(CLAccessor(_target), _reference, tolerance_f32); // The target output must match the reference within tolerance_f32
+}
+
 TEST_SUITE(Dilation)
 FIXTURE_DATA_TEST_CASE_NEW(RunSmall, CLDepthwiseConvolutionLayerFixture<float>, framework::DatasetMode::ALL, combine(combine(combine(combine(datasets::SmallDepthwiseDilatedConvolutionLayerDataset(),
                                                                                                                      depth_multipliers),
diff --git a/tests/validation/fixtures/DepthwiseConvolutionLayerFixture.h b/tests/validation/fixtures/DepthwiseConvolutionLayerFixture.h
index 0e02ae28ca..ddbab7fe13 100644
--- a/tests/validation/fixtures/DepthwiseConvolutionLayerFixture.h
+++ b/tests/validation/fixtures/DepthwiseConvolutionLayerFixture.h
@@ -113,6 +113,13 @@ public:
             target_to_use = &_target;
         }
 
+        add_padding_x({ &_src, &_biases }, _data_layout);
+        add_padding_x({ &_weights }, _data_layout, true);
+        if(!_in_place)
+        {
+            add_padding_x({ &_target }, _data_layout);
+        }
+
         // Create Depthwise Convolution configure function
         _dwc.configure(&_src, &_weights, &_biases, target_to_use, _pad_stride_info, _depth_multiplier, _act_info, _dilation);
 
@@ -124,12 +131,6 @@ public:
 
     void allocate_and_run_target()
     {
-        add_padding_x({ &_src, &_weights, &_biases }, _data_layout);
-        if(!_in_place)
-        {
-            add_padding_x({ &_target }, _data_layout);
-        }
-
         // Allocate tensors
         _src.allocator()->allocate();
         _weights.allocator()->allocate();
@@ -317,6 +318,10 @@ public:
         _biases  = create_tensor<TensorType>(_biases_shape, _data_type, 1, QuantizationInfo(), _data_layout);
         _target  = create_tensor<TensorType>(TensorShape(), _data_type, 1, QuantizationInfo(), _data_layout);
 
+        add_padding_x({ &_src, &_biases, &_target }, _data_layout);
+        add_padding_x({ &_weights }, _data_layout, true);
+        add_padding_y({ &_src, &_target }, _data_layout);
+
         // Create Depthwise Convolution configure function
         const ConvolutionInfo info
         {
@@ -332,9 +337,6 @@ public:
 
     void allocate_and_run_target()
     {
-        add_padding_x({ &_src, &_weights, &_biases, &_target }, _data_layout);
-        add_padding_y({ &_src, &_target }, _data_layout);
-
         // Allocate tensors
         _src.allocator()->allocate();
         _weights.allocator()->allocate();
@@ -482,6 +484,9 @@ public:
             _conv_info, _depth_multiplier, _act_info, _dilation
         };
 
+        add_padding_x({ &_src, &_biases, &_target }, _data_layout);
+        add_padding_x({ &_weights }, _data_layout, _export_to_cl_image); // Don't add left padding if cl image will be used
+
         // Create Depthwise Convolution configure function
         _dwc.configure(&_src, &_weights, &_biases, target_to_use, dwc_info, conv_kernel_info);
 
@@ -493,9 +498,6 @@ public:
 
     void allocate_and_run_target()
     {
-        add_padding_x({ &_src, &_biases, &_target }, _data_layout);
-        add_padding_x({ &_weights }, _data_layout, _export_to_cl_image); // Don't add left padding if cl image will be used
-
         // Allocate tensors
         _src.allocator()->allocate();
         _weights.allocator()->allocate();
-- 
cgit v1.2.1