COMPMID-1910: Improve CLTuner reducing the number of LWS to test

Change-Id: I842120a2bcddc5bf8677ee4d0b1f9d379771b36b Signed-off-by: Gian Marco Iodice <gianmarco.iodice@arm.com> Reviewed-on: https://review.mlplatform.org/622 Reviewed-by: Georgios Pinitas <georgios.pinitas@arm.com> Tested-by: Arm Jenkins <bsgcomp@arm.com>
author: Gian Marco Iodice <gianmarco.iodice@arm.com> 2019-01-31 17:06:54 +0000
committer: Gian Marco Iodice <gianmarco.iodice@arm.com> 2019-02-04 16:15:19 +0000
commit: a74923ce6f4077ab2aef3651818c45f73fef97fd (patch)
tree: 4ddea5a721c4b0e48a4b0dbb4b6cea69663ed7c4
parent: 3dbfd23d68b9450fc3e8bf97d92bc211e78e8979 (diff)
download: ComputeLibrary-a74923ce6f4077ab2aef3651818c45f73fef97fd.tar.gz
2 files changed, 52 insertions, 31 deletions
diff --git a/examples/graph_shufflenet.cpp b/examples/graph_shufflenet.cpp
index 39c2227022..e6016f0451 100644
--- a/examples/graph_shufflenet.cpp
+++ b/examples/graph_shufflenet.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -57,7 +57,7 @@ public:
         // Set default layout if needed (Single kernel grouped convolution not yet supported int NHWC)
         if(!common_opts.data_layout->is_set())
         {
-            common_params.data_layout = DataLayout::NCHW;
+            common_params.data_layout = DataLayout::NHWC;
         }
 
         // Checks
diff --git a/src/runtime/CL/CLTuner.cpp b/src/runtime/CL/CLTuner.cpp
index 5f82cd3fbe..c09914cea2 100644
--- a/src/runtime/CL/CLTuner.cpp
+++ b/src/runtime/CL/CLTuner.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -33,7 +33,37 @@
 #include <limits>
 #include <string>
 
-using namespace arm_compute;
+namespace arm_compute
+{
+namespace
+{
+/** Utility function used to initialize the LWS values to test.
+ *  Only the LWS values which are power of 2 or satisfy the modulo conditions with GWS are taken into account by the CLTuner
+ *
+ * @param[in, out] lws         Vector of LWS to test for a specific dimension
+ * @param[in]      gws         Size of the GWS
+ * @param[in]      lws_max     Max LKWS value allowed to be tested
+ * @param[in]      mod_let_one True if the results of the modulo operation between gws and the lws can be less than one.
+ */
+void initialize_lws_values(std::vector<unsigned int> &lws, unsigned int gws, unsigned int lws_max, bool mod_let_one)
+{
+    lws.push_back(1);
+
+    for(unsigned int i = 2; i <= lws_max; ++i)
+    {
+        // Power of two condition
+        const bool is_power_of_two = (i & (i - 1)) == 0;
+
+        // Condition for the module accordingly with the mod_let_one flag
+        const bool mod_cond = mod_let_one ? (gws % i) <= 1 : (gws % i) == 0;
+
+        if(mod_cond || is_power_of_two)
+        {
+            lws.push_back(i);
+        }
+    }
+}
+} // namespace
 
 CLTuner::CLTuner(bool tune_new_kernels)
     : real_clEnqueueNDRangeKernel(nullptr), _lws_table(), _queue(), _queue_profiler(), _kernel_event(), _tune_new_kernels(tune_new_kernels)
@@ -145,43 +175,33 @@ cl::NDRange CLTuner::find_optimal_lws(ICLKernel &kernel)
 
     cl_ulong min_exec_time = std::numeric_limits<cl_ulong>::max();
 
+    cl::NDRange gws     = ICLKernel::gws_from_window(kernel.window());
     cl::NDRange opt_lws = cl::NullRange;
 
-    const int x_step = std::max(1, kernel.window().x().step());
-    const int y_step = std::max(1, kernel.window().y().step());
-    const int z_step = std::max(1, kernel.window().z().step());
-    const int x_end  = kernel.window().x().end() - kernel.window().x().start() / x_step > 1 ? 16 : 1;
-    const int y_end  = kernel.window().y().end() - kernel.window().y().start() / y_step > 1 ? 16 : 1;
-    const int z_end  = kernel.window().z().end() - kernel.window().z().start() / z_step > 1 ? 8 : 1;
-
-    // First run using the default LWS
-    {
-        cl::NDRange lws_test = cl::NullRange;
-
-        kernel.set_lws_hint(lws_test);
-
-        // Run the kernel
-        kernel.run(kernel.window(), _queue_profiler);
+    const unsigned int lws_x_max = std::min(static_cast<unsigned int>(gws[0]), 64u);
+    const unsigned int lws_y_max = std::min(static_cast<unsigned int>(gws[1]), 32u);
+    const unsigned int lws_z_max = std::min(static_cast<unsigned int>(gws[2]), 32u);
 
-        _queue_profiler.finish();
+    std::vector<unsigned int> lws_x;
+    std::vector<unsigned int> lws_y;
+    std::vector<unsigned int> lws_z;
 
-        const cl_ulong start = _kernel_event.getProfilingInfo<CL_PROFILING_COMMAND_START>();
-        const cl_ulong end   = _kernel_event.getProfilingInfo<CL_PROFILING_COMMAND_END>();
-        const cl_ulong diff  = end - start;
-        _kernel_event        = nullptr;
+    // Initialize the LWS values to test
+    initialize_lws_values(lws_x, gws[0], lws_x_max, gws[2] > 16);
+    initialize_lws_values(lws_y, gws[1], lws_y_max, gws[2] > 16);
+    initialize_lws_values(lws_z, gws[2], lws_z_max, false);
 
-        min_exec_time = diff;
-    }
-
-    for(int z = 1; z <= z_end; ++z)
+    for(const auto &z : lws_z)
     {
-        for(int y = 1; y <= y_end; ++y)
+        for(const auto &y : lws_y)
         {
-            for(int x = 1; x <= x_end; ++x)
+            for(const auto &x : lws_x)
             {
                 cl::NDRange lws_test = cl::NDRange(x, y, z);
 
-                const bool invalid_lws = (x * y * z > static_cast<int>(kernel.get_max_workgroup_size())) || (x == 1 && y == 1 && z == 1);
+                bool invalid_lws = (x * y * z > kernel.get_max_workgroup_size()) || (x == 1 && y == 1 && z == 1);
+
+                invalid_lws = invalid_lws || (x > gws[0]) || (y > gws[1]) || (z > gws[2]);
 
                 if(invalid_lws)
                 {
@@ -278,3 +298,4 @@ void CLTuner::save_to_file(const std::string &filename) const
     }
     fs.close();
 }
+} // namespace arm_compute
+\ No newline at end of file
author	Gian Marco Iodice <gianmarco.iodice@arm.com>	2019-01-31 17:06:54 +0000
committer	Gian Marco Iodice <gianmarco.iodice@arm.com>	2019-02-04 16:15:19 +0000
commit	a74923ce6f4077ab2aef3651818c45f73fef97fd (patch)
tree	4ddea5a721c4b0e48a4b0dbb4b6cea69663ed7c4
parent	3dbfd23d68b9450fc3e8bf97d92bc211e78e8979 (diff)
download	ComputeLibrary-a74923ce6f4077ab2aef3651818c45f73fef97fd.tar.gz