1 files changed, 48 insertions, 74 deletions
diff --git a/src/runtime/CL/CLTuner.cpp b/src/runtime/CL/CLTuner.cpp
index a262d6b95c..8f8d3e7c3a 100644
--- a/src/runtime/CL/CLTuner.cpp
+++ b/src/runtime/CL/CLTuner.cpp
@@ -22,6 +22,7 @@
  * SOFTWARE.
  */
 #include "arm_compute/runtime/CL/CLTuner.h"
+#include "arm_compute/runtime/CL/tuners/CLLWSList.h"
 
 #include "arm_compute/core/CL/ICLKernel.h"
 #include "arm_compute/core/Error.h"
@@ -31,42 +32,13 @@
 #include <fstream>
 #include <iostream>
 #include <limits>
+#include <memory>
 #include <string>
 
 namespace arm_compute
 {
-namespace
-{
-/** Utility function used to initialize the LWS values to test.
- *  Only the LWS values which are power of 2 or satisfy the modulo conditions with GWS are taken into account by the CLTuner
- *
- * @param[in, out] lws         Vector of LWS to test for a specific dimension
- * @param[in]      gws         Size of the GWS
- * @param[in]      lws_max     Max LKWS value allowed to be tested
- * @param[in]      mod_let_one True if the results of the modulo operation between gws and the lws can be less than one.
- */
-void initialize_lws_values(std::vector<unsigned int> &lws, unsigned int gws, unsigned int lws_max, bool mod_let_one)
-{
-    lws.push_back(1);
-
-    for(unsigned int i = 2; i <= lws_max; ++i)
-    {
-        // Power of two condition
-        const bool is_power_of_two = (i & (i - 1)) == 0;
-
-        // Condition for the module accordingly with the mod_let_one flag
-        const bool mod_cond = mod_let_one ? (gws % i) <= 1 : (gws % i) == 0;
-
-        if(mod_cond || is_power_of_two)
-        {
-            lws.push_back(i);
-        }
-    }
-}
-} // namespace
-
 CLTuner::CLTuner(bool tune_new_kernels)
-    : real_clEnqueueNDRangeKernel(nullptr), _lws_table(), _kernel_event(), _tune_new_kernels(tune_new_kernels)
+    : real_clEnqueueNDRangeKernel(nullptr), _lws_table(), _kernel_event(), _tune_new_kernels(tune_new_kernels), _tuner_mode(CLTunerMode::EXHAUSTIVE)
 {
 }
 
@@ -88,6 +60,15 @@ bool CLTuner::tune_new_kernels() const
     return _tune_new_kernels;
 }
 
+void CLTuner::set_tuner_mode(CLTunerMode mode)
+{
+    _tuner_mode = mode;
+}
+CLTunerMode CLTuner::get_tuner_mode() const
+{
+    return _tuner_mode;
+}
+
 void CLTuner::tune_kernel_static(ICLKernel &kernel)
 {
     ARM_COMPUTE_UNUSED(kernel);
@@ -182,61 +163,54 @@ cl::NDRange CLTuner::find_optimal_lws(ICLKernel &kernel)
     };
     CLSymbols::get().clEnqueueNDRangeKernel_ptr = interceptor;
 
-    cl_ulong min_exec_time = std::numeric_limits<cl_ulong>::max();
+    cl::NDRange gws = ICLKernel::gws_from_window(kernel.window());
 
-    cl::NDRange gws     = ICLKernel::gws_from_window(kernel.window());
-    cl::NDRange opt_lws = cl::NullRange;
+    // Run the kernel once with its default LWS; its execution time is the baseline that tested LWS values must beat
+    kernel.run(kernel.window(), queue_profiler);
 
-    const unsigned int lws_x_max = std::min(static_cast<unsigned int>(gws[0]), 64u);
-    const unsigned int lws_y_max = std::min(static_cast<unsigned int>(gws[1]), 32u);
-    const unsigned int lws_z_max = std::min(static_cast<unsigned int>(gws[2]), 32u);
+    queue_profiler.finish();
 
-    std::vector<unsigned int> lws_x;
-    std::vector<unsigned int> lws_y;
-    std::vector<unsigned int> lws_z;
+    const cl_ulong start         = _kernel_event.getProfilingInfo<CL_PROFILING_COMMAND_START>();
+    const cl_ulong end           = _kernel_event.getProfilingInfo<CL_PROFILING_COMMAND_END>();
+    cl_ulong       min_exec_time = end - start;
+    _kernel_event                = nullptr;
 
-    // Initialize the LWS values to test
-    initialize_lws_values(lws_x, gws[0], lws_x_max, gws[2] > 16);
-    initialize_lws_values(lws_y, gws[1], lws_y_max, gws[2] > 16);
-    initialize_lws_values(lws_z, gws[2], lws_z_max, false);
+    cl::NDRange opt_lws = cl::NullRange;
 
-    for(const auto &z : lws_z)
+    // Construct the list of LWS values to be tested based on the tuner mode.
+    auto lws_list = cl_tuner::CLLWSListFactory::get_lws_list(_tuner_mode, gws);
+    for(size_t i = 0; i < lws_list->size(); ++i)
     {
-        for(const auto &y : lws_y)
-        {
-            for(const auto &x : lws_x)
-            {
-                cl::NDRange lws_test = cl::NDRange(x, y, z);
-
-                bool invalid_lws = (x * y * z > kernel.get_max_workgroup_size()) || (x == 1 && y == 1 && z == 1);
+        cl::NDRange lws_test    = (*lws_list)[i];
+        auto        x           = lws_test[0];
+        auto        y           = lws_test[1];
+        auto        z           = lws_test[2];
+        bool        invalid_lws = (x * y * z > kernel.get_max_workgroup_size()) || (x == 1 && y == 1 && z == 1);
 
-                invalid_lws = invalid_lws || (x > gws[0]) || (y > gws[1]) || (z > gws[2]);
+        invalid_lws = invalid_lws || (x > gws[0]) || (y > gws[1]) || (z > gws[2]);
 
-                if(invalid_lws)
-                {
-                    continue;
-                }
-
-                //Set the Local-Workgroup-Size
-                kernel.set_lws_hint(lws_test);
+        if(invalid_lws)
+        {
+            continue;
+        }
 
-                // Run the kernel
-                kernel.run(kernel.window(), queue_profiler);
+        // Set the local work-group size (LWS) hint to the candidate under test
+        kernel.set_lws_hint(lws_test);
 
-                queue_profiler.finish();
+        // Run the kernel
+        kernel.run(kernel.window(), queue_profiler);
 
-                const cl_ulong start = _kernel_event.getProfilingInfo<CL_PROFILING_COMMAND_START>();
-                const cl_ulong end   = _kernel_event.getProfilingInfo<CL_PROFILING_COMMAND_END>();
-                const cl_ulong diff  = end - start;
-                _kernel_event        = nullptr;
+        queue_profiler.finish();
 
-                // Check the execution time
-                if(diff < min_exec_time)
-                {
-                    min_exec_time = diff;
-                    opt_lws       = cl::NDRange(x, y, z);
-                }
-            }
+        const cl_ulong start = _kernel_event.getProfilingInfo<CL_PROFILING_COMMAND_START>();
+        const cl_ulong end   = _kernel_event.getProfilingInfo<CL_PROFILING_COMMAND_END>();
+        const cl_ulong diff  = end - start;
+        _kernel_event        = nullptr;
+        // Check the execution time
+        if(diff < min_exec_time)
+        {
+            min_exec_time = diff;
+            opt_lws       = cl::NDRange(x, y, z);
         }
     }