From 85e6f518ace17547d6f35ed0e1cfbc39ffb95736 Mon Sep 17 00:00:00 2001
From: Gian Marco <gianmarco.iodice@arm.com>
Date: Thu, 1 Feb 2018 16:57:48 +0000
Subject: COMPMID-891 - Use OpenCL timer in CLTuner

Change-Id: I84a914c13b162c4f74321c9cafc30a18ad4ebbdb
Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/118797
Tested-by: Jenkins <bsgcomp@arm.com>
Reviewed-by: Georgios Pinitas <georgios.pinitas@arm.com>
---
 arm_compute/core/CL/OpenCL.h                       |   5 +-
 arm_compute/runtime/CL/CLTuner.h                   |  42 +++++++-
 examples/cl_sgemm.cpp                              |  29 +-----
 src/core/CL/OpenCL.cpp                             |  62 +++++++++++-
 .../CLDepthwiseConvolutionLayer3x3Kernel.cpp       |  14 +++
 src/runtime/CL/CLTuner.cpp                         | 108 +++++++++++++++++----
 6 files changed, 213 insertions(+), 47 deletions(-)

diff --git a/arm_compute/core/CL/OpenCL.h b/arm_compute/core/CL/OpenCL.h
index 18ba2902bf..405d5cebd7 100644
--- a/arm_compute/core/CL/OpenCL.h
+++ b/arm_compute/core/CL/OpenCL.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -93,6 +93,9 @@ public:
     DECLARE_FUNCTION_PTR(clRetainEvent);
     DECLARE_FUNCTION_PTR(clGetPlatformIDs);
     DECLARE_FUNCTION_PTR(clGetKernelWorkGroupInfo);
+    DECLARE_FUNCTION_PTR(clGetCommandQueueInfo);
+    DECLARE_FUNCTION_PTR(clGetKernelInfo);
+    DECLARE_FUNCTION_PTR(clGetEventProfilingInfo);
 
 #undef DECLARE_FUNCTION_PTR
 
diff --git a/arm_compute/runtime/CL/CLTuner.h b/arm_compute/runtime/CL/CLTuner.h
index 8a7b96aa09..386994682d 100644
--- a/arm_compute/runtime/CL/CLTuner.h
+++ b/arm_compute/runtime/CL/CLTuner.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -58,6 +58,16 @@ public:
     // Inherited methods overridden:
     void tune_kernel(ICLKernel &kernel) override;
 
+    /** Set the OpenCL kernel event
+     *
+     * @note The interceptor can use this function to store the event associated to the OpenCL kernel
+     *
+     * @param[in] kernel_event The OpenCL kernel event
+     */
+    void set_cl_kernel_event(cl_event kernel_event);
+
+    std::function<decltype(clEnqueueNDRangeKernel)> real_function;
+
 private:
     /** Find optimal LWS using brute-force approach
      *
@@ -68,6 +78,36 @@ private:
     cl::NDRange find_optimal_lws(ICLKernel &kernel);
 
     std::unordered_map<std::string, cl::NDRange> _lws_table;
+    cl::CommandQueue _queue;
+    cl::CommandQueue _queue_profiler;
+    cl::Event        _kernel_event;
+};
+
+/* Function to be used to intercept kernel enqueues and store their OpenCL Event */
+class Interceptor
+{
+public:
+    explicit Interceptor(CLTuner &tuner);
+
+    /** clEnqueueNDRangeKernel interface
+     *
+     * @param[in] command_queue           A valid command-queue. The kernel will be queued for execution on the device associated with command_queue.
+     * @param[in] kernel                  A valid kernel object. The OpenCL context associated with kernel and command_queue must be the same.
+     * @param[in] work_dim                The number of dimensions used to specify the global work-items and work-items in the work-group. work_dim must be greater than zero and less than or equal to CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS.
+     * @param[in] gwo                     Global-Workgroup-Offset. It can be used to specify an array of work_dim unsigned values that describe the offset used to calculate the global ID of a work-item. If global_work_offset is NULL, the global IDs start at offset (0, 0, ... 0).
+     * @param[in] gws                     Global-Workgroup-Size. Points to an array of work_dim unsigned values that describe the number of global work-items in work_dim dimensions that will execute the kernel function.
+     * @param[in] lws                     Local-Workgroup-Size. Points to an array of work_dim unsigned values that describe the number of work-items that make up a work-group
+     * @param[in] num_events_in_wait_list Number of events in the waiting list
+     * @param[in] event_wait_list         Event waiting list
+     * @param[in] event                   OpenCL kernel event
+     *
+     * @return the OpenCL status
+     */
+    cl_int operator()(cl_command_queue command_queue, cl_kernel kernel, cl_uint work_dim, const size_t *gwo, const size_t *gws, const size_t *lws, cl_uint num_events_in_wait_list,
+                      const cl_event *event_wait_list, cl_event *event);
+
+private:
+    CLTuner &_tuner;
 };
 }
 #endif /*__ARM_COMPUTE_CLTUNER_H__ */
diff --git a/examples/cl_sgemm.cpp b/examples/cl_sgemm.cpp
index f2c63985f6..fa57885450 100644
--- a/examples/cl_sgemm.cpp
+++ b/examples/cl_sgemm.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017, 2018 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -175,32 +175,7 @@ public:
     }
     void do_teardown() override
     {
-        if(output_filename.empty()) /* If the inputs were not files, print the results */
-        {
-            std::cout << "\nMatrix 1:" << std::endl;
-            src0.map(true);
-            src0.print(std::cout, IOFormatInfo());
-            src0.unmap();
-
-            std::cout << "Matrix 2:" << std::endl;
-            src1.map(true);
-            src1.print(std::cout, IOFormatInfo());
-            src1.unmap();
-
-            std::cout << "Matrix 3:" << std::endl;
-            src2.map(true);
-            src2.print(std::cout, IOFormatInfo());
-            src2.unmap();
-
-            std::cout << "Alpha:" << alpha << "\n\n";
-            std::cout << "Beta:" << beta << "\n\n";
-
-            std::cout << "Output Matrix:" << std::endl;
-            dst.map(true);
-            dst.print(std::cout, IOFormatInfo());
-            dst.unmap();
-        }
-        else /* Save to .npy file */
+        if(!output_filename.empty()) /* Save to .npy file */
         {
             save_to_npy(dst, output_filename, is_fortran);
         }
diff --git a/src/core/CL/OpenCL.cpp b/src/core/CL/OpenCL.cpp
index 726279c6ea..06d10a450e 100644
--- a/src/core/CL/OpenCL.cpp
+++ b/src/core/CL/OpenCL.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -108,6 +108,9 @@ bool CLSymbols::load(const std::string &library)
     LOAD_FUNCTION_PTR(clRetainEvent, handle);
     LOAD_FUNCTION_PTR(clGetPlatformIDs, handle);
     LOAD_FUNCTION_PTR(clGetKernelWorkGroupInfo, handle);
+    LOAD_FUNCTION_PTR(clGetCommandQueueInfo, handle);
+    LOAD_FUNCTION_PTR(clGetKernelInfo, handle);
+    LOAD_FUNCTION_PTR(clGetEventProfilingInfo, handle);
 
 #undef LOAD_FUNCTION_PTR
 
@@ -729,3 +732,60 @@ clGetKernelWorkGroupInfo(cl_kernel                 kernel,
         return CL_OUT_OF_RESOURCES;
     }
 }
+
+cl_int
+clGetCommandQueueInfo(cl_command_queue      command_queue,
+                      cl_command_queue_info param_name,
+                      size_t                param_value_size,
+                      void                 *param_value,
+                      size_t               *param_value_size_ret)
+{
+    arm_compute::CLSymbols::get().load_default();
+    auto func = arm_compute::CLSymbols::get().clGetCommandQueueInfo_ptr;
+    if(func != nullptr)
+    {
+        return func(command_queue, param_name, param_value_size, param_value, param_value_size_ret);
+    }
+    else
+    {
+        return CL_OUT_OF_RESOURCES;
+    }
+}
+
+cl_int
+clGetKernelInfo(cl_kernel      kernel,
+                cl_kernel_info param_name,
+                size_t         param_value_size,
+                void          *param_value,
+                size_t        *param_value_size_ret)
+{
+    arm_compute::CLSymbols::get().load_default();
+    auto func = arm_compute::CLSymbols::get().clGetKernelInfo_ptr;
+    if(func != nullptr)
+    {
+        return func(kernel, param_name, param_value_size, param_value, param_value_size_ret);
+    }
+    else
+    {
+        return CL_OUT_OF_RESOURCES;
+    }
+}
+
+cl_int
+clGetEventProfilingInfo(cl_event          event,
+                        cl_profiling_info param_name,
+                        size_t            param_value_size,
+                        void             *param_value,
+                        size_t           *param_value_size_ret)
+{
+    arm_compute::CLSymbols::get().load_default();
+    auto func = arm_compute::CLSymbols::get().clGetEventProfilingInfo_ptr;
+    if(func != nullptr)
+    {
+        return func(event, param_name, param_value_size, param_value, param_value_size_ret);
+    }
+    else
+    {
+        return CL_OUT_OF_RESOURCES;
+    }
+}
\ No newline at end of file
diff --git a/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3Kernel.cpp b/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3Kernel.cpp
index 1c0fe9984f..a9167ee859 100644
--- a/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3Kernel.cpp
+++ b/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3Kernel.cpp
@@ -174,6 +174,20 @@ void CLDepthwiseConvolutionLayer3x3Kernel::configure(const ICLTensor *input, con
     output_access.set_valid_region(win, ValidRegion(Coordinates(), output->info()->tensor_shape()));
 
     ICLKernel::configure(win);
+
+    // Set config_id for enabling LWS tuning
+    _config_id = "depthwise_convolution3x3_";
+    _config_id += lower_string(string_from_data_type(input->info()->data_type()));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(input->info()->dimension(0));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(input->info()->dimension(1));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(input->info()->dimension(2));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(output->info()->dimension(0));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(output->info()->dimension(1));
 }
 
 void CLDepthwiseConvolutionLayer3x3Kernel::run(const Window &window, cl::CommandQueue &queue)
diff --git a/src/runtime/CL/CLTuner.cpp b/src/runtime/CL/CLTuner.cpp
index 351f6751c3..917d56756a 100644
--- a/src/runtime/CL/CLTuner.cpp
+++ b/src/runtime/CL/CLTuner.cpp
@@ -24,21 +24,47 @@
 #include "arm_compute/runtime/CL/CLTuner.h"
 
 #include "arm_compute/core/CL/ICLKernel.h"
+#include "arm_compute/core/Error.h"
 #include "arm_compute/runtime/CL/CLScheduler.h"
 
-#include <chrono>
 #include <limits>
 #include <string>
 
 using namespace arm_compute;
 
 CLTuner::CLTuner()
-    : _lws_table()
+    : real_function(nullptr), _lws_table(), _queue(), _queue_profiler(), _kernel_event()
 {
 }
 
+void CLTuner::set_cl_kernel_event(cl_event kernel_event)
+{
+    _kernel_event = kernel_event;
+}
+
 void CLTuner::tune_kernel(ICLKernel &kernel)
 {
+    if(real_function == nullptr)
+    {
+        real_function = CLSymbols::get().clEnqueueNDRangeKernel_ptr;
+
+        // Get the default queue
+        _queue = CLScheduler::get().queue();
+
+        // Check if we can use the OpenCL timer with the default queue
+        cl_command_queue_properties props = _queue.getInfo<CL_QUEUE_PROPERTIES>();
+
+        if((props & CL_QUEUE_PROFILING_ENABLE) == 0)
+        {
+            // Set the queue for profiling
+            _queue_profiler = cl::CommandQueue(CLScheduler::get().context(), props | CL_QUEUE_PROFILING_ENABLE);
+        }
+        else
+        {
+            _queue_profiler = _queue;
+        }
+    }
+
     // Get the configuration ID from the kernel
     const std::string &config_id = kernel.config_id();
 
@@ -49,6 +75,9 @@ void CLTuner::tune_kernel(ICLKernel &kernel)
 
         if(p == _lws_table.end())
         {
+            // Set profiler queue
+            CLScheduler::get().set_queue(_queue_profiler);
+
             // Find the optimal LWS for the kernel
             cl::NDRange opt_lws = find_optimal_lws(kernel);
 
@@ -57,6 +86,9 @@ void CLTuner::tune_kernel(ICLKernel &kernel)
 
             // Set Local-Workgroup-Size
             kernel.set_lws_hint(opt_lws);
+
+            // Restore queue
+            CLScheduler::get().set_queue(_queue);
         }
         else
         {
@@ -68,11 +100,12 @@ void CLTuner::tune_kernel(ICLKernel &kernel)
 
 cl::NDRange CLTuner::find_optimal_lws(ICLKernel &kernel)
 {
-    cl::CommandQueue q = CLScheduler::get().queue();
+    // Start intercepting enqueues:
+    CLSymbols::get().clEnqueueNDRangeKernel_ptr = Interceptor(*this);
 
-    double min_exec_time = std::numeric_limits<double>::max();
+    cl_ulong min_exec_time = std::numeric_limits<cl_ulong>::max();
 
-    cl::NDRange opt_lws = cl::NDRange(1, 1, 1);
+    cl::NDRange opt_lws = cl::NullRange;
 
     const int x_step = std::max(1, kernel.window().x().step());
     const int y_step = std::max(1, kernel.window().y().step());
@@ -81,43 +114,64 @@ cl::NDRange CLTuner::find_optimal_lws(ICLKernel &kernel)
     const int y_end  = kernel.window().y().end() - kernel.window().y().start() / y_step > 1 ? 16 : 1;
     const int z_end  = kernel.window().z().end() - kernel.window().z().start() / z_step > 1 ? 8 : 1;
 
+    // First run using the default LWS
+    {
+        cl::NDRange lws_test = cl::NullRange;
+
+        kernel.set_lws_hint(lws_test);
+
+        // Run the kernel
+        kernel.run(kernel.window(), _queue_profiler);
+
+        CLScheduler::get().sync();
+
+        const cl_ulong start = _kernel_event.getProfilingInfo<CL_PROFILING_COMMAND_START>();
+        const cl_ulong end   = _kernel_event.getProfilingInfo<CL_PROFILING_COMMAND_END>();
+        const cl_ulong diff  = end - start;
+
+        min_exec_time = diff;
+    }
+
     for(int z = 1; z <= z_end; ++z)
     {
         for(int y = 1; y <= y_end; ++y)
         {
             for(int x = 1; x <= x_end; ++x)
             {
-                if(x == 1 && y == 1 && z == 1)
+                cl::NDRange lws_test = cl::NDRange(x, y, z);
+
+                const bool invalid_lws = (x * y * z > static_cast<int>(kernel.get_max_workgroup_size())) || (x == 1 && y == 1 && z == 1);
+
+                if(invalid_lws)
                 {
                     continue;
                 }
 
-                cl::NDRange lws_test = cl::NDRange(x, y, z);
-
                 //Set the Local-Workgroup-Size
                 kernel.set_lws_hint(lws_test);
 
-                auto t_start = std::chrono::high_resolution_clock::now();
-
-                // Run
-                kernel.run(kernel.window(), q);
+                // Run the kernel
+                kernel.run(kernel.window(), _queue_profiler);
 
                 CLScheduler::get().sync();
 
-                auto t_stop = std::chrono::high_resolution_clock::now();
-
-                std::chrono::duration<double, std::nano> fp_nano = t_stop - t_start;
+                const cl_ulong start = _kernel_event.getProfilingInfo<CL_PROFILING_COMMAND_START>();
+                const cl_ulong end   = _kernel_event.getProfilingInfo<CL_PROFILING_COMMAND_END>();
+                const cl_ulong diff  = end - start;
 
                 // Check the execution time
-                if(fp_nano.count() < min_exec_time)
+                if(diff < min_exec_time)
                 {
-                    min_exec_time = fp_nano.count();
+                    min_exec_time = diff;
                     opt_lws       = cl::NDRange(x, y, z);
                 }
             }
         }
     }
 
+    // Restore real function
+    CLSymbols::get().clEnqueueNDRangeKernel_ptr = real_function;
+
     return opt_lws;
 }
 
@@ -130,4 +184,24 @@ void CLTuner::import_lws_table(const std::unordered_map<std::string, cl::NDRange
 const std::unordered_map<std::string, cl::NDRange> &CLTuner::export_lws_table()
 {
     return _lws_table;
+}
+
+Interceptor::Interceptor(CLTuner &tuner)
+    : _tuner(tuner)
+{
+}
+
+cl_int Interceptor::operator()(cl_command_queue command_queue, cl_kernel kernel, cl_uint work_dim, const size_t *gwo, const size_t *gws, const size_t *lws, cl_uint num_events_in_wait_list,
+                               const cl_event *event_wait_list, cl_event *event)
+{
+    ARM_COMPUTE_ERROR_ON_MSG(event != nullptr, "Not supported");
+    ARM_COMPUTE_UNUSED(event);
+
+    cl_event tmp;
+    cl_int   retval = _tuner.real_function(command_queue, kernel, work_dim, gwo, gws, lws, num_events_in_wait_list, event_wait_list, &tmp);
+
+    // Set OpenCL event
+    _tuner.set_cl_kernel_event(tmp);
+
+    return retval;
 }
\ No newline at end of file
-- 
cgit v1.2.1