From 85e6f518ace17547d6f35ed0e1cfbc39ffb95736 Mon Sep 17 00:00:00 2001 From: Gian Marco Date: Thu, 1 Feb 2018 16:57:48 +0000 Subject: COMPMID-891 - Use OpenCL timer in CLTuner Change-Id: I84a914c13b162c4f74321c9cafc30a18ad4ebbdb Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/118797 Tested-by: Jenkins Reviewed-by: Georgios Pinitas --- src/runtime/CL/CLTuner.cpp | 108 ++++++++++++++++++++++++++++++++++++++------- 1 file changed, 91 insertions(+), 17 deletions(-) (limited to 'src/runtime/CL/CLTuner.cpp') diff --git a/src/runtime/CL/CLTuner.cpp b/src/runtime/CL/CLTuner.cpp index 351f6751c3..917d56756a 100644 --- a/src/runtime/CL/CLTuner.cpp +++ b/src/runtime/CL/CLTuner.cpp @@ -24,21 +24,47 @@ #include "arm_compute/runtime/CL/CLTuner.h" #include "arm_compute/core/CL/ICLKernel.h" +#include "arm_compute/core/Error.h" #include "arm_compute/runtime/CL/CLScheduler.h" -#include #include #include using namespace arm_compute; CLTuner::CLTuner() - : _lws_table() + : real_function(nullptr), _lws_table(), _queue(), _queue_profiler(), _kernel_event() { } +void CLTuner::set_cl_kernel_event(cl_event kernel_event) +{ + _kernel_event = kernel_event; +} + void CLTuner::tune_kernel(ICLKernel &kernel) { + if(real_function == nullptr) + { + real_function = CLSymbols::get().clEnqueueNDRangeKernel_ptr; + + // Get the default queue + _queue = CLScheduler::get().queue(); + + // Check if we can use the OpenCL timer with the default queue + cl_command_queue_properties props = _queue.getInfo(); + + if((props & CL_QUEUE_PROFILING_ENABLE) == 0) + { + // Set the queue for profiling + _queue_profiler = cl::CommandQueue(CLScheduler::get().context(), props | CL_QUEUE_PROFILING_ENABLE); + } + else + { + _queue_profiler = _queue; + } + } + // Get the configuration ID from the kernel const std::string &config_id = kernel.config_id(); @@ -49,6 +75,9 @@ void CLTuner::tune_kernel(ICLKernel &kernel) if(p == _lws_table.end()) { + // Set profiler queue + CLScheduler::get().set_queue(_queue_profiler); + // Find the optimal LWS for the kernel cl::NDRange opt_lws = find_optimal_lws(kernel); @@ -57,6 +86,9 @@ void CLTuner::tune_kernel(ICLKernel &kernel) // Set Local-Workgroup-Size kernel.set_lws_hint(opt_lws); + + // Restore queue + CLScheduler::get().set_queue(_queue); } else { @@ -68,11 +100,12 @@ void CLTuner::tune_kernel(ICLKernel &kernel) cl::NDRange CLTuner::find_optimal_lws(ICLKernel &kernel) { - cl::CommandQueue q = CLScheduler::get().queue(); + // Start intercepting enqueues: + CLSymbols::get().clEnqueueNDRangeKernel_ptr = Interceptor(*this); - double min_exec_time = std::numeric_limits::max(); + cl_ulong min_exec_time = std::numeric_limits::max(); - cl::NDRange opt_lws = cl::NDRange(1, 1, 1); + cl::NDRange opt_lws = cl::NullRange; const int x_step = std::max(1, kernel.window().x().step()); const int y_step = std::max(1, kernel.window().y().step()); @@ -81,43 +114,64 @@ cl::NDRange CLTuner::find_optimal_lws(ICLKernel &kernel) const int y_end = kernel.window().y().end() - kernel.window().y().start() / y_step > 1 ? 16 : 1; const int z_end = kernel.window().z().end() - kernel.window().z().start() / z_step > 1 ? 8 : 1; + // First run using the default LWS + { + cl::NDRange lws_test = cl::NullRange; + + kernel.set_lws_hint(lws_test); + + // Run the kernel + kernel.run(kernel.window(), _queue_profiler); + + CLScheduler::get().sync(); + + const cl_ulong start = _kernel_event.getProfilingInfo(); + const cl_ulong end = _kernel_event.getProfilingInfo(); + const cl_ulong diff = end - start; + + min_exec_time = diff; + } + for(int z = 1; z <= z_end; ++z) { for(int y = 1; y <= y_end; ++y) { for(int x = 1; x <= x_end; ++x) { - if(x == 1 && y == 1 && z == 1) + cl::NDRange lws_test = cl::NDRange(x, y, z); + + const bool invalid_lws = (x * y * z > static_cast(kernel.get_max_workgroup_size())) || (x == 1 && y == 1 && z == 1); + + if(invalid_lws) { continue; } - cl::NDRange lws_test = cl::NDRange(x, y, z); - //Set the Local-Workgroup-Size kernel.set_lws_hint(lws_test); - auto t_start = std::chrono::high_resolution_clock::now(); - - // Run - kernel.run(kernel.window(), q); + // Run the kernel + kernel.run(kernel.window(), _queue_profiler); CLScheduler::get().sync(); - auto t_stop = std::chrono::high_resolution_clock::now(); - - std::chrono::duration fp_nano = t_stop - t_start; + const cl_ulong start = _kernel_event.getProfilingInfo(); + const cl_ulong end = _kernel_event.getProfilingInfo(); + const cl_ulong diff = end - start; // Check the execution time - if(fp_nano.count() < min_exec_time) + if(diff < min_exec_time) { - min_exec_time = fp_nano.count(); + min_exec_time = diff; opt_lws = cl::NDRange(x, y, z); } } } } + // Restore real function + CLSymbols::get().clEnqueueNDRangeKernel_ptr = real_function; + return opt_lws; } @@ -130,4 +184,24 @@ void CLTuner::import_lws_table(const std::unordered_map &CLTuner::export_lws_table() { return _lws_table; +} + +Interceptor::Interceptor(CLTuner &tuner) + : _tuner(tuner) +{ +} + +cl_int Interceptor::operator()(cl_command_queue command_queue, cl_kernel kernel, cl_uint work_dim, const size_t *gwo, const size_t *gws, const size_t *lws, cl_uint num_events_in_wait_list, + const cl_event *event_wait_list, cl_event *event) +{ + ARM_COMPUTE_ERROR_ON_MSG(event != nullptr, "Not supported"); + ARM_COMPUTE_UNUSED(event); + + cl_event tmp; + cl_int retval = _tuner.real_function(command_queue, kernel, work_dim, gwo, gws, lws, num_events_in_wait_list, event_wait_list, &tmp); + + // Set OpenCL event + _tuner.set_cl_kernel_event(tmp); + + return retval; } \ No newline at end of file -- cgit v1.2.1