From 8db8318e2292d79f1971ae72198a3031f8ea65bd Mon Sep 17 00:00:00 2001 From: Anthony Barbier Date: Tue, 27 Feb 2018 13:08:00 +0000 Subject: COMPMID-978 Load/Store tuning data from file Change-Id: I1d1f402df3a58704c021b9866d489844fb5e7d7a Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/122395 Tested-by: Jenkins Reviewed-by: Georgios Pinitas --- src/runtime/CL/CLTuner.cpp | 210 ++++++++++++++++++++++++++++++++++----------- 1 file changed, 161 insertions(+), 49 deletions(-) (limited to 'src/runtime/CL/CLTuner.cpp') diff --git a/src/runtime/CL/CLTuner.cpp b/src/runtime/CL/CLTuner.cpp index cf5b5bce2d..29ddbea460 100644 --- a/src/runtime/CL/CLTuner.cpp +++ b/src/runtime/CL/CLTuner.cpp @@ -27,44 +27,81 @@ #include "arm_compute/core/Error.h" #include "arm_compute/runtime/CL/CLScheduler.h" +#include +#include #include #include using namespace arm_compute; -CLTuner::CLTuner() - : real_function(nullptr), _lws_table(), _queue(), _queue_profiler(), _kernel_event() +namespace { -} +/* Function to be used to intercept kernel enqueues and store their OpenCL Event */ +class Interceptor +{ +public: + explicit Interceptor(CLTuner &tuner); + + /** clEnqueueNDRangeKernel interface + * + * @param[in] command_queue A valid command-queue. The kernel will be queued for execution on the device associated with command_queue. + * @param[in] kernel A valid kernel object. The OpenCL context associated with kernel and command_queue must be the same. + * @param[in] work_dim The number of dimensions used to specify the global work-items and work-items in the work-group. work_dim must be greater than zero and less than or equal to CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS. + * @param[in] gwo Global-Workgroup-Offset. It can be used to specify an array of work_dim unsigned values that describe the offset used to calculate the global ID of a work-item. If global_work_offset is NULL, the global IDs start at offset (0, 0, ... 0). + * @param[in] gws Global-Workgroup-Size. Points to an array of work_dim unsigned values that describe the number of global work-items in work_dim dimensions that will execute the kernel function. + * @param[in] lws Local-Workgroup-Size. Points to an array of work_dim unsigned values that describe the number of work-items that make up a work-group + * @param[in] num_events_in_wait_list Number of events in the waiting list + * @param[in] event_wait_list Event waiting list + * @param[in] event OpenCL kernel event + * + * @return the OpenCL status + */ + cl_int operator()(cl_command_queue command_queue, cl_kernel kernel, cl_uint work_dim, const size_t *gwo, const size_t *gws, const size_t *lws, cl_uint num_events_in_wait_list, + const cl_event *event_wait_list, cl_event *event); + +private: + CLTuner &_tuner; +}; -void CLTuner::set_cl_kernel_event(cl_event kernel_event) +Interceptor::Interceptor(CLTuner &tuner) + : _tuner(tuner) { - _kernel_event = kernel_event; } -void CLTuner::tune_kernel(ICLKernel &kernel) +cl_int Interceptor::operator()(cl_command_queue command_queue, cl_kernel kernel, cl_uint work_dim, const size_t *gwo, const size_t *gws, const size_t *lws, cl_uint num_events_in_wait_list, + const cl_event *event_wait_list, cl_event *event) { - if(real_function == nullptr) - { - real_function = CLSymbols::get().clEnqueueNDRangeKernel_ptr; + ARM_COMPUTE_ERROR_ON_MSG(event != nullptr, "Not supported"); + ARM_COMPUTE_UNUSED(event); - // Get the default queue - _queue = CLScheduler::get().queue(); + cl_event tmp; + cl_int retval = _tuner.real_clEnqueueNDRangeKernel(command_queue, kernel, work_dim, gwo, gws, lws, num_events_in_wait_list, event_wait_list, &tmp); - // Check if we can use the OpenCL timer with the default queue - cl_command_queue_properties props = _queue.getInfo(); + // Set OpenCL event + _tuner.set_cl_kernel_event(tmp); - if((props & CL_QUEUE_PROFILING_ENABLE) == 0) - { - // Set the queue for profiling - _queue_profiler = cl::CommandQueue(CLScheduler::get().context(), props | CL_QUEUE_PROFILING_ENABLE); - } - else - { - _queue_profiler = _queue; - } - } + return retval; +} + +} // namespace + +CLTuner::CLTuner(bool tune_new_kernels) + : real_clEnqueueNDRangeKernel(nullptr), _lws_table(), _queue(), _queue_profiler(), _kernel_event(), _tune_new_kernels(tune_new_kernels) +{ +} +void CLTuner::set_cl_kernel_event(cl_event kernel_event) +{ + _kernel_event = kernel_event; +} + +void CLTuner::set_tune_new_kernels(bool tune_new_kernels) +{ + _tune_new_kernels = tune_new_kernels; +} + +void CLTuner::tune_kernel(ICLKernel &kernel) +{ // Get the configuration ID from the kernel const std::string &config_id = kernel.config_id(); @@ -75,20 +112,17 @@ void CLTuner::tune_kernel(ICLKernel &kernel) if(p == _lws_table.end()) { - // Set profiler queue - CLScheduler::get().set_queue(_queue_profiler); - - // Find the optimal LWS for the kernel - cl::NDRange opt_lws = find_optimal_lws(kernel); + if(_tune_new_kernels) + { + // Find the optimal LWS for the kernel + cl::NDRange opt_lws = find_optimal_lws(kernel); - // Insert the optimal LWS in the table - _lws_table.emplace(config_id, opt_lws); + // Insert the optimal LWS in the table + add_lws_to_table(config_id, opt_lws); - // Set Local-Workgroup-Size - kernel.set_lws_hint(opt_lws); - - // Restore queue - CLScheduler::get().set_queue(_queue); + // Set Local-Workgroup-Size + kernel.set_lws_hint(opt_lws); + } } else { @@ -98,8 +132,36 @@ void CLTuner::tune_kernel(ICLKernel &kernel) } } +void CLTuner::add_lws_to_table(const std::string &kernel_id, cl::NDRange optimal_lws) +{ + _lws_table.emplace(kernel_id, optimal_lws); +} + cl::NDRange CLTuner::find_optimal_lws(ICLKernel &kernel) { + if(real_clEnqueueNDRangeKernel == nullptr) + { + real_clEnqueueNDRangeKernel = CLSymbols::get().clEnqueueNDRangeKernel_ptr; + + // Get the default queue + _queue = CLScheduler::get().queue(); + + // Check if we can use the OpenCL timer with the default queue + cl_command_queue_properties props = _queue.getInfo(); + + if((props & CL_QUEUE_PROFILING_ENABLE) == 0) + { + // Set the queue for profiling + _queue_profiler = cl::CommandQueue(CLScheduler::get().context(), props | CL_QUEUE_PROFILING_ENABLE); + } + else + { + _queue_profiler = _queue; + } + } + // Set profiler queue + CLScheduler::get().set_queue(_queue_profiler); + // Start intercepting enqueues: CLSymbols::get().clEnqueueNDRangeKernel_ptr = Interceptor(*this); @@ -170,7 +232,10 @@ cl::NDRange CLTuner::find_optimal_lws(ICLKernel &kernel) } // Restore real function - CLSymbols::get().clEnqueueNDRangeKernel_ptr = real_function; + CLSymbols::get().clEnqueueNDRangeKernel_ptr = real_clEnqueueNDRangeKernel; + + // Restore queue + CLScheduler::get().set_queue(_queue); return opt_lws; } @@ -181,27 +246,74 @@ void CLTuner::import_lws_table(const std::unordered_map &CLTuner::export_lws_table() +const std::unordered_map &CLTuner::export_lws_table() const { return _lws_table; } -Interceptor::Interceptor(CLTuner &tuner) - : _tuner(tuner) +CLFileTuner::CLFileTuner(std::string file_path, bool update_file, bool tune_new_kernels) + : CLTuner(tune_new_kernels), filename(std::move(file_path)), _update_file(update_file) { + std::ifstream fs(filename); + if(fs.is_open()) + { + std::string line; + while(!std::getline(fs, line).fail()) + { + std::istringstream ss(line); + std::string token; + if(!std::getline(ss, token, ';')) + { + continue; + } + std::string kernel_id = token; + cl::NDRange lws(1, 1, 1); + bool lws_is_valid = true; + for(int i = 0; i < 3; i++) + { + if(std::getline(ss, token, ';').fail()) + { + lws_is_valid = false; + break; + } + lws.get()[i] = support::cpp11::stoi(token); + } + if(!lws_is_valid) + { + continue; // Skip this kernel + } + + // If all dimensions are 0: reset to NullRange (i.e nullptr) + if(lws[0] == 0 && lws[1] == 0 && lws[2] == 0) + { + lws = cl::NullRange; + } + add_lws_to_table(kernel_id, lws); + } + fs.close(); + } } -cl_int Interceptor::operator()(cl_command_queue command_queue, cl_kernel kernel, cl_uint work_dim, const size_t *gwo, const size_t *gws, const size_t *lws, cl_uint num_events_in_wait_list, - const cl_event *event_wait_list, cl_event *event) +void CLFileTuner::save_to_file() const { - ARM_COMPUTE_ERROR_ON_MSG(event != nullptr, "Not supported"); - ARM_COMPUTE_UNUSED(event); - - cl_event tmp; - cl_int retval = _tuner.real_function(command_queue, kernel, work_dim, gwo, gws, lws, num_events_in_wait_list, event_wait_list, &tmp); + const std::unordered_map &table = export_lws_table(); + std::ofstream fs(filename); + for(auto kernel_data : table) + { + fs << kernel_data.first << ";" << kernel_data.second[0] << ";" << kernel_data.second[1] << ";" << kernel_data.second[2] << std::endl; + } + fs.close(); +} - // Set OpenCL event - _tuner.set_cl_kernel_event(tmp); +CLFileTuner::~CLFileTuner() +{ + if(_update_file) + { + save_to_file(); + } +} - return retval; +void CLFileTuner::set_update_file(bool update_file) +{ + _update_file = update_file; } -- cgit v1.2.1