diff options
author | Anthony Barbier <anthony.barbier@arm.com> | 2018-02-27 13:08:00 +0000 |
---|---|---|
committer | Anthony Barbier <anthony.barbier@arm.com> | 2018-11-02 16:48:33 +0000 |
commit | 8db8318e2292d79f1971ae72198a3031f8ea65bd (patch) | |
tree | 1dd901c4c6036137596250d53fc43b5ca28ae4fe | |
parent | 861f0db548befac0cd5fb28fe2fa8ea1828c715d (diff) | |
download | ComputeLibrary-8db8318e2292d79f1971ae72198a3031f8ea65bd.tar.gz |
COMPMID-978 Load/Store tuning data from file
Change-Id: I1d1f402df3a58704c021b9866d489844fb5e7d7a
Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/122395
Tested-by: Jenkins <bsgcomp@arm.com>
Reviewed-by: Georgios Pinitas <georgios.pinitas@arm.com>
-rw-r--r-- | arm_compute/runtime/CL/CLTuner.h | 67 | ||||
-rw-r--r-- | src/graph/Graph.cpp | 13 | ||||
-rw-r--r-- | src/runtime/CL/CLTuner.cpp | 210 |
3 files changed, 208 insertions, 82 deletions
diff --git a/arm_compute/runtime/CL/CLTuner.h b/arm_compute/runtime/CL/CLTuner.h index 386994682d..251848814d 100644 --- a/arm_compute/runtime/CL/CLTuner.h +++ b/arm_compute/runtime/CL/CLTuner.h @@ -37,12 +37,27 @@ class ICLKernel; class CLTuner : public ICLTuner { public: - /** Constructor */ - CLTuner(); + /** Constructor + * + * @param[in] tune_new_kernels Find the optimal local workgroup size for kernels which are not present in the table ? + * + */ + CLTuner(bool tune_new_kernels = true); /** Destructor */ ~CLTuner() = default; + /* Setter for tune_new_kernels option + * + * @param[in] tune_new_kernels Find the optimal local workgroup size for kernels which are not present in the table ? + */ + void set_tune_new_kernels(bool tune_new_kernels); + /** Manually add a LWS for a kernel + * + * @param[in] kernel_id Unique identifiant of the kernel + * @param[in] optimal_lws Optimal local workgroup size to use for the given kernel + */ + void add_lws_to_table(const std::string &kernel_id, cl::NDRange optimal_lws); /** Import LWS table * * @param[in] lws_table The unordered_map container to import @@ -53,10 +68,7 @@ public: * * return The lws table as unordered_map container */ - const std::unordered_map<std::string, cl::NDRange> &export_lws_table(); - - // Inherited methods overridden: - void tune_kernel(ICLKernel &kernel) override; + const std::unordered_map<std::string, cl::NDRange> &export_lws_table() const; /** Set the OpenCL kernel event * @@ -66,7 +78,10 @@ public: */ void set_cl_kernel_event(cl_event kernel_event); - std::function<decltype(clEnqueueNDRangeKernel)> real_function; + std::function<decltype(clEnqueueNDRangeKernel)> real_clEnqueueNDRangeKernel; + + // Inherited methods overridden: + void tune_kernel(ICLKernel &kernel) override; private: /** Find optimal LWS using brute-force approach @@ -81,33 +96,37 @@ private: cl::CommandQueue _queue; cl::CommandQueue _queue_profiler; cl::Event _kernel_event; + bool _tune_new_kernels; }; -/* Function to be used to intercept kernel enqueues and store their OpenCL Event */ -class Interceptor +class CLFileTuner : public CLTuner { public: - explicit Interceptor(CLTuner &tuner); + /** Constructor + * + * @param[in] file_path File to load/store the tuning information from + * @param[in] update_file If true, save the new LWS table to the file on exit. + * @param[in] tune_new_kernels Find the optimal local workgroup size for kernels which are not present in the table ? + */ + CLFileTuner(std::string file_path = "acl_tuner.csv", bool update_file = false, bool tune_new_kernels = false); - /** clEnqueueNDRangeKernel interface + /** Save the content of the LWS table to file + */ + void save_to_file() const; + /* Setter for update_file option * - * @param[in] command_queue A valid command-queue. The kernel will be queued for execution on the device associated with command_queue. - * @param[in] kernel A valid kernel object. The OpenCL context associated with kernel and command_queue must be the same. - * @param[in] work_dim The number of dimensions used to specify the global work-items and work-items in the work-group. work_dim must be greater than zero and less than or equal to CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS. - * @param[in] gwo Global-Workgroup-Offset. It can be used to specify an array of work_dim unsigned values that describe the offset used to calculate the global ID of a work-item. If global_work_offset is NULL, the global IDs start at offset (0, 0, ... 0). - * @param[in] gws Global-Workgroup-Size. Points to an array of work_dim unsigned values that describe the number of global work-items in work_dim dimensions that will execute the kernel function. - * @param[in] lws Local-Workgroup-Size. Points to an array of work_dim unsigned values that describe the number of work-items that make up a work-group - * @param[in] num_events_in_wait_list Number of events in the waiting list - * @param[in] event_wait_list Event waiting list - * @param[in] event OpenCL kernel event + * @param[in] update_file If true, save the new LWS table to the file on exit. + */ + void set_update_file(bool update_file); + /** Destructor * - * @return the OpenCL status + * Will save the LWS table to the file if the CLFileTuner was created with update_file enabled. */ - cl_int operator()(cl_command_queue command_queue, cl_kernel kernel, cl_uint work_dim, const size_t *gwo, const size_t *gws, const size_t *lws, cl_uint num_events_in_wait_list, - const cl_event *event_wait_list, cl_event *event); + ~CLFileTuner(); + const std::string filename; private: - CLTuner &_tuner; + bool _update_file; }; } #endif /*__ARM_COMPUTE_CLTUNER_H__ */ diff --git a/src/graph/Graph.cpp b/src/graph/Graph.cpp index b6c6822c36..b1698e4672 100644 --- a/src/graph/Graph.cpp +++ b/src/graph/Graph.cpp @@ -62,7 +62,7 @@ public: std::unique_ptr<INode> _current_node{ nullptr }; ITensorObject *_current_output{ nullptr }; bool _info_enabled{ false }; - CLTuner _tuner{}; + CLFileTuner _tuner{}; private: ITensorObject *_current_input{ nullptr }; @@ -85,14 +85,9 @@ void Graph::graph_init(const bool use_cl_tuner) // Check if OpenCL is available and initialize the scheduler if(opencl_is_available()) { - if(use_cl_tuner) - { - arm_compute::CLScheduler::get().default_init(&_pimpl->_tuner); - } - else - { - arm_compute::CLScheduler::get().default_init(); - } + _pimpl->_tuner.set_tune_new_kernels(use_cl_tuner); + _pimpl->_tuner.set_update_file(use_cl_tuner); + arm_compute::CLScheduler::get().default_init(&_pimpl->_tuner); } } diff --git a/src/runtime/CL/CLTuner.cpp b/src/runtime/CL/CLTuner.cpp index cf5b5bce2d..29ddbea460 100644 --- a/src/runtime/CL/CLTuner.cpp +++ b/src/runtime/CL/CLTuner.cpp @@ -27,44 +27,81 @@ #include "arm_compute/core/Error.h" #include "arm_compute/runtime/CL/CLScheduler.h" +#include <fstream> +#include <iostream> #include <limits> #include <string> using namespace arm_compute; -CLTuner::CLTuner() - : real_function(nullptr), _lws_table(), _queue(), _queue_profiler(), _kernel_event() +namespace { -} +/* Function to be used to intercept kernel enqueues and store their OpenCL Event */ +class Interceptor +{ +public: + explicit Interceptor(CLTuner &tuner); + + /** clEnqueueNDRangeKernel interface + * + * @param[in] command_queue A valid command-queue. The kernel will be queued for execution on the device associated with command_queue. + * @param[in] kernel A valid kernel object. The OpenCL context associated with kernel and command_queue must be the same. + * @param[in] work_dim The number of dimensions used to specify the global work-items and work-items in the work-group. work_dim must be greater than zero and less than or equal to CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS. + * @param[in] gwo Global-Workgroup-Offset. It can be used to specify an array of work_dim unsigned values that describe the offset used to calculate the global ID of a work-item. If global_work_offset is NULL, the global IDs start at offset (0, 0, ... 0). + * @param[in] gws Global-Workgroup-Size. Points to an array of work_dim unsigned values that describe the number of global work-items in work_dim dimensions that will execute the kernel function. + * @param[in] lws Local-Workgroup-Size. Points to an array of work_dim unsigned values that describe the number of work-items that make up a work-group + * @param[in] num_events_in_wait_list Number of events in the waiting list + * @param[in] event_wait_list Event waiting list + * @param[in] event OpenCL kernel event + * + * @return the OpenCL status + */ + cl_int operator()(cl_command_queue command_queue, cl_kernel kernel, cl_uint work_dim, const size_t *gwo, const size_t *gws, const size_t *lws, cl_uint num_events_in_wait_list, + const cl_event *event_wait_list, cl_event *event); + +private: + CLTuner &_tuner; +}; -void CLTuner::set_cl_kernel_event(cl_event kernel_event) +Interceptor::Interceptor(CLTuner &tuner) + : _tuner(tuner) { - _kernel_event = kernel_event; } -void CLTuner::tune_kernel(ICLKernel &kernel) +cl_int Interceptor::operator()(cl_command_queue command_queue, cl_kernel kernel, cl_uint work_dim, const size_t *gwo, const size_t *gws, const size_t *lws, cl_uint num_events_in_wait_list, + const cl_event *event_wait_list, cl_event *event) { - if(real_function == nullptr) - { - real_function = CLSymbols::get().clEnqueueNDRangeKernel_ptr; + ARM_COMPUTE_ERROR_ON_MSG(event != nullptr, "Not supported"); + ARM_COMPUTE_UNUSED(event); - // Get the default queue - _queue = CLScheduler::get().queue(); + cl_event tmp; + cl_int retval = _tuner.real_clEnqueueNDRangeKernel(command_queue, kernel, work_dim, gwo, gws, lws, num_events_in_wait_list, event_wait_list, &tmp); - // Check if we can use the OpenCL timer with the default queue - cl_command_queue_properties props = _queue.getInfo<CL_QUEUE_PROPERTIES>(); + // Set OpenCL event + _tuner.set_cl_kernel_event(tmp); - if((props & CL_QUEUE_PROFILING_ENABLE) == 0) - { - // Set the queue for profiling - _queue_profiler = cl::CommandQueue(CLScheduler::get().context(), props | CL_QUEUE_PROFILING_ENABLE); - } - else - { - _queue_profiler = _queue; - } - } + return retval; +} + +} // namespace + +CLTuner::CLTuner(bool tune_new_kernels) + : real_clEnqueueNDRangeKernel(nullptr), _lws_table(), _queue(), _queue_profiler(), _kernel_event(), _tune_new_kernels(tune_new_kernels) +{ +} +void CLTuner::set_cl_kernel_event(cl_event kernel_event) +{ + _kernel_event = kernel_event; +} + +void CLTuner::set_tune_new_kernels(bool tune_new_kernels) +{ + _tune_new_kernels = tune_new_kernels; +} + +void CLTuner::tune_kernel(ICLKernel &kernel) +{ // Get the configuration ID from the kernel const std::string &config_id = kernel.config_id(); @@ -75,20 +112,17 @@ void CLTuner::tune_kernel(ICLKernel &kernel) if(p == _lws_table.end()) { - // Set profiler queue - CLScheduler::get().set_queue(_queue_profiler); - - // Find the optimal LWS for the kernel - cl::NDRange opt_lws = find_optimal_lws(kernel); + if(_tune_new_kernels) + { + // Find the optimal LWS for the kernel + cl::NDRange opt_lws = find_optimal_lws(kernel); - // Insert the optimal LWS in the table - _lws_table.emplace(config_id, opt_lws); + // Insert the optimal LWS in the table + add_lws_to_table(config_id, opt_lws); - // Set Local-Workgroup-Size - kernel.set_lws_hint(opt_lws); - - // Restore queue - CLScheduler::get().set_queue(_queue); + // Set Local-Workgroup-Size + kernel.set_lws_hint(opt_lws); + } } else { @@ -98,8 +132,36 @@ void CLTuner::tune_kernel(ICLKernel &kernel) } } +void CLTuner::add_lws_to_table(const std::string &kernel_id, cl::NDRange optimal_lws) +{ + _lws_table.emplace(kernel_id, optimal_lws); +} + cl::NDRange CLTuner::find_optimal_lws(ICLKernel &kernel) { + if(real_clEnqueueNDRangeKernel == nullptr) + { + real_clEnqueueNDRangeKernel = CLSymbols::get().clEnqueueNDRangeKernel_ptr; + + // Get the default queue + _queue = CLScheduler::get().queue(); + + // Check if we can use the OpenCL timer with the default queue + cl_command_queue_properties props = _queue.getInfo<CL_QUEUE_PROPERTIES>(); + + if((props & CL_QUEUE_PROFILING_ENABLE) == 0) + { + // Set the queue for profiling + _queue_profiler = cl::CommandQueue(CLScheduler::get().context(), props | CL_QUEUE_PROFILING_ENABLE); + } + else + { + _queue_profiler = _queue; + } + } + // Set profiler queue + CLScheduler::get().set_queue(_queue_profiler); + // Start intercepting enqueues: CLSymbols::get().clEnqueueNDRangeKernel_ptr = Interceptor(*this); @@ -170,7 +232,10 @@ cl::NDRange CLTuner::find_optimal_lws(ICLKernel &kernel) } // Restore real function - CLSymbols::get().clEnqueueNDRangeKernel_ptr = real_function; + CLSymbols::get().clEnqueueNDRangeKernel_ptr = real_clEnqueueNDRangeKernel; + + // Restore queue + CLScheduler::get().set_queue(_queue); return opt_lws; } @@ -181,27 +246,74 @@ void CLTuner::import_lws_table(const std::unordered_map<std::string, cl::NDRange _lws_table = lws_table; } -const std::unordered_map<std::string, cl::NDRange> &CLTuner::export_lws_table() +const std::unordered_map<std::string, cl::NDRange> &CLTuner::export_lws_table() const { return _lws_table; } -Interceptor::Interceptor(CLTuner &tuner) - : _tuner(tuner) +CLFileTuner::CLFileTuner(std::string file_path, bool update_file, bool tune_new_kernels) + : CLTuner(tune_new_kernels), filename(std::move(file_path)), _update_file(update_file) { + std::ifstream fs(filename); + if(fs.is_open()) + { + std::string line; + while(!std::getline(fs, line).fail()) + { + std::istringstream ss(line); + std::string token; + if(!std::getline(ss, token, ';')) + { + continue; + } + std::string kernel_id = token; + cl::NDRange lws(1, 1, 1); + bool lws_is_valid = true; + for(int i = 0; i < 3; i++) + { + if(std::getline(ss, token, ';').fail()) + { + lws_is_valid = false; + break; + } + lws.get()[i] = support::cpp11::stoi(token); + } + if(!lws_is_valid) + { + continue; // Skip this kernel + } + + // If all dimensions are 0: reset to NullRange (i.e nullptr) + if(lws[0] == 0 && lws[1] == 0 && lws[2] == 0) + { + lws = cl::NullRange; + } + add_lws_to_table(kernel_id, lws); + } + fs.close(); + } } -cl_int Interceptor::operator()(cl_command_queue command_queue, cl_kernel kernel, cl_uint work_dim, const size_t *gwo, const size_t *gws, const size_t *lws, cl_uint num_events_in_wait_list, - const cl_event *event_wait_list, cl_event *event) +void CLFileTuner::save_to_file() const { - ARM_COMPUTE_ERROR_ON_MSG(event != nullptr, "Not supported"); - ARM_COMPUTE_UNUSED(event); - - cl_event tmp; - cl_int retval = _tuner.real_function(command_queue, kernel, work_dim, gwo, gws, lws, num_events_in_wait_list, event_wait_list, &tmp); + const std::unordered_map<std::string, cl::NDRange> &table = export_lws_table(); + std::ofstream fs(filename); + for(auto kernel_data : table) + { + fs << kernel_data.first << ";" << kernel_data.second[0] << ";" << kernel_data.second[1] << ";" << kernel_data.second[2] << std::endl; + } + fs.close(); +} - // Set OpenCL event - _tuner.set_cl_kernel_event(tmp); +CLFileTuner::~CLFileTuner() +{ + if(_update_file) + { + save_to_file(); + } +} - return retval; +void CLFileTuner::set_update_file(bool update_file) +{ + _update_file = update_file; } |