aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorAnthony Barbier <anthony.barbier@arm.com>2018-02-27 13:08:00 +0000
committerAnthony Barbier <anthony.barbier@arm.com>2018-11-02 16:48:33 +0000
commit8db8318e2292d79f1971ae72198a3031f8ea65bd (patch)
tree1dd901c4c6036137596250d53fc43b5ca28ae4fe
parent861f0db548befac0cd5fb28fe2fa8ea1828c715d (diff)
downloadComputeLibrary-8db8318e2292d79f1971ae72198a3031f8ea65bd.tar.gz
COMPMID-978 Load/Store tuning data from file
Change-Id: I1d1f402df3a58704c021b9866d489844fb5e7d7a Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/122395 Tested-by: Jenkins <bsgcomp@arm.com> Reviewed-by: Georgios Pinitas <georgios.pinitas@arm.com>
-rw-r--r--arm_compute/runtime/CL/CLTuner.h67
-rw-r--r--src/graph/Graph.cpp13
-rw-r--r--src/runtime/CL/CLTuner.cpp210
3 files changed, 208 insertions, 82 deletions
diff --git a/arm_compute/runtime/CL/CLTuner.h b/arm_compute/runtime/CL/CLTuner.h
index 386994682d..251848814d 100644
--- a/arm_compute/runtime/CL/CLTuner.h
+++ b/arm_compute/runtime/CL/CLTuner.h
@@ -37,12 +37,27 @@ class ICLKernel;
class CLTuner : public ICLTuner
{
public:
- /** Constructor */
- CLTuner();
+ /** Constructor
+ *
+ * @param[in] tune_new_kernels Find the optimal local workgroup size for kernels which are not present in the table ?
+ *
+ */
+ CLTuner(bool tune_new_kernels = true);
/** Destructor */
~CLTuner() = default;
+ /* Setter for tune_new_kernels option
+ *
+ * @param[in] tune_new_kernels Find the optimal local workgroup size for kernels which are not present in the table ?
+ */
+ void set_tune_new_kernels(bool tune_new_kernels);
+ /** Manually add a LWS for a kernel
+ *
+ * @param[in] kernel_id Unique identifiant of the kernel
+ * @param[in] optimal_lws Optimal local workgroup size to use for the given kernel
+ */
+ void add_lws_to_table(const std::string &kernel_id, cl::NDRange optimal_lws);
/** Import LWS table
*
* @param[in] lws_table The unordered_map container to import
@@ -53,10 +68,7 @@ public:
*
* return The lws table as unordered_map container
*/
- const std::unordered_map<std::string, cl::NDRange> &export_lws_table();
-
- // Inherited methods overridden:
- void tune_kernel(ICLKernel &kernel) override;
+ const std::unordered_map<std::string, cl::NDRange> &export_lws_table() const;
/** Set the OpenCL kernel event
*
@@ -66,7 +78,10 @@ public:
*/
void set_cl_kernel_event(cl_event kernel_event);
- std::function<decltype(clEnqueueNDRangeKernel)> real_function;
+ std::function<decltype(clEnqueueNDRangeKernel)> real_clEnqueueNDRangeKernel;
+
+ // Inherited methods overridden:
+ void tune_kernel(ICLKernel &kernel) override;
private:
/** Find optimal LWS using brute-force approach
@@ -81,33 +96,37 @@ private:
cl::CommandQueue _queue;
cl::CommandQueue _queue_profiler;
cl::Event _kernel_event;
+ bool _tune_new_kernels;
};
-/* Function to be used to intercept kernel enqueues and store their OpenCL Event */
-class Interceptor
+class CLFileTuner : public CLTuner
{
public:
- explicit Interceptor(CLTuner &tuner);
+ /** Constructor
+ *
+ * @param[in] file_path File to load/store the tuning information from
+ * @param[in] update_file If true, save the new LWS table to the file on exit.
+ * @param[in] tune_new_kernels Find the optimal local workgroup size for kernels which are not present in the table ?
+ */
+ CLFileTuner(std::string file_path = "acl_tuner.csv", bool update_file = false, bool tune_new_kernels = false);
- /** clEnqueueNDRangeKernel interface
+ /** Save the content of the LWS table to file
+ */
+ void save_to_file() const;
+ /* Setter for update_file option
*
- * @param[in] command_queue A valid command-queue. The kernel will be queued for execution on the device associated with command_queue.
- * @param[in] kernel A valid kernel object. The OpenCL context associated with kernel and command_queue must be the same.
- * @param[in] work_dim The number of dimensions used to specify the global work-items and work-items in the work-group. work_dim must be greater than zero and less than or equal to CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS.
- * @param[in] gwo Global-Workgroup-Offset. It can be used to specify an array of work_dim unsigned values that describe the offset used to calculate the global ID of a work-item. If global_work_offset is NULL, the global IDs start at offset (0, 0, ... 0).
- * @param[in] gws Global-Workgroup-Size. Points to an array of work_dim unsigned values that describe the number of global work-items in work_dim dimensions that will execute the kernel function.
- * @param[in] lws Local-Workgroup-Size. Points to an array of work_dim unsigned values that describe the number of work-items that make up a work-group
- * @param[in] num_events_in_wait_list Number of events in the waiting list
- * @param[in] event_wait_list Event waiting list
- * @param[in] event OpenCL kernel event
+ * @param[in] update_file If true, save the new LWS table to the file on exit.
+ */
+ void set_update_file(bool update_file);
+ /** Destructor
*
- * @return the OpenCL status
+ * Will save the LWS table to the file if the CLFileTuner was created with update_file enabled.
*/
- cl_int operator()(cl_command_queue command_queue, cl_kernel kernel, cl_uint work_dim, const size_t *gwo, const size_t *gws, const size_t *lws, cl_uint num_events_in_wait_list,
- const cl_event *event_wait_list, cl_event *event);
+ ~CLFileTuner();
+ const std::string filename;
private:
- CLTuner &_tuner;
+ bool _update_file;
};
}
#endif /*__ARM_COMPUTE_CLTUNER_H__ */
diff --git a/src/graph/Graph.cpp b/src/graph/Graph.cpp
index b6c6822c36..b1698e4672 100644
--- a/src/graph/Graph.cpp
+++ b/src/graph/Graph.cpp
@@ -62,7 +62,7 @@ public:
std::unique_ptr<INode> _current_node{ nullptr };
ITensorObject *_current_output{ nullptr };
bool _info_enabled{ false };
- CLTuner _tuner{};
+ CLFileTuner _tuner{};
private:
ITensorObject *_current_input{ nullptr };
@@ -85,14 +85,9 @@ void Graph::graph_init(const bool use_cl_tuner)
// Check if OpenCL is available and initialize the scheduler
if(opencl_is_available())
{
- if(use_cl_tuner)
- {
- arm_compute::CLScheduler::get().default_init(&_pimpl->_tuner);
- }
- else
- {
- arm_compute::CLScheduler::get().default_init();
- }
+ _pimpl->_tuner.set_tune_new_kernels(use_cl_tuner);
+ _pimpl->_tuner.set_update_file(use_cl_tuner);
+ arm_compute::CLScheduler::get().default_init(&_pimpl->_tuner);
}
}
diff --git a/src/runtime/CL/CLTuner.cpp b/src/runtime/CL/CLTuner.cpp
index cf5b5bce2d..29ddbea460 100644
--- a/src/runtime/CL/CLTuner.cpp
+++ b/src/runtime/CL/CLTuner.cpp
@@ -27,44 +27,81 @@
#include "arm_compute/core/Error.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
+#include <fstream>
+#include <iostream>
#include <limits>
#include <string>
using namespace arm_compute;
-CLTuner::CLTuner()
- : real_function(nullptr), _lws_table(), _queue(), _queue_profiler(), _kernel_event()
+namespace
{
-}
+/* Function to be used to intercept kernel enqueues and store their OpenCL Event */
+class Interceptor
+{
+public:
+ explicit Interceptor(CLTuner &tuner);
+
+ /** clEnqueueNDRangeKernel interface
+ *
+ * @param[in] command_queue A valid command-queue. The kernel will be queued for execution on the device associated with command_queue.
+ * @param[in] kernel A valid kernel object. The OpenCL context associated with kernel and command_queue must be the same.
+ * @param[in] work_dim The number of dimensions used to specify the global work-items and work-items in the work-group. work_dim must be greater than zero and less than or equal to CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS.
+ * @param[in] gwo Global-Workgroup-Offset. It can be used to specify an array of work_dim unsigned values that describe the offset used to calculate the global ID of a work-item. If global_work_offset is NULL, the global IDs start at offset (0, 0, ... 0).
+ * @param[in] gws Global-Workgroup-Size. Points to an array of work_dim unsigned values that describe the number of global work-items in work_dim dimensions that will execute the kernel function.
+ * @param[in] lws Local-Workgroup-Size. Points to an array of work_dim unsigned values that describe the number of work-items that make up a work-group
+ * @param[in] num_events_in_wait_list Number of events in the waiting list
+ * @param[in] event_wait_list Event waiting list
+ * @param[in] event OpenCL kernel event
+ *
+ * @return the OpenCL status
+ */
+ cl_int operator()(cl_command_queue command_queue, cl_kernel kernel, cl_uint work_dim, const size_t *gwo, const size_t *gws, const size_t *lws, cl_uint num_events_in_wait_list,
+ const cl_event *event_wait_list, cl_event *event);
+
+private:
+ CLTuner &_tuner;
+};
-void CLTuner::set_cl_kernel_event(cl_event kernel_event)
+Interceptor::Interceptor(CLTuner &tuner)
+ : _tuner(tuner)
{
- _kernel_event = kernel_event;
}
-void CLTuner::tune_kernel(ICLKernel &kernel)
+cl_int Interceptor::operator()(cl_command_queue command_queue, cl_kernel kernel, cl_uint work_dim, const size_t *gwo, const size_t *gws, const size_t *lws, cl_uint num_events_in_wait_list,
+ const cl_event *event_wait_list, cl_event *event)
{
- if(real_function == nullptr)
- {
- real_function = CLSymbols::get().clEnqueueNDRangeKernel_ptr;
+ ARM_COMPUTE_ERROR_ON_MSG(event != nullptr, "Not supported");
+ ARM_COMPUTE_UNUSED(event);
- // Get the default queue
- _queue = CLScheduler::get().queue();
+ cl_event tmp;
+ cl_int retval = _tuner.real_clEnqueueNDRangeKernel(command_queue, kernel, work_dim, gwo, gws, lws, num_events_in_wait_list, event_wait_list, &tmp);
- // Check if we can use the OpenCL timer with the default queue
- cl_command_queue_properties props = _queue.getInfo<CL_QUEUE_PROPERTIES>();
+ // Set OpenCL event
+ _tuner.set_cl_kernel_event(tmp);
- if((props & CL_QUEUE_PROFILING_ENABLE) == 0)
- {
- // Set the queue for profiling
- _queue_profiler = cl::CommandQueue(CLScheduler::get().context(), props | CL_QUEUE_PROFILING_ENABLE);
- }
- else
- {
- _queue_profiler = _queue;
- }
- }
+ return retval;
+}
+
+} // namespace
+
+CLTuner::CLTuner(bool tune_new_kernels)
+ : real_clEnqueueNDRangeKernel(nullptr), _lws_table(), _queue(), _queue_profiler(), _kernel_event(), _tune_new_kernels(tune_new_kernels)
+{
+}
+void CLTuner::set_cl_kernel_event(cl_event kernel_event)
+{
+ _kernel_event = kernel_event;
+}
+
+void CLTuner::set_tune_new_kernels(bool tune_new_kernels)
+{
+ _tune_new_kernels = tune_new_kernels;
+}
+
+void CLTuner::tune_kernel(ICLKernel &kernel)
+{
// Get the configuration ID from the kernel
const std::string &config_id = kernel.config_id();
@@ -75,20 +112,17 @@ void CLTuner::tune_kernel(ICLKernel &kernel)
if(p == _lws_table.end())
{
- // Set profiler queue
- CLScheduler::get().set_queue(_queue_profiler);
-
- // Find the optimal LWS for the kernel
- cl::NDRange opt_lws = find_optimal_lws(kernel);
+ if(_tune_new_kernels)
+ {
+ // Find the optimal LWS for the kernel
+ cl::NDRange opt_lws = find_optimal_lws(kernel);
- // Insert the optimal LWS in the table
- _lws_table.emplace(config_id, opt_lws);
+ // Insert the optimal LWS in the table
+ add_lws_to_table(config_id, opt_lws);
- // Set Local-Workgroup-Size
- kernel.set_lws_hint(opt_lws);
-
- // Restore queue
- CLScheduler::get().set_queue(_queue);
+ // Set Local-Workgroup-Size
+ kernel.set_lws_hint(opt_lws);
+ }
}
else
{
@@ -98,8 +132,36 @@ void CLTuner::tune_kernel(ICLKernel &kernel)
}
}
+void CLTuner::add_lws_to_table(const std::string &kernel_id, cl::NDRange optimal_lws)
+{
+ _lws_table.emplace(kernel_id, optimal_lws);
+}
+
cl::NDRange CLTuner::find_optimal_lws(ICLKernel &kernel)
{
+ if(real_clEnqueueNDRangeKernel == nullptr)
+ {
+ real_clEnqueueNDRangeKernel = CLSymbols::get().clEnqueueNDRangeKernel_ptr;
+
+ // Get the default queue
+ _queue = CLScheduler::get().queue();
+
+ // Check if we can use the OpenCL timer with the default queue
+ cl_command_queue_properties props = _queue.getInfo<CL_QUEUE_PROPERTIES>();
+
+ if((props & CL_QUEUE_PROFILING_ENABLE) == 0)
+ {
+ // Set the queue for profiling
+ _queue_profiler = cl::CommandQueue(CLScheduler::get().context(), props | CL_QUEUE_PROFILING_ENABLE);
+ }
+ else
+ {
+ _queue_profiler = _queue;
+ }
+ }
+ // Set profiler queue
+ CLScheduler::get().set_queue(_queue_profiler);
+
// Start intercepting enqueues:
CLSymbols::get().clEnqueueNDRangeKernel_ptr = Interceptor(*this);
@@ -170,7 +232,10 @@ cl::NDRange CLTuner::find_optimal_lws(ICLKernel &kernel)
}
// Restore real function
- CLSymbols::get().clEnqueueNDRangeKernel_ptr = real_function;
+ CLSymbols::get().clEnqueueNDRangeKernel_ptr = real_clEnqueueNDRangeKernel;
+
+ // Restore queue
+ CLScheduler::get().set_queue(_queue);
return opt_lws;
}
@@ -181,27 +246,74 @@ void CLTuner::import_lws_table(const std::unordered_map<std::string, cl::NDRange
_lws_table = lws_table;
}
-const std::unordered_map<std::string, cl::NDRange> &CLTuner::export_lws_table()
+const std::unordered_map<std::string, cl::NDRange> &CLTuner::export_lws_table() const
{
return _lws_table;
}
-Interceptor::Interceptor(CLTuner &tuner)
- : _tuner(tuner)
+CLFileTuner::CLFileTuner(std::string file_path, bool update_file, bool tune_new_kernels)
+ : CLTuner(tune_new_kernels), filename(std::move(file_path)), _update_file(update_file)
{
+ std::ifstream fs(filename);
+ if(fs.is_open())
+ {
+ std::string line;
+ while(!std::getline(fs, line).fail())
+ {
+ std::istringstream ss(line);
+ std::string token;
+ if(!std::getline(ss, token, ';'))
+ {
+ continue;
+ }
+ std::string kernel_id = token;
+ cl::NDRange lws(1, 1, 1);
+ bool lws_is_valid = true;
+ for(int i = 0; i < 3; i++)
+ {
+ if(std::getline(ss, token, ';').fail())
+ {
+ lws_is_valid = false;
+ break;
+ }
+ lws.get()[i] = support::cpp11::stoi(token);
+ }
+ if(!lws_is_valid)
+ {
+ continue; // Skip this kernel
+ }
+
+ // If all dimensions are 0: reset to NullRange (i.e nullptr)
+ if(lws[0] == 0 && lws[1] == 0 && lws[2] == 0)
+ {
+ lws = cl::NullRange;
+ }
+ add_lws_to_table(kernel_id, lws);
+ }
+ fs.close();
+ }
}
-cl_int Interceptor::operator()(cl_command_queue command_queue, cl_kernel kernel, cl_uint work_dim, const size_t *gwo, const size_t *gws, const size_t *lws, cl_uint num_events_in_wait_list,
- const cl_event *event_wait_list, cl_event *event)
+void CLFileTuner::save_to_file() const
{
- ARM_COMPUTE_ERROR_ON_MSG(event != nullptr, "Not supported");
- ARM_COMPUTE_UNUSED(event);
-
- cl_event tmp;
- cl_int retval = _tuner.real_function(command_queue, kernel, work_dim, gwo, gws, lws, num_events_in_wait_list, event_wait_list, &tmp);
+ const std::unordered_map<std::string, cl::NDRange> &table = export_lws_table();
+ std::ofstream fs(filename);
+ for(auto kernel_data : table)
+ {
+ fs << kernel_data.first << ";" << kernel_data.second[0] << ";" << kernel_data.second[1] << ";" << kernel_data.second[2] << std::endl;
+ }
+ fs.close();
+}
- // Set OpenCL event
- _tuner.set_cl_kernel_event(tmp);
+CLFileTuner::~CLFileTuner()
+{
+ if(_update_file)
+ {
+ save_to_file();
+ }
+}
- return retval;
+void CLFileTuner::set_update_file(bool update_file)
+{
+ _update_file = update_file;
}