aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorGian Marco <gianmarco.iodice@arm.com>2018-02-01 16:57:48 +0000
committerAnthony Barbier <anthony.barbier@arm.com>2018-11-02 16:45:42 +0000
commit85e6f518ace17547d6f35ed0e1cfbc39ffb95736 (patch)
treeb3cc77dfeaafe646c06abdbc9a03f88f7aa196c4
parent76faef88284e6fd51f53b23063374d3d3a884e4f (diff)
downloadComputeLibrary-85e6f518ace17547d6f35ed0e1cfbc39ffb95736.tar.gz
COMPMID-891 - Use OpenCL timer in CLTuner
Change-Id: I84a914c13b162c4f74321c9cafc30a18ad4ebbdb Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/118797 Tested-by: Jenkins <bsgcomp@arm.com> Reviewed-by: Georgios Pinitas <georgios.pinitas@arm.com>
-rw-r--r--arm_compute/core/CL/OpenCL.h5
-rw-r--r--arm_compute/runtime/CL/CLTuner.h42
-rw-r--r--examples/cl_sgemm.cpp29
-rw-r--r--src/core/CL/OpenCL.cpp62
-rw-r--r--src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3Kernel.cpp14
-rw-r--r--src/runtime/CL/CLTuner.cpp108
6 files changed, 213 insertions, 47 deletions
diff --git a/arm_compute/core/CL/OpenCL.h b/arm_compute/core/CL/OpenCL.h
index 18ba2902bf..405d5cebd7 100644
--- a/arm_compute/core/CL/OpenCL.h
+++ b/arm_compute/core/CL/OpenCL.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -93,6 +93,9 @@ public:
DECLARE_FUNCTION_PTR(clRetainEvent);
DECLARE_FUNCTION_PTR(clGetPlatformIDs);
DECLARE_FUNCTION_PTR(clGetKernelWorkGroupInfo);
+ DECLARE_FUNCTION_PTR(clGetCommandQueueInfo);
+ DECLARE_FUNCTION_PTR(clGetKernelInfo);
+ DECLARE_FUNCTION_PTR(clGetEventProfilingInfo);
#undef DECLARE_FUNCTION_PTR
diff --git a/arm_compute/runtime/CL/CLTuner.h b/arm_compute/runtime/CL/CLTuner.h
index 8a7b96aa09..386994682d 100644
--- a/arm_compute/runtime/CL/CLTuner.h
+++ b/arm_compute/runtime/CL/CLTuner.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -58,6 +58,16 @@ public:
// Inherited methods overridden:
void tune_kernel(ICLKernel &kernel) override;
+ /** Set the OpenCL kernel event
+ *
+ * @note The interceptor can use this function to store the event associated to the OpenCL kernel
+ *
+ * @param[in] kernel_event The OpenCL kernel event
+ */
+ void set_cl_kernel_event(cl_event kernel_event);
+
+ std::function<decltype(clEnqueueNDRangeKernel)> real_function;
+
private:
/** Find optimal LWS using brute-force approach
*
@@ -68,6 +78,36 @@ private:
cl::NDRange find_optimal_lws(ICLKernel &kernel);
std::unordered_map<std::string, cl::NDRange> _lws_table;
+ cl::CommandQueue _queue;
+ cl::CommandQueue _queue_profiler;
+ cl::Event _kernel_event;
+};
+
+/* Function to be used to intercept kernel enqueues and store their OpenCL Event */
+class Interceptor
+{
+public:
+ explicit Interceptor(CLTuner &tuner);
+
+ /** clEnqueueNDRangeKernel interface
+ *
+ * @param[in] command_queue A valid command-queue. The kernel will be queued for execution on the device associated with command_queue.
+ * @param[in] kernel A valid kernel object. The OpenCL context associated with kernel and command_queue must be the same.
+ * @param[in] work_dim The number of dimensions used to specify the global work-items and work-items in the work-group. work_dim must be greater than zero and less than or equal to CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS.
+ * @param[in] gwo Global-Workgroup-Offset. It can be used to specify an array of work_dim unsigned values that describe the offset used to calculate the global ID of a work-item. If global_work_offset is NULL, the global IDs start at offset (0, 0, ... 0).
+ * @param[in] gws Global-Workgroup-Size. Points to an array of work_dim unsigned values that describe the number of global work-items in work_dim dimensions that will execute the kernel function.
+ * @param[in] lws Local-Workgroup-Size. Points to an array of work_dim unsigned values that describe the number of work-items that make up a work-group
+ * @param[in] num_events_in_wait_list Number of events in the waiting list
+ * @param[in] event_wait_list Event waiting list
+ * @param[in] event OpenCL kernel event
+ *
+ * @return the OpenCL status
+ */
+ cl_int operator()(cl_command_queue command_queue, cl_kernel kernel, cl_uint work_dim, const size_t *gwo, const size_t *gws, const size_t *lws, cl_uint num_events_in_wait_list,
+ const cl_event *event_wait_list, cl_event *event);
+
+private:
+ CLTuner &_tuner;
};
}
#endif /*__ARM_COMPUTE_CLTUNER_H__ */
diff --git a/examples/cl_sgemm.cpp b/examples/cl_sgemm.cpp
index f2c63985f6..fa57885450 100644
--- a/examples/cl_sgemm.cpp
+++ b/examples/cl_sgemm.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017, 2018 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -175,32 +175,7 @@ public:
}
void do_teardown() override
{
- if(output_filename.empty()) /* If the inputs were not files, print the results */
- {
- std::cout << "\nMatrix 1:" << std::endl;
- src0.map(true);
- src0.print(std::cout, IOFormatInfo());
- src0.unmap();
-
- std::cout << "Matrix 2:" << std::endl;
- src1.map(true);
- src1.print(std::cout, IOFormatInfo());
- src1.unmap();
-
- std::cout << "Matrix 3:" << std::endl;
- src2.map(true);
- src2.print(std::cout, IOFormatInfo());
- src2.unmap();
-
- std::cout << "Alpha:" << alpha << "\n\n";
- std::cout << "Beta:" << beta << "\n\n";
-
- std::cout << "Output Matrix:" << std::endl;
- dst.map(true);
- dst.print(std::cout, IOFormatInfo());
- dst.unmap();
- }
- else /* Save to .npy file */
+ if(!output_filename.empty()) /* Save to .npy file */
{
save_to_npy(dst, output_filename, is_fortran);
}
diff --git a/src/core/CL/OpenCL.cpp b/src/core/CL/OpenCL.cpp
index 726279c6ea..06d10a450e 100644
--- a/src/core/CL/OpenCL.cpp
+++ b/src/core/CL/OpenCL.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -108,6 +108,9 @@ bool CLSymbols::load(const std::string &library)
LOAD_FUNCTION_PTR(clRetainEvent, handle);
LOAD_FUNCTION_PTR(clGetPlatformIDs, handle);
LOAD_FUNCTION_PTR(clGetKernelWorkGroupInfo, handle);
+ LOAD_FUNCTION_PTR(clGetCommandQueueInfo, handle);
+ LOAD_FUNCTION_PTR(clGetKernelInfo, handle);
+ LOAD_FUNCTION_PTR(clGetEventProfilingInfo, handle);
#undef LOAD_FUNCTION_PTR
@@ -729,3 +732,60 @@ clGetKernelWorkGroupInfo(cl_kernel kernel,
return CL_OUT_OF_RESOURCES;
}
}
+
+cl_int
+clGetCommandQueueInfo(cl_command_queue command_queue,
+ cl_command_queue_info param_name,
+ size_t param_value_size,
+ void *param_value,
+ size_t *param_value_size_ret)
+{
+ arm_compute::CLSymbols::get().load_default();
+ auto func = arm_compute::CLSymbols::get().clGetCommandQueueInfo_ptr;
+ if(func != nullptr)
+ {
+ return func(command_queue, param_name, param_value_size, param_value, param_value_size_ret);
+ }
+ else
+ {
+ return CL_OUT_OF_RESOURCES;
+ }
+}
+
+cl_int
+clGetKernelInfo(cl_kernel kernel,
+ cl_kernel_info param_name,
+ size_t param_value_size,
+ void *param_value,
+ size_t *param_value_size_ret)
+{
+ arm_compute::CLSymbols::get().load_default();
+ auto func = arm_compute::CLSymbols::get().clGetKernelInfo_ptr;
+ if(func != nullptr)
+ {
+ return func(kernel, param_name, param_value_size, param_value, param_value_size_ret);
+ }
+ else
+ {
+ return CL_OUT_OF_RESOURCES;
+ }
+}
+
+cl_int
+clGetEventProfilingInfo(cl_event event,
+ cl_profiling_info param_name,
+ size_t param_value_size,
+ void *param_value,
+ size_t *param_value_size_ret)
+{
+ arm_compute::CLSymbols::get().load_default();
+ auto func = arm_compute::CLSymbols::get().clGetEventProfilingInfo_ptr;
+ if(func != nullptr)
+ {
+ return func(event, param_name, param_value_size, param_value, param_value_size_ret);
+ }
+ else
+ {
+ return CL_OUT_OF_RESOURCES;
+ }
+} \ No newline at end of file
diff --git a/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3Kernel.cpp b/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3Kernel.cpp
index 1c0fe9984f..a9167ee859 100644
--- a/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3Kernel.cpp
+++ b/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3Kernel.cpp
@@ -174,6 +174,20 @@ void CLDepthwiseConvolutionLayer3x3Kernel::configure(const ICLTensor *input, con
output_access.set_valid_region(win, ValidRegion(Coordinates(), output->info()->tensor_shape()));
ICLKernel::configure(win);
+
+ // Set config_id for enabling LWS tuning
+ _config_id = "depthwise_convolution3x3_";
+ _config_id += lower_string(string_from_data_type(input->info()->data_type()));
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(input->info()->dimension(0));
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(input->info()->dimension(1));
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(input->info()->dimension(2));
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(output->info()->dimension(0));
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(output->info()->dimension(1));
}
void CLDepthwiseConvolutionLayer3x3Kernel::run(const Window &window, cl::CommandQueue &queue)
diff --git a/src/runtime/CL/CLTuner.cpp b/src/runtime/CL/CLTuner.cpp
index 351f6751c3..917d56756a 100644
--- a/src/runtime/CL/CLTuner.cpp
+++ b/src/runtime/CL/CLTuner.cpp
@@ -24,21 +24,47 @@
#include "arm_compute/runtime/CL/CLTuner.h"
#include "arm_compute/core/CL/ICLKernel.h"
+#include "arm_compute/core/Error.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
-#include <chrono>
#include <limits>
#include <string>
using namespace arm_compute;
CLTuner::CLTuner()
- : _lws_table()
+ : real_function(nullptr), _lws_table(), _queue(), _queue_profiler(), _kernel_event()
{
}
+void CLTuner::set_cl_kernel_event(cl_event kernel_event)
+{
+ _kernel_event = kernel_event;
+}
+
void CLTuner::tune_kernel(ICLKernel &kernel)
{
+ if(real_function == nullptr)
+ {
+ real_function = CLSymbols::get().clEnqueueNDRangeKernel_ptr;
+
+ // Get the default queue
+ _queue = CLScheduler::get().queue();
+
+ // Check if we can use the OpenCL timer with the default queue
+ cl_command_queue_properties props = _queue.getInfo<CL_QUEUE_PROPERTIES>();
+
+ if((props & CL_QUEUE_PROFILING_ENABLE) == 0)
+ {
+ // Set the queue for profiling
+ _queue_profiler = cl::CommandQueue(CLScheduler::get().context(), props | CL_QUEUE_PROFILING_ENABLE);
+ }
+ else
+ {
+ _queue_profiler = _queue;
+ }
+ }
+
// Get the configuration ID from the kernel
const std::string &config_id = kernel.config_id();
@@ -49,6 +75,9 @@ void CLTuner::tune_kernel(ICLKernel &kernel)
if(p == _lws_table.end())
{
+ // Set profiler queue
+ CLScheduler::get().set_queue(_queue_profiler);
+
// Find the optimal LWS for the kernel
cl::NDRange opt_lws = find_optimal_lws(kernel);
@@ -57,6 +86,9 @@ void CLTuner::tune_kernel(ICLKernel &kernel)
// Set Local-Workgroup-Size
kernel.set_lws_hint(opt_lws);
+
+ // Restore queue
+ CLScheduler::get().set_queue(_queue);
}
else
{
@@ -68,11 +100,12 @@ void CLTuner::tune_kernel(ICLKernel &kernel)
cl::NDRange CLTuner::find_optimal_lws(ICLKernel &kernel)
{
- cl::CommandQueue q = CLScheduler::get().queue();
+ // Start intercepting enqueues:
+ CLSymbols::get().clEnqueueNDRangeKernel_ptr = Interceptor(*this);
- double min_exec_time = std::numeric_limits<double>::max();
+ cl_ulong min_exec_time = std::numeric_limits<cl_ulong>::max();
- cl::NDRange opt_lws = cl::NDRange(1, 1, 1);
+ cl::NDRange opt_lws = cl::NullRange;
const int x_step = std::max(1, kernel.window().x().step());
const int y_step = std::max(1, kernel.window().y().step());
@@ -81,43 +114,64 @@ cl::NDRange CLTuner::find_optimal_lws(ICLKernel &kernel)
const int y_end = kernel.window().y().end() - kernel.window().y().start() / y_step > 1 ? 16 : 1;
const int z_end = kernel.window().z().end() - kernel.window().z().start() / z_step > 1 ? 8 : 1;
+ // First run using the default LWS
+ {
+ cl::NDRange lws_test = cl::NullRange;
+
+ kernel.set_lws_hint(lws_test);
+
+ // Run the kernel
+ kernel.run(kernel.window(), _queue_profiler);
+
+ CLScheduler::get().sync();
+
+ const cl_ulong start = _kernel_event.getProfilingInfo<CL_PROFILING_COMMAND_START>();
+ const cl_ulong end = _kernel_event.getProfilingInfo<CL_PROFILING_COMMAND_END>();
+ const cl_ulong diff = end - start;
+
+ min_exec_time = diff;
+ }
+
for(int z = 1; z <= z_end; ++z)
{
for(int y = 1; y <= y_end; ++y)
{
for(int x = 1; x <= x_end; ++x)
{
- if(x == 1 && y == 1 && z == 1)
+ cl::NDRange lws_test = cl::NDRange(x, y, z);
+
+ const bool invalid_lws = (x * y * z > static_cast<int>(kernel.get_max_workgroup_size())) || (x == 1 && y == 1 && z == 1);
+
+ if(invalid_lws)
{
continue;
}
- cl::NDRange lws_test = cl::NDRange(x, y, z);
-
//Set the Local-Workgroup-Size
kernel.set_lws_hint(lws_test);
- auto t_start = std::chrono::high_resolution_clock::now();
-
- // Run
- kernel.run(kernel.window(), q);
+ // Run the kernel
+ kernel.run(kernel.window(), _queue_profiler);
CLScheduler::get().sync();
- auto t_stop = std::chrono::high_resolution_clock::now();
-
- std::chrono::duration<double, std::nano> fp_nano = t_stop - t_start;
+ const cl_ulong start = _kernel_event.getProfilingInfo<CL_PROFILING_COMMAND_START>();
+ const cl_ulong end = _kernel_event.getProfilingInfo<CL_PROFILING_COMMAND_END>();
+ const cl_ulong diff = end - start;
// Check the execution time
- if(fp_nano.count() < min_exec_time)
+ if(diff < min_exec_time)
{
- min_exec_time = fp_nano.count();
+ min_exec_time = diff;
opt_lws = cl::NDRange(x, y, z);
}
}
}
}
+ // Restore real function
+ CLSymbols::get().clEnqueueNDRangeKernel_ptr = real_function;
+
return opt_lws;
}
@@ -130,4 +184,24 @@ void CLTuner::import_lws_table(const std::unordered_map<std::string, cl::NDRange
const std::unordered_map<std::string, cl::NDRange> &CLTuner::export_lws_table()
{
return _lws_table;
+}
+
+Interceptor::Interceptor(CLTuner &tuner)
+ : _tuner(tuner)
+{
+}
+
+cl_int Interceptor::operator()(cl_command_queue command_queue, cl_kernel kernel, cl_uint work_dim, const size_t *gwo, const size_t *gws, const size_t *lws, cl_uint num_events_in_wait_list,
+ const cl_event *event_wait_list, cl_event *event)
+{
+ ARM_COMPUTE_ERROR_ON_MSG(event != nullptr, "Not supported");
+ ARM_COMPUTE_UNUSED(event);
+
+ cl_event tmp;
+ cl_int retval = _tuner.real_function(command_queue, kernel, work_dim, gwo, gws, lws, num_events_in_wait_list, event_wait_list, &tmp);
+
+ // Set OpenCL event
+ _tuner.set_cl_kernel_event(tmp);
+
+ return retval;
} \ No newline at end of file