From 85e6f518ace17547d6f35ed0e1cfbc39ffb95736 Mon Sep 17 00:00:00 2001 From: Gian Marco Date: Thu, 1 Feb 2018 16:57:48 +0000 Subject: COMPMID-891 - Use OpenCL timer in CLTuner Change-Id: I84a914c13b162c4f74321c9cafc30a18ad4ebbdb Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/118797 Tested-by: Jenkins Reviewed-by: Georgios Pinitas --- arm_compute/core/CL/OpenCL.h | 5 +- arm_compute/runtime/CL/CLTuner.h | 42 +++++++- examples/cl_sgemm.cpp | 29 +----- src/core/CL/OpenCL.cpp | 62 +++++++++++- .../CLDepthwiseConvolutionLayer3x3Kernel.cpp | 14 +++ src/runtime/CL/CLTuner.cpp | 108 +++++++++++++++++---- 6 files changed, 213 insertions(+), 47 deletions(-) diff --git a/arm_compute/core/CL/OpenCL.h b/arm_compute/core/CL/OpenCL.h index 18ba2902bf..405d5cebd7 100644 --- a/arm_compute/core/CL/OpenCL.h +++ b/arm_compute/core/CL/OpenCL.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, 2017 ARM Limited. + * Copyright (c) 2016-2018 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -93,6 +93,9 @@ public: DECLARE_FUNCTION_PTR(clRetainEvent); DECLARE_FUNCTION_PTR(clGetPlatformIDs); DECLARE_FUNCTION_PTR(clGetKernelWorkGroupInfo); + DECLARE_FUNCTION_PTR(clGetCommandQueueInfo); + DECLARE_FUNCTION_PTR(clGetKernelInfo); + DECLARE_FUNCTION_PTR(clGetEventProfilingInfo); #undef DECLARE_FUNCTION_PTR diff --git a/arm_compute/runtime/CL/CLTuner.h b/arm_compute/runtime/CL/CLTuner.h index 8a7b96aa09..386994682d 100644 --- a/arm_compute/runtime/CL/CLTuner.h +++ b/arm_compute/runtime/CL/CLTuner.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017 ARM Limited. + * Copyright (c) 2017-2018 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -58,6 +58,16 @@ public: // Inherited methods overridden: void tune_kernel(ICLKernel &kernel) override; + /** Set the OpenCL kernel event + * + * @note The interceptor can use this function to store the event associated to the OpenCL kernel + * + * @param[in] kernel_event The OpenCL kernel event + */ + void set_cl_kernel_event(cl_event kernel_event); + + std::function real_function; + private: /** Find optimal LWS using brute-force approach * @@ -68,6 +78,36 @@ private: cl::NDRange find_optimal_lws(ICLKernel &kernel); std::unordered_map _lws_table; + cl::CommandQueue _queue; + cl::CommandQueue _queue_profiler; + cl::Event _kernel_event; +}; + +/* Function to be used to intercept kernel enqueues and store their OpenCL Event */ +class Interceptor +{ +public: + explicit Interceptor(CLTuner &tuner); + + /** clEnqueueNDRangeKernel interface + * + * @param[in] command_queue A valid command-queue. The kernel will be queued for execution on the device associated with command_queue. + * @param[in] kernel A valid kernel object. The OpenCL context associated with kernel and command_queue must be the same. + * @param[in] work_dim The number of dimensions used to specify the global work-items and work-items in the work-group. work_dim must be greater than zero and less than or equal to CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS. + * @param[in] gwo Global-Workgroup-Offset. It can be used to specify an array of work_dim unsigned values that describe the offset used to calculate the global ID of a work-item. If global_work_offset is NULL, the global IDs start at offset (0, 0, ... 0). + * @param[in] gws Global-Workgroup-Size. Points to an array of work_dim unsigned values that describe the number of global work-items in work_dim dimensions that will execute the kernel function. + * @param[in] lws Local-Workgroup-Size. Points to an array of work_dim unsigned values that describe the number of work-items that make up a work-group + * @param[in] num_events_in_wait_list Number of events in the waiting list + * @param[in] event_wait_list Event waiting list + * @param[in] event OpenCL kernel event + * + * @return the OpenCL status + */ + cl_int operator()(cl_command_queue command_queue, cl_kernel kernel, cl_uint work_dim, const size_t *gwo, const size_t *gws, const size_t *lws, cl_uint num_events_in_wait_list, + const cl_event *event_wait_list, cl_event *event); + +private: + CLTuner &_tuner; }; } #endif /*__ARM_COMPUTE_CLTUNER_H__ */ diff --git a/examples/cl_sgemm.cpp b/examples/cl_sgemm.cpp index f2c63985f6..fa57885450 100644 --- a/examples/cl_sgemm.cpp +++ b/examples/cl_sgemm.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017, 2018 ARM Limited. + * Copyright (c) 2017-2018 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -175,32 +175,7 @@ public: } void do_teardown() override { - if(output_filename.empty()) /* If the inputs were not files, print the results */ - { - std::cout << "\nMatrix 1:" << std::endl; - src0.map(true); - src0.print(std::cout, IOFormatInfo()); - src0.unmap(); - - std::cout << "Matrix 2:" << std::endl; - src1.map(true); - src1.print(std::cout, IOFormatInfo()); - src1.unmap(); - - std::cout << "Matrix 3:" << std::endl; - src2.map(true); - src2.print(std::cout, IOFormatInfo()); - src2.unmap(); - - std::cout << "Alpha:" << alpha << "\n\n"; - std::cout << "Beta:" << beta << "\n\n"; - - std::cout << "Output Matrix:" << std::endl; - dst.map(true); - dst.print(std::cout, IOFormatInfo()); - dst.unmap(); - } - else /* Save to .npy file */ + if(!output_filename.empty()) /* Save to .npy file */ { save_to_npy(dst, output_filename, is_fortran); } diff --git a/src/core/CL/OpenCL.cpp b/src/core/CL/OpenCL.cpp index 726279c6ea..06d10a450e 100644 --- a/src/core/CL/OpenCL.cpp +++ b/src/core/CL/OpenCL.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017 ARM Limited. + * Copyright (c) 2017-2018 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -108,6 +108,9 @@ bool CLSymbols::load(const std::string &library) LOAD_FUNCTION_PTR(clRetainEvent, handle); LOAD_FUNCTION_PTR(clGetPlatformIDs, handle); LOAD_FUNCTION_PTR(clGetKernelWorkGroupInfo, handle); + LOAD_FUNCTION_PTR(clGetCommandQueueInfo, handle); + LOAD_FUNCTION_PTR(clGetKernelInfo, handle); + LOAD_FUNCTION_PTR(clGetEventProfilingInfo, handle); #undef LOAD_FUNCTION_PTR @@ -729,3 +732,60 @@ clGetKernelWorkGroupInfo(cl_kernel kernel, return CL_OUT_OF_RESOURCES; } } + +cl_int +clGetCommandQueueInfo(cl_command_queue command_queue, + cl_command_queue_info param_name, + size_t param_value_size, + void *param_value, + size_t *param_value_size_ret) +{ + arm_compute::CLSymbols::get().load_default(); + auto func = arm_compute::CLSymbols::get().clGetCommandQueueInfo_ptr; + if(func != nullptr) + { + return func(command_queue, param_name, param_value_size, param_value, param_value_size_ret); + } + else + { + return CL_OUT_OF_RESOURCES; + } +} + +cl_int +clGetKernelInfo(cl_kernel kernel, + cl_kernel_info param_name, + size_t param_value_size, + void *param_value, + size_t *param_value_size_ret) +{ + arm_compute::CLSymbols::get().load_default(); + auto func = arm_compute::CLSymbols::get().clGetKernelInfo_ptr; + if(func != nullptr) + { + return func(kernel, param_name, param_value_size, param_value, param_value_size_ret); + } + else + { + return CL_OUT_OF_RESOURCES; + } +} + +cl_int +clGetEventProfilingInfo(cl_event event, + cl_profiling_info param_name, + size_t param_value_size, + void *param_value, + size_t *param_value_size_ret) +{ + arm_compute::CLSymbols::get().load_default(); + auto func = arm_compute::CLSymbols::get().clGetEventProfilingInfo_ptr; + if(func != nullptr) + { + return func(event, param_name, param_value_size, param_value, param_value_size_ret); + } + else + { + return CL_OUT_OF_RESOURCES; + } +} \ No newline at end of file diff --git a/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3Kernel.cpp b/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3Kernel.cpp index 1c0fe9984f..a9167ee859 100644 --- a/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3Kernel.cpp +++ b/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3Kernel.cpp @@ -174,6 +174,20 @@ void CLDepthwiseConvolutionLayer3x3Kernel::configure(const ICLTensor *input, con output_access.set_valid_region(win, ValidRegion(Coordinates(), output->info()->tensor_shape())); ICLKernel::configure(win); + + // Set config_id for enabling LWS tuning + _config_id = "depthwise_convolution3x3_"; + _config_id += lower_string(string_from_data_type(input->info()->data_type())); + _config_id += "_"; + _config_id += support::cpp11::to_string(input->info()->dimension(0)); + _config_id += "_"; + _config_id += support::cpp11::to_string(input->info()->dimension(1)); + _config_id += "_"; + _config_id += support::cpp11::to_string(input->info()->dimension(2)); + _config_id += "_"; + _config_id += support::cpp11::to_string(output->info()->dimension(0)); + _config_id += "_"; + _config_id += support::cpp11::to_string(output->info()->dimension(1)); } void CLDepthwiseConvolutionLayer3x3Kernel::run(const Window &window, cl::CommandQueue &queue) diff --git a/src/runtime/CL/CLTuner.cpp b/src/runtime/CL/CLTuner.cpp index 351f6751c3..917d56756a 100644 --- a/src/runtime/CL/CLTuner.cpp +++ b/src/runtime/CL/CLTuner.cpp @@ -24,21 +24,47 @@ #include "arm_compute/runtime/CL/CLTuner.h" #include "arm_compute/core/CL/ICLKernel.h" +#include "arm_compute/core/Error.h" #include "arm_compute/runtime/CL/CLScheduler.h" -#include #include #include using namespace arm_compute; CLTuner::CLTuner() - : _lws_table() + : real_function(nullptr), _lws_table(), _queue(), _queue_profiler(), _kernel_event() { } +void CLTuner::set_cl_kernel_event(cl_event kernel_event) +{ + _kernel_event = kernel_event; +} + void CLTuner::tune_kernel(ICLKernel &kernel) { + if(real_function == nullptr) + { + real_function = CLSymbols::get().clEnqueueNDRangeKernel_ptr; + + // Get the default queue + _queue = CLScheduler::get().queue(); + + // Check if we can use the OpenCL timer with the default queue + cl_command_queue_properties props = _queue.getInfo(); + + if((props & CL_QUEUE_PROFILING_ENABLE) == 0) + { + // Set the queue for profiling + _queue_profiler = cl::CommandQueue(CLScheduler::get().context(), props | CL_QUEUE_PROFILING_ENABLE); + } + else + { + _queue_profiler = _queue; + } + } + // Get the configuration ID from the kernel const std::string &config_id = kernel.config_id(); @@ -49,6 +75,9 @@ void CLTuner::tune_kernel(ICLKernel &kernel) if(p == _lws_table.end()) { + // Set profiler queue + CLScheduler::get().set_queue(_queue_profiler); + // Find the optimal LWS for the kernel cl::NDRange opt_lws = find_optimal_lws(kernel); @@ -57,6 +86,9 @@ void CLTuner::tune_kernel(ICLKernel &kernel) // Set Local-Workgroup-Size kernel.set_lws_hint(opt_lws); + + // Restore queue + CLScheduler::get().set_queue(_queue); } else { @@ -68,11 +100,12 @@ void CLTuner::tune_kernel(ICLKernel &kernel) cl::NDRange CLTuner::find_optimal_lws(ICLKernel &kernel) { - cl::CommandQueue q = CLScheduler::get().queue(); + // Start intercepting enqueues: + CLSymbols::get().clEnqueueNDRangeKernel_ptr = Interceptor(*this); - double min_exec_time = std::numeric_limits::max(); + cl_ulong min_exec_time = std::numeric_limits::max(); - cl::NDRange opt_lws = cl::NDRange(1, 1, 1); + cl::NDRange opt_lws = cl::NullRange; const int x_step = std::max(1, kernel.window().x().step()); const int y_step = std::max(1, kernel.window().y().step()); @@ -81,43 +114,64 @@ cl::NDRange CLTuner::find_optimal_lws(ICLKernel &kernel) const int y_end = kernel.window().y().end() - kernel.window().y().start() / y_step > 1 ? 16 : 1; const int z_end = kernel.window().z().end() - kernel.window().z().start() / z_step > 1 ? 8 : 1; + // First run using the default LWS + { + cl::NDRange lws_test = cl::NullRange; + + kernel.set_lws_hint(lws_test); + + // Run the kernel + kernel.run(kernel.window(), _queue_profiler); + + CLScheduler::get().sync(); + + const cl_ulong start = _kernel_event.getProfilingInfo(); + const cl_ulong end = _kernel_event.getProfilingInfo(); + const cl_ulong diff = end - start; + + min_exec_time = diff; + } + for(int z = 1; z <= z_end; ++z) { for(int y = 1; y <= y_end; ++y) { for(int x = 1; x <= x_end; ++x) { - if(x == 1 && y == 1 && z == 1) + cl::NDRange lws_test = cl::NDRange(x, y, z); + + const bool invalid_lws = (x * y * z > static_cast(kernel.get_max_workgroup_size())) || (x == 1 && y == 1 && z == 1); + + if(invalid_lws) { continue; } - cl::NDRange lws_test = cl::NDRange(x, y, z); - //Set the Local-Workgroup-Size kernel.set_lws_hint(lws_test); - auto t_start = std::chrono::high_resolution_clock::now(); - - // Run - kernel.run(kernel.window(), q); + // Run the kernel + kernel.run(kernel.window(), _queue_profiler); CLScheduler::get().sync(); - auto t_stop = std::chrono::high_resolution_clock::now(); - - std::chrono::duration fp_nano = t_stop - t_start; + const cl_ulong start = _kernel_event.getProfilingInfo(); + const cl_ulong end = _kernel_event.getProfilingInfo(); + const cl_ulong diff = end - start; // Check the execution time - if(fp_nano.count() < min_exec_time) + if(diff < min_exec_time) { - min_exec_time = fp_nano.count(); + min_exec_time = diff; opt_lws = cl::NDRange(x, y, z); } } } } + // Restore real function + CLSymbols::get().clEnqueueNDRangeKernel_ptr = real_function; + return opt_lws; } @@ -130,4 +184,24 @@ void CLTuner::import_lws_table(const std::unordered_map &CLTuner::export_lws_table() { return _lws_table; +} + +Interceptor::Interceptor(CLTuner &tuner) + : _tuner(tuner) +{ +} + +cl_int Interceptor::operator()(cl_command_queue command_queue, cl_kernel kernel, cl_uint work_dim, const size_t *gwo, const size_t *gws, const size_t *lws, cl_uint num_events_in_wait_list, + const cl_event *event_wait_list, cl_event *event) +{ + ARM_COMPUTE_ERROR_ON_MSG(event != nullptr, "Not supported"); + ARM_COMPUTE_UNUSED(event); + + cl_event tmp; + cl_int retval = _tuner.real_function(command_queue, kernel, work_dim, gwo, gws, lws, num_events_in_wait_list, event_wait_list, &tmp); + + // Set OpenCL event + _tuner.set_cl_kernel_event(tmp); + + return retval; } \ No newline at end of file -- cgit v1.2.1