diff options
Diffstat (limited to 'src/runtime/CL/CLScheduler.cpp')
-rw-r--r-- | src/runtime/CL/CLScheduler.cpp | 137 |
1 files changed, 105 insertions, 32 deletions
diff --git a/src/runtime/CL/CLScheduler.cpp b/src/runtime/CL/CLScheduler.cpp index e78eaa482f..f0a42f55fd 100644 --- a/src/runtime/CL/CLScheduler.cpp +++ b/src/runtime/CL/CLScheduler.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2019 ARM Limited. + * Copyright (c) 2016-2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -24,10 +24,9 @@ #include "arm_compute/runtime/CL/CLScheduler.h" #include "arm_compute/core/CL/CLKernelLibrary.h" -#include "arm_compute/core/CL/ICLKernel.h" -#include "arm_compute/runtime/CL/CLHelpers.h" #include "arm_compute/runtime/CL/CLTuner.h" -#include "arm_compute/runtime/CL/tuners/Tuners.h" + +#include "src/core/CL/ICLKernel.h" namespace arm_compute { @@ -49,6 +48,11 @@ GPUTarget CLScheduler::target() const return _target; } +CLGEMMHeuristicsHandle *CLScheduler::gemm_heuristics() const +{ + return _gemm_heuristics; +} + void CLScheduler::set_queue(cl::CommandQueue queue) { _queue = std::move(queue); @@ -78,7 +82,7 @@ cl::Event CLScheduler::enqueue_sync_event() void CLScheduler::tune_kernel_static(ICLKernel &kernel) { - if(_cl_tuner != nullptr) + if (_cl_tuner != nullptr) { _cl_tuner->tune_kernel_static(kernel); } @@ -92,7 +96,16 @@ bool CLScheduler::is_initialised() const std::once_flag CLScheduler::_initialize_symbols; CLScheduler::CLScheduler() - : _context(), _queue(), _target(GPUTarget::MIDGARD), _is_initialised(false), _cl_tuner(nullptr), _cl_default_static_tuner(nullptr) + : _context(), + _queue(), + _target(GPUTarget::MIDGARD), + _is_initialised(false), + _cl_tuner(nullptr), + _gemm_heuristics(nullptr), + _backend_type(CLBackendType::Native), + _job_chaining_enabled(true), + _job_chaining_size(1), + _job_chaining_count(0) { } @@ -103,37 +116,45 @@ CLScheduler &CLScheduler::get() return scheduler; } -void CLScheduler::default_init_with_context(cl::Device &device, cl::Context &ctx, ICLTuner *cl_tuner) +void CLScheduler::default_init_with_context(cl::Device &device, + cl::Context &ctx, + ICLTuner *cl_tuner, + CLGEMMHeuristicsHandle *gemm_h) { - if(!_is_initialised) + if (!_is_initialised) { const std::string cl_kernels_folder("./cl_kernels/"); cl::CommandQueue queue = cl::CommandQueue(ctx, device); CLKernelLibrary::get().init(cl_kernels_folder, ctx, device); - init(ctx, queue, device, cl_tuner); - _cl_default_static_tuner = tuners::TunerFactory::create_tuner(_target); - _cl_tuner = (cl_tuner == nullptr) ? _cl_default_static_tuner.get() : cl_tuner; + init(ctx, queue, device, cl_tuner, gemm_h); + _cl_tuner = cl_tuner; } } -void CLScheduler::default_init(ICLTuner *cl_tuner) +void CLScheduler::default_init(ICLTuner *cl_tuner, CLGEMMHeuristicsHandle *gemm_h, CLBackendType cl_backend_type) { - if(!_is_initialised) + if (!_is_initialised) { cl::Context ctx; cl::Device dev; cl_int err; - std::tie(ctx, dev, err) = create_opencl_context_and_device(); + std::tie(ctx, dev, err) = create_opencl_context_and_device(cl_backend_type); ARM_COMPUTE_ERROR_ON_MSG(err != CL_SUCCESS, "Failed to create OpenCL context"); cl::CommandQueue queue = cl::CommandQueue(ctx, dev); CLKernelLibrary::get().init("./cl_kernels/", ctx, dev); - init(ctx, queue, dev, cl_tuner); - // Create a default static tuner and set if none was provided - _cl_default_static_tuner = tuners::TunerFactory::create_tuner(_target); + init(ctx, queue, dev, cl_tuner, gemm_h); } - // Set CL tuner - _cl_tuner = (cl_tuner == nullptr) ? _cl_default_static_tuner.get() : cl_tuner; + // Set CL tuner and GEMM heuristics + _cl_tuner = cl_tuner; + _gemm_heuristics = gemm_h; +} + +void CLScheduler::default_reinit(ICLTuner *cl_tuner, CLGEMMHeuristicsHandle *gemm_h, CLBackendType cl_backend_type) +{ + _is_initialised = false; + + default_init(cl_tuner, gemm_h, cl_backend_type); } void CLScheduler::set_context(cl::Context context) @@ -142,34 +163,86 @@ void CLScheduler::set_context(cl::Context context) CLKernelLibrary::get().set_context(_context); } -void CLScheduler::init(cl::Context context, cl::CommandQueue queue, const cl::Device &device, ICLTuner *cl_tuner) +void CLScheduler::init(cl::Context context, + cl::CommandQueue queue, + const cl::Device &device, + ICLTuner *cl_tuner, + CLGEMMHeuristicsHandle *gemm_h, + CLBackendType cl_backend_type) { set_context(std::move(context)); - _queue = std::move(queue); - _target = get_target_from_device(device); - _is_initialised = true; - _cl_tuner = cl_tuner; + _queue = std::move(queue); + _target = get_target_from_device(device); + _is_initialised = true; + _cl_tuner = cl_tuner; + _gemm_heuristics = gemm_h; + _backend_type = cl_backend_type; } -void CLScheduler::enqueue(ICLKernel &kernel, bool flush) +void CLScheduler::enqueue_common(ICLKernel &kernel, ITensorPack &tensors, bool flush) { - ARM_COMPUTE_ERROR_ON_MSG(!_is_initialised, - "The CLScheduler is not initialised yet! Please call the CLScheduler::get().default_init(), \ + ARM_COMPUTE_ERROR_ON_MSG( + !_is_initialised, "The CLScheduler is not initialised yet! Please call the CLScheduler::get().default_init(), \ or CLScheduler::get()::init() and CLKernelLibrary::get()::init() function before running functions!"); + const bool inject_memory = !tensors.empty(); + // Tune the kernel if the CLTuner has been provided - if(_cl_tuner != nullptr) + if (_cl_tuner != nullptr) { - // Tune the OpenCL kernel - _cl_tuner->tune_kernel_dynamic(kernel); + inject_memory ? _cl_tuner->tune_kernel_dynamic(kernel, tensors) : _cl_tuner->tune_kernel_dynamic(kernel); } // Run kernel - kernel.run(kernel.window(), _queue); + inject_memory ? kernel.run_op(tensors, kernel.window(), _queue) : kernel.run(kernel.window(), _queue); + if (_job_chaining_enabled) + { + ++_job_chaining_count; + } + + flush_queue(flush); +} - if(flush) +void CLScheduler::flush_queue(bool flush) +{ + if (_job_chaining_enabled) + { + if (_job_chaining_count >= _job_chaining_size) + { + _job_chaining_count = 0; + /* + Optimisation note: Flush the queue at the first enqueue to start the GPU + execution and then incrementally saturate the clFlush calls to minimize + the CPU activity for job-scheduling. + For eg. job-chain size goes from 1, 2, 4, 8 and 16 + */ + if (_job_chaining_size < 16) + { + _job_chaining_size <<= 1; + } + _queue.flush(); + } + } + else if (flush) { _queue.flush(); } } + +void CLScheduler::enqueue(ICLKernel &kernel, bool flush) +{ + ITensorPack pack; + enqueue_common(kernel, pack, flush); +} + +void CLScheduler::enqueue_op(ICLKernel &kernel, ITensorPack &tensors, bool flush) +{ + enqueue_common(kernel, tensors, flush); +} + +void CLScheduler::enable_job_chaining(int job_chaining_size) +{ + _job_chaining_enabled = true; + _job_chaining_size = job_chaining_size; +} } // namespace arm_compute |