1 files changed, 72 insertions, 13 deletions
diff --git a/src/runtime/CL/CLScheduler.cpp b/src/runtime/CL/CLScheduler.cpp
index cb5f04ce8b..f0a42f55fd 100644
--- a/src/runtime/CL/CLScheduler.cpp
+++ b/src/runtime/CL/CLScheduler.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2021 Arm Limited.
+ * Copyright (c) 2016-2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -25,6 +25,7 @@
 
 #include "arm_compute/core/CL/CLKernelLibrary.h"
 #include "arm_compute/runtime/CL/CLTuner.h"
+
 #include "src/core/CL/ICLKernel.h"
 
 namespace arm_compute
@@ -81,7 +82,7 @@ cl::Event CLScheduler::enqueue_sync_event()
 
 void CLScheduler::tune_kernel_static(ICLKernel &kernel)
 {
-    if(_cl_tuner != nullptr)
+    if (_cl_tuner != nullptr)
     {
         _cl_tuner->tune_kernel_static(kernel);
     }
@@ -95,7 +96,16 @@ bool CLScheduler::is_initialised() const
 std::once_flag CLScheduler::_initialize_symbols;
 
 CLScheduler::CLScheduler()
-    : _context(), _queue(), _target(GPUTarget::MIDGARD), _is_initialised(false), _cl_tuner(nullptr), _gemm_heuristics(nullptr), _backend_type(CLBackendType::Native)
+    : _context(),
+      _queue(),
+      _target(GPUTarget::MIDGARD),
+      _is_initialised(false),
+      _cl_tuner(nullptr),
+      _gemm_heuristics(nullptr),
+      _backend_type(CLBackendType::Native),
+      _job_chaining_enabled(true),
+      _job_chaining_size(1),
+      _job_chaining_count(0)
 {
 }
 
@@ -106,9 +116,12 @@ CLScheduler &CLScheduler::get()
     return scheduler;
 }
 
-void CLScheduler::default_init_with_context(cl::Device &device, cl::Context &ctx, ICLTuner *cl_tuner, CLGEMMHeuristicsHandle *gemm_h)
+void CLScheduler::default_init_with_context(cl::Device             &device,
+                                            cl::Context            &ctx,
+                                            ICLTuner               *cl_tuner,
+                                            CLGEMMHeuristicsHandle *gemm_h)
 {
-    if(!_is_initialised)
+    if (!_is_initialised)
     {
         const std::string cl_kernels_folder("./cl_kernels/");
         cl::CommandQueue  queue = cl::CommandQueue(ctx, device);
@@ -120,7 +133,7 @@ void CLScheduler::default_init_with_context(cl::Device &device, cl::Context &ctx
 
 void CLScheduler::default_init(ICLTuner *cl_tuner, CLGEMMHeuristicsHandle *gemm_h, CLBackendType cl_backend_type)
 {
-    if(!_is_initialised)
+    if (!_is_initialised)
     {
         cl::Context ctx;
         cl::Device  dev;
@@ -132,8 +145,16 @@ void CLScheduler::default_init(ICLTuner *cl_tuner, CLGEMMHeuristicsHandle *gemm_
         init(ctx, queue, dev, cl_tuner, gemm_h);
     }
 
-    // Set CL tuner
-    _cl_tuner = cl_tuner;
+    // Set CL tuner and GEMM heuristics
+    _cl_tuner        = cl_tuner;
+    _gemm_heuristics = gemm_h;
+}
+
+void CLScheduler::default_reinit(ICLTuner *cl_tuner, CLGEMMHeuristicsHandle *gemm_h, CLBackendType cl_backend_type)
+{
+    _is_initialised = false;
+
+    default_init(cl_tuner, gemm_h, cl_backend_type);
 }
 
 void CLScheduler::set_context(cl::Context context)
@@ -142,7 +163,12 @@ void CLScheduler::set_context(cl::Context context)
     CLKernelLibrary::get().set_context(_context);
 }
 
-void CLScheduler::init(cl::Context context, cl::CommandQueue queue, const cl::Device &device, ICLTuner *cl_tuner, CLGEMMHeuristicsHandle *gemm_h, CLBackendType cl_backend_type)
+void CLScheduler::init(cl::Context             context,
+                       cl::CommandQueue        queue,
+                       const cl::Device       &device,
+                       ICLTuner               *cl_tuner,
+                       CLGEMMHeuristicsHandle *gemm_h,
+                       CLBackendType           cl_backend_type)
 {
     set_context(std::move(context));
     _queue           = std::move(queue);
@@ -155,22 +181,49 @@ void CLScheduler::init(cl::Context context, cl::CommandQueue queue, const cl::De
 
 void CLScheduler::enqueue_common(ICLKernel &kernel, ITensorPack &tensors, bool flush)
 {
-    ARM_COMPUTE_ERROR_ON_MSG(!_is_initialised,
-                             "The CLScheduler is not initialised yet! Please call the CLScheduler::get().default_init(), \
+    ARM_COMPUTE_ERROR_ON_MSG(
+        !_is_initialised, "The CLScheduler is not initialised yet! Please call the CLScheduler::get().default_init(), \
                              or CLScheduler::get()::init() and CLKernelLibrary::get()::init() function before running functions!");
 
     const bool inject_memory = !tensors.empty();
 
     // Tune the kernel if the CLTuner has been provided
-    if(_cl_tuner != nullptr)
+    if (_cl_tuner != nullptr)
     {
         inject_memory ? _cl_tuner->tune_kernel_dynamic(kernel, tensors) : _cl_tuner->tune_kernel_dynamic(kernel);
     }
 
     // Run kernel
     inject_memory ? kernel.run_op(tensors, kernel.window(), _queue) : kernel.run(kernel.window(), _queue);
+    if (_job_chaining_enabled)
+    {
+        ++_job_chaining_count;
+    }
 
-    if(flush)
+    flush_queue(flush);
+}
+
+void CLScheduler::flush_queue(bool flush)
+{
+    if (_job_chaining_enabled)
+    {
+        if (_job_chaining_count >= _job_chaining_size)
+        {
+            _job_chaining_count = 0;
+            /*
+                Optimisation note: Flush the queue at the first enqueue to start the GPU
+                execution and then incrementally saturate the clFlush calls to minimize
+                the CPU activity for job-scheduling.
+                For eg. job-chain size goes from 1, 2, 4, 8 and 16
+            */
+            if (_job_chaining_size < 16)
+            {
+                _job_chaining_size <<= 1;
+            }
+            _queue.flush();
+        }
+    }
+    else if (flush)
     {
         _queue.flush();
     }
@@ -186,4 +239,10 @@ void CLScheduler::enqueue_op(ICLKernel &kernel, ITensorPack &tensors, bool flush
 {
     enqueue_common(kernel, tensors, flush);
 }
+
+void CLScheduler::enable_job_chaining(int job_chaining_size)
+{
+    _job_chaining_enabled = true;
+    _job_chaining_size    = job_chaining_size;
+}
 } // namespace arm_compute