aboutsummaryrefslogtreecommitdiff
path: root/src/runtime/CL
diff options
context:
space:
mode:
author SiCong Li <sicong.li@arm.com> 2022-04-07 17:41:51 +0100
committer SiCong Li <sicong.li@arm.com> 2022-04-14 12:59:27 +0000
commit 0a486cf66c70b4bd9b0ea8ba9dc5b42f52ed16c3 (patch)
tree 88eea0a182ea6dd8ec45b6aca7843d98cfd73764 /src/runtime/CL
parent ca364dfd87cab4cdb9179b68c42f10ff16e55002 (diff)
download ComputeLibrary-0a486cf66c70b4bd9b0ea8ba9dc5b42f52ed16c3.tar.gz
Enable dynamic cl tuning for dynamically fused kernels
* Add new tune_kernel_dynamic interface
* Add generate_config_id

Resolves: COMPMID-5154

Signed-off-by: SiCong Li <sicong.li@arm.com>
Change-Id: I39870e59fceda875487970061ceb2048995c5a45
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/7400
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Giorgio Arena <giorgio.arena@arm.com>
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Diffstat (limited to 'src/runtime/CL')
-rw-r--r-- src/runtime/CL/CLScheduler.cpp 38
-rw-r--r-- src/runtime/CL/CLTuner.cpp 73
2 files changed, 89 insertions, 22 deletions
diff --git a/src/runtime/CL/CLScheduler.cpp b/src/runtime/CL/CLScheduler.cpp
index 18fd52232d..4cff707f1a 100644
--- a/src/runtime/CL/CLScheduler.cpp
+++ b/src/runtime/CL/CLScheduler.cpp
@@ -181,19 +181,12 @@ void CLScheduler::enqueue_common(ICLKernel &kernel, ITensorPack &tensors, bool f
// Run kernel
inject_memory ? kernel.run_op(tensors, kernel.window(), _queue) : kernel.run(kernel.window(), _queue);
-
if(_job_chaining_enabled)
{
- if(++_job_chaining_count >= _job_chaining_size)
- {
- _job_chaining_count = 0;
- _queue.flush();
- }
- }
- else if(flush)
- {
- _queue.flush();
+ ++_job_chaining_count;
}
+
+ flush_queue(flush);
}
#if defined(ENABLE_EXPERIMENTAL_DYNAMIC_FUSION)
@@ -204,14 +197,31 @@ void CLScheduler::enqueue_common(ICLKernel &kernel, experimental::dynamic_fusion
"The CLScheduler is not initialised yet! Please call the CLScheduler::get().default_init(), \
or CLScheduler::get()::init() and CLKernelLibrary::get()::init() function before running functions!");
- const bool inject_memory = !tensors._binding.empty();
+ // ClCompositeKernel is stateless and thus always requires memory injection
+
+ // Tune the kernel if the CLTuner has been provided
+ if(_cl_tuner != nullptr)
+ {
+ _cl_tuner->tune_kernel_dynamic(kernel, tensors, exec_desc);
+ }
// Run kernel
- inject_memory ? kernel.run_composite_op(tensors, kernel.window(), _queue, exec_desc) : kernel.run(kernel.window(), _queue);
+ kernel.run_composite_op(tensors, kernel.window(), _queue, exec_desc);
+ if(_job_chaining_enabled)
+ {
+ ++_job_chaining_count;
+ }
+
+ flush_queue(flush);
+}
+
+#endif // defined(ENABLE_EXPERIMENTAL_DYNAMIC_FUSION)
+void CLScheduler::flush_queue(bool flush)
+{
if(_job_chaining_enabled)
{
- if(++_job_chaining_count >= _job_chaining_size)
+ if(_job_chaining_count >= _job_chaining_size)
{
_job_chaining_count = 0;
_queue.flush();
@@ -223,8 +233,6 @@ void CLScheduler::enqueue_common(ICLKernel &kernel, experimental::dynamic_fusion
}
}
-#endif // defined(ENABLE_EXPERIMENTAL_DYNAMIC_FUSION)
-
void CLScheduler::enqueue(ICLKernel &kernel, bool flush)
{
ITensorPack pack;
diff --git a/src/runtime/CL/CLTuner.cpp b/src/runtime/CL/CLTuner.cpp
index 5dec81be2c..81fe7dbde6 100644
--- a/src/runtime/CL/CLTuner.cpp
+++ b/src/runtime/CL/CLTuner.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2021 Arm Limited.
+ * Copyright (c) 2017-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -28,6 +28,9 @@
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "src/core/CL/ICLKernel.h"
#include "support/StringSupport.h"
+#if defined(ENABLE_EXPERIMENTAL_DYNAMIC_FUSION)
+#include "src/gpu/cl/kernels/experimental/dynamic_fusion/ClCompositeKernel.h"
+#endif // defined(ENABLE_EXPERIMENTAL_DYNAMIC_FUSION)
#include <cerrno>
#include <fstream>
@@ -40,6 +43,48 @@ CLTuner::CLTuner(bool tune_new_kernels, CLTuningInfo tuning_info)
{
}
+struct CLTuner::IKernelData
+{
+ virtual ~IKernelData() = default;
+ virtual void do_run(ICLKernel &kernel, cl::CommandQueue &queue) = 0;
+};
+struct DefaultKernelData : public CLTuner::IKernelData
+{
+ DefaultKernelData(ITensorPack &tensors)
+ : _tensors{ tensors }
+ {
+ }
+ ~DefaultKernelData() override = default;
+ void do_run(ICLKernel &kernel, cl::CommandQueue &queue) override
+ {
+ const bool inject_memory = !_tensors.empty();
+ inject_memory ? kernel.run_op(_tensors, kernel.window(), queue) : kernel.run(kernel.window(), queue);
+ }
+
+private:
+ ITensorPack &_tensors;
+};
+
+#if defined(ENABLE_EXPERIMENTAL_DYNAMIC_FUSION)
+struct CompositeKernelData : public CLTuner::IKernelData
+{
+ CompositeKernelData(experimental::dynamic_fusion::TensorBinding &tensors, const experimental::dynamic_fusion::ClExecutionDescriptor &exec_desc)
+ : _tensors{ tensors }, _exec_desc{ exec_desc }
+ {
+ }
+ ~CompositeKernelData() override = default;
+ void do_run(ICLKernel &kernel, cl::CommandQueue &queue) override
+ {
+ // ClCompositeKernel is purely stateless, and thus always requires memory injection
+ kernel.run_composite_op(_tensors, kernel.window(), queue, _exec_desc);
+ }
+
+private:
+ experimental::dynamic_fusion::TensorBinding &_tensors;
+ const experimental::dynamic_fusion::ClExecutionDescriptor &_exec_desc;
+};
+#endif // defined(ENABLE_EXPERIMENTAL_DYNAMIC_FUSION)
+
bool CLTuner::kernel_event_is_set() const
{
return _kernel_event() != nullptr;
@@ -74,7 +119,7 @@ void CLTuner::tune_kernel_dynamic(ICLKernel &kernel)
tune_kernel_dynamic(kernel, pack);
}
-void CLTuner::tune_kernel_dynamic(ICLKernel &kernel, ITensorPack &tensors)
+void CLTuner::do_tune_kernel_dynamic(ICLKernel &kernel, IKernelData *data)
{
// Get the configuration ID from the kernel and append GPU target name and number of available compute units
const std::string config_id = kernel.config_id() + "_" + string_from_target(kernel.get_target()) + "_MP" + support::cpp11::to_string(CLKernelLibrary::get().get_num_compute_units());
@@ -89,7 +134,7 @@ void CLTuner::tune_kernel_dynamic(ICLKernel &kernel, ITensorPack &tensors)
if(_tune_new_kernels)
{
// Find the optimal LWS for the kernel
- CLTuningParams opt_tuning_params = find_optimal_tuning_params(kernel, tensors);
+ CLTuningParams opt_tuning_params = find_optimal_tuning_params(kernel, data);
// Insert the optimal LWS in the table
add_tuning_params(config_id, opt_tuning_params);
@@ -113,13 +158,28 @@ void CLTuner::tune_kernel_dynamic(ICLKernel &kernel, ITensorPack &tensors)
}
}
}
+void CLTuner::tune_kernel_dynamic(ICLKernel &kernel, ITensorPack &tensors)
+{
+ DefaultKernelData data{ tensors };
+
+ do_tune_kernel_dynamic(kernel, &data);
+}
+
+#if defined(ENABLE_EXPERIMENTAL_DYNAMIC_FUSION)
+void CLTuner::tune_kernel_dynamic(ICLKernel &kernel, experimental::dynamic_fusion::TensorBinding &tensors, const experimental::dynamic_fusion::ClExecutionDescriptor &exec_desc)
+{
+ CompositeKernelData data{ tensors, exec_desc };
+
+ do_tune_kernel_dynamic(kernel, &data);
+}
+#endif // defined(ENABLE_EXPERIMENTAL_DYNAMIC_FUSION)
void CLTuner::add_tuning_params(const std::string &kernel_id, CLTuningParams optimal_tuning_params)
{
_tuning_params_table.emplace(kernel_id, optimal_tuning_params);
}
-CLTuningParams CLTuner::find_optimal_tuning_params(ICLKernel &kernel, ITensorPack &tensors)
+CLTuningParams CLTuner::find_optimal_tuning_params(ICLKernel &kernel, IKernelData *data)
{
// Profiling queue
cl::CommandQueue queue_profiler;
@@ -174,8 +234,7 @@ CLTuningParams CLTuner::find_optimal_tuning_params(ICLKernel &kernel, ITensorPac
cl::NDRange gws = ICLKernel::gws_from_window(kernel.window());
// Run the kernel with default lws to be used as baseline
- const bool inject_memory = !tensors.empty();
- inject_memory ? kernel.run_op(tensors, kernel.window(), queue_profiler) : kernel.run(kernel.window(), queue_profiler);
+ data->do_run(kernel, queue_profiler);
queue_profiler.finish();
@@ -211,7 +270,7 @@ CLTuningParams CLTuner::find_optimal_tuning_params(ICLKernel &kernel, ITensorPac
}
// Run the kernel
- inject_memory ? kernel.run_op(tensors, kernel.window(), queue_profiler) : kernel.run(kernel.window(), queue_profiler);
+ data->do_run(kernel, queue_profiler);
queue_profiler.finish();