aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorSiCong Li <sicong.li@arm.com>2022-04-07 17:41:51 +0100
committerSiCong Li <sicong.li@arm.com>2022-04-14 12:59:27 +0000
commit0a486cf66c70b4bd9b0ea8ba9dc5b42f52ed16c3 (patch)
tree88eea0a182ea6dd8ec45b6aca7843d98cfd73764
parentca364dfd87cab4cdb9179b68c42f10ff16e55002 (diff)
downloadComputeLibrary-0a486cf66c70b4bd9b0ea8ba9dc5b42f52ed16c3.tar.gz
Enable dynamic cl tuning for dynamically fused kernels
* Add new tune_kernel_dynamic interface * Add generate_config_id Resolves: COMPMID-5154 Signed-off-by: SiCong Li <sicong.li@arm.com> Change-Id: I39870e59fceda875487970061ceb2048995c5a45 Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/7400 Comments-Addressed: Arm Jenkins <bsgcomp@arm.com> Reviewed-by: Giorgio Arena <giorgio.arena@arm.com> Tested-by: Arm Jenkins <bsgcomp@arm.com>
-rw-r--r--arm_compute/runtime/CL/CLScheduler.h5
-rw-r--r--arm_compute/runtime/CL/CLTuner.h22
-rw-r--r--arm_compute/runtime/CL/ICLTuner.h21
-rw-r--r--src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/Common.h23
-rw-r--r--src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClElementwiseAddKernelComponent.cpp14
-rw-r--r--src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClElementwiseAddKernelComponent.h1
-rw-r--r--src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClGemmNativeKernelComponent.cpp26
-rw-r--r--src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClGemmNativeKernelComponent.h1
-rw-r--r--src/gpu/cl/kernels/experimental/dynamic_fusion/ClCompositeKernel.h4
-rw-r--r--src/runtime/CL/CLScheduler.cpp38
-rw-r--r--src/runtime/CL/CLTuner.cpp73
11 files changed, 200 insertions, 28 deletions
diff --git a/arm_compute/runtime/CL/CLScheduler.h b/arm_compute/runtime/CL/CLScheduler.h
index 362d2ba137..5bfaaf4b5d 100644
--- a/arm_compute/runtime/CL/CLScheduler.h
+++ b/arm_compute/runtime/CL/CLScheduler.h
@@ -211,6 +211,11 @@ public:
private:
void enqueue_common(ICLKernel &kernel, ITensorPack &tensors, bool flush);
+ /** If job chaining is disabled, flush the command queue according to @p flush. Otherwise @p flush is ignored and the queue is only flushed once the job chain count reaches the allocated job chain size, at which point the count is reset
+ *
+ * @param[in] flush Flush the command queue. Ignored when job chain is enabled.
+ */
+ void flush_queue(bool flush);
#if defined(ENABLE_EXPERIMENTAL_DYNAMIC_FUSION)
void enqueue_common(ICLKernel &kernel, experimental::dynamic_fusion::TensorBinding &tensors, const experimental::dynamic_fusion::ClExecutionDescriptor &exec_desc, bool flush);
diff --git a/arm_compute/runtime/CL/CLTuner.h b/arm_compute/runtime/CL/CLTuner.h
index f96edc962b..e595f8f34b 100644
--- a/arm_compute/runtime/CL/CLTuner.h
+++ b/arm_compute/runtime/CL/CLTuner.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2021 Arm Limited.
+ * Copyright (c) 2017-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -124,6 +124,9 @@ public:
void tune_kernel_static(ICLKernel &kernel) override;
void tune_kernel_dynamic(ICLKernel &kernel) override;
void tune_kernel_dynamic(ICLKernel &kernel, ITensorPack &tensors) override;
+#if defined(ENABLE_EXPERIMENTAL_DYNAMIC_FUSION)
+ void tune_kernel_dynamic(ICLKernel &kernel, experimental::dynamic_fusion::TensorBinding &tensors, const experimental::dynamic_fusion::ClExecutionDescriptor &exec_desc) override;
+#endif // defined(ENABLE_EXPERIMENTAL_DYNAMIC_FUSION)
/** Is the kernel_event set ?
*
@@ -131,15 +134,26 @@ public:
*/
bool kernel_event_is_set() const;
+ /** A wrapper wrapping tensors and other objects needed for running the kernel
+ */
+ struct IKernelData;
+
private:
+ /** Perform tune_kernel_dynamic
+ *
+ * @param[in] kernel OpenCL kernel to be tuned with tuning parameters
+ * @param[in,out] data IKernelData object wrapping tensors and other objects needed for running the kernel
+ *
+ */
+ void do_tune_kernel_dynamic(ICLKernel &kernel, IKernelData *data);
/** Find optimal tuning parameters using brute-force approach
*
- * @param[in] kernel OpenCL kernel to be tuned with tuning parameters
- * @param[in,out] tensors Tensors for the kernel to operate on
+ * @param[in] kernel OpenCL kernel to be tuned with tuning parameters
+ * @param[in,out] data IKernelData object wrapping tensors and other objects needed for running the kernel
*
* @return The optimal tuning parameters to use
*/
- CLTuningParams find_optimal_tuning_params(ICLKernel &kernel, ITensorPack &tensors);
+ CLTuningParams find_optimal_tuning_params(ICLKernel &kernel, IKernelData *data);
std::unordered_map<std::string, CLTuningParams> _tuning_params_table;
std::unordered_map<std::string, cl::NDRange> _lws_table;
diff --git a/arm_compute/runtime/CL/ICLTuner.h b/arm_compute/runtime/CL/ICLTuner.h
index 0f951c384e..a327497255 100644
--- a/arm_compute/runtime/CL/ICLTuner.h
+++ b/arm_compute/runtime/CL/ICLTuner.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2020 Arm Limited.
+ * Copyright (c) 2017-2020, 2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -30,6 +30,16 @@ namespace arm_compute
{
class ICLKernel;
+#if defined(ENABLE_EXPERIMENTAL_DYNAMIC_FUSION)
+namespace experimental
+{
+namespace dynamic_fusion
+{
+struct TensorBinding;
+struct ClExecutionDescriptor;
+} // namespace dynamic_fusion
+} // namespace experimental
+#endif // defined(ENABLE_EXPERIMENTAL_DYNAMIC_FUSION)
/** Basic interface for tuning the OpenCL kernels */
class ICLTuner
{
@@ -57,6 +67,15 @@ public:
* @param[in, out] tensors Tensors for the kernel to use
*/
virtual void tune_kernel_dynamic(ICLKernel &kernel, ITensorPack &tensors) = 0;
+#if defined(ENABLE_EXPERIMENTAL_DYNAMIC_FUSION)
+ /** Tune OpenCL kernel dynamically for dynamic fusion interface
+ *
+ * @param[in] kernel Kernel to tune
+ * @param[in, out] tensors Tensors for the kernel to use
+ * @param[in] exec_desc Execution descriptor
+ */
+ virtual void tune_kernel_dynamic(ICLKernel &kernel, experimental::dynamic_fusion::TensorBinding &tensors, const experimental::dynamic_fusion::ClExecutionDescriptor &exec_desc) = 0;
+#endif // defined(ENABLE_EXPERIMENTAL_DYNAMIC_FUSION)
};
} // namespace arm_compute
#endif /*ARM_COMPUTE_ICLTUNER_H */
diff --git a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/Common.h b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/Common.h
index e24c742fd7..aa27572746 100644
--- a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/Common.h
+++ b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/Common.h
@@ -316,6 +316,15 @@ public:
return "";
}
+ /** Generate config id of the component
+ *
+ * The default implementation returns an empty string; components that want to be
+ * distinguishable in the config id of the whole kernel override this method.
+ *
+ * @return Config id string of this component (empty by default)
+ */
+ virtual std::string generate_config_id() const
+ {
+ return "";
+ }
+
virtual CLBuildOptions generate_build_options() const
{
return CLBuildOptions{};
@@ -537,9 +546,21 @@ public:
return code;
}
+ /** Generate config id of the entire kernel
+ *
+ * Format: kernel_name--comp0_config_id--comp1_config_id--...
+ *
+ * Starts from the kernel name and, for every component visited by traverse(), appends a single
+ * "--" separator followed by that component's config id.
+ *
+ * @return Config id string of the whole kernel
+ */
std::string build_config_id() const
{
- return "";
+ std::string config_id = build_kernel_name();
+ traverse([&](std::stack<ComponentID> stack)
+ {
+ // Only prepend the separator: also appending one would produce "----" between components
+ // and a dangling "--" at the end, which does not match the documented format
+ config_id += "--" + _components.find(stack.top())->second->generate_config_id();
+ });
+
+ return config_id;
}
CLBuildOptions build_options() const
diff --git a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClElementwiseAddKernelComponent.cpp b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClElementwiseAddKernelComponent.cpp
index 34b735edc9..84e4003d5d 100644
--- a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClElementwiseAddKernelComponent.cpp
+++ b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClElementwiseAddKernelComponent.cpp
@@ -128,6 +128,20 @@ CLBuildOptions ClElementwiseAddKernelComponent::generate_build_options() const
return build_opts;
}
+/** Generate config id of the elementwise add component
+ *
+ * Format: <data type>_<dimension 0>_<dimension 1>_<data layout>
+ *
+ * All values are read from the kernel argument info looked up with the blueprint's dst id.
+ * The data type and data layout strings are lower-cased.
+ *
+ * @return Config id string of this component
+ */
+std::string ClElementwiseAddKernelComponent::generate_config_id() const
+{
+ auto t_dst_info = _blueprint->impl().get_kernel_argument_info(_blueprint->impl().get_dst_id());
+ std::string config_id{};
+ config_id += lower_string(string_from_data_type(t_dst_info->data_type()));
+ config_id += "_";
+ config_id += support::cpp11::to_string(t_dst_info->dimension(0));
+ config_id += "_";
+ config_id += support::cpp11::to_string(t_dst_info->dimension(1));
+ config_id += "_";
+ config_id += lower_string(string_from_data_layout(t_dst_info->data_layout()));
+ return config_id;
+}
+
ClElementwiseAddKernelComponent::TagLUT ClElementwiseAddKernelComponent::allocate_vars(SharedVarTable &vtable) const
{
// Determine which argument is the accumulator
diff --git a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClElementwiseAddKernelComponent.h b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClElementwiseAddKernelComponent.h
index c259811a98..35c9538b8d 100644
--- a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClElementwiseAddKernelComponent.h
+++ b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClElementwiseAddKernelComponent.h
@@ -47,6 +47,7 @@ public:
std::string get_component_code() const override;
Window get_window() const override;
CLBuildOptions generate_build_options() const override;
+ std::string generate_config_id() const override;
virtual std::vector<Link> get_links() const override
{
diff --git a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClGemmNativeKernelComponent.cpp b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClGemmNativeKernelComponent.cpp
index 7d23128276..45b81b424d 100644
--- a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClGemmNativeKernelComponent.cpp
+++ b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClGemmNativeKernelComponent.cpp
@@ -470,6 +470,32 @@ CLBuildOptions ClGemmNativeKernelComponent::generate_build_options() const
return build_opts;
}
+/** Generate config id of the GEMM native component
+ *
+ * Format: [add_bias_][broadcast_bias_][3di_][3do_]<data type>_<dimension 1>_<dimension 0>_<k>_<dimension 2>_<m0>_<n0>_<k0>
+ *
+ * The optional prefixes are emitted only when the corresponding feature is in use. Data type and
+ * dimensions are read from the kernel argument info looked up with the blueprint's dst id; the
+ * remaining values come from the component descriptor.
+ *
+ * @return Config id string of this component
+ */
+std::string ClGemmNativeKernelComponent::generate_config_id() const
+{
+ auto t_dst_info = _blueprint->impl().get_kernel_argument_info(_blueprint->impl().get_dst_id());
+ std::string config_id{};
+ // Tag the id with "add_bias_" only when a bias is actually present (i.e. _bias is NOT empty)
+ config_id += (_bias.is_empty() ? "" : "add_bias_");
+ config_id += (_desc.broadcast_bias ? "broadcast_bias_" : "");
+ config_id += (_desc.reinterpret_input_as_3d ? "3di_" : "");
+ config_id += (_desc.depth_output_gemm3d > 0 ? "3do_" : "");
+ config_id += lower_string(string_from_data_type(t_dst_info->data_type()));
+ config_id += "_";
+ config_id += support::cpp11::to_string(t_dst_info->dimension(1));
+ config_id += "_";
+ config_id += support::cpp11::to_string(t_dst_info->dimension(0));
+ config_id += "_";
+ config_id += support::cpp11::to_string(_desc.k);
+ config_id += "_";
+ config_id += support::cpp11::to_string(t_dst_info->dimension(2));
+ config_id += "_";
+ config_id += support::cpp11::to_string(_desc.lhs_info.m0);
+ config_id += "_";
+ config_id += support::cpp11::to_string(_desc.rhs_info.n0);
+ config_id += "_";
+ config_id += support::cpp11::to_string(_desc.rhs_info.k0);
+ return config_id;
+}
+
ClGemmNativeKernelComponent::TagLUT ClGemmNativeKernelComponent::allocate_vars(SharedVarTable &vtable) const
{
TagLUT lut{};
diff --git a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClGemmNativeKernelComponent.h b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClGemmNativeKernelComponent.h
index 1a1e3e3ce6..b282856b56 100644
--- a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClGemmNativeKernelComponent.h
+++ b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClGemmNativeKernelComponent.h
@@ -53,6 +53,7 @@ public:
Window get_window() const override;
ClKernelArgList get_args();
CLBuildOptions generate_build_options() const override;
+ std::string generate_config_id() const override;
virtual std::vector<Link> get_links() const override
{
diff --git a/src/gpu/cl/kernels/experimental/dynamic_fusion/ClCompositeKernel.h b/src/gpu/cl/kernels/experimental/dynamic_fusion/ClCompositeKernel.h
index 29d30003c3..19efb505eb 100644
--- a/src/gpu/cl/kernels/experimental/dynamic_fusion/ClCompositeKernel.h
+++ b/src/gpu/cl/kernels/experimental/dynamic_fusion/ClCompositeKernel.h
@@ -43,6 +43,10 @@ struct TensorBinding
: _binding{ binding }
{
}
+ /** Check whether the binding holds no tensors
+ *
+ * @return True if the underlying argument id to tensor map is empty
+ */
+ bool empty() const
+ {
+ return _binding.empty();
+ }
std::map<ArgumentID, ICLTensor *> _binding;
};
class ClCompositeKernel : public opencl::IClKernel
diff --git a/src/runtime/CL/CLScheduler.cpp b/src/runtime/CL/CLScheduler.cpp
index 18fd52232d..4cff707f1a 100644
--- a/src/runtime/CL/CLScheduler.cpp
+++ b/src/runtime/CL/CLScheduler.cpp
@@ -181,19 +181,12 @@ void CLScheduler::enqueue_common(ICLKernel &kernel, ITensorPack &tensors, bool f
// Run kernel
inject_memory ? kernel.run_op(tensors, kernel.window(), _queue) : kernel.run(kernel.window(), _queue);
-
if(_job_chaining_enabled)
{
- if(++_job_chaining_count >= _job_chaining_size)
- {
- _job_chaining_count = 0;
- _queue.flush();
- }
- }
- else if(flush)
- {
- _queue.flush();
+ ++_job_chaining_count;
}
+
+ flush_queue(flush);
}
#if defined(ENABLE_EXPERIMENTAL_DYNAMIC_FUSION)
@@ -204,14 +197,31 @@ void CLScheduler::enqueue_common(ICLKernel &kernel, experimental::dynamic_fusion
"The CLScheduler is not initialised yet! Please call the CLScheduler::get().default_init(), \
or CLScheduler::get()::init() and CLKernelLibrary::get()::init() function before running functions!");
- const bool inject_memory = !tensors._binding.empty();
+ // ClCompositeKernel is stateless and thus always requires memory injection
+
+ // Tune the kernel if the CLTuner has been provided
+ if(_cl_tuner != nullptr)
+ {
+ _cl_tuner->tune_kernel_dynamic(kernel, tensors, exec_desc);
+ }
// Run kernel
- inject_memory ? kernel.run_composite_op(tensors, kernel.window(), _queue, exec_desc) : kernel.run(kernel.window(), _queue);
+ kernel.run_composite_op(tensors, kernel.window(), _queue, exec_desc);
+ if(_job_chaining_enabled)
+ {
+ ++_job_chaining_count;
+ }
+
+ flush_queue(flush);
+}
+
+#endif // defined(ENABLE_EXPERIMENTAL_DYNAMIC_FUSION)
+void CLScheduler::flush_queue(bool flush)
+{
if(_job_chaining_enabled)
{
- if(++_job_chaining_count >= _job_chaining_size)
+ if(_job_chaining_count >= _job_chaining_size)
{
_job_chaining_count = 0;
_queue.flush();
@@ -223,8 +233,6 @@ void CLScheduler::enqueue_common(ICLKernel &kernel, experimental::dynamic_fusion
}
}
-#endif // defined(ENABLE_EXPERIMENTAL_DYNAMIC_FUSION)
-
void CLScheduler::enqueue(ICLKernel &kernel, bool flush)
{
ITensorPack pack;
diff --git a/src/runtime/CL/CLTuner.cpp b/src/runtime/CL/CLTuner.cpp
index 5dec81be2c..81fe7dbde6 100644
--- a/src/runtime/CL/CLTuner.cpp
+++ b/src/runtime/CL/CLTuner.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2021 Arm Limited.
+ * Copyright (c) 2017-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -28,6 +28,9 @@
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "src/core/CL/ICLKernel.h"
#include "support/StringSupport.h"
+#if defined(ENABLE_EXPERIMENTAL_DYNAMIC_FUSION)
+#include "src/gpu/cl/kernels/experimental/dynamic_fusion/ClCompositeKernel.h"
+#endif // defined(ENABLE_EXPERIMENTAL_DYNAMIC_FUSION)
#include <cerrno>
#include <fstream>
@@ -40,6 +43,48 @@ CLTuner::CLTuner(bool tune_new_kernels, CLTuningInfo tuning_info)
{
}
+/** Interface wrapping the tensors and other objects needed for running a kernel,
+ * so that the tuner can run different kinds of kernels through a single entry point
+ */
+struct CLTuner::IKernelData
+{
+ virtual ~IKernelData() = default;
+ /** Run the kernel with the wrapped data
+ *
+ * @param[in] kernel Kernel to run
+ * @param[in,out] queue Command queue handed to the kernel's run method
+ */
+ virtual void do_run(ICLKernel &kernel, cl::CommandQueue &queue) = 0;
+};
+/** Kernel data for kernels that are run with an ITensorPack
+ *
+ * Only a reference to the tensor pack is stored, so the pack must outlive this object.
+ */
+struct DefaultKernelData final : public CLTuner::IKernelData
+{
+ /** Constructor
+ *
+ * Marked explicit so that an ITensorPack is never implicitly converted into kernel data.
+ *
+ * @param[in,out] tensors Tensor pack handed to the kernel; may be empty
+ */
+ explicit DefaultKernelData(ITensorPack &tensors)
+ : _tensors{ tensors }
+ {
+ }
+ ~DefaultKernelData() override = default;
+ void do_run(ICLKernel &kernel, cl::CommandQueue &queue) override
+ {
+ // An empty pack selects the plain run() path; otherwise the tensors are injected through run_op()
+ const bool inject_memory = !_tensors.empty();
+ inject_memory ? kernel.run_op(_tensors, kernel.window(), queue) : kernel.run(kernel.window(), queue);
+ }
+
+private:
+ ITensorPack &_tensors;
+};
+
+#if defined(ENABLE_EXPERIMENTAL_DYNAMIC_FUSION)
+/** Kernel data for composite kernels that are run with a TensorBinding and a ClExecutionDescriptor
+ *
+ * Only references to both objects are stored, so they must outlive this object.
+ */
+struct CompositeKernelData : public CLTuner::IKernelData
+{
+ /** Constructor
+ *
+ * @param[in,out] tensors Tensor binding passed to the kernel's run_composite_op
+ * @param[in] exec_desc Execution descriptor passed to the kernel's run_composite_op
+ */
+ CompositeKernelData(experimental::dynamic_fusion::TensorBinding &tensors, const experimental::dynamic_fusion::ClExecutionDescriptor &exec_desc)
+ : _tensors{ tensors }, _exec_desc{ exec_desc }
+ {
+ }
+ ~CompositeKernelData() override = default;
+ void do_run(ICLKernel &kernel, cl::CommandQueue &queue) override
+ {
+ // ClCompositeKernel is purely stateless, and thus always requires memory injection
+ kernel.run_composite_op(_tensors, kernel.window(), queue, _exec_desc);
+ }
+
+private:
+ experimental::dynamic_fusion::TensorBinding &_tensors;
+ const experimental::dynamic_fusion::ClExecutionDescriptor &_exec_desc;
+};
+#endif // defined(ENABLE_EXPERIMENTAL_DYNAMIC_FUSION)
+
bool CLTuner::kernel_event_is_set() const
{
return _kernel_event() != nullptr;
@@ -74,7 +119,7 @@ void CLTuner::tune_kernel_dynamic(ICLKernel &kernel)
tune_kernel_dynamic(kernel, pack);
}
-void CLTuner::tune_kernel_dynamic(ICLKernel &kernel, ITensorPack &tensors)
+void CLTuner::do_tune_kernel_dynamic(ICLKernel &kernel, IKernelData *data)
{
// Get the configuration ID from the kernel and append GPU target name and number of available compute units
const std::string config_id = kernel.config_id() + "_" + string_from_target(kernel.get_target()) + "_MP" + support::cpp11::to_string(CLKernelLibrary::get().get_num_compute_units());
@@ -89,7 +134,7 @@ void CLTuner::tune_kernel_dynamic(ICLKernel &kernel, ITensorPack &tensors)
if(_tune_new_kernels)
{
// Find the optimal LWS for the kernel
- CLTuningParams opt_tuning_params = find_optimal_tuning_params(kernel, tensors);
+ CLTuningParams opt_tuning_params = find_optimal_tuning_params(kernel, data);
// Insert the optimal LWS in the table
add_tuning_params(config_id, opt_tuning_params);
@@ -113,13 +158,28 @@ void CLTuner::tune_kernel_dynamic(ICLKernel &kernel, ITensorPack &tensors)
}
}
}
+// Tensor pack overload: wraps the pack in a DefaultKernelData and forwards to the shared tuning routine
+void CLTuner::tune_kernel_dynamic(ICLKernel &kernel, ITensorPack &tensors)
+{
+ // The wrapper only lives for the duration of this call; do_tune_kernel_dynamic receives a pointer to it
+ DefaultKernelData data{ tensors };
+
+ do_tune_kernel_dynamic(kernel, &data);
+}
+
+#if defined(ENABLE_EXPERIMENTAL_DYNAMIC_FUSION)
+// Dynamic fusion overload: wraps the tensor binding and execution descriptor in a CompositeKernelData
+// and forwards to the shared tuning routine
+void CLTuner::tune_kernel_dynamic(ICLKernel &kernel, experimental::dynamic_fusion::TensorBinding &tensors, const experimental::dynamic_fusion::ClExecutionDescriptor &exec_desc)
+{
+ // The wrapper only lives for the duration of this call; do_tune_kernel_dynamic receives a pointer to it
+ CompositeKernelData data{ tensors, exec_desc };
+
+ do_tune_kernel_dynamic(kernel, &data);
+}
+#endif // defined(ENABLE_EXPERIMENTAL_DYNAMIC_FUSION)
// Record the tuning parameters for the given kernel id in the tuning parameters table.
// NOTE: emplace() does not overwrite, so an entry that already exists for kernel_id is left unchanged.
void CLTuner::add_tuning_params(const std::string &kernel_id, CLTuningParams optimal_tuning_params)
{
_tuning_params_table.emplace(kernel_id, optimal_tuning_params);
}
-CLTuningParams CLTuner::find_optimal_tuning_params(ICLKernel &kernel, ITensorPack &tensors)
+CLTuningParams CLTuner::find_optimal_tuning_params(ICLKernel &kernel, IKernelData *data)
{
// Profiling queue
cl::CommandQueue queue_profiler;
@@ -174,8 +234,7 @@ CLTuningParams CLTuner::find_optimal_tuning_params(ICLKernel &kernel, ITensorPac
cl::NDRange gws = ICLKernel::gws_from_window(kernel.window());
// Run the kernel with default lws to be used as baseline
- const bool inject_memory = !tensors.empty();
- inject_memory ? kernel.run_op(tensors, kernel.window(), queue_profiler) : kernel.run(kernel.window(), queue_profiler);
+ data->do_run(kernel, queue_profiler);
queue_profiler.finish();
@@ -211,7 +270,7 @@ CLTuningParams CLTuner::find_optimal_tuning_params(ICLKernel &kernel, ITensorPac
}
// Run the kernel
- inject_memory ? kernel.run_op(tensors, kernel.window(), queue_profiler) : kernel.run(kernel.window(), queue_profiler);
+ data->do_run(kernel, queue_profiler);
queue_profiler.finish();