diff options
11 files changed, 200 insertions, 28 deletions
diff --git a/arm_compute/runtime/CL/CLScheduler.h b/arm_compute/runtime/CL/CLScheduler.h index 362d2ba137..5bfaaf4b5d 100644 --- a/arm_compute/runtime/CL/CLScheduler.h +++ b/arm_compute/runtime/CL/CLScheduler.h @@ -211,6 +211,11 @@ public: private: void enqueue_common(ICLKernel &kernel, ITensorPack &tensors, bool flush); + /** If job chain is disabled, then flush the command queue according to @p flush. Otherwise @p flush is ignored and the queue is only flushed when job chain count exceeds allocated job chain size + * + * @param[in] flush Flush the command queue. Ignored when job chain is enabled. + */ + void flush_queue(bool flush); #if defined(ENABLE_EXPERIMENTAL_DYNAMIC_FUSION) void enqueue_common(ICLKernel &kernel, experimental::dynamic_fusion::TensorBinding &tensors, const experimental::dynamic_fusion::ClExecutionDescriptor &exec_desc, bool flush); diff --git a/arm_compute/runtime/CL/CLTuner.h b/arm_compute/runtime/CL/CLTuner.h index f96edc962b..e595f8f34b 100644 --- a/arm_compute/runtime/CL/CLTuner.h +++ b/arm_compute/runtime/CL/CLTuner.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2021 Arm Limited. + * Copyright (c) 2017-2022 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -124,6 +124,9 @@ public: void tune_kernel_static(ICLKernel &kernel) override; void tune_kernel_dynamic(ICLKernel &kernel) override; void tune_kernel_dynamic(ICLKernel &kernel, ITensorPack &tensors) override; +#if defined(ENABLE_EXPERIMENTAL_DYNAMIC_FUSION) + void tune_kernel_dynamic(ICLKernel &kernel, experimental::dynamic_fusion::TensorBinding &tensors, const experimental::dynamic_fusion::ClExecutionDescriptor &exec_desc) override; +#endif // defined(ENABLE_EXPERIMENTAL_DYNAMIC_FUSION) /** Is the kernel_event set ? 
* @@ -131,15 +134,26 @@ public: */ bool kernel_event_is_set() const; + /** A wrapper wrapping tensors and other objects needed for running the kernel + */ + struct IKernelData; + private: + /** Perform tune_kernel_dynamic + * + * @param[in] kernel OpenCL kernel to be tuned with tuning parameters + * @param[in,out] data IKernelData object wrapping tensors and other objects needed for running the kernel + * + */ + void do_tune_kernel_dynamic(ICLKernel &kernel, IKernelData *data); /** Find optimal tuning parameters using brute-force approach * - * @param[in] kernel OpenCL kernel to be tuned with tuning parameters - * @param[in,out] tensors Tensors for the kernel to operate on + * @param[in] kernel OpenCL kernel to be tuned with tuning parameters + * @param[in,out] data IKernelData object wrapping tensors and other objects needed for running the kernel * * @return The optimal tuning parameters to use */ - CLTuningParams find_optimal_tuning_params(ICLKernel &kernel, ITensorPack &tensors); + CLTuningParams find_optimal_tuning_params(ICLKernel &kernel, IKernelData *data); std::unordered_map<std::string, CLTuningParams> _tuning_params_table; std::unordered_map<std::string, cl::NDRange> _lws_table; diff --git a/arm_compute/runtime/CL/ICLTuner.h b/arm_compute/runtime/CL/ICLTuner.h index 0f951c384e..a327497255 100644 --- a/arm_compute/runtime/CL/ICLTuner.h +++ b/arm_compute/runtime/CL/ICLTuner.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2020 Arm Limited. + * Copyright (c) 2017-2020, 2022 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -30,6 +30,16 @@ namespace arm_compute { class ICLKernel; +#if defined(ENABLE_EXPERIMENTAL_DYNAMIC_FUSION) +namespace experimental +{ +namespace dynamic_fusion +{ +struct TensorBinding; +struct ClExecutionDescriptor; +} // namespace dynamic_fusion +} // namespace experimental +#endif // defined(ENABLE_EXPERIMENTAL_DYNAMIC_FUSION) /** Basic interface for tuning the OpenCL kernels */ class ICLTuner { @@ -57,6 +67,15 @@ public: * @param[in, out] tensors Tensors for the kernel to use */ virtual void tune_kernel_dynamic(ICLKernel &kernel, ITensorPack &tensors) = 0; +#if defined(ENABLE_EXPERIMENTAL_DYNAMIC_FUSION) + /** Tune OpenCL kernel dynamically for dynamic fusion interface + * + * @param[in] kernel Kernel to tune + * @param[in, out] tensors Tensors for the kernel to use + * @param[in] exec_desc Execution descriptor + */ + virtual void tune_kernel_dynamic(ICLKernel &kernel, experimental::dynamic_fusion::TensorBinding &tensors, const experimental::dynamic_fusion::ClExecutionDescriptor &exec_desc) = 0; +#endif // defined(ENABLE_EXPERIMENTAL_DYNAMIC_FUSION) }; } // namespace arm_compute #endif /*ARM_COMPUTE_ICLTUNER_H */ diff --git a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/Common.h b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/Common.h index e24c742fd7..aa27572746 100644 --- a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/Common.h +++ b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/Common.h @@ -316,6 +316,15 @@ public: return ""; } + /** Generate config id of the component + * + * @return std::string + */ + virtual std::string generate_config_id() const + { + return ""; + } + virtual CLBuildOptions generate_build_options() const { return CLBuildOptions{}; @@ -537,9 +546,21 @@ public: return code; } + /** Generate config id of the entire kernel + * + * Format: kernel_name--comp0_config_id--comp1_config_id--... 
+ * + * @return std::string + */ std::string build_config_id() const { - return ""; + std::string config_id = build_kernel_name(); + traverse([&](std::stack<ComponentID> stack) + { + config_id += "--" + _components.find(stack.top())->second->generate_config_id(); // one "--" before each component id, as in the documented format + }); + + return config_id; } CLBuildOptions build_options() const diff --git a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClElementwiseAddKernelComponent.cpp b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClElementwiseAddKernelComponent.cpp index 34b735edc9..84e4003d5d 100644 --- a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClElementwiseAddKernelComponent.cpp +++ b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClElementwiseAddKernelComponent.cpp @@ -128,6 +128,20 @@ CLBuildOptions ClElementwiseAddKernelComponent::generate_build_options() const return build_opts; } +std::string ClElementwiseAddKernelComponent::generate_config_id() const +{ + auto t_dst_info = _blueprint->impl().get_kernel_argument_info(_blueprint->impl().get_dst_id()); + std::string config_id{}; + config_id += lower_string(string_from_data_type(t_dst_info->data_type())); + config_id += "_"; + config_id += support::cpp11::to_string(t_dst_info->dimension(0)); + config_id += "_"; + config_id += support::cpp11::to_string(t_dst_info->dimension(1)); + config_id += "_"; + config_id += lower_string(string_from_data_layout(t_dst_info->data_layout())); + return config_id; +} + ClElementwiseAddKernelComponent::TagLUT ClElementwiseAddKernelComponent::allocate_vars(SharedVarTable &vtable) const { // Determine which argument is the accumulator diff --git a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClElementwiseAddKernelComponent.h b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClElementwiseAddKernelComponent.h index c259811a98..35c9538b8d 100644 --- 
a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClElementwiseAddKernelComponent.h +++ b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClElementwiseAddKernelComponent.h @@ -47,6 +47,7 @@ public: std::string get_component_code() const override; Window get_window() const override; CLBuildOptions generate_build_options() const override; + std::string generate_config_id() const override; virtual std::vector<Link> get_links() const override { diff --git a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClGemmNativeKernelComponent.cpp b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClGemmNativeKernelComponent.cpp index 7d23128276..45b81b424d 100644 --- a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClGemmNativeKernelComponent.cpp +++ b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClGemmNativeKernelComponent.cpp @@ -470,6 +470,32 @@ CLBuildOptions ClGemmNativeKernelComponent::generate_build_options() const return build_opts; } +std::string ClGemmNativeKernelComponent::generate_config_id() const +{ + auto t_dst_info = _blueprint->impl().get_kernel_argument_info(_blueprint->impl().get_dst_id()); + std::string config_id{}; + config_id += (!_bias.is_empty() ? "add_bias_" : ""); // tag only when a bias link is actually present + config_id += (_desc.broadcast_bias ? "broadcast_bias_" : ""); + config_id += (_desc.reinterpret_input_as_3d ? "3di_" : ""); + config_id += (_desc.depth_output_gemm3d > 0 ? 
"3do_" : ""); + config_id += lower_string(string_from_data_type(t_dst_info->data_type())); + config_id += "_"; + config_id += support::cpp11::to_string(t_dst_info->dimension(1)); + config_id += "_"; + config_id += support::cpp11::to_string(t_dst_info->dimension(0)); + config_id += "_"; + config_id += support::cpp11::to_string(_desc.k); + config_id += "_"; + config_id += support::cpp11::to_string(t_dst_info->dimension(2)); + config_id += "_"; + config_id += support::cpp11::to_string(_desc.lhs_info.m0); + config_id += "_"; + config_id += support::cpp11::to_string(_desc.rhs_info.n0); + config_id += "_"; + config_id += support::cpp11::to_string(_desc.rhs_info.k0); + return config_id; +} + ClGemmNativeKernelComponent::TagLUT ClGemmNativeKernelComponent::allocate_vars(SharedVarTable &vtable) const { TagLUT lut{}; diff --git a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClGemmNativeKernelComponent.h b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClGemmNativeKernelComponent.h index 1a1e3e3ce6..b282856b56 100644 --- a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClGemmNativeKernelComponent.h +++ b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClGemmNativeKernelComponent.h @@ -53,6 +53,7 @@ public: Window get_window() const override; ClKernelArgList get_args(); CLBuildOptions generate_build_options() const override; + std::string generate_config_id() const override; virtual std::vector<Link> get_links() const override { diff --git a/src/gpu/cl/kernels/experimental/dynamic_fusion/ClCompositeKernel.h b/src/gpu/cl/kernels/experimental/dynamic_fusion/ClCompositeKernel.h index 29d30003c3..19efb505eb 100644 --- a/src/gpu/cl/kernels/experimental/dynamic_fusion/ClCompositeKernel.h +++ b/src/gpu/cl/kernels/experimental/dynamic_fusion/ClCompositeKernel.h @@ -43,6 +43,10 @@ struct TensorBinding : _binding{ binding } { } + bool empty() const + { + return _binding.empty(); + } 
std::map<ArgumentID, ICLTensor *> _binding; }; class ClCompositeKernel : public opencl::IClKernel diff --git a/src/runtime/CL/CLScheduler.cpp b/src/runtime/CL/CLScheduler.cpp index 18fd52232d..4cff707f1a 100644 --- a/src/runtime/CL/CLScheduler.cpp +++ b/src/runtime/CL/CLScheduler.cpp @@ -181,19 +181,12 @@ void CLScheduler::enqueue_common(ICLKernel &kernel, ITensorPack &tensors, bool f // Run kernel inject_memory ? kernel.run_op(tensors, kernel.window(), _queue) : kernel.run(kernel.window(), _queue); - if(_job_chaining_enabled) { - if(++_job_chaining_count >= _job_chaining_size) - { - _job_chaining_count = 0; - _queue.flush(); - } - } - else if(flush) - { - _queue.flush(); + ++_job_chaining_count; } + + flush_queue(flush); } #if defined(ENABLE_EXPERIMENTAL_DYNAMIC_FUSION) @@ -204,14 +197,31 @@ void CLScheduler::enqueue_common(ICLKernel &kernel, experimental::dynamic_fusion "The CLScheduler is not initialised yet! Please call the CLScheduler::get().default_init(), \ or CLScheduler::get()::init() and CLKernelLibrary::get()::init() function before running functions!"); - const bool inject_memory = !tensors._binding.empty(); + // ClCompositeKernel is stateless thus always requires memory injection + + // Tune the kernel if the CLTuner has been provided + if(_cl_tuner != nullptr) + { + _cl_tuner->tune_kernel_dynamic(kernel, tensors, exec_desc); + } // Run kernel - inject_memory ? 
kernel.run_composite_op(tensors, kernel.window(), _queue, exec_desc) : kernel.run(kernel.window(), _queue); + kernel.run_composite_op(tensors, kernel.window(), _queue, exec_desc); + if(_job_chaining_enabled) + { + ++_job_chaining_count; + } + + flush_queue(flush); +} + +#endif // defined(ENABLE_EXPERIMENTAL_DYNAMIC_FUSION) +void CLScheduler::flush_queue(bool flush) +{ if(_job_chaining_enabled) { - if(++_job_chaining_count >= _job_chaining_size) + if(_job_chaining_count >= _job_chaining_size) { _job_chaining_count = 0; _queue.flush(); @@ -223,8 +233,6 @@ void CLScheduler::enqueue_common(ICLKernel &kernel, experimental::dynamic_fusion } } -#endif // defined(ENABLE_EXPERIMENTAL_DYNAMIC_FUSION) - void CLScheduler::enqueue(ICLKernel &kernel, bool flush) { ITensorPack pack; diff --git a/src/runtime/CL/CLTuner.cpp b/src/runtime/CL/CLTuner.cpp index 5dec81be2c..81fe7dbde6 100644 --- a/src/runtime/CL/CLTuner.cpp +++ b/src/runtime/CL/CLTuner.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2021 Arm Limited. + * Copyright (c) 2017-2022 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -28,6 +28,9 @@ #include "arm_compute/runtime/CL/CLScheduler.h" #include "src/core/CL/ICLKernel.h" #include "support/StringSupport.h" +#if defined(ENABLE_EXPERIMENTAL_DYNAMIC_FUSION) +#include "src/gpu/cl/kernels/experimental/dynamic_fusion/ClCompositeKernel.h" +#endif // defined(ENABLE_EXPERIMENTAL_DYNAMIC_FUSION) #include <cerrno> #include <fstream> @@ -40,6 +43,48 @@ CLTuner::CLTuner(bool tune_new_kernels, CLTuningInfo tuning_info) { } +struct CLTuner::IKernelData +{ + virtual ~IKernelData() = default; + virtual void do_run(ICLKernel &kernel, cl::CommandQueue &queue) = 0; +}; +struct DefaultKernelData : public CLTuner::IKernelData +{ + DefaultKernelData(ITensorPack &tensors) + : _tensors{ tensors } + { + } + ~DefaultKernelData() override = default; + void do_run(ICLKernel &kernel, cl::CommandQueue &queue) override + { + const bool inject_memory = !_tensors.empty(); + inject_memory ? 
kernel.run_op(_tensors, kernel.window(), queue) : kernel.run(kernel.window(), queue); + } + +private: + ITensorPack &_tensors; +}; + +#if defined(ENABLE_EXPERIMENTAL_DYNAMIC_FUSION) +struct CompositeKernelData : public CLTuner::IKernelData +{ + CompositeKernelData(experimental::dynamic_fusion::TensorBinding &tensors, const experimental::dynamic_fusion::ClExecutionDescriptor &exec_desc) + : _tensors{ tensors }, _exec_desc{ exec_desc } + { + } + ~CompositeKernelData() override = default; + void do_run(ICLKernel &kernel, cl::CommandQueue &queue) override + { + // ClCompositeKernel is purely stateless, and thus always requires memory injection + kernel.run_composite_op(_tensors, kernel.window(), queue, _exec_desc); + } + +private: + experimental::dynamic_fusion::TensorBinding &_tensors; + const experimental::dynamic_fusion::ClExecutionDescriptor &_exec_desc; +}; +#endif // defined(ENABLE_EXPERIMENTAL_DYNAMIC_FUSION) + bool CLTuner::kernel_event_is_set() const { return _kernel_event() != nullptr; @@ -74,7 +119,7 @@ void CLTuner::tune_kernel_dynamic(ICLKernel &kernel) tune_kernel_dynamic(kernel, pack); } -void CLTuner::tune_kernel_dynamic(ICLKernel &kernel, ITensorPack &tensors) +void CLTuner::do_tune_kernel_dynamic(ICLKernel &kernel, IKernelData *data) { // Get the configuration ID from the kernel and append GPU target name and number of available compute units const std::string config_id = kernel.config_id() + "_" + string_from_target(kernel.get_target()) + "_MP" + support::cpp11::to_string(CLKernelLibrary::get().get_num_compute_units()); @@ -89,7 +134,7 @@ void CLTuner::tune_kernel_dynamic(ICLKernel &kernel, ITensorPack &tensors) if(_tune_new_kernels) { // Find the optimal LWS for the kernel - CLTuningParams opt_tuning_params = find_optimal_tuning_params(kernel, tensors); + CLTuningParams opt_tuning_params = find_optimal_tuning_params(kernel, data); // Insert the optimal LWS in the table add_tuning_params(config_id, opt_tuning_params); @@ -113,13 +158,28 @@ void 
CLTuner::tune_kernel_dynamic(ICLKernel &kernel, ITensorPack &tensors) } } } +void CLTuner::tune_kernel_dynamic(ICLKernel &kernel, ITensorPack &tensors) +{ + DefaultKernelData data{ tensors }; + + do_tune_kernel_dynamic(kernel, &data); +} + +#if defined(ENABLE_EXPERIMENTAL_DYNAMIC_FUSION) +void CLTuner::tune_kernel_dynamic(ICLKernel &kernel, experimental::dynamic_fusion::TensorBinding &tensors, const experimental::dynamic_fusion::ClExecutionDescriptor &exec_desc) +{ + CompositeKernelData data{ tensors, exec_desc }; + + do_tune_kernel_dynamic(kernel, &data); +} +#endif // defined(ENABLE_EXPERIMENTAL_DYNAMIC_FUSION) void CLTuner::add_tuning_params(const std::string &kernel_id, CLTuningParams optimal_tuning_params) { _tuning_params_table.emplace(kernel_id, optimal_tuning_params); } -CLTuningParams CLTuner::find_optimal_tuning_params(ICLKernel &kernel, ITensorPack &tensors) +CLTuningParams CLTuner::find_optimal_tuning_params(ICLKernel &kernel, IKernelData *data) { // Profiling queue cl::CommandQueue queue_profiler; @@ -174,8 +234,7 @@ CLTuningParams CLTuner::find_optimal_tuning_params(ICLKernel &kernel, ITensorPac cl::NDRange gws = ICLKernel::gws_from_window(kernel.window()); // Run the kernel with default lws to be used as baseline - const bool inject_memory = !tensors.empty(); - inject_memory ? kernel.run_op(tensors, kernel.window(), queue_profiler) : kernel.run(kernel.window(), queue_profiler); + data->do_run(kernel, queue_profiler); queue_profiler.finish(); @@ -211,7 +270,7 @@ CLTuningParams CLTuner::find_optimal_tuning_params(ICLKernel &kernel, ITensorPac } // Run the kernel - inject_memory ? kernel.run_op(tensors, kernel.window(), queue_profiler) : kernel.run(kernel.window(), queue_profiler); + data->do_run(kernel, queue_profiler); queue_profiler.finish(); |