From 47f177e679874dc901888973c5fc237b756b38cb Mon Sep 17 00:00:00 2001 From: SiCong Li Date: Wed, 22 Feb 2023 17:24:09 +0000 Subject: Fix LWS search space used by CLTuner * Ensure CLTuner uses the real GWS used by run(), instead of the static GWS (which is usually changed at run time), by caching GWS in each kernel Note this is a somewhat inelegant workaround. The real issue stems from the fact that execution window and scheduler are very much coupled with our operator run() / run_op() method. (Please see COMPMID-5934) * Restrict LWS values to explore within GWS bound for exhaustive mode * Refactor gws_from_window() to include all the information required to calculate GWS * Log lws search space used for tuning * Fix ClDirectConv2dKernel config id Resolves COMPMID-5892 Signed-off-by: SiCong Li Change-Id: I420490d8b94d13ada2e44eb0a12078f883379334 Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/9193 Reviewed-by: Gian Marco Iodice Tested-by: Arm Jenkins Comments-Addressed: Arm Jenkins Benchmark: Arm Jenkins --- src/core/CL/ICLKernel.cpp | 31 +++++++++++++------- src/core/CL/ICLKernel.h | 37 +++++++++++++++++++----- src/gpu/cl/kernels/ClDirectConv2dKernel.cpp | 11 +++++++ src/runtime/CL/CLTuner.cpp | 18 ++++++++++-- src/runtime/CL/tuners/CLTuningParametersList.cpp | 27 +++++++++-------- utils/TypePrinter.h | 30 +++++++++++++++++++ 6 files changed, 122 insertions(+), 32 deletions(-) diff --git a/src/core/CL/ICLKernel.cpp b/src/core/CL/ICLKernel.cpp index 109a076e9a..dc3a86a528 100644 --- a/src/core/CL/ICLKernel.cpp +++ b/src/core/CL/ICLKernel.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2022 Arm Limited. + * Copyright (c) 2016-2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -43,7 +43,7 @@ void arm_compute::enqueue(cl::CommandQueue &queue, ICLKernel &kernel, const Wind ARM_COMPUTE_ERROR_ON((i >= 3) && ((window[i].end() - window[i].start()) != 1)); } - cl::NDRange gws = ICLKernel::gws_from_window(window); + cl::NDRange gws = ICLKernel::gws_from_window(window, use_dummy_work_items); // Check for empty NDRange if(gws.dimensions() == 0) @@ -51,12 +51,7 @@ void arm_compute::enqueue(cl::CommandQueue &queue, ICLKernel &kernel, const Wind return; } - // Use dummy work-items - if(use_dummy_work_items) - { - gws.get()[0] = get_next_power_two(gws[0]); - gws.get()[1] = get_next_power_two(gws[1]); - } + kernel.cache_gws(gws); cl::NDRange valid_lws; if(lws_hint[0] * lws_hint[1] * lws_hint[2] > kernel.get_max_workgroup_size()) @@ -190,7 +185,7 @@ size_t ICLKernel::get_max_workgroup_size() return _max_workgroup_size; } -cl::NDRange ICLKernel::gws_from_window(const Window &window) +cl::NDRange ICLKernel::gws_from_window(const Window &window, bool use_dummy_work_items) { if((window.x().end() - window.x().start()) == 0 || (window.y().end() - window.y().start()) == 0) { @@ -201,6 +196,22 @@ cl::NDRange ICLKernel::gws_from_window(const Window &window) (window.y().end() - window.y().start()) / window.y().step(), (window.z().end() - window.z().start()) / window.z().step()); + if(use_dummy_work_items) + { + gws.get()[0] = get_next_power_two(gws[0]); + gws.get()[1] = get_next_power_two(gws[1]); + } + return gws; } -} // namespace arm_compute \ No newline at end of file + +cl::NDRange ICLKernel::get_cached_gws() const +{ + return _cached_gws; +} + +void ICLKernel::cache_gws(const cl::NDRange &gws) +{ + _cached_gws = gws; +} +} // namespace arm_compute diff --git a/src/core/CL/ICLKernel.h b/src/core/CL/ICLKernel.h index 5d5b636cf4..c82809cef3 100644 --- a/src/core/CL/ICLKernel.h +++ b/src/core/CL/ICLKernel.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2022 Arm Limited. + * Copyright (c) 2016-2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -86,9 +86,16 @@ private: return 2 + 2 * dimension_size; } - cl::NDRange default_lws_tune(const Window &window) + /** Get default lws for the kernel + * + * @param[in] window Execution window used by the kernel + * @param[in] use_dummy_work_items If the kernel uses dummy workloads + * + * @return cl::NDRange + */ + cl::NDRange default_lws_tune(const Window &window, bool use_dummy_work_items) { - return get_default_lws_for_type(_type, gws_from_window(window)); + return get_default_lws_for_type(_type, gws_from_window(window, use_dummy_work_items)); } using IKernel::configure; //Prevent children from calling IKernel::configure() directly @@ -115,7 +122,9 @@ protected: if(is_same_lws(_tuning_params_hint.get_lws(), CLKernelLibrary::get().default_ndrange())) { - _tuning_params_hint.set_lws(default_lws_tune(window)); + // Disable use_dummy_work_items at configure time. Because dummy work items only affect gws size, which + // will be recalculated with use_dummy_work_items flag at run time again anyway. + _tuning_params_hint.set_lws(default_lws_tune(window, false /* use_dummy_work_items */)); } IKernel::configure(window); @@ -124,7 +133,7 @@ protected: public: /** Constructor */ ICLKernel() - : _kernel(nullptr), _target(GPUTarget::MIDGARD), _config_id(arm_compute::default_config_id), _max_workgroup_size(0), _type(CLKernelType::UNKNOWN), _tuning_params_hint() + : _kernel(nullptr), _target(GPUTarget::MIDGARD), _config_id(arm_compute::default_config_id), _max_workgroup_size(0), _type(CLKernelType::UNKNOWN), _tuning_params_hint(), _cached_gws(cl::NullRange) { } /** Returns a reference to the OpenCL kernel of this object. @@ -431,11 +440,24 @@ public: size_t get_max_workgroup_size(); /** Get the global work size given an execution window * - * @param[in] window Execution window + * @param[in] window Execution window + * @param[in] use_dummy_work_items If the kernel uses dummy work items * * @return Global work size of the given execution window */ - static cl::NDRange gws_from_window(const Window &window); + static cl::NDRange gws_from_window(const Window &window, bool use_dummy_work_items); + + /** Get the cached gws used to enqueue this kernel + * + * @return Latest global work size of the kernel + */ + cl::NDRange get_cached_gws() const; + + /** Cache the latest gws used to enqueue this kernel + * + * @param[in] gws Latest global work size of the kernel + */ + void cache_gws(const cl::NDRange &gws); private: /** Add the passed array's parameters to the object's kernel's arguments starting from the index idx. @@ -465,6 +487,7 @@ protected: CLKernelType _type; /**< The CL kernel type */ private: CLTuningParams _tuning_params_hint; /**< Tuning parameters hint for the OpenCL kernel */ + cl::NDRange _cached_gws; /**< Latest GWS used to enqueue this kernel */ }; /** Add the kernel to the command queue with the given window. diff --git a/src/gpu/cl/kernels/ClDirectConv2dKernel.cpp b/src/gpu/cl/kernels/ClDirectConv2dKernel.cpp index 5f882e3a28..68d7e30c9b 100644 --- a/src/gpu/cl/kernels/ClDirectConv2dKernel.cpp +++ b/src/gpu/cl/kernels/ClDirectConv2dKernel.cpp @@ -367,6 +367,7 @@ void ClDirectConv2dKernel::configure(const CLCompileContext &compile_context, IT _kernel = create_kernel(compile_context, kernel_name.str(), build_options.options()); // Set config_id for enabling LWS tuning + // config_id should include the variables used to parameterize the kernel _config_id = kernel_name.str(); _config_id += "_"; _config_id += lower_string(string_from_data_type(data_type)); @@ -384,6 +385,16 @@ void ClDirectConv2dKernel::configure(const CLCompileContext &compile_context, IT _config_id += support::cpp11::to_string(conv_stride_x); _config_id += "_"; _config_id += support::cpp11::to_string(conv_stride_y); + // SRC_CHANNELS, SRC_WIDTH, SRC_HEIGHT + _config_id += "_"; + _config_id += support::cpp11::to_string(src->dimension(channel_idx)); + _config_id += "_"; + _config_id += support::cpp11::to_string(src->dimension(width_idx)); + _config_id += "_"; + _config_id += support::cpp11::to_string(src->dimension(height_idx)); + _config_id += "_"; + // DST_CHANNELS, DST_WIDTH, DST_HEIGHT + _config_id += support::cpp11::to_string(dst->dimension(channel_idx)); _config_id += "_"; _config_id += support::cpp11::to_string(dst->dimension(width_idx)); _config_id += "_"; diff --git a/src/runtime/CL/CLTuner.cpp b/src/runtime/CL/CLTuner.cpp index 1cc20f0c1e..445638f01f 100644 --- a/src/runtime/CL/CLTuner.cpp +++ b/src/runtime/CL/CLTuner.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2022 Arm Limited. + * Copyright (c) 2017-2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -26,6 +26,7 @@ #include "arm_compute/core/Error.h" #include "arm_compute/runtime/CL/CLScheduler.h" +#include "src/common/utils/Log.h" #include "src/core/CL/ICLKernel.h" #include "support/StringSupport.h" @@ -199,11 +200,19 @@ CLTuningParams CLTuner::find_optimal_tuning_params(ICLKernel &kernel, IKernelDat }; CLSymbols::get().clEnqueueNDRangeKernel_ptr = interceptor; - cl::NDRange gws = ICLKernel::gws_from_window(kernel.window()); - // Run the kernel with default lws to be used as baseline data->do_run(kernel, queue_profiler); + /// Get the cached gws used by the kernel + /// NOTE: The window configured inside configure() is usually changed in run(). Thus we should not calculate gws + /// from this static window. Instead we get the real gws used (and cached) by run() in the previous step. + /// This is only a temporary workaround. An ideal solution involves decoupling the execution window from run() / run_op() + /// Please see COMPMID-5934 + cl::NDRange gws = kernel.get_cached_gws(); + ARM_COMPUTE_LOG_MSG_WITH_FORMAT_ACL(arm_compute::logging::LogLevel::INFO, + "[CLTuner] Kernel with config_id '%s' uses %s as the upper-bound for lws search", + kernel.config_id().c_str(), to_string(gws).c_str()); + queue_profiler.finish(); const cl_ulong start = _kernel_event.getProfilingInfo(); @@ -236,6 +245,9 @@ CLTuningParams CLTuner::find_optimal_tuning_params(ICLKernel &kernel, IKernelDat cl_int wbsm_test = tuning_test.get_wbsm(); kernel.set_wbsm_hint(wbsm_test); } + ARM_COMPUTE_LOG_MSG_WITH_FORMAT_ACL(arm_compute::logging::LogLevel::INFO, + "[CLTuner] Trying LWS: %s, WBSM: %d", + to_string(kernel.lws_hint()).c_str(), kernel.wbsm_hint()); // Run the kernel data->do_run(kernel, queue_profiler); diff --git a/src/runtime/CL/tuners/CLTuningParametersList.cpp b/src/runtime/CL/tuners/CLTuningParametersList.cpp index 6cb2212794..6f3e32491a 100644 --- a/src/runtime/CL/tuners/CLTuningParametersList.cpp +++ b/src/runtime/CL/tuners/CLTuningParametersList.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021 Arm Limited. + * Copyright (c) 2019-2021, 2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -31,7 +31,7 @@ constexpr unsigned int max_lws_supported_x{ 64u }; constexpr unsigned int max_lws_supported_y{ 32u }; constexpr unsigned int max_lws_supported_z{ 32u }; -/** Non instantiable base class for Tuning parameters combinations that use Index2Cooard mapping */ +/** Non instantiable base class for Tuning parameters combinations that use Index2Coord mapping */ class CLTuningParametersList : public ICLTuningParametersList { protected: @@ -162,10 +162,13 @@ CLTuningParams CLTuningParametersListExhaustive::operator[](size_t index) CLTuningParametersListExhaustive::CLTuningParametersListExhaustive(const cl::NDRange &gws, CLTuningInfo tuning_info) { - ARM_COMPUTE_UNUSED(gws); - search_space_shape[0] = max_lws_supported_x; - search_space_shape[1] = max_lws_supported_y; - search_space_shape[2] = max_lws_supported_z; + const auto lws_x_max = std::min(static_cast(gws[0]), max_lws_supported_x); + const auto lws_y_max = std::min(static_cast(gws[1]), max_lws_supported_y); + const auto lws_z_max = std::min(static_cast(gws[2]), max_lws_supported_z); + + search_space_shape[0] = lws_x_max; + search_space_shape[1] = lws_y_max; + search_space_shape[2] = lws_z_max; search_space_shape[3] = 1; if(tuning_info.tune_wbsm) { @@ -183,9 +186,9 @@ CLTuningParams CLTuningParametersListNormal::operator[](size_t index) CLTuningParametersListNormal::CLTuningParametersListNormal(const cl::NDRange &gws, CLTuningInfo tuning_info) { - auto lws_x_max = std::min(static_cast(gws[0]), max_lws_supported_x); - auto lws_y_max = std::min(static_cast(gws[1]), max_lws_supported_y); - auto lws_z_max = std::min(static_cast(gws[2]), max_lws_supported_z); + const auto lws_x_max = std::min(static_cast(gws[0]), max_lws_supported_x); + const auto lws_y_max = std::min(static_cast(gws[1]), max_lws_supported_y); + const auto lws_z_max = std::min(static_cast(gws[2]), max_lws_supported_z); // Initialize the tuning parameters values to test _lws_x = {}; @@ -227,9 +230,9 @@ void CLTuningParametersListNormal::initialize_lws_values(std::vector(gws[0]), 8u); // Limit exploration to 1 - 8 - auto lws_y_max = std::min(static_cast(gws[1]), 4u); // Limit exploration to 1 - 4 - auto lws_z_max = std::min(static_cast(gws[2]), 4u); // Limit exploration to 1 - 4 + const auto lws_x_max = std::min(static_cast(gws[0]), 8u); // Limit exploration to 1 - 8 + const auto lws_y_max = std::min(static_cast(gws[1]), 4u); // Limit exploration to 1 - 4 + const auto lws_z_max = std::min(static_cast(gws[2]), 4u); // Limit exploration to 1 - 4 // Initialize the LWS values to test _lws_x = {}; diff --git a/utils/TypePrinter.h b/utils/TypePrinter.h index 448f184432..db27ddccde 100644 --- a/utils/TypePrinter.h +++ b/utils/TypePrinter.h @@ -1437,6 +1437,36 @@ inline std::string to_string(ICLTensor *cl_tensor) { return to_string(static_cast(cl_tensor)); } + +/** Formatted output of the cl::NDRange type. + * + * @param[out] os Output stream. + * @param[in] nd_range cl::NDRange to output. + * + * @return Modified output stream. + */ +inline ::std::ostream &operator<<(::std::ostream &os, const cl::NDRange &nd_range) +{ + os << "{" + << nd_range[0] << "," + << nd_range[1] << "," + << nd_range[2] + << "}"; + return os; +} + +/** Formatted output of the cl::NDRange type + * + * @param[in] nd_Range Type to output. + * + * @return Formatted string. + */ +inline std::string to_string(const cl::NDRange &nd_range) +{ + std::stringstream str; + str << nd_range; + return str.str(); +} #endif /* ARM_COMPUTE_OPENCL_ENABLED */ /** Formatted output of the Dimensions type. -- cgit v1.2.1