diff options
Diffstat (limited to 'src/runtime')
-rw-r--r-- | src/runtime/CL/CLTuner.cpp | 18 | ||||
-rw-r--r-- | src/runtime/CL/tuners/CLTuningParametersList.cpp | 27 |
2 files changed, 30 insertions, 15 deletions
diff --git a/src/runtime/CL/CLTuner.cpp b/src/runtime/CL/CLTuner.cpp index 1cc20f0c1e..445638f01f 100644 --- a/src/runtime/CL/CLTuner.cpp +++ b/src/runtime/CL/CLTuner.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2022 Arm Limited. + * Copyright (c) 2017-2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -26,6 +26,7 @@ #include "arm_compute/core/Error.h" #include "arm_compute/runtime/CL/CLScheduler.h" +#include "src/common/utils/Log.h" #include "src/core/CL/ICLKernel.h" #include "support/StringSupport.h" @@ -199,11 +200,19 @@ CLTuningParams CLTuner::find_optimal_tuning_params(ICLKernel &kernel, IKernelDat }; CLSymbols::get().clEnqueueNDRangeKernel_ptr = interceptor; - cl::NDRange gws = ICLKernel::gws_from_window(kernel.window()); - // Run the kernel with default lws to be used as baseline data->do_run(kernel, queue_profiler); + /// Get the cached gws used by the kernel + /// NOTE: The window configured inside configure() is usually changed in run(). Thus we should not calculate gws + /// from this static window. Instead we get the real gws used (and cached) by run() in the previous step. + /// This is only a temporary workaround. An ideal solution involves decoupling the execution window from run() / run_op() + /// Please see COMPMID-5934 + cl::NDRange gws = kernel.get_cached_gws(); + ARM_COMPUTE_LOG_MSG_WITH_FORMAT_ACL(arm_compute::logging::LogLevel::INFO, + "[CLTuner] Kernel with config_id '%s' uses %s as the upper-bound for lws search", + kernel.config_id().c_str(), to_string(gws).c_str()); + queue_profiler.finish(); const cl_ulong start = _kernel_event.getProfilingInfo<CL_PROFILING_COMMAND_START>(); @@ -236,6 +245,9 @@ CLTuningParams CLTuner::find_optimal_tuning_params(ICLKernel &kernel, IKernelDat cl_int wbsm_test = tuning_test.get_wbsm(); kernel.set_wbsm_hint(wbsm_test); } + ARM_COMPUTE_LOG_MSG_WITH_FORMAT_ACL(arm_compute::logging::LogLevel::INFO, + "[CLTuner] Trying LWS: %s, WBSM: %d", + to_string(kernel.lws_hint()).c_str(), kernel.wbsm_hint()); // Run the kernel data->do_run(kernel, queue_profiler); diff --git a/src/runtime/CL/tuners/CLTuningParametersList.cpp b/src/runtime/CL/tuners/CLTuningParametersList.cpp index 6cb2212794..6f3e32491a 100644 --- a/src/runtime/CL/tuners/CLTuningParametersList.cpp +++ b/src/runtime/CL/tuners/CLTuningParametersList.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021 Arm Limited. + * Copyright (c) 2019-2021, 2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -31,7 +31,7 @@ constexpr unsigned int max_lws_supported_x{ 64u }; constexpr unsigned int max_lws_supported_y{ 32u }; constexpr unsigned int max_lws_supported_z{ 32u }; -/** Non instantiable base class for Tuning parameters combinations that use Index2Cooard mapping */ +/** Non instantiable base class for Tuning parameters combinations that use Index2Coord mapping */ class CLTuningParametersList : public ICLTuningParametersList { protected: @@ -162,10 +162,13 @@ CLTuningParams CLTuningParametersListExhaustive::operator[](size_t index) CLTuningParametersListExhaustive::CLTuningParametersListExhaustive(const cl::NDRange &gws, CLTuningInfo tuning_info) { - ARM_COMPUTE_UNUSED(gws); - search_space_shape[0] = max_lws_supported_x; - search_space_shape[1] = max_lws_supported_y; - search_space_shape[2] = max_lws_supported_z; + const auto lws_x_max = std::min(static_cast<unsigned int>(gws[0]), max_lws_supported_x); + const auto lws_y_max = std::min(static_cast<unsigned int>(gws[1]), max_lws_supported_y); + const auto lws_z_max = std::min(static_cast<unsigned int>(gws[2]), max_lws_supported_z); + + search_space_shape[0] = lws_x_max; + search_space_shape[1] = lws_y_max; + search_space_shape[2] = lws_z_max; search_space_shape[3] = 1; if(tuning_info.tune_wbsm) { @@ -183,9 +186,9 @@ CLTuningParams CLTuningParametersListNormal::operator[](size_t index) CLTuningParametersListNormal::CLTuningParametersListNormal(const cl::NDRange &gws, CLTuningInfo tuning_info) { - auto lws_x_max = std::min(static_cast<unsigned int>(gws[0]), max_lws_supported_x); - auto lws_y_max = std::min(static_cast<unsigned int>(gws[1]), max_lws_supported_y); - auto lws_z_max = std::min(static_cast<unsigned int>(gws[2]), max_lws_supported_z); + const auto lws_x_max = std::min(static_cast<unsigned int>(gws[0]), max_lws_supported_x); + const auto lws_y_max = std::min(static_cast<unsigned int>(gws[1]), max_lws_supported_y); + const auto lws_z_max = std::min(static_cast<unsigned int>(gws[2]), max_lws_supported_z); // Initialize the tuning parameters values to test _lws_x = {}; @@ -227,9 +230,9 @@ void CLTuningParametersListNormal::initialize_lws_values(std::vector<unsigned in CLTuningParametersListRapid::CLTuningParametersListRapid(const cl::NDRange &gws, CLTuningInfo tuning_info) { - auto lws_x_max = std::min(static_cast<unsigned int>(gws[0]), 8u); // Limit exploration to 1 - 8 - auto lws_y_max = std::min(static_cast<unsigned int>(gws[1]), 4u); // Limit exploration to 1 - 4 - auto lws_z_max = std::min(static_cast<unsigned int>(gws[2]), 4u); // Limit exploration to 1 - 4 + const auto lws_x_max = std::min(static_cast<unsigned int>(gws[0]), 8u); // Limit exploration to 1 - 8 + const auto lws_y_max = std::min(static_cast<unsigned int>(gws[1]), 4u); // Limit exploration to 1 - 4 + const auto lws_z_max = std::min(static_cast<unsigned int>(gws[2]), 4u); // Limit exploration to 1 - 4 // Initialize the LWS values to test _lws_x = {}; |