From 47f177e679874dc901888973c5fc237b756b38cb Mon Sep 17 00:00:00 2001 From: SiCong Li Date: Wed, 22 Feb 2023 17:24:09 +0000 Subject: Fix LWS search space used by CLTuner * Ensure CLTuner uses the real GWS used by run(), instead of the static GWS (which is usually changed at run time), by caching GWS in each kernel Note this is a somewhat inelegant workaround. The real issue stems from the fact that execution window and scheduler are very much coupled with our operator run() / run_op() method. (Please see COMPMID-5934) * Restrict LWS values to explore within GWS bound for exhaustive mode * Refactor gws_from_window() to include all the information required to calculate GWS * Log lws search space used for tuning * Fix ClDirectConv2dKernel config id Resolves COMPMID-5892 Signed-off-by: SiCong Li Change-Id: I420490d8b94d13ada2e44eb0a12078f883379334 Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/9193 Reviewed-by: Gian Marco Iodice Tested-by: Arm Jenkins Comments-Addressed: Arm Jenkins Benchmark: Arm Jenkins --- src/runtime/CL/CLTuner.cpp | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) (limited to 'src/runtime/CL/CLTuner.cpp') diff --git a/src/runtime/CL/CLTuner.cpp b/src/runtime/CL/CLTuner.cpp index 1cc20f0c1e..445638f01f 100644 --- a/src/runtime/CL/CLTuner.cpp +++ b/src/runtime/CL/CLTuner.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2022 Arm Limited. + * Copyright (c) 2017-2023 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -26,6 +26,7 @@ #include "arm_compute/core/Error.h" #include "arm_compute/runtime/CL/CLScheduler.h" +#include "src/common/utils/Log.h" #include "src/core/CL/ICLKernel.h" #include "support/StringSupport.h" @@ -199,11 +200,19 @@ CLTuningParams CLTuner::find_optimal_tuning_params(ICLKernel &kernel, IKernelDat }; CLSymbols::get().clEnqueueNDRangeKernel_ptr = interceptor; - cl::NDRange gws = ICLKernel::gws_from_window(kernel.window()); - // Run the kernel with default lws to be used as baseline data->do_run(kernel, queue_profiler); + /// Get the cached gws used by the kernel + /// NOTE: The window configured inside configure() is usually changed in run(). Thus we should not calculate gws + /// from this static window. Instead we get the real gws used (and cached) by run() in the previous step. + /// This is only a temporary workaround. An ideal solution involves decoupling the execution window from run() / run_op() + /// Please see COMPMID-5934 + cl::NDRange gws = kernel.get_cached_gws(); + ARM_COMPUTE_LOG_MSG_WITH_FORMAT_ACL(arm_compute::logging::LogLevel::INFO, + "[CLTuner] Kernel with config_id '%s' uses %s as the upper-bound for lws search", + kernel.config_id().c_str(), to_string(gws).c_str()); + queue_profiler.finish(); const cl_ulong start = _kernel_event.getProfilingInfo<CL_PROFILING_COMMAND_START>(); @@ -236,6 +245,9 @@ CLTuningParams CLTuner::find_optimal_tuning_params(ICLKernel &kernel, IKernelDat cl_int wbsm_test = tuning_test.get_wbsm(); kernel.set_wbsm_hint(wbsm_test); } + ARM_COMPUTE_LOG_MSG_WITH_FORMAT_ACL(arm_compute::logging::LogLevel::INFO, + "[CLTuner] Trying LWS: %s, WBSM: %d", + to_string(kernel.lws_hint()).c_str(), kernel.wbsm_hint()); // Run the kernel data->do_run(kernel, queue_profiler); -- cgit v1.2.1