From 47f177e679874dc901888973c5fc237b756b38cb Mon Sep 17 00:00:00 2001 From: SiCong Li Date: Wed, 22 Feb 2023 17:24:09 +0000 Subject: Fix LWS search space used by CLTuner * Ensure CLTuner uses the real GWS used by run(), instead of the static GWS (which is usually changed at run time), by caching GWS in each kernel. Note this is a somewhat inelegant workaround. The real issue stems from the fact that execution window and scheduler are very much coupled with our operator run() / run_op() method. (Please see COMPMID-5934) * Restrict LWS values to explore within GWS bound for exhaustive mode * Refactor gws_from_window() to include all the information required to calculate GWS * Log lws search space used for tuning * Fix ClDirectConv2dKernel config id Resolves COMPMID-5892 Signed-off-by: SiCong Li Change-Id: I420490d8b94d13ada2e44eb0a12078f883379334 Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/9193 Reviewed-by: Gian Marco Iodice Tested-by: Arm Jenkins Comments-Addressed: Arm Jenkins Benchmark: Arm Jenkins --- src/core/CL/ICLKernel.cpp | 31 +++++++++++++++++++++---------- 1 file changed, 21 insertions(+), 10 deletions(-) (limited to 'src/core/CL/ICLKernel.cpp') diff --git a/src/core/CL/ICLKernel.cpp b/src/core/CL/ICLKernel.cpp index 109a076e9a..dc3a86a528 100644 --- a/src/core/CL/ICLKernel.cpp +++ b/src/core/CL/ICLKernel.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2022 Arm Limited. + * Copyright (c) 2016-2023 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -43,7 +43,7 @@ void arm_compute::enqueue(cl::CommandQueue &queue, ICLKernel &kernel, const Wind ARM_COMPUTE_ERROR_ON((i >= 3) && ((window[i].end() - window[i].start()) != 1)); } - cl::NDRange gws = ICLKernel::gws_from_window(window); + cl::NDRange gws = ICLKernel::gws_from_window(window, use_dummy_work_items); // Check for empty NDRange if(gws.dimensions() == 0) @@ -51,12 +51,7 @@ void arm_compute::enqueue(cl::CommandQueue &queue, ICLKernel &kernel, const Wind return; } - // Use dummy work-items - if(use_dummy_work_items) - { - gws.get()[0] = get_next_power_two(gws[0]); - gws.get()[1] = get_next_power_two(gws[1]); - } + kernel.cache_gws(gws); cl::NDRange valid_lws; if(lws_hint[0] * lws_hint[1] * lws_hint[2] > kernel.get_max_workgroup_size()) @@ -190,7 +185,7 @@ size_t ICLKernel::get_max_workgroup_size() return _max_workgroup_size; } -cl::NDRange ICLKernel::gws_from_window(const Window &window) +cl::NDRange ICLKernel::gws_from_window(const Window &window, bool use_dummy_work_items) { if((window.x().end() - window.x().start()) == 0 || (window.y().end() - window.y().start()) == 0) { @@ -201,6 +196,22 @@ cl::NDRange ICLKernel::gws_from_window(const Window &window) (window.y().end() - window.y().start()) / window.y().step(), (window.z().end() - window.z().start()) / window.z().step()); + if(use_dummy_work_items) + { + gws.get()[0] = get_next_power_two(gws[0]); + gws.get()[1] = get_next_power_two(gws[1]); + } + return gws; } -} // namespace arm_compute \ No newline at end of file + +cl::NDRange ICLKernel::get_cached_gws() const +{ + return _cached_gws; +} + +void ICLKernel::cache_gws(const cl::NDRange &gws) +{ + _cached_gws = gws; +} +} // namespace arm_compute -- cgit v1.2.1