aboutsummaryrefslogtreecommitdiff
path: root/src/core/CL/ICLKernel.cpp
diff options
context:
space:
mode:
author: SiCong Li <sicong.li@arm.com> 2023-02-22 17:24:09 +0000
committer: SiCong Li <sicong.li@arm.com> 2023-03-06 16:19:11 +0000
commit: 47f177e679874dc901888973c5fc237b756b38cb (patch)
tree: 130386717101d0c2440111cb288faa21df8ab151 /src/core/CL/ICLKernel.cpp
parent: adfcacc8e39888a9a62e33c178041642d0a3047a (diff)
download: ComputeLibrary-47f177e679874dc901888973c5fc237b756b38cb.tar.gz
Fix LWS search space used by CLTuner
* Ensure CLTuner uses the real GWS used by run(), instead of the static GWS (which is usually changed at run time), by caching GWS in each kernel.
  Note this is a somewhat inelegant workaround. The real issue stems from the fact that execution window and scheduler are very much coupled with our operator run() / run_op() method. (Please see COMPMID-5934)
* Restrict LWS values to explore within GWS bound for exhaustive mode
* Refactor gws_from_window() to include all the information required to calculate GWS
* Log lws search space used for tuning
* Fix ClDirectConv2dKernel config id

Resolves COMPMID-5892

Signed-off-by: SiCong Li <sicong.li@arm.com>
Change-Id: I420490d8b94d13ada2e44eb0a12078f883379334
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/9193
Reviewed-by: Gian Marco Iodice <gianmarco.iodice@arm.com>
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
Benchmark: Arm Jenkins <bsgcomp@arm.com>
Diffstat (limited to 'src/core/CL/ICLKernel.cpp')
-rw-r--r-- src/core/CL/ICLKernel.cpp 31
1 files changed, 21 insertions, 10 deletions
diff --git a/src/core/CL/ICLKernel.cpp b/src/core/CL/ICLKernel.cpp
index 109a076e9a..dc3a86a528 100644
--- a/src/core/CL/ICLKernel.cpp
+++ b/src/core/CL/ICLKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016-2022 Arm Limited.
+ * Copyright (c) 2016-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -43,7 +43,7 @@ void arm_compute::enqueue(cl::CommandQueue &queue, ICLKernel &kernel, const Wind
ARM_COMPUTE_ERROR_ON((i >= 3) && ((window[i].end() - window[i].start()) != 1));
}
- cl::NDRange gws = ICLKernel::gws_from_window(window);
+ cl::NDRange gws = ICLKernel::gws_from_window(window, use_dummy_work_items);
// Check for empty NDRange
if(gws.dimensions() == 0)
@@ -51,12 +51,7 @@ void arm_compute::enqueue(cl::CommandQueue &queue, ICLKernel &kernel, const Wind
return;
}
- // Use dummy work-items
- if(use_dummy_work_items)
- {
- gws.get()[0] = get_next_power_two(gws[0]);
- gws.get()[1] = get_next_power_two(gws[1]);
- }
+ kernel.cache_gws(gws);
cl::NDRange valid_lws;
if(lws_hint[0] * lws_hint[1] * lws_hint[2] > kernel.get_max_workgroup_size())
@@ -190,7 +185,7 @@ size_t ICLKernel::get_max_workgroup_size()
return _max_workgroup_size;
}
-cl::NDRange ICLKernel::gws_from_window(const Window &window)
+cl::NDRange ICLKernel::gws_from_window(const Window &window, bool use_dummy_work_items)
{
if((window.x().end() - window.x().start()) == 0 || (window.y().end() - window.y().start()) == 0)
{
@@ -201,6 +196,22 @@ cl::NDRange ICLKernel::gws_from_window(const Window &window)
(window.y().end() - window.y().start()) / window.y().step(),
(window.z().end() - window.z().start()) / window.z().step());
+ if(use_dummy_work_items)
+ {
+ gws.get()[0] = get_next_power_two(gws[0]);
+ gws.get()[1] = get_next_power_two(gws[1]);
+ }
+
return gws;
}
-} // namespace arm_compute \ No newline at end of file
+
+cl::NDRange ICLKernel::get_cached_gws() const
+{
+ return _cached_gws;
+}
+
+void ICLKernel::cache_gws(const cl::NDRange &gws)
+{
+ _cached_gws = gws;
+}
+} // namespace arm_compute