aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorSiCong Li <sicong.li@arm.com>2023-02-22 17:24:09 +0000
committerSiCong Li <sicong.li@arm.com>2023-03-06 16:19:11 +0000
commit47f177e679874dc901888973c5fc237b756b38cb (patch)
tree130386717101d0c2440111cb288faa21df8ab151
parentadfcacc8e39888a9a62e33c178041642d0a3047a (diff)
downloadComputeLibrary-47f177e679874dc901888973c5fc237b756b38cb.tar.gz
Fix LWS search space used by CLTuner
* Ensure CLTuner uses the real GWS used by run(), instead of the static GWS (which is usually changed at run time), by caching GWS in each kernel Note this is a somewhat inelegant workaround. The real issue stems from the fact that execution window and scheduler are very much coupled with our operator run() / run_op() method. (Please see COMPMID-5934) * Restrict LWS values to explore within GWS bound for exhaustive mode * Refactor gws_from_window() to include all the information required to calculate GWS * Log lws search space used for tuning * Fix ClDirectConv2dKernel config id Resolves COMPMID-5892 Signed-off-by: SiCong Li <sicong.li@arm.com> Change-Id: I420490d8b94d13ada2e44eb0a12078f883379334 Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/9193 Reviewed-by: Gian Marco Iodice <gianmarco.iodice@arm.com> Tested-by: Arm Jenkins <bsgcomp@arm.com> Comments-Addressed: Arm Jenkins <bsgcomp@arm.com> Benchmark: Arm Jenkins <bsgcomp@arm.com>
-rw-r--r--src/core/CL/ICLKernel.cpp31
-rw-r--r--src/core/CL/ICLKernel.h37
-rw-r--r--src/gpu/cl/kernels/ClDirectConv2dKernel.cpp11
-rw-r--r--src/runtime/CL/CLTuner.cpp18
-rw-r--r--src/runtime/CL/tuners/CLTuningParametersList.cpp27
-rw-r--r--utils/TypePrinter.h30
6 files changed, 122 insertions, 32 deletions
diff --git a/src/core/CL/ICLKernel.cpp b/src/core/CL/ICLKernel.cpp
index 109a076e9a..dc3a86a528 100644
--- a/src/core/CL/ICLKernel.cpp
+++ b/src/core/CL/ICLKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016-2022 Arm Limited.
+ * Copyright (c) 2016-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -43,7 +43,7 @@ void arm_compute::enqueue(cl::CommandQueue &queue, ICLKernel &kernel, const Wind
ARM_COMPUTE_ERROR_ON((i >= 3) && ((window[i].end() - window[i].start()) != 1));
}
- cl::NDRange gws = ICLKernel::gws_from_window(window);
+ cl::NDRange gws = ICLKernel::gws_from_window(window, use_dummy_work_items);
// Check for empty NDRange
if(gws.dimensions() == 0)
@@ -51,12 +51,7 @@ void arm_compute::enqueue(cl::CommandQueue &queue, ICLKernel &kernel, const Wind
return;
}
- // Use dummy work-items
- if(use_dummy_work_items)
- {
- gws.get()[0] = get_next_power_two(gws[0]);
- gws.get()[1] = get_next_power_two(gws[1]);
- }
+ kernel.cache_gws(gws);
cl::NDRange valid_lws;
if(lws_hint[0] * lws_hint[1] * lws_hint[2] > kernel.get_max_workgroup_size())
@@ -190,7 +185,7 @@ size_t ICLKernel::get_max_workgroup_size()
return _max_workgroup_size;
}
-cl::NDRange ICLKernel::gws_from_window(const Window &window)
+cl::NDRange ICLKernel::gws_from_window(const Window &window, bool use_dummy_work_items)
{
if((window.x().end() - window.x().start()) == 0 || (window.y().end() - window.y().start()) == 0)
{
@@ -201,6 +196,22 @@ cl::NDRange ICLKernel::gws_from_window(const Window &window)
(window.y().end() - window.y().start()) / window.y().step(),
(window.z().end() - window.z().start()) / window.z().step());
+ if(use_dummy_work_items)
+ {
+ gws.get()[0] = get_next_power_two(gws[0]);
+ gws.get()[1] = get_next_power_two(gws[1]);
+ }
+
return gws;
}
-} // namespace arm_compute \ No newline at end of file
+
+cl::NDRange ICLKernel::get_cached_gws() const
+{
+ return _cached_gws;
+}
+
+void ICLKernel::cache_gws(const cl::NDRange &gws)
+{
+ _cached_gws = gws;
+}
+} // namespace arm_compute
diff --git a/src/core/CL/ICLKernel.h b/src/core/CL/ICLKernel.h
index 5d5b636cf4..c82809cef3 100644
--- a/src/core/CL/ICLKernel.h
+++ b/src/core/CL/ICLKernel.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016-2022 Arm Limited.
+ * Copyright (c) 2016-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -86,9 +86,16 @@ private:
return 2 + 2 * dimension_size;
}
- cl::NDRange default_lws_tune(const Window &window)
+ /** Get default lws for the kernel
+ *
+ * @param[in] window Execution window used by the kernel
+ * @param[in] use_dummy_work_items If the kernel uses dummy workloads
+ *
+ * @return cl::NDRange
+ */
+ cl::NDRange default_lws_tune(const Window &window, bool use_dummy_work_items)
{
- return get_default_lws_for_type(_type, gws_from_window(window));
+ return get_default_lws_for_type(_type, gws_from_window(window, use_dummy_work_items));
}
using IKernel::configure; //Prevent children from calling IKernel::configure() directly
@@ -115,7 +122,9 @@ protected:
if(is_same_lws(_tuning_params_hint.get_lws(), CLKernelLibrary::get().default_ndrange()))
{
- _tuning_params_hint.set_lws(default_lws_tune(window));
+ // Disable use_dummy_work_items at configure time. Because dummy work items only affect gws size, which
+ // will be recalculated with use_dummy_work_items flag at run time again anyway.
+ _tuning_params_hint.set_lws(default_lws_tune(window, false /* use_dummy_work_items */));
}
IKernel::configure(window);
@@ -124,7 +133,7 @@ protected:
public:
/** Constructor */
ICLKernel()
- : _kernel(nullptr), _target(GPUTarget::MIDGARD), _config_id(arm_compute::default_config_id), _max_workgroup_size(0), _type(CLKernelType::UNKNOWN), _tuning_params_hint()
+ : _kernel(nullptr), _target(GPUTarget::MIDGARD), _config_id(arm_compute::default_config_id), _max_workgroup_size(0), _type(CLKernelType::UNKNOWN), _tuning_params_hint(), _cached_gws(cl::NullRange)
{
}
/** Returns a reference to the OpenCL kernel of this object.
@@ -431,11 +440,24 @@ public:
size_t get_max_workgroup_size();
/** Get the global work size given an execution window
*
- * @param[in] window Execution window
+ * @param[in] window Execution window
+ * @param[in] use_dummy_work_items If the kernel uses dummy work items
*
* @return Global work size of the given execution window
*/
- static cl::NDRange gws_from_window(const Window &window);
+ static cl::NDRange gws_from_window(const Window &window, bool use_dummy_work_items);
+
+ /** Get the cached gws used to enqueue this kernel
+ *
+ * @return Latest global work size of the kernel
+ */
+ cl::NDRange get_cached_gws() const;
+
+ /** Cache the latest gws used to enqueue this kernel
+ *
+ * @param[in] gws Latest global work size of the kernel
+ */
+ void cache_gws(const cl::NDRange &gws);
private:
/** Add the passed array's parameters to the object's kernel's arguments starting from the index idx.
@@ -465,6 +487,7 @@ protected:
CLKernelType _type; /**< The CL kernel type */
private:
CLTuningParams _tuning_params_hint; /**< Tuning parameters hint for the OpenCL kernel */
+ cl::NDRange _cached_gws; /**< Latest GWS used to enqueue this kernel */
};
/** Add the kernel to the command queue with the given window.
diff --git a/src/gpu/cl/kernels/ClDirectConv2dKernel.cpp b/src/gpu/cl/kernels/ClDirectConv2dKernel.cpp
index 5f882e3a28..68d7e30c9b 100644
--- a/src/gpu/cl/kernels/ClDirectConv2dKernel.cpp
+++ b/src/gpu/cl/kernels/ClDirectConv2dKernel.cpp
@@ -367,6 +367,7 @@ void ClDirectConv2dKernel::configure(const CLCompileContext &compile_context, IT
_kernel = create_kernel(compile_context, kernel_name.str(), build_options.options());
// Set config_id for enabling LWS tuning
+ // config_id should include the variables used to parameterize the kernel
_config_id = kernel_name.str();
_config_id += "_";
_config_id += lower_string(string_from_data_type(data_type));
@@ -384,6 +385,16 @@ void ClDirectConv2dKernel::configure(const CLCompileContext &compile_context, IT
_config_id += support::cpp11::to_string(conv_stride_x);
_config_id += "_";
_config_id += support::cpp11::to_string(conv_stride_y);
+ // SRC_CHANNELS, SRC_WIDTH, SRC_HEIGHT
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(src->dimension(channel_idx));
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(src->dimension(width_idx));
+ _config_id += "_";
+ _config_id += support::cpp11::to_string(src->dimension(height_idx));
+ _config_id += "_";
+ // DST_CHANNELS, DST_WIDTH, DST_HEIGHT
+ _config_id += support::cpp11::to_string(dst->dimension(channel_idx));
_config_id += "_";
_config_id += support::cpp11::to_string(dst->dimension(width_idx));
_config_id += "_";
diff --git a/src/runtime/CL/CLTuner.cpp b/src/runtime/CL/CLTuner.cpp
index 1cc20f0c1e..445638f01f 100644
--- a/src/runtime/CL/CLTuner.cpp
+++ b/src/runtime/CL/CLTuner.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2022 Arm Limited.
+ * Copyright (c) 2017-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -26,6 +26,7 @@
#include "arm_compute/core/Error.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
+#include "src/common/utils/Log.h"
#include "src/core/CL/ICLKernel.h"
#include "support/StringSupport.h"
@@ -199,11 +200,19 @@ CLTuningParams CLTuner::find_optimal_tuning_params(ICLKernel &kernel, IKernelDat
};
CLSymbols::get().clEnqueueNDRangeKernel_ptr = interceptor;
- cl::NDRange gws = ICLKernel::gws_from_window(kernel.window());
-
// Run the kernel with default lws to be used as baseline
data->do_run(kernel, queue_profiler);
+ /// Get the cached gws used by the kernel
+ /// NOTE: The window configured inside configure() is usually changed in run(). Thus we should not calculate gws
+ /// from this static window. Instead we get the real gws used (and cached) by run() in the previous step.
+ /// This is only a temporary workaround. An ideal solution involves decoupling the execution window from run() / run_op()
+ /// Please see COMPMID-5934
+ cl::NDRange gws = kernel.get_cached_gws();
+ ARM_COMPUTE_LOG_MSG_WITH_FORMAT_ACL(arm_compute::logging::LogLevel::INFO,
+ "[CLTuner] Kernel with config_id '%s' uses %s as the upper-bound for lws search",
+ kernel.config_id().c_str(), to_string(gws).c_str());
+
queue_profiler.finish();
const cl_ulong start = _kernel_event.getProfilingInfo<CL_PROFILING_COMMAND_START>();
@@ -236,6 +245,9 @@ CLTuningParams CLTuner::find_optimal_tuning_params(ICLKernel &kernel, IKernelDat
cl_int wbsm_test = tuning_test.get_wbsm();
kernel.set_wbsm_hint(wbsm_test);
}
+ ARM_COMPUTE_LOG_MSG_WITH_FORMAT_ACL(arm_compute::logging::LogLevel::INFO,
+ "[CLTuner] Trying LWS: %s, WBSM: %d",
+ to_string(kernel.lws_hint()).c_str(), kernel.wbsm_hint());
// Run the kernel
data->do_run(kernel, queue_profiler);
diff --git a/src/runtime/CL/tuners/CLTuningParametersList.cpp b/src/runtime/CL/tuners/CLTuningParametersList.cpp
index 6cb2212794..6f3e32491a 100644
--- a/src/runtime/CL/tuners/CLTuningParametersList.cpp
+++ b/src/runtime/CL/tuners/CLTuningParametersList.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2021 Arm Limited.
+ * Copyright (c) 2019-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -31,7 +31,7 @@ constexpr unsigned int max_lws_supported_x{ 64u };
constexpr unsigned int max_lws_supported_y{ 32u };
constexpr unsigned int max_lws_supported_z{ 32u };
-/** Non instantiable base class for Tuning parameters combinations that use Index2Cooard mapping */
+/** Non instantiable base class for Tuning parameters combinations that use Index2Coord mapping */
class CLTuningParametersList : public ICLTuningParametersList
{
protected:
@@ -162,10 +162,13 @@ CLTuningParams CLTuningParametersListExhaustive::operator[](size_t index)
CLTuningParametersListExhaustive::CLTuningParametersListExhaustive(const cl::NDRange &gws, CLTuningInfo tuning_info)
{
- ARM_COMPUTE_UNUSED(gws);
- search_space_shape[0] = max_lws_supported_x;
- search_space_shape[1] = max_lws_supported_y;
- search_space_shape[2] = max_lws_supported_z;
+ const auto lws_x_max = std::min(static_cast<unsigned int>(gws[0]), max_lws_supported_x);
+ const auto lws_y_max = std::min(static_cast<unsigned int>(gws[1]), max_lws_supported_y);
+ const auto lws_z_max = std::min(static_cast<unsigned int>(gws[2]), max_lws_supported_z);
+
+ search_space_shape[0] = lws_x_max;
+ search_space_shape[1] = lws_y_max;
+ search_space_shape[2] = lws_z_max;
search_space_shape[3] = 1;
if(tuning_info.tune_wbsm)
{
@@ -183,9 +186,9 @@ CLTuningParams CLTuningParametersListNormal::operator[](size_t index)
CLTuningParametersListNormal::CLTuningParametersListNormal(const cl::NDRange &gws, CLTuningInfo tuning_info)
{
- auto lws_x_max = std::min(static_cast<unsigned int>(gws[0]), max_lws_supported_x);
- auto lws_y_max = std::min(static_cast<unsigned int>(gws[1]), max_lws_supported_y);
- auto lws_z_max = std::min(static_cast<unsigned int>(gws[2]), max_lws_supported_z);
+ const auto lws_x_max = std::min(static_cast<unsigned int>(gws[0]), max_lws_supported_x);
+ const auto lws_y_max = std::min(static_cast<unsigned int>(gws[1]), max_lws_supported_y);
+ const auto lws_z_max = std::min(static_cast<unsigned int>(gws[2]), max_lws_supported_z);
// Initialize the tuning parameters values to test
_lws_x = {};
@@ -227,9 +230,9 @@ void CLTuningParametersListNormal::initialize_lws_values(std::vector<unsigned in
CLTuningParametersListRapid::CLTuningParametersListRapid(const cl::NDRange &gws, CLTuningInfo tuning_info)
{
- auto lws_x_max = std::min(static_cast<unsigned int>(gws[0]), 8u); // Limit exploration to 1 - 8
- auto lws_y_max = std::min(static_cast<unsigned int>(gws[1]), 4u); // Limit exploration to 1 - 4
- auto lws_z_max = std::min(static_cast<unsigned int>(gws[2]), 4u); // Limit exploration to 1 - 4
+ const auto lws_x_max = std::min(static_cast<unsigned int>(gws[0]), 8u); // Limit exploration to 1 - 8
+ const auto lws_y_max = std::min(static_cast<unsigned int>(gws[1]), 4u); // Limit exploration to 1 - 4
+ const auto lws_z_max = std::min(static_cast<unsigned int>(gws[2]), 4u); // Limit exploration to 1 - 4
// Initialize the LWS values to test
_lws_x = {};
diff --git a/utils/TypePrinter.h b/utils/TypePrinter.h
index 448f184432..db27ddccde 100644
--- a/utils/TypePrinter.h
+++ b/utils/TypePrinter.h
@@ -1437,6 +1437,36 @@ inline std::string to_string(ICLTensor *cl_tensor)
{
return to_string(static_cast<const ICLTensor *>(cl_tensor));
}
+
+/** Formatted output of the cl::NDRange type.
+ *
+ * @param[out] os Output stream.
+ * @param[in] nd_range cl::NDRange to output.
+ *
+ * @return Modified output stream.
+ */
+inline ::std::ostream &operator<<(::std::ostream &os, const cl::NDRange &nd_range)
+{
+ os << "{"
+ << nd_range[0] << ","
+ << nd_range[1] << ","
+ << nd_range[2]
+ << "}";
+ return os;
+}
+
+/** Formatted output of the cl::NDRange type
+ *
+ * @param[in] nd_Range Type to output.
+ *
+ * @return Formatted string.
+ */
+inline std::string to_string(const cl::NDRange &nd_range)
+{
+ std::stringstream str;
+ str << nd_range;
+ return str.str();
+}
#endif /* ARM_COMPUTE_OPENCL_ENABLED */
/** Formatted output of the Dimensions type.