aboutsummaryrefslogtreecommitdiff
path: root/src/runtime/CL
diff options
context:
space:
mode:
authorManuel Bottini <manuel.bottini@arm.com>2021-01-25 15:07:17 +0000
committerGeorgios Pinitas <georgios.pinitas@arm.com>2021-02-03 17:35:00 +0000
commitbe9f9f9139b759d314f4f2a6d2ee747079666504 (patch)
tree461690abb95caeaeca40261fd85816a906c8446c /src/runtime/CL
parent7061eb283969f9a020c08349454447564e4dd5b3 (diff)
downloadComputeLibrary-be9f9f9139b759d314f4f2a6d2ee747079666504.tar.gz
Add WBSM tuning to CLTuner
Add WBSM as possible parameter to be tuned Add helper functions to check WBSM support and setting the value in the kernel Update tuning parameter lists to use WBSM Update CLTuner to use WBSM The WBSM tuning is exposed as a parameter to be set at compile time by setting the CLTuningInfo CLTuningInfo contains information about the tuning mode and if wbsm tuning enabled Resolves: COMPMID-3936 Change-Id: Id53697c9c6d2cef41c049f368002f6197351b3ed Signed-off-by: Manuel Bottini <manuel.bottini@arm.com> Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/4914 Comments-Addressed: Arm Jenkins <bsgcomp@arm.com> Tested-by: Arm Jenkins <bsgcomp@arm.com> Reviewed-by: Gian Marco Iodice <gianmarco.iodice@arm.com> Reviewed-by: Georgios Pinitas <georgios.pinitas@arm.com>
Diffstat (limited to 'src/runtime/CL')
-rw-r--r--src/runtime/CL/CLTuner.cpp102
-rw-r--r--src/runtime/CL/tuners/CLTuningParametersList.cpp81
2 files changed, 125 insertions, 58 deletions
diff --git a/src/runtime/CL/CLTuner.cpp b/src/runtime/CL/CLTuner.cpp
index 906021790e..e16d6808ed 100644
--- a/src/runtime/CL/CLTuner.cpp
+++ b/src/runtime/CL/CLTuner.cpp
@@ -32,13 +32,11 @@
#include <cerrno>
#include <fstream>
#include <limits>
-#include <memory>
-#include <string>
namespace arm_compute
{
CLTuner::CLTuner(bool tune_new_kernels, CLTuningInfo tuning_info)
- : real_clEnqueueNDRangeKernel(nullptr), _tuning_params_table(), _lws_table(), _kernel_event(), _tune_new_kernels(tune_new_kernels), _tuning_info(tuning_info), _tuner_mode(CLTunerMode::NORMAL)
+ : real_clEnqueueNDRangeKernel(nullptr), _tuning_params_table(), _lws_table(), _kernel_event(), _tune_new_kernels(tune_new_kernels), _tuning_info(tuning_info)
{
}
@@ -62,12 +60,12 @@ bool CLTuner::tune_new_kernels() const
void CLTuner::set_tuner_mode(CLTunerMode mode)
{
- _tuner_mode = mode;
+ _tuning_info.tuner_mode = mode;
}
CLTunerMode CLTuner::get_tuner_mode() const
{
- return _tuner_mode;
+ return _tuning_info.tuner_mode;
}
void CLTuner::tune_kernel_static(ICLKernel &kernel)
@@ -103,12 +101,20 @@ void CLTuner::tune_kernel_dynamic(ICLKernel &kernel, ITensorPack &tensors)
// Set Local-Workgroup-Size
kernel.set_lws_hint(opt_tuning_params.get_lws());
+ if(_tuning_info.tune_wbsm)
+ {
+ kernel.set_wbsm_hint(opt_tuning_params.get_wbsm());
+ }
}
}
else
{
// Set Local-Workgroup-Size
kernel.set_lws_hint(p->second.get_lws());
+ if(_tuning_info.tune_wbsm)
+ {
+ kernel.set_wbsm_hint(p->second.get_wbsm());
+ }
}
}
}
@@ -188,13 +194,15 @@ CLTuningParams CLTuner::find_optimal_tuning_params(ICLKernel &kernel, ITensorPac
cl_ulong min_exec_time = end - start;
_kernel_event = nullptr;
- cl::NDRange opt_lws = cl::NullRange;
+ CLTuningParams opt_tuning_params(cl::NullRange, 0);
// Construct the list of tuning parameters values to be tested based on the tuner mode.
- auto lws_list = cl_tuner::get_tuning_parameters_list(_tuner_mode, gws);
- for(size_t i = 0; i < lws_list->size(); ++i)
+ auto tuning_list = cl_tuner::get_tuning_parameters_list(_tuning_info, gws);
+ for(size_t i = 0; i < tuning_list->size(); ++i)
{
- cl::NDRange lws_test = (*lws_list)[i].get_lws();
+ CLTuningParams tuning_test = (*tuning_list)[i];
+ // Setting the lws
+ cl::NDRange lws_test = tuning_test.get_lws();
auto x = lws_test[0];
auto y = lws_test[1];
auto z = lws_test[2];
@@ -205,8 +213,12 @@ CLTuningParams CLTuner::find_optimal_tuning_params(ICLKernel &kernel, ITensorPac
continue;
}
- //Set the Local-Workgroup-Size
kernel.set_lws_hint(lws_test);
+ if(_tuning_info.tune_wbsm && CLKernelLibrary::get().is_wbsm_supported())
+ {
+ cl_int wbsm_test = tuning_test.get_wbsm();
+ kernel.set_wbsm_hint(wbsm_test);
+ }
// Run the kernel
inject_memory ? kernel.run_op(tensors, kernel.window(), queue_profiler) : kernel.run(kernel.window(), queue_profiler);
@@ -222,13 +234,17 @@ CLTuningParams CLTuner::find_optimal_tuning_params(ICLKernel &kernel, ITensorPac
if(diff < min_exec_time)
{
min_exec_time = diff;
- opt_lws = cl::NDRange(x, y, z);
+ opt_tuning_params.set_lws(tuning_test.get_lws());
+ if(_tuning_info.tune_wbsm)
+ {
+ opt_tuning_params.set_wbsm(tuning_test.get_wbsm());
+ }
}
}
// Restore real function
CLSymbols::get().clEnqueueNDRangeKernel_ptr = real_clEnqueueNDRangeKernel;
- return CLTuningParams(opt_lws);
+ return opt_tuning_params;
}
void CLTuner::import_lws_table(const std::unordered_map<std::string, cl::NDRange> &lws_table)
@@ -271,34 +287,46 @@ void CLTuner::load_from_file(const std::string &filename)
ARM_COMPUTE_ERROR_VAR("Failed to open '%s' (%s [%d])", filename.c_str(), strerror(errno), errno);
}
std::string line;
+ bool header_line = true;
while(!std::getline(fs, line).fail())
{
- std::istringstream ss(line);
- std::string token;
- if(std::getline(ss, token, ';').fail())
- {
- ARM_COMPUTE_ERROR_VAR("Malformed row '%s' in %s (Should be of the form 'kernel_id;lws[0];lws[1];lws[2]')", ss.str().c_str(), filename.c_str());
- }
- std::string kernel_id = token;
- cl::NDRange lws(1, 1, 1);
- for(int i = 0; i < 3; i++)
+ if(header_line)
{
- if(std::getline(ss, token, ';').fail())
+ header_line = false;
+ size_t pos_lws = line.find("lws");
+ size_t pos_wbsm = line.find("wbsm");
+ _tuning_info.tune_wbsm = false;
+ if(pos_lws != std::string::npos || pos_wbsm != std::string::npos)
{
- ARM_COMPUTE_ERROR_VAR("Malformed row '%s' in %s (Should be of the form 'kernel_id;lws[0];lws[1];lws[2]')", ss.str().c_str(), filename.c_str());
+ // The file has in the first line the parameters it has been tuned on
+ if(pos_wbsm != std::string::npos)
+ {
+ _tuning_info.tune_wbsm = true;
+ }
+ // Once the line with the tuning parameter is read we can
+ // read the next one to start collecting the values
+ if(std::getline(fs, line).fail())
+ {
+ break;
+ }
}
- lws.get()[i] = support::cpp11::stoi(token);
}
- // If all dimensions are 0: reset to NullRange (i.e nullptr)
- if(lws[0] == 0 && lws[1] == 0 && lws[2] == 0)
+ CLTuningParams tuning_params;
+ size_t pos = line.find(";");
+ if(pos == std::string::npos)
{
- lws = cl::NullRange;
+ ARM_COMPUTE_ERROR_VAR("Malformed row '%s' in %s", line.c_str(), filename.c_str());
}
- add_tuning_params(kernel_id, lws);
+ std::string kernel_id = line.substr(0, pos);
+ line.erase(0, pos + 1);
+ if(!tuning_params.from_string(_tuning_info, line))
+ {
+ ARM_COMPUTE_ERROR_VAR("Malformed row '%s' in %s", line.c_str(), filename.c_str());
+ }
+ add_tuning_params(kernel_id, tuning_params);
}
fs.close();
- _tuning_info.tune_lws = true;
}
bool CLTuner::save_to_file(const std::string &filename) const
@@ -307,14 +335,24 @@ bool CLTuner::save_to_file(const std::string &filename) const
{
return false;
}
-
std::ofstream fs;
fs.exceptions(std::ifstream::failbit | std::ifstream::badbit);
fs.open(filename, std::ios::out);
+ std::string header_string = "";
+ header_string += "lws";
+ if(_tuning_info.tune_wbsm)
+ {
+ if(!header_string.empty())
+ {
+ header_string += " ";
+ }
+ header_string += "wbsm";
+ }
+ fs << header_string << std::endl;
for(auto const &kernel_data : _tuning_params_table)
{
- const cl::NDRange lws = CLTuningParams(kernel_data.second).get_lws();
- fs << kernel_data.first << ";" << lws[0] << ";" << lws[1] << ";" << lws[2] << std::endl;
+ CLTuningParams tun_pams(kernel_data.second);
+ fs << kernel_data.first << tun_pams.to_string(_tuning_info) << std::endl;
}
fs.close();
return true;
diff --git a/src/runtime/CL/tuners/CLTuningParametersList.cpp b/src/runtime/CL/tuners/CLTuningParametersList.cpp
index 7f63078192..6cb2212794 100644
--- a/src/runtime/CL/tuners/CLTuningParametersList.cpp
+++ b/src/runtime/CL/tuners/CLTuningParametersList.cpp
@@ -35,8 +35,14 @@ constexpr unsigned int max_lws_supported_z{ 32u };
class CLTuningParametersList : public ICLTuningParametersList
{
protected:
- /* Shape of 3-D search space */
- TensorShape search_space_shape{ 0, 0, 0 };
+ /* Shape of 4-D search space */
+ TensorShape search_space_shape{ 0, 0, 0, 0 };
+ std::vector<unsigned int> _lws_x{ 0 };
+ std::vector<unsigned int> _lws_y{ 0 };
+ std::vector<unsigned int> _lws_z{ 0 };
+ std::vector<int> _wbsm{ 0 }; /* Modify the batches size of workgroups distributed to compute units.
+ The value is in the range [-31,+31].
+ When 0, the runtime-selected wbs used is unmodified. */
/** Constructor */
CLTuningParametersList() = default;
@@ -62,7 +68,7 @@ public:
/** Prevent default constructor calls */
CLTuningParametersListExhaustive() = delete;
/** Constructor */
- CLTuningParametersListExhaustive(const cl::NDRange &gws);
+ CLTuningParametersListExhaustive(const cl::NDRange &gws, CLTuningInfo tuning_info);
/** Copy Constructor */
CLTuningParametersListExhaustive(const CLTuningParametersListExhaustive &) = default;
/** Move Constructor */
@@ -83,7 +89,7 @@ class CLTuningParametersListNormal : public CLTuningParametersList
{
public:
/** Constructor */
- CLTuningParametersListNormal(const cl::NDRange &gws);
+ CLTuningParametersListNormal(const cl::NDRange &gws, CLTuningInfo tuning_info);
/** Copy Constructor */
CLTuningParametersListNormal(const CLTuningParametersListNormal &) = default;
/** Move Constructor */
@@ -98,11 +104,6 @@ public:
// Inherited methods overridden:
CLTuningParams operator[](size_t) override;
-protected:
- std::vector<unsigned int> _lws_x{};
- std::vector<unsigned int> _lws_y{};
- std::vector<unsigned int> _lws_z{};
-
/** Prevent default constructor calls */
CLTuningParametersListNormal() = default;
@@ -125,7 +126,7 @@ public:
/** Prevent default constructor calls */
CLTuningParametersListRapid() = delete;
/** Constructor */
- CLTuningParametersListRapid(const cl::NDRange &gws);
+ CLTuningParametersListRapid(const cl::NDRange &gws, CLTuningInfo tuning_info);
/** Copy Constructor */
CLTuningParametersListRapid(const CLTuningParametersListRapid &) = default;
/** Move Constructor */
@@ -156,36 +157,53 @@ CLTuningParams CLTuningParametersListExhaustive::operator[](size_t index)
{
ARM_COMPUTE_ERROR_ON(index >= size());
auto coords = index2coords(search_space_shape, index);
- return CLTuningParams(coords[0] + 1U, coords[1] + 1U, coords[2] + 1U);
+ return CLTuningParams(coords[0] + 1U, coords[1] + 1U, coords[2] + 1U, static_cast<int>(coords[3]));
}
-CLTuningParametersListExhaustive::CLTuningParametersListExhaustive(const cl::NDRange &gws)
+CLTuningParametersListExhaustive::CLTuningParametersListExhaustive(const cl::NDRange &gws, CLTuningInfo tuning_info)
{
ARM_COMPUTE_UNUSED(gws);
- search_space_shape = TensorShape(max_lws_supported_x,
- max_lws_supported_y,
- max_lws_supported_z);
+ search_space_shape[0] = max_lws_supported_x;
+ search_space_shape[1] = max_lws_supported_y;
+ search_space_shape[2] = max_lws_supported_z;
+ search_space_shape[3] = 1;
+ if(tuning_info.tune_wbsm)
+ {
+ _wbsm = { -3, -2, -1, 0, 1, 2, 3 };
+ search_space_shape[3] = _wbsm.size();
+ }
}
CLTuningParams CLTuningParametersListNormal::operator[](size_t index)
{
ARM_COMPUTE_ERROR_ON(index >= size());
auto coords = index2coords(search_space_shape, index);
- return CLTuningParams(_lws_x[coords[0]], _lws_y[coords[1]], _lws_z[coords[2]]);
+ return CLTuningParams(_lws_x[coords[0]], _lws_y[coords[1]], _lws_z[coords[2]], _wbsm[coords[3]]);
}
-CLTuningParametersListNormal::CLTuningParametersListNormal(const cl::NDRange &gws)
+CLTuningParametersListNormal::CLTuningParametersListNormal(const cl::NDRange &gws, CLTuningInfo tuning_info)
{
auto lws_x_max = std::min(static_cast<unsigned int>(gws[0]), max_lws_supported_x);
auto lws_y_max = std::min(static_cast<unsigned int>(gws[1]), max_lws_supported_y);
auto lws_z_max = std::min(static_cast<unsigned int>(gws[2]), max_lws_supported_z);
- // Initialize the LWS values to test
+ // Initialize the tuning parameters values to test
+ _lws_x = {};
+ _lws_y = {};
+ _lws_z = {};
initialize_lws_values(_lws_x, gws[0], lws_x_max, gws[2] > 16); // Explore lws that are not factors of gws only when gws[2] > 16
initialize_lws_values(_lws_y, gws[1], lws_y_max, gws[2] > 16); // Explore lws that are not factors of gws only when gws[2] > 16
initialize_lws_values(_lws_z, gws[2], lws_z_max, false);
- search_space_shape = TensorShape(_lws_x.size(), _lws_y.size(), _lws_z.size());
+ search_space_shape[0] = _lws_x.size();
+ search_space_shape[1] = _lws_y.size();
+ search_space_shape[2] = _lws_z.size();
+ search_space_shape[3] = 1;
+ if(tuning_info.tune_wbsm)
+ {
+ _wbsm = { -2, -1, 0, 1, 2 };
+ search_space_shape[3] = _wbsm.size();
+ }
}
void CLTuningParametersListNormal::initialize_lws_values(std::vector<unsigned int> &lws, unsigned int gws, unsigned int lws_max, bool mod_let_one)
@@ -207,18 +225,29 @@ void CLTuningParametersListNormal::initialize_lws_values(std::vector<unsigned in
}
}
-CLTuningParametersListRapid::CLTuningParametersListRapid(const cl::NDRange &gws)
+CLTuningParametersListRapid::CLTuningParametersListRapid(const cl::NDRange &gws, CLTuningInfo tuning_info)
{
auto lws_x_max = std::min(static_cast<unsigned int>(gws[0]), 8u); // Limit exploration to 1 - 8
auto lws_y_max = std::min(static_cast<unsigned int>(gws[1]), 4u); // Limit exploration to 1 - 4
auto lws_z_max = std::min(static_cast<unsigned int>(gws[2]), 4u); // Limit exploration to 1 - 4
// Initialize the LWS values to test
+ _lws_x = {};
+ _lws_y = {};
+ _lws_z = {};
initialize_lws_values(_lws_x, lws_x_max);
initialize_lws_values(_lws_y, lws_y_max);
initialize_lws_values(_lws_z, lws_z_max);
- search_space_shape = TensorShape(_lws_x.size(), _lws_y.size(), _lws_z.size());
+ search_space_shape[0] = _lws_x.size();
+ search_space_shape[1] = _lws_y.size();
+ search_space_shape[2] = _lws_z.size();
+ search_space_shape[3] = 1;
+ if(tuning_info.tune_wbsm)
+ {
+ _wbsm = { -1, 0, 1 };
+ search_space_shape[3] = _wbsm.size();
+ }
}
void CLTuningParametersListRapid::initialize_lws_values(std::vector<unsigned int> &lws, unsigned int lws_max)
@@ -231,16 +260,16 @@ void CLTuningParametersListRapid::initialize_lws_values(std::vector<unsigned int
}
}
-std::unique_ptr<ICLTuningParametersList> get_tuning_parameters_list(CLTunerMode mode, const cl::NDRange &gws)
+std::unique_ptr<ICLTuningParametersList> get_tuning_parameters_list(CLTuningInfo tuning_info, const cl::NDRange &gws)
{
- switch(mode)
+ switch(tuning_info.tuner_mode)
{
case CLTunerMode::EXHAUSTIVE:
- return std::make_unique<CLTuningParametersListExhaustive>(gws);
+ return std::make_unique<CLTuningParametersListExhaustive>(gws, tuning_info);
case CLTunerMode::NORMAL:
- return std::make_unique<CLTuningParametersListNormal>(gws);
+ return std::make_unique<CLTuningParametersListNormal>(gws, tuning_info);
case CLTunerMode::RAPID:
- return std::make_unique<CLTuningParametersListRapid>(gws);
+ return std::make_unique<CLTuningParametersListRapid>(gws, tuning_info);
default:
return nullptr;
}