aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--arm_compute/core/CL/CLCompileContext.h9
-rw-r--r--arm_compute/core/CL/CLHelpers.h22
-rw-r--r--arm_compute/core/CL/CLKernelLibrary.h6
-rw-r--r--arm_compute/core/CL/OpenCL.h3
-rw-r--r--arm_compute/runtime/CL/CLTuner.h1
-rw-r--r--arm_compute/runtime/CL/CLTunerTypes.h5
-rw-r--r--arm_compute/runtime/CL/CLTuningParams.h85
-rw-r--r--arm_compute/runtime/CL/tuners/CLTuningParametersList.h5
-rw-r--r--src/core/CL/CLCompileContext.cpp19
-rw-r--r--src/core/CL/CLHelpers.cpp24
-rw-r--r--src/core/CL/CLKernelLibrary.cpp5
-rw-r--r--src/core/CL/ICLKernel.cpp11
-rw-r--r--src/core/CL/ICLKernel.h51
-rw-r--r--src/core/CL/OpenCL.cpp20
-rw-r--r--src/runtime/CL/CLTuner.cpp102
-rw-r--r--src/runtime/CL/tuners/CLTuningParametersList.cpp81
16 files changed, 359 insertions, 90 deletions
diff --git a/arm_compute/core/CL/CLCompileContext.h b/arm_compute/core/CL/CLCompileContext.h
index 6f6dc18b85..46a8c9b341 100644
--- a/arm_compute/core/CL/CLCompileContext.h
+++ b/arm_compute/core/CL/CLCompileContext.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2020 Arm Limited.
+ * Copyright (c) 2020-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -296,6 +296,12 @@ public:
*/
bool int64_base_atomics_supported() const;
+ /* Returns true if the workgroup batch size modifier parameter is supported on the cl device
+ *
+ * @return true if the workgroup batch size modifier parameter is supported, false otherwise
+ */
+ bool is_wbsm_supported() const;
+
private:
/** Load program and its dependencies.
*
@@ -327,6 +333,7 @@ private:
CLDevice _device; /**< Underlying CL device. */
mutable std::map<std::string, const Program> _programs_map; /**< Map with all already loaded program data. */
mutable std::map<std::string, cl::Program> _built_programs_map; /**< Map with all already built program data. */
+ bool _is_wbsm_supported; /**< Support of worksize batch size modifier support boolean*/
};
} // namespace arm_compute
#endif /* ARM_COMPUTE_CLCOMPILECONTEXT_H */
diff --git a/arm_compute/core/CL/CLHelpers.h b/arm_compute/core/CL/CLHelpers.h
index cf18e16e34..0e9aa5d6e5 100644
--- a/arm_compute/core/CL/CLHelpers.h
+++ b/arm_compute/core/CL/CLHelpers.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016-2020 Arm Limited.
+ * Copyright (c) 2016-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -30,6 +30,11 @@
#include <set>
#include <string>
+/* CL Device capabilities */
+#define ARM_COMPUTE_LIBRARY_OPENCL_DEVICE_CAPABILITIES_ARM 0x41E4
+/* Workgroup Batch Size Modifier */
+#define ARM_COMPUTE_LIBRARY_OPENCL_EXEC_WBSM_ARM 0x41E6
+
namespace arm_compute
{
class CLCoreRuntimeContext;
@@ -226,5 +231,20 @@ cl::Kernel create_kernel(const CLCompileContext &ctx, const std::string &kernel_
*/
cl::NDRange create_lws_hint_parallel_implementations(unsigned int input_dimension, unsigned int vector_size);
+/* Helper function to check if the workgroup batch size modifier parameter is supported on the cl device
+ *
+ * @param[in] device cl device to check for support
+ *
+ * @return true if the workgroup batch size modifier parameter is supported, false otherwise
+ */
+bool get_wbsm_support_info(const cl::Device &device);
+
+/* Helper function to set the workgroup batch size modifier parameter in the kernel
+ *
+ * @param[in] kernel cl kernel to set the workgroup batch size modifier parameter
+ * @param[in] wbsm_hint workgroup batch size modifier to use
+ */
+void set_wbsm(cl::Kernel &kernel, cl_int wbsm_hint);
+
} // namespace arm_compute
#endif /* ARM_COMPUTE_CLHELPERS_H */
diff --git a/arm_compute/core/CL/CLKernelLibrary.h b/arm_compute/core/CL/CLKernelLibrary.h
index 193389388e..0d8e4a6164 100644
--- a/arm_compute/core/CL/CLKernelLibrary.h
+++ b/arm_compute/core/CL/CLKernelLibrary.h
@@ -148,6 +148,12 @@ public:
*/
std::string get_program_name(const std::string &kernel_name) const;
+ /* Returns true if the workgroup batch size modifier parameter is supported on the cl device
+ *
+ * @return true if the workgroup batch size modifier parameter is supported, false otherwise
+ */
+ bool is_wbsm_supported();
+
/** Sets the CL context used to create programs.
*
* @note Setting the context also resets the device to the
diff --git a/arm_compute/core/CL/OpenCL.h b/arm_compute/core/CL/OpenCL.h
index f9796d7e95..155c3e4eef 100644
--- a/arm_compute/core/CL/OpenCL.h
+++ b/arm_compute/core/CL/OpenCL.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016-2020 Arm Limited.
+ * Copyright (c) 2016-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -135,6 +135,7 @@ public:
DECLARE_FUNCTION_PTR(clEnqueueMarker);
DECLARE_FUNCTION_PTR(clWaitForEvents);
DECLARE_FUNCTION_PTR(clCreateImage);
+ DECLARE_FUNCTION_PTR(clSetKernelExecInfo);
// Third-party extensions
DECLARE_FUNCTION_PTR(clImportMemoryARM);
diff --git a/arm_compute/runtime/CL/CLTuner.h b/arm_compute/runtime/CL/CLTuner.h
index 9814867142..e1c98bf411 100644
--- a/arm_compute/runtime/CL/CLTuner.h
+++ b/arm_compute/runtime/CL/CLTuner.h
@@ -182,7 +182,6 @@ private:
cl::Event _kernel_event;
bool _tune_new_kernels;
CLTuningInfo _tuning_info;
- CLTunerMode _tuner_mode;
};
} // namespace arm_compute
#endif /*ARM_COMPUTE_CLTUNER_H */
diff --git a/arm_compute/runtime/CL/CLTunerTypes.h b/arm_compute/runtime/CL/CLTunerTypes.h
index 49e2d615ea..e93ef5b2b3 100644
--- a/arm_compute/runtime/CL/CLTunerTypes.h
+++ b/arm_compute/runtime/CL/CLTunerTypes.h
@@ -42,7 +42,10 @@ enum class CLTunerMode
/**< OpenCL tuner tuning information */
struct CLTuningInfo
{
- bool tune_lws = true;
+ CLTunerMode tuner_mode = CLTunerMode::NORMAL; /**< Parameter to select the level (granularity) of the tuning */
+ bool tune_wbsm = false; /**< Flag to tune the batches of work groups distributed to compute units.
+ Internally, the library will check if this feature is available on
+ the target platform */
};
/** Converts a string to a strong types enumeration @ref CLTunerMode
diff --git a/arm_compute/runtime/CL/CLTuningParams.h b/arm_compute/runtime/CL/CLTuningParams.h
index 99a386638d..b50481336b 100644
--- a/arm_compute/runtime/CL/CLTuningParams.h
+++ b/arm_compute/runtime/CL/CLTuningParams.h
@@ -25,6 +25,10 @@
#define ARM_COMPUTE_CLTUNING_PARAMS_H
#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/runtime/CL/CLTunerTypes.h"
+#include "support/StringSupport.h"
+
+#include <ostream>
namespace arm_compute
{
@@ -34,26 +38,95 @@ class CLTuningParams
public:
CLTuningParams(const CLTuningParams &) = default;
- CLTuningParams(unsigned int lws_x = 0, unsigned int lws_y = 0, unsigned int lws_z = 0)
- : _lws(lws_x, lws_y, lws_z)
+ CLTuningParams(unsigned int lws_x = 0, unsigned int lws_y = 0, unsigned int lws_z = 0, int wbsm = 0)
+ : _lws(lws_x, lws_y, lws_z), _wbsm(wbsm)
{
}
- CLTuningParams(cl::NDRange lws)
- : _lws(lws)
+ CLTuningParams(cl::NDRange lws, cl_int wbsm = 0)
+ : _lws(lws), _wbsm(wbsm)
{
}
- void set_lws(cl::NDRange &lws)
+
+ CLTuningParams(cl_int wbsm)
+ : CLTuningParams(cl::NullRange, wbsm)
+ {
+ }
+
+ void set_lws(cl::NDRange lws)
{
_lws = lws;
}
- cl::NDRange get_lws()
+ cl::NDRange get_lws() const
{
return _lws;
}
+ void set_wbsm(cl_int wbsm)
+ {
+ _wbsm = wbsm;
+ }
+
+ cl_int get_wbsm() const
+ {
+ return _wbsm;
+ }
+
+ std::string to_string(CLTuningInfo tuning_info)
+ {
+ std::string tuning_params_string = "";
+ tuning_params_string += ";" + support::cpp11::to_string(_lws[0]) + ";" + support::cpp11::to_string(_lws[1]) + ";" + support::cpp11::to_string(_lws[2]);
+ if(tuning_info.tune_wbsm)
+ {
+ tuning_params_string += ";" + support::cpp11::to_string(_wbsm);
+ }
+ return tuning_params_string;
+ }
+
+ bool from_string(CLTuningInfo tuning_info, std::string tuning_params_string)
+ {
+ std::replace(tuning_params_string.begin(), tuning_params_string.end(), ';', ' ');
+ std::vector<std::string> array;
+ std::stringstream ss(tuning_params_string);
+ std::string temp;
+ while(ss >> temp)
+ {
+ array.push_back(temp);
+ }
+ // Read 3 values for lws
+ if(array.size() < 3)
+ {
+ return false;
+ }
+ const unsigned int lws_0 = support::cpp11::stoi(array[0]);
+ const unsigned int lws_1 = support::cpp11::stoi(array[1]);
+ const unsigned int lws_2 = support::cpp11::stoi(array[2]);
+ if(lws_0 == 0 && lws_1 == 0 && lws_2 == 0)
+ {
+ // If lws values are 0, cl::NullRange has to be used
+ // otherwise the lws object will be badly created
+ _lws = cl::NullRange;
+ }
+ else
+ {
+ _lws = cl::NDRange(lws_0, lws_1, lws_2);
+ }
+ array.erase(array.begin(), array.begin() + 3);
+ if(tuning_info.tune_wbsm)
+ {
+ if(array.size() < 1)
+ {
+ return false;
+ }
+ _wbsm = support::cpp11::stoi(array[0]);
+ array.erase(array.begin());
+ }
+ return true;
+ }
+
private:
cl::NDRange _lws;
+ cl_int _wbsm;
};
} // namespace arm_compute
#endif /*ARM_COMPUTE_CLTUNING_PARAMS_H */
diff --git a/arm_compute/runtime/CL/tuners/CLTuningParametersList.h b/arm_compute/runtime/CL/tuners/CLTuningParametersList.h
index c51b9901ef..69572c98d2 100644
--- a/arm_compute/runtime/CL/tuners/CLTuningParametersList.h
+++ b/arm_compute/runtime/CL/tuners/CLTuningParametersList.h
@@ -77,9 +77,12 @@ public:
/** Construct an ICLTuningParametersList object for the given tuner mode and gws configuration.
*
+ * @param[in] tuning_info Tuning info containng which parameters to tune and the tuner mode
+ * @param[in] gws Global worksize values
+ *
* @return unique_ptr to the requested ICLTuningParametersList implementation.
*/
-std::unique_ptr<ICLTuningParametersList> get_tuning_parameters_list(CLTunerMode mode, const cl::NDRange &gws);
+std::unique_ptr<ICLTuningParametersList> get_tuning_parameters_list(CLTuningInfo tuning_info, const cl::NDRange &gws);
} // namespace cl_tuner
} // namespace arm_compute
diff --git a/src/core/CL/CLCompileContext.cpp b/src/core/CL/CLCompileContext.cpp
index 0afb7e5e0e..3db0fe515a 100644
--- a/src/core/CL/CLCompileContext.cpp
+++ b/src/core/CL/CLCompileContext.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2020 Arm Limited.
+ * Copyright (c) 2020-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -137,15 +137,16 @@ Kernel::Kernel(std::string name, const cl::Program &program)
{
}
CLCompileContext::CLCompileContext()
- : _context(), _device(), _programs_map(), _built_programs_map()
+ : _context(), _device(), _programs_map(), _built_programs_map(), _is_wbsm_supported()
{
}
CLCompileContext::CLCompileContext(cl::Context context, const cl::Device &device)
- : _context(), _device(), _programs_map(), _built_programs_map()
+ : _context(), _device(), _programs_map(), _built_programs_map(), _is_wbsm_supported()
{
- _context = std::move(context);
- _device = CLDevice(device);
+ _context = std::move(context);
+ _device = CLDevice(device);
+ _is_wbsm_supported = get_wbsm_support_info(device);
}
Kernel CLCompileContext::create_kernel(const std::string &kernel_name, const std::string &program_name, const std::string &program_source,
@@ -318,7 +319,8 @@ const cl::Device &CLCompileContext::get_device() const
void CLCompileContext::set_device(cl::Device device)
{
- _device = std::move(device);
+ _device = std::move(device);
+ _is_wbsm_supported = get_wbsm_support_info(device);
}
cl::NDRange CLCompileContext::default_ndrange() const
@@ -346,6 +348,11 @@ bool CLCompileContext::int64_base_atomics_supported() const
return _device.supported("cl_khr_int64_base_atomics");
}
+bool CLCompileContext::is_wbsm_supported() const
+{
+ return _is_wbsm_supported;
+}
+
size_t CLCompileContext::max_local_workgroup_size(const cl::Kernel &kernel) const
{
size_t result;
diff --git a/src/core/CL/CLHelpers.cpp b/src/core/CL/CLHelpers.cpp
index 895bb72827..aff897738a 100644
--- a/src/core/CL/CLHelpers.cpp
+++ b/src/core/CL/CLHelpers.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016-2020 Arm Limited.
+ * Copyright (c) 2016-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -415,4 +415,26 @@ cl::NDRange create_lws_hint_parallel_implementations(unsigned int input_dimensio
const unsigned int num_of_threads = ((input_dimension + border_width) / 16);
return cl::NDRange(std::min(8U, num_of_threads));
}
+
+bool get_wbsm_support_info(const cl::Device &device)
+{
+ cl_bitfield capabilities = 0;
+ cl_int err = clGetDeviceInfo(device.get(), ARM_COMPUTE_LIBRARY_OPENCL_DEVICE_CAPABILITIES_ARM, sizeof(cl_bitfield), &capabilities, nullptr);
+ if((err == CL_SUCCESS) && (capabilities & ARM_COMPUTE_LIBRARY_OPENCL_EXEC_WBSM_ARM))
+ {
+ return true;
+ }
+ return false;
+}
+
+void set_wbsm(cl::Kernel &kernel, cl_int wbsm_hint)
+{
+ cl_int err = clSetKernelExecInfo(kernel.get(),
+ ARM_COMPUTE_LIBRARY_OPENCL_EXEC_WBSM_ARM,
+ sizeof(cl_int),
+ &wbsm_hint);
+ ARM_COMPUTE_UNUSED(err);
+ ARM_COMPUTE_ERROR_ON(err != CL_SUCCESS);
+}
+
} // namespace arm_compute
diff --git a/src/core/CL/CLKernelLibrary.cpp b/src/core/CL/CLKernelLibrary.cpp
index cf1c52e463..75f76ea344 100644
--- a/src/core/CL/CLKernelLibrary.cpp
+++ b/src/core/CL/CLKernelLibrary.cpp
@@ -1206,6 +1206,11 @@ bool CLKernelLibrary::int64_base_atomics_supported() const
return _compile_context.int64_base_atomics_supported();
}
+bool CLKernelLibrary::is_wbsm_supported()
+{
+ return _compile_context.is_wbsm_supported();
+}
+
std::pair<std::string, bool> CLKernelLibrary::get_program(const std::string &program_name) const
{
#ifdef EMBEDDED_KERNELS
diff --git a/src/core/CL/ICLKernel.cpp b/src/core/CL/ICLKernel.cpp
index 2b259bf28a..1c6963f3f1 100644
--- a/src/core/CL/ICLKernel.cpp
+++ b/src/core/CL/ICLKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016-2020 Arm Limited.
+ * Copyright (c) 2016-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -29,8 +29,6 @@
#include <cstddef>
-using namespace arm_compute;
-
void arm_compute::enqueue(cl::CommandQueue &queue, ICLKernel &kernel, const Window &window, const cl::NDRange &lws_hint, bool use_dummy_work_items)
{
if(kernel.kernel()() == nullptr)
@@ -77,9 +75,15 @@ void arm_compute::enqueue(cl::CommandQueue &queue, ICLKernel &kernel, const Wind
lws = valid_lws;
}
+ if(CLKernelLibrary::get().is_wbsm_supported())
+ {
+ set_wbsm(kernel.kernel(), kernel.wbsm_hint());
+ }
queue.enqueueNDRangeKernel(kernel.kernel(), cl::NullRange, gws, lws);
}
+namespace arm_compute
+{
template <unsigned int dimension_size>
void ICLKernel::add_tensor_argument(unsigned &idx, const ICLTensor *tensor, const Window &window)
{
@@ -146,3 +150,4 @@ cl::NDRange ICLKernel::gws_from_window(const Window &window)
return gws;
}
+} // namespace arm_compute \ No newline at end of file
diff --git a/src/core/CL/ICLKernel.h b/src/core/CL/ICLKernel.h
index a24cd8c798..6737109f34 100644
--- a/src/core/CL/ICLKernel.h
+++ b/src/core/CL/ICLKernel.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016-2020 Arm Limited.
+ * Copyright (c) 2016-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -31,6 +31,7 @@
#include "arm_compute/core/IKernel.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/core/experimental/Types.h"
+#include "arm_compute/runtime/CL/CLTuningParams.h"
#include <string>
@@ -67,19 +68,30 @@ private:
protected:
/** Configure the kernel's window and local workgroup size hint.
*
- * @param[in] window The maximum window which will be returned by window()
- * @param[in] lws_hint (Optional) Local-Workgroup-Size to use.
+ * @param[in] window The maximum window which will be returned by window()
+ * @param[in] lws_hint Local-Workgroup-Size to use.
+ * @param[in] wbsm_hint (Optional) Workgroup-Batch-Size-Modifier to use.
*/
- void configure_internal(const Window &window, cl::NDRange lws_hint = CLKernelLibrary::get().default_ndrange())
+ void configure_internal(const Window &window, cl::NDRange lws_hint, cl_int wbsm_hint = 0)
{
- _lws_hint = lws_hint;
+ configure_internal(window, CLTuningParams(lws_hint, wbsm_hint));
+ }
+
+ /** Configure the kernel's window and tuning parameters hints.
+ *
+ * @param[in] window The maximum window which will be returned by window()
+ * @param[in] tuning_params_hint (Optional) Tuning parameters to use.
+ */
+ void configure_internal(const Window &window, CLTuningParams tuning_params_hint = CLTuningParams(CLKernelLibrary::get().default_ndrange(), 0))
+ {
+ _tuning_params_hint = tuning_params_hint;
IKernel::configure(window);
}
public:
/** Constructor */
ICLKernel()
- : _kernel(nullptr), _target(GPUTarget::MIDGARD), _config_id(arm_compute::default_config_id), _max_workgroup_size(0), _lws_hint()
+ : _kernel(nullptr), _target(GPUTarget::MIDGARD), _config_id(arm_compute::default_config_id), _max_workgroup_size(0), _tuning_params_hint()
{
}
/** Returns a reference to the OpenCL kernel of this object.
@@ -254,7 +266,7 @@ public:
void set_lws_hint(const cl::NDRange &lws_hint)
{
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); // lws_hint will be overwritten by configure()
- _lws_hint = lws_hint;
+ _tuning_params_hint.set_lws(lws_hint);
}
/** Return the Local-Workgroup-Size hint
@@ -263,7 +275,28 @@ public:
*/
cl::NDRange lws_hint() const
{
- return _lws_hint;
+ return _tuning_params_hint.get_lws();
+ }
+
+ /** Set the workgroup batch size modifier hint
+ *
+ * @note This method should be called after the configuration of the kernel
+ *
+ * @param[in] wbsm_hint workgroup batch size modifier value
+ */
+ void set_wbsm_hint(const cl_int &wbsm_hint)
+ {
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); // wbsm_hint will be overwritten by configure()
+ _tuning_params_hint.set_wbsm(wbsm_hint);
+ }
+
+ /** Return the workgroup batch size modifier hint
+ *
+ * @return Current wbsm hint
+ */
+ cl_int wbsm_hint() const
+ {
+ return _tuning_params_hint.get_wbsm();
}
/** Get the configuration ID
@@ -344,7 +377,7 @@ protected:
std::string _config_id; /**< Configuration ID */
size_t _max_workgroup_size; /**< The maximum workgroup size for this kernel */
private:
- cl::NDRange _lws_hint; /**< Local workgroup size hint for the OpenCL kernel */
+ CLTuningParams _tuning_params_hint; /**< Tuning parameters hint for the OpenCL kernel */
};
/** Add the kernel to the command queue with the given window.
diff --git a/src/core/CL/OpenCL.cpp b/src/core/CL/OpenCL.cpp
index 6c70861946..aff6285697 100644
--- a/src/core/CL/OpenCL.cpp
+++ b/src/core/CL/OpenCL.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2020 Arm Limited.
+ * Copyright (c) 2017-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -134,6 +134,7 @@ bool CLSymbols::load(const std::string &library)
LOAD_FUNCTION_PTR(clEnqueueMarker, handle);
LOAD_FUNCTION_PTR(clWaitForEvents, handle);
LOAD_FUNCTION_PTR(clCreateImage, handle);
+ LOAD_FUNCTION_PTR(clSetKernelExecInfo, handle);
// Third-party extensions
LOAD_FUNCTION_PTR(clImportMemoryARM, handle);
@@ -962,6 +963,23 @@ clCreateImage(cl_context context,
}
}
+cl_int clSetKernelExecInfo(cl_kernel kernel,
+ cl_kernel_exec_info param_name,
+ size_t param_value_size,
+ const void *param_value)
+{
+ arm_compute::CLSymbols::get().load_default();
+ auto func = arm_compute::CLSymbols::get().clSetKernelExecInfo_ptr;
+ if(func != nullptr)
+ {
+ return func(kernel, param_name, param_value_size, param_value);
+ }
+ else
+ {
+ return CL_OUT_OF_RESOURCES;
+ }
+}
+
cl_mem
clImportMemoryARM(cl_context context,
cl_mem_flags flags,
diff --git a/src/runtime/CL/CLTuner.cpp b/src/runtime/CL/CLTuner.cpp
index 906021790e..e16d6808ed 100644
--- a/src/runtime/CL/CLTuner.cpp
+++ b/src/runtime/CL/CLTuner.cpp
@@ -32,13 +32,11 @@
#include <cerrno>
#include <fstream>
#include <limits>
-#include <memory>
-#include <string>
namespace arm_compute
{
CLTuner::CLTuner(bool tune_new_kernels, CLTuningInfo tuning_info)
- : real_clEnqueueNDRangeKernel(nullptr), _tuning_params_table(), _lws_table(), _kernel_event(), _tune_new_kernels(tune_new_kernels), _tuning_info(tuning_info), _tuner_mode(CLTunerMode::NORMAL)
+ : real_clEnqueueNDRangeKernel(nullptr), _tuning_params_table(), _lws_table(), _kernel_event(), _tune_new_kernels(tune_new_kernels), _tuning_info(tuning_info)
{
}
@@ -62,12 +60,12 @@ bool CLTuner::tune_new_kernels() const
void CLTuner::set_tuner_mode(CLTunerMode mode)
{
- _tuner_mode = mode;
+ _tuning_info.tuner_mode = mode;
}
CLTunerMode CLTuner::get_tuner_mode() const
{
- return _tuner_mode;
+ return _tuning_info.tuner_mode;
}
void CLTuner::tune_kernel_static(ICLKernel &kernel)
@@ -103,12 +101,20 @@ void CLTuner::tune_kernel_dynamic(ICLKernel &kernel, ITensorPack &tensors)
// Set Local-Workgroup-Size
kernel.set_lws_hint(opt_tuning_params.get_lws());
+ if(_tuning_info.tune_wbsm)
+ {
+ kernel.set_wbsm_hint(opt_tuning_params.get_wbsm());
+ }
}
}
else
{
// Set Local-Workgroup-Size
kernel.set_lws_hint(p->second.get_lws());
+ if(_tuning_info.tune_wbsm)
+ {
+ kernel.set_wbsm_hint(p->second.get_wbsm());
+ }
}
}
}
@@ -188,13 +194,15 @@ CLTuningParams CLTuner::find_optimal_tuning_params(ICLKernel &kernel, ITensorPac
cl_ulong min_exec_time = end - start;
_kernel_event = nullptr;
- cl::NDRange opt_lws = cl::NullRange;
+ CLTuningParams opt_tuning_params(cl::NullRange, 0);
// Construct the list of tuning parameters values to be tested based on the tuner mode.
- auto lws_list = cl_tuner::get_tuning_parameters_list(_tuner_mode, gws);
- for(size_t i = 0; i < lws_list->size(); ++i)
+ auto tuning_list = cl_tuner::get_tuning_parameters_list(_tuning_info, gws);
+ for(size_t i = 0; i < tuning_list->size(); ++i)
{
- cl::NDRange lws_test = (*lws_list)[i].get_lws();
+ CLTuningParams tuning_test = (*tuning_list)[i];
+ // Setting the lws
+ cl::NDRange lws_test = tuning_test.get_lws();
auto x = lws_test[0];
auto y = lws_test[1];
auto z = lws_test[2];
@@ -205,8 +213,12 @@ CLTuningParams CLTuner::find_optimal_tuning_params(ICLKernel &kernel, ITensorPac
continue;
}
- //Set the Local-Workgroup-Size
kernel.set_lws_hint(lws_test);
+ if(_tuning_info.tune_wbsm && CLKernelLibrary::get().is_wbsm_supported())
+ {
+ cl_int wbsm_test = tuning_test.get_wbsm();
+ kernel.set_wbsm_hint(wbsm_test);
+ }
// Run the kernel
inject_memory ? kernel.run_op(tensors, kernel.window(), queue_profiler) : kernel.run(kernel.window(), queue_profiler);
@@ -222,13 +234,17 @@ CLTuningParams CLTuner::find_optimal_tuning_params(ICLKernel &kernel, ITensorPac
if(diff < min_exec_time)
{
min_exec_time = diff;
- opt_lws = cl::NDRange(x, y, z);
+ opt_tuning_params.set_lws(tuning_test.get_lws());
+ if(_tuning_info.tune_wbsm)
+ {
+ opt_tuning_params.set_wbsm(tuning_test.get_wbsm());
+ }
}
}
// Restore real function
CLSymbols::get().clEnqueueNDRangeKernel_ptr = real_clEnqueueNDRangeKernel;
- return CLTuningParams(opt_lws);
+ return opt_tuning_params;
}
void CLTuner::import_lws_table(const std::unordered_map<std::string, cl::NDRange> &lws_table)
@@ -271,34 +287,46 @@ void CLTuner::load_from_file(const std::string &filename)
ARM_COMPUTE_ERROR_VAR("Failed to open '%s' (%s [%d])", filename.c_str(), strerror(errno), errno);
}
std::string line;
+ bool header_line = true;
while(!std::getline(fs, line).fail())
{
- std::istringstream ss(line);
- std::string token;
- if(std::getline(ss, token, ';').fail())
- {
- ARM_COMPUTE_ERROR_VAR("Malformed row '%s' in %s (Should be of the form 'kernel_id;lws[0];lws[1];lws[2]')", ss.str().c_str(), filename.c_str());
- }
- std::string kernel_id = token;
- cl::NDRange lws(1, 1, 1);
- for(int i = 0; i < 3; i++)
+ if(header_line)
{
- if(std::getline(ss, token, ';').fail())
+ header_line = false;
+ size_t pos_lws = line.find("lws");
+ size_t pos_wbsm = line.find("wbsm");
+ _tuning_info.tune_wbsm = false;
+ if(pos_lws != std::string::npos || pos_wbsm != std::string::npos)
{
- ARM_COMPUTE_ERROR_VAR("Malformed row '%s' in %s (Should be of the form 'kernel_id;lws[0];lws[1];lws[2]')", ss.str().c_str(), filename.c_str());
+ // The file has in the first line the parameters it has been tuned on
+ if(pos_wbsm != std::string::npos)
+ {
+ _tuning_info.tune_wbsm = true;
+ }
+ // Once the line with the tuning parameter is read we can
+ // read the next one to start collecting the values
+ if(std::getline(fs, line).fail())
+ {
+ break;
+ }
}
- lws.get()[i] = support::cpp11::stoi(token);
}
- // If all dimensions are 0: reset to NullRange (i.e nullptr)
- if(lws[0] == 0 && lws[1] == 0 && lws[2] == 0)
+ CLTuningParams tuning_params;
+ size_t pos = line.find(";");
+ if(pos == std::string::npos)
{
- lws = cl::NullRange;
+ ARM_COMPUTE_ERROR_VAR("Malformed row '%s' in %s", line.c_str(), filename.c_str());
}
- add_tuning_params(kernel_id, lws);
+ std::string kernel_id = line.substr(0, pos);
+ line.erase(0, pos + 1);
+ if(!tuning_params.from_string(_tuning_info, line))
+ {
+ ARM_COMPUTE_ERROR_VAR("Malformed row '%s' in %s", line.c_str(), filename.c_str());
+ }
+ add_tuning_params(kernel_id, tuning_params);
}
fs.close();
- _tuning_info.tune_lws = true;
}
bool CLTuner::save_to_file(const std::string &filename) const
@@ -307,14 +335,24 @@ bool CLTuner::save_to_file(const std::string &filename) const
{
return false;
}
-
std::ofstream fs;
fs.exceptions(std::ifstream::failbit | std::ifstream::badbit);
fs.open(filename, std::ios::out);
+ std::string header_string = "";
+ header_string += "lws";
+ if(_tuning_info.tune_wbsm)
+ {
+ if(!header_string.empty())
+ {
+ header_string += " ";
+ }
+ header_string += "wbsm";
+ }
+ fs << header_string << std::endl;
for(auto const &kernel_data : _tuning_params_table)
{
- const cl::NDRange lws = CLTuningParams(kernel_data.second).get_lws();
- fs << kernel_data.first << ";" << lws[0] << ";" << lws[1] << ";" << lws[2] << std::endl;
+ CLTuningParams tun_pams(kernel_data.second);
+ fs << kernel_data.first << tun_pams.to_string(_tuning_info) << std::endl;
}
fs.close();
return true;
diff --git a/src/runtime/CL/tuners/CLTuningParametersList.cpp b/src/runtime/CL/tuners/CLTuningParametersList.cpp
index 7f63078192..6cb2212794 100644
--- a/src/runtime/CL/tuners/CLTuningParametersList.cpp
+++ b/src/runtime/CL/tuners/CLTuningParametersList.cpp
@@ -35,8 +35,14 @@ constexpr unsigned int max_lws_supported_z{ 32u };
class CLTuningParametersList : public ICLTuningParametersList
{
protected:
- /* Shape of 3-D search space */
- TensorShape search_space_shape{ 0, 0, 0 };
+ /* Shape of 4-D search space */
+ TensorShape search_space_shape{ 0, 0, 0, 0 };
+ std::vector<unsigned int> _lws_x{ 0 };
+ std::vector<unsigned int> _lws_y{ 0 };
+ std::vector<unsigned int> _lws_z{ 0 };
+ std::vector<int> _wbsm{ 0 }; /* Modify the batches size of workgroups distributed to compute units.
+ The value is in the range [-31,+31].
+ When 0, the runtime-selected wbs used is unmodified. */
/** Constructor */
CLTuningParametersList() = default;
@@ -62,7 +68,7 @@ public:
/** Prevent default constructor calls */
CLTuningParametersListExhaustive() = delete;
/** Constructor */
- CLTuningParametersListExhaustive(const cl::NDRange &gws);
+ CLTuningParametersListExhaustive(const cl::NDRange &gws, CLTuningInfo tuning_info);
/** Copy Constructor */
CLTuningParametersListExhaustive(const CLTuningParametersListExhaustive &) = default;
/** Move Constructor */
@@ -83,7 +89,7 @@ class CLTuningParametersListNormal : public CLTuningParametersList
{
public:
/** Constructor */
- CLTuningParametersListNormal(const cl::NDRange &gws);
+ CLTuningParametersListNormal(const cl::NDRange &gws, CLTuningInfo tuning_info);
/** Copy Constructor */
CLTuningParametersListNormal(const CLTuningParametersListNormal &) = default;
/** Move Constructor */
@@ -98,11 +104,6 @@ public:
// Inherited methods overridden:
CLTuningParams operator[](size_t) override;
-protected:
- std::vector<unsigned int> _lws_x{};
- std::vector<unsigned int> _lws_y{};
- std::vector<unsigned int> _lws_z{};
-
/** Prevent default constructor calls */
CLTuningParametersListNormal() = default;
@@ -125,7 +126,7 @@ public:
/** Prevent default constructor calls */
CLTuningParametersListRapid() = delete;
/** Constructor */
- CLTuningParametersListRapid(const cl::NDRange &gws);
+ CLTuningParametersListRapid(const cl::NDRange &gws, CLTuningInfo tuning_info);
/** Copy Constructor */
CLTuningParametersListRapid(const CLTuningParametersListRapid &) = default;
/** Move Constructor */
@@ -156,36 +157,53 @@ CLTuningParams CLTuningParametersListExhaustive::operator[](size_t index)
{
ARM_COMPUTE_ERROR_ON(index >= size());
auto coords = index2coords(search_space_shape, index);
- return CLTuningParams(coords[0] + 1U, coords[1] + 1U, coords[2] + 1U);
+ return CLTuningParams(coords[0] + 1U, coords[1] + 1U, coords[2] + 1U, static_cast<int>(coords[3]));
}
-CLTuningParametersListExhaustive::CLTuningParametersListExhaustive(const cl::NDRange &gws)
+CLTuningParametersListExhaustive::CLTuningParametersListExhaustive(const cl::NDRange &gws, CLTuningInfo tuning_info)
{
ARM_COMPUTE_UNUSED(gws);
- search_space_shape = TensorShape(max_lws_supported_x,
- max_lws_supported_y,
- max_lws_supported_z);
+ search_space_shape[0] = max_lws_supported_x;
+ search_space_shape[1] = max_lws_supported_y;
+ search_space_shape[2] = max_lws_supported_z;
+ search_space_shape[3] = 1;
+ if(tuning_info.tune_wbsm)
+ {
+ _wbsm = { -3, -2, -1, 0, 1, 2, 3 };
+ search_space_shape[3] = _wbsm.size();
+ }
}
CLTuningParams CLTuningParametersListNormal::operator[](size_t index)
{
ARM_COMPUTE_ERROR_ON(index >= size());
auto coords = index2coords(search_space_shape, index);
- return CLTuningParams(_lws_x[coords[0]], _lws_y[coords[1]], _lws_z[coords[2]]);
+ return CLTuningParams(_lws_x[coords[0]], _lws_y[coords[1]], _lws_z[coords[2]], _wbsm[coords[3]]);
}
-CLTuningParametersListNormal::CLTuningParametersListNormal(const cl::NDRange &gws)
+CLTuningParametersListNormal::CLTuningParametersListNormal(const cl::NDRange &gws, CLTuningInfo tuning_info)
{
auto lws_x_max = std::min(static_cast<unsigned int>(gws[0]), max_lws_supported_x);
auto lws_y_max = std::min(static_cast<unsigned int>(gws[1]), max_lws_supported_y);
auto lws_z_max = std::min(static_cast<unsigned int>(gws[2]), max_lws_supported_z);
- // Initialize the LWS values to test
+ // Initialize the tuning parameters values to test
+ _lws_x = {};
+ _lws_y = {};
+ _lws_z = {};
initialize_lws_values(_lws_x, gws[0], lws_x_max, gws[2] > 16); // Explore lws that are not factors of gws only when gws[2] > 16
initialize_lws_values(_lws_y, gws[1], lws_y_max, gws[2] > 16); // Explore lws that are not factors of gws only when gws[2] > 16
initialize_lws_values(_lws_z, gws[2], lws_z_max, false);
- search_space_shape = TensorShape(_lws_x.size(), _lws_y.size(), _lws_z.size());
+ search_space_shape[0] = _lws_x.size();
+ search_space_shape[1] = _lws_y.size();
+ search_space_shape[2] = _lws_z.size();
+ search_space_shape[3] = 1;
+ if(tuning_info.tune_wbsm)
+ {
+ _wbsm = { -2, -1, 0, 1, 2 };
+ search_space_shape[3] = _wbsm.size();
+ }
}
void CLTuningParametersListNormal::initialize_lws_values(std::vector<unsigned int> &lws, unsigned int gws, unsigned int lws_max, bool mod_let_one)
@@ -207,18 +225,29 @@ void CLTuningParametersListNormal::initialize_lws_values(std::vector<unsigned in
}
}
-CLTuningParametersListRapid::CLTuningParametersListRapid(const cl::NDRange &gws)
+CLTuningParametersListRapid::CLTuningParametersListRapid(const cl::NDRange &gws, CLTuningInfo tuning_info)
{
auto lws_x_max = std::min(static_cast<unsigned int>(gws[0]), 8u); // Limit exploration to 1 - 8
auto lws_y_max = std::min(static_cast<unsigned int>(gws[1]), 4u); // Limit exploration to 1 - 4
auto lws_z_max = std::min(static_cast<unsigned int>(gws[2]), 4u); // Limit exploration to 1 - 4
// Initialize the LWS values to test
+ _lws_x = {};
+ _lws_y = {};
+ _lws_z = {};
initialize_lws_values(_lws_x, lws_x_max);
initialize_lws_values(_lws_y, lws_y_max);
initialize_lws_values(_lws_z, lws_z_max);
- search_space_shape = TensorShape(_lws_x.size(), _lws_y.size(), _lws_z.size());
+ search_space_shape[0] = _lws_x.size();
+ search_space_shape[1] = _lws_y.size();
+ search_space_shape[2] = _lws_z.size();
+ search_space_shape[3] = 1;
+ if(tuning_info.tune_wbsm)
+ {
+ _wbsm = { -1, 0, 1 };
+ search_space_shape[3] = _wbsm.size();
+ }
}
void CLTuningParametersListRapid::initialize_lws_values(std::vector<unsigned int> &lws, unsigned int lws_max)
@@ -231,16 +260,16 @@ void CLTuningParametersListRapid::initialize_lws_values(std::vector<unsigned int
}
}
-std::unique_ptr<ICLTuningParametersList> get_tuning_parameters_list(CLTunerMode mode, const cl::NDRange &gws)
+std::unique_ptr<ICLTuningParametersList> get_tuning_parameters_list(CLTuningInfo tuning_info, const cl::NDRange &gws)
{
- switch(mode)
+ switch(tuning_info.tuner_mode)
{
case CLTunerMode::EXHAUSTIVE:
- return std::make_unique<CLTuningParametersListExhaustive>(gws);
+ return std::make_unique<CLTuningParametersListExhaustive>(gws, tuning_info);
case CLTunerMode::NORMAL:
- return std::make_unique<CLTuningParametersListNormal>(gws);
+ return std::make_unique<CLTuningParametersListNormal>(gws, tuning_info);
case CLTunerMode::RAPID:
- return std::make_unique<CLTuningParametersListRapid>(gws);
+ return std::make_unique<CLTuningParametersListRapid>(gws, tuning_info);
default:
return nullptr;
}