diff options
-rw-r--r-- | arm_compute/core/CL/CLCompileContext.h | 9 | ||||
-rw-r--r-- | arm_compute/core/CL/CLHelpers.h | 22 | ||||
-rw-r--r-- | arm_compute/core/CL/CLKernelLibrary.h | 6 | ||||
-rw-r--r-- | arm_compute/core/CL/OpenCL.h | 3 | ||||
-rw-r--r-- | arm_compute/runtime/CL/CLTuner.h | 1 | ||||
-rw-r--r-- | arm_compute/runtime/CL/CLTunerTypes.h | 5 | ||||
-rw-r--r-- | arm_compute/runtime/CL/CLTuningParams.h | 85 | ||||
-rw-r--r-- | arm_compute/runtime/CL/tuners/CLTuningParametersList.h | 5 | ||||
-rw-r--r-- | src/core/CL/CLCompileContext.cpp | 19 | ||||
-rw-r--r-- | src/core/CL/CLHelpers.cpp | 24 | ||||
-rw-r--r-- | src/core/CL/CLKernelLibrary.cpp | 5 | ||||
-rw-r--r-- | src/core/CL/ICLKernel.cpp | 11 | ||||
-rw-r--r-- | src/core/CL/ICLKernel.h | 51 | ||||
-rw-r--r-- | src/core/CL/OpenCL.cpp | 20 | ||||
-rw-r--r-- | src/runtime/CL/CLTuner.cpp | 102 | ||||
-rw-r--r-- | src/runtime/CL/tuners/CLTuningParametersList.cpp | 81 |
16 files changed, 359 insertions, 90 deletions
diff --git a/arm_compute/core/CL/CLCompileContext.h b/arm_compute/core/CL/CLCompileContext.h index 6f6dc18b85..46a8c9b341 100644 --- a/arm_compute/core/CL/CLCompileContext.h +++ b/arm_compute/core/CL/CLCompileContext.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020 Arm Limited. + * Copyright (c) 2020-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -296,6 +296,12 @@ public: */ bool int64_base_atomics_supported() const; + /* Returns true if the workgroup batch size modifier parameter is supported on the cl device + * + * @return true if the workgroup batch size modifier parameter is supported, false otherwise + */ + bool is_wbsm_supported() const; + private: /** Load program and its dependencies. * @@ -327,6 +333,7 @@ private: CLDevice _device; /**< Underlying CL device. */ mutable std::map<std::string, const Program> _programs_map; /**< Map with all already loaded program data. */ mutable std::map<std::string, cl::Program> _built_programs_map; /**< Map with all already built program data. */ + bool _is_wbsm_supported; /**< Support of worksize batch size modifier support boolean*/ }; } // namespace arm_compute #endif /* ARM_COMPUTE_CLCOMPILECONTEXT_H */ diff --git a/arm_compute/core/CL/CLHelpers.h b/arm_compute/core/CL/CLHelpers.h index cf18e16e34..0e9aa5d6e5 100644 --- a/arm_compute/core/CL/CLHelpers.h +++ b/arm_compute/core/CL/CLHelpers.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2020 Arm Limited. + * Copyright (c) 2016-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -30,6 +30,11 @@ #include <set> #include <string> +/* CL Device capabilities */ +#define ARM_COMPUTE_LIBRARY_OPENCL_DEVICE_CAPABILITIES_ARM 0x41E4 +/* Workgroup Batch Size Modifier */ +#define ARM_COMPUTE_LIBRARY_OPENCL_EXEC_WBSM_ARM 0x41E6 + namespace arm_compute { class CLCoreRuntimeContext; @@ -226,5 +231,20 @@ cl::Kernel create_kernel(const CLCompileContext &ctx, const std::string &kernel_ */ cl::NDRange create_lws_hint_parallel_implementations(unsigned int input_dimension, unsigned int vector_size); +/* Helper function to check if the workgroup batch size modifier parameter is supported on the cl device + * + * @param[in] device cl device to check for support + * + * @return true if the workgroup batch size modifier parameter is supported, false otherwise + */ +bool get_wbsm_support_info(const cl::Device &device); + +/* Helper function to set the workgroup batch size modifier parameter in the kernel + * + * @param[in] kernel cl kernel to set the workgroup batch size modifier parameter + * @param[in] wbsm_hint workgroup batch size modifier to use + */ +void set_wbsm(cl::Kernel &kernel, cl_int wbsm_hint); + } // namespace arm_compute #endif /* ARM_COMPUTE_CLHELPERS_H */ diff --git a/arm_compute/core/CL/CLKernelLibrary.h b/arm_compute/core/CL/CLKernelLibrary.h index 193389388e..0d8e4a6164 100644 --- a/arm_compute/core/CL/CLKernelLibrary.h +++ b/arm_compute/core/CL/CLKernelLibrary.h @@ -148,6 +148,12 @@ public: */ std::string get_program_name(const std::string &kernel_name) const; + /* Returns true if the workgroup batch size modifier parameter is supported on the cl device + * + * @return true if the workgroup batch size modifier parameter is supported, false otherwise + */ + bool is_wbsm_supported(); + /** Sets the CL context used to create programs. * * @note Setting the context also resets the device to the diff --git a/arm_compute/core/CL/OpenCL.h b/arm_compute/core/CL/OpenCL.h index f9796d7e95..155c3e4eef 100644 --- a/arm_compute/core/CL/OpenCL.h +++ b/arm_compute/core/CL/OpenCL.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2020 Arm Limited. + * Copyright (c) 2016-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -135,6 +135,7 @@ public: DECLARE_FUNCTION_PTR(clEnqueueMarker); DECLARE_FUNCTION_PTR(clWaitForEvents); DECLARE_FUNCTION_PTR(clCreateImage); + DECLARE_FUNCTION_PTR(clSetKernelExecInfo); // Third-party extensions DECLARE_FUNCTION_PTR(clImportMemoryARM); diff --git a/arm_compute/runtime/CL/CLTuner.h b/arm_compute/runtime/CL/CLTuner.h index 9814867142..e1c98bf411 100644 --- a/arm_compute/runtime/CL/CLTuner.h +++ b/arm_compute/runtime/CL/CLTuner.h @@ -182,7 +182,6 @@ private: cl::Event _kernel_event; bool _tune_new_kernels; CLTuningInfo _tuning_info; - CLTunerMode _tuner_mode; }; } // namespace arm_compute #endif /*ARM_COMPUTE_CLTUNER_H */ diff --git a/arm_compute/runtime/CL/CLTunerTypes.h b/arm_compute/runtime/CL/CLTunerTypes.h index 49e2d615ea..e93ef5b2b3 100644 --- a/arm_compute/runtime/CL/CLTunerTypes.h +++ b/arm_compute/runtime/CL/CLTunerTypes.h @@ -42,7 +42,10 @@ enum class CLTunerMode /**< OpenCL tuner tuning information */ struct CLTuningInfo { - bool tune_lws = true; + CLTunerMode tuner_mode = CLTunerMode::NORMAL; /**< Parameter to select the level (granularity) of the tuning */ + bool tune_wbsm = false; /**< Flag to tune the batches of work groups distributed to compute units. + Internally, the library will check if this feature is available on + the target platform */ }; /** Converts a string to a strong types enumeration @ref CLTunerMode diff --git a/arm_compute/runtime/CL/CLTuningParams.h b/arm_compute/runtime/CL/CLTuningParams.h index 99a386638d..b50481336b 100644 --- a/arm_compute/runtime/CL/CLTuningParams.h +++ b/arm_compute/runtime/CL/CLTuningParams.h @@ -25,6 +25,10 @@ #define ARM_COMPUTE_CLTUNING_PARAMS_H #include "arm_compute/core/CL/OpenCL.h" +#include "arm_compute/runtime/CL/CLTunerTypes.h" +#include "support/StringSupport.h" + +#include <ostream> namespace arm_compute { @@ -34,26 +38,95 @@ class CLTuningParams public: CLTuningParams(const CLTuningParams &) = default; - CLTuningParams(unsigned int lws_x = 0, unsigned int lws_y = 0, unsigned int lws_z = 0) - : _lws(lws_x, lws_y, lws_z) + CLTuningParams(unsigned int lws_x = 0, unsigned int lws_y = 0, unsigned int lws_z = 0, int wbsm = 0) + : _lws(lws_x, lws_y, lws_z), _wbsm(wbsm) { } - CLTuningParams(cl::NDRange lws) - : _lws(lws) + CLTuningParams(cl::NDRange lws, cl_int wbsm = 0) + : _lws(lws), _wbsm(wbsm) { } - void set_lws(cl::NDRange &lws) + + CLTuningParams(cl_int wbsm) + : CLTuningParams(cl::NullRange, wbsm) + { + } + + void set_lws(cl::NDRange lws) { _lws = lws; } - cl::NDRange get_lws() + cl::NDRange get_lws() const { return _lws; } + void set_wbsm(cl_int wbsm) + { + _wbsm = wbsm; + } + + cl_int get_wbsm() const + { + return _wbsm; + } + + std::string to_string(CLTuningInfo tuning_info) + { + std::string tuning_params_string = ""; + tuning_params_string += ";" + support::cpp11::to_string(_lws[0]) + ";" + support::cpp11::to_string(_lws[1]) + ";" + support::cpp11::to_string(_lws[2]); + if(tuning_info.tune_wbsm) + { + tuning_params_string += ";" + support::cpp11::to_string(_wbsm); + } + return tuning_params_string; + } + + bool from_string(CLTuningInfo tuning_info, std::string tuning_params_string) + { + std::replace(tuning_params_string.begin(), tuning_params_string.end(), ';', ' '); + std::vector<std::string> array; + std::stringstream ss(tuning_params_string); + std::string temp; + while(ss >> temp) + { + array.push_back(temp); + } + // Read 3 values for lws + if(array.size() < 3) + { + return false; + } + const unsigned int lws_0 = support::cpp11::stoi(array[0]); + const unsigned int lws_1 = support::cpp11::stoi(array[1]); + const unsigned int lws_2 = support::cpp11::stoi(array[2]); + if(lws_0 == 0 && lws_1 == 0 && lws_2 == 0) + { + // If lws values are 0, cl::NullRange has to be used + // otherwise the lws object will be badly created + _lws = cl::NullRange; + } + else + { + _lws = cl::NDRange(lws_0, lws_1, lws_2); + } + array.erase(array.begin(), array.begin() + 3); + if(tuning_info.tune_wbsm) + { + if(array.size() < 1) + { + return false; + } + _wbsm = support::cpp11::stoi(array[0]); + array.erase(array.begin()); + } + return true; + } + private: cl::NDRange _lws; + cl_int _wbsm; }; } // namespace arm_compute #endif /*ARM_COMPUTE_CLTUNING_PARAMS_H */ diff --git a/arm_compute/runtime/CL/tuners/CLTuningParametersList.h b/arm_compute/runtime/CL/tuners/CLTuningParametersList.h index c51b9901ef..69572c98d2 100644 --- a/arm_compute/runtime/CL/tuners/CLTuningParametersList.h +++ b/arm_compute/runtime/CL/tuners/CLTuningParametersList.h @@ -77,9 +77,12 @@ public: /** Construct an ICLTuningParametersList object for the given tuner mode and gws configuration. * + * @param[in] tuning_info Tuning info containng which parameters to tune and the tuner mode + * @param[in] gws Global worksize values + * * @return unique_ptr to the requested ICLTuningParametersList implementation. */ -std::unique_ptr<ICLTuningParametersList> get_tuning_parameters_list(CLTunerMode mode, const cl::NDRange &gws); +std::unique_ptr<ICLTuningParametersList> get_tuning_parameters_list(CLTuningInfo tuning_info, const cl::NDRange &gws); } // namespace cl_tuner } // namespace arm_compute diff --git a/src/core/CL/CLCompileContext.cpp b/src/core/CL/CLCompileContext.cpp index 0afb7e5e0e..3db0fe515a 100644 --- a/src/core/CL/CLCompileContext.cpp +++ b/src/core/CL/CLCompileContext.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020 Arm Limited. + * Copyright (c) 2020-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -137,15 +137,16 @@ Kernel::Kernel(std::string name, const cl::Program &program) { } CLCompileContext::CLCompileContext() - : _context(), _device(), _programs_map(), _built_programs_map() + : _context(), _device(), _programs_map(), _built_programs_map(), _is_wbsm_supported() { } CLCompileContext::CLCompileContext(cl::Context context, const cl::Device &device) - : _context(), _device(), _programs_map(), _built_programs_map() + : _context(), _device(), _programs_map(), _built_programs_map(), _is_wbsm_supported() { - _context = std::move(context); - _device = CLDevice(device); + _context = std::move(context); + _device = CLDevice(device); + _is_wbsm_supported = get_wbsm_support_info(device); } Kernel CLCompileContext::create_kernel(const std::string &kernel_name, const std::string &program_name, const std::string &program_source, @@ -318,7 +319,8 @@ const cl::Device &CLCompileContext::get_device() const void CLCompileContext::set_device(cl::Device device) { - _device = std::move(device); + _device = std::move(device); + _is_wbsm_supported = get_wbsm_support_info(device); } cl::NDRange CLCompileContext::default_ndrange() const @@ -346,6 +348,11 @@ bool CLCompileContext::int64_base_atomics_supported() const return _device.supported("cl_khr_int64_base_atomics"); } +bool CLCompileContext::is_wbsm_supported() const +{ + return _is_wbsm_supported; +} + size_t CLCompileContext::max_local_workgroup_size(const cl::Kernel &kernel) const { size_t result; diff --git a/src/core/CL/CLHelpers.cpp b/src/core/CL/CLHelpers.cpp index 895bb72827..aff897738a 100644 --- a/src/core/CL/CLHelpers.cpp +++ b/src/core/CL/CLHelpers.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2020 Arm Limited. + * Copyright (c) 2016-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -415,4 +415,26 @@ cl::NDRange create_lws_hint_parallel_implementations(unsigned int input_dimensio const unsigned int num_of_threads = ((input_dimension + border_width) / 16); return cl::NDRange(std::min(8U, num_of_threads)); } + +bool get_wbsm_support_info(const cl::Device &device) +{ + cl_bitfield capabilities = 0; + cl_int err = clGetDeviceInfo(device.get(), ARM_COMPUTE_LIBRARY_OPENCL_DEVICE_CAPABILITIES_ARM, sizeof(cl_bitfield), &capabilities, nullptr); + if((err == CL_SUCCESS) && (capabilities & ARM_COMPUTE_LIBRARY_OPENCL_EXEC_WBSM_ARM)) + { + return true; + } + return false; +} + +void set_wbsm(cl::Kernel &kernel, cl_int wbsm_hint) +{ + cl_int err = clSetKernelExecInfo(kernel.get(), + ARM_COMPUTE_LIBRARY_OPENCL_EXEC_WBSM_ARM, + sizeof(cl_int), + &wbsm_hint); + ARM_COMPUTE_UNUSED(err); + ARM_COMPUTE_ERROR_ON(err != CL_SUCCESS); +} + } // namespace arm_compute diff --git a/src/core/CL/CLKernelLibrary.cpp b/src/core/CL/CLKernelLibrary.cpp index cf1c52e463..75f76ea344 100644 --- a/src/core/CL/CLKernelLibrary.cpp +++ b/src/core/CL/CLKernelLibrary.cpp @@ -1206,6 +1206,11 @@ bool CLKernelLibrary::int64_base_atomics_supported() const return _compile_context.int64_base_atomics_supported(); } +bool CLKernelLibrary::is_wbsm_supported() +{ + return _compile_context.is_wbsm_supported(); +} + std::pair<std::string, bool> CLKernelLibrary::get_program(const std::string &program_name) const { #ifdef EMBEDDED_KERNELS diff --git a/src/core/CL/ICLKernel.cpp b/src/core/CL/ICLKernel.cpp index 2b259bf28a..1c6963f3f1 100644 --- a/src/core/CL/ICLKernel.cpp +++ b/src/core/CL/ICLKernel.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2020 Arm Limited. + * Copyright (c) 2016-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -29,8 +29,6 @@ #include <cstddef> -using namespace arm_compute; - void arm_compute::enqueue(cl::CommandQueue &queue, ICLKernel &kernel, const Window &window, const cl::NDRange &lws_hint, bool use_dummy_work_items) { if(kernel.kernel()() == nullptr) @@ -77,9 +75,15 @@ void arm_compute::enqueue(cl::CommandQueue &queue, ICLKernel &kernel, const Wind lws = valid_lws; } + if(CLKernelLibrary::get().is_wbsm_supported()) + { + set_wbsm(kernel.kernel(), kernel.wbsm_hint()); + } queue.enqueueNDRangeKernel(kernel.kernel(), cl::NullRange, gws, lws); } +namespace arm_compute +{ template <unsigned int dimension_size> void ICLKernel::add_tensor_argument(unsigned &idx, const ICLTensor *tensor, const Window &window) { @@ -146,3 +150,4 @@ cl::NDRange ICLKernel::gws_from_window(const Window &window) return gws; } +} // namespace arm_compute
\ No newline at end of file diff --git a/src/core/CL/ICLKernel.h b/src/core/CL/ICLKernel.h index a24cd8c798..6737109f34 100644 --- a/src/core/CL/ICLKernel.h +++ b/src/core/CL/ICLKernel.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2020 Arm Limited. + * Copyright (c) 2016-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -31,6 +31,7 @@ #include "arm_compute/core/IKernel.h" #include "arm_compute/core/Validate.h" #include "arm_compute/core/experimental/Types.h" +#include "arm_compute/runtime/CL/CLTuningParams.h" #include <string> @@ -67,19 +68,30 @@ private: protected: /** Configure the kernel's window and local workgroup size hint. * - * @param[in] window The maximum window which will be returned by window() - * @param[in] lws_hint (Optional) Local-Workgroup-Size to use. + * @param[in] window The maximum window which will be returned by window() + * @param[in] lws_hint Local-Workgroup-Size to use. + * @param[in] wbsm_hint (Optional) Workgroup-Batch-Size-Modifier to use. */ - void configure_internal(const Window &window, cl::NDRange lws_hint = CLKernelLibrary::get().default_ndrange()) + void configure_internal(const Window &window, cl::NDRange lws_hint, cl_int wbsm_hint = 0) { - _lws_hint = lws_hint; + configure_internal(window, CLTuningParams(lws_hint, wbsm_hint)); + } + + /** Configure the kernel's window and tuning parameters hints. + * + * @param[in] window The maximum window which will be returned by window() + * @param[in] tuning_params_hint (Optional) Tuning parameters to use. + */ + void configure_internal(const Window &window, CLTuningParams tuning_params_hint = CLTuningParams(CLKernelLibrary::get().default_ndrange(), 0)) + { + _tuning_params_hint = tuning_params_hint; IKernel::configure(window); } public: /** Constructor */ ICLKernel() - : _kernel(nullptr), _target(GPUTarget::MIDGARD), _config_id(arm_compute::default_config_id), _max_workgroup_size(0), _lws_hint() + : _kernel(nullptr), _target(GPUTarget::MIDGARD), _config_id(arm_compute::default_config_id), _max_workgroup_size(0), _tuning_params_hint() { } /** Returns a reference to the OpenCL kernel of this object. @@ -254,7 +266,7 @@ public: void set_lws_hint(const cl::NDRange &lws_hint) { ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); // lws_hint will be overwritten by configure() - _lws_hint = lws_hint; + _tuning_params_hint.set_lws(lws_hint); } /** Return the Local-Workgroup-Size hint @@ -263,7 +275,28 @@ public: */ cl::NDRange lws_hint() const { - return _lws_hint; + return _tuning_params_hint.get_lws(); + } + + /** Set the workgroup batch size modifier hint + * + * @note This method should be called after the configuration of the kernel + * + * @param[in] wbsm_hint workgroup batch size modifier value + */ + void set_wbsm_hint(const cl_int &wbsm_hint) + { + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); // wbsm_hint will be overwritten by configure() + _tuning_params_hint.set_wbsm(wbsm_hint); + } + + /** Return the workgroup batch size modifier hint + * + * @return Current wbsm hint + */ + cl_int wbsm_hint() const + { + return _tuning_params_hint.get_wbsm(); } /** Get the configuration ID @@ -344,7 +377,7 @@ protected: std::string _config_id; /**< Configuration ID */ size_t _max_workgroup_size; /**< The maximum workgroup size for this kernel */ private: - cl::NDRange _lws_hint; /**< Local workgroup size hint for the OpenCL kernel */ + CLTuningParams _tuning_params_hint; /**< Tuning parameters hint for the OpenCL kernel */ }; /** Add the kernel to the command queue with the given window. diff --git a/src/core/CL/OpenCL.cpp b/src/core/CL/OpenCL.cpp index 6c70861946..aff6285697 100644 --- a/src/core/CL/OpenCL.cpp +++ b/src/core/CL/OpenCL.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2020 Arm Limited. + * Copyright (c) 2017-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -134,6 +134,7 @@ bool CLSymbols::load(const std::string &library) LOAD_FUNCTION_PTR(clEnqueueMarker, handle); LOAD_FUNCTION_PTR(clWaitForEvents, handle); LOAD_FUNCTION_PTR(clCreateImage, handle); + LOAD_FUNCTION_PTR(clSetKernelExecInfo, handle); // Third-party extensions LOAD_FUNCTION_PTR(clImportMemoryARM, handle); @@ -962,6 +963,23 @@ clCreateImage(cl_context context, } } +cl_int clSetKernelExecInfo(cl_kernel kernel, + cl_kernel_exec_info param_name, + size_t param_value_size, + const void *param_value) +{ + arm_compute::CLSymbols::get().load_default(); + auto func = arm_compute::CLSymbols::get().clSetKernelExecInfo_ptr; + if(func != nullptr) + { + return func(kernel, param_name, param_value_size, param_value); + } + else + { + return CL_OUT_OF_RESOURCES; + } +} + cl_mem clImportMemoryARM(cl_context context, cl_mem_flags flags, diff --git a/src/runtime/CL/CLTuner.cpp b/src/runtime/CL/CLTuner.cpp index 906021790e..e16d6808ed 100644 --- a/src/runtime/CL/CLTuner.cpp +++ b/src/runtime/CL/CLTuner.cpp @@ -32,13 +32,11 @@ #include <cerrno> #include <fstream> #include <limits> -#include <memory> -#include <string> namespace arm_compute { CLTuner::CLTuner(bool tune_new_kernels, CLTuningInfo tuning_info) - : real_clEnqueueNDRangeKernel(nullptr), _tuning_params_table(), _lws_table(), _kernel_event(), _tune_new_kernels(tune_new_kernels), _tuning_info(tuning_info), _tuner_mode(CLTunerMode::NORMAL) + : real_clEnqueueNDRangeKernel(nullptr), _tuning_params_table(), _lws_table(), _kernel_event(), _tune_new_kernels(tune_new_kernels), _tuning_info(tuning_info) { } @@ -62,12 +60,12 @@ bool CLTuner::tune_new_kernels() const void CLTuner::set_tuner_mode(CLTunerMode mode) { - _tuner_mode = mode; + _tuning_info.tuner_mode = mode; } CLTunerMode CLTuner::get_tuner_mode() const { - return _tuner_mode; + return _tuning_info.tuner_mode; } void CLTuner::tune_kernel_static(ICLKernel &kernel) @@ -103,12 +101,20 @@ void CLTuner::tune_kernel_dynamic(ICLKernel &kernel, ITensorPack &tensors) // Set Local-Workgroup-Size kernel.set_lws_hint(opt_tuning_params.get_lws()); + if(_tuning_info.tune_wbsm) + { + kernel.set_wbsm_hint(opt_tuning_params.get_wbsm()); + } } } else { // Set Local-Workgroup-Size kernel.set_lws_hint(p->second.get_lws()); + if(_tuning_info.tune_wbsm) + { + kernel.set_wbsm_hint(p->second.get_wbsm()); + } } } } @@ -188,13 +194,15 @@ CLTuningParams CLTuner::find_optimal_tuning_params(ICLKernel &kernel, ITensorPac cl_ulong min_exec_time = end - start; _kernel_event = nullptr; - cl::NDRange opt_lws = cl::NullRange; + CLTuningParams opt_tuning_params(cl::NullRange, 0); // Construct the list of tuning parameters values to be tested based on the tuner mode. - auto lws_list = cl_tuner::get_tuning_parameters_list(_tuner_mode, gws); - for(size_t i = 0; i < lws_list->size(); ++i) + auto tuning_list = cl_tuner::get_tuning_parameters_list(_tuning_info, gws); + for(size_t i = 0; i < tuning_list->size(); ++i) { - cl::NDRange lws_test = (*lws_list)[i].get_lws(); + CLTuningParams tuning_test = (*tuning_list)[i]; + // Setting the lws + cl::NDRange lws_test = tuning_test.get_lws(); auto x = lws_test[0]; auto y = lws_test[1]; auto z = lws_test[2]; @@ -205,8 +213,12 @@ CLTuningParams CLTuner::find_optimal_tuning_params(ICLKernel &kernel, ITensorPac continue; } - //Set the Local-Workgroup-Size kernel.set_lws_hint(lws_test); + if(_tuning_info.tune_wbsm && CLKernelLibrary::get().is_wbsm_supported()) + { + cl_int wbsm_test = tuning_test.get_wbsm(); + kernel.set_wbsm_hint(wbsm_test); + } // Run the kernel inject_memory ? kernel.run_op(tensors, kernel.window(), queue_profiler) : kernel.run(kernel.window(), queue_profiler); @@ -222,13 +234,17 @@ CLTuningParams CLTuner::find_optimal_tuning_params(ICLKernel &kernel, ITensorPac if(diff < min_exec_time) { min_exec_time = diff; - opt_lws = cl::NDRange(x, y, z); + opt_tuning_params.set_lws(tuning_test.get_lws()); + if(_tuning_info.tune_wbsm) + { + opt_tuning_params.set_wbsm(tuning_test.get_wbsm()); + } } } // Restore real function CLSymbols::get().clEnqueueNDRangeKernel_ptr = real_clEnqueueNDRangeKernel; - return CLTuningParams(opt_lws); + return opt_tuning_params; } void CLTuner::import_lws_table(const std::unordered_map<std::string, cl::NDRange> &lws_table) @@ -271,34 +287,46 @@ void CLTuner::load_from_file(const std::string &filename) ARM_COMPUTE_ERROR_VAR("Failed to open '%s' (%s [%d])", filename.c_str(), strerror(errno), errno); } std::string line; + bool header_line = true; while(!std::getline(fs, line).fail()) { - std::istringstream ss(line); - std::string token; - if(std::getline(ss, token, ';').fail()) - { - ARM_COMPUTE_ERROR_VAR("Malformed row '%s' in %s (Should be of the form 'kernel_id;lws[0];lws[1];lws[2]')", ss.str().c_str(), filename.c_str()); - } - std::string kernel_id = token; - cl::NDRange lws(1, 1, 1); - for(int i = 0; i < 3; i++) + if(header_line) { - if(std::getline(ss, token, ';').fail()) + header_line = false; + size_t pos_lws = line.find("lws"); + size_t pos_wbsm = line.find("wbsm"); + _tuning_info.tune_wbsm = false; + if(pos_lws != std::string::npos || pos_wbsm != std::string::npos) { - ARM_COMPUTE_ERROR_VAR("Malformed row '%s' in %s (Should be of the form 'kernel_id;lws[0];lws[1];lws[2]')", ss.str().c_str(), filename.c_str()); + // The file has in the first line the parameters it has been tuned on + if(pos_wbsm != std::string::npos) + { + _tuning_info.tune_wbsm = true; + } + // Once the line with the tuning parameter is read we can + // read the next one to start collecting the values + if(std::getline(fs, line).fail()) + { + break; + } } - lws.get()[i] = support::cpp11::stoi(token); } - // If all dimensions are 0: reset to NullRange (i.e nullptr) - if(lws[0] == 0 && lws[1] == 0 && lws[2] == 0) + CLTuningParams tuning_params; + size_t pos = line.find(";"); + if(pos == std::string::npos) { - lws = cl::NullRange; + ARM_COMPUTE_ERROR_VAR("Malformed row '%s' in %s", line.c_str(), filename.c_str()); } - add_tuning_params(kernel_id, lws); + std::string kernel_id = line.substr(0, pos); + line.erase(0, pos + 1); + if(!tuning_params.from_string(_tuning_info, line)) + { + ARM_COMPUTE_ERROR_VAR("Malformed row '%s' in %s", line.c_str(), filename.c_str()); + } + add_tuning_params(kernel_id, tuning_params); } fs.close(); - _tuning_info.tune_lws = true; } bool CLTuner::save_to_file(const std::string &filename) const @@ -307,14 +335,24 @@ bool CLTuner::save_to_file(const std::string &filename) const { return false; } - std::ofstream fs; fs.exceptions(std::ifstream::failbit | std::ifstream::badbit); fs.open(filename, std::ios::out); + std::string header_string = ""; + header_string += "lws"; + if(_tuning_info.tune_wbsm) + { + if(!header_string.empty()) + { + header_string += " "; + } + header_string += "wbsm"; + } + fs << header_string << std::endl; for(auto const &kernel_data : _tuning_params_table) { - const cl::NDRange lws = CLTuningParams(kernel_data.second).get_lws(); - fs << kernel_data.first << ";" << lws[0] << ";" << lws[1] << ";" << lws[2] << std::endl; + CLTuningParams tun_pams(kernel_data.second); + fs << kernel_data.first << tun_pams.to_string(_tuning_info) << std::endl; } fs.close(); return true; diff --git a/src/runtime/CL/tuners/CLTuningParametersList.cpp b/src/runtime/CL/tuners/CLTuningParametersList.cpp index 7f63078192..6cb2212794 100644 --- a/src/runtime/CL/tuners/CLTuningParametersList.cpp +++ b/src/runtime/CL/tuners/CLTuningParametersList.cpp @@ -35,8 +35,14 @@ constexpr unsigned int max_lws_supported_z{ 32u }; class CLTuningParametersList : public ICLTuningParametersList { protected: - /* Shape of 3-D search space */ - TensorShape search_space_shape{ 0, 0, 0 }; + /* Shape of 4-D search space */ + TensorShape search_space_shape{ 0, 0, 0, 0 }; + std::vector<unsigned int> _lws_x{ 0 }; + std::vector<unsigned int> _lws_y{ 0 }; + std::vector<unsigned int> _lws_z{ 0 }; + std::vector<int> _wbsm{ 0 }; /* Modify the batches size of workgroups distributed to compute units. + The value is in the range [-31,+31]. + When 0, the runtime-selected wbs used is unmodified. */ /** Constructor */ CLTuningParametersList() = default; @@ -62,7 +68,7 @@ public: /** Prevent default constructor calls */ CLTuningParametersListExhaustive() = delete; /** Constructor */ - CLTuningParametersListExhaustive(const cl::NDRange &gws); + CLTuningParametersListExhaustive(const cl::NDRange &gws, CLTuningInfo tuning_info); /** Copy Constructor */ CLTuningParametersListExhaustive(const CLTuningParametersListExhaustive &) = default; /** Move Constructor */ @@ -83,7 +89,7 @@ class CLTuningParametersListNormal : public CLTuningParametersList { public: /** Constructor */ - CLTuningParametersListNormal(const cl::NDRange &gws); + CLTuningParametersListNormal(const cl::NDRange &gws, CLTuningInfo tuning_info); /** Copy Constructor */ CLTuningParametersListNormal(const CLTuningParametersListNormal &) = default; /** Move Constructor */ @@ -98,11 +104,6 @@ public: // Inherited methods overridden: CLTuningParams operator[](size_t) override; -protected: - std::vector<unsigned int> _lws_x{}; - std::vector<unsigned int> _lws_y{}; - std::vector<unsigned int> _lws_z{}; - /** Prevent default constructor calls */ CLTuningParametersListNormal() = default; @@ -125,7 +126,7 @@ public: /** Prevent default constructor calls */ CLTuningParametersListRapid() = delete; /** Constructor */ - CLTuningParametersListRapid(const cl::NDRange &gws); + CLTuningParametersListRapid(const cl::NDRange &gws, CLTuningInfo tuning_info); /** Copy Constructor */ CLTuningParametersListRapid(const CLTuningParametersListRapid &) = default; /** Move Constructor */ @@ -156,36 +157,53 @@ CLTuningParams CLTuningParametersListExhaustive::operator[](size_t index) { ARM_COMPUTE_ERROR_ON(index >= size()); auto coords = index2coords(search_space_shape, index); - return CLTuningParams(coords[0] + 1U, coords[1] + 1U, coords[2] + 1U); + return CLTuningParams(coords[0] + 1U, coords[1] + 1U, coords[2] + 1U, static_cast<int>(coords[3])); } -CLTuningParametersListExhaustive::CLTuningParametersListExhaustive(const cl::NDRange &gws) +CLTuningParametersListExhaustive::CLTuningParametersListExhaustive(const cl::NDRange &gws, CLTuningInfo tuning_info) { ARM_COMPUTE_UNUSED(gws); - search_space_shape = TensorShape(max_lws_supported_x, - max_lws_supported_y, - max_lws_supported_z); + search_space_shape[0] = max_lws_supported_x; + search_space_shape[1] = max_lws_supported_y; + search_space_shape[2] = max_lws_supported_z; + search_space_shape[3] = 1; + if(tuning_info.tune_wbsm) + { + _wbsm = { -3, -2, -1, 0, 1, 2, 3 }; + search_space_shape[3] = _wbsm.size(); + } } CLTuningParams CLTuningParametersListNormal::operator[](size_t index) { ARM_COMPUTE_ERROR_ON(index >= size()); auto coords = index2coords(search_space_shape, index); - return CLTuningParams(_lws_x[coords[0]], _lws_y[coords[1]], _lws_z[coords[2]]); + return CLTuningParams(_lws_x[coords[0]], _lws_y[coords[1]], _lws_z[coords[2]], _wbsm[coords[3]]); } -CLTuningParametersListNormal::CLTuningParametersListNormal(const cl::NDRange &gws) +CLTuningParametersListNormal::CLTuningParametersListNormal(const cl::NDRange &gws, CLTuningInfo tuning_info) { auto lws_x_max = std::min(static_cast<unsigned int>(gws[0]), max_lws_supported_x); auto lws_y_max = std::min(static_cast<unsigned int>(gws[1]), max_lws_supported_y); auto lws_z_max = std::min(static_cast<unsigned int>(gws[2]), max_lws_supported_z); - // Initialize the LWS values to test + // Initialize the tuning parameters values to test + _lws_x = {}; + _lws_y = {}; + _lws_z = {}; initialize_lws_values(_lws_x, gws[0], lws_x_max, gws[2] > 16); // Explore lws that are not factors of gws only when gws[2] > 16 initialize_lws_values(_lws_y, gws[1], lws_y_max, gws[2] > 16); // Explore lws that are not factors of gws only when gws[2] > 16 initialize_lws_values(_lws_z, gws[2], lws_z_max, false); - search_space_shape = TensorShape(_lws_x.size(), _lws_y.size(), _lws_z.size()); + search_space_shape[0] = _lws_x.size(); + search_space_shape[1] = _lws_y.size(); + search_space_shape[2] = _lws_z.size(); + search_space_shape[3] = 1; + if(tuning_info.tune_wbsm) + { + _wbsm = { -2, -1, 0, 1, 2 }; + search_space_shape[3] = _wbsm.size(); + } } void CLTuningParametersListNormal::initialize_lws_values(std::vector<unsigned int> &lws, unsigned int gws, unsigned int lws_max, bool mod_let_one) @@ -207,18 +225,29 @@ void CLTuningParametersListNormal::initialize_lws_values(std::vector<unsigned in } } -CLTuningParametersListRapid::CLTuningParametersListRapid(const cl::NDRange &gws) +CLTuningParametersListRapid::CLTuningParametersListRapid(const cl::NDRange &gws, CLTuningInfo tuning_info) { auto lws_x_max = std::min(static_cast<unsigned int>(gws[0]), 8u); // Limit exploration to 1 - 8 auto lws_y_max = std::min(static_cast<unsigned int>(gws[1]), 4u); // Limit exploration to 1 - 4 auto lws_z_max = std::min(static_cast<unsigned int>(gws[2]), 4u); // Limit exploration to 1 - 4 // Initialize the LWS values to test + _lws_x = {}; + _lws_y = {}; + _lws_z = {}; initialize_lws_values(_lws_x, lws_x_max); initialize_lws_values(_lws_y, lws_y_max); initialize_lws_values(_lws_z, lws_z_max); - search_space_shape = TensorShape(_lws_x.size(), _lws_y.size(), _lws_z.size()); + search_space_shape[0] = _lws_x.size(); + search_space_shape[1] = _lws_y.size(); + search_space_shape[2] = _lws_z.size(); + search_space_shape[3] = 1; + if(tuning_info.tune_wbsm) + { + _wbsm = { -1, 0, 1 }; + search_space_shape[3] = _wbsm.size(); + } } void CLTuningParametersListRapid::initialize_lws_values(std::vector<unsigned int> &lws, unsigned int lws_max) @@ -231,16 +260,16 @@ void CLTuningParametersListRapid::initialize_lws_values(std::vector<unsigned int } } -std::unique_ptr<ICLTuningParametersList> get_tuning_parameters_list(CLTunerMode mode, const cl::NDRange &gws) +std::unique_ptr<ICLTuningParametersList> get_tuning_parameters_list(CLTuningInfo tuning_info, const cl::NDRange &gws) { - switch(mode) + switch(tuning_info.tuner_mode) { case CLTunerMode::EXHAUSTIVE: - return std::make_unique<CLTuningParametersListExhaustive>(gws); + return std::make_unique<CLTuningParametersListExhaustive>(gws, tuning_info); case CLTunerMode::NORMAL: - return std::make_unique<CLTuningParametersListNormal>(gws); + return std::make_unique<CLTuningParametersListNormal>(gws, tuning_info); case CLTunerMode::RAPID: - return std::make_unique<CLTuningParametersListRapid>(gws); + return std::make_unique<CLTuningParametersListRapid>(gws, tuning_info); default: return nullptr; } |