From be9f9f9139b759d314f4f2a6d2ee747079666504 Mon Sep 17 00:00:00 2001 From: Manuel Bottini Date: Mon, 25 Jan 2021 15:07:17 +0000 Subject: Add WBSM tuning to CLTuner Add WBSM as possible parameter to be tuned Add helper functions to check WBSM support and setting the value in the kernel Update tuning parameter lists to use WBSM Update CLTuner to use WBSM The WBSM tuning is exposed as a parameter to be set at compile time by setting the CLTuningInfo CLTuningInfo contains information about the tuning mode and if wbsm tuning enabled Resolves: COMPMID-3936 Change-Id: Id53697c9c6d2cef41c049f368002f6197351b3ed Signed-off-by: Manuel Bottini Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/4914 Comments-Addressed: Arm Jenkins Tested-by: Arm Jenkins Reviewed-by: Gian Marco Iodice Reviewed-by: Georgios Pinitas --- arm_compute/core/CL/CLCompileContext.h | 9 +- arm_compute/core/CL/CLHelpers.h | 22 ++++- arm_compute/core/CL/CLKernelLibrary.h | 6 ++ arm_compute/core/CL/OpenCL.h | 3 +- arm_compute/runtime/CL/CLTuner.h | 1 - arm_compute/runtime/CL/CLTunerTypes.h | 5 +- arm_compute/runtime/CL/CLTuningParams.h | 85 +++++++++++++++-- .../runtime/CL/tuners/CLTuningParametersList.h | 5 +- src/core/CL/CLCompileContext.cpp | 19 ++-- src/core/CL/CLHelpers.cpp | 24 ++++- src/core/CL/CLKernelLibrary.cpp | 5 + src/core/CL/ICLKernel.cpp | 11 ++- src/core/CL/ICLKernel.h | 51 +++++++++-- src/core/CL/OpenCL.cpp | 20 +++- src/runtime/CL/CLTuner.cpp | 102 ++++++++++++++------- src/runtime/CL/tuners/CLTuningParametersList.cpp | 81 ++++++++++------ 16 files changed, 359 insertions(+), 90 deletions(-) diff --git a/arm_compute/core/CL/CLCompileContext.h b/arm_compute/core/CL/CLCompileContext.h index 6f6dc18b85..46a8c9b341 100644 --- a/arm_compute/core/CL/CLCompileContext.h +++ b/arm_compute/core/CL/CLCompileContext.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020 Arm Limited. + * Copyright (c) 2020-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -296,6 +296,12 @@ public: */ bool int64_base_atomics_supported() const; + /* Returns true if the workgroup batch size modifier parameter is supported on the cl device + * + * @return true if the workgroup batch size modifier parameter is supported, false otherwise + */ + bool is_wbsm_supported() const; + private: /** Load program and its dependencies. * @@ -327,6 +333,7 @@ private: CLDevice _device; /**< Underlying CL device. */ mutable std::map _programs_map; /**< Map with all already loaded program data. */ mutable std::map _built_programs_map; /**< Map with all already built program data. */ + bool _is_wbsm_supported; /**< Support of worksize batch size modifier support boolean*/ }; } // namespace arm_compute #endif /* ARM_COMPUTE_CLCOMPILECONTEXT_H */ diff --git a/arm_compute/core/CL/CLHelpers.h b/arm_compute/core/CL/CLHelpers.h index cf18e16e34..0e9aa5d6e5 100644 --- a/arm_compute/core/CL/CLHelpers.h +++ b/arm_compute/core/CL/CLHelpers.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2020 Arm Limited. + * Copyright (c) 2016-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -30,6 +30,11 @@ #include #include +/* CL Device capabilities */ +#define ARM_COMPUTE_LIBRARY_OPENCL_DEVICE_CAPABILITIES_ARM 0x41E4 +/* Workgroup Batch Size Modifier */ +#define ARM_COMPUTE_LIBRARY_OPENCL_EXEC_WBSM_ARM 0x41E6 + namespace arm_compute { class CLCoreRuntimeContext; @@ -226,5 +231,20 @@ cl::Kernel create_kernel(const CLCompileContext &ctx, const std::string &kernel_ */ cl::NDRange create_lws_hint_parallel_implementations(unsigned int input_dimension, unsigned int vector_size); +/* Helper function to check if the workgroup batch size modifier parameter is supported on the cl device + * + * @param[in] device cl device to check for support + * + * @return true if the workgroup batch size modifier parameter is supported, false otherwise + */ +bool get_wbsm_support_info(const cl::Device &device); + +/* Helper function to set the workgroup batch size modifier parameter in the kernel + * + * @param[in] kernel cl kernel to set the workgroup batch size modifier parameter + * @param[in] wbsm_hint workgroup batch size modifier to use + */ +void set_wbsm(cl::Kernel &kernel, cl_int wbsm_hint); + } // namespace arm_compute #endif /* ARM_COMPUTE_CLHELPERS_H */ diff --git a/arm_compute/core/CL/CLKernelLibrary.h b/arm_compute/core/CL/CLKernelLibrary.h index 193389388e..0d8e4a6164 100644 --- a/arm_compute/core/CL/CLKernelLibrary.h +++ b/arm_compute/core/CL/CLKernelLibrary.h @@ -148,6 +148,12 @@ public: */ std::string get_program_name(const std::string &kernel_name) const; + /* Returns true if the workgroup batch size modifier parameter is supported on the cl device + * + * @return true if the workgroup batch size modifier parameter is supported, false otherwise + */ + bool is_wbsm_supported(); + /** Sets the CL context used to create programs. * * @note Setting the context also resets the device to the diff --git a/arm_compute/core/CL/OpenCL.h b/arm_compute/core/CL/OpenCL.h index f9796d7e95..155c3e4eef 100644 --- a/arm_compute/core/CL/OpenCL.h +++ b/arm_compute/core/CL/OpenCL.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2020 Arm Limited. + * Copyright (c) 2016-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -135,6 +135,7 @@ public: DECLARE_FUNCTION_PTR(clEnqueueMarker); DECLARE_FUNCTION_PTR(clWaitForEvents); DECLARE_FUNCTION_PTR(clCreateImage); + DECLARE_FUNCTION_PTR(clSetKernelExecInfo); // Third-party extensions DECLARE_FUNCTION_PTR(clImportMemoryARM); diff --git a/arm_compute/runtime/CL/CLTuner.h b/arm_compute/runtime/CL/CLTuner.h index 9814867142..e1c98bf411 100644 --- a/arm_compute/runtime/CL/CLTuner.h +++ b/arm_compute/runtime/CL/CLTuner.h @@ -182,7 +182,6 @@ private: cl::Event _kernel_event; bool _tune_new_kernels; CLTuningInfo _tuning_info; - CLTunerMode _tuner_mode; }; } // namespace arm_compute #endif /*ARM_COMPUTE_CLTUNER_H */ diff --git a/arm_compute/runtime/CL/CLTunerTypes.h b/arm_compute/runtime/CL/CLTunerTypes.h index 49e2d615ea..e93ef5b2b3 100644 --- a/arm_compute/runtime/CL/CLTunerTypes.h +++ b/arm_compute/runtime/CL/CLTunerTypes.h @@ -42,7 +42,10 @@ enum class CLTunerMode /**< OpenCL tuner tuning information */ struct CLTuningInfo { - bool tune_lws = true; + CLTunerMode tuner_mode = CLTunerMode::NORMAL; /**< Parameter to select the level (granularity) of the tuning */ + bool tune_wbsm = false; /**< Flag to tune the batches of work groups distributed to compute units. + Internally, the library will check if this feature is available on + the target platform */ }; /** Converts a string to a strong types enumeration @ref CLTunerMode diff --git a/arm_compute/runtime/CL/CLTuningParams.h b/arm_compute/runtime/CL/CLTuningParams.h index 99a386638d..b50481336b 100644 --- a/arm_compute/runtime/CL/CLTuningParams.h +++ b/arm_compute/runtime/CL/CLTuningParams.h @@ -25,6 +25,10 @@ #define ARM_COMPUTE_CLTUNING_PARAMS_H #include "arm_compute/core/CL/OpenCL.h" +#include "arm_compute/runtime/CL/CLTunerTypes.h" +#include "support/StringSupport.h" + +#include namespace arm_compute { @@ -34,26 +38,95 @@ class CLTuningParams public: CLTuningParams(const CLTuningParams &) = default; - CLTuningParams(unsigned int lws_x = 0, unsigned int lws_y = 0, unsigned int lws_z = 0) - : _lws(lws_x, lws_y, lws_z) + CLTuningParams(unsigned int lws_x = 0, unsigned int lws_y = 0, unsigned int lws_z = 0, int wbsm = 0) + : _lws(lws_x, lws_y, lws_z), _wbsm(wbsm) { } - CLTuningParams(cl::NDRange lws) - : _lws(lws) + CLTuningParams(cl::NDRange lws, cl_int wbsm = 0) + : _lws(lws), _wbsm(wbsm) { } - void set_lws(cl::NDRange &lws) + + CLTuningParams(cl_int wbsm) + : CLTuningParams(cl::NullRange, wbsm) + { + } + + void set_lws(cl::NDRange lws) { _lws = lws; } - cl::NDRange get_lws() + cl::NDRange get_lws() const { return _lws; } + void set_wbsm(cl_int wbsm) + { + _wbsm = wbsm; + } + + cl_int get_wbsm() const + { + return _wbsm; + } + + std::string to_string(CLTuningInfo tuning_info) + { + std::string tuning_params_string = ""; + tuning_params_string += ";" + support::cpp11::to_string(_lws[0]) + ";" + support::cpp11::to_string(_lws[1]) + ";" + support::cpp11::to_string(_lws[2]); + if(tuning_info.tune_wbsm) + { + tuning_params_string += ";" + support::cpp11::to_string(_wbsm); + } + return tuning_params_string; + } + + bool from_string(CLTuningInfo tuning_info, std::string tuning_params_string) + { + std::replace(tuning_params_string.begin(), tuning_params_string.end(), ';', ' '); + std::vector array; + std::stringstream ss(tuning_params_string); + std::string temp; + while(ss >> temp) + { + array.push_back(temp); + } + // Read 3 values for lws + if(array.size() < 3) + { + return false; + } + const unsigned int lws_0 = support::cpp11::stoi(array[0]); + const unsigned int lws_1 = support::cpp11::stoi(array[1]); + const unsigned int lws_2 = support::cpp11::stoi(array[2]); + if(lws_0 == 0 && lws_1 == 0 && lws_2 == 0) + { + // If lws values are 0, cl::NullRange has to be used + // otherwise the lws object will be badly created + _lws = cl::NullRange; + } + else + { + _lws = cl::NDRange(lws_0, lws_1, lws_2); + } + array.erase(array.begin(), array.begin() + 3); + if(tuning_info.tune_wbsm) + { + if(array.size() < 1) + { + return false; + } + _wbsm = support::cpp11::stoi(array[0]); + array.erase(array.begin()); + } + return true; + } + private: cl::NDRange _lws; + cl_int _wbsm; }; } // namespace arm_compute #endif /*ARM_COMPUTE_CLTUNING_PARAMS_H */ diff --git a/arm_compute/runtime/CL/tuners/CLTuningParametersList.h b/arm_compute/runtime/CL/tuners/CLTuningParametersList.h index c51b9901ef..69572c98d2 100644 --- a/arm_compute/runtime/CL/tuners/CLTuningParametersList.h +++ b/arm_compute/runtime/CL/tuners/CLTuningParametersList.h @@ -76,10 +76,13 @@ public: }; /** Construct an ICLTuningParametersList object for the given tuner mode and gws configuration. + * + * @param[in] tuning_info Tuning info containng which parameters to tune and the tuner mode + * @param[in] gws Global worksize values * * @return unique_ptr to the requested ICLTuningParametersList implementation. */ -std::unique_ptr get_tuning_parameters_list(CLTunerMode mode, const cl::NDRange &gws); +std::unique_ptr get_tuning_parameters_list(CLTuningInfo tuning_info, const cl::NDRange &gws); } // namespace cl_tuner } // namespace arm_compute diff --git a/src/core/CL/CLCompileContext.cpp b/src/core/CL/CLCompileContext.cpp index 0afb7e5e0e..3db0fe515a 100644 --- a/src/core/CL/CLCompileContext.cpp +++ b/src/core/CL/CLCompileContext.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020 Arm Limited. + * Copyright (c) 2020-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -137,15 +137,16 @@ Kernel::Kernel(std::string name, const cl::Program &program) { } CLCompileContext::CLCompileContext() - : _context(), _device(), _programs_map(), _built_programs_map() + : _context(), _device(), _programs_map(), _built_programs_map(), _is_wbsm_supported() { } CLCompileContext::CLCompileContext(cl::Context context, const cl::Device &device) - : _context(), _device(), _programs_map(), _built_programs_map() + : _context(), _device(), _programs_map(), _built_programs_map(), _is_wbsm_supported() { - _context = std::move(context); - _device = CLDevice(device); + _context = std::move(context); + _device = CLDevice(device); + _is_wbsm_supported = get_wbsm_support_info(device); } Kernel CLCompileContext::create_kernel(const std::string &kernel_name, const std::string &program_name, const std::string &program_source, @@ -318,7 +319,8 @@ const cl::Device &CLCompileContext::get_device() const void CLCompileContext::set_device(cl::Device device) { - _device = std::move(device); + _device = std::move(device); + _is_wbsm_supported = get_wbsm_support_info(device); } cl::NDRange CLCompileContext::default_ndrange() const @@ -346,6 +348,11 @@ bool CLCompileContext::int64_base_atomics_supported() const return _device.supported("cl_khr_int64_base_atomics"); } +bool CLCompileContext::is_wbsm_supported() const +{ + return _is_wbsm_supported; +} + size_t CLCompileContext::max_local_workgroup_size(const cl::Kernel &kernel) const { size_t result; diff --git a/src/core/CL/CLHelpers.cpp b/src/core/CL/CLHelpers.cpp index 895bb72827..aff897738a 100644 --- a/src/core/CL/CLHelpers.cpp +++ b/src/core/CL/CLHelpers.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2020 Arm Limited. + * Copyright (c) 2016-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -415,4 +415,26 @@ cl::NDRange create_lws_hint_parallel_implementations(unsigned int input_dimensio const unsigned int num_of_threads = ((input_dimension + border_width) / 16); return cl::NDRange(std::min(8U, num_of_threads)); } + +bool get_wbsm_support_info(const cl::Device &device) +{ + cl_bitfield capabilities = 0; + cl_int err = clGetDeviceInfo(device.get(), ARM_COMPUTE_LIBRARY_OPENCL_DEVICE_CAPABILITIES_ARM, sizeof(cl_bitfield), &capabilities, nullptr); + if((err == CL_SUCCESS) && (capabilities & ARM_COMPUTE_LIBRARY_OPENCL_EXEC_WBSM_ARM)) + { + return true; + } + return false; +} + +void set_wbsm(cl::Kernel &kernel, cl_int wbsm_hint) +{ + cl_int err = clSetKernelExecInfo(kernel.get(), + ARM_COMPUTE_LIBRARY_OPENCL_EXEC_WBSM_ARM, + sizeof(cl_int), + &wbsm_hint); + ARM_COMPUTE_UNUSED(err); + ARM_COMPUTE_ERROR_ON(err != CL_SUCCESS); +} + } // namespace arm_compute diff --git a/src/core/CL/CLKernelLibrary.cpp b/src/core/CL/CLKernelLibrary.cpp index cf1c52e463..75f76ea344 100644 --- a/src/core/CL/CLKernelLibrary.cpp +++ b/src/core/CL/CLKernelLibrary.cpp @@ -1206,6 +1206,11 @@ bool CLKernelLibrary::int64_base_atomics_supported() const return _compile_context.int64_base_atomics_supported(); } +bool CLKernelLibrary::is_wbsm_supported() +{ + return _compile_context.is_wbsm_supported(); +} + std::pair CLKernelLibrary::get_program(const std::string &program_name) const { #ifdef EMBEDDED_KERNELS diff --git a/src/core/CL/ICLKernel.cpp b/src/core/CL/ICLKernel.cpp index 2b259bf28a..1c6963f3f1 100644 --- a/src/core/CL/ICLKernel.cpp +++ b/src/core/CL/ICLKernel.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2020 Arm Limited. + * Copyright (c) 2016-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -29,8 +29,6 @@ #include -using namespace arm_compute; - void arm_compute::enqueue(cl::CommandQueue &queue, ICLKernel &kernel, const Window &window, const cl::NDRange &lws_hint, bool use_dummy_work_items) { if(kernel.kernel()() == nullptr) @@ -77,9 +75,15 @@ void arm_compute::enqueue(cl::CommandQueue &queue, ICLKernel &kernel, const Wind lws = valid_lws; } + if(CLKernelLibrary::get().is_wbsm_supported()) + { + set_wbsm(kernel.kernel(), kernel.wbsm_hint()); + } queue.enqueueNDRangeKernel(kernel.kernel(), cl::NullRange, gws, lws); } +namespace arm_compute +{ template void ICLKernel::add_tensor_argument(unsigned &idx, const ICLTensor *tensor, const Window &window) { @@ -146,3 +150,4 @@ cl::NDRange ICLKernel::gws_from_window(const Window &window) return gws; } +} // namespace arm_compute \ No newline at end of file diff --git a/src/core/CL/ICLKernel.h b/src/core/CL/ICLKernel.h index a24cd8c798..6737109f34 100644 --- a/src/core/CL/ICLKernel.h +++ b/src/core/CL/ICLKernel.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2020 Arm Limited. + * Copyright (c) 2016-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -31,6 +31,7 @@ #include "arm_compute/core/IKernel.h" #include "arm_compute/core/Validate.h" #include "arm_compute/core/experimental/Types.h" +#include "arm_compute/runtime/CL/CLTuningParams.h" #include @@ -67,19 +68,30 @@ private: protected: /** Configure the kernel's window and local workgroup size hint. * - * @param[in] window The maximum window which will be returned by window() - * @param[in] lws_hint (Optional) Local-Workgroup-Size to use. + * @param[in] window The maximum window which will be returned by window() + * @param[in] lws_hint Local-Workgroup-Size to use. + * @param[in] wbsm_hint (Optional) Workgroup-Batch-Size-Modifier to use. */ - void configure_internal(const Window &window, cl::NDRange lws_hint = CLKernelLibrary::get().default_ndrange()) + void configure_internal(const Window &window, cl::NDRange lws_hint, cl_int wbsm_hint = 0) { - _lws_hint = lws_hint; + configure_internal(window, CLTuningParams(lws_hint, wbsm_hint)); + } + + /** Configure the kernel's window and tuning parameters hints. + * + * @param[in] window The maximum window which will be returned by window() + * @param[in] tuning_params_hint (Optional) Tuning parameters to use. + */ + void configure_internal(const Window &window, CLTuningParams tuning_params_hint = CLTuningParams(CLKernelLibrary::get().default_ndrange(), 0)) + { + _tuning_params_hint = tuning_params_hint; IKernel::configure(window); } public: /** Constructor */ ICLKernel() - : _kernel(nullptr), _target(GPUTarget::MIDGARD), _config_id(arm_compute::default_config_id), _max_workgroup_size(0), _lws_hint() + : _kernel(nullptr), _target(GPUTarget::MIDGARD), _config_id(arm_compute::default_config_id), _max_workgroup_size(0), _tuning_params_hint() { } /** Returns a reference to the OpenCL kernel of this object. @@ -254,7 +266,7 @@ public: void set_lws_hint(const cl::NDRange &lws_hint) { ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); // lws_hint will be overwritten by configure() - _lws_hint = lws_hint; + _tuning_params_hint.set_lws(lws_hint); } /** Return the Local-Workgroup-Size hint @@ -263,7 +275,28 @@ public: */ cl::NDRange lws_hint() const { - return _lws_hint; + return _tuning_params_hint.get_lws(); + } + + /** Set the workgroup batch size modifier hint + * + * @note This method should be called after the configuration of the kernel + * + * @param[in] wbsm_hint workgroup batch size modifier value + */ + void set_wbsm_hint(const cl_int &wbsm_hint) + { + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); // wbsm_hint will be overwritten by configure() + _tuning_params_hint.set_wbsm(wbsm_hint); + } + + /** Return the workgroup batch size modifier hint + * + * @return Current wbsm hint + */ + cl_int wbsm_hint() const + { + return _tuning_params_hint.get_wbsm(); } /** Get the configuration ID @@ -344,7 +377,7 @@ protected: std::string _config_id; /**< Configuration ID */ size_t _max_workgroup_size; /**< The maximum workgroup size for this kernel */ private: - cl::NDRange _lws_hint; /**< Local workgroup size hint for the OpenCL kernel */ + CLTuningParams _tuning_params_hint; /**< Tuning parameters hint for the OpenCL kernel */ }; /** Add the kernel to the command queue with the given window. diff --git a/src/core/CL/OpenCL.cpp b/src/core/CL/OpenCL.cpp index 6c70861946..aff6285697 100644 --- a/src/core/CL/OpenCL.cpp +++ b/src/core/CL/OpenCL.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2020 Arm Limited. + * Copyright (c) 2017-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -134,6 +134,7 @@ bool CLSymbols::load(const std::string &library) LOAD_FUNCTION_PTR(clEnqueueMarker, handle); LOAD_FUNCTION_PTR(clWaitForEvents, handle); LOAD_FUNCTION_PTR(clCreateImage, handle); + LOAD_FUNCTION_PTR(clSetKernelExecInfo, handle); // Third-party extensions LOAD_FUNCTION_PTR(clImportMemoryARM, handle); @@ -962,6 +963,23 @@ clCreateImage(cl_context context, } } +cl_int clSetKernelExecInfo(cl_kernel kernel, + cl_kernel_exec_info param_name, + size_t param_value_size, + const void *param_value) +{ + arm_compute::CLSymbols::get().load_default(); + auto func = arm_compute::CLSymbols::get().clSetKernelExecInfo_ptr; + if(func != nullptr) + { + return func(kernel, param_name, param_value_size, param_value); + } + else + { + return CL_OUT_OF_RESOURCES; + } +} + cl_mem clImportMemoryARM(cl_context context, cl_mem_flags flags, diff --git a/src/runtime/CL/CLTuner.cpp b/src/runtime/CL/CLTuner.cpp index 906021790e..e16d6808ed 100644 --- a/src/runtime/CL/CLTuner.cpp +++ b/src/runtime/CL/CLTuner.cpp @@ -32,13 +32,11 @@ #include #include #include -#include -#include namespace arm_compute { CLTuner::CLTuner(bool tune_new_kernels, CLTuningInfo tuning_info) - : real_clEnqueueNDRangeKernel(nullptr), _tuning_params_table(), _lws_table(), _kernel_event(), _tune_new_kernels(tune_new_kernels), _tuning_info(tuning_info), _tuner_mode(CLTunerMode::NORMAL) + : real_clEnqueueNDRangeKernel(nullptr), _tuning_params_table(), _lws_table(), _kernel_event(), _tune_new_kernels(tune_new_kernels), _tuning_info(tuning_info) { } @@ -62,12 +60,12 @@ bool CLTuner::tune_new_kernels() const void CLTuner::set_tuner_mode(CLTunerMode mode) { - _tuner_mode = mode; + _tuning_info.tuner_mode = mode; } CLTunerMode CLTuner::get_tuner_mode() const { - return _tuner_mode; + return _tuning_info.tuner_mode; } void CLTuner::tune_kernel_static(ICLKernel &kernel) @@ -103,12 +101,20 @@ void CLTuner::tune_kernel_dynamic(ICLKernel &kernel, ITensorPack &tensors) // Set Local-Workgroup-Size kernel.set_lws_hint(opt_tuning_params.get_lws()); + if(_tuning_info.tune_wbsm) + { + kernel.set_wbsm_hint(opt_tuning_params.get_wbsm()); + } } } else { // Set Local-Workgroup-Size kernel.set_lws_hint(p->second.get_lws()); + if(_tuning_info.tune_wbsm) + { + kernel.set_wbsm_hint(p->second.get_wbsm()); + } } } } @@ -188,13 +194,15 @@ CLTuningParams CLTuner::find_optimal_tuning_params(ICLKernel &kernel, ITensorPac cl_ulong min_exec_time = end - start; _kernel_event = nullptr; - cl::NDRange opt_lws = cl::NullRange; + CLTuningParams opt_tuning_params(cl::NullRange, 0); // Construct the list of tuning parameters values to be tested based on the tuner mode. - auto lws_list = cl_tuner::get_tuning_parameters_list(_tuner_mode, gws); - for(size_t i = 0; i < lws_list->size(); ++i) + auto tuning_list = cl_tuner::get_tuning_parameters_list(_tuning_info, gws); + for(size_t i = 0; i < tuning_list->size(); ++i) { - cl::NDRange lws_test = (*lws_list)[i].get_lws(); + CLTuningParams tuning_test = (*tuning_list)[i]; + // Setting the lws + cl::NDRange lws_test = tuning_test.get_lws(); auto x = lws_test[0]; auto y = lws_test[1]; auto z = lws_test[2]; @@ -205,8 +213,12 @@ CLTuningParams CLTuner::find_optimal_tuning_params(ICLKernel &kernel, ITensorPac continue; } - //Set the Local-Workgroup-Size kernel.set_lws_hint(lws_test); + if(_tuning_info.tune_wbsm && CLKernelLibrary::get().is_wbsm_supported()) + { + cl_int wbsm_test = tuning_test.get_wbsm(); + kernel.set_wbsm_hint(wbsm_test); + } // Run the kernel inject_memory ? kernel.run_op(tensors, kernel.window(), queue_profiler) : kernel.run(kernel.window(), queue_profiler); @@ -222,13 +234,17 @@ CLTuningParams CLTuner::find_optimal_tuning_params(ICLKernel &kernel, ITensorPac if(diff < min_exec_time) { min_exec_time = diff; - opt_lws = cl::NDRange(x, y, z); + opt_tuning_params.set_lws(tuning_test.get_lws()); + if(_tuning_info.tune_wbsm) + { + opt_tuning_params.set_wbsm(tuning_test.get_wbsm()); + } } } // Restore real function CLSymbols::get().clEnqueueNDRangeKernel_ptr = real_clEnqueueNDRangeKernel; - return CLTuningParams(opt_lws); + return opt_tuning_params; } void CLTuner::import_lws_table(const std::unordered_map &lws_table) @@ -271,34 +287,46 @@ void CLTuner::load_from_file(const std::string &filename) ARM_COMPUTE_ERROR_VAR("Failed to open '%s' (%s [%d])", filename.c_str(), strerror(errno), errno); } std::string line; + bool header_line = true; while(!std::getline(fs, line).fail()) { - std::istringstream ss(line); - std::string token; - if(std::getline(ss, token, ';').fail()) - { - ARM_COMPUTE_ERROR_VAR("Malformed row '%s' in %s (Should be of the form 'kernel_id;lws[0];lws[1];lws[2]')", ss.str().c_str(), filename.c_str()); - } - std::string kernel_id = token; - cl::NDRange lws(1, 1, 1); - for(int i = 0; i < 3; i++) + if(header_line) { - if(std::getline(ss, token, ';').fail()) + header_line = false; + size_t pos_lws = line.find("lws"); + size_t pos_wbsm = line.find("wbsm"); + _tuning_info.tune_wbsm = false; + if(pos_lws != std::string::npos || pos_wbsm != std::string::npos) { - ARM_COMPUTE_ERROR_VAR("Malformed row '%s' in %s (Should be of the form 'kernel_id;lws[0];lws[1];lws[2]')", ss.str().c_str(), filename.c_str()); + // The file has in the first line the parameters it has been tuned on + if(pos_wbsm != std::string::npos) + { + _tuning_info.tune_wbsm = true; + } + // Once the line with the tuning parameter is read we can + // read the next one to start collecting the values + if(std::getline(fs, line).fail()) + { + break; + } } - lws.get()[i] = support::cpp11::stoi(token); } - // If all dimensions are 0: reset to NullRange (i.e nullptr) - if(lws[0] == 0 && lws[1] == 0 && lws[2] == 0) + CLTuningParams tuning_params; + size_t pos = line.find(";"); + if(pos == std::string::npos) { - lws = cl::NullRange; + ARM_COMPUTE_ERROR_VAR("Malformed row '%s' in %s", line.c_str(), filename.c_str()); } - add_tuning_params(kernel_id, lws); + std::string kernel_id = line.substr(0, pos); + line.erase(0, pos + 1); + if(!tuning_params.from_string(_tuning_info, line)) + { + ARM_COMPUTE_ERROR_VAR("Malformed row '%s' in %s", line.c_str(), filename.c_str()); + } + add_tuning_params(kernel_id, tuning_params); } fs.close(); - _tuning_info.tune_lws = true; } bool CLTuner::save_to_file(const std::string &filename) const @@ -307,14 +335,24 @@ bool CLTuner::save_to_file(const std::string &filename) const { return false; } - std::ofstream fs; fs.exceptions(std::ifstream::failbit | std::ifstream::badbit); fs.open(filename, std::ios::out); + std::string header_string = ""; + header_string += "lws"; + if(_tuning_info.tune_wbsm) + { + if(!header_string.empty()) + { + header_string += " "; + } + header_string += "wbsm"; + } + fs << header_string << std::endl; for(auto const &kernel_data : _tuning_params_table) { - const cl::NDRange lws = CLTuningParams(kernel_data.second).get_lws(); - fs << kernel_data.first << ";" << lws[0] << ";" << lws[1] << ";" << lws[2] << std::endl; + CLTuningParams tun_pams(kernel_data.second); + fs << kernel_data.first << tun_pams.to_string(_tuning_info) << std::endl; } fs.close(); return true; diff --git a/src/runtime/CL/tuners/CLTuningParametersList.cpp b/src/runtime/CL/tuners/CLTuningParametersList.cpp index 7f63078192..6cb2212794 100644 --- a/src/runtime/CL/tuners/CLTuningParametersList.cpp +++ b/src/runtime/CL/tuners/CLTuningParametersList.cpp @@ -35,8 +35,14 @@ constexpr unsigned int max_lws_supported_z{ 32u }; class CLTuningParametersList : public ICLTuningParametersList { protected: - /* Shape of 3-D search space */ - TensorShape search_space_shape{ 0, 0, 0 }; + /* Shape of 4-D search space */ + TensorShape search_space_shape{ 0, 0, 0, 0 }; + std::vector _lws_x{ 0 }; + std::vector _lws_y{ 0 }; + std::vector _lws_z{ 0 }; + std::vector _wbsm{ 0 }; /* Modify the batches size of workgroups distributed to compute units. + The value is in the range [-31,+31]. + When 0, the runtime-selected wbs used is unmodified. */ /** Constructor */ CLTuningParametersList() = default; @@ -62,7 +68,7 @@ public: /** Prevent default constructor calls */ CLTuningParametersListExhaustive() = delete; /** Constructor */ - CLTuningParametersListExhaustive(const cl::NDRange &gws); + CLTuningParametersListExhaustive(const cl::NDRange &gws, CLTuningInfo tuning_info); /** Copy Constructor */ CLTuningParametersListExhaustive(const CLTuningParametersListExhaustive &) = default; /** Move Constructor */ @@ -83,7 +89,7 @@ class CLTuningParametersListNormal : public CLTuningParametersList { public: /** Constructor */ - CLTuningParametersListNormal(const cl::NDRange &gws); + CLTuningParametersListNormal(const cl::NDRange &gws, CLTuningInfo tuning_info); /** Copy Constructor */ CLTuningParametersListNormal(const CLTuningParametersListNormal &) = default; /** Move Constructor */ @@ -98,11 +104,6 @@ public: // Inherited methods overridden: CLTuningParams operator[](size_t) override; -protected: - std::vector _lws_x{}; - std::vector _lws_y{}; - std::vector _lws_z{}; - /** Prevent default constructor calls */ CLTuningParametersListNormal() = default; @@ -125,7 +126,7 @@ public: /** Prevent default constructor calls */ CLTuningParametersListRapid() = delete; /** Constructor */ - CLTuningParametersListRapid(const cl::NDRange &gws); + CLTuningParametersListRapid(const cl::NDRange &gws, CLTuningInfo tuning_info); /** Copy Constructor */ CLTuningParametersListRapid(const CLTuningParametersListRapid &) = default; /** Move Constructor */ @@ -156,36 +157,53 @@ CLTuningParams CLTuningParametersListExhaustive::operator[](size_t index) { ARM_COMPUTE_ERROR_ON(index >= size()); auto coords = index2coords(search_space_shape, index); - return CLTuningParams(coords[0] + 1U, coords[1] + 1U, coords[2] + 1U); + return CLTuningParams(coords[0] + 1U, coords[1] + 1U, coords[2] + 1U, static_cast(coords[3])); } -CLTuningParametersListExhaustive::CLTuningParametersListExhaustive(const cl::NDRange &gws) +CLTuningParametersListExhaustive::CLTuningParametersListExhaustive(const cl::NDRange &gws, CLTuningInfo tuning_info) { ARM_COMPUTE_UNUSED(gws); - search_space_shape = TensorShape(max_lws_supported_x, - max_lws_supported_y, - max_lws_supported_z); + search_space_shape[0] = max_lws_supported_x; + search_space_shape[1] = max_lws_supported_y; + search_space_shape[2] = max_lws_supported_z; + search_space_shape[3] = 1; + if(tuning_info.tune_wbsm) + { + _wbsm = { -3, -2, -1, 0, 1, 2, 3 }; + search_space_shape[3] = _wbsm.size(); + } } CLTuningParams CLTuningParametersListNormal::operator[](size_t index) { ARM_COMPUTE_ERROR_ON(index >= size()); auto coords = index2coords(search_space_shape, index); - return CLTuningParams(_lws_x[coords[0]], _lws_y[coords[1]], _lws_z[coords[2]]); + return CLTuningParams(_lws_x[coords[0]], _lws_y[coords[1]], _lws_z[coords[2]], _wbsm[coords[3]]); } -CLTuningParametersListNormal::CLTuningParametersListNormal(const cl::NDRange &gws) +CLTuningParametersListNormal::CLTuningParametersListNormal(const cl::NDRange &gws, CLTuningInfo tuning_info) { auto lws_x_max = std::min(static_cast(gws[0]), max_lws_supported_x); auto lws_y_max = std::min(static_cast(gws[1]), max_lws_supported_y); auto lws_z_max = std::min(static_cast(gws[2]), max_lws_supported_z); - // Initialize the LWS values to test + // Initialize the tuning parameters values to test + _lws_x = {}; + _lws_y = {}; + _lws_z = {}; initialize_lws_values(_lws_x, gws[0], lws_x_max, gws[2] > 16); // Explore lws that are not factors of gws only when gws[2] > 16 initialize_lws_values(_lws_y, gws[1], lws_y_max, gws[2] > 16); // Explore lws that are not factors of gws only when gws[2] > 16 initialize_lws_values(_lws_z, gws[2], lws_z_max, false); - search_space_shape = TensorShape(_lws_x.size(), _lws_y.size(), _lws_z.size()); + search_space_shape[0] = _lws_x.size(); + search_space_shape[1] = _lws_y.size(); + search_space_shape[2] = _lws_z.size(); + search_space_shape[3] = 1; + if(tuning_info.tune_wbsm) + { + _wbsm = { -2, -1, 0, 1, 2 }; + search_space_shape[3] = _wbsm.size(); + } } void CLTuningParametersListNormal::initialize_lws_values(std::vector &lws, unsigned int gws, unsigned int lws_max, bool mod_let_one) @@ -207,18 +225,29 @@ void CLTuningParametersListNormal::initialize_lws_values(std::vector(gws[0]), 8u); // Limit exploration to 1 - 8 auto lws_y_max = std::min(static_cast(gws[1]), 4u); // Limit exploration to 1 - 4 auto lws_z_max = std::min(static_cast(gws[2]), 4u); // Limit exploration to 1 - 4 // Initialize the LWS values to test + _lws_x = {}; + _lws_y = {}; + _lws_z = {}; initialize_lws_values(_lws_x, lws_x_max); initialize_lws_values(_lws_y, lws_y_max); initialize_lws_values(_lws_z, lws_z_max); - search_space_shape = TensorShape(_lws_x.size(), _lws_y.size(), _lws_z.size()); + search_space_shape[0] = _lws_x.size(); + search_space_shape[1] = _lws_y.size(); + search_space_shape[2] = _lws_z.size(); + search_space_shape[3] = 1; + if(tuning_info.tune_wbsm) + { + _wbsm = { -1, 0, 1 }; + search_space_shape[3] = _wbsm.size(); + } } void CLTuningParametersListRapid::initialize_lws_values(std::vector &lws, unsigned int lws_max) @@ -231,16 +260,16 @@ void CLTuningParametersListRapid::initialize_lws_values(std::vector get_tuning_parameters_list(CLTunerMode mode, const cl::NDRange &gws) +std::unique_ptr get_tuning_parameters_list(CLTuningInfo tuning_info, const cl::NDRange &gws) { - switch(mode) + switch(tuning_info.tuner_mode) { case CLTunerMode::EXHAUSTIVE: - return std::make_unique(gws); + return std::make_unique(gws, tuning_info); case CLTunerMode::NORMAL: - return std::make_unique(gws); + return std::make_unique(gws, tuning_info); case CLTunerMode::RAPID: - return std::make_unique(gws); + return std::make_unique(gws, tuning_info); default: return nullptr; } -- cgit v1.2.1