From 5a6e0532b39d674f8773014a0a553d9bc70a9baa Mon Sep 17 00:00:00 2001 From: Abel Bernabeu Date: Thu, 28 Sep 2017 09:53:45 +0100 Subject: COMPUTE-8024 Fixed the maximum OpenCL workgroup size The maximum workgroup size depends on the kernel and the device, rather than being a property of the device. The present patch fixes the case when a kernel is queued with no workgroup size and the default workgroup size is used instead. A previous patch introduced a maximum workgroup size that depended on the device but ignored the kernel. In OpenCL the maximum workgroup size we query from the device is an upper bound of the actual maximum that we can query for a given kernel running on the same device. For some kernels the values will match, but for others we will get a lower value when querying for an specific kernel (i.e. if the kernel uses a high number of registers). Change-Id: I3bed6bde80ddc4f0ddb8f82c80903774aa1999b6 Reviewed-on: http://mpd-gerrit.cambridge.arm.com/89471 Tested-by: Kaizen Reviewed-by: Anthony Barbier --- arm_compute/core/CL/CLKernelLibrary.h | 29 ++++++++++------------------- arm_compute/core/CL/ICLKernel.h | 17 ++++++++++++----- arm_compute/core/CL/OpenCL.h | 2 ++ src/core/CL/CLKernelLibrary.cpp | 21 ++++++++++----------- src/core/CL/ICLKernel.cpp | 25 ++++++++++++++++++++++--- src/core/CL/OpenCL.cpp | 21 +++++++++++++++++++++ 6 files changed, 77 insertions(+), 38 deletions(-) diff --git a/arm_compute/core/CL/CLKernelLibrary.h b/arm_compute/core/CL/CLKernelLibrary.h index 4d4565d6fd..fc131cdcfe 100644 --- a/arm_compute/core/CL/CLKernelLibrary.h +++ b/arm_compute/core/CL/CLKernelLibrary.h @@ -162,11 +162,9 @@ public: */ void init(std::string kernel_path = ".", cl::Context context = cl::Context::getDefault(), cl::Device device = cl::Device::getDefault()) { - _kernel_path = std::move(kernel_path); - _context = std::move(context); - _device = std::move(device); - _max_workgroup_size = 0; - max_local_workgroup_size(); + _kernel_path = std::move(kernel_path); + _context = std::move(context); + _device = std::move(device); } /** Sets the path that the kernels reside in. * @@ -208,20 +206,15 @@ public: { _device = cl_devices[0]; } - - _max_workgroup_size = 0; - max_local_workgroup_size(); - }; + } /** Sets the CL device for which the programs are created. * * @param[in] device A CL device. */ void set_device(cl::Device device) { - _device = std::move(device); - _max_workgroup_size = 0; - max_local_workgroup_size(); - }; + _device = std::move(device); + } /** Creates a kernel from the kernel library. * * @param[in] kernel_name Kernel name. @@ -238,15 +231,14 @@ public: * */ void load_binary(); - /** Find the maximum number of local work items in a workgroup can be supported by the device + /** Find the maximum number of local work items in a workgroup can be supported for the kernel. * */ - size_t max_local_workgroup_size(); - - /** Return the default NDRange that is suitable for the device. + size_t max_local_workgroup_size(const cl::Kernel &kernel) const; + /** Return the default NDRange for the device. * */ - cl::NDRange default_ndrange(); + cl::NDRange default_ndrange() const; private: /** Load program and its dependencies. @@ -270,7 +262,6 @@ private: static const std::map _kernel_program_map; /**< Map that associates kernel names with programs. */ static const std::map _program_source_map; /**< Contains sources for all programs. Used for compile-time kernel inclusion. >*/ - size_t _max_workgroup_size; /** Maximum local workgroup size supported on the device */ }; } #endif /* __ARM_COMPUTE_CLKERNELLIBRARY_H__ */ diff --git a/arm_compute/core/CL/ICLKernel.h b/arm_compute/core/CL/ICLKernel.h index d118d13f3f..9119940bc5 100644 --- a/arm_compute/core/CL/ICLKernel.h +++ b/arm_compute/core/CL/ICLKernel.h @@ -175,6 +175,12 @@ public: */ GPUTarget get_target() const; + /** Get the maximum workgroup size for the device the CLKernelLibrary uses. + * + * @return The maximum workgroup size value. + */ + size_t get_max_workgroup_size(); + private: /** Add the passed array's parameters to the object's kernel's arguments starting from the index idx. * @@ -208,10 +214,11 @@ private: unsigned int num_arguments_per_tensor() const; protected: - cl::Kernel _kernel; /**< OpenCL kernel to run */ - cl::NDRange _lws_hint; /**< Local workgroup size hint for the OpenCL kernel */ - GPUTarget _target; /**< The targeted GPU */ - std::string _config_id; /**< Configuration ID */ + cl::Kernel _kernel; /**< OpenCL kernel to run */ + cl::NDRange _lws_hint; /**< Local workgroup size hint for the OpenCL kernel */ + GPUTarget _target; /**< The targeted GPU */ + std::string _config_id; /**< Configuration ID */ + size_t _max_workgroup_size; /**< The maximum workgroup size for this kernel */ }; /** Add the kernel to the command queue with the given window. @@ -223,7 +230,7 @@ protected: * @param[in,out] queue OpenCL command queue. * @param[in] kernel Kernel to enqueue * @param[in] window Window the kernel has to process. - * @param[in] lws_hint Local workgroup size requested, by default (128,1) + * @param[in] lws_hint Local workgroup size requested, by default (128,1). * * @note If any dimension of the lws is greater than the global workgroup size then no lws will be passed. */ diff --git a/arm_compute/core/CL/OpenCL.h b/arm_compute/core/CL/OpenCL.h index 151cc9b53d..6780e23c2d 100644 --- a/arm_compute/core/CL/OpenCL.h +++ b/arm_compute/core/CL/OpenCL.h @@ -84,6 +84,7 @@ public: using clGetDeviceIDs_func = cl_int (*)(cl_platform_id, cl_device_type, cl_uint, cl_device_id *, cl_uint *); using clRetainEvent_func = cl_int (*)(cl_event); using clGetPlatformIDs_func = cl_int (*)(cl_uint, cl_platform_id *, cl_uint *); + using clGetKernelWorkGroupInfo_func = cl_int (*)(cl_kernel, cl_device_id, cl_kernel_work_group_info, size_t, void *, size_t *); clBuildProgram_func clBuildProgram = nullptr; clEnqueueNDRangeKernel_func clEnqueueNDRangeKernel = nullptr; @@ -115,6 +116,7 @@ public: clGetDeviceIDs_func clGetDeviceIDs = nullptr; clRetainEvent_func clRetainEvent = nullptr; clGetPlatformIDs_func clGetPlatformIDs = nullptr; + clGetKernelWorkGroupInfo_func clGetKernelWorkGroupInfo = nullptr; private: std::pair _loaded{ false, false }; diff --git a/src/core/CL/CLKernelLibrary.cpp b/src/core/CL/CLKernelLibrary.cpp index e165cf3350..6e5e802538 100644 --- a/src/core/CL/CLKernelLibrary.cpp +++ b/src/core/CL/CLKernelLibrary.cpp @@ -569,7 +569,7 @@ const std::map CLKernelLibrary::_program_source_map = }; CLKernelLibrary::CLKernelLibrary() - : _context(), _device(), _kernel_path("."), _programs_map(), _built_programs_map(), _max_workgroup_size(0) + : _context(), _device(), _kernel_path("."), _programs_map(), _built_programs_map() { } @@ -709,19 +709,18 @@ std::string CLKernelLibrary::get_program_source(const std::string &program_name) return program_source_it->second; } -size_t CLKernelLibrary::max_local_workgroup_size() +size_t CLKernelLibrary::max_local_workgroup_size(const cl::Kernel &kernel) const { - if(_max_workgroup_size == 0) - { - size_t err = clGetDeviceInfo(_device.get(), CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(size_t), &_max_workgroup_size, nullptr); - ARM_COMPUTE_ERROR_ON_MSG(err != 0, "clGetDeviceInfo failed to return valid information"); - ARM_COMPUTE_UNUSED(err); - } + size_t result; + + size_t err = kernel.getWorkGroupInfo(_device, CL_KERNEL_WORK_GROUP_SIZE, &result); + ARM_COMPUTE_ERROR_ON_MSG(err != 0, "clGetKernelWorkGroupInfo failed to return the maximum workgroup size for the kernel"); + ARM_COMPUTE_UNUSED(err); - return _max_workgroup_size; + return result; } -cl::NDRange CLKernelLibrary::default_ndrange() +cl::NDRange CLKernelLibrary::default_ndrange() const { - return cl::NDRange(std::min(_max_workgroup_size, 128u), 1); + return cl::NDRange(128u, 1); } diff --git a/src/core/CL/ICLKernel.cpp b/src/core/CL/ICLKernel.cpp index 1e04f00343..17b58b727f 100644 --- a/src/core/CL/ICLKernel.cpp +++ b/src/core/CL/ICLKernel.cpp @@ -52,18 +52,28 @@ void arm_compute::enqueue(cl::CommandQueue &queue, ICLKernel &kernel, const Wind (window.y().end() - window.y().start()) / window.y().step(), (window.z().end() - window.z().start()) / window.z().step()); + cl::NDRange valid_lws; + if(lws_hint[0] * lws_hint[1] * lws_hint[2] > kernel.get_max_workgroup_size()) + { + valid_lws = cl::NullRange; + } + else + { + valid_lws = lws_hint; + } + cl::NDRange lws = cl::NullRange; - if((lws_hint[0] <= gws[0]) && (lws_hint[1] <= gws[1]) && (lws_hint[2] <= gws[2])) + if((valid_lws[0] <= gws[0]) && (valid_lws[1] <= gws[1]) && (valid_lws[2] <= gws[2])) { - lws = lws_hint; + lws = valid_lws; } queue.enqueueNDRangeKernel(kernel.kernel(), cl::NullRange, gws, lws); } ICLKernel::ICLKernel() - : _kernel(nullptr), _lws_hint(CLKernelLibrary::get().default_ndrange()), _target(GPUTarget::MIDGARD), _config_id(arm_compute::default_config_id) + : _kernel(nullptr), _lws_hint(CLKernelLibrary::get().default_ndrange()), _target(GPUTarget::MIDGARD), _config_id(arm_compute::default_config_id), _max_workgroup_size(0) { } @@ -163,3 +173,12 @@ GPUTarget ICLKernel::get_target() const { return _target; } + +size_t ICLKernel::get_max_workgroup_size() +{ + if(_max_workgroup_size == 0) + { + _max_workgroup_size = CLKernelLibrary::get().max_local_workgroup_size(_kernel); + } + return _max_workgroup_size; +} diff --git a/src/core/CL/OpenCL.cpp b/src/core/CL/OpenCL.cpp index cc2391977b..287c5e2f5a 100644 --- a/src/core/CL/OpenCL.cpp +++ b/src/core/CL/OpenCL.cpp @@ -101,6 +101,7 @@ bool CLSymbols::load(const std::string &library) clGetDeviceIDs = reinterpret_cast(dlsym(handle, "clGetDeviceIDs")); clRetainEvent = reinterpret_cast(dlsym(handle, "clRetainEvent")); clGetPlatformIDs = reinterpret_cast(dlsym(handle, "clGetPlatformIDs")); + clGetKernelWorkGroupInfo = reinterpret_cast(dlsym(handle, "clGetKernelWorkGroupInfo")); dlclose(handle); @@ -647,3 +648,23 @@ cl_int clGetPlatformIDs(cl_uint num_entries, cl_platform_id *platforms, cl_uint return CL_OUT_OF_RESOURCES; } } + +cl_int +clGetKernelWorkGroupInfo(cl_kernel kernel, + cl_device_id device, + cl_kernel_work_group_info param_name, + size_t param_value_size, + void *param_value, + size_t *param_value_size_ret) +{ + arm_compute::CLSymbols::get().load_default(); + auto func = arm_compute::CLSymbols::get().clGetKernelWorkGroupInfo; + if(func != nullptr) + { + return func(kernel, device, param_name, param_value_size, param_value, param_value_size_ret); + } + else + { + return CL_OUT_OF_RESOURCES; + } +} -- cgit v1.2.1