diff options
author | Abel Bernabeu <abel.bernabeu@arm.com> | 2017-09-28 09:53:45 +0100 |
---|---|---|
committer | Anthony Barbier <anthony.barbier@arm.com> | 2018-11-02 16:35:24 +0000 |
commit | 5a6e0532b39d674f8773014a0a553d9bc70a9baa (patch) | |
tree | 5cfb228a11ed903f9e2872dc86d5cd1fdf1edc08 /src | |
parent | 53b405f1e08ad41cb9a527abfe0308ec1edf18ff (diff) | |
download | ComputeLibrary-5a6e0532b39d674f8773014a0a553d9bc70a9baa.tar.gz |
COMPUTE-8024 Fixed the maximum OpenCL workgroup size
The maximum workgroup size depends on the kernel and the device, rather
than being a property of the device. The present patch fixes the case
when a kernel is queued with no workgroup size and the default workgroup
size is used instead.
A previous patch introduced a maximum workgroup size that depended on
the device but ignored the kernel. In OpenCL the maximum workgroup size
we query from the device is an upper bound of the actual maximum that
we can query for a given kernel running on the same device. For some
kernels the values will match, but for others we will get a lower value
when querying for an specific kernel (i.e. if the kernel uses a high
number of registers).
Change-Id: I3bed6bde80ddc4f0ddb8f82c80903774aa1999b6
Reviewed-on: http://mpd-gerrit.cambridge.arm.com/89471
Tested-by: Kaizen <jeremy.johnson+kaizengerrit@arm.com>
Reviewed-by: Anthony Barbier <anthony.barbier@arm.com>
Diffstat (limited to 'src')
-rw-r--r-- | src/core/CL/CLKernelLibrary.cpp | 21 | ||||
-rw-r--r-- | src/core/CL/ICLKernel.cpp | 25 | ||||
-rw-r--r-- | src/core/CL/OpenCL.cpp | 21 |
3 files changed, 53 insertions, 14 deletions
diff --git a/src/core/CL/CLKernelLibrary.cpp b/src/core/CL/CLKernelLibrary.cpp index e165cf3350..6e5e802538 100644 --- a/src/core/CL/CLKernelLibrary.cpp +++ b/src/core/CL/CLKernelLibrary.cpp @@ -569,7 +569,7 @@ const std::map<std::string, std::string> CLKernelLibrary::_program_source_map = }; CLKernelLibrary::CLKernelLibrary() - : _context(), _device(), _kernel_path("."), _programs_map(), _built_programs_map(), _max_workgroup_size(0) + : _context(), _device(), _kernel_path("."), _programs_map(), _built_programs_map() { } @@ -709,19 +709,18 @@ std::string CLKernelLibrary::get_program_source(const std::string &program_name) return program_source_it->second; } -size_t CLKernelLibrary::max_local_workgroup_size() +size_t CLKernelLibrary::max_local_workgroup_size(const cl::Kernel &kernel) const { - if(_max_workgroup_size == 0) - { - size_t err = clGetDeviceInfo(_device.get(), CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(size_t), &_max_workgroup_size, nullptr); - ARM_COMPUTE_ERROR_ON_MSG(err != 0, "clGetDeviceInfo failed to return valid information"); - ARM_COMPUTE_UNUSED(err); - } + size_t result; + + size_t err = kernel.getWorkGroupInfo(_device, CL_KERNEL_WORK_GROUP_SIZE, &result); + ARM_COMPUTE_ERROR_ON_MSG(err != 0, "clGetKernelWorkGroupInfo failed to return the maximum workgroup size for the kernel"); + ARM_COMPUTE_UNUSED(err); - return _max_workgroup_size; + return result; } -cl::NDRange CLKernelLibrary::default_ndrange() +cl::NDRange CLKernelLibrary::default_ndrange() const { - return cl::NDRange(std::min<size_t>(_max_workgroup_size, 128u), 1); + return cl::NDRange(128u, 1); } diff --git a/src/core/CL/ICLKernel.cpp b/src/core/CL/ICLKernel.cpp index 1e04f00343..17b58b727f 100644 --- a/src/core/CL/ICLKernel.cpp +++ b/src/core/CL/ICLKernel.cpp @@ -52,18 +52,28 @@ void arm_compute::enqueue(cl::CommandQueue &queue, ICLKernel &kernel, const Wind (window.y().end() - window.y().start()) / window.y().step(), (window.z().end() - window.z().start()) / window.z().step()); + cl::NDRange valid_lws; + if(lws_hint[0] * lws_hint[1] * lws_hint[2] > kernel.get_max_workgroup_size()) + { + valid_lws = cl::NullRange; + } + else + { + valid_lws = lws_hint; + } + cl::NDRange lws = cl::NullRange; - if((lws_hint[0] <= gws[0]) && (lws_hint[1] <= gws[1]) && (lws_hint[2] <= gws[2])) + if((valid_lws[0] <= gws[0]) && (valid_lws[1] <= gws[1]) && (valid_lws[2] <= gws[2])) { - lws = lws_hint; + lws = valid_lws; } queue.enqueueNDRangeKernel(kernel.kernel(), cl::NullRange, gws, lws); } ICLKernel::ICLKernel() - : _kernel(nullptr), _lws_hint(CLKernelLibrary::get().default_ndrange()), _target(GPUTarget::MIDGARD), _config_id(arm_compute::default_config_id) + : _kernel(nullptr), _lws_hint(CLKernelLibrary::get().default_ndrange()), _target(GPUTarget::MIDGARD), _config_id(arm_compute::default_config_id), _max_workgroup_size(0) { } @@ -163,3 +173,12 @@ GPUTarget ICLKernel::get_target() const { return _target; } + +size_t ICLKernel::get_max_workgroup_size() +{ + if(_max_workgroup_size == 0) + { + _max_workgroup_size = CLKernelLibrary::get().max_local_workgroup_size(_kernel); + } + return _max_workgroup_size; +} diff --git a/src/core/CL/OpenCL.cpp b/src/core/CL/OpenCL.cpp index cc2391977b..287c5e2f5a 100644 --- a/src/core/CL/OpenCL.cpp +++ b/src/core/CL/OpenCL.cpp @@ -101,6 +101,7 @@ bool CLSymbols::load(const std::string &library) clGetDeviceIDs = reinterpret_cast<clGetDeviceIDs_func>(dlsym(handle, "clGetDeviceIDs")); clRetainEvent = reinterpret_cast<clRetainEvent_func>(dlsym(handle, "clRetainEvent")); clGetPlatformIDs = reinterpret_cast<clGetPlatformIDs_func>(dlsym(handle, "clGetPlatformIDs")); + clGetKernelWorkGroupInfo = reinterpret_cast<clGetKernelWorkGroupInfo_func>(dlsym(handle, "clGetKernelWorkGroupInfo")); dlclose(handle); @@ -647,3 +648,23 @@ cl_int clGetPlatformIDs(cl_uint num_entries, cl_platform_id *platforms, cl_uint return CL_OUT_OF_RESOURCES; } } + +cl_int +clGetKernelWorkGroupInfo(cl_kernel kernel, + cl_device_id device, + cl_kernel_work_group_info param_name, + size_t param_value_size, + void *param_value, + size_t *param_value_size_ret) +{ + arm_compute::CLSymbols::get().load_default(); + auto func = arm_compute::CLSymbols::get().clGetKernelWorkGroupInfo; + if(func != nullptr) + { + return func(kernel, device, param_name, param_value_size, param_value, param_value_size_ret); + } + else + { + return CL_OUT_OF_RESOURCES; + } +} |