diff options
-rw-r--r-- | arm_compute/core/CL/CLKernelLibrary.h | 29 | ||||
-rw-r--r-- | arm_compute/core/CL/ICLKernel.h | 17 | ||||
-rw-r--r-- | arm_compute/core/CL/OpenCL.h | 2 | ||||
-rw-r--r-- | src/core/CL/CLKernelLibrary.cpp | 21 | ||||
-rw-r--r-- | src/core/CL/ICLKernel.cpp | 25 | ||||
-rw-r--r-- | src/core/CL/OpenCL.cpp | 21 |
6 files changed, 77 insertions, 38 deletions
diff --git a/arm_compute/core/CL/CLKernelLibrary.h b/arm_compute/core/CL/CLKernelLibrary.h index 4d4565d6fd..fc131cdcfe 100644 --- a/arm_compute/core/CL/CLKernelLibrary.h +++ b/arm_compute/core/CL/CLKernelLibrary.h @@ -162,11 +162,9 @@ public: */ void init(std::string kernel_path = ".", cl::Context context = cl::Context::getDefault(), cl::Device device = cl::Device::getDefault()) { - _kernel_path = std::move(kernel_path); - _context = std::move(context); - _device = std::move(device); - _max_workgroup_size = 0; - max_local_workgroup_size(); + _kernel_path = std::move(kernel_path); + _context = std::move(context); + _device = std::move(device); } /** Sets the path that the kernels reside in. * @@ -208,20 +206,15 @@ public: { _device = cl_devices[0]; } - - _max_workgroup_size = 0; - max_local_workgroup_size(); - }; + } /** Sets the CL device for which the programs are created. * * @param[in] device A CL device. */ void set_device(cl::Device device) { - _device = std::move(device); - _max_workgroup_size = 0; - max_local_workgroup_size(); - }; + _device = std::move(device); + } /** Creates a kernel from the kernel library. * * @param[in] kernel_name Kernel name. @@ -238,15 +231,14 @@ public: * */ void load_binary(); - /** Find the maximum number of local work items in a workgroup can be supported by the device + /** Find the maximum number of local work items in a workgroup can be supported for the kernel. * */ - size_t max_local_workgroup_size(); - - /** Return the default NDRange that is suitable for the device. + size_t max_local_workgroup_size(const cl::Kernel &kernel) const; + /** Return the default NDRange for the device. * */ - cl::NDRange default_ndrange(); + cl::NDRange default_ndrange() const; private: /** Load program and its dependencies. @@ -270,7 +262,6 @@ private: static const std::map<std::string, std::string> _kernel_program_map; /**< Map that associates kernel names with programs. */ static const std::map<std::string, std::string> _program_source_map; /**< Contains sources for all programs. Used for compile-time kernel inclusion. >*/ - size_t _max_workgroup_size; /** Maximum local workgroup size supported on the device */ }; } #endif /* __ARM_COMPUTE_CLKERNELLIBRARY_H__ */ diff --git a/arm_compute/core/CL/ICLKernel.h b/arm_compute/core/CL/ICLKernel.h index d118d13f3f..9119940bc5 100644 --- a/arm_compute/core/CL/ICLKernel.h +++ b/arm_compute/core/CL/ICLKernel.h @@ -175,6 +175,12 @@ public: */ GPUTarget get_target() const; + /** Get the maximum workgroup size for the device the CLKernelLibrary uses. + * + * @return The maximum workgroup size value. + */ + size_t get_max_workgroup_size(); + private: /** Add the passed array's parameters to the object's kernel's arguments starting from the index idx. * @@ -208,10 +214,11 @@ private: unsigned int num_arguments_per_tensor() const; protected: - cl::Kernel _kernel; /**< OpenCL kernel to run */ - cl::NDRange _lws_hint; /**< Local workgroup size hint for the OpenCL kernel */ - GPUTarget _target; /**< The targeted GPU */ - std::string _config_id; /**< Configuration ID */ + cl::Kernel _kernel; /**< OpenCL kernel to run */ + cl::NDRange _lws_hint; /**< Local workgroup size hint for the OpenCL kernel */ + GPUTarget _target; /**< The targeted GPU */ + std::string _config_id; /**< Configuration ID */ + size_t _max_workgroup_size; /**< The maximum workgroup size for this kernel */ }; /** Add the kernel to the command queue with the given window. @@ -223,7 +230,7 @@ protected: * @param[in,out] queue OpenCL command queue. * @param[in] kernel Kernel to enqueue * @param[in] window Window the kernel has to process. - * @param[in] lws_hint Local workgroup size requested, by default (128,1) + * @param[in] lws_hint Local workgroup size requested, by default (128,1). * * @note If any dimension of the lws is greater than the global workgroup size then no lws will be passed. */ diff --git a/arm_compute/core/CL/OpenCL.h b/arm_compute/core/CL/OpenCL.h index 151cc9b53d..6780e23c2d 100644 --- a/arm_compute/core/CL/OpenCL.h +++ b/arm_compute/core/CL/OpenCL.h @@ -84,6 +84,7 @@ public: using clGetDeviceIDs_func = cl_int (*)(cl_platform_id, cl_device_type, cl_uint, cl_device_id *, cl_uint *); using clRetainEvent_func = cl_int (*)(cl_event); using clGetPlatformIDs_func = cl_int (*)(cl_uint, cl_platform_id *, cl_uint *); + using clGetKernelWorkGroupInfo_func = cl_int (*)(cl_kernel, cl_device_id, cl_kernel_work_group_info, size_t, void *, size_t *); clBuildProgram_func clBuildProgram = nullptr; clEnqueueNDRangeKernel_func clEnqueueNDRangeKernel = nullptr; @@ -115,6 +116,7 @@ public: clGetDeviceIDs_func clGetDeviceIDs = nullptr; clRetainEvent_func clRetainEvent = nullptr; clGetPlatformIDs_func clGetPlatformIDs = nullptr; + clGetKernelWorkGroupInfo_func clGetKernelWorkGroupInfo = nullptr; private: std::pair<bool, bool> _loaded{ false, false }; diff --git a/src/core/CL/CLKernelLibrary.cpp b/src/core/CL/CLKernelLibrary.cpp index e165cf3350..6e5e802538 100644 --- a/src/core/CL/CLKernelLibrary.cpp +++ b/src/core/CL/CLKernelLibrary.cpp @@ -569,7 +569,7 @@ const std::map<std::string, std::string> CLKernelLibrary::_program_source_map = }; CLKernelLibrary::CLKernelLibrary() - : _context(), _device(), _kernel_path("."), _programs_map(), _built_programs_map(), _max_workgroup_size(0) + : _context(), _device(), _kernel_path("."), _programs_map(), _built_programs_map() { } @@ -709,19 +709,18 @@ std::string CLKernelLibrary::get_program_source(const std::string &program_name) return program_source_it->second; } -size_t CLKernelLibrary::max_local_workgroup_size() +size_t CLKernelLibrary::max_local_workgroup_size(const cl::Kernel &kernel) const { - if(_max_workgroup_size == 0) - { - size_t err = clGetDeviceInfo(_device.get(), CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(size_t), &_max_workgroup_size, nullptr); - ARM_COMPUTE_ERROR_ON_MSG(err != 0, "clGetDeviceInfo failed to return valid information"); - ARM_COMPUTE_UNUSED(err); - } + size_t result; + + size_t err = kernel.getWorkGroupInfo(_device, CL_KERNEL_WORK_GROUP_SIZE, &result); + ARM_COMPUTE_ERROR_ON_MSG(err != 0, "clGetKernelWorkGroupInfo failed to return the maximum workgroup size for the kernel"); + ARM_COMPUTE_UNUSED(err); - return _max_workgroup_size; + return result; } -cl::NDRange CLKernelLibrary::default_ndrange() +cl::NDRange CLKernelLibrary::default_ndrange() const { - return cl::NDRange(std::min<size_t>(_max_workgroup_size, 128u), 1); + return cl::NDRange(128u, 1); } diff --git a/src/core/CL/ICLKernel.cpp b/src/core/CL/ICLKernel.cpp index 1e04f00343..17b58b727f 100644 --- a/src/core/CL/ICLKernel.cpp +++ b/src/core/CL/ICLKernel.cpp @@ -52,18 +52,28 @@ void arm_compute::enqueue(cl::CommandQueue &queue, ICLKernel &kernel, const Wind (window.y().end() - window.y().start()) / window.y().step(), (window.z().end() - window.z().start()) / window.z().step()); + cl::NDRange valid_lws; + if(lws_hint[0] * lws_hint[1] * lws_hint[2] > kernel.get_max_workgroup_size()) + { + valid_lws = cl::NullRange; + } + else + { + valid_lws = lws_hint; + } + cl::NDRange lws = cl::NullRange; - if((lws_hint[0] <= gws[0]) && (lws_hint[1] <= gws[1]) && (lws_hint[2] <= gws[2])) + if((valid_lws[0] <= gws[0]) && (valid_lws[1] <= gws[1]) && (valid_lws[2] <= gws[2])) { - lws = lws_hint; + lws = valid_lws; } queue.enqueueNDRangeKernel(kernel.kernel(), cl::NullRange, gws, lws); } ICLKernel::ICLKernel() - : _kernel(nullptr), _lws_hint(CLKernelLibrary::get().default_ndrange()), _target(GPUTarget::MIDGARD), _config_id(arm_compute::default_config_id) + : _kernel(nullptr), _lws_hint(CLKernelLibrary::get().default_ndrange()), _target(GPUTarget::MIDGARD), _config_id(arm_compute::default_config_id), _max_workgroup_size(0) { } @@ -163,3 +173,12 @@ GPUTarget ICLKernel::get_target() const { return _target; } + +size_t ICLKernel::get_max_workgroup_size() +{ + if(_max_workgroup_size == 0) + { + _max_workgroup_size = CLKernelLibrary::get().max_local_workgroup_size(_kernel); + } + return _max_workgroup_size; +} diff --git a/src/core/CL/OpenCL.cpp b/src/core/CL/OpenCL.cpp index cc2391977b..287c5e2f5a 100644 --- a/src/core/CL/OpenCL.cpp +++ b/src/core/CL/OpenCL.cpp @@ -101,6 +101,7 @@ bool CLSymbols::load(const std::string &library) clGetDeviceIDs = reinterpret_cast<clGetDeviceIDs_func>(dlsym(handle, "clGetDeviceIDs")); clRetainEvent = reinterpret_cast<clRetainEvent_func>(dlsym(handle, "clRetainEvent")); clGetPlatformIDs = reinterpret_cast<clGetPlatformIDs_func>(dlsym(handle, "clGetPlatformIDs")); + clGetKernelWorkGroupInfo = reinterpret_cast<clGetKernelWorkGroupInfo_func>(dlsym(handle, "clGetKernelWorkGroupInfo")); dlclose(handle); @@ -647,3 +648,23 @@ cl_int clGetPlatformIDs(cl_uint num_entries, cl_platform_id *platforms, cl_uint return CL_OUT_OF_RESOURCES; } } + +cl_int +clGetKernelWorkGroupInfo(cl_kernel kernel, + cl_device_id device, + cl_kernel_work_group_info param_name, + size_t param_value_size, + void *param_value, + size_t *param_value_size_ret) +{ + arm_compute::CLSymbols::get().load_default(); + auto func = arm_compute::CLSymbols::get().clGetKernelWorkGroupInfo; + if(func != nullptr) + { + return func(kernel, device, param_name, param_value_size, param_value, param_value_size_ret); + } + else + { + return CL_OUT_OF_RESOURCES; + } +} |