aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--arm_compute/core/CL/CLKernelLibrary.h29
-rw-r--r--arm_compute/core/CL/ICLKernel.h17
-rw-r--r--arm_compute/core/CL/OpenCL.h2
-rw-r--r--src/core/CL/CLKernelLibrary.cpp21
-rw-r--r--src/core/CL/ICLKernel.cpp25
-rw-r--r--src/core/CL/OpenCL.cpp21
6 files changed, 77 insertions, 38 deletions
diff --git a/arm_compute/core/CL/CLKernelLibrary.h b/arm_compute/core/CL/CLKernelLibrary.h
index 4d4565d6fd..fc131cdcfe 100644
--- a/arm_compute/core/CL/CLKernelLibrary.h
+++ b/arm_compute/core/CL/CLKernelLibrary.h
@@ -162,11 +162,9 @@ public:
*/
void init(std::string kernel_path = ".", cl::Context context = cl::Context::getDefault(), cl::Device device = cl::Device::getDefault())
{
- _kernel_path = std::move(kernel_path);
- _context = std::move(context);
- _device = std::move(device);
- _max_workgroup_size = 0;
- max_local_workgroup_size();
+ _kernel_path = std::move(kernel_path);
+ _context = std::move(context);
+ _device = std::move(device);
}
/** Sets the path that the kernels reside in.
*
@@ -208,20 +206,15 @@ public:
{
_device = cl_devices[0];
}
-
- _max_workgroup_size = 0;
- max_local_workgroup_size();
- };
+ }
/** Sets the CL device for which the programs are created.
*
* @param[in] device A CL device.
*/
void set_device(cl::Device device)
{
- _device = std::move(device);
- _max_workgroup_size = 0;
- max_local_workgroup_size();
- };
+ _device = std::move(device);
+ }
/** Creates a kernel from the kernel library.
*
* @param[in] kernel_name Kernel name.
@@ -238,15 +231,14 @@ public:
*
*/
void load_binary();
- /** Find the maximum number of local work items in a workgroup can be supported by the device
+ /** Find the maximum number of local work items in a workgroup can be supported for the kernel.
*
*/
- size_t max_local_workgroup_size();
-
- /** Return the default NDRange that is suitable for the device.
+ size_t max_local_workgroup_size(const cl::Kernel &kernel) const;
+ /** Return the default NDRange for the device.
*
*/
- cl::NDRange default_ndrange();
+ cl::NDRange default_ndrange() const;
private:
/** Load program and its dependencies.
@@ -270,7 +262,6 @@ private:
static const std::map<std::string, std::string> _kernel_program_map; /**< Map that associates kernel names with programs. */
static const std::map<std::string, std::string> _program_source_map; /**< Contains sources for all programs.
Used for compile-time kernel inclusion. >*/
- size_t _max_workgroup_size; /** Maximum local workgroup size supported on the device */
};
}
#endif /* __ARM_COMPUTE_CLKERNELLIBRARY_H__ */
diff --git a/arm_compute/core/CL/ICLKernel.h b/arm_compute/core/CL/ICLKernel.h
index d118d13f3f..9119940bc5 100644
--- a/arm_compute/core/CL/ICLKernel.h
+++ b/arm_compute/core/CL/ICLKernel.h
@@ -175,6 +175,12 @@ public:
*/
GPUTarget get_target() const;
+ /** Get the maximum workgroup size for the device the CLKernelLibrary uses.
+ *
+ * @return The maximum workgroup size value.
+ */
+ size_t get_max_workgroup_size();
+
private:
/** Add the passed array's parameters to the object's kernel's arguments starting from the index idx.
*
@@ -208,10 +214,11 @@ private:
unsigned int num_arguments_per_tensor() const;
protected:
- cl::Kernel _kernel; /**< OpenCL kernel to run */
- cl::NDRange _lws_hint; /**< Local workgroup size hint for the OpenCL kernel */
- GPUTarget _target; /**< The targeted GPU */
- std::string _config_id; /**< Configuration ID */
+ cl::Kernel _kernel; /**< OpenCL kernel to run */
+ cl::NDRange _lws_hint; /**< Local workgroup size hint for the OpenCL kernel */
+ GPUTarget _target; /**< The targeted GPU */
+ std::string _config_id; /**< Configuration ID */
+ size_t _max_workgroup_size; /**< The maximum workgroup size for this kernel */
};
/** Add the kernel to the command queue with the given window.
@@ -223,7 +230,7 @@ protected:
* @param[in,out] queue OpenCL command queue.
* @param[in] kernel Kernel to enqueue
* @param[in] window Window the kernel has to process.
- * @param[in] lws_hint Local workgroup size requested, by default (128,1)
+ * @param[in] lws_hint Local workgroup size requested, by default (128,1).
*
* @note If any dimension of the lws is greater than the global workgroup size then no lws will be passed.
*/
diff --git a/arm_compute/core/CL/OpenCL.h b/arm_compute/core/CL/OpenCL.h
index 151cc9b53d..6780e23c2d 100644
--- a/arm_compute/core/CL/OpenCL.h
+++ b/arm_compute/core/CL/OpenCL.h
@@ -84,6 +84,7 @@ public:
using clGetDeviceIDs_func = cl_int (*)(cl_platform_id, cl_device_type, cl_uint, cl_device_id *, cl_uint *);
using clRetainEvent_func = cl_int (*)(cl_event);
using clGetPlatformIDs_func = cl_int (*)(cl_uint, cl_platform_id *, cl_uint *);
+ using clGetKernelWorkGroupInfo_func = cl_int (*)(cl_kernel, cl_device_id, cl_kernel_work_group_info, size_t, void *, size_t *);
clBuildProgram_func clBuildProgram = nullptr;
clEnqueueNDRangeKernel_func clEnqueueNDRangeKernel = nullptr;
@@ -115,6 +116,7 @@ public:
clGetDeviceIDs_func clGetDeviceIDs = nullptr;
clRetainEvent_func clRetainEvent = nullptr;
clGetPlatformIDs_func clGetPlatformIDs = nullptr;
+ clGetKernelWorkGroupInfo_func clGetKernelWorkGroupInfo = nullptr;
private:
std::pair<bool, bool> _loaded{ false, false };
diff --git a/src/core/CL/CLKernelLibrary.cpp b/src/core/CL/CLKernelLibrary.cpp
index e165cf3350..6e5e802538 100644
--- a/src/core/CL/CLKernelLibrary.cpp
+++ b/src/core/CL/CLKernelLibrary.cpp
@@ -569,7 +569,7 @@ const std::map<std::string, std::string> CLKernelLibrary::_program_source_map =
};
CLKernelLibrary::CLKernelLibrary()
- : _context(), _device(), _kernel_path("."), _programs_map(), _built_programs_map(), _max_workgroup_size(0)
+ : _context(), _device(), _kernel_path("."), _programs_map(), _built_programs_map()
{
}
@@ -709,19 +709,18 @@ std::string CLKernelLibrary::get_program_source(const std::string &program_name)
return program_source_it->second;
}
-size_t CLKernelLibrary::max_local_workgroup_size()
+size_t CLKernelLibrary::max_local_workgroup_size(const cl::Kernel &kernel) const
{
- if(_max_workgroup_size == 0)
- {
- size_t err = clGetDeviceInfo(_device.get(), CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(size_t), &_max_workgroup_size, nullptr);
- ARM_COMPUTE_ERROR_ON_MSG(err != 0, "clGetDeviceInfo failed to return valid information");
- ARM_COMPUTE_UNUSED(err);
- }
+ size_t result;
+
+ size_t err = kernel.getWorkGroupInfo(_device, CL_KERNEL_WORK_GROUP_SIZE, &result);
+ ARM_COMPUTE_ERROR_ON_MSG(err != 0, "clGetKernelWorkGroupInfo failed to return the maximum workgroup size for the kernel");
+ ARM_COMPUTE_UNUSED(err);
- return _max_workgroup_size;
+ return result;
}
-cl::NDRange CLKernelLibrary::default_ndrange()
+cl::NDRange CLKernelLibrary::default_ndrange() const
{
- return cl::NDRange(std::min<size_t>(_max_workgroup_size, 128u), 1);
+ return cl::NDRange(128u, 1);
}
diff --git a/src/core/CL/ICLKernel.cpp b/src/core/CL/ICLKernel.cpp
index 1e04f00343..17b58b727f 100644
--- a/src/core/CL/ICLKernel.cpp
+++ b/src/core/CL/ICLKernel.cpp
@@ -52,18 +52,28 @@ void arm_compute::enqueue(cl::CommandQueue &queue, ICLKernel &kernel, const Wind
(window.y().end() - window.y().start()) / window.y().step(),
(window.z().end() - window.z().start()) / window.z().step());
+ cl::NDRange valid_lws;
+ if(lws_hint[0] * lws_hint[1] * lws_hint[2] > kernel.get_max_workgroup_size())
+ {
+ valid_lws = cl::NullRange;
+ }
+ else
+ {
+ valid_lws = lws_hint;
+ }
+
cl::NDRange lws = cl::NullRange;
- if((lws_hint[0] <= gws[0]) && (lws_hint[1] <= gws[1]) && (lws_hint[2] <= gws[2]))
+ if((valid_lws[0] <= gws[0]) && (valid_lws[1] <= gws[1]) && (valid_lws[2] <= gws[2]))
{
- lws = lws_hint;
+ lws = valid_lws;
}
queue.enqueueNDRangeKernel(kernel.kernel(), cl::NullRange, gws, lws);
}
ICLKernel::ICLKernel()
- : _kernel(nullptr), _lws_hint(CLKernelLibrary::get().default_ndrange()), _target(GPUTarget::MIDGARD), _config_id(arm_compute::default_config_id)
+ : _kernel(nullptr), _lws_hint(CLKernelLibrary::get().default_ndrange()), _target(GPUTarget::MIDGARD), _config_id(arm_compute::default_config_id), _max_workgroup_size(0)
{
}
@@ -163,3 +173,12 @@ GPUTarget ICLKernel::get_target() const
{
return _target;
}
+
+size_t ICLKernel::get_max_workgroup_size()
+{
+ if(_max_workgroup_size == 0)
+ {
+ _max_workgroup_size = CLKernelLibrary::get().max_local_workgroup_size(_kernel);
+ }
+ return _max_workgroup_size;
+}
diff --git a/src/core/CL/OpenCL.cpp b/src/core/CL/OpenCL.cpp
index cc2391977b..287c5e2f5a 100644
--- a/src/core/CL/OpenCL.cpp
+++ b/src/core/CL/OpenCL.cpp
@@ -101,6 +101,7 @@ bool CLSymbols::load(const std::string &library)
clGetDeviceIDs = reinterpret_cast<clGetDeviceIDs_func>(dlsym(handle, "clGetDeviceIDs"));
clRetainEvent = reinterpret_cast<clRetainEvent_func>(dlsym(handle, "clRetainEvent"));
clGetPlatformIDs = reinterpret_cast<clGetPlatformIDs_func>(dlsym(handle, "clGetPlatformIDs"));
+ clGetKernelWorkGroupInfo = reinterpret_cast<clGetKernelWorkGroupInfo_func>(dlsym(handle, "clGetKernelWorkGroupInfo"));
dlclose(handle);
@@ -647,3 +648,23 @@ cl_int clGetPlatformIDs(cl_uint num_entries, cl_platform_id *platforms, cl_uint
return CL_OUT_OF_RESOURCES;
}
}
+
+cl_int
+clGetKernelWorkGroupInfo(cl_kernel kernel,
+ cl_device_id device,
+ cl_kernel_work_group_info param_name,
+ size_t param_value_size,
+ void *param_value,
+ size_t *param_value_size_ret)
+{
+ arm_compute::CLSymbols::get().load_default();
+ auto func = arm_compute::CLSymbols::get().clGetKernelWorkGroupInfo;
+ if(func != nullptr)
+ {
+ return func(kernel, device, param_name, param_value_size, param_value, param_value_size_ret);
+ }
+ else
+ {
+ return CL_OUT_OF_RESOURCES;
+ }
+}