aboutsummaryrefslogtreecommitdiff
path: root/arm_compute/core/CL/ICLKernel.h
diff options
context:
space:
mode:
author Abel Bernabeu <abel.bernabeu@arm.com> 2017-09-28 09:53:45 +0100
committer Anthony Barbier <anthony.barbier@arm.com> 2018-11-02 16:35:24 +0000
commit 5a6e0532b39d674f8773014a0a553d9bc70a9baa (patch)
tree 5cfb228a11ed903f9e2872dc86d5cd1fdf1edc08 /arm_compute/core/CL/ICLKernel.h
parent 53b405f1e08ad41cb9a527abfe0308ec1edf18ff (diff)
download ComputeLibrary-5a6e0532b39d674f8773014a0a553d9bc70a9baa.tar.gz
COMPUTE-8024 Fixed the maximum OpenCL workgroup size
The maximum workgroup size depends on the kernel and the device, rather than being a property of the device. The present patch fixes the case when a kernel is queued with no workgroup size and the default workgroup size is used instead. A previous patch introduced a maximum workgroup size that depended on the device but ignored the kernel. In OpenCL the maximum workgroup size we query from the device is an upper bound on the actual maximum that we can query for a given kernel running on the same device. For some kernels the values will match, but for others we will get a lower value when querying for a specific kernel (e.g. if the kernel uses a high number of registers). Change-Id: I3bed6bde80ddc4f0ddb8f82c80903774aa1999b6 Reviewed-on: http://mpd-gerrit.cambridge.arm.com/89471 Tested-by: Kaizen <jeremy.johnson+kaizengerrit@arm.com> Reviewed-by: Anthony Barbier <anthony.barbier@arm.com>
Diffstat (limited to 'arm_compute/core/CL/ICLKernel.h')
-rw-r--r-- arm_compute/core/CL/ICLKernel.h 17
1 files changed, 12 insertions, 5 deletions
diff --git a/arm_compute/core/CL/ICLKernel.h b/arm_compute/core/CL/ICLKernel.h
index d118d13f3f..9119940bc5 100644
--- a/arm_compute/core/CL/ICLKernel.h
+++ b/arm_compute/core/CL/ICLKernel.h
@@ -175,6 +175,12 @@ public:
*/
GPUTarget get_target() const;
+ /** Get the maximum workgroup size for this kernel on the device the CLKernelLibrary uses.
+ *
+ * @return The maximum workgroup size value.
+ */
+ size_t get_max_workgroup_size();
+
private:
/** Add the passed array's parameters to the object's kernel's arguments starting from the index idx.
*
@@ -208,10 +214,11 @@ private:
unsigned int num_arguments_per_tensor() const;
protected:
- cl::Kernel _kernel; /**< OpenCL kernel to run */
- cl::NDRange _lws_hint; /**< Local workgroup size hint for the OpenCL kernel */
- GPUTarget _target; /**< The targeted GPU */
- std::string _config_id; /**< Configuration ID */
+ cl::Kernel _kernel; /**< OpenCL kernel to run */
+ cl::NDRange _lws_hint; /**< Local workgroup size hint for the OpenCL kernel */
+ GPUTarget _target; /**< The targeted GPU */
+ std::string _config_id; /**< Configuration ID */
+ size_t _max_workgroup_size; /**< The maximum workgroup size for this kernel */
};
/** Add the kernel to the command queue with the given window.
@@ -223,7 +230,7 @@ protected:
* @param[in,out] queue OpenCL command queue.
* @param[in] kernel Kernel to enqueue
* @param[in] window Window the kernel has to process.
- * @param[in] lws_hint Local workgroup size requested, by default (128,1)
+ * @param[in] lws_hint Local workgroup size requested, by default (128,1).
*
* @note If any dimension of the lws is greater than the global workgroup size then no lws will be passed.
*/