diff options
Diffstat (limited to 'src/core/CL/ICLKernel.h')
-rw-r--r-- | src/core/CL/ICLKernel.h | 178 |
1 files changed, 155 insertions, 23 deletions
diff --git a/src/core/CL/ICLKernel.h b/src/core/CL/ICLKernel.h index 6737109f34..6aebef15a5 100644 --- a/src/core/CL/ICLKernel.h +++ b/src/core/CL/ICLKernel.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2021 Arm Limited. + * Copyright (c) 2016-2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -27,21 +27,42 @@ #include "arm_compute/core/CL/CLKernelLibrary.h" #include "arm_compute/core/CL/CLTypes.h" #include "arm_compute/core/CL/OpenCL.h" +#include "arm_compute/core/experimental/Types.h" #include "arm_compute/core/GPUTarget.h" #include "arm_compute/core/IKernel.h" #include "arm_compute/core/Validate.h" -#include "arm_compute/core/experimental/Types.h" #include "arm_compute/runtime/CL/CLTuningParams.h" +#include "src/core/CL/DefaultLWSHeuristics.h" + #include <string> namespace arm_compute { +namespace +{ +bool is_same_lws(cl::NDRange lws0, cl::NDRange lws1) +{ + if (lws0.dimensions() != lws1.dimensions()) + { + return false; + } + + for (size_t i = 0; i < lws0.dimensions(); ++i) + { + if (lws0.get()[i] != lws1.get()[i]) + { + return false; + } + } + + return true; +} +} // namespace template <typename T> class ICLArray; class ICLTensor; class Window; - /** Common interface for all the OpenCL kernels */ class ICLKernel : public IKernel { @@ -50,7 +71,7 @@ private: * * @return The number of arguments enqueued per array object. */ - template <unsigned int dimension_size> + template <unsigned int dimension_size> constexpr static unsigned int num_arguments_per_array() { return num_arguments_per_tensor<dimension_size>(); @@ -59,11 +80,24 @@ private: * * @return The number of arguments enqueued per tensor object. */ - template <unsigned int dimension_size> + template <unsigned int dimension_size> constexpr static unsigned int num_arguments_per_tensor() { return 2 + 2 * dimension_size; } + + /** Get default lws for the kernel + * + * @param[in] window Execution window used by the kernel + * @param[in] use_dummy_work_items If the kernel uses dummy workloads + * + * @return cl::NDRange + */ + cl::NDRange default_lws_tune(const Window &window, bool use_dummy_work_items) + { + return get_default_lws_for_type(_type, gws_from_window(window, use_dummy_work_items)); + } + using IKernel::configure; //Prevent children from calling IKernel::configure() directly protected: /** Configure the kernel's window and local workgroup size hint. @@ -82,16 +116,32 @@ protected: * @param[in] window The maximum window which will be returned by window() * @param[in] tuning_params_hint (Optional) Tuning parameters to use. */ - void configure_internal(const Window &window, CLTuningParams tuning_params_hint = CLTuningParams(CLKernelLibrary::get().default_ndrange(), 0)) + void configure_internal(const Window &window, + CLTuningParams tuning_params_hint = CLTuningParams(CLKernelLibrary::get().default_ndrange(), + 0)) { _tuning_params_hint = tuning_params_hint; + + if (is_same_lws(_tuning_params_hint.get_lws(), CLKernelLibrary::get().default_ndrange())) + { + // Disable use_dummy_work_items at configure time. Because dummy work items only affect gws size, which + // will be recalculated with use_dummy_work_items flag at run time again anyway. + _tuning_params_hint.set_lws(default_lws_tune(window, false /* use_dummy_work_items */)); + } + IKernel::configure(window); } public: /** Constructor */ ICLKernel() - : _kernel(nullptr), _target(GPUTarget::MIDGARD), _config_id(arm_compute::default_config_id), _max_workgroup_size(0), _tuning_params_hint() + : _kernel(nullptr), + _target(GPUTarget::MIDGARD), + _config_id(arm_compute::default_config_id), + _max_workgroup_size(0), + _type(CLKernelType::UNKNOWN), + _tuning_params_hint(), + _cached_gws(cl::NullRange) { } /** Returns a reference to the OpenCL kernel of this object. @@ -102,6 +152,14 @@ public: { return _kernel; } + /** Returns the CL kernel type + * + * @return The CL kernel type + */ + CLKernelType type() const + { + return _type; + } /** Add the passed 1D array's parameters to the object's kernel's arguments starting from the index idx. * * @param[in,out] idx Index at which to start adding the array's arguments. Will be incremented by the number of kernel arguments set. @@ -111,7 +169,11 @@ public: * @param[in] window Window the kernel will be executed on. */ template <typename T> - void add_1D_array_argument(unsigned int &idx, const ICLArray<T> *array, const Strides &strides, unsigned int num_dimensions, const Window &window) + void add_1D_array_argument(unsigned int &idx, + const ICLArray<T> *array, + const Strides &strides, + unsigned int num_dimensions, + const Window &window) { add_array_argument<T, 1>(idx, array, strides, num_dimensions, window); } @@ -134,7 +196,7 @@ public: */ void add_1D_tensor_argument_if(bool cond, unsigned int &idx, const ICLTensor *tensor, const Window &window) { - if(cond) + if (cond) { add_1D_tensor_argument(idx, tensor, window); } @@ -158,7 +220,7 @@ public: */ void add_2D_tensor_argument_if(bool cond, unsigned int &idx, const ICLTensor *tensor, const Window &window) { - if(cond) + if (cond) { add_2D_tensor_argument(idx, tensor, window); } @@ -183,6 +245,51 @@ public: { add_tensor_argument<4>(idx, tensor, window); } + /** Add the passed 5D tensor's parameters to the object's kernel's arguments starting from the index idx. + * + * @param[in,out] idx Index at which to start adding the tensor's arguments. Will be incremented by the number of kernel arguments set. + * @param[in] tensor Tensor to set as an argument of the object's kernel. + * @param[in] window Window the kernel will be executed on. + */ + void add_5D_tensor_argument(unsigned int &idx, const ICLTensor *tensor, const Window &window) + { + add_tensor_argument<5>(idx, tensor, window); + } + + /** Add the passed NHW 3D tensor's parameters to the object's kernel's arguments by passing strides, dimensions and the offset to the first valid element in bytes. + * + * @param[in,out] idx Index at which to start adding the tensor's arguments. Will be incremented by the number of kernel arguments set. + * @param[in] tensor Tensor to set as an argument of the object's kernel. + */ + void add_3d_tensor_nhw_argument(unsigned int &idx, const ICLTensor *tensor); + + /** Returns the number of arguments enqueued per NHW 3D Tensor object. + * + * @return The number of arguments enqueued per NHW 3D Tensor object. + */ + constexpr static unsigned int num_arguments_per_3d_tensor_nhw() + { + constexpr unsigned int no_args_per_3d_tensor_nhw = 7u; + return no_args_per_3d_tensor_nhw; + } + + /** Add the passed NHWC 4D tensor's parameters to the object's kernel's arguments by passing strides, dimensions and the offset to the first valid element in bytes. + * + * @param[in,out] idx Index at which to start adding the tensor's arguments. Will be incremented by the number of kernel arguments set. + * @param[in] tensor Tensor to set as an argument of the object's kernel. + */ + void add_4d_tensor_nhwc_argument(unsigned int &idx, const ICLTensor *tensor); + + /** Returns the number of arguments enqueued per NHWC 4D Tensor object. + * + * @return The number of arguments enqueued per NHWC 4D Tensor object. + */ + constexpr static unsigned int num_arguments_per_4d_tensor_nhwc() + { + constexpr unsigned int no_args_per_4d_tensor_nhwc = 9u; + return no_args_per_4d_tensor_nhwc; + } + /** Returns the number of arguments enqueued per 1D array object. * * @return The number of arguments enqueues per 1D array object. @@ -345,11 +452,24 @@ public: size_t get_max_workgroup_size(); /** Get the global work size given an execution window * - * @param[in] window Execution window + * @param[in] window Execution window + * @param[in] use_dummy_work_items If the kernel uses dummy work items * * @return Global work size of the given execution window */ - static cl::NDRange gws_from_window(const Window &window); + static cl::NDRange gws_from_window(const Window &window, bool use_dummy_work_items); + + /** Get the cached gws used to enqueue this kernel + * + * @return Latest global work size of the kernel + */ + cl::NDRange get_cached_gws() const; + + /** Cache the latest gws used to enqueue this kernel + * + * @param[in] gws Latest global work size of the kernel + */ + void cache_gws(const cl::NDRange &gws); private: /** Add the passed array's parameters to the object's kernel's arguments starting from the index idx. @@ -361,7 +481,11 @@ private: * @param[in] window Window the kernel will be executed on. */ template <typename T, unsigned int dimension_size> - void add_array_argument(unsigned int &idx, const ICLArray<T> *array, const Strides &strides, unsigned int num_dimensions, const Window &window); + void add_array_argument(unsigned int &idx, + const ICLArray<T> *array, + const Strides &strides, + unsigned int num_dimensions, + const Window &window); /** Add the passed tensor's parameters to the object's kernel's arguments starting from the index idx. * * @param[in,out] idx Index at which to start adding the tensor's arguments. Will be incremented by the number of kernel arguments set. @@ -372,12 +496,14 @@ private: void add_tensor_argument(unsigned int &idx, const ICLTensor *tensor, const Window &window); protected: - cl::Kernel _kernel; /**< OpenCL kernel to run */ - GPUTarget _target; /**< The targeted GPU */ - std::string _config_id; /**< Configuration ID */ - size_t _max_workgroup_size; /**< The maximum workgroup size for this kernel */ + cl::Kernel _kernel; /**< OpenCL kernel to run */ + GPUTarget _target; /**< The targeted GPU */ + std::string _config_id; /**< Configuration ID */ + size_t _max_workgroup_size; /**< The maximum workgroup size for this kernel */ + CLKernelType _type; /**< The CL kernel type */ private: CLTuningParams _tuning_params_hint; /**< Tuning parameters hint for the OpenCL kernel */ + cl::NDRange _cached_gws; /**< Latest GWS used to enqueue this kernel */ }; /** Add the kernel to the command queue with the given window. @@ -395,7 +521,11 @@ private: * * @note If any dimension of the lws is greater than the global workgroup size then no lws will be passed. */ -void enqueue(cl::CommandQueue &queue, ICLKernel &kernel, const Window &window, const cl::NDRange &lws_hint = CLKernelLibrary::get().default_ndrange(), bool use_dummy_work_items = false); +void enqueue(cl::CommandQueue &queue, + ICLKernel &kernel, + const Window &window, + const cl::NDRange &lws_hint = CLKernelLibrary::get().default_ndrange(), + bool use_dummy_work_items = false); /** Add the passed array's parameters to the object's kernel's arguments starting from the index idx. * @@ -406,14 +536,15 @@ void enqueue(cl::CommandQueue &queue, ICLKernel &kernel, const Window &window, c * @param[in] window Window the kernel will be executed on. */ template <typename T, unsigned int dimension_size> -void ICLKernel::add_array_argument(unsigned &idx, const ICLArray<T> *array, const Strides &strides, unsigned int num_dimensions, const Window &window) +void ICLKernel::add_array_argument( + unsigned &idx, const ICLArray<T> *array, const Strides &strides, unsigned int num_dimensions, const Window &window) { ARM_COMPUTE_ERROR_ON(array == nullptr); // Calculate offset to the start of the window unsigned int offset_first_element = 0; - for(unsigned int n = 0; n < num_dimensions; ++n) + for (unsigned int n = 0; n < num_dimensions; ++n) { offset_first_element += window[n].start() * strides[n]; } @@ -421,7 +552,7 @@ void ICLKernel::add_array_argument(unsigned &idx, const ICLArray<T> *array, cons unsigned int idx_start = idx; _kernel.setArg(idx++, array->cl_buffer()); - for(unsigned int dimension = 0; dimension < dimension_size; dimension++) + for (unsigned int dimension = 0; dimension < dimension_size; dimension++) { _kernel.setArg<cl_uint>(idx++, strides[dimension]); _kernel.setArg<cl_uint>(idx++, strides[dimension] * window[dimension].step()); @@ -430,8 +561,9 @@ void ICLKernel::add_array_argument(unsigned &idx, const ICLArray<T> *array, cons _kernel.setArg<cl_uint>(idx++, offset_first_element); ARM_COMPUTE_ERROR_ON_MSG_VAR(idx_start + num_arguments_per_array<dimension_size>() != idx, - "add_%dD_array_argument() is supposed to add exactly %d arguments to the kernel", dimension_size, num_arguments_per_array<dimension_size>()); + "add_%dD_array_argument() is supposed to add exactly %d arguments to the kernel", + dimension_size, num_arguments_per_array<dimension_size>()); ARM_COMPUTE_UNUSED(idx_start); } -} +} // namespace arm_compute #endif /*ARM_COMPUTE_ICLKERNEL_H */ |