1 files changed, 155 insertions, 23 deletions
diff --git a/src/core/CL/ICLKernel.h b/src/core/CL/ICLKernel.h
index 6737109f34..6aebef15a5 100644
--- a/src/core/CL/ICLKernel.h
+++ b/src/core/CL/ICLKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2021 Arm Limited.
+ * Copyright (c) 2016-2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -27,21 +27,42 @@
 #include "arm_compute/core/CL/CLKernelLibrary.h"
 #include "arm_compute/core/CL/CLTypes.h"
 #include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/experimental/Types.h"
 #include "arm_compute/core/GPUTarget.h"
 #include "arm_compute/core/IKernel.h"
 #include "arm_compute/core/Validate.h"
-#include "arm_compute/core/experimental/Types.h"
 #include "arm_compute/runtime/CL/CLTuningParams.h"
 
+#include "src/core/CL/DefaultLWSHeuristics.h"
+
 #include <string>
 
 namespace arm_compute
 {
+namespace
+{
+bool is_same_lws(cl::NDRange lws0, cl::NDRange lws1)
+{
+    if (lws0.dimensions() != lws1.dimensions())
+    {
+        return false;
+    }
+
+    for (size_t i = 0; i < lws0.dimensions(); ++i)
+    {
+        if (lws0.get()[i] != lws1.get()[i])
+        {
+            return false;
+        }
+    }
+
+    return true;
+}
+} // namespace
 template <typename T>
 class ICLArray;
 class ICLTensor;
 class Window;
-
 /** Common interface for all the OpenCL kernels */
 class ICLKernel : public IKernel
 {
@@ -50,7 +71,7 @@ private:
      *
      * @return The number of arguments enqueued per array object.
      */
-    template <unsigned int        dimension_size>
+    template <unsigned int dimension_size>
     constexpr static unsigned int num_arguments_per_array()
     {
         return num_arguments_per_tensor<dimension_size>();
@@ -59,11 +80,24 @@ private:
      *
      * @return The number of arguments enqueued per tensor object.
      */
-    template <unsigned int        dimension_size>
+    template <unsigned int dimension_size>
     constexpr static unsigned int num_arguments_per_tensor()
     {
         return 2 + 2 * dimension_size;
     }
+
+    /** Get default lws for the kernel
+     *
+     * @param[in] window               Execution window used by the kernel
+     * @param[in] use_dummy_work_items If the kernel uses dummy workloads
+     *
+     * @return    cl::NDRange
+     */
+    cl::NDRange default_lws_tune(const Window &window, bool use_dummy_work_items)
+    {
+        return get_default_lws_for_type(_type, gws_from_window(window, use_dummy_work_items));
+    }
+
     using IKernel::configure; //Prevent children from calling IKernel::configure() directly
 protected:
     /** Configure the kernel's window and local workgroup size hint.
@@ -82,16 +116,32 @@ protected:
      * @param[in] window             The maximum window which will be returned by window()
      * @param[in] tuning_params_hint (Optional) Tuning parameters to use.
      */
-    void configure_internal(const Window &window, CLTuningParams tuning_params_hint = CLTuningParams(CLKernelLibrary::get().default_ndrange(), 0))
+    void configure_internal(const Window  &window,
+                            CLTuningParams tuning_params_hint = CLTuningParams(CLKernelLibrary::get().default_ndrange(),
+                                                                               0))
     {
         _tuning_params_hint = tuning_params_hint;
+
+        if (is_same_lws(_tuning_params_hint.get_lws(), CLKernelLibrary::get().default_ndrange()))
+        {
+            // Disable use_dummy_work_items at configure time. Because dummy work items only affect gws size, which
+            // will be recalculated with use_dummy_work_items flag at run time again anyway.
+            _tuning_params_hint.set_lws(default_lws_tune(window, false /* use_dummy_work_items */));
+        }
+
         IKernel::configure(window);
     }
 
 public:
     /** Constructor */
     ICLKernel()
-        : _kernel(nullptr), _target(GPUTarget::MIDGARD), _config_id(arm_compute::default_config_id), _max_workgroup_size(0), _tuning_params_hint()
+        : _kernel(nullptr),
+          _target(GPUTarget::MIDGARD),
+          _config_id(arm_compute::default_config_id),
+          _max_workgroup_size(0),
+          _type(CLKernelType::UNKNOWN),
+          _tuning_params_hint(),
+          _cached_gws(cl::NullRange)
     {
     }
     /** Returns a reference to the OpenCL kernel of this object.
@@ -102,6 +152,14 @@ public:
     {
         return _kernel;
     }
+    /** Returns the CL kernel type
+     *
+     * @return The CL kernel type
+     */
+    CLKernelType type() const
+    {
+        return _type;
+    }
     /** Add the passed 1D array's parameters to the object's kernel's arguments starting from the index idx.
      *
      * @param[in,out] idx            Index at which to start adding the array's arguments. Will be incremented by the number of kernel arguments set.
@@ -111,7 +169,11 @@ public:
      * @param[in]     window         Window the kernel will be executed on.
      */
     template <typename T>
-    void add_1D_array_argument(unsigned int &idx, const ICLArray<T> *array, const Strides &strides, unsigned int num_dimensions, const Window &window)
+    void add_1D_array_argument(unsigned int      &idx,
+                               const ICLArray<T> *array,
+                               const Strides     &strides,
+                               unsigned int       num_dimensions,
+                               const Window      &window)
     {
         add_array_argument<T, 1>(idx, array, strides, num_dimensions, window);
     }
@@ -134,7 +196,7 @@ public:
      */
     void add_1D_tensor_argument_if(bool cond, unsigned int &idx, const ICLTensor *tensor, const Window &window)
     {
-        if(cond)
+        if (cond)
         {
             add_1D_tensor_argument(idx, tensor, window);
         }
@@ -158,7 +220,7 @@ public:
      */
     void add_2D_tensor_argument_if(bool cond, unsigned int &idx, const ICLTensor *tensor, const Window &window)
     {
-        if(cond)
+        if (cond)
         {
             add_2D_tensor_argument(idx, tensor, window);
         }
@@ -183,6 +245,51 @@ public:
     {
         add_tensor_argument<4>(idx, tensor, window);
     }
+    /** Add the passed 5D tensor's parameters to the object's kernel's arguments starting from the index idx.
+     *
+     * @param[in,out] idx    Index at which to start adding the tensor's arguments. Will be incremented by the number of kernel arguments set.
+     * @param[in]     tensor Tensor to set as an argument of the object's kernel.
+     * @param[in]     window Window the kernel will be executed on.
+     */
+    void add_5D_tensor_argument(unsigned int &idx, const ICLTensor *tensor, const Window &window)
+    {
+        add_tensor_argument<5>(idx, tensor, window);
+    }
+
+    /** Add the passed NHW 3D tensor's parameters to the object's kernel's arguments by passing strides, dimensions and the offset to the first valid element in bytes.
+     *
+     * @param[in,out] idx    Index at which to start adding the tensor's arguments. Will be incremented by the number of kernel arguments set.
+     * @param[in]     tensor Tensor to set as an argument of the object's kernel.
+     */
+    void add_3d_tensor_nhw_argument(unsigned int &idx, const ICLTensor *tensor);
+
+    /** Returns the number of arguments enqueued per NHW 3D Tensor object.
+     *
+     * @return The number of arguments enqueued per NHW 3D Tensor object.
+     */
+    constexpr static unsigned int num_arguments_per_3d_tensor_nhw()
+    {
+        constexpr unsigned int no_args_per_3d_tensor_nhw = 7u;
+        return no_args_per_3d_tensor_nhw;
+    }
+
+    /** Add the passed NHWC 4D tensor's parameters to the object's kernel's arguments by passing strides, dimensions and the offset to the first valid element in bytes.
+     *
+     * @param[in,out] idx    Index at which to start adding the tensor's arguments. Will be incremented by the number of kernel arguments set.
+     * @param[in]     tensor Tensor to set as an argument of the object's kernel.
+     */
+    void add_4d_tensor_nhwc_argument(unsigned int &idx, const ICLTensor *tensor);
+
+    /** Returns the number of arguments enqueued per NHWC 4D Tensor object.
+     *
+     * @return The number of arguments enqueued per NHWC 4D Tensor object.
+     */
+    constexpr static unsigned int num_arguments_per_4d_tensor_nhwc()
+    {
+        constexpr unsigned int no_args_per_4d_tensor_nhwc = 9u;
+        return no_args_per_4d_tensor_nhwc;
+    }
+
     /** Returns the number of arguments enqueued per 1D array object.
      *
      * @return The number of arguments enqueues per 1D array object.
@@ -345,11 +452,24 @@ public:
     size_t get_max_workgroup_size();
     /** Get the global work size given an execution window
      *
-     * @param[in] window Execution window
+     * @param[in] window               Execution window
+     * @param[in] use_dummy_work_items If the kernel uses dummy work items
      *
      * @return Global work size of the given execution window
      */
-    static cl::NDRange gws_from_window(const Window &window);
+    static cl::NDRange gws_from_window(const Window &window, bool use_dummy_work_items);
+
+    /** Get the cached gws used to enqueue this kernel
+     *
+     * @return Latest global work size of the kernel
+     */
+    cl::NDRange get_cached_gws() const;
+
+    /** Cache the latest gws used to enqueue this kernel
+     *
+     * @param[in] gws Latest global work size of the kernel
+     */
+    void cache_gws(const cl::NDRange &gws);
 
 private:
     /** Add the passed array's parameters to the object's kernel's arguments starting from the index idx.
@@ -361,7 +481,11 @@ private:
      * @param[in]     window         Window the kernel will be executed on.
      */
     template <typename T, unsigned int dimension_size>
-    void add_array_argument(unsigned int &idx, const ICLArray<T> *array, const Strides &strides, unsigned int num_dimensions, const Window &window);
+    void add_array_argument(unsigned int      &idx,
+                            const ICLArray<T> *array,
+                            const Strides     &strides,
+                            unsigned int       num_dimensions,
+                            const Window      &window);
     /** Add the passed tensor's parameters to the object's kernel's arguments starting from the index idx.
      *
      * @param[in,out] idx    Index at which to start adding the tensor's arguments. Will be incremented by the number of kernel arguments set.
@@ -372,12 +496,14 @@ private:
     void add_tensor_argument(unsigned int &idx, const ICLTensor *tensor, const Window &window);
 
 protected:
-    cl::Kernel  _kernel;             /**< OpenCL kernel to run */
-    GPUTarget   _target;             /**< The targeted GPU */
-    std::string _config_id;          /**< Configuration ID */
-    size_t      _max_workgroup_size; /**< The maximum workgroup size for this kernel */
+    cl::Kernel   _kernel;             /**< OpenCL kernel to run */
+    GPUTarget    _target;             /**< The targeted GPU */
+    std::string  _config_id;          /**< Configuration ID */
+    size_t       _max_workgroup_size; /**< The maximum workgroup size for this kernel */
+    CLKernelType _type;               /**< The CL kernel type */
 private:
     CLTuningParams _tuning_params_hint; /**< Tuning parameters hint for the OpenCL kernel */
+    cl::NDRange    _cached_gws;         /**< Latest GWS used to enqueue this kernel */
 };
 
 /** Add the kernel to the command queue with the given window.
@@ -395,7 +521,11 @@ private:
  *
  * @note If any dimension of the lws is greater than the global workgroup size then no lws will be passed.
  */
-void enqueue(cl::CommandQueue &queue, ICLKernel &kernel, const Window &window, const cl::NDRange &lws_hint = CLKernelLibrary::get().default_ndrange(), bool use_dummy_work_items = false);
+void enqueue(cl::CommandQueue  &queue,
+             ICLKernel         &kernel,
+             const Window      &window,
+             const cl::NDRange &lws_hint             = CLKernelLibrary::get().default_ndrange(),
+             bool               use_dummy_work_items = false);
 
 /** Add the passed array's parameters to the object's kernel's arguments starting from the index idx.
  *
@@ -406,14 +536,15 @@ void enqueue(cl::CommandQueue &queue, ICLKernel &kernel, const Window &window, c
  * @param[in]     window         Window the kernel will be executed on.
  */
 template <typename T, unsigned int dimension_size>
-void ICLKernel::add_array_argument(unsigned &idx, const ICLArray<T> *array, const Strides &strides, unsigned int num_dimensions, const Window &window)
+void ICLKernel::add_array_argument(
+    unsigned &idx, const ICLArray<T> *array, const Strides &strides, unsigned int num_dimensions, const Window &window)
 {
     ARM_COMPUTE_ERROR_ON(array == nullptr);
 
     // Calculate offset to the start of the window
     unsigned int offset_first_element = 0;
 
-    for(unsigned int n = 0; n < num_dimensions; ++n)
+    for (unsigned int n = 0; n < num_dimensions; ++n)
     {
         offset_first_element += window[n].start() * strides[n];
     }
@@ -421,7 +552,7 @@ void ICLKernel::add_array_argument(unsigned &idx, const ICLArray<T> *array, cons
     unsigned int idx_start = idx;
     _kernel.setArg(idx++, array->cl_buffer());
 
-    for(unsigned int dimension = 0; dimension < dimension_size; dimension++)
+    for (unsigned int dimension = 0; dimension < dimension_size; dimension++)
     {
         _kernel.setArg<cl_uint>(idx++, strides[dimension]);
         _kernel.setArg<cl_uint>(idx++, strides[dimension] * window[dimension].step());
@@ -430,8 +561,9 @@ void ICLKernel::add_array_argument(unsigned &idx, const ICLArray<T> *array, cons
     _kernel.setArg<cl_uint>(idx++, offset_first_element);
 
     ARM_COMPUTE_ERROR_ON_MSG_VAR(idx_start + num_arguments_per_array<dimension_size>() != idx,
-                                 "add_%dD_array_argument() is supposed to add exactly %d arguments to the kernel", dimension_size, num_arguments_per_array<dimension_size>());
+                                 "add_%dD_array_argument() is supposed to add exactly %d arguments to the kernel",
+                                 dimension_size, num_arguments_per_array<dimension_size>());
     ARM_COMPUTE_UNUSED(idx_start);
 }
-}
+} // namespace arm_compute
 #endif /*ARM_COMPUTE_ICLKERNEL_H */