From 0021d750d66d199c411df00cdd8308c325f1fef3 Mon Sep 17 00:00:00 2001
From: Diego Lopez Recas <Diego.LopezRecas@arm.com>
Date: Mon, 18 Dec 2017 14:42:56 +0000
Subject: IVGCVSW-863 Broadcast support in CL/NEON Arithmetic Add

Also, added instrumentation to support generic tensor broadcasting for
NEON and CL backends.

Change-Id: I1bc5747a286e1a4b464c209067581e103d473b9a
Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/114201
Reviewed-by: Anthony Barbier <anthony.barbier@arm.com>
Tested-by: Jenkins <bsgcomp@arm.com>
---
 arm_compute/core/CL/ICLKernel.h | 124 +++++++++++++++++++++++++---------------
 1 file changed, 79 insertions(+), 45 deletions(-)

(limited to 'arm_compute/core/CL/ICLKernel.h')
diff --git a/arm_compute/core/CL/ICLKernel.h b/arm_compute/core/CL/ICLKernel.h
index a1bc3eb8d2..e660ae55a0 100644
--- a/arm_compute/core/CL/ICLKernel.h
+++ b/arm_compute/core/CL/ICLKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -41,14 +41,40 @@ class Window;
 /** Common interface for all the OpenCL kernels */
 class ICLKernel : public IKernel
 {
+private:
+    /** Returns the number of arguments enqueued per array object.
+     *
+     * @return The number of arguments enqueued per array object.
+     */
+    template <unsigned int        dimension_size>
+    constexpr static unsigned int num_arguments_per_array()
+    {
+        return num_arguments_per_tensor<dimension_size>();
+    }
+    /** Returns the number of arguments enqueued per tensor object.
+     *
+     * @return The number of arguments enqueued per tensor object.
+     */
+    template <unsigned int        dimension_size>
+    constexpr static unsigned int num_arguments_per_tensor()
+    {
+        return 2 + 2 * dimension_size;
+    }
+
 public:
     /** Constructor */
-    ICLKernel();
+    ICLKernel()
+        : _kernel(nullptr), _lws_hint(CLKernelLibrary::get().default_ndrange()), _target(GPUTarget::MIDGARD), _config_id(arm_compute::default_config_id), _max_workgroup_size(0)
+    {
+    }
     /** Returns a reference to the OpenCL kernel of this object.
      *
      * @return A reference to the OpenCL kernel of this object.
      */
-    cl::Kernel &kernel();
+    cl::Kernel &kernel()
+    {
+        return _kernel;
+    }
     /** Add the passed 1D array's parameters to the object's kernel's arguments starting from the index idx.
      *
      * @param[in,out] idx            Index at which to start adding the array's arguments. Will be incremented by the number of kernel arguments set.
@@ -58,60 +84,90 @@ public:
      * @param[in]     window         Window the kernel will be executed on.
      */
     template <typename T>
-    void add_1D_array_argument(unsigned int &idx, const ICLArray<T> *array, const Strides &strides, unsigned int num_dimensions, const Window &window);
+    void add_1D_array_argument(unsigned int &idx, const ICLArray<T> *array, const Strides &strides, unsigned int num_dimensions, const Window &window)
+    {
+        add_array_argument<T, 1>(idx, array, strides, num_dimensions, window);
+    }
     /** Add the passed 1D tensor's parameters to the object's kernel's arguments starting from the index idx.
      *
      * @param[in,out] idx    Index at which to start adding the tensor's arguments. Will be incremented by the number of kernel arguments set.
      * @param[in]     tensor Tensor to set as an argument of the object's kernel.
      * @param[in]     window Window the kernel will be executed on.
      */
-    void add_1D_tensor_argument(unsigned int &idx, const ICLTensor *tensor, const Window &window);
+    void add_1D_tensor_argument(unsigned int &idx, const ICLTensor *tensor, const Window &window)
+    {
+        add_tensor_argument<1>(idx, tensor, window);
+    }
     /** Add the passed 2D tensor's parameters to the object's kernel's arguments starting from the index idx.
      *
      * @param[in,out] idx    Index at which to start adding the tensor's arguments. Will be incremented by the number of kernel arguments set.
      * @param[in]     tensor Tensor to set as an argument of the object's kernel.
      * @param[in]     window Window the kernel will be executed on.
      */
-    void add_2D_tensor_argument(unsigned int &idx, const ICLTensor *tensor, const Window &window);
+    void add_2D_tensor_argument(unsigned int &idx, const ICLTensor *tensor, const Window &window)
+    {
+        add_tensor_argument<2>(idx, tensor, window);
+    }
     /** Add the passed 3D tensor's parameters to the object's kernel's arguments starting from the index idx.
      *
      * @param[in,out] idx    Index at which to start adding the tensor's arguments. Will be incremented by the number of kernel arguments set.
      * @param[in]     tensor Tensor to set as an argument of the object's kernel.
      * @param[in]     window Window the kernel will be executed on.
      */
-    void add_3D_tensor_argument(unsigned int &idx, const ICLTensor *tensor, const Window &window);
+    void add_3D_tensor_argument(unsigned int &idx, const ICLTensor *tensor, const Window &window)
+    {
+        add_tensor_argument<3>(idx, tensor, window);
+    }
     /** Add the passed 4D tensor's parameters to the object's kernel's arguments starting from the index idx.
      *
      * @param[in,out] idx    Index at which to start adding the tensor's arguments. Will be incremented by the number of kernel arguments set.
      * @param[in]     tensor Tensor to set as an argument of the object's kernel.
      * @param[in]     window Window the kernel will be executed on.
      */
-    void add_4D_tensor_argument(unsigned int &idx, const ICLTensor *tensor, const Window &window);
+    void add_4D_tensor_argument(unsigned int &idx, const ICLTensor *tensor, const Window &window)
+    {
+        add_tensor_argument<4>(idx, tensor, window);
+    }
     /** Returns the number of arguments enqueued per 1D array object.
      *
      * @return The number of arguments enqueues per 1D array object.
      */
-    unsigned int num_arguments_per_1D_array() const;
+    constexpr static unsigned int num_arguments_per_1D_array()
+    {
+        return num_arguments_per_array<1>();
+    }
     /** Returns the number of arguments enqueued per 1D tensor object.
      *
      * @return The number of arguments enqueues per 1D tensor object.
      */
-    unsigned int num_arguments_per_1D_tensor() const;
+    constexpr static unsigned int num_arguments_per_1D_tensor()
+    {
+        return num_arguments_per_tensor<1>();
+    }
     /** Returns the number of arguments enqueued per 2D tensor object.
      *
      * @return The number of arguments enqueues per 2D tensor object.
      */
-    unsigned int num_arguments_per_2D_tensor() const;
+    constexpr static unsigned int num_arguments_per_2D_tensor()
+    {
+        return num_arguments_per_tensor<2>();
+    }
     /** Returns the number of arguments enqueued per 3D tensor object.
      *
      * @return The number of arguments enqueues per 3D tensor object.
      */
-    unsigned int num_arguments_per_3D_tensor() const;
+    constexpr static unsigned int num_arguments_per_3D_tensor()
+    {
+        return num_arguments_per_tensor<3>();
+    }
     /** Returns the number of arguments enqueued per 4D tensor object.
      *
      * @return The number of arguments enqueues per 4D tensor object.
      */
-    unsigned int num_arguments_per_4D_tensor() const;
+    constexpr static unsigned int num_arguments_per_4D_tensor()
+    {
+        return num_arguments_per_tensor<4>();
+    }
     /** Enqueue the OpenCL kernel to process the given window  on the passed OpenCL command queue.
      *
      * @note The queue is *not* flushed by this method, and therefore the kernel will not have been executed by the time this method returns.
@@ -161,7 +217,10 @@ public:
      *
      * @param[in] target The targeted GPU architecture
      */
-    void set_target(GPUTarget target);
+    void set_target(GPUTarget target)
+    {
+        _target = target;
+    }
 
     /** Set the targeted GPU architecture according to the CL device
      *
@@ -173,7 +232,10 @@ public:
      *
      * @return The targeted GPU architecture.
      */
-    GPUTarget get_target() const;
+    GPUTarget get_target() const
+    {
+        return _target;
+    }
 
     /** Get the maximum workgroup size for the device the CLKernelLibrary uses.
      *
@@ -207,18 +269,6 @@ private:
      */
     template <unsigned int dimension_size>
     void add_tensor_argument(unsigned int &idx, const ICLTensor *tensor, const Window &window);
-    /** Returns the number of arguments enqueued per array object.
-     *
-     * @return The number of arguments enqueued per array object.
-     */
-    template <unsigned int dimension_size>
-    unsigned int           num_arguments_per_array() const;
-    /** Returns the number of arguments enqueued per tensor object.
-     *
-     * @return The number of arguments enqueued per tensor object.
-     */
-    template <unsigned int dimension_size>
-    unsigned int           num_arguments_per_tensor() const;
 
 protected:
     cl::Kernel  _kernel;             /**< OpenCL kernel to run */
@@ -246,6 +296,8 @@ void enqueue(cl::CommandQueue &queue, ICLKernel &kernel, const Window &window, c
 template <typename T, unsigned int dimension_size>
 void ICLKernel::add_array_argument(unsigned &idx, const ICLArray<T> *array, const Strides &strides, unsigned int num_dimensions, const Window &window)
 {
+    ARM_COMPUTE_ERROR_ON(array == nullptr);
+
     // Calculate offset to the start of the window
     unsigned int offset_first_element = 0;
 
@@ -269,23 +321,5 @@ void ICLKernel::add_array_argument(unsigned &idx, const ICLArray<T> *array, cons
                              "add_%dD_array_argument() is supposed to add exactly %d arguments to the kernel", dimension_size, num_arguments_per_array<dimension_size>());
     ARM_COMPUTE_UNUSED(idx_start);
 }
-
-template <typename T>
-void ICLKernel::add_1D_array_argument(unsigned int &idx, const ICLArray<T> *array, const Strides &strides, unsigned int num_dimensions, const Window &window)
-{
-    add_array_argument<T, 1>(idx, array, strides, num_dimensions, window);
-}
-
-template <unsigned int dimension_size>
-unsigned int           ICLKernel::num_arguments_per_array() const
-{
-    return num_arguments_per_tensor<dimension_size>();
-}
-
-template <unsigned int dimension_size>
-unsigned int           ICLKernel::num_arguments_per_tensor() const
-{
-    return 2 + 2 * dimension_size;
-}
 }
 #endif /*__ARM_COMPUTE_ICLKERNEL_H__ */
-- 
cgit v1.2.1