From 0021d750d66d199c411df00cdd8308c325f1fef3 Mon Sep 17 00:00:00 2001 From: Diego Lopez Recas Date: Mon, 18 Dec 2017 14:42:56 +0000 Subject: IVGCVSW-863 Broadcast support in CL/NEON Arithmetic Add Also, added instrumentation to support generic tensor broadcasting for NEON and CL backends. Change-Id: I1bc5747a286e1a4b464c209067581e103d473b9a Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/114201 Reviewed-by: Anthony Barbier Tested-by: Jenkins --- arm_compute/core/CL/ICLKernel.h | 124 +++++++++++++++++++++++++--------------- 1 file changed, 79 insertions(+), 45 deletions(-) (limited to 'arm_compute/core/CL/ICLKernel.h') diff --git a/arm_compute/core/CL/ICLKernel.h b/arm_compute/core/CL/ICLKernel.h index a1bc3eb8d2..e660ae55a0 100644 --- a/arm_compute/core/CL/ICLKernel.h +++ b/arm_compute/core/CL/ICLKernel.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, 2017 ARM Limited. + * Copyright (c) 2016-2018 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -41,14 +41,40 @@ class Window; /** Common interface for all the OpenCL kernels */ class ICLKernel : public IKernel { +private: + /** Returns the number of arguments enqueued per array object. + * + * @return The number of arguments enqueued per array object. + */ + template + constexpr static unsigned int num_arguments_per_array() + { + return num_arguments_per_tensor(); + } + /** Returns the number of arguments enqueued per tensor object. + * + * @return The number of arguments enqueued per tensor object. + */ + template + constexpr static unsigned int num_arguments_per_tensor() + { + return 2 + 2 * dimension_size; + } + public: /** Constructor */ - ICLKernel(); + ICLKernel() + : _kernel(nullptr), _lws_hint(CLKernelLibrary::get().default_ndrange()), _target(GPUTarget::MIDGARD), _config_id(arm_compute::default_config_id), _max_workgroup_size(0) + { + } /** Returns a reference to the OpenCL kernel of this object. * * @return A reference to the OpenCL kernel of this object. */ - cl::Kernel &kernel(); + cl::Kernel &kernel() + { + return _kernel; + } /** Add the passed 1D array's parameters to the object's kernel's arguments starting from the index idx. * * @param[in,out] idx Index at which to start adding the array's arguments. Will be incremented by the number of kernel arguments set. @@ -58,60 +84,90 @@ public: * @param[in] window Window the kernel will be executed on. */ template - void add_1D_array_argument(unsigned int &idx, const ICLArray *array, const Strides &strides, unsigned int num_dimensions, const Window &window); + void add_1D_array_argument(unsigned int &idx, const ICLArray *array, const Strides &strides, unsigned int num_dimensions, const Window &window) + { + add_array_argument(idx, array, strides, num_dimensions, window); + } /** Add the passed 1D tensor's parameters to the object's kernel's arguments starting from the index idx. * * @param[in,out] idx Index at which to start adding the tensor's arguments. Will be incremented by the number of kernel arguments set. * @param[in] tensor Tensor to set as an argument of the object's kernel. * @param[in] window Window the kernel will be executed on. */ - void add_1D_tensor_argument(unsigned int &idx, const ICLTensor *tensor, const Window &window); + void add_1D_tensor_argument(unsigned int &idx, const ICLTensor *tensor, const Window &window) + { + add_tensor_argument<1>(idx, tensor, window); + } /** Add the passed 2D tensor's parameters to the object's kernel's arguments starting from the index idx. * * @param[in,out] idx Index at which to start adding the tensor's arguments. Will be incremented by the number of kernel arguments set. * @param[in] tensor Tensor to set as an argument of the object's kernel. * @param[in] window Window the kernel will be executed on. */ - void add_2D_tensor_argument(unsigned int &idx, const ICLTensor *tensor, const Window &window); + void add_2D_tensor_argument(unsigned int &idx, const ICLTensor *tensor, const Window &window) + { + add_tensor_argument<2>(idx, tensor, window); + } /** Add the passed 3D tensor's parameters to the object's kernel's arguments starting from the index idx. * * @param[in,out] idx Index at which to start adding the tensor's arguments. Will be incremented by the number of kernel arguments set. * @param[in] tensor Tensor to set as an argument of the object's kernel. * @param[in] window Window the kernel will be executed on. */ - void add_3D_tensor_argument(unsigned int &idx, const ICLTensor *tensor, const Window &window); + void add_3D_tensor_argument(unsigned int &idx, const ICLTensor *tensor, const Window &window) + { + add_tensor_argument<3>(idx, tensor, window); + } /** Add the passed 4D tensor's parameters to the object's kernel's arguments starting from the index idx. * * @param[in,out] idx Index at which to start adding the tensor's arguments. Will be incremented by the number of kernel arguments set. * @param[in] tensor Tensor to set as an argument of the object's kernel. * @param[in] window Window the kernel will be executed on. */ - void add_4D_tensor_argument(unsigned int &idx, const ICLTensor *tensor, const Window &window); + void add_4D_tensor_argument(unsigned int &idx, const ICLTensor *tensor, const Window &window) + { + add_tensor_argument<4>(idx, tensor, window); + } /** Returns the number of arguments enqueued per 1D array object. * * @return The number of arguments enqueues per 1D array object. */ - unsigned int num_arguments_per_1D_array() const; + constexpr static unsigned int num_arguments_per_1D_array() + { + return num_arguments_per_array<1>(); + } /** Returns the number of arguments enqueued per 1D tensor object. * * @return The number of arguments enqueues per 1D tensor object. */ - unsigned int num_arguments_per_1D_tensor() const; + constexpr static unsigned int num_arguments_per_1D_tensor() + { + return num_arguments_per_tensor<1>(); + } /** Returns the number of arguments enqueued per 2D tensor object. * * @return The number of arguments enqueues per 2D tensor object. */ - unsigned int num_arguments_per_2D_tensor() const; + constexpr static unsigned int num_arguments_per_2D_tensor() + { + return num_arguments_per_tensor<2>(); + } /** Returns the number of arguments enqueued per 3D tensor object. * * @return The number of arguments enqueues per 3D tensor object. */ - unsigned int num_arguments_per_3D_tensor() const; + constexpr static unsigned int num_arguments_per_3D_tensor() + { + return num_arguments_per_tensor<3>(); + } /** Returns the number of arguments enqueued per 4D tensor object. * * @return The number of arguments enqueues per 4D tensor object. */ - unsigned int num_arguments_per_4D_tensor() const; + constexpr static unsigned int num_arguments_per_4D_tensor() + { + return num_arguments_per_tensor<4>(); + } /** Enqueue the OpenCL kernel to process the given window on the passed OpenCL command queue. * * @note The queue is *not* flushed by this method, and therefore the kernel will not have been executed by the time this method returns. @@ -161,7 +217,10 @@ public: * * @param[in] target The targeted GPU architecture */ - void set_target(GPUTarget target); + void set_target(GPUTarget target) + { + _target = target; + } /** Set the targeted GPU architecture according to the CL device * @@ -173,7 +232,10 @@ public: * * @return The targeted GPU architecture. */ - GPUTarget get_target() const; + GPUTarget get_target() const + { + return _target; + } /** Get the maximum workgroup size for the device the CLKernelLibrary uses. * @@ -207,18 +269,6 @@ private: */ template void add_tensor_argument(unsigned int &idx, const ICLTensor *tensor, const Window &window); - /** Returns the number of arguments enqueued per array object. - * - * @return The number of arguments enqueued per array object. - */ - template - unsigned int num_arguments_per_array() const; - /** Returns the number of arguments enqueued per tensor object. - * - * @return The number of arguments enqueued per tensor object. - */ - template - unsigned int num_arguments_per_tensor() const; protected: cl::Kernel _kernel; /**< OpenCL kernel to run */ @@ -246,6 +296,8 @@ void enqueue(cl::CommandQueue &queue, ICLKernel &kernel, const Window &window, c template void ICLKernel::add_array_argument(unsigned &idx, const ICLArray *array, const Strides &strides, unsigned int num_dimensions, const Window &window) { + ARM_COMPUTE_ERROR_ON(array == nullptr); + // Calculate offset to the start of the window unsigned int offset_first_element = 0; @@ -269,23 +321,5 @@ void ICLKernel::add_array_argument(unsigned &idx, const ICLArray *array, cons "add_%dD_array_argument() is supposed to add exactly %d arguments to the kernel", dimension_size, num_arguments_per_array()); ARM_COMPUTE_UNUSED(idx_start); } - -template -void ICLKernel::add_1D_array_argument(unsigned int &idx, const ICLArray *array, const Strides &strides, unsigned int num_dimensions, const Window &window) -{ - add_array_argument(idx, array, strides, num_dimensions, window); -} - -template -unsigned int ICLKernel::num_arguments_per_array() const -{ - return num_arguments_per_tensor(); -} - -template -unsigned int ICLKernel::num_arguments_per_tensor() const -{ - return 2 + 2 * dimension_size; -} } #endif /*__ARM_COMPUTE_ICLKERNEL_H__ */ -- cgit v1.2.1