From 0021d750d66d199c411df00cdd8308c325f1fef3 Mon Sep 17 00:00:00 2001 From: Diego Lopez Recas Date: Mon, 18 Dec 2017 14:42:56 +0000 Subject: IVGCVSW-863 Broadcast support in CL/NEON Arithmetic Add Also, added instrumentation to support generic tensor broadcasting for NEON and CL backends. Change-Id: I1bc5747a286e1a4b464c209067581e103d473b9a Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/114201 Reviewed-by: Anthony Barbier Tested-by: Jenkins --- src/core/CL/ICLKernel.cpp | 81 +++++++---------------------------------------- 1 file changed, 11 insertions(+), 70 deletions(-) (limited to 'src/core/CL/ICLKernel.cpp') diff --git a/src/core/CL/ICLKernel.cpp b/src/core/CL/ICLKernel.cpp index 7da74381d3..491e0c4b91 100644 --- a/src/core/CL/ICLKernel.cpp +++ b/src/core/CL/ICLKernel.cpp @@ -43,10 +43,11 @@ void arm_compute::enqueue(cl::CommandQueue &queue, ICLKernel &kernel, const Wind return; } - // Make sure that dimensions > Z are 1 - for(unsigned int i = 3; i < Coordinates::num_max_dimensions; ++i) + for(unsigned int i = 0; i < Coordinates::num_max_dimensions; ++i) { - ARM_COMPUTE_ERROR_ON((window[i].end() - window[i].start()) != 1); + ARM_COMPUTE_ERROR_ON(window[i].step() == 0); + // Make sure that dimensions > Z are 1 + ARM_COMPUTE_ERROR_ON((i >= 3) && ((window[i].end() - window[i].start()) != 1)); } cl::NDRange gws = ICLKernel::gws_from_window(window); @@ -77,16 +78,6 @@ void arm_compute::enqueue(cl::CommandQueue &queue, ICLKernel &kernel, const Wind queue.enqueueNDRangeKernel(kernel.kernel(), cl::NullRange, gws, lws); } -ICLKernel::ICLKernel() - : _kernel(nullptr), _lws_hint(CLKernelLibrary::get().default_ndrange()), _target(GPUTarget::MIDGARD), _config_id(arm_compute::default_config_id), _max_workgroup_size(0) -{ -} - -cl::Kernel &ICLKernel::kernel() -{ - return _kernel; -} - template void ICLKernel::add_tensor_argument(unsigned &idx, const ICLTensor *tensor, const Window &window) { @@ -106,10 +97,10 @@ void ICLKernel::add_tensor_argument(unsigned &idx, const ICLTensor *tensor, cons unsigned int idx_start = idx; _kernel.setArg(idx++, tensor->cl_buffer()); - for(unsigned int dimension = 0; dimension < dimension_size; dimension++) + for(unsigned int d = 0; d < dimension_size; ++d) { - _kernel.setArg(idx++, strides[dimension]); - _kernel.setArg(idx++, strides[dimension] * window[dimension].step()); + _kernel.setArg(idx++, strides[d]); + _kernel.setArg(idx++, strides[d] * window[d].step()); } _kernel.setArg(idx++, offset_first_element); @@ -119,66 +110,16 @@ void ICLKernel::add_tensor_argument(unsigned &idx, const ICLTensor *tensor, cons ARM_COMPUTE_UNUSED(idx_start); } -void ICLKernel::add_1D_tensor_argument(unsigned int &idx, const ICLTensor *tensor, const Window &window) -{ - add_tensor_argument<1>(idx, tensor, window); -} - -void ICLKernel::add_2D_tensor_argument(unsigned int &idx, const ICLTensor *tensor, const Window &window) -{ - add_tensor_argument<2>(idx, tensor, window); -} - -void ICLKernel::add_3D_tensor_argument(unsigned int &idx, const ICLTensor *tensor, const Window &window) -{ - add_tensor_argument<3>(idx, tensor, window); -} - -void ICLKernel::add_4D_tensor_argument(unsigned int &idx, const ICLTensor *tensor, const Window &window) -{ - add_tensor_argument<4>(idx, tensor, window); -} - -unsigned int ICLKernel::num_arguments_per_1D_array() const -{ - return num_arguments_per_array<1>(); -} - -unsigned int ICLKernel::num_arguments_per_1D_tensor() const -{ - return num_arguments_per_tensor<1>(); -} - -unsigned int ICLKernel::num_arguments_per_2D_tensor() const -{ - return num_arguments_per_tensor<2>(); -} - -unsigned int ICLKernel::num_arguments_per_3D_tensor() const -{ - return num_arguments_per_tensor<3>(); -} - -unsigned int ICLKernel::num_arguments_per_4D_tensor() const -{ - return num_arguments_per_tensor<4>(); -} +template void ICLKernel::add_tensor_argument<1>(unsigned &idx, const ICLTensor *tensor, const Window &window); +template void ICLKernel::add_tensor_argument<2>(unsigned &idx, const ICLTensor *tensor, const Window &window); +template void ICLKernel::add_tensor_argument<3>(unsigned &idx, const ICLTensor *tensor, const Window &window); +template void ICLKernel::add_tensor_argument<4>(unsigned &idx, const ICLTensor *tensor, const Window &window); void ICLKernel::set_target(cl::Device &device) { _target = get_target_from_device(device); } -void ICLKernel::set_target(GPUTarget target) -{ - _target = target; -} - -GPUTarget ICLKernel::get_target() const -{ - return _target; -} - size_t ICLKernel::get_max_workgroup_size() { if(_max_workgroup_size == 0) -- cgit v1.2.1