diff options
Diffstat (limited to 'src/core/CL/ICLKernel.cpp')
-rw-r--r-- | src/core/CL/ICLKernel.cpp | 123 |
1 files changed, 99 insertions, 24 deletions
diff --git a/src/core/CL/ICLKernel.cpp b/src/core/CL/ICLKernel.cpp index 2b259bf28a..ac53e7f1d2 100644 --- a/src/core/CL/ICLKernel.cpp +++ b/src/core/CL/ICLKernel.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2020 Arm Limited. + * Copyright (c) 2016-2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -25,43 +25,41 @@ #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/Helpers.h" + #include "src/core/helpers/Utils.h" #include <cstddef> -using namespace arm_compute; - -void arm_compute::enqueue(cl::CommandQueue &queue, ICLKernel &kernel, const Window &window, const cl::NDRange &lws_hint, bool use_dummy_work_items) +void arm_compute::enqueue(cl::CommandQueue &queue, + ICLKernel &kernel, + const Window &window, + const cl::NDRange &lws_hint, + bool use_dummy_work_items) { - if(kernel.kernel()() == nullptr) + if (kernel.kernel()() == nullptr) { return; } - for(unsigned int i = 0; i < Coordinates::num_max_dimensions; ++i) + for (unsigned int i = 0; i < Coordinates::num_max_dimensions; ++i) { ARM_COMPUTE_ERROR_ON(window[i].step() == 0); // Make sure that dimensions > Z are 1 ARM_COMPUTE_ERROR_ON((i >= 3) && ((window[i].end() - window[i].start()) != 1)); } - cl::NDRange gws = ICLKernel::gws_from_window(window); + cl::NDRange gws = ICLKernel::gws_from_window(window, use_dummy_work_items); // Check for empty NDRange - if(gws.dimensions() == 0) + if (gws.dimensions() == 0) { return; } - // Use dummy work-items - if(use_dummy_work_items) - { - gws.get()[0] = get_next_power_two(gws[0]); - gws.get()[1] = get_next_power_two(gws[1]); - } + kernel.cache_gws(gws); cl::NDRange valid_lws; - if(lws_hint[0] * lws_hint[1] * lws_hint[2] > kernel.get_max_workgroup_size()) + if (lws_hint[0] * lws_hint[1] * lws_hint[2] > kernel.get_max_workgroup_size()) { valid_lws = cl::NullRange; } @@ -72,14 +70,20 @@ void arm_compute::enqueue(cl::CommandQueue &queue, ICLKernel &kernel, const Wind cl::NDRange lws = cl::NullRange; - if((valid_lws[0] <= gws[0]) && (valid_lws[1] <= gws[1]) && (valid_lws[2] <= gws[2])) + if ((valid_lws[0] <= gws[0]) && (valid_lws[1] <= gws[1]) && (valid_lws[2] <= gws[2])) { lws = valid_lws; } + if (CLKernelLibrary::get().is_wbsm_supported()) + { + set_wbsm(kernel.kernel(), kernel.wbsm_hint()); + } queue.enqueueNDRangeKernel(kernel.kernel(), cl::NullRange, gws, lws); } +namespace arm_compute +{ template <unsigned int dimension_size> void ICLKernel::add_tensor_argument(unsigned &idx, const ICLTensor *tensor, const Window &window) { @@ -91,7 +95,7 @@ void ICLKernel::add_tensor_argument(unsigned &idx, const ICLTensor *tensor, cons // Calculate offset to the start of the window unsigned int offset_first_element = info->offset_first_element_in_bytes(); - for(unsigned int n = 0; n < info->num_dimensions(); ++n) + for (unsigned int n = 0; n < info->num_dimensions(); ++n) { offset_first_element += (window.is_broadcasted(n) ? 0 : window[n].start()) * strides[n]; } @@ -99,24 +103,78 @@ void ICLKernel::add_tensor_argument(unsigned &idx, const ICLTensor *tensor, cons unsigned int idx_start = idx; _kernel.setArg(idx++, tensor->cl_buffer()); - for(unsigned int d = 0; d < dimension_size; ++d) + for (unsigned int d = 0; d < dimension_size; ++d) { - _kernel.setArg<cl_uint>(idx++, strides[d]); - _kernel.setArg<cl_uint>(idx++, strides[d] * window[d].step()); + _kernel.setArg<cl_uint>(idx++, window.is_broadcasted(d) ? 0 : strides[d]); + _kernel.setArg<cl_uint>(idx++, window.is_broadcasted(d) ? 0 : (strides[d] * window[d].step())); } _kernel.setArg<cl_uint>(idx++, offset_first_element); ARM_COMPUTE_ERROR_ON_MSG_VAR(idx_start + num_arguments_per_tensor<dimension_size>() != idx, - "add_%dD_tensor_argument() is supposed to add exactly %d arguments to the kernel", dimension_size, num_arguments_per_tensor<dimension_size>()); + "add_%dD_tensor_argument() is supposed to add exactly %d arguments to the kernel", + dimension_size, num_arguments_per_tensor<dimension_size>()); ARM_COMPUTE_UNUSED(idx_start); } +void ICLKernel::add_3d_tensor_nhw_argument(unsigned int &idx, const ICLTensor *tensor) +{ + ARM_COMPUTE_ERROR_ON(tensor == nullptr); + + const ITensorInfo *info = tensor->info(); + ARM_COMPUTE_ERROR_ON(info == nullptr); + const Strides &strides = info->strides_in_bytes(); + + // Tensor poniter + _kernel.setArg(idx++, tensor->cl_buffer()); + + // Add stride_y, stride_z + _kernel.setArg<cl_uint>(idx++, strides[1]); + _kernel.setArg<cl_uint>(idx++, strides[2]); + + // Tensor dimensions + _kernel.setArg<cl_uint>(idx++, info->dimension(0)); + _kernel.setArg<cl_uint>(idx++, info->dimension(1)); + _kernel.setArg<cl_uint>(idx++, info->dimension(2)); + + // Offset of first element + unsigned int offset_first_element = info->offset_first_element_in_bytes(); + _kernel.setArg<cl_uint>(idx++, offset_first_element); +} + +void ICLKernel::add_4d_tensor_nhwc_argument(unsigned int &idx, const ICLTensor *tensor) +{ + ARM_COMPUTE_ERROR_ON(tensor == nullptr); + + const ITensorInfo *info = tensor->info(); + ARM_COMPUTE_ERROR_ON(info == nullptr); + const Strides &strides = info->strides_in_bytes(); + + // Tensor poniter + _kernel.setArg(idx++, tensor->cl_buffer()); + + // Add stride_y, stride_z and stride_w + _kernel.setArg<cl_uint>(idx++, strides[1]); + _kernel.setArg<cl_uint>(idx++, strides[2]); + _kernel.setArg<cl_uint>(idx++, strides[3]); + + // Tensor dimensions + _kernel.setArg<cl_uint>(idx++, info->dimension(0)); + _kernel.setArg<cl_uint>(idx++, info->dimension(1)); + _kernel.setArg<cl_uint>(idx++, info->dimension(2)); + _kernel.setArg<cl_uint>(idx++, info->dimension(3)); + + // Offset of first element + unsigned int offset_first_element = info->offset_first_element_in_bytes(); + _kernel.setArg<cl_uint>(idx++, offset_first_element); +} + #ifndef DOXYGEN_SKIP_THIS template void ICLKernel::add_tensor_argument<1>(unsigned &idx, const ICLTensor *tensor, const Window &window); template void ICLKernel::add_tensor_argument<2>(unsigned &idx, const ICLTensor *tensor, const Window &window); template void ICLKernel::add_tensor_argument<3>(unsigned &idx, const ICLTensor *tensor, const Window &window); template void ICLKernel::add_tensor_argument<4>(unsigned &idx, const ICLTensor *tensor, const Window &window); +template void ICLKernel::add_tensor_argument<5>(unsigned &idx, const ICLTensor *tensor, const Window &window); #endif /* DOXYGEN_SKIP_THIS */ void ICLKernel::set_target(cl::Device &device) @@ -126,16 +184,16 @@ void ICLKernel::set_target(cl::Device &device) size_t ICLKernel::get_max_workgroup_size() { - if(_max_workgroup_size == 0) + if (_max_workgroup_size == 0) { _max_workgroup_size = CLKernelLibrary::get().max_local_workgroup_size(_kernel); } return _max_workgroup_size; } -cl::NDRange ICLKernel::gws_from_window(const Window &window) +cl::NDRange ICLKernel::gws_from_window(const Window &window, bool use_dummy_work_items) { - if((window.x().end() - window.x().start()) == 0 || (window.y().end() - window.y().start()) == 0) + if ((window.x().end() - window.x().start()) == 0 || (window.y().end() - window.y().start()) == 0) { return cl::NullRange; } @@ -144,5 +202,22 @@ cl::NDRange ICLKernel::gws_from_window(const Window &window) (window.y().end() - window.y().start()) / window.y().step(), (window.z().end() - window.z().start()) / window.z().step()); + if (use_dummy_work_items) + { + gws.get()[0] = get_next_power_two(gws[0]); + gws.get()[1] = get_next_power_two(gws[1]); + } + return gws; } + +cl::NDRange ICLKernel::get_cached_gws() const +{ + return _cached_gws; +} + +void ICLKernel::cache_gws(const cl::NDRange &gws) +{ + _cached_gws = gws; +} +} // namespace arm_compute |