diff options
author | SiCong Li <sicong.li@arm.com> | 2023-06-28 09:49:45 +0100 |
---|---|---|
committer | SiCong Li <sicong.li@arm.com> | 2023-07-25 15:48:50 +0000 |
commit | 23882a9014eb3972bca958206866c8e0d0b829cc (patch) | |
tree | 9139b91699099160e26a64abd8cf182bd7447278 /src/dynamic_fusion/runtime/gpu | |
parent | 0a59e69fd922b02d9e3b5b043ee7f891061df7be (diff) | |
download | ComputeLibrary-23882a9014eb3972bca958206866c8e0d0b829cc.tar.gz |
Add GpuKernelArgumentBinding for runtime argument setting
* Add flexible runtime argument setting that accepts argument bindings
exported from ckw.
* Introduce internal build flag ACL_INTERNAL_TEST_CKW_IN_DF. If set to
true, ckw will be tested in the dynamic fusion validation tests. Otherwise,
ckw will not be tested and dynamic fusion will keep using
ClTemplateWriter.
* Fix CKW sampler for elementwise binary to deal with tile sizes > 1
in both dimensions
Resolves: COMPMID-6282
Partially resolves: COMPMID-6260
Signed-off-by: SiCong Li <sicong.li@arm.com>
Change-Id: I0ab225a4484eb2119643d900a4e72806558626ee
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/9917
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Jakub Sujak <jakub.sujak@arm.com>
Reviewed-by: Anitha Raj <Anitha.Raj@arm.com>
Reviewed-by: Viet-Hoa Do <viet-hoa.do@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
Benchmark: Arm Jenkins <bsgcomp@arm.com>
Diffstat (limited to 'src/dynamic_fusion/runtime/gpu')
4 files changed, 112 insertions, 34 deletions
diff --git a/src/dynamic_fusion/runtime/gpu/cl/ClKernelRuntime.cpp b/src/dynamic_fusion/runtime/gpu/cl/ClKernelRuntime.cpp index 6a57482bb2..92ca8557f1 100644 --- a/src/dynamic_fusion/runtime/gpu/cl/ClKernelRuntime.cpp +++ b/src/dynamic_fusion/runtime/gpu/cl/ClKernelRuntime.cpp @@ -24,6 +24,9 @@ #include "ClKernelRuntime.h" #include "arm_compute/core/CL/ICLTensor.h" #include "src/core/CL/CLUtils.h" +#ifdef ACL_INTERNAL_TEST_CKW_IN_DF +#include "src/dynamic_fusion/runtime/gpu/cl/ckw_driver/GpuCkwKernelArgumentsHelpers.h" +#endif // ACL_INTERNAL_TEST_CKW_IN_DF #include "src/dynamic_fusion/sketch/gpu/GpuKernelSourceCode.h" #include "src/gpu/cl/ClKernelLibrary.h" @@ -57,6 +60,8 @@ void ClKernelRuntime::configure(const ClCompileContext &compile_ctx, const GpuKe _arguments = code.arguments(); } +#ifndef ACL_INTERNAL_TEST_CKW_IN_DF + inline void ClKernelRuntime::add_tensor_argument(unsigned int &idx, const GpuKernelArgumentInfo &arg, const ICLTensor *tensor, const Window &arg_slice, std::vector<cl::Image2D> &cl_images) { ARM_COMPUTE_ERROR_ON_NULLPTR(tensor); @@ -163,21 +168,65 @@ inline void ClKernelRuntime::add_tensor_argument(unsigned int &idx, const GpuKer } } +#else // ACL_INTERNAL_TEST_CKW_IN_DF +inline void ClKernelRuntime::add_kernel_argument(unsigned int &idx, const GpuKernelArgumentBinding &arg, const ICLTensor *tensor, std::vector<cl::Image2D> &cl_images) +{ + switch(arg.type()) + { + case GpuKernelArgumentBinding::Type::TensorStorage: + { + switch(arg.tensor_storage_type()) + { + case TensorStorageType::ClBufferUint8Ptr: + { + cl_add_buffer_argument(_kernel, idx, tensor->cl_buffer()); + break; + } + case TensorStorageType::ClImage2dReadOnly: + { + cl::Image2D tensor_image2d = create_image2d_from_tensor(tensor, CLImage2DType::ReadOnly); + cl_images.push_back(tensor_image2d); + cl_add_texture_argument(_kernel, idx, tensor_image2d); + break; + } + case TensorStorageType::ClImage2dWriteOnly: + { + cl::Image2D tensor_image2d = create_image2d_from_tensor(tensor, 
CLImage2DType::WriteOnly); + cl_images.push_back(tensor_image2d); + cl_add_texture_argument(_kernel, idx, tensor_image2d); + break; + } + default: + { + ARM_COMPUTE_ERROR("Do not accept other TensorStorageType"); + break; + } + } + break; + } + case GpuKernelArgumentBinding::Type::TensorComponent: + { + cl_add_tensor_component_argument(_kernel, idx, tensor, arg.tensor_component_type()); + break; + } + default: + { + ARM_COMPUTE_ERROR("Do not accept other types of kernel arguments"); + break; + } + } +} + +#endif // ACL_INTERNAL_TEST_CKW_IN_DF void ClKernelRuntime::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) { ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); Window slice = window.first_slice_window_3D(); - // Don't slice matrix along the z dimension if matrix has just 2 dimensions and matrix A more than 2 - // This scenario can happen when the matrix multiplication is used to perform a convolution operation - Window slice_fixed_z = slice; - slice_fixed_z.set(Window::DimX, Window::Dimension(0, 1, 1)); - slice_fixed_z.set(Window::DimY, Window::Dimension(0, 1, 1)); /// NOTE: Parameters extracted from old kernels. So far they seem to be constant /// but we may need to make them into another configuration passed from GpuWorkloadSourceCode if needed in the future - constexpr bool slide_along_dimz = true; constexpr bool skip_sliding_window = false; constexpr bool use_dummy_work_items = false; @@ -185,23 +234,27 @@ void ClKernelRuntime::run_op(ITensorPack &tensors, const Window &window, cl::Com do { // Set kernel arguments - Window arg_slice = slice; // CLImages created from tensor arguments. 
Need to be retained until enqueue std::vector<cl::Image2D> cl_images; +#ifndef ACL_INTERNAL_TEST_CKW_IN_DF for(auto id_arg : _arguments) { const auto arg = id_arg.second; auto tensor = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(id_arg.first)); ARM_COMPUTE_ERROR_ON_NULLPTR(tensor); ARM_COMPUTE_ERROR_ON_NULLPTR(tensor->info()); - if(!slide_along_dimz) - { - // The stride_z for matrix must be zero if we do not slice - ARM_COMPUTE_ERROR_ON(tensor->info()->strides_in_bytes()[3] != 0); - arg_slice = slice_fixed_z; - } - add_tensor_argument(idx, *arg.kernel_argument_info(), tensor, arg_slice, cl_images); + add_tensor_argument(idx, *arg.kernel_argument_info(), tensor, slice, cl_images); + } + +#else // ACL_INTERNAL_TEST_CKW_IN_DF + for(const auto &arg : _arguments) + { + auto tensor = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(arg.id())); + ARM_COMPUTE_ERROR_ON_NULLPTR(tensor); + ARM_COMPUTE_ERROR_ON_NULLPTR(tensor->info()); + add_kernel_argument(idx, arg, tensor, cl_images); } +#endif // ACL_INTERNAL_TEST_CKW_IN_DF // Dispatch kernel enqueue(queue, *this, slice, lws_hint(), use_dummy_work_items); diff --git a/src/dynamic_fusion/runtime/gpu/cl/ClKernelRuntime.h b/src/dynamic_fusion/runtime/gpu/cl/ClKernelRuntime.h index 4787acabcd..92e73503ce 100644 --- a/src/dynamic_fusion/runtime/gpu/cl/ClKernelRuntime.h +++ b/src/dynamic_fusion/runtime/gpu/cl/ClKernelRuntime.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022 Arm Limited. + * Copyright (c) 2022-2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -29,6 +29,8 @@ #include "src/gpu/cl/ClCompileContext.h" #include "src/gpu/cl/IClKernel.h" +#include <vector> + namespace arm_compute { namespace experimental @@ -57,6 +59,7 @@ public: virtual void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override; private: +#ifndef ACL_INTERNAL_TEST_CKW_IN_DF /** Set a kernel tensor argument * * @param[in,out] idx Index at which to start adding the tensor's arguments. 
Will be incremented by the number of kernel arguments set. @@ -66,9 +69,19 @@ private: * @param[out] cl_images Extra cl images created from the tensor (will need to be retained until the kernel is enqueued) */ inline void add_tensor_argument(unsigned int &idx, const GpuKernelArgumentInfo &arg, const ICLTensor *tensor, const Window &arg_slice, std::vector<cl::Image2D> &cl_images); +#else // ACL_INTERNAL_TEST_CKW_IN_DF + /** Set a kernel argument as part of a tensor + * + * @param[in,out] idx Index at which to start adding the tensor's arguments. Will be incremented by the number of kernel arguments set. + * @param[in] arg Kernel argument binding, as part of @p tensor + * @param[in] tensor Tensor of which the kernel argument @p arg is a part of + * @param[out] cl_images Extra cl images created from the tensor (will need to be retained until the kernel is enqueued) + */ + inline void add_kernel_argument(unsigned int &idx, const GpuKernelArgumentBinding &arg, const ICLTensor *tensor, std::vector<cl::Image2D> &cl_images); +#endif // ACL_INTERNAL_TEST_CKW_IN_DF private: - GpuKernelArgumentList _arguments{}; /** All kernel arguments required by the runtime */ + GpuKernelArgumentList _arguments{}; }; } // namespace dynamic_fusion diff --git a/src/dynamic_fusion/runtime/gpu/cl/ckw_driver/GpuCkwKernelArgumentsHelpers.cpp b/src/dynamic_fusion/runtime/gpu/cl/ckw_driver/GpuCkwKernelArgumentsHelpers.cpp index b273c2a20c..84fb279237 100644 --- a/src/dynamic_fusion/runtime/gpu/cl/ckw_driver/GpuCkwKernelArgumentsHelpers.cpp +++ b/src/dynamic_fusion/runtime/gpu/cl/ckw_driver/GpuCkwKernelArgumentsHelpers.cpp @@ -26,7 +26,11 @@ namespace arm_compute { -void cl_add_tensor_component_argument(cl::Kernel &kernel, unsigned int &idx, ICLTensor *tensor, ckw::TensorComponentType component) +namespace experimental +{ +namespace dynamic_fusion +{ +void cl_add_tensor_component_argument(cl::Kernel &kernel, unsigned int &idx, const ICLTensor *tensor, TensorComponentType component) { 
ARM_COMPUTE_ERROR_ON(tensor == nullptr); @@ -35,49 +39,49 @@ void cl_add_tensor_component_argument(cl::Kernel &kernel, unsigned int &idx, ICL switch(component) { - case ckw::TensorComponentType::OffsetFirstElement: + case TensorComponentType::OffsetFirstElement: kernel.setArg<cl_uint>(idx++, info->offset_first_element_in_bytes()); break; - case ckw::TensorComponentType::Stride0: + case TensorComponentType::Stride0: kernel.setArg<cl_uint>(idx++, strides[0]); break; - case ckw::TensorComponentType::Stride1: + case TensorComponentType::Stride1: kernel.setArg<cl_uint>(idx++, strides[1]); break; - case ckw::TensorComponentType::Stride2: + case TensorComponentType::Stride2: kernel.setArg<cl_uint>(idx++, strides[2]); break; - case ckw::TensorComponentType::Stride3: + case TensorComponentType::Stride3: kernel.setArg<cl_uint>(idx++, strides[3]); break; - case ckw::TensorComponentType::Stride4: + case TensorComponentType::Stride4: kernel.setArg<cl_uint>(idx++, strides[4]); break; - case ckw::TensorComponentType::Dim0: + case TensorComponentType::Dim0: kernel.setArg<cl_uint>(idx++, info->dimension(0)); break; - case ckw::TensorComponentType::Dim1: + case TensorComponentType::Dim1: kernel.setArg<cl_uint>(idx++, info->dimension(1)); break; - case ckw::TensorComponentType::Dim2: + case TensorComponentType::Dim2: kernel.setArg<cl_uint>(idx++, info->dimension(2)); break; - case ckw::TensorComponentType::Dim3: + case TensorComponentType::Dim3: kernel.setArg<cl_uint>(idx++, info->dimension(3)); break; - case ckw::TensorComponentType::Dim4: + case TensorComponentType::Dim4: kernel.setArg<cl_uint>(idx++, info->dimension(4)); break; - case ckw::TensorComponentType::Dim1xDim2: + case TensorComponentType::Dim1xDim2: kernel.setArg<cl_uint>(idx++, info->dimension(1) * info->dimension(2)); break; - case ckw::TensorComponentType::Dim2xDim3: + case TensorComponentType::Dim2xDim3: kernel.setArg<cl_uint>(idx++, info->dimension(2) * info->dimension(3)); break; - case 
ckw::TensorComponentType::Dim1xDim2xDim3: + case TensorComponentType::Dim1xDim2xDim3: kernel.setArg<cl_uint>(idx++, info->dimension(1) * info->dimension(2) * info->dimension(3)); break; - case ckw::TensorComponentType::Unknown: + case TensorComponentType::Unknown: default: ARM_COMPUTE_ERROR("Unknown tensor component"); } @@ -93,4 +97,6 @@ void cl_add_texture_argument(cl::Kernel &kernel, unsigned int &idx, const cl::Im kernel.setArg(idx++, image); } +} // namespace dynamic_fusion +} // namespace experimental } // namespace arm_compute diff --git a/src/dynamic_fusion/runtime/gpu/cl/ckw_driver/GpuCkwKernelArgumentsHelpers.h b/src/dynamic_fusion/runtime/gpu/cl/ckw_driver/GpuCkwKernelArgumentsHelpers.h index 3c785732a5..4cbb157a48 100644 --- a/src/dynamic_fusion/runtime/gpu/cl/ckw_driver/GpuCkwKernelArgumentsHelpers.h +++ b/src/dynamic_fusion/runtime/gpu/cl/ckw_driver/GpuCkwKernelArgumentsHelpers.h @@ -27,10 +27,14 @@ #include "arm_compute/core/CL/ICLTensor.h" -#include "ckw/TensorInfo.h" +#include "src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h" namespace arm_compute { +namespace experimental +{ +namespace dynamic_fusion +{ /** Select a Compute Kernel Writer tensor component from a tensor and add to the kernel's arguments at the specified index idx. * * @param[in,out] kernel OpenCL kernel to configure with the provided argument. @@ -38,7 +42,7 @@ namespace arm_compute * @param[in] tensor Tensor from which to access the tensor component. * @param[in] component Tensor component to select such as tensor dimensions, strides, etc. */ -void cl_add_tensor_component_argument(cl::Kernel &kernel, unsigned int &idx, ICLTensor *tensor, ckw::TensorComponentType component); +void cl_add_tensor_component_argument(cl::Kernel &kernel, unsigned int &idx, const ICLTensor *tensor, TensorComponentType component); /** Add an OpenCL buffer object to the kernel's arguments at the specified index @p idx. 
* @@ -56,6 +60,8 @@ void cl_add_buffer_argument(cl::Kernel &kernel, unsigned int &idx, const cl::Buf */ void cl_add_texture_argument(cl::Kernel &kernel, unsigned int &idx, const cl::Image &image); +} // namespace dynamic_fusion +} // namespace experimental } // namespace arm_compute #endif /* ACL_SRC_DYNAMIC_FUSION_RUNTIME_GPU_CL_CKW_DRIVER_GPUCKWKERNELARGUMENTSHELPERS */ |