From 23882a9014eb3972bca958206866c8e0d0b829cc Mon Sep 17 00:00:00 2001 From: SiCong Li Date: Wed, 28 Jun 2023 09:49:45 +0100 Subject: Add GpuKernelArgumentBinding for runtime argument setting * Add flexible runtime argument setting that accepts argument bindings exported from ckw. * Introduce internal build flag ACL_INTERNAL_TEST_CKW_IN_DF. If set to true, ckw will be tested in dynamic fusion validation tests. Otherwise it will not be tested and the dynamic fusion will keep using ClTemplateWriter instead. * Fix CKW sampler for elementwise binary to deal with tile sizes > 1 in both dimensions Resolves: COMPMID-6282 Partially resolves: COMPMID-6260 Signed-off-by: SiCong Li Change-Id: I0ab225a4484eb2119643d900a4e72806558626ee Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/9917 Tested-by: Arm Jenkins Reviewed-by: Jakub Sujak Reviewed-by: Anitha Raj Reviewed-by: Viet-Hoa Do Comments-Addressed: Arm Jenkins Benchmark: Arm Jenkins --- .../sketch/gpu/ckw_driver/components/GpuCkwCast.cpp | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) (limited to 'src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwCast.cpp') diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwCast.cpp b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwCast.cpp index dd71c55df2..8d7e6a8c37 100644 --- a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwCast.cpp +++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwCast.cpp @@ -23,14 +23,15 @@ */ #include "GpuCkwCast.h" -#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwKernelWriter.h" -#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwScopedKernelWriter.h" #include "arm_compute/core/Error.h" #include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/helpers/AdjustVecSize.h" #include "ckw/TensorTileSampler.h" #include "src/core/helpers/WindowHelpers.h" +#include "src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h" #include 
"src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h" +#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwKernelWriter.h" +#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwScopedKernelWriter.h" #include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwVariableTable.h" #include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/TypeConverter.h" #include @@ -84,8 +85,8 @@ inline TensorTileSampler create_sampler(GpuCkwScopedKernelWriter &writer, int32_ } // namespace GpuCkwCast::GpuCkwCast(ComponentId id, - const ArgumentPack &tensors, - const Attributes &attributes) + const ArgumentPack &tensors, + const Attributes &attributes) : IGpuCkwComponentDriver{ id, tensors }, _src{}, _dst{}, @@ -102,8 +103,8 @@ void GpuCkwCast::write_component_code(const ComponentGroup &comp_group, GpuCkwVa const unsigned int n0 = root_window.x().step(); const unsigned int m0 = root_window.y().step(); - GpuCkwComponentArgument *src = vtable.declare_variable(comp_group, writer, _src, "src"); - GpuCkwComponentArgument *dst = vtable.declare_variable(comp_group, writer, _dst, "dst"); + GpuCkwComponentArgument *src = vtable.declare_variable(comp_group, writer, _src, TensorStorageType::ClBufferUint8Ptr, "src"); + GpuCkwComponentArgument *dst = vtable.declare_variable(comp_group, writer, _dst, TensorStorageType::ClBufferUint8Ptr, "dst"); // Load the source tile and prepare the sampler. if(!src->has_tile()) @@ -124,7 +125,7 @@ void GpuCkwCast::write_component_code(const ComponentGroup &comp_group, GpuCkwVa if(!dst->has_tile()) { // Get Target datatype and convert it to ckw::DataType. - ckw::DataType target_dt = dynamic_fusion::to_ckw(_attributes.data_type()); + ckw::DataType target_dt = dynamic_fusion::to_ckw(_attributes.data_type()); // Create dst_tile based on src_tile dimensions and with target DataType. 
const TileInfo src_tile_info = src_tile.tile_info(); @@ -166,9 +167,9 @@ Window GpuCkwCast::get_window() const // Collapse Dim 1 (W) and Dim 2 (H) together, leave Dim 0 (C) unchanged // This is in line with the collapsing convention used by operators like Conv2d output_shape.collapse(2U, 1U); - constexpr unsigned int vector_size_byte_opencl = 16; - const unsigned int num_elems_processed_per_iteration = adjust_vec_size(vector_size_byte_opencl / _dst->element_size(), _dst->dimension(0)); - Window win = calculate_max_window(output_shape, Steps(num_elems_processed_per_iteration)); + constexpr unsigned int vector_size_byte_opencl = 16; + const unsigned int num_elems_processed_per_iteration = adjust_vec_size(vector_size_byte_opencl / _dst->element_size(), _dst->dimension(0)); + Window win = calculate_max_window(output_shape, Steps(num_elems_processed_per_iteration)); return win; } -- cgit v1.2.1