From 04f4620cf999846a44089c81720aa920edec6993 Mon Sep 17 00:00:00 2001
From: Viet-Hoa Do
Date: Wed, 14 Dec 2022 14:49:56 +0000
Subject: Add multiple output support for dynamic fusion

* The dependency graph now can schedule any acyclic graph into a
  sequential list of operators. This is needed as the output operators
  now form branches in the graph.
* Fix the definition of input, output and intermediate tensors in
  GpuKernelComponentGroup to support non-linear but sequential list of
  operators.
* Add constraint on GpuOperatorGroup to enforce strictly linear fusion
  style, but allow output operator as the only form of branch.

Resolves: COMPMID-5771
Signed-off-by: Viet-Hoa Do
Change-Id: I68de3a31a2456145081f0a397e4e61dd66327682
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/8823
Reviewed-by: Gunes Bayir
Tested-by: Arm Jenkins
Comments-Addressed: Arm Jenkins
---
 .../validation/dynamic_fusion/gpu/Integration.cpp | 101 +++++++++++++++++++++
 1 file changed, 101 insertions(+)

(limited to 'tests/validation/dynamic_fusion')

diff --git a/tests/validation/dynamic_fusion/gpu/Integration.cpp b/tests/validation/dynamic_fusion/gpu/Integration.cpp
index 58d2215e64..a5716ce1e1 100644
--- a/tests/validation/dynamic_fusion/gpu/Integration.cpp
+++ b/tests/validation/dynamic_fusion/gpu/Integration.cpp
@@ -28,6 +28,7 @@
 #include "arm_compute/dynamic_fusion/sketch/OperatorAttributes.h"
 #include "arm_compute/dynamic_fusion/sketch/gpu/GpuWorkloadSketch.h"
 #include "arm_compute/dynamic_fusion/sketch/gpu/operators/GpuConv2d.h"
+#include "arm_compute/dynamic_fusion/sketch/gpu/operators/GpuAdd.h"
 #include "arm_compute/dynamic_fusion/sketch/gpu/operators/GpuOutput.h"
 
 #include "tests/CL/CLAccessor.h"
@@ -36,6 +37,7 @@
 #include "tests/validation/dynamic_fusion/Utils.h"
 #include "tests/validation/reference/ConvolutionLayer.h"
 #include "tests/validation/reference/Permute.h"
+#include "tests/validation/reference/ElementwiseOperations.h"
 
 using namespace arm_compute::experimental::dynamic_fusion;
 using namespace arm_compute::test::validation::utils;
@@ -137,6 +139,105 @@ TEST_CASE(Conv2d, framework::DatasetMode::ALL)
     RelativeTolerance<float> tolerance_f32(0.001f); /**< Tolerance value for comparing reference's output against implementation's output for floating point data types */
     validate(CLAccessor(t_dst), ref_t_dst_nchw, tolerance_f32);
 }
+TEST_CASE(Add_Output_Add_Output, framework::DatasetMode::ALL)
+{
+    /* Computation:
+     *   out_0 = in_0 + in_1
+     *   out_1 = out_0 + in_2
+     */
+    CLScheduler::get().default_reinit();
+
+    const auto data_type     = DataType::F32;
+    const auto t_input_shape = TensorShape(8, 2, 1);
+
+    // Create a new workload sketch
+    auto              cl_compile_ctx = CLKernelLibrary::get().get_compile_context();
+    auto              gpu_ctx        = GpuWorkloadContext{ &cl_compile_ctx };
+    GpuWorkloadSketch sketch{ &gpu_ctx };
+
+    auto in_0_info = sketch.create_tensor_info(t_input_shape, 1, data_type);
+    auto in_1_info = sketch.create_tensor_info(t_input_shape, 1, data_type);
+    auto in_2_info = sketch.create_tensor_info(t_input_shape, 1, data_type);
+
+    auto out_0_info = sketch.create_tensor_info();
+    auto out_1_info = sketch.create_tensor_info();
+
+    auto ans_0_info = sketch.create_tensor_info();
+    auto ans_1_info = sketch.create_tensor_info();
+
+    GpuAdd::create_op(sketch, &in_0_info, &in_1_info, &ans_0_info);
+    GpuOutput::create_op(sketch, &ans_0_info, &out_0_info);
+    GpuAdd::create_op(sketch, &ans_0_info, &in_2_info, &ans_1_info);
+    GpuOutput::create_op(sketch, &ans_1_info, &out_1_info);
+
+    // Configure runtime
+    ClWorkloadRuntime runtime;
+    runtime.configure(sketch);
+
+    // (Important) Allocate auxiliary tensor memory if there are any
+    // Instead of using ACL allocated memory, the user can choose to import memory into the tensors
+    for(auto &data : runtime.get_auxiliary_tensors())
+    {
+        CLTensor     *tensor      = data.first;
+        AuxMemoryInfo aux_mem_req = data.second;
+        tensor->allocator()->init(*data.first->info(), aux_mem_req.alignment);
+        tensor->allocator()->allocate(); // Use ACL allocated memory
+        // auto buf = cl::Buffer();
+        // tensor->allocator()->import_memory(buf); // Or, import external memory
+    }
+
+    // Construct user tensors
+    CLTensor t_in_0{};
+    CLTensor t_in_1{};
+    CLTensor t_in_2{};
+
+    CLTensor t_out_0{};
+    CLTensor t_out_1{};
+
+    // Initialize user tensors
+    t_in_0.allocator()->init(in_0_info);
+    t_in_1.allocator()->init(in_1_info);
+    t_in_2.allocator()->init(in_2_info);
+
+    t_out_0.allocator()->init(out_0_info);
+    t_out_1.allocator()->init(out_1_info);
+
+    // Allocate and fill user tensors
+    // Instead of using ACL allocator, the user can choose to import memory into the tensors
+    t_in_0.allocator()->allocate();
+    t_in_1.allocator()->allocate();
+    t_in_2.allocator()->allocate();
+
+    t_out_0.allocator()->allocate();
+    t_out_1.allocator()->allocate();
+
+    fill<float>(CLAccessor(t_in_0), 0, library.get());
+    fill<float>(CLAccessor(t_in_1), 1, library.get());
+    fill<float>(CLAccessor(t_in_2), 2, library.get());
+
+    // Run runtime
+    runtime.run({ &t_in_0, &t_in_1, &t_in_2, &t_out_0, &t_out_1 });
+
+    // Create reference
+    SimpleTensor<float> ref_t_in_0{ t_input_shape, data_type, 1, QuantizationInfo() };
+    SimpleTensor<float> ref_t_in_1{ t_input_shape, data_type, 1, QuantizationInfo() };
+    SimpleTensor<float> ref_t_in_2{ t_input_shape, data_type, 1, QuantizationInfo() };
+
+    SimpleTensor<float> ref_t_out_0{ t_input_shape, data_type, 1, QuantizationInfo() };
+    SimpleTensor<float> ref_t_out_1{ t_input_shape, data_type, 1, QuantizationInfo() };
+
+    // Fill reference
+    fill<float>(ref_t_in_0, 0, library.get());
+    fill<float>(ref_t_in_1, 1, library.get());
+    fill<float>(ref_t_in_2, 2, library.get());
+
+    reference::arithmetic_operation(ArithmeticOperation::ADD, ref_t_in_0, ref_t_in_1, ref_t_out_0, ConvertPolicy::WRAP);
+    reference::arithmetic_operation(ArithmeticOperation::ADD, ref_t_out_0, ref_t_in_2, ref_t_out_1, ConvertPolicy::WRAP);
+
+    RelativeTolerance<float> tolerance_f32(0.001f); /**< Tolerance value for comparing reference's output against implementation's output for floating point data types */
+    validate(CLAccessor(t_out_0), ref_t_out_0, tolerance_f32);
+    validate(CLAccessor(t_out_1), ref_t_out_1, tolerance_f32);
+}
 TEST_SUITE(Invalid_Fusion_Should_Fail)
 TEST_CASE(Multiple_Complex_Ops_0, framework::DatasetMode::ALL)
 {
-- 
cgit v1.2.1