From 04f4620cf999846a44089c81720aa920edec6993 Mon Sep 17 00:00:00 2001
From: Viet-Hoa Do <viet-hoa.do@arm.com>
Date: Wed, 14 Dec 2022 14:49:56 +0000
Subject: Add multiple output support for dynamic fusion

* The dependency graph can now schedule any acyclic graph into
  a sequential list of operators. This is needed as the output
  operators now form branches in the graph.
* Fix the definition of input, output and intermediate tensors
  in GpuKernelComponentGroup to support a non-linear but sequential
  list of operators.
* Add a constraint on GpuOperatorGroup to enforce a strictly linear
  fusion style, but allow output operators as the only form of
  branch.

Resolves: COMPMID-5771
Signed-off-by: Viet-Hoa Do <viet-hoa.do@arm.com>
Change-Id: I68de3a31a2456145081f0a397e4e61dd66327682
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/8823
Reviewed-by: Gunes Bayir <gunes.bayir@arm.com>
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
---
 .../validation/dynamic_fusion/gpu/Integration.cpp  | 101 +++++++++++++++++++++
 1 file changed, 101 insertions(+)

(limited to 'tests/validation/dynamic_fusion')
diff --git a/tests/validation/dynamic_fusion/gpu/Integration.cpp b/tests/validation/dynamic_fusion/gpu/Integration.cpp
index 58d2215e64..a5716ce1e1 100644
--- a/tests/validation/dynamic_fusion/gpu/Integration.cpp
+++ b/tests/validation/dynamic_fusion/gpu/Integration.cpp
@@ -28,6 +28,7 @@
 #include "arm_compute/dynamic_fusion/sketch/OperatorAttributes.h"
 #include "arm_compute/dynamic_fusion/sketch/gpu/GpuWorkloadSketch.h"
 #include "arm_compute/dynamic_fusion/sketch/gpu/operators/GpuConv2d.h"
+#include "arm_compute/dynamic_fusion/sketch/gpu/operators/GpuAdd.h"
 #include "arm_compute/dynamic_fusion/sketch/gpu/operators/GpuOutput.h"
 
 #include "tests/CL/CLAccessor.h"
@@ -36,6 +37,7 @@
 #include "tests/validation/dynamic_fusion/Utils.h"
 #include "tests/validation/reference/ConvolutionLayer.h"
 #include "tests/validation/reference/Permute.h"
+#include "tests/validation/reference/ElementwiseOperations.h"
 
 using namespace arm_compute::experimental::dynamic_fusion;
 using namespace arm_compute::test::validation::utils;
@@ -137,6 +139,105 @@ TEST_CASE(Conv2d, framework::DatasetMode::ALL)
     RelativeTolerance<float> tolerance_f32(0.001f); /**< Tolerance value for comparing reference's output against implementation's output for floating point data types */
     validate(CLAccessor(t_dst), ref_t_dst_nchw, tolerance_f32);
 }
+TEST_CASE(Add_Output_Add_Output, framework::DatasetMode::ALL)
+{
+    /* Fuse two additions where each result is exported through its own output operator:
+     *   out_0 = in_0 + in_1
+     *   out_1 = out_0 + in_2   (the sketch feeds the intermediate ans_0 to the second add)
+     */
+    CLScheduler::get().default_reinit();

+    const auto data_type      = DataType::F32;
+    const auto t_input_shape  = TensorShape(8, 2, 1); // Shared by all three inputs and by the reference tensors

+    // Create a new workload sketch on top of the CL compile context
+    auto              cl_compile_ctx = CLKernelLibrary::get().get_compile_context();
+    auto              gpu_ctx        = GpuWorkloadContext{ &cl_compile_ctx };
+    GpuWorkloadSketch sketch{ &gpu_ctx };

+    auto in_0_info = sketch.create_tensor_info(t_input_shape, 1, data_type);
+    auto in_1_info = sketch.create_tensor_info(t_input_shape, 1, data_type);
+    auto in_2_info = sketch.create_tensor_info(t_input_shape, 1, data_type);

+    auto out_0_info = sketch.create_tensor_info(); // No shape/type given; presumably filled in by GpuOutput::create_op - TODO confirm
+    auto out_1_info = sketch.create_tensor_info();

+    auto ans_0_info = sketch.create_tensor_info(); // Result of the first add; never bound to a user tensor in this test
+    auto ans_1_info = sketch.create_tensor_info(); // Result of the second add; never bound to a user tensor in this test

+    GpuAdd::create_op(sketch, &in_0_info, &in_1_info, &ans_0_info);
+    GpuOutput::create_op(sketch, &ans_0_info, &out_0_info); // ans_0 is consumed here and again by the next add, i.e. the graph branches
+    GpuAdd::create_op(sketch, &ans_0_info, &in_2_info, &ans_1_info);
+    GpuOutput::create_op(sketch, &ans_1_info, &out_1_info);

+    // Configure runtime from the finished sketch
+    ClWorkloadRuntime runtime;
+    runtime.configure(sketch);

+    // (Important) Allocate auxiliary tensor memory if there is any
+    // Instead of using ACL allocated memory, the user can choose to import memory into the tensors
+    for(auto &data : runtime.get_auxiliary_tensors())
+    {
+        CLTensor     *tensor      = data.first;
+        AuxMemoryInfo aux_mem_req = data.second;
+        tensor->allocator()->init(*data.first->info(), aux_mem_req.alignment); // data.first is the same pointer as `tensor`
+        tensor->allocator()->allocate(); // Use ACL allocated memory
+        // auto buf = cl::Buffer();
+        // tensor->allocator()->import_memory(buf);  // Or, import external memory
+    }

+    // Construct user tensors: one per sketch input and one per sketch output
+    CLTensor t_in_0{};
+    CLTensor t_in_1{};
+    CLTensor t_in_2{};

+    CLTensor t_out_0{};
+    CLTensor t_out_1{};

+    // Initialize user tensors from the tensor infos created through the sketch
+    t_in_0.allocator()->init(in_0_info);
+    t_in_1.allocator()->init(in_1_info);
+    t_in_2.allocator()->init(in_2_info);

+    t_out_0.allocator()->init(out_0_info);
+    t_out_1.allocator()->init(out_1_info);

+    // Allocate and fill user tensors (only the inputs are filled)
+    // Instead of using ACL allocator, the user can choose to import memory into the tensors
+    t_in_0.allocator()->allocate();
+    t_in_1.allocator()->allocate();
+    t_in_2.allocator()->allocate();

+    t_out_0.allocator()->allocate();
+    t_out_1.allocator()->allocate();

+    fill<float>(CLAccessor(t_in_0), 0, library.get()); // 2nd argument is presumably a fill seed - TODO confirm against Utils.h
+    fill<float>(CLAccessor(t_in_1), 1, library.get());
+    fill<float>(CLAccessor(t_in_2), 2, library.get());

+    // Run runtime; NOTE(review): the tensor order presumably has to follow the sketch's tensor-info order - confirm
+    runtime.run({ &t_in_0, &t_in_1, &t_in_2, &t_out_0, &t_out_1 });

+    // Create reference tensors with the same shape and data type as the inputs
+    SimpleTensor<float> ref_t_in_0{ t_input_shape, data_type, 1, QuantizationInfo() };
+    SimpleTensor<float> ref_t_in_1{ t_input_shape, data_type, 1, QuantizationInfo() };
+    SimpleTensor<float> ref_t_in_2{ t_input_shape, data_type, 1, QuantizationInfo() };

+    SimpleTensor<float> ref_t_out_0{ t_input_shape, data_type, 1, QuantizationInfo() };
+    SimpleTensor<float> ref_t_out_1{ t_input_shape, data_type, 1, QuantizationInfo() };

+    // Fill reference inputs with the same fill arguments (0, 1, 2) used for the CL tensors above
+    fill<float>(ref_t_in_0, 0, library.get());
+    fill<float>(ref_t_in_1, 1, library.get());
+    fill<float>(ref_t_in_2, 2, library.get());

+    reference::arithmetic_operation(ArithmeticOperation::ADD, ref_t_in_0, ref_t_in_1, ref_t_out_0, ConvertPolicy::WRAP);
+    reference::arithmetic_operation(ArithmeticOperation::ADD, ref_t_out_0, ref_t_in_2, ref_t_out_1, ConvertPolicy::WRAP); // Chained on the first reference result

+    RelativeTolerance<float> tolerance_f32(0.001f); /**< Tolerance value for comparing reference's output against implementation's output for floating point data types */
+    validate(CLAccessor(t_out_0), ref_t_out_0, tolerance_f32);
+    validate(CLAccessor(t_out_1), ref_t_out_1, tolerance_f32);
+}
 TEST_SUITE(Invalid_Fusion_Should_Fail)
 TEST_CASE(Multiple_Complex_Ops_0, framework::DatasetMode::ALL)
 {
-- 
cgit v1.2.1