diff options
author | Viet-Hoa Do <viet-hoa.do@arm.com> | 2022-12-14 14:49:56 +0000 |
---|---|---|
committer | Viet-Hoa Do <viet-hoa.do@arm.com> | 2022-12-23 14:11:34 +0000 |
commit | 04f4620cf999846a44089c81720aa920edec6993 (patch) | |
tree | 1c0080ac59d5b2aa500cd2b2ceffe0575e22a4b6 /tests/validation/dynamic_fusion/gpu | |
parent | 81fdaddaf36cb4c7ff0d2c52a370dd977a13dc72 (diff) | |
download | ComputeLibrary-04f4620cf999846a44089c81720aa920edec6993.tar.gz |
Add multiple output support for dynamic fusion
* The dependency graph can now schedule any acyclic graph into
a sequential list of operators. This is needed as the output
operators now form branches in the graph.
* Fix the definition of input, output and intermediate tensors
in GpuKernelComponentGroup to support non-linear but sequential
list of operators.
* Add constraint on GpuOperatorGroup to enforce strictly linear
fusion style, but allow output operator as the only form of
branch.
Resolves: COMPMID-5771
Signed-off-by: Viet-Hoa Do <viet-hoa.do@arm.com>
Change-Id: I68de3a31a2456145081f0a397e4e61dd66327682
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/8823
Reviewed-by: Gunes Bayir <gunes.bayir@arm.com>
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
Diffstat (limited to 'tests/validation/dynamic_fusion/gpu')
-rw-r--r-- | tests/validation/dynamic_fusion/gpu/Integration.cpp | 101 |
1 file changed, 101 insertions, 0 deletions
diff --git a/tests/validation/dynamic_fusion/gpu/Integration.cpp b/tests/validation/dynamic_fusion/gpu/Integration.cpp index 58d2215e64..a5716ce1e1 100644 --- a/tests/validation/dynamic_fusion/gpu/Integration.cpp +++ b/tests/validation/dynamic_fusion/gpu/Integration.cpp @@ -28,6 +28,7 @@ #include "arm_compute/dynamic_fusion/sketch/OperatorAttributes.h" #include "arm_compute/dynamic_fusion/sketch/gpu/GpuWorkloadSketch.h" #include "arm_compute/dynamic_fusion/sketch/gpu/operators/GpuConv2d.h" +#include "arm_compute/dynamic_fusion/sketch/gpu/operators/GpuAdd.h" #include "arm_compute/dynamic_fusion/sketch/gpu/operators/GpuOutput.h" #include "tests/CL/CLAccessor.h" @@ -36,6 +37,7 @@ #include "tests/validation/dynamic_fusion/Utils.h" #include "tests/validation/reference/ConvolutionLayer.h" #include "tests/validation/reference/Permute.h" +#include "tests/validation/reference/ElementwiseOperations.h" using namespace arm_compute::experimental::dynamic_fusion; using namespace arm_compute::test::validation::utils; @@ -137,6 +139,105 @@ TEST_CASE(Conv2d, framework::DatasetMode::ALL) RelativeTolerance<float> tolerance_f32(0.001f); /**< Tolerance value for comparing reference's output against implementation's output for floating point data types */ validate(CLAccessor(t_dst), ref_t_dst_nchw, tolerance_f32); } +TEST_CASE(Add_Output_Add_Output, framework::DatasetMode::ALL) +{ + /* Computation: + * out_0 = in_0 + in_1 + * out_1 = out_0 + in_2 + */ + CLScheduler::get().default_reinit(); + + const auto data_type = DataType::F32; + const auto t_input_shape = TensorShape(8, 2, 1); + + // Create a new workload sketch + auto cl_compile_ctx = CLKernelLibrary::get().get_compile_context(); + auto gpu_ctx = GpuWorkloadContext{ &cl_compile_ctx }; + GpuWorkloadSketch sketch{ &gpu_ctx }; + + auto in_0_info = sketch.create_tensor_info(t_input_shape, 1, data_type); + auto in_1_info = sketch.create_tensor_info(t_input_shape, 1, data_type); + auto in_2_info = sketch.create_tensor_info(t_input_shape, 
1, data_type); + + auto out_0_info = sketch.create_tensor_info(); + auto out_1_info = sketch.create_tensor_info(); + + auto ans_0_info = sketch.create_tensor_info(); + auto ans_1_info = sketch.create_tensor_info(); + + GpuAdd::create_op(sketch, &in_0_info, &in_1_info, &ans_0_info); + GpuOutput::create_op(sketch, &ans_0_info, &out_0_info); + GpuAdd::create_op(sketch, &ans_0_info, &in_2_info, &ans_1_info); + GpuOutput::create_op(sketch, &ans_1_info, &out_1_info); + + // Configure runtime + ClWorkloadRuntime runtime; + runtime.configure(sketch); + + // (Important) Allocate auxiliary tensor memory if there are any + // Instead of using ACL allocated memory, the user can choose to import memory into the tensors + for(auto &data : runtime.get_auxiliary_tensors()) + { + CLTensor *tensor = data.first; + AuxMemoryInfo aux_mem_req = data.second; + tensor->allocator()->init(*data.first->info(), aux_mem_req.alignment); + tensor->allocator()->allocate(); // Use ACL allocated memory + // auto buf = cl::Buffer(); + // tensor->allocator()->import_memory(buf); // Or, import external memory + } + + // Construct user tensors + CLTensor t_in_0{}; + CLTensor t_in_1{}; + CLTensor t_in_2{}; + + CLTensor t_out_0{}; + CLTensor t_out_1{}; + + // Initialize user tensors + t_in_0.allocator()->init(in_0_info); + t_in_1.allocator()->init(in_1_info); + t_in_2.allocator()->init(in_2_info); + + t_out_0.allocator()->init(out_0_info); + t_out_1.allocator()->init(out_1_info); + + // Allocate and fill user tensors + // Instead of using ACL allocator, the user can choose to import memory into the tensors + t_in_0.allocator()->allocate(); + t_in_1.allocator()->allocate(); + t_in_2.allocator()->allocate(); + + t_out_0.allocator()->allocate(); + t_out_1.allocator()->allocate(); + + fill<float>(CLAccessor(t_in_0), 0, library.get()); + fill<float>(CLAccessor(t_in_1), 1, library.get()); + fill<float>(CLAccessor(t_in_2), 2, library.get()); + + // Run runtime + runtime.run({ &t_in_0, &t_in_1, &t_in_2, 
&t_out_0, &t_out_1 }); + + // Create reference + SimpleTensor<float> ref_t_in_0{ t_input_shape, data_type, 1, QuantizationInfo() }; + SimpleTensor<float> ref_t_in_1{ t_input_shape, data_type, 1, QuantizationInfo() }; + SimpleTensor<float> ref_t_in_2{ t_input_shape, data_type, 1, QuantizationInfo() }; + + SimpleTensor<float> ref_t_out_0{ t_input_shape, data_type, 1, QuantizationInfo() }; + SimpleTensor<float> ref_t_out_1{ t_input_shape, data_type, 1, QuantizationInfo() }; + + // Fill reference + fill<float>(ref_t_in_0, 0, library.get()); + fill<float>(ref_t_in_1, 1, library.get()); + fill<float>(ref_t_in_2, 2, library.get()); + + reference::arithmetic_operation(ArithmeticOperation::ADD, ref_t_in_0, ref_t_in_1, ref_t_out_0, ConvertPolicy::WRAP); + reference::arithmetic_operation(ArithmeticOperation::ADD, ref_t_out_0, ref_t_in_2, ref_t_out_1, ConvertPolicy::WRAP); + + RelativeTolerance<float> tolerance_f32(0.001f); /**< Tolerance value for comparing reference's output against implementation's output for floating point data types */ + validate(CLAccessor(t_out_0), ref_t_out_0, tolerance_f32); + validate(CLAccessor(t_out_1), ref_t_out_1, tolerance_f32); +} TEST_SUITE(Invalid_Fusion_Should_Fail) TEST_CASE(Multiple_Complex_Ops_0, framework::DatasetMode::ALL) { |