author    | Viet-Hoa Do <viet-hoa.do@arm.com> | 2022-12-16 14:45:57 +0000
committer | Viet-Hoa Do <viet-hoa.do@arm.com> | 2022-12-30 13:59:23 +0000
commit    | 3558c5840e7c973e2b1a86ae3a9335b44cad59d4 (patch)
tree      | b5f14b344ff8bc03e5143a54a5f3480263db543e /tests
parent    | 9d3bd41030366326e9c8afe5db3a5812a76b135b (diff)
download  | ComputeLibrary-3558c5840e7c973e2b1a86ae3a9335b44cad59d4.tar.gz
Add temporary tile support for dynamic fusion
* Multiple intermediate tensors can share the same tile.
  - A simple operator can reuse the input tensor's tile for its result
    if the input tensor has the same shape and data type and is
    consumed only by that operator (see the sketch after this list).
  - A special case is when a simple operator and an output operator
    consume the same tensor. Since the output operator doesn't change
    the content of the input tensor, it doesn't count as "consuming"
    the input tensor.
* These temporary tiles are declared automatically by the template
  writer. Individual operators no longer need to generate their own
  output tile declarations.
* Cast is now a simple operator.
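A rough, hypothetical sketch of the reuse rule above (the types and the
can_share_tile helper are illustrative only; they are not part of the
library or of the template writer):

// Hypothetical illustration of the tile-reuse rule, not library code.
#include <cstddef>
#include <vector>

enum class ElemType { F32, S32 };

struct TensorDesc
{
    std::vector<std::size_t> shape;
    ElemType                 type;
    std::size_t              consumers;        // operators reading this tensor
    std::size_t              output_consumers; // of which are output operators (data left unchanged)
};

// A simple operator may write its result into the input tensor's tile when
// both tensors match and, apart from output operators, nothing else reads
// the input afterwards.
bool can_share_tile(const TensorDesc &input, const TensorDesc &result)
{
    const std::size_t real_consumers = input.consumers - input.output_consumers;
    return input.shape == result.shape && input.type == result.type && real_consumers <= 1;
}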
Resolves: COMPMID-5778
Signed-off-by: Viet-Hoa Do <viet-hoa.do@arm.com>
Change-Id: I232647ac976645e2d266a62e055b9eb48c356a8e
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/8877
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Gunes Bayir <gunes.bayir@arm.com>
Reviewed-by: Gian Marco Iodice <gianmarco.iodice@arm.com>
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Benchmark: Arm Jenkins <bsgcomp@arm.com>
Diffstat (limited to 'tests')
-rw-r--r-- | tests/validation/dynamic_fusion/gpu/Integration.cpp | 117
1 file changed, 116 insertions(+), 1 deletion(-)
diff --git a/tests/validation/dynamic_fusion/gpu/Integration.cpp b/tests/validation/dynamic_fusion/gpu/Integration.cpp
index a5716ce1e1..0a689fa4b6 100644
--- a/tests/validation/dynamic_fusion/gpu/Integration.cpp
+++ b/tests/validation/dynamic_fusion/gpu/Integration.cpp
@@ -26,9 +26,11 @@
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/dynamic_fusion/runtime/gpu/cl/ClWorkloadRuntime.h"
 #include "arm_compute/dynamic_fusion/sketch/OperatorAttributes.h"
+#include "arm_compute/dynamic_fusion/sketch/attributes/CastAttributes.h"
 #include "arm_compute/dynamic_fusion/sketch/gpu/GpuWorkloadSketch.h"
 #include "arm_compute/dynamic_fusion/sketch/gpu/operators/GpuConv2d.h"
 #include "arm_compute/dynamic_fusion/sketch/gpu/operators/GpuAdd.h"
+#include "arm_compute/dynamic_fusion/sketch/gpu/operators/GpuCast.h"
 #include "arm_compute/dynamic_fusion/sketch/gpu/operators/GpuOutput.h"
 
 #include "tests/CL/CLAccessor.h"
@@ -38,6 +40,7 @@
 #include "tests/validation/reference/ConvolutionLayer.h"
 #include "tests/validation/reference/Permute.h"
 #include "tests/validation/reference/ElementwiseOperations.h"
+#include "tests/validation/reference/DepthConvertLayer.h"
 
 using namespace arm_compute::experimental::dynamic_fusion;
 using namespace arm_compute::test::validation::utils;
@@ -148,7 +151,7 @@ TEST_CASE(Add_Output_Add_Output, framework::DatasetMode::ALL)
     CLScheduler::get().default_reinit();
 
     const auto data_type      = DataType::F32;
-    const auto t_input_shape  = TensorShape(8, 2, 1);
+    const auto t_input_shape  = TensorShape(33, 3, 2);
 
     // Create a new workload sketch
     auto              cl_compile_ctx = CLKernelLibrary::get().get_compile_context();
@@ -238,6 +241,118 @@ TEST_CASE(Add_Output_Add_Output, framework::DatasetMode::ALL)
     validate(CLAccessor(t_out_0), ref_t_out_0, tolerance_f32);
     validate(CLAccessor(t_out_1), ref_t_out_1, tolerance_f32);
 }
+TEST_CASE(Add_Output_Add_Cast_Cast_Output, framework::DatasetMode::ALL)
+{
+    /* Computation:
+     * out_0 = in_0 + in_1
+     * out_1 = float(int32_t(out_0 + in_2))
+     */
+    CLScheduler::get().default_reinit();
+
+    const auto data_type     = DataType::F32;
+    const auto t_input_shape = TensorShape(3, 8, 5);
+
+    // Create a new workload sketch
+    auto              cl_compile_ctx = CLKernelLibrary::get().get_compile_context();
+    auto              gpu_ctx        = GpuWorkloadContext{ &cl_compile_ctx };
+    GpuWorkloadSketch sketch{ &gpu_ctx };
+
+    auto in_0_info = sketch.create_tensor_info(t_input_shape, 1, data_type);
+    auto in_1_info = sketch.create_tensor_info(t_input_shape, 1, data_type);
+    auto in_2_info = sketch.create_tensor_info(t_input_shape, 1, data_type);
+
+    auto out_0_info = sketch.create_tensor_info();
+    auto out_1_info = sketch.create_tensor_info();
+
+    auto ans_0_info = sketch.create_tensor_info();
+    auto ans_1_info = sketch.create_tensor_info();
+    auto ans_2_info = sketch.create_tensor_info();
+    auto ans_3_info = sketch.create_tensor_info();
+
+    CastAttributes cast_0_attr;
+    cast_0_attr.data_type(DataType::S32).convert_policy(ConvertPolicy::SATURATE);
+
+    CastAttributes cast_1_attr;
+    cast_1_attr.data_type(DataType::F32).convert_policy(ConvertPolicy::SATURATE);
+
+    GpuAdd::create_op(sketch, &in_0_info, &in_1_info, &ans_0_info);
+    GpuOutput::create_op(sketch, &ans_0_info, &out_0_info);
+    GpuAdd::create_op(sketch, &ans_0_info, &in_2_info, &ans_1_info);
+    GpuCast::create_op(sketch, &ans_1_info, &ans_2_info, cast_0_attr);
+    GpuCast::create_op(sketch, &ans_2_info, &ans_3_info, cast_1_attr);
+    GpuOutput::create_op(sketch, &ans_3_info, &out_1_info);
+
+    // Configure runtime
+    ClWorkloadRuntime runtime;
+    runtime.configure(sketch);
+
+    // (Important) Allocate auxiliary tensor memory if there are any
+    // Instead of using ACL allocated memory, the user can choose to import memory into the tensors
+    for(auto &data : runtime.get_auxiliary_tensors())
+    {
+        CLTensor     *tensor      = data.first;
+        AuxMemoryInfo aux_mem_req = data.second;
+        tensor->allocator()->init(*data.first->info(), aux_mem_req.alignment);
+        tensor->allocator()->allocate(); // Use ACL allocated memory
+        // auto buf = cl::Buffer();
+        // tensor->allocator()->import_memory(buf); // Or, import external memory
+    }
+
+    // Construct user tensors
+    CLTensor t_in_0{};
+    CLTensor t_in_1{};
+    CLTensor t_in_2{};
+
+    CLTensor t_out_0{};
+    CLTensor t_out_1{};
+
+    // Initialize user tensors
+    t_in_0.allocator()->init(in_0_info);
+    t_in_1.allocator()->init(in_1_info);
+    t_in_2.allocator()->init(in_2_info);
+
+    t_out_0.allocator()->init(out_0_info);
+    t_out_1.allocator()->init(out_1_info);
+
+    // Allocate and fill user tensors
+    // Instead of using ACL allocator, the user can choose to import memory into the tensors
+    t_in_0.allocator()->allocate();
+    t_in_1.allocator()->allocate();
+    t_in_2.allocator()->allocate();
+
+    t_out_0.allocator()->allocate();
+    t_out_1.allocator()->allocate();
+
+    fill<float>(CLAccessor(t_in_0), 0, library.get());
+    fill<float>(CLAccessor(t_in_1), 1, library.get());
+    fill<float>(CLAccessor(t_in_2), 2, library.get());
+
+    // Run runtime
+    runtime.run({ &t_in_0, &t_in_1, &t_in_2, &t_out_0, &t_out_1 });
+
+    // Create reference
+    SimpleTensor<float> ref_t_in_0{ t_input_shape, data_type, 1, QuantizationInfo() };
+    SimpleTensor<float> ref_t_in_1{ t_input_shape, data_type, 1, QuantizationInfo() };
+    SimpleTensor<float> ref_t_in_2{ t_input_shape, data_type, 1, QuantizationInfo() };
+
+    SimpleTensor<float> ref_t_out_0{ t_input_shape, data_type, 1, QuantizationInfo() };
+    SimpleTensor<float> ref_t_ans_1{ t_input_shape, data_type, 1, QuantizationInfo() };
+
+    // Fill reference
+    fill<float>(ref_t_in_0, 0, library.get());
+    fill<float>(ref_t_in_1, 1, library.get());
+    fill<float>(ref_t_in_2, 2, library.get());
+
+    reference::arithmetic_operation(ArithmeticOperation::ADD, ref_t_in_0, ref_t_in_1, ref_t_out_0, ConvertPolicy::WRAP);
+    reference::arithmetic_operation(ArithmeticOperation::ADD, ref_t_out_0, ref_t_in_2, ref_t_ans_1, ConvertPolicy::WRAP);
+    const auto ref_t_ans_2 = reference::depth_convert<float, int32_t>(ref_t_ans_1, DataType::S32, ConvertPolicy::SATURATE, 0);
+    const auto ref_t_out_1 = reference::depth_convert<int32_t, float>(ref_t_ans_2, DataType::F32, ConvertPolicy::SATURATE, 0);
+
+    RelativeTolerance<float> tolerance_add_f32(0.001f);
+    AbsoluteTolerance<float> tolerance_cast_f32(1.0f);
+    validate(CLAccessor(t_out_0), ref_t_out_0, tolerance_add_f32);
+    validate(CLAccessor(t_out_1), ref_t_out_1, tolerance_cast_f32);
+}
 TEST_SUITE(Invalid_Fusion_Should_Fail)
 TEST_CASE(Multiple_Complex_Ops_0, framework::DatasetMode::ALL)
 {