Diffstat (limited to 'tests/validation/dynamic_fusion/gpu/Integration.cpp')
-rw-r--r--  tests/validation/dynamic_fusion/gpu/Integration.cpp  117
1 file changed, 116 insertions, 1 deletion
diff --git a/tests/validation/dynamic_fusion/gpu/Integration.cpp b/tests/validation/dynamic_fusion/gpu/Integration.cpp
index a5716ce1e1..0a689fa4b6 100644
--- a/tests/validation/dynamic_fusion/gpu/Integration.cpp
+++ b/tests/validation/dynamic_fusion/gpu/Integration.cpp
@@ -26,9 +26,11 @@
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/dynamic_fusion/runtime/gpu/cl/ClWorkloadRuntime.h"
#include "arm_compute/dynamic_fusion/sketch/OperatorAttributes.h"
+#include "arm_compute/dynamic_fusion/sketch/attributes/CastAttributes.h"
#include "arm_compute/dynamic_fusion/sketch/gpu/GpuWorkloadSketch.h"
#include "arm_compute/dynamic_fusion/sketch/gpu/operators/GpuConv2d.h"
#include "arm_compute/dynamic_fusion/sketch/gpu/operators/GpuAdd.h"
+#include "arm_compute/dynamic_fusion/sketch/gpu/operators/GpuCast.h"
#include "arm_compute/dynamic_fusion/sketch/gpu/operators/GpuOutput.h"
#include "tests/CL/CLAccessor.h"
@@ -38,6 +40,7 @@
#include "tests/validation/reference/ConvolutionLayer.h"
#include "tests/validation/reference/Permute.h"
#include "tests/validation/reference/ElementwiseOperations.h"
+#include "tests/validation/reference/DepthConvertLayer.h"
using namespace arm_compute::experimental::dynamic_fusion;
using namespace arm_compute::test::validation::utils;
@@ -148,7 +151,7 @@ TEST_CASE(Add_Output_Add_Output, framework::DatasetMode::ALL)
CLScheduler::get().default_reinit();
const auto data_type = DataType::F32;
- const auto t_input_shape = TensorShape(8, 2, 1);
+ const auto t_input_shape = TensorShape(33, 3, 2);
// Create a new workload sketch
auto cl_compile_ctx = CLKernelLibrary::get().get_compile_context();
@@ -238,6 +241,118 @@ TEST_CASE(Add_Output_Add_Output, framework::DatasetMode::ALL)
validate(CLAccessor(t_out_0), ref_t_out_0, tolerance_f32);
validate(CLAccessor(t_out_1), ref_t_out_1, tolerance_f32);
}
+TEST_CASE(Add_Output_Add_Cast_Cast_Output, framework::DatasetMode::ALL)
+{
+ /* Computation:
+ * out_0 = in_0 + in_1
+ * out_1 = float(int32_t(out_0 + in_2))
+ */
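+ // (Note) Relative to out_0 + in_2, out_1 loses its fractional part in the
+ // saturating F32 -> S32 -> F32 round trip.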
+ CLScheduler::get().default_reinit();
+
+ const auto data_type = DataType::F32;
+ const auto t_input_shape = TensorShape(3, 8, 5);
+
+ // Create a new workload sketch
+ auto cl_compile_ctx = CLKernelLibrary::get().get_compile_context();
+ auto gpu_ctx = GpuWorkloadContext{ &cl_compile_ctx };
+ GpuWorkloadSketch sketch{ &gpu_ctx };
+
+ auto in_0_info = sketch.create_tensor_info(t_input_shape, 1, data_type);
+ auto in_1_info = sketch.create_tensor_info(t_input_shape, 1, data_type);
+ auto in_2_info = sketch.create_tensor_info(t_input_shape, 1, data_type);
+
+ auto out_0_info = sketch.create_tensor_info();
+ auto out_1_info = sketch.create_tensor_info();
+
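+ // Intermediate (ans_*) tensor infos are created empty; each create_op below is
+ // expected to deduce their shape and data type from its inputs.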
+ auto ans_0_info = sketch.create_tensor_info();
+ auto ans_1_info = sketch.create_tensor_info();
+ auto ans_2_info = sketch.create_tensor_info();
+ auto ans_3_info = sketch.create_tensor_info();
+
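+ // cast_0 truncates F32 -> S32 and cast_1 converts back to F32; SATURATE clamps
+ // values outside the S32 range during the float-to-integer conversion.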
+ CastAttributes cast_0_attr;
+ cast_0_attr.data_type(DataType::S32).convert_policy(ConvertPolicy::SATURATE);
+
+ CastAttributes cast_1_attr;
+ cast_1_attr.data_type(DataType::F32).convert_policy(ConvertPolicy::SATURATE);
+
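+ // Build the fused operator graph; GpuOutput marks ans_0 and ans_3 as the
+ // user-visible outputs out_0 and out_1.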
+ GpuAdd::create_op(sketch, &in_0_info, &in_1_info, &ans_0_info);
+ GpuOutput::create_op(sketch, &ans_0_info, &out_0_info);
+ GpuAdd::create_op(sketch, &ans_0_info, &in_2_info, &ans_1_info);
+ GpuCast::create_op(sketch, &ans_1_info, &ans_2_info, cast_0_attr);
+ GpuCast::create_op(sketch, &ans_2_info, &ans_3_info, cast_1_attr);
+ GpuOutput::create_op(sketch, &ans_3_info, &out_1_info);
+
+ // Configure runtime
+ ClWorkloadRuntime runtime;
+ runtime.configure(sketch);
+
+ // (Important) Allocate auxiliary tensor memory if there are any
+ // Instead of using ACL-allocated memory, the user can choose to import memory into the tensors
+ for(auto &data : runtime.get_auxiliary_tensors())
+ {
+ CLTensor *tensor = data.first;
+ AuxMemoryInfo aux_mem_req = data.second;
+ tensor->allocator()->init(*data.first->info(), aux_mem_req.alignment);
+ tensor->allocator()->allocate(); // Use ACL-allocated memory
+ // auto buf = cl::Buffer();
+ // tensor->allocator()->import_memory(buf); // Or, import external memory
+ }
+
+ // Construct user tensors
+ CLTensor t_in_0{};
+ CLTensor t_in_1{};
+ CLTensor t_in_2{};
+
+ CLTensor t_out_0{};
+ CLTensor t_out_1{};
+
+ // Initialize user tensors
+ t_in_0.allocator()->init(in_0_info);
+ t_in_1.allocator()->init(in_1_info);
+ t_in_2.allocator()->init(in_2_info);
+
+ t_out_0.allocator()->init(out_0_info);
+ t_out_1.allocator()->init(out_1_info);
+
+ // Allocate and fill user tensors
+ // Instead of using the ACL allocator, the user can choose to import memory into the tensors
+ t_in_0.allocator()->allocate();
+ t_in_1.allocator()->allocate();
+ t_in_2.allocator()->allocate();
+
+ t_out_0.allocator()->allocate();
+ t_out_1.allocator()->allocate();
+
+ fill<float>(CLAccessor(t_in_0), 0, library.get());
+ fill<float>(CLAccessor(t_in_1), 1, library.get());
+ fill<float>(CLAccessor(t_in_2), 2, library.get());
+
+ // Run runtime
+ runtime.run({ &t_in_0, &t_in_1, &t_in_2, &t_out_0, &t_out_1 });
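+ // (Note) The tensor order here is assumed to match the creation order of the
+ // corresponding tensor infos in the sketch.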
+
+ // Create reference
+ SimpleTensor<float> ref_t_in_0{ t_input_shape, data_type, 1, QuantizationInfo() };
+ SimpleTensor<float> ref_t_in_1{ t_input_shape, data_type, 1, QuantizationInfo() };
+ SimpleTensor<float> ref_t_in_2{ t_input_shape, data_type, 1, QuantizationInfo() };
+
+ SimpleTensor<float> ref_t_out_0{ t_input_shape, data_type, 1, QuantizationInfo() };
+ SimpleTensor<float> ref_t_ans_1{ t_input_shape, data_type, 1, QuantizationInfo() };
+
+ // Fill reference
+ fill<float>(ref_t_in_0, 0, library.get());
+ fill<float>(ref_t_in_1, 1, library.get());
+ fill<float>(ref_t_in_2, 2, library.get());
+
+ reference::arithmetic_operation(ArithmeticOperation::ADD, ref_t_in_0, ref_t_in_1, ref_t_out_0, ConvertPolicy::WRAP);
+ reference::arithmetic_operation(ArithmeticOperation::ADD, ref_t_out_0, ref_t_in_2, ref_t_ans_1, ConvertPolicy::WRAP);
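+ // Mirror the fused F32 -> S32 -> F32 cast round trip with two reference depth converts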
+ const auto ref_t_ans_2 = reference::depth_convert<float, int32_t>(ref_t_ans_1, DataType::S32, ConvertPolicy::SATURATE, 0);
+ const auto ref_t_out_1 = reference::depth_convert<int32_t, float>(ref_t_ans_2, DataType::F32, ConvertPolicy::SATURATE, 0);
+
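+ // After truncation to S32, the GPU and reference results can land on opposite
+ // sides of an integer boundary, so out_1 uses an absolute tolerance of 1.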
+ RelativeTolerance<float> tolerance_add_f32(0.001f);
+ AbsoluteTolerance<float> tolerance_cast_f32(1.0f);
+ validate(CLAccessor(t_out_0), ref_t_out_0, tolerance_add_f32);
+ validate(CLAccessor(t_out_1), ref_t_out_1, tolerance_cast_f32);
+}
TEST_SUITE(Invalid_Fusion_Should_Fail)
TEST_CASE(Multiple_Complex_Ops_0, framework::DatasetMode::ALL)
{