From b63b1196adea8b07dd8db77c2492a212650deba0 Mon Sep 17 00:00:00 2001 From: SiCong Li Date: Fri, 28 Jan 2022 18:24:39 +0000 Subject: Integrate Dynamic Fusion patches * Add public interfaces: * OperatorGraph: Describe a workload that could contain fused kernels * IWorkload: Generic interface for workloads built from OperatorGraph * ClWorkload: OpenCL workloads built from OperatorGraph * ClCompositeOperator: Runtime async operator to execute a ClWorkload * DependencyGraph (will likely be deprecated in later iterations) * Add example * cl_fused_conv2d_elementwise_add.cpp to explain how to use the new interfaces * Add internal translation layer * Refactor ClKernelBuildingAPI * Remove non-tile based gemm native kernel component * Minor interface changes * Add integration tests Resolves COMPMID-5161 Signed-off-by: SiCong Li Change-Id: Ib987ed79289ab0bcbd3130d54f5793408d9f1240 Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/7510 Reviewed-by: Gian Marco Iodice Reviewed-by: Gunes Bayir Comments-Addressed: Arm Jenkins Tested-by: Arm Jenkins --- .../CL/UNIT/dynamic_fusion/ClCompositeKernel.cpp | 515 +++------------------ .../CL/UNIT/dynamic_fusion/DependencyGraph.cpp | 267 +++++++++++ .../Integration_OperatorFuseMovenetSubGraph1.cpp | 403 ++++++++++++++++ tests/validation/CL/UNIT/dynamic_fusion/Utils.h | 71 +++ 4 files changed, 793 insertions(+), 463 deletions(-) create mode 100644 tests/validation/CL/UNIT/dynamic_fusion/DependencyGraph.cpp create mode 100644 tests/validation/CL/UNIT/dynamic_fusion/Integration_OperatorFuseMovenetSubGraph1.cpp create mode 100644 tests/validation/CL/UNIT/dynamic_fusion/Utils.h (limited to 'tests/validation/CL/UNIT/dynamic_fusion') diff --git a/tests/validation/CL/UNIT/dynamic_fusion/ClCompositeKernel.cpp b/tests/validation/CL/UNIT/dynamic_fusion/ClCompositeKernel.cpp index 9e1b4d897b..a6b09ccdea 100644 --- a/tests/validation/CL/UNIT/dynamic_fusion/ClCompositeKernel.cpp +++ b/tests/validation/CL/UNIT/dynamic_fusion/ClCompositeKernel.cpp @@ -21,9 +21,12 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ -#if defined(ENABLE_EXPERIMENTAL_DYNAMIC_FUSION) +#ifndef ENABLE_EXPERIMENTAL_DYNAMIC_FUSION +#error "This experimental feature must be enabled with -DENABLE_EXPERIMENTAL_DYNAMIC_FUSION" +#endif /* ENABLE_EXPERIMENTAL_DYNAMIC_FUSION */ #include "src/gpu/cl/kernels/experimental/dynamic_fusion/ClCompositeKernel.h" +#include "src/core/experimental/dynamic_fusion/ClKernelBuildingAPI.h" #include "src/core/utils/helpers/float_ops.h" #include "src/gpu/cl/kernels/ClElementwiseKernel.h" @@ -42,9 +45,12 @@ #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" +#include "tests/validation/CL/UNIT/dynamic_fusion/Utils.h" + #include using namespace arm_compute::experimental::dynamic_fusion; +using namespace arm_compute::test::validation::utils; namespace arm_compute { @@ -52,149 +58,12 @@ namespace test { namespace validation { -namespace -{ -/** Macros which measures the wall clock time, and records it into a map measurement_map with name clock_name */ -#define TICK(clock_name) \ - auto clock_name##_tick = std::chrono::high_resolution_clock::now(); -#define TOCK(clock_name, measurement_map) \ - auto clock_name##_tock = std::chrono::high_resolution_clock::now(); \ - measurement_map["\"" #clock_name "\""] = duration_cast(clock_name##_tock - clock_name##_tick); -#define TOCK_AVG(clock_name, measurement_map, num_iterations) \ - auto clock_name##_tock = std::chrono::high_resolution_clock::now(); \ - measurement_map["\"" #clock_name "\""] = duration_cast((clock_name##_tock - clock_name##_tick) / (num_iterations)); - -template -void fill(U &&tensor, int seed) -{ - static_assert(std::is_floating_point::value || std::is_same::value, "Only floating point data types supported."); - using DistributionType = typename std::conditional::value, arm_compute::utils::uniform_real_distribution_16bit, std::uniform_real_distribution>::type; - - DistributionType distribution{ T(-1.0f), T(1.0f) }; - library->fill(tensor, distribution, seed); - - // Fill border with infinity in order to check the presence of NaN values (i.e. 
inf * 0) - DistributionType distribution_inf{ T(std::numeric_limits::infinity()), T(std::numeric_limits::infinity()) }; - library->fill_borders_with_garbage(tensor, distribution_inf, seed); -} -} // namespace - TEST_SUITE(CL) TEST_SUITE(UNIT) TEST_SUITE(DYNAMIC_FUSION) TEST_SUITE(ClCompositeKernel) TEST_SUITE(Validate) -TEST_CASE(MoveNet_SubGraph_1_Gemm, framework::DatasetMode::ALL) -{ - /* Computation: - * out = add(addend, gemm_native(lhs, rhs, bias)) (non-broadcast) - */ - const auto data_type = DataType::F32; - const auto m = 5U; - const auto n = 4U; - const auto k = 3U; - const auto t_lhs_shape = TensorShape(k, m); - const auto t_rhs_shape = TensorShape(n, k); - const auto t_dst_shape = TensorShape(n, m); - auto t_lhs_info = TensorInfo(t_lhs_shape, 1, data_type); - auto t_rhs_info = TensorInfo(t_rhs_shape, 1, data_type); - auto t_bias_info = TensorInfo(TensorShape(), 1, DataType::F32); - auto t_dst_info = TensorInfo(t_dst_shape, 1, data_type); - - const ClTensorDescriptor t_lhs_desc{ &t_lhs_info }; - const ClTensorDescriptor t_rhs_desc{ &t_rhs_info }; - const ClTensorDescriptor t_bias_desc{ &t_bias_info }; - const ClTensorDescriptor t_addend_desc{ &t_dst_info }; - const ClTensorDescriptor t_dst_desc{ &t_dst_info }; - - ClKernelBlueprint bp; - ArgumentID tid_lhs; - ArgumentID tid_rhs; - ArgumentID tid_l0_bias = g_arg_placeholder; - ArgumentID tid_l1_addend; - ArgumentID tid_dst; - auto st = add_tensor_argument(bp, t_lhs_desc, tid_lhs); - st = add_tensor_argument(bp, t_rhs_desc, tid_rhs); - st = add_tensor_argument(bp, t_addend_desc, tid_l1_addend); - st = add_tensor_argument(bp, t_dst_desc, tid_dst); - - const auto common_kernel_desc = ClKernelComponentDescriptor{}; - const GemmNativeDescriptor gemm_native_desc{ 1.0, 1.0, m, n, k }; - const GEMMKernelInfo gemm_info{ m, n, k, 0, false, false, false, false, ActivationLayerInfo{}, 1, 1, gemm_native_desc.lhs_info, gemm_native_desc.rhs_info, 0, 0 }; - const EltwiseAddDescriptor eltwise_add_desc{ ConvertPolicy::WRAP }; - const TileDescriptor store_tile_info{ Size2D(gemm_info.rhs_info.n0, gemm_info.lhs_info.m0), Size2D(gemm_info.n, gemm_info.m), ClippingStrategy::TOP_LEFT }; - - ArgumentID tid_acc; - st = add_tensor_intermed(bp, tid_acc); - st = add_kcomp_gemm_native(bp, common_kernel_desc, gemm_native_desc, tid_lhs, tid_rhs, tid_l0_bias, tid_acc); - st = add_kcomp_eltwise_add(bp, common_kernel_desc, EltwiseAddDescriptor{}, tid_l1_addend, tid_acc, tid_acc); - st = add_kcomp_store(bp, common_kernel_desc, tid_acc, tid_dst, StoreType::StoreBlockBoundaryAware); - - ClKernelCode cl_code; - - st = set_tile_info(bp, store_tile_info); - st = build(cl_code, ClCodeBuilderContext{ GpuInfo{ GPUTarget::G71 } }, bp); - - ClExecutionDescriptor exec_desc{}; - st = tune_static(exec_desc, cl_code); - - CLScheduler::get().default_reinit(); - ClCompositeKernel kernel; - kernel.configure(CLKernelLibrary::get().get_compile_context(), cl_code); - - // Construct tensors - CLTensor t_lhs{}; - CLTensor t_rhs{}; - CLTensor t_l1_addend{}; - CLTensor t_dst{}; - // Init tensors - { - t_lhs.allocator()->init(t_lhs_info); - t_rhs.allocator()->init(t_rhs_info); - t_l1_addend.allocator()->init(t_dst_info); - t_dst.allocator()->init(t_dst_info); - } - // "Pack" tensors - TensorBinding tensors({ { tid_lhs, &t_lhs }, - { tid_rhs, &t_rhs }, - { tid_l1_addend, &t_l1_addend }, - { tid_dst, &t_dst } - }); - // Allocate and fill tensors - { - t_lhs.allocator()->allocate(); - t_rhs.allocator()->allocate(); - t_l1_addend.allocator()->allocate(); - t_dst.allocator()->allocate(); - 
fill(CLAccessor(t_lhs), 0); - fill(CLAccessor(t_rhs), 1); - fill(CLAccessor(t_l1_addend), 2); - } - - CLScheduler::get().enqueue_op(kernel, tensors, exec_desc, true); - - // Create reference - SimpleTensor ref_t_lhs{ t_lhs_shape, data_type, 1 }; - SimpleTensor ref_t_rhs{ t_rhs_shape, data_type, 1 }; - SimpleTensor ref_t_bias_placeholder{ t_dst_shape, data_type, 1 }; - SimpleTensor ref_t_l1_addend{ t_dst_shape, data_type, 1 }; - - // Fill reference - fill(ref_t_lhs, 0); - fill(ref_t_rhs, 1); - fill(ref_t_l1_addend, 2); - const auto ref_t_dst = reference::arithmetic_operation( - ArithmeticOperation::ADD, - ref_t_l1_addend, - reference::gemm(ref_t_lhs, ref_t_rhs, ref_t_bias_placeholder, gemm_native_desc.alpha, 0.f /* To disable bias */), - data_type, - eltwise_add_desc.convert_policy); - - RelativeTolerance tolerance_f32(0.001f); /**< Tolerance value for comparing reference's output against implementation's output for floating point data types */ - validate(CLAccessor(t_dst), ref_t_dst, tolerance_f32); -} - TEST_CASE(MoveNet_SubGraph_1_DirectConv2d, framework::DatasetMode::ALL) { /* Computation: @@ -208,7 +77,7 @@ TEST_CASE(MoveNet_SubGraph_1_DirectConv2d, framework::DatasetMode::ALL) Status st{}; const auto data_type = DataType::F32; - const auto conv_info = PadStrideInfo(1U, 1U, 1U, 1U); + const auto conv_info = Conv2dDescriptor{ Padding2D{ 1U, 1U, 1U, 1U }, { 1U, 1U } /* stride */ }; const auto width = 7U; const auto height = 6U; @@ -216,47 +85,44 @@ TEST_CASE(MoveNet_SubGraph_1_DirectConv2d, framework::DatasetMode::ALL) const auto OFM = 4U; const auto kernel_sz = 3U; - const auto src_shape = TensorShape(IFM, width, height); - const auto wei_shape = TensorShape(IFM, kernel_sz, kernel_sz, OFM); - const auto bia_shape = TensorShape(OFM); - const auto dst_shape = TensorShape(OFM, width, height); + const auto src_shape = TensorShape(IFM, width, height); + const auto wei_shape = TensorShape(IFM, kernel_sz, kernel_sz, OFM); + const auto bia_shape = TensorShape(OFM); + const auto addend_shape = TensorShape(1, 1); + const auto dst_shape = TensorShape(OFM, width, height); - auto src_info = TensorInfo(src_shape, 1, data_type, DataLayout::NHWC); - auto wei_info = TensorInfo(wei_shape, 1, data_type, DataLayout::NHWC); - auto bia_info = TensorInfo(bia_shape, 1, data_type, DataLayout::NHWC); - auto dst_info = TensorInfo(dst_shape, 1, data_type, DataLayout::NHWC); - - const auto src_desc = ClTensorDescriptor(&src_info); - const auto wei_desc = ClTensorDescriptor(&wei_info); - const auto bia_desc = ClTensorDescriptor(&bia_info); - const auto addend_desc = ClTensorDescriptor(&dst_info); - const auto dst_desc = ClTensorDescriptor(&dst_info); + auto src_info = TensorInfo(src_shape, 1, data_type, DataLayout::NHWC); + auto wei_info = TensorInfo(wei_shape, 1, data_type, DataLayout::NHWC); + auto bia_info = TensorInfo(bia_shape, 1, data_type, DataLayout::NHWC); + auto addend_info = TensorInfo(addend_shape, 1, data_type, DataLayout::NHWC); + auto dst_info = TensorInfo(dst_shape, 1, data_type, DataLayout::NHWC); const auto n0 = std::min(OFM, 4u); const auto m0 = (OFM > 16) ? ((data_type == DataType::F32) ? 
2U : 4U) : 1U; - const ClKernelComponentDescriptor common_kernel_desc{}; - const DirectConvolutionDescriptor direct_conv2d_desc{ conv_info }; - const EltwiseAddDescriptor eltwise_add_desc{ ConvertPolicy::WRAP }; - const TileDescriptor store_tile_info{ Size2D(n0, m0), Size2D(width, height), ClippingStrategy::TOP_LEFT }; + const ClDirectConv2dKernelDescriptor direct_conv2d_desc{ conv_info }; + const ClEltwiseAddKernelDescriptor eltwise_add_desc{}; + const TileDescriptor store_tile_info{ Size2D(n0, m0), Size2D(width, height), ClippingStrategy::TOP_LEFT }; ArgumentID src_id{ g_arg_placeholder }; ArgumentID wei_id{ g_arg_placeholder }; ArgumentID bia_id{ g_arg_placeholder }; ArgumentID acc_id{ g_arg_placeholder }; + ArgumentID acc_1_id{ g_arg_placeholder }; ArgumentID addend_id{ g_arg_placeholder }; ArgumentID dst_id{ g_arg_placeholder }; - st = add_tensor_argument(bp, src_desc, src_id); - st = add_tensor_argument(bp, wei_desc, wei_id); - st = add_tensor_argument(bp, bia_desc, bia_id); - st = add_tensor_intermed(bp, acc_id); - st = add_tensor_argument(bp, addend_desc, addend_id); - st = add_tensor_argument(bp, dst_desc, dst_id); + st = add_tensor(bp, &src_info, src_id); + st = add_tensor(bp, &wei_info, wei_id); + st = add_tensor(bp, &bia_info, bia_id); + st = add_tensor(bp, &dst_info, acc_id); + st = add_tensor(bp, &dst_info, acc_1_id); + st = add_tensor(bp, &addend_info, addend_id); + st = add_tensor(bp, &dst_info, dst_id); - st = add_kcomp_direct_conv(bp, common_kernel_desc, direct_conv2d_desc, src_id, wei_id, bia_id, acc_id); - st = add_kcomp_eltwise_add(bp, common_kernel_desc, eltwise_add_desc, addend_id, acc_id, acc_id); - st = add_kcomp_store(bp, common_kernel_desc, acc_id, dst_id, StoreType::TStoreIndirectWidthSelect); + st = add_kcomp_direct_conv2d(bp, direct_conv2d_desc, src_id, wei_id, bia_id, acc_id); + st = add_kcomp_eltwise_add(bp, eltwise_add_desc, addend_id, acc_id, acc_1_id); + st = add_kcomp_store(bp, StoreType::TStoreIndirectWidthSelect, acc_1_id, dst_id); exec_desc.skip_sliding_window = true; @@ -282,12 +148,11 @@ TEST_CASE(MoveNet_SubGraph_1_DirectConv2d, framework::DatasetMode::ALL) dst.allocator()->init(dst_info); // "Pack" tensors - TensorBinding tensors({ { src_id, &src }, + ITensorPack tensors{ { src_id, &src }, { wei_id, &wei }, { bia_id, &bia }, { addend_id, &addend }, - { dst_id, &dst } - }); + { dst_id, &dst } }; // Allocate and fill tensors src.allocator()->allocate(); @@ -296,10 +161,10 @@ TEST_CASE(MoveNet_SubGraph_1_DirectConv2d, framework::DatasetMode::ALL) addend.allocator()->allocate(); dst.allocator()->allocate(); - fill(CLAccessor(src), 0); - fill(CLAccessor(wei), 1); - fill(CLAccessor(bia), 2); - fill(CLAccessor(addend), 3); + fill(CLAccessor(src), 0, library.get()); + fill(CLAccessor(wei), 1, library.get()); + fill(CLAccessor(bia), 2, library.get()); + fill(CLAccessor(addend), 3, library.get()); CLScheduler::get().enqueue_op(kernel, tensors, exec_desc, true); @@ -310,10 +175,10 @@ TEST_CASE(MoveNet_SubGraph_1_DirectConv2d, framework::DatasetMode::ALL) SimpleTensor ref_addend_nhwc{ dst_shape, data_type, 1, QuantizationInfo(), DataLayout::NHWC }; // Fill reference - fill(ref_src_nhwc, 0); - fill(ref_wei_nhwc, 1); - fill(ref_bia_nhwc, 2); - fill(ref_addend_nhwc, 3); + fill(ref_src_nhwc, 0, library.get()); + fill(ref_wei_nhwc, 1, library.get()); + fill(ref_bia_nhwc, 2, library.get()); + fill(ref_addend_nhwc, 3, library.get()); auto ref_src = reference::permute(ref_src_nhwc, PermutationVector(1U, 2U, 0U)); auto ref_wei = reference::permute(ref_wei_nhwc, 
PermutationVector(1U, 2U, 0U)); @@ -326,301 +191,25 @@ TEST_CASE(MoveNet_SubGraph_1_DirectConv2d, framework::DatasetMode::ALL) const auto ref_dst = reference::arithmetic_operation( ArithmeticOperation::ADD, ref_addend, - reference::convolution_layer(ref_src, ref_wei, ref_bia, dst_shape_nchw, conv_info), - data_type, - eltwise_add_desc.convert_policy); + reference::convolution_layer(ref_src, ref_wei, ref_bia, dst_shape_nchw, + PadStrideInfo + { + static_cast(conv_info.stride.x()), + static_cast(conv_info.stride.y()), + static_cast(conv_info.pad.left), + static_cast(conv_info.pad.top) }), + data_type, + ConvertPolicy::SATURATE); RelativeTolerance tolerance_f32(0.001f); /**< Tolerance value for comparing reference's output against implementation's output for floating point data types */ validate(CLAccessor(dst), ref_dst, tolerance_f32); } TEST_SUITE_END() // Validate - -TEST_SUITE(Benchmark) -TEST_CASE(MoveNet_SubGraph_1_Gemm, framework::DatasetMode::ALL) -{ - using std::chrono::duration_cast; - using std::chrono::microseconds; - const int num_iterations = 200; - std::map measurements; - /* Computation: - * out = add(addend, gemm_native(lhs, rhs, bias)) - */ - const auto data_type = DataType::F32; - const auto m = 12U * 12U; - const auto n = 64U; - const auto k = 384U; - const auto t_lhs_shape = TensorShape(k, m); - const auto t_rhs_shape = TensorShape(n, k); - const auto t_dst_shape = TensorShape(n, m); - auto t_lhs_info = TensorInfo(t_lhs_shape, 1, data_type); - auto t_rhs_info = TensorInfo(t_rhs_shape, 1, data_type); - auto t_bias_info = TensorInfo(TensorShape(), 1, data_type); - auto t_l0_dst_info = TensorInfo(t_dst_shape, 1, data_type); // Intermediate tensor for cond3 - auto t_l1_rhs_info = TensorInfo(t_dst_shape, 1, data_type); - auto t_dst_info = TensorInfo(t_dst_shape, 1, data_type); - - const auto common_kernel_desc = ClKernelComponentDescriptor{}; - const GemmNativeDescriptor gemm_native_desc{ 1.0, 0.0, m, n, k }; - const GEMMKernelInfo gemm_info{ m, n, k, 0, false, false, false, false, ActivationLayerInfo{}, 1, 1, gemm_native_desc.lhs_info, gemm_native_desc.rhs_info, 0, 0 }; - const EltwiseAddDescriptor eltwise_add_desc{ ConvertPolicy::WRAP }; - const TileDescriptor store_tile_info{ Size2D(gemm_info.rhs_info.n0, gemm_info.lhs_info.m0), Size2D(gemm_info.n, gemm_info.m), ClippingStrategy::TOP_LEFT }; - - // Create reference - SimpleTensor ref_t_lhs{ t_lhs_shape, data_type, 1 }; - SimpleTensor ref_t_rhs{ t_rhs_shape, data_type, 1 }; - SimpleTensor ref_t_bias_placeholder{ t_dst_shape, data_type, 1 }; - SimpleTensor ref_t_l1_addend{ t_dst_shape, data_type, 1 }; - - // Fill reference - fill(ref_t_lhs, 0); - fill(ref_t_rhs, 1); - fill(ref_t_l1_addend, 2); - const auto ref_t_dst = reference::arithmetic_operation( - ArithmeticOperation::ADD, - ref_t_l1_addend, - reference::gemm(ref_t_lhs, ref_t_rhs, ref_t_bias_placeholder, gemm_native_desc.alpha, 0.f /* To disable bias */), - data_type, - eltwise_add_desc.convert_policy); - - CLScheduler::get().default_reinit(); - - /* Condition 0: Dynamic Fused Kernel */ - CLTensor cond0_t_dst{}; - { - TICK(cond0_0_startup_time); - - ClKernelBlueprint bp; - ArgumentID tid_lhs; - ArgumentID tid_rhs; - ArgumentID tid_l0_bias = g_arg_placeholder; - ArgumentID tid_l1_addend; - ArgumentID tid_dst; - - const ClTensorDescriptor t_lhs_desc{ &t_lhs_info }; - const ClTensorDescriptor t_rhs_desc{ &t_rhs_info }; - const ClTensorDescriptor t_bias_desc{ &t_bias_info }; - const ClTensorDescriptor t_addend_desc{ &t_dst_info }; - const ClTensorDescriptor t_dst_desc{ 
&t_dst_info }; - - ClKernelCode cl_code; - TICK(cond0_build_time) - auto st = add_tensor_argument(bp, t_lhs_desc, tid_lhs); - st = add_tensor_argument(bp, t_rhs_desc, tid_rhs); - st = add_tensor_argument(bp, t_addend_desc, tid_l1_addend); - st = add_tensor_argument(bp, t_dst_desc, tid_dst); - - ArgumentID tid_acc; - st = add_tensor_intermed(bp, tid_acc); - st = add_kcomp_gemm_native(bp, common_kernel_desc, gemm_native_desc, tid_lhs, tid_rhs, tid_l0_bias, tid_acc); - - st = add_kcomp_eltwise_add(bp, common_kernel_desc, EltwiseAddDescriptor{}, tid_l1_addend, tid_acc, tid_acc); - - st = add_kcomp_store(bp, common_kernel_desc, tid_acc, tid_dst, StoreType::StoreBlockBoundaryAware); - - st = set_tile_info(bp, store_tile_info); - st = build(cl_code, ClCodeBuilderContext{ GpuInfo{ GPUTarget::G71 } }, bp); - TOCK(cond0_build_time, measurements) - - TICK(cond0_tune_time) - ClExecutionDescriptor exec_desc{}; - st = tune_static(exec_desc, cl_code); - TOCK(cond0_tune_time, measurements) - - TICK(cond0_configure_time) - ClCompositeKernel kernel; - kernel.configure(CLKernelLibrary::get().get_compile_context(), cl_code); - TOCK(cond0_configure_time, measurements) - - // Construct tensors - CLTensor t_lhs{}; - CLTensor t_rhs{}; - CLTensor t_l1_addend{}; - - // Init tensors - { - t_lhs.allocator()->init(t_lhs_info); - t_rhs.allocator()->init(t_rhs_info); - t_l1_addend.allocator()->init(t_dst_info); - cond0_t_dst.allocator()->init(t_dst_info); - } - // Allocate tensors - { - t_lhs.allocator()->allocate(); - t_rhs.allocator()->allocate(); - t_l1_addend.allocator()->allocate(); - cond0_t_dst.allocator()->allocate(); - fill(CLAccessor(t_lhs), 0); - fill(CLAccessor(t_rhs), 1); - fill(CLAccessor(t_l1_addend), 2); - } - - // "Pack" tensors - TensorBinding tensors({ { tid_lhs, &t_lhs }, { tid_rhs, &t_rhs }, { tid_l1_addend, &t_l1_addend }, { tid_dst, &cond0_t_dst } }); - - CLScheduler::get().enqueue_op(kernel, tensors, exec_desc, true); - CLScheduler::get().sync(); - TOCK(cond0_0_startup_time, measurements) - - TICK(cond0_1_latency) - for(int i = 0; i < num_iterations; ++i) - { - CLScheduler::get().enqueue_op(kernel, tensors, exec_desc, true); - } - CLScheduler::get().sync(); - TOCK_AVG(cond0_1_latency, measurements, num_iterations) - } - /* Condition 1: Dynamic Unfused Kernel */ - /* Condition 2: Static Fused Kernel (current) */ - CLTensor cond2_t_dst{}; - { - TICK(cond2_0_startup_time); - arm_compute::opencl::kernels::ClGemmMatrixMultiplyNativeKernel l0_gemm_mm; - - TICK(cond2_configure_time); - experimental::PostOpList post_ops; - post_ops.push_back_op>(&t_dst_info, 1, eltwise_add_desc.convert_policy); - GEMMKernelInfo gemm_info{ m, n, k, 0, false, false, false, false, ActivationLayerInfo{}, 1, 1, gemm_native_desc.lhs_info, gemm_native_desc.rhs_info, 0, 0, post_ops }; - l0_gemm_mm.configure(CLKernelLibrary::get().get_compile_context(), &t_lhs_info, &t_rhs_info, nullptr, &t_dst_info, gemm_native_desc.alpha, gemm_native_desc.beta, gemm_native_desc.lhs_info, - gemm_native_desc.rhs_info, gemm_info); - TOCK(cond2_configure_time, measurements); - - // Construct tensors - CLTensor t_lhs{}; - CLTensor t_rhs{}; - CLTensor t_l1_addend{}; - - // Init tensors - { - t_lhs.allocator()->init(t_lhs_info); - t_rhs.allocator()->init(t_rhs_info); - t_l1_addend.allocator()->init(t_dst_info); - cond2_t_dst.allocator()->init(t_dst_info); - } - // Allocate tensors - { - t_lhs.allocator()->allocate(); - t_rhs.allocator()->allocate(); - t_l1_addend.allocator()->allocate(); - cond2_t_dst.allocator()->allocate(); - fill(CLAccessor(t_lhs), 
0); - fill(CLAccessor(t_rhs), 1); - fill(CLAccessor(t_l1_addend), 2); - } - - // "Pack" tensors - ITensorPack tensors - { - { ACL_SRC_0, &t_lhs }, - { ACL_SRC_1, &t_rhs }, - { EXPERIMENTAL_ACL_POST_OP_ARG_FIRST, &t_l1_addend }, - { ACL_DST, &cond2_t_dst }, - }; - CLScheduler::get().enqueue_op(l0_gemm_mm, tensors, true); - CLScheduler::get().sync(); - TOCK(cond2_0_startup_time, measurements); - - TICK(cond2_1_latency); - for(int i = 0; i < num_iterations; ++i) - { - CLScheduler::get().enqueue_op(l0_gemm_mm, tensors, true); - } - CLScheduler::get().sync(); - TOCK_AVG(cond2_1_latency, measurements, num_iterations); - } - /* Condition 3: Static Unfused Kernel (current) */ - CLTensor cond3_t_dst{}; - { - TICK(cond3_0_startup_time); - arm_compute::opencl::kernels::ClGemmMatrixMultiplyNativeKernel l0_gemm_mm; - arm_compute::opencl::kernels::ClSaturatedArithmeticKernel l1_add; - - TICK(cond3_configure_time); - GEMMKernelInfo gemm_info{ m, n, k, 0, false, false, false, false, ActivationLayerInfo{}, 1, 1, gemm_native_desc.lhs_info, gemm_native_desc.rhs_info, 0, 0 }; - l0_gemm_mm.configure(CLKernelLibrary::get().get_compile_context(), &t_lhs_info, &t_rhs_info, nullptr, &t_l0_dst_info, gemm_native_desc.alpha, gemm_native_desc.beta, gemm_native_desc.lhs_info, - gemm_native_desc.rhs_info, gemm_info); - l1_add.configure(CLKernelLibrary::get().get_compile_context(), ArithmeticOperation::ADD, &t_l0_dst_info, &t_l1_rhs_info, &t_dst_info, eltwise_add_desc.convert_policy); - TOCK(cond3_configure_time, measurements); - - // Construct tensors - CLTensor t_lhs{}; - CLTensor t_rhs{}; - CLTensor t_l0_dst{}; - CLTensor t_l1_addend{}; - - // Init tensors - { - t_lhs.allocator()->init(t_lhs_info); - t_rhs.allocator()->init(t_rhs_info); - t_l0_dst.allocator()->init(t_l0_dst_info); - t_l1_addend.allocator()->init(t_dst_info); - cond3_t_dst.allocator()->init(t_dst_info); - } - // Allocate tensors - { - t_lhs.allocator()->allocate(); - t_rhs.allocator()->allocate(); - t_l0_dst.allocator()->allocate(); - t_l1_addend.allocator()->allocate(); - cond3_t_dst.allocator()->allocate(); - fill(CLAccessor(t_lhs), 0); - fill(CLAccessor(t_rhs), 1); - fill(CLAccessor(t_l1_addend), 2); - } - - // "Pack" tensors - ITensorPack tensors_l0 - { - { ACL_SRC_0, &t_lhs }, - { ACL_SRC_1, &t_rhs }, - { ACL_DST, &t_l0_dst }, - }; - ITensorPack tensors_l1 - { - { ACL_SRC_0, &t_l0_dst }, - { ACL_SRC_1, &t_l1_addend }, - { ACL_DST, &cond3_t_dst }, - }; - CLScheduler::get().enqueue_op(l0_gemm_mm, tensors_l0, true); - CLScheduler::get().enqueue_op(l1_add, tensors_l1, true); - CLScheduler::get().sync(); - TOCK(cond3_0_startup_time, measurements); - - TICK(cond3_1_latency); - for(int i = 0; i < num_iterations; ++i) - { - CLScheduler::get().enqueue_op(l0_gemm_mm, tensors_l0, true); - CLScheduler::get().enqueue_op(l1_add, tensors_l1, true); - } - CLScheduler::get().sync(); - TOCK_AVG(cond3_1_latency, measurements, num_iterations); - } - - RelativeTolerance tolerance_f32(0.001f); /**< Tolerance value for comparing reference's output against implementation's output for floating point data types */ - std::cout << "cond0 validation: " << std::endl; - validate(CLAccessor(cond0_t_dst), ref_t_dst, tolerance_f32); - std::cout << "cond2 validation: " << std::endl; - validate(CLAccessor(cond2_t_dst), ref_t_dst, tolerance_f32); - std::cout << "cond3 validation: " << std::endl; - validate(CLAccessor(cond3_t_dst), ref_t_dst, tolerance_f32); - - /* Report */ - std::cout << "Performance comparison (gemm native + add)" << std::endl; - std::cout << "cond0: dynamic fusion 
module" << std::endl; - std::cout << "cond2: static fused with post ops" << std::endl; - std::cout << "cond3: static unfused" << std::endl; - for(auto m : measurements) - { - std::cout << m.first << ": " << m.second.count() << "us" << std::endl; - } -} -TEST_SUITE_END() // Benchmark TEST_SUITE_END() // ClCompositeKernel TEST_SUITE_END() // DYNAMIC_FUSION TEST_SUITE_END() // UNIT TEST_SUITE_END() // CL } // namespace validation } // namespace test -} // namespace arm_compute - -#endif // defined(ENABLE_EXPERIMENTAL_DYNAMIC_FUSION) \ No newline at end of file +} // namespace arm_compute \ No newline at end of file diff --git a/tests/validation/CL/UNIT/dynamic_fusion/DependencyGraph.cpp b/tests/validation/CL/UNIT/dynamic_fusion/DependencyGraph.cpp new file mode 100644 index 0000000000..6962f0e6d1 --- /dev/null +++ b/tests/validation/CL/UNIT/dynamic_fusion/DependencyGraph.cpp @@ -0,0 +1,267 @@ +/* + * Copyright (c) 2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#ifndef ENABLE_EXPERIMENTAL_DYNAMIC_FUSION +#error "This experimental feature must be enabled with -DENABLE_EXPERIMENTAL_DYNAMIC_FUSION" +#endif /* ENABLE_EXPERIMENTAL_DYNAMIC_FUSION */ +#include "arm_compute/core/experimental/DependencyGraph.h" + +#include "tests/framework/Asserts.h" +#include "tests/framework/Macros.h" + +using namespace arm_compute::experimental::dynamic_fusion; + +namespace arm_compute +{ +namespace test +{ +namespace validation +{ +TEST_SUITE(CL) + +TEST_SUITE(UNIT) +TEST_SUITE(DYNAMIC_FUSION) +TEST_SUITE(DependencyGraph) + +TEST_CASE(Correct_Graph_Creation_Should_Pass, framework::DatasetMode::ALL) +{ + DependencyGraph graph{}; + const auto t0 = graph.add_tensor(); + const auto t1 = graph.add_tensor(); + const auto t2 = graph.add_tensor(); + const auto t3 = graph.add_tensor(); + const auto t4 = graph.add_tensor(); + + const auto o0 = graph.add_operator({ t0, t1 }, { t2 }).second; + const auto o1 = graph.add_operator({ t3, t2 }, { t4 }).second; + + ARM_COMPUTE_EXPECT_EQUAL(graph.number_of_ops(), 2U, framework::LogLevel::ERRORS); + ARM_COMPUTE_EXPECT_EQUAL(graph.number_of_tensors(), 5U, framework::LogLevel::ERRORS); + + const DependencyGraph ref_graph + { + { + // src_tensors + { o0, { t0, t1 } }, + { o1, { t3, t2 } }, + }, + { + // dst_tensors + { o0, { t2 } }, + { o1, { t4 } }, + }, + { + // src_ops + { t0, {} }, + { t1, {} }, + { t2, { o0 } }, + { t3, {} }, + { t4, { o1 } }, + }, + { + // dst_ops + { t0, { o0 } }, + { t1, { o0 } }, + { t2, { o1 } }, + { t3, { o1 } }, + { t4, {} }, + } + + }; + ARM_COMPUTE_EXPECT(graph == ref_graph, framework::LogLevel::ERRORS); +} + +TEST_CASE(Correct_Merge_Points_Should_Enable_Graph_Expansion, framework::DatasetMode::ALL) +{ + // Merge points are a simple way to collapse "graph of graphs" into a single graph + // Suppose we have a top-level graph g0 + DependencyGraph g0{}; + const auto g0_t0 = g0.add_tensor(); + const auto g0_t1 = g0.add_tensor(); + const auto g0_t2 = g0.add_tensor(); + const auto g0_t3 = g0.add_tensor(); + const auto g0_t4 = g0.add_tensor(); + g0.add_operator({ g0_t0, g0_t1 }, { g0_t2 }); // g0_o0 + g0.add_operator({ g0_t3, g0_t2 }, { g0_t4 }); // g0_o1 + + // Then g0 expands into g1, with additional nodes added in-between "merge point tensors" + // Note that the expansion logic may be local to each operator node + DependencyGraph g1{}; + // g0_o0 expands into g1_o0, g1_o1, g1_o2 + const auto g1_t0 = g1.add_tensor(g0_t0); + const auto g1_t1 = g1.add_tensor(g0_t1); + const auto g1_t2 = g1.add_tensor(); + const auto g1_t3 = g1.add_tensor(); + const auto g1_t4 = g1.add_tensor(g0_t2); + const auto g1_o0 = g1.add_operator({ g1_t0 }, { g1_t2 }).second; + const auto g1_o1 = g1.add_operator({ g1_t1 }, { g1_t3 }).second; + const auto g1_o2 = g1.add_operator({ g1_t2, g1_t3 }, { g1_t4 }).second; + + // g0_o1 expands into g1_o3 + const auto g1_t5 = g1.add_tensor(g0_t3); + const auto g1_t6 = g1.add_tensor(g0_t2); + const auto g1_t7 = g1.add_tensor(g0_t4); + ARM_COMPUTE_EXPECT_EQUAL(g1_t4, g1_t6, framework::LogLevel::ERRORS); // both associate with the same merge point g0_t2, thus they should point to the same tensor in g1 + const auto g1_o3 = g1.add_operator({ g1_t5, g1_t6 }, { g1_t7 }).second; + + const DependencyGraph ref_graph + { + { + // src_tensors + { g1_o0, { g1_t0 } }, + { g1_o1, { g1_t1 } }, + { g1_o2, { g1_t2, g1_t3 } }, + { g1_o3, { g1_t5, g1_t4 } }, + }, + { + // dst_tensors + { g1_o0, { g1_t2 } }, + { g1_o1, { g1_t3 } }, + { g1_o2, { g1_t4 } }, + { g1_o3, { g1_t7 } }, + }, + { + // src_ops + { g1_t0, {} }, + 
{ g1_t1, {} }, + { g1_t2, { g1_o0 } }, + { g1_t3, { g1_o1 } }, + { g1_t4, { g1_o2 } }, + { g1_t5, {} }, + { g1_t7, { g1_o3 } }, + }, + { + // dst_ops + { g1_t0, { g1_o0 } }, + { g1_t1, { g1_o1 } }, + { g1_t2, { g1_o2 } }, + { g1_t3, { g1_o2 } }, + { g1_t4, { g1_o3 } }, + { g1_t5, { g1_o3 } }, + { g1_t7, {} }, + }, + { + // merge points + { g0_t0, g1_t0 }, + { g0_t1, g1_t1 }, + { g0_t2, g1_t4 }, + { g0_t3, g1_t5 }, + { g0_t4, g1_t7 }, + } + }; + ARM_COMPUTE_EXPECT(g1 == ref_graph, framework::LogLevel::ERRORS); +} + +TEST_CASE(Path_Existence_Check_0, framework::DatasetMode::ALL) +{ + DependencyGraph graph{}; + const auto t0 = graph.add_tensor(); + const auto t1 = graph.add_tensor(); + const auto t2 = graph.add_tensor(); + const auto t3 = graph.add_tensor(); + const auto t4 = graph.add_tensor(); + const auto t5 = graph.add_tensor(); + const auto t6 = graph.add_tensor(); + const auto t7 = graph.add_tensor(); + const auto o0 = graph.add_operator({ t1 }, { t3, t4 }).second; + const auto o1 = graph.add_operator({ t3 }, { t5 }).second; + const auto o2 = graph.add_operator({ t5, t6 }, { t7 }).second; + const auto o3 = graph.add_operator({ t4 }, { t6 }).second; + const auto o4 = graph.add_operator({ t0, t5 }, { t2 }).second; + + ARM_COMPUTE_UNUSED(o1, o3); + + ARM_COMPUTE_EXPECT((graph.path_exists_from_tensor_to_op(t3, o2)), framework::LogLevel::ERRORS); + ARM_COMPUTE_EXPECT((graph.path_exists_from_tensor_to_op(t1, o4)), framework::LogLevel::ERRORS); + ARM_COMPUTE_EXPECT(!(graph.path_exists_from_tensor_to_op(t2, o4)), framework::LogLevel::ERRORS); + ARM_COMPUTE_EXPECT(!(graph.path_exists_from_tensor_to_op(t0, o2)), framework::LogLevel::ERRORS); + + ARM_COMPUTE_EXPECT((graph.path_exists_from_op_to_op(o0, o2)), framework::LogLevel::ERRORS); + ARM_COMPUTE_EXPECT(!(graph.path_exists_from_op_to_op(o2, o0)), framework::LogLevel::ERRORS); + + ARM_COMPUTE_EXPECT(!(graph.path_exists_from_op_to_op(o2, o4)), framework::LogLevel::ERRORS); +} + +TEST_CASE(Correct_Topological_Sort_Should_Pass, framework::DatasetMode::ALL) +{ + DependencyGraph graph{}; + const auto t0 = graph.add_tensor(); + const auto t1 = graph.add_tensor(); + const auto t2 = graph.add_tensor(); + const auto t3 = graph.add_tensor(); + const auto t4 = graph.add_tensor(); + const auto t5 = graph.add_tensor(); + const auto t6 = graph.add_tensor(); + const auto t7 = graph.add_tensor(); + const auto o0 = graph.add_operator({ t1 }, { t3, t4 }).second; + const auto o1 = graph.add_operator({ t3 }, { t5 }).second; + const auto o2 = graph.add_operator({ t5, t6 }, { t7 }).second; + const auto o3 = graph.add_operator({ t4 }, { t6 }).second; + const auto o4 = graph.add_operator({ t0, t5 }, { t2 }).second; + + const auto res = graph.topological_sort(); + ARM_COMPUTE_EXPECT(bool(res.first), framework::LogLevel::ERRORS); + std::vector ref_sorted_op_packs + { + { o0, { t1 }, { t3, t4 } }, + { o1, { t3 }, { t5 } }, + { o3, { t4 }, { t6 } }, + { o4, { t0, t5 }, { t2 } }, + { o2, { t5, t6 }, { t7 } }, + + }; + ARM_COMPUTE_EXPECT((res.second == ref_sorted_op_packs), framework::LogLevel::ERRORS); +} + +TEST_CASE(Cycles_Should_Fail, framework::DatasetMode::ALL) +{ + DependencyGraph graph{}; + const auto t0 = graph.add_tensor(); + const auto t1 = graph.add_tensor(); + const auto t2 = graph.add_tensor(); + const auto t3 = graph.add_tensor(); + + graph.add_operator({ t0, t1 }, { t2 }); + graph.add_operator({ t2 }, { t1, t3 }); // Ideally error should occur here + + const auto res = graph.topological_sort(); + ARM_COMPUTE_EXPECT(!bool(res.first), 
framework::LogLevel::ERRORS); +} +TEST_CASE(Loops_Should_Fail, framework::DatasetMode::ALL) +{ + DependencyGraph graph{}; + const auto t0 = graph.add_tensor(); + const auto t1 = graph.add_tensor(); + const auto t2 = graph.add_tensor(); + + ARM_COMPUTE_EXPECT_THROW(graph.add_operator({ t0, t2 }, { t1, t2 }).first, framework::LogLevel::ERRORS); + ARM_COMPUTE_UNUSED(t0, t1, t2); +} +TEST_SUITE_END() // DependencyGraph +TEST_SUITE_END() // DYNAMIC_FUSION +TEST_SUITE_END() // UNIT + +TEST_SUITE_END() // CL +} // namespace validation +} // namespace test +} // namespace arm_compute \ No newline at end of file diff --git a/tests/validation/CL/UNIT/dynamic_fusion/Integration_OperatorFuseMovenetSubGraph1.cpp b/tests/validation/CL/UNIT/dynamic_fusion/Integration_OperatorFuseMovenetSubGraph1.cpp new file mode 100644 index 0000000000..1b04b0cee0 --- /dev/null +++ b/tests/validation/CL/UNIT/dynamic_fusion/Integration_OperatorFuseMovenetSubGraph1.cpp @@ -0,0 +1,403 @@ +/* + * Copyright (c) 2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#ifndef ENABLE_EXPERIMENTAL_DYNAMIC_FUSION +#error "This experimental feature must be enabled with -DENABLE_EXPERIMENTAL_DYNAMIC_FUSION" +#endif /* ENABLE_EXPERIMENTAL_DYNAMIC_FUSION */ +#include "arm_compute/core/TensorInfo.h" + +#include "arm_compute/core/CL/CLKernelLibrary.h" +#include "arm_compute/core/experimental/ClWorkload.h" +#include "arm_compute/runtime/CL/CLScheduler.h" +#include "arm_compute/runtime/experimental/ClCompositeOperator.h" +#include "src/core/experimental/dynamic_fusion/WorkloadImpl/ClKernelDescriptors.h" +#include "src/gpu/cl/operators/ClAdd.h" +#include "src/gpu/cl/operators/ClConv2d.h" +#include "tests/CL/CLAccessor.h" +#include "tests/framework/Asserts.h" +#include "tests/framework/Macros.h" +#include "tests/validation/CL/UNIT/dynamic_fusion/Utils.h" +#include "tests/validation/Validation.h" + +#include "tests/validation/reference/ConvolutionLayer.h" +#include "tests/validation/reference/ElementwiseOperations.h" +#include "tests/validation/reference/Permute.h" + +#ifdef ARM_COMPUTE_ASSERTS_ENABLED +#include "tests/SimpleTensorPrinter.h" +#endif /* ARM_COMPUTE_ASSERTS_ENABLED */ + +using namespace arm_compute::experimental::dynamic_fusion; +using namespace arm_compute::test::validation::utils; + +namespace arm_compute +{ +namespace test +{ +namespace validation +{ +TEST_SUITE(CL) +TEST_SUITE(INTEGRATION) +TEST_SUITE(DYNAMIC_FUSION) +TEST_CASE(Operator_Fuse_Movenet_SubGraph_1_F32, framework::DatasetMode::ALL) +{ + // Please refer to: https://confluence.arm.com/pages/viewpage.action?pageId=886243697 + /* Computation: + * out = add_desc(addend, conv2d1x1(direct_conv)(input, weights, bias)) + */ + const auto data_type = DataType::F32; + const auto data_layout = DataLayout::NHWC; + const auto t_input_shape = TensorShape(384, 12, 12); + // const auto t_weight_shape = TensorShape(384, 1, 1, 64); + // const auto t_dst_shape = TensorShape(64, 12, 12); + const auto t_weight_shape = TensorShape(384, 1, 1, 16); + const auto t_dst_shape = TensorShape(16, 12, 12); + auto t_input_info = TensorInfo(t_input_shape, 1, data_type, data_layout); + auto t_weight_info = TensorInfo(t_weight_shape, 1, data_type, data_layout); + auto t_l1_addend_info = TensorInfo(t_dst_shape, 1, data_type, data_layout); + auto t_acc_info = TensorInfo(); // Intermediate tensor for cond3 + auto t_dst_info = TensorInfo(); + + Conv2dDescriptor conv2d_desc{}; + AddDescriptor add_desc{}; + + // Create reference + SimpleTensor ref_t_input{ t_input_shape, data_type, 1, QuantizationInfo(), DataLayout::NHWC }; + SimpleTensor ref_t_weight{ t_weight_shape, data_type, 1, QuantizationInfo(), DataLayout::NHWC }; + SimpleTensor ref_t_bias_placeholder{ t_dst_shape, data_type, 1, QuantizationInfo(), DataLayout::NHWC }; + SimpleTensor ref_t_l1_addend{ t_dst_shape, data_type, 1, QuantizationInfo(), DataLayout::NHWC }; + + // Fill reference + fill(ref_t_input, 0, library.get()); + fill(ref_t_weight, 1, library.get()); + fill(ref_t_l1_addend, 2, library.get()); + + auto ref_t_input_nchw = reference::permute(ref_t_input, PermutationVector(1U, 2U, 0U)); + auto ref_t_weight_nchw = reference::permute(ref_t_weight, PermutationVector(1U, 2U, 0U)); + auto ref_t_bias_placeholder_nchw = reference::permute(ref_t_bias_placeholder, PermutationVector(1U, 2U, 0U)); + auto ref_t_l1_addend_nchw = reference::permute(ref_t_l1_addend, PermutationVector(1U, 2U, 0U)); + auto t_dst_shape_nchw = t_dst_shape; + permute(t_dst_shape_nchw, PermutationVector(1U, 2U, 0U)); + + PadStrideInfo legacy_pad_stride(conv2d_desc.stride.x(), 
conv2d_desc.stride.y(), conv2d_desc.pad.left, conv2d_desc.pad.right, conv2d_desc.pad.top, conv2d_desc.pad.bottom, DimensionRoundingType{}); + auto ref_t_dst_nchw = reference::arithmetic_operation( + ArithmeticOperation::ADD, + ref_t_l1_addend_nchw, + reference::convolution_layer(ref_t_input_nchw, ref_t_weight_nchw, ref_t_bias_placeholder_nchw, t_dst_shape_nchw, legacy_pad_stride, conv2d_desc.dilation), + data_type, + ConvertPolicy{}); + const auto ref_t_dst = reference::permute(ref_t_dst_nchw, PermutationVector(2U, 0U, 1U)); + + CLScheduler::get().default_reinit(); + const auto cl_compile_ctx = CLKernelLibrary::get().get_compile_context(); + OperatorGraph op_graph; + + const auto op_t_input = add_tensor(op_graph, t_input_info); + const auto op_t_weight = add_tensor(op_graph, t_weight_info); + const auto op_t_l1_addend = add_tensor(op_graph, t_l1_addend_info); + const auto op_t_acc = add_tensor(op_graph, t_acc_info); // temp accumulator; TensorInfo to be inferred + const auto op_t_dst = add_tensor(op_graph, t_dst_info); + + auto conv2d = add_op_conv2d(op_graph, conv2d_desc, op_t_input, op_t_weight, op_t_acc); + force_conv2d_method(op_graph, conv2d, ConvolutionMethod::DIRECT); + add_op_elementwise_add(op_graph, add_desc, op_t_acc, op_t_l1_addend, op_t_dst); + + const ClWorkloadContext workload_ctx{ GpuInfo{ CLScheduler::get().target() } }; + ClWorkload workload; + build(workload, op_graph, workload_ctx); + + ClCompositeOperator op; + op.configure(cl_compile_ctx, workload); + + // Construct tensors + CLTensor t_input{}; + CLTensor t_weight{}; + CLTensor t_l1_addend{}; + CLTensor t_dst{}; + + // Init tensors + t_input.allocator()->init(t_input_info); + t_weight.allocator()->init(t_weight_info); + t_l1_addend.allocator()->init(t_dst_info); + t_dst.allocator()->init(t_dst_info); + + // Allocate and fill tensors + t_input.allocator()->allocate(); + t_weight.allocator()->allocate(); + t_l1_addend.allocator()->allocate(); + t_dst.allocator()->allocate(); + fill(CLAccessor(t_input), 0, library.get()); + fill(CLAccessor(t_weight), 1, library.get()); + fill(CLAccessor(t_l1_addend), 2, library.get()); + // "Pack" tensors + OpTensorBinding bp_tensors({ { op_t_input, &t_input }, + { op_t_weight, &t_weight }, + { op_t_l1_addend, &t_l1_addend }, + { op_t_dst, &t_dst } + }); + + // Populate prepare and run pack-maps (including allocating aux tensors) + ClAuxTensorData aux_tensor_data{}; + TensorPackMap prepare_pack_map{}; + TensorPackMap run_pack_map{}; + bind_tensors(aux_tensor_data, prepare_pack_map, run_pack_map, workload, bp_tensors); + + op.prepare(prepare_pack_map); + op.run(run_pack_map); + RelativeTolerance tolerance_f32(0.001f); /**< Tolerance value for comparing reference's output against implementation's output for floating point data types */ + validate(CLAccessor(t_dst), ref_t_dst_nchw, tolerance_f32); +} +TEST_SUITE(Unsupported) +TEST_CASE(DataType_QASYMM8, framework::DatasetMode::ALL) +{ + const auto data_type = DataType::QASYMM8; + const auto data_layout = DataLayout::NHWC; + const auto t_input_shape = TensorShape(384, 12, 12); + const auto t_weight_shape = TensorShape(384, 1, 1, 64); + const auto t_dst_shape = TensorShape(64, 12, 12); + auto t_input_info = TensorInfo(t_input_shape, 1, data_type, data_layout); + auto t_weight_info = TensorInfo(t_weight_shape, 1, data_type, data_layout); + auto t_l1_addend_info = TensorInfo(t_dst_shape, 1, data_type, data_layout); + auto t_acc_info = TensorInfo(t_dst_shape, 1, data_type, data_layout); + auto t_dst_info = TensorInfo(t_dst_shape, 1, data_type, 
data_layout); + + Conv2dDescriptor conv2d_desc{}; + AddDescriptor add_desc{}; + + OperatorGraph op_graph; + + const auto op_t_input = add_tensor(op_graph, t_input_info); + const auto op_t_weight = add_tensor(op_graph, t_weight_info); + const auto op_t_l1_addend = add_tensor(op_graph, t_l1_addend_info); + const auto op_t_acc = add_tensor(op_graph, t_acc_info); // temp accumulator; TensorInfo to be inferred + const auto op_t_dst = add_tensor(op_graph, t_dst_info); + + auto conv2d = add_op_conv2d(op_graph, conv2d_desc, op_t_input, op_t_weight, op_t_acc); + add_op_elementwise_add(op_graph, add_desc, op_t_acc, op_t_l1_addend, op_t_dst); + force_conv2d_method(op_graph, conv2d, ConvolutionMethod::DIRECT); + + const ClWorkloadContext workload_ctx{ GpuInfo{ CLScheduler::get().target() } }; + ClWorkload workload; + const auto success = build(workload, op_graph, workload_ctx); + + ARM_COMPUTE_EXPECT(!bool(success), framework::LogLevel::ERRORS); + ARM_COMPUTE_EXPECT(!bool(ClCompositeOperator::validate(workload)), framework::LogLevel::ERRORS); +} +TEST_CASE(DataLayout_NCHW, framework::DatasetMode::ALL) +{ + const auto data_type = DataType::F32; + const auto data_layout = DataLayout::NCHW; + const auto t_input_shape = TensorShape(384, 12, 12); + const auto t_weight_shape = TensorShape(384, 1, 1, 64); + const auto t_dst_shape = TensorShape(64, 12, 12); + auto t_input_info = TensorInfo(t_input_shape, 1, data_type, data_layout); + auto t_weight_info = TensorInfo(t_weight_shape, 1, data_type, data_layout); + auto t_dst_info = TensorInfo(t_dst_shape, 1, data_type, data_layout); + + Conv2dDescriptor conv2d_desc{}; + + OperatorGraph op_graph; + + const auto op_t_input = add_tensor(op_graph, t_input_info); + const auto op_t_weight = add_tensor(op_graph, t_weight_info); + const auto op_t_dst = add_tensor(op_graph, t_dst_info); + + auto conv2d = add_op_conv2d(op_graph, conv2d_desc, op_t_input, op_t_weight, op_t_dst); + force_conv2d_method(op_graph, conv2d, ConvolutionMethod::DIRECT); + const ClWorkloadContext workload_ctx{ GpuInfo{ CLScheduler::get().target() } }; + ClWorkload workload; + const auto success = build(workload, op_graph, workload_ctx); + + ARM_COMPUTE_EXPECT(!bool(success), framework::LogLevel::ERRORS); + ARM_COMPUTE_EXPECT(!bool(ClCompositeOperator::validate(workload)), framework::LogLevel::ERRORS); +} +TEST_SUITE_END() // Unsupported + +TEST_SUITE(Invalid) +TEST_CASE(Multiple_Complex_Ops_0, framework::DatasetMode::ALL) +{ + /* Computation: + * out = conv2d(conv2d(l0_input, l0_weight), l1_weight) + */ + const auto data_type = DataType::F32; + const auto data_layout = DataLayout::NHWC; + const auto t_l0_input_shape = TensorShape(1024, 56, 56); + const auto t_l0_weight_shape = TensorShape(512, 1024, 1, 1); + const auto t_l1_weight_shape = TensorShape(512, 256, 1, 1); + + auto t_l0_input_info = TensorInfo(t_l0_input_shape, 1, data_type, data_layout); + auto t_l0_weight_info = TensorInfo(t_l0_weight_shape, 1, data_type, data_layout); + auto t_l1_weight_info = TensorInfo(t_l1_weight_shape, 1, data_type, data_layout); + auto t_l0_dst_info = TensorInfo(); + auto t_dst_info = TensorInfo(); + + OperatorGraph op_graph; + const auto conv2d_desc = Conv2dDescriptor{}; + + const auto op_t_l0_input = add_tensor(op_graph, t_l0_input_info); + const auto op_t_l0_weight = add_tensor(op_graph, t_l0_weight_info); + const auto op_t_l1_weight = add_tensor(op_graph, t_l1_weight_info); + const auto op_t_l0_dst = add_tensor(op_graph, t_l0_dst_info); // temp accumulator; TensorInfo to be inferred + const auto op_t_dst = 
add_tensor(op_graph, t_dst_info); + + add_op_conv2d(op_graph, conv2d_desc, op_t_l0_input, op_t_l0_weight, op_t_l0_dst); + add_op_conv2d(op_graph, conv2d_desc, op_t_l0_dst, op_t_l1_weight, op_t_dst); + + const ClWorkloadContext workload_ctx{ GpuInfo{ CLScheduler::get().target() } }; + ClWorkload workload; + const auto success = build(workload, op_graph, workload_ctx); + + ARM_COMPUTE_EXPECT(!bool(success), framework::LogLevel::ERRORS); + ARM_COMPUTE_EXPECT(!bool(ClCompositeOperator::validate(workload)), framework::LogLevel::ERRORS); +} +TEST_CASE(Enlarging_Execution_Space, framework::DatasetMode::ALL) +{ + /* Computation: + * out = add(l2_lhs, add(add(l0_lhs, l0_rhs), l1_rhs)) + */ + const auto data_type = DataType::F32; + const auto data_layout = DataLayout::NHWC; + const auto t_l0_lhs_shape = TensorShape(1, 256, 3); + const auto t_l0_rhs_shape = TensorShape(1, 256, 3); + const auto t_l1_rhs_shape = TensorShape(1, 1, 3); + const auto t_l2_lhs_shape = TensorShape(1024, 1, 3); + + auto t_l0_lhs_info = TensorInfo(t_l0_lhs_shape, 1, data_type, data_layout); + auto t_l0_rhs_info = TensorInfo(t_l0_rhs_shape, 1, data_type, data_layout); + auto t_l1_rhs_info = TensorInfo(t_l1_rhs_shape, 1, data_type, data_layout); + auto t_l2_lhs_info = TensorInfo(t_l2_lhs_shape, 1, data_type, data_layout); + auto t_l0_dst_info = TensorInfo(); + auto t_l1_dst_info = TensorInfo(); + auto t_dst_info = TensorInfo(); + + OperatorGraph op_graph; + const auto add_desc = AddDescriptor{}; + + const auto op_t_l0_lhs = add_tensor(op_graph, t_l0_lhs_info); + const auto op_t_l0_rhs = add_tensor(op_graph, t_l0_rhs_info); + const auto op_t_l1_rhs = add_tensor(op_graph, t_l1_rhs_info); + const auto op_t_l2_lhs = add_tensor(op_graph, t_l2_lhs_info); + const auto op_t_l0_dst = add_tensor(op_graph, t_l0_dst_info); // temp accumulator; TensorInfo to be inferred + const auto op_t_l1_dst = add_tensor(op_graph, t_l1_dst_info); // temp accumulator; TensorInfo to be inferred + const auto op_t_dst = add_tensor(op_graph, t_dst_info); + + add_op_elementwise_add(op_graph, add_desc, op_t_l0_lhs, op_t_l0_rhs, op_t_l0_dst); + add_op_elementwise_add(op_graph, add_desc, op_t_l0_dst, op_t_l1_rhs, op_t_l1_dst); + add_op_elementwise_add(op_graph, add_desc, op_t_l1_dst, op_t_l2_lhs, op_t_dst); + + const ClWorkloadContext workload_ctx{ GpuInfo{ CLScheduler::get().target() } }; + ClWorkload workload; + const auto success = build(workload, op_graph, workload_ctx); + + ARM_COMPUTE_EXPECT(!bool(success), framework::LogLevel::ERRORS); + ARM_COMPUTE_EXPECT(!bool(ClCompositeOperator::validate(workload)), framework::LogLevel::ERRORS); +} +TEST_CASE(Root_Simple_And_Complex, framework::DatasetMode::ALL) +{ + /* Computation: + * out = add(conv(l0_0_input, l0_0_weight), add(l0_1_lhs, l0_1_rhs)) + */ + const auto data_type = DataType::F32; + const auto data_layout = DataLayout::NHWC; + + const auto t_l0_0_input_shape = TensorShape(128, 21, 21); + const auto t_l0_0_weight_shape = TensorShape(144, 128, 1, 1); + const auto t_l0_1_lhs_shape = TensorShape(144, 21, 21); + const auto t_l0_1_rhs_shape = TensorShape(1, 1, 21); + + auto t_l0_0_input_info = TensorInfo(t_l0_0_input_shape, 1, data_type, data_layout); + auto t_l0_0_weight_info = TensorInfo(t_l0_0_weight_shape, 1, data_type, data_layout); + auto t_l0_1_lhs_info = TensorInfo(t_l0_1_lhs_shape, 1, data_type, data_layout); + auto t_l0_1_rhs_info = TensorInfo(t_l0_1_rhs_shape, 1, data_type, data_layout); + auto t_l0_0_dst_info = TensorInfo(); + auto t_l0_1_dst_info = TensorInfo(); + auto t_dst_info = TensorInfo(); + 
+ OperatorGraph op_graph; + const auto conv2d_desc = Conv2dDescriptor{}; + const auto add_desc = AddDescriptor{}; + + const auto op_t_l0_0_input = add_tensor(op_graph, t_l0_0_input_info); + const auto op_t_l0_0_weight = add_tensor(op_graph, t_l0_0_weight_info); + const auto op_t_l0_1_lhs = add_tensor(op_graph, t_l0_1_lhs_info); + const auto op_t_l0_1_rhs = add_tensor(op_graph, t_l0_1_rhs_info); + const auto op_t_l0_0_dst = add_tensor(op_graph, t_l0_0_dst_info); // temp accumulator; TensorInfo to be inferred + const auto op_t_l0_1_dst = add_tensor(op_graph, t_l0_1_dst_info); // temp accumulator; TensorInfo to be inferred + const auto op_t_dst = add_tensor(op_graph, t_dst_info); + + add_op_conv2d(op_graph, conv2d_desc, op_t_l0_0_input, op_t_l0_0_weight, op_t_l0_0_dst); + add_op_elementwise_add(op_graph, add_desc, op_t_l0_1_lhs, op_t_l0_1_rhs, op_t_l0_1_dst); + add_op_elementwise_add(op_graph, add_desc, op_t_l0_0_dst, op_t_l0_1_dst, op_t_dst); + + const ClWorkloadContext workload_ctx{ GpuInfo{ CLScheduler::get().target() } }; + ClWorkload workload; + const auto success = build(workload, op_graph, workload_ctx); + + ARM_COMPUTE_EXPECT(!bool(success), framework::LogLevel::ERRORS); + ARM_COMPUTE_EXPECT(!bool(ClCompositeOperator::validate(workload)), framework::LogLevel::ERRORS); +} +TEST_CASE(Loop, framework::DatasetMode::ALL) +{ + /* Computation: + * tensor state0; + * state1 = add(l0_lhs, state0) + * state0 = add(l1_lhs, state1) + */ + const auto data_type = DataType::F32; + const auto data_layout = DataLayout::NHWC; + + const auto t_shape = TensorShape(13, 21); + + auto t_l0_lhs_info = TensorInfo(t_shape, 1, data_type, data_layout); + auto t_l1_lhs_info = TensorInfo(t_shape, 1, data_type, data_layout); + auto state0_info = TensorInfo(t_shape, 1, data_type, data_layout); + auto state1_info = TensorInfo(); + + OperatorGraph op_graph; + const auto conv2d_desc = Conv2dDescriptor{}; + const auto add_desc = AddDescriptor{}; + + const auto op_t_l0_lhs = add_tensor(op_graph, t_l0_lhs_info); + const auto op_t_l1_lhs = add_tensor(op_graph, t_l1_lhs_info); + const auto op_t_state0 = add_tensor(op_graph, state0_info); + const auto op_t_state1 = add_tensor(op_graph, state1_info); + + add_op_conv2d(op_graph, conv2d_desc, op_t_l0_lhs, op_t_state0, op_t_state1); + add_op_elementwise_add(op_graph, add_desc, op_t_l1_lhs, op_t_state1, op_t_state0); + + const ClWorkloadContext workload_ctx{ GpuInfo{ CLScheduler::get().target() } }; + ClWorkload workload; + const auto success = build(workload, op_graph, workload_ctx); + + ARM_COMPUTE_EXPECT(!bool(success), framework::LogLevel::ERRORS); + ARM_COMPUTE_EXPECT(!bool(ClCompositeOperator::validate(workload)), framework::LogLevel::ERRORS); +} +TEST_SUITE_END() // Invalid + +TEST_SUITE_END() // DYNAMIC_FUSION +TEST_SUITE_END() // INTEGRATION +TEST_SUITE_END() // CL +} // namespace validation +} // namespace test +} // namespace arm_compute \ No newline at end of file diff --git a/tests/validation/CL/UNIT/dynamic_fusion/Utils.h b/tests/validation/CL/UNIT/dynamic_fusion/Utils.h new file mode 100644 index 0000000000..4512305c1e --- /dev/null +++ b/tests/validation/CL/UNIT/dynamic_fusion/Utils.h @@ -0,0 +1,71 @@ +/* + * Copyright (c) 2022 Arm Limited. 
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef TESTS_VALIDATION_CL_DYNAMICFUSION_UTILS
+#define TESTS_VALIDATION_CL_DYNAMICFUSION_UTILS
+
+#include "tests/AssetsLibrary.h"
+#include "utils/Utils.h"
+
+#include <chrono>
+#include <limits>
+#include <type_traits>
+
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+namespace utils
+{
+/** A pair of macros which measures the wall clock time, and records it into a map measurement_map with name clock_name
+ *
+ */
+#define TICK(clock_name) \
+    auto clock_name##_tick = std::chrono::high_resolution_clock::now();
+#define TOCK(clock_name, measurement_map)                                \
+    auto clock_name##_tock                 = std::chrono::high_resolution_clock::now(); \
+    measurement_map["\"" #clock_name "\""] = duration_cast<microseconds>(clock_name##_tock - clock_name##_tick);
+#define TOCK_AVG(clock_name, measurement_map, num_iterations)            \
+    auto clock_name##_tock                 = std::chrono::high_resolution_clock::now(); \
+    measurement_map["\"" #clock_name "\""] = duration_cast<microseconds>((clock_name##_tock - clock_name##_tick) / (num_iterations));
+
+template <typename U, typename T>
+void fill(U &&tensor, int seed, AssetsLibrary *library)
+{
+    static_assert(std::is_floating_point<T>::value || std::is_same<T, half>::value, "Only floating point data types supported.");
+    using DistributionType = typename std::conditional<std::is_same<T, half>::value, arm_compute::utils::uniform_real_distribution_16bit<T>, std::uniform_real_distribution<T>>::type;
+
+    DistributionType distribution{ T(-1.0f), T(1.0f) };
+    library->fill(tensor, distribution, seed);
+
+    // Fill border with infinity in order to check the presence of NaN values (i.e. inf * 0)
+    DistributionType distribution_inf{ T(std::numeric_limits<T>::infinity()), T(std::numeric_limits<T>::infinity()) };
+    library->fill_borders_with_garbage(tensor, distribution_inf, seed);
+}
+} // namespace utils
+} // namespace validation
+} // namespace test
+} // namespace arm_compute
+#endif //TESTS_VALIDATION_CL_DYNAMICFUSION_UTILS
\ No newline at end of file
-- cgit v1.2.1
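
For reference, the end-to-end usage flow exercised by the Integration_OperatorFuseMovenetSubGraph1.cpp test above (and by the cl_fused_conv2d_elementwise_add.cpp example mentioned in the commit message) can be condensed as follows. This is an illustrative sketch, not part of the patch: the entry points (OperatorGraph, add_tensor, add_op_conv2d, add_op_elementwise_add, force_conv2d_method, build, ClCompositeOperator, bind_tensors) are taken verbatim from the test, while the TensorInfo/CLTensor variables (input_info, weight_info, addend_info, acc_info, dst_info, input, weight, addend, dst) are placeholder names the caller is assumed to have initialised and allocated beforehand.

    // Describe the fused workload: dst = add(addend, conv2d(input, weight))
    OperatorGraph op_graph;
    const auto op_t_input  = add_tensor(op_graph, input_info);
    const auto op_t_weight = add_tensor(op_graph, weight_info);
    const auto op_t_addend = add_tensor(op_graph, addend_info);
    const auto op_t_acc    = add_tensor(op_graph, acc_info); // intermediate; empty TensorInfo, shape to be inferred
    const auto op_t_dst    = add_tensor(op_graph, dst_info);

    auto conv2d = add_op_conv2d(op_graph, Conv2dDescriptor{}, op_t_input, op_t_weight, op_t_acc);
    force_conv2d_method(op_graph, conv2d, ConvolutionMethod::DIRECT); // request the direct conv kernel
    add_op_elementwise_add(op_graph, AddDescriptor{}, op_t_acc, op_t_addend, op_t_dst);

    // Translate the operator graph into a fused OpenCL workload and configure the runtime operator
    const ClWorkloadContext workload_ctx{ GpuInfo{ CLScheduler::get().target() } };
    ClWorkload workload;
    build(workload, op_graph, workload_ctx);

    ClCompositeOperator op;
    op.configure(CLKernelLibrary::get().get_compile_context(), workload);

    // Bind the user-owned CLTensors, populate prepare/run pack-maps (allocating aux tensors), then execute
    OpTensorBinding bp_tensors({ { op_t_input, &input },
                                 { op_t_weight, &weight },
                                 { op_t_addend, &addend },
                                 { op_t_dst, &dst } });
    ClAuxTensorData aux_tensor_data{};
    TensorPackMap   prepare_pack_map{};
    TensorPackMap   run_pack_map{};
    bind_tensors(aux_tensor_data, prepare_pack_map, run_pack_map, workload, bp_tensors);
    op.prepare(prepare_pack_map);
    op.run(run_pack_map);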