From 0e7053da06b4d73058abbc3782611259b00066b5 Mon Sep 17 00:00:00 2001
From: SiCong Li
Date: Tue, 24 May 2022 14:37:31 +0100
Subject: Fix reference example for fusion of conv2d and addition

* Make sure the reference is running the same computation as the fused example
* Add finer breakdown of start-up time

Related to COMPMID-5365

Signed-off-by: SiCong Li
Change-Id: Ia5c73d4af318cafda1335438f88d0d4f74a5355c
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/7643
Tested-by: Arm Jenkins
Reviewed-by: Michalis Spyrou
Comments-Addressed: Arm Jenkins
---
 .../cl_fused_conv2d_elementwise_add.cpp |  7 ++++
 .../cl_ref_conv2d_elementwise_add.cpp   | 37 +++++++++++++++-------
 2 files changed, 33 insertions(+), 11 deletions(-)

diff --git a/examples/dynamic_fusion/cl_fused_conv2d_elementwise_add.cpp b/examples/dynamic_fusion/cl_fused_conv2d_elementwise_add.cpp
index af00efdfb9..285509b586 100644
--- a/examples/dynamic_fusion/cl_fused_conv2d_elementwise_add.cpp
+++ b/examples/dynamic_fusion/cl_fused_conv2d_elementwise_add.cpp
@@ -138,6 +138,7 @@ public:
         CLScheduler::get().default_init(tuner_to_use);
 
         TICK(startup_time);
+        TICK(configure);
 
         /* Computation:
          * out = add_desc(addend, conv2d1x1(direct_conv)(input, weights, bias)) */
@@ -251,7 +252,9 @@ public:
         const auto success = ClCompositeOperator::validate(workload); // Optional
         op.configure(CLKernelLibrary::get().get_compile_context(), workload);
         // [Validate and configure ClCompositeOperator]
+        TOCK(configure, measurements);
 
+        TICK(tensor_allocation);
         /// @page example_dynamic_fusion_cl_conv2d_elementwise_add
         /// @subsection run_clcompositeoperator Run ClCompositeOperator
         /// Construct the runtime CLTensor s with backing memory
@@ -312,7 +315,9 @@ public:
             tensor_data.tensor->allocator()->allocate();
         }
         // [Initialize and Allocate Auxiliary CLTensor objects]
+        TOCK(tensor_allocation, measurements);
 
+        TICK(dummy_run);
         /// @page example_dynamic_fusion_cl_conv2d_elementwise_add
         /// Run the ClCompositeOperator prepare job. This performs any jobs that are required for the first run, like
         /// reshaping tensors for a more performant format.
@@ -327,6 +332,8 @@ public:
         // [Run ClCompositeOperator]
         op.run(run_pack_map);
         // [Run ClCompositeOperator]
+        CLScheduler::get().sync();
+        TOCK(dummy_run, measurements);
         TOCK(startup_time, measurements);
         return true;
     }
diff --git a/examples/dynamic_fusion/cl_ref_conv2d_elementwise_add.cpp b/examples/dynamic_fusion/cl_ref_conv2d_elementwise_add.cpp
index 4f68372b49..3aedcc0f41 100644
--- a/examples/dynamic_fusion/cl_ref_conv2d_elementwise_add.cpp
+++ b/examples/dynamic_fusion/cl_ref_conv2d_elementwise_add.cpp
@@ -52,6 +52,9 @@ using namespace utils;
 using std::chrono::duration_cast;
 using std::chrono::microseconds;
 
+/** A reference for comparing against the fusion of a direct convolution with an elementwise addition:
+ * examples/dynamic_fusion/cl_fused_conv2d_elementwise_add.cpp
+ */
 class ClRefConv2dEltwiseAddExample : public Example
 {
 public:
@@ -69,7 +72,7 @@ public:
         if(argc < 10)
         {
             // Print help
-            std::cout << "Usage: ./cl_conv2d_elementwise_add ih iw ifm wh ww ofm tuner_choice(0=Disable, 1=Rapid, 2=Normal, 3=Exhaustive)\n";
+            std::cout << "Usage: ./cl_ref_conv2d_elementwise_add ih iw ifm wh ww ofm tuner_choice(0=Disable, 1=Rapid, 2=Normal, 3=Exhaustive) pad_x pad_y\n";
             std::cout << "Too few or no input_matrices provided. Using shape config = SRGAN_0, tuner_choice=2\n\n";
             ih = 512;
             iw = 512;
@@ -126,6 +129,7 @@ public:
         CLScheduler::get().default_init(tuner_to_use);
 
         TICK(startup_time);
+        TICK(configure);
 
         /* Computation:
          * out = add_desc(addend, conv2d1x1(direct_conv)(input, weights, bias))
@@ -133,54 +137,64 @@ public:
         const auto data_type   = DataType::F32;
         const auto data_layout = DataLayout::NHWC;
         const PadStrideInfo conv_info{ 1, 1, pad_x, pad_y };
-        // const auto t_input_shape  = TensorShape(384, 12, 12);
-        // const auto t_weight_shape = TensorShape(384, 1, 1, 64);
-        // const auto t_dst_shape    = TensorShape(64, 12, 12);
-        const auto t_input_shape  = TensorShape(ifm, iw, ih);
-        const auto t_weight_shape = TensorShape(ifm, ww, wh, ofm);
-        const auto t_dst_shape    = misc::shape_calculator::compute_deep_convolution_shape(t_input_shape, data_layout, t_weight_shape, conv_info);
+        const auto t_input_shape     = TensorShape(ifm, iw, ih);
+        const auto t_weight_shape    = TensorShape(ifm, ww, wh, ofm);
+        const auto t_bias_shape      = TensorShape(ofm);
+        const auto t_l1_addend_shape = TensorShape(ofm, iw);
+        const auto t_dst_shape       = misc::shape_calculator::compute_deep_convolution_shape(t_input_shape, data_layout, t_weight_shape, conv_info);
         std::cout << "input_shape: " << t_input_shape << std::endl;
         std::cout << "weight_shape: " << t_weight_shape << std::endl;
+        std::cout << "bias_shape: " << t_bias_shape << std::endl;
+        std::cout << "addend_shape: " << t_l1_addend_shape << std::endl;
         std::cout << "dst_shape: " << t_dst_shape << std::endl;
         auto t_input_info  = TensorInfo(t_input_shape, 1, data_type, data_layout);
         auto t_weight_info = TensorInfo(t_weight_shape, 1, data_type, data_layout);
+        auto t_bias_info   = TensorInfo(t_bias_shape, 1, data_type, data_layout);
         auto t_l0_dst_info = TensorInfo(t_dst_shape, 1, data_type, data_layout); // Intermediate tensor for cond3
-        auto t_l1_addend_info = TensorInfo(t_dst_shape, 1, data_type, data_layout);
+        auto t_l1_addend_info = TensorInfo(t_l1_addend_shape, 1, data_type, data_layout);
         auto t_dst_info       = TensorInfo(t_dst_shape, 1, data_type, data_layout);
         // Init tensors
         {
             t_input.allocator()->init(t_input_info);
             t_weight.allocator()->init(t_weight_info);
+            t_bias.allocator()->init(t_bias_info);
             t_l1_addend.allocator()->init(t_dst_info);
             t_l0_dst.allocator()->init(t_l0_dst_info);
             t_dst.allocator()->init(t_dst_info);
         }
 
-        op0.configure(&t_input, &t_weight, nullptr, &t_l0_dst, conv_info);
+        op0.configure(&t_input, &t_weight, &t_bias, &t_l0_dst, conv_info);
         op1.configure(&t_l0_dst, &t_l1_addend, &t_dst, ConvertPolicy{});
+        TOCK(configure, measurements);
 
+        TICK(tensor_allocation);
         // Construct tensors
         // Allocate and fill tensors
         {
             t_input.allocator()->allocate();
             t_weight.allocator()->allocate();
+            t_bias.allocator()->allocate();
             t_l1_addend.allocator()->allocate();
             t_l0_dst.allocator()->allocate();
             t_dst.allocator()->allocate();
             fill_random_tensor(t_input, -1.f, 1.f);
             fill_random_tensor(t_weight, -1.f, 1.f);
+            fill_random_tensor(t_bias, -1.f, 1.f);
             fill_random_tensor(t_l1_addend, -1.f, 1.f);
         }
+        TOCK(tensor_allocation, measurements);
 
         // Dummy run for CLTuner
+        TICK(dummy_run);
         op0.run();
-        op1.run();
+        CLScheduler::get().sync();
+        TOCK(dummy_run, measurements);
         TOCK(startup_time, measurements);
         return true;
     }
 
     void do_run() override
     {
-        // Run the fused op
+        // Run the ops
         op0.run();
         op1.run();
@@ -199,6 +213,7 @@ public:
 private:
     CLTensor t_input{};
    CLTensor t_weight{};
+    CLTensor t_bias{};
     CLTensor t_l1_addend{};
     CLTensor t_l0_dst{};
     CLTensor t_dst{};
-- 
cgit v1.2.1
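
Note on the start-up breakdown this patch introduces: both examples now bracket each phase with nested TICK/TOCK pairs, so the single startup_time figure is split into configure, tensor_allocation and dummy_run. The stand-alone sketch below only illustrates that bracketing pattern; the TICK/TOCK definitions here are simplified stand-ins written for this note (the real macros live in the example sources), and the comments mark where the real work would go. The CLScheduler::get().sync() calls added before the dummy_run TOCKs matter because OpenCL work is enqueued asynchronously; without a sync the clock would measure only the enqueue, not the run.

    #include <chrono>
    #include <iostream>
    #include <map>
    #include <string>

    using std::chrono::duration_cast;
    using std::chrono::microseconds;

    // Simplified stand-ins for the TICK/TOCK macros used by the examples
    #define TICK(name) const auto name##_tick = std::chrono::high_resolution_clock::now()
    #define TOCK(name, measurements) \
        measurements[#name] = duration_cast<microseconds>(std::chrono::high_resolution_clock::now() - name##_tick)

    int main()
    {
        std::map<std::string, microseconds> measurements{};

        TICK(startup_time); // outer bracket: total start-up cost
        TICK(configure);
        // ... operator validation and configuration would run here ...
        TOCK(configure, measurements);

        TICK(tensor_allocation);
        // ... tensor allocation and filling would run here ...
        TOCK(tensor_allocation, measurements);

        TICK(dummy_run);
        // ... warm-up run; an OpenCL example syncs the queue here
        // (CLScheduler::get().sync()) so the bracket times the run itself ...
        TOCK(dummy_run, measurements);
        TOCK(startup_time, measurements);

        // The inner phases sum (roughly) to the outer startup_time bracket
        for(const auto &m : measurements)
        {
            std::cout << m.first << ": " << m.second.count() << " us\n";
        }
        return 0;
    }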