From 837c31c2912386e4cdfac36f019dd73620970715 Mon Sep 17 00:00:00 2001 From: Tim Hall Date: Wed, 24 Nov 2021 15:39:46 +0000 Subject: MLBEDSW-5507: Fix vela summary for passes - Removed the passes information as this was no longer correct or useful - Fixed the reporting of the number of CPU operators Signed-off-by: Tim Hall Change-Id: I80bf3f023de7d470af9aa5c6fe7bcb58c60ccd0b --- PERFORMANCE.md | 15 ++++------ ethosu/vela/stats_writer.py | 73 ++++++++++++++++++++++++++------------------- 2 files changed, 48 insertions(+), 40 deletions(-) diff --git a/PERFORMANCE.md b/PERFORMANCE.md index 6bcfbbfe..9cd05b2d 100644 --- a/PERFORMANCE.md +++ b/PERFORMANCE.md @@ -18,8 +18,9 @@ Design peak Off-chip Flash bandwidth 0.50 GB/s Total SRAM used 0.95 KiB Total Off-chip Flash used 106.98 KiB -51 passes fused into 51 -0/106 (0.0%) operations falling back to the CPU +CPU operators = 0 (0.0%) +NPU operators = 44 (100.0%) + Average SRAM bandwidth 0.04 GB/s Input SRAM bandwidth 0.01 MB/batch Weight SRAM bandwidth 0.00 MB/batch @@ -115,15 +116,11 @@ system config and memory mode. ## Operator information -Information about cascading and operators. -The first line shows the number of passes (i.e. operations) and how many NPU -passes they have been fused or combined into. -The second line shows how many operators in the network are falling back to -the CPU (i.e. not supported by the NPU). +Information about the number of operators that will run on the CPU and NPU. ``` -51 passes fused into 51 -0/106 (0.0%) operations falling back to the CPU +CPU operators = 0 (0.0%) +NPU operators = 44 (100.0%) ``` ## Estimated memory bandwidth diff --git a/ethosu/vela/stats_writer.py b/ethosu/vela/stats_writer.py index 32e4fd55..86f531a8 100644 --- a/ethosu/vela/stats_writer.py +++ b/ethosu/vela/stats_writer.py @@ -227,10 +227,8 @@ def print_performance_metrics_for_strat( bandwidths, batch_size, memory_used, - num_passes, - num_cascaded_passes, - n_operations=0, cpu_operations=None, + npu_operations=None, show_cpu_operations=False, weights_data=None, f=sys.stdout, @@ -271,32 +269,32 @@ def print_performance_metrics_for_strat( print(f"Total {aug_label:25} {memory_used[mem_area] / 1024.0:12.2f} KiB", file=f) print(file=f) - print(f"{num_passes:d} passes fused into {num_cascaded_passes:d}", file=f) if cpu_operations is None: cpu_operations = [] + if npu_operations is None: + npu_operations = [] n_cpu_operations = len(cpu_operations) - if n_operations > 0: - print( - f"{n_cpu_operations:d}/{n_operations:d}" - f" ({n_cpu_operations / n_operations:4.1%}) operations falling back to the CPU", - file=f, - ) - - if show_cpu_operations: - for op in cpu_operations: - - def format_tens_list(lst): - return " ".join(str(list(tens.shape)) for tens in lst) - - print( - f"CPU operation: {op.type}" - f" inputs {format_tens_list(op.inputs)}, outputs {format_tens_list(op.outputs)}", - file=f, - ) - - print("", file=f) + n_npu_operations = len(npu_operations) + n_total_operations = n_cpu_operations + n_npu_operations + + def format_tens_list(lst): + return " ".join(str(list(tens.shape)) for tens in lst) + + for str_ops_type, n_ops, ops in ( + ("CPU", n_cpu_operations, cpu_operations), + ("NPU", n_npu_operations, npu_operations), + ): + print(f"{str_ops_type} operators = {n_ops:d} ({n_ops / n_total_operations:4.1%})", file=f) + if show_cpu_operations: + for op in ops: + print( + f" {str_ops_type}: {op.type} = {op.name}" + f" (inputs {format_tens_list(op.inputs)}, outputs {format_tens_list(op.outputs)})" + ) + + print("", file=f) for mem_area, label in mem_area_labels: bws = bandwidths[mem_area] @@ -354,10 +352,25 @@ def print_performance_metrics_for_strat( def print_performance_metrics(nng, arch, show_cpu_operations=False, verbose_weights=False, f=sys.stdout): - n_passes = sum(len(sg.passes) for sg in nng.subgraphs) - n_cascaded_passes = sum(len(sg.cascaded_passes) for sg in nng.subgraphs) - n_operations = sum(len(ps.ops) for sg in nng.subgraphs for ps in sg.passes) - cpu_operations = sum((ps.ops for sg in nng.subgraphs for ps in sg.passes if ps.placement == PassPlacement.Cpu), []) + cpu_operations = [] + npu_operations = [] + ir_only_ops = ( + Op.Const, + Op.Placeholder, + Op.CustomNpuOp, + Op.SubgraphInput, + ) + + for sg in nng.subgraphs: + if sg.placement == PassPlacement.Cpu: + for op in sg.get_all_ops(): + if op.type not in ir_only_ops: + cpu_operations.append(op) + elif sg.placement == PassPlacement.Npu: + for op in sg.get_all_ops(): + if op.type not in ir_only_ops: + npu_operations.append(op) + weights_data = ( { "original": nng.total_original_weights, @@ -375,10 +388,8 @@ def print_performance_metrics(nng, arch, show_cpu_operations=False, verbose_weig nng.bandwidths, nng.batch_size, nng.memory_used, - n_passes, - n_cascaded_passes, - n_operations, cpu_operations, + npu_operations, show_cpu_operations, weights_data, f, -- cgit v1.2.1