author    Tim Hall <tim.hall@arm.com>    2021-11-24 15:39:46 +0000
committer tim.hall <tim.hall@arm.com>    2021-11-25 14:36:32 +0000
commit    837c31c2912386e4cdfac36f019dd73620970715 (patch)
tree      ab8c822126eaab24f571ce2f9f01b8d0d29e9084
parent    3584a9cfdf0bcf0e75d38b78ec39e5b083947e19 (diff)
download  ethos-u-vela-837c31c2912386e4cdfac36f019dd73620970715.tar.gz
MLBEDSW-5507: Fix vela summary for passes
- Removed the passes information as this was no longer correct or useful
- Fixed the reporting of the number of CPU operators

Signed-off-by: Tim Hall <tim.hall@arm.com>
Change-Id: I80bf3f023de7d470af9aa5c6fe7bcb58c60ccd0b
-rw-r--r--  PERFORMANCE.md               15
-rw-r--r--  ethosu/vela/stats_writer.py  73
2 files changed, 48 insertions, 40 deletions
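For reference, the new summary lines are derived purely from the two operator counts. The following is a minimal standalone sketch of that calculation; the counts used here are illustrative placeholders, not values taken from a real network:

```python
# Minimal sketch of the new "<type> operators = <count> (<percentage>)" summary.
# The operator counts below are illustrative placeholders, not real network data.
cpu_operations = []                # e.g. nothing falls back to the CPU
npu_operations = [None] * 44       # e.g. 44 operators mapped to the NPU

n_cpu = len(cpu_operations)
n_npu = len(npu_operations)
n_total = n_cpu + n_npu

for label, n_ops in (("CPU", n_cpu), ("NPU", n_npu)):
    print(f"{label} operators = {n_ops:d} ({n_ops / n_total:4.1%})")
# -> CPU operators = 0 (0.0%)
# -> NPU operators = 44 (100.0%)
```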
diff --git a/PERFORMANCE.md b/PERFORMANCE.md
index 6bcfbbf..9cd05b2 100644
--- a/PERFORMANCE.md
+++ b/PERFORMANCE.md
@@ -18,8 +18,9 @@ Design peak Off-chip Flash bandwidth 0.50 GB/s
Total SRAM used 0.95 KiB
Total Off-chip Flash used 106.98 KiB
-51 passes fused into 51
-0/106 (0.0%) operations falling back to the CPU
+CPU operators = 0 (0.0%)
+NPU operators = 44 (100.0%)
+
Average SRAM bandwidth 0.04 GB/s
Input SRAM bandwidth 0.01 MB/batch
Weight SRAM bandwidth 0.00 MB/batch
@@ -115,15 +116,11 @@ system config and memory mode.
## Operator information
-Information about cascading and operators.
-The first line shows the number of passes (i.e. operations) and how many NPU
-passes they have been fused or combined into.
-The second line shows how many operators in the network are falling back to
-the CPU (i.e. not supported by the NPU).
+Information about the number of operators that will run on the CPU and NPU.
```
-51 passes fused into 51
-0/106 (0.0%) operations falling back to the CPU
+CPU operators = 0 (0.0%)
+NPU operators = 44 (100.0%)
```
## Estimated memory bandwidth
diff --git a/ethosu/vela/stats_writer.py b/ethosu/vela/stats_writer.py
index 32e4fd5..86f531a 100644
--- a/ethosu/vela/stats_writer.py
+++ b/ethosu/vela/stats_writer.py
@@ -227,10 +227,8 @@ def print_performance_metrics_for_strat(
bandwidths,
batch_size,
memory_used,
- num_passes,
- num_cascaded_passes,
- n_operations=0,
cpu_operations=None,
+ npu_operations=None,
show_cpu_operations=False,
weights_data=None,
f=sys.stdout,
@@ -271,32 +269,32 @@ def print_performance_metrics_for_strat(
print(f"Total {aug_label:25} {memory_used[mem_area] / 1024.0:12.2f} KiB", file=f)
print(file=f)
- print(f"{num_passes:d} passes fused into {num_cascaded_passes:d}", file=f)
if cpu_operations is None:
cpu_operations = []
+ if npu_operations is None:
+ npu_operations = []
n_cpu_operations = len(cpu_operations)
- if n_operations > 0:
- print(
- f"{n_cpu_operations:d}/{n_operations:d}"
- f" ({n_cpu_operations / n_operations:4.1%}) operations falling back to the CPU",
- file=f,
- )
-
- if show_cpu_operations:
- for op in cpu_operations:
-
- def format_tens_list(lst):
- return " ".join(str(list(tens.shape)) for tens in lst)
-
- print(
- f"CPU operation: {op.type}"
- f" inputs {format_tens_list(op.inputs)}, outputs {format_tens_list(op.outputs)}",
- file=f,
- )
-
- print("", file=f)
+ n_npu_operations = len(npu_operations)
+ n_total_operations = n_cpu_operations + n_npu_operations
+
+ def format_tens_list(lst):
+ return " ".join(str(list(tens.shape)) for tens in lst)
+
+ for str_ops_type, n_ops, ops in (
+ ("CPU", n_cpu_operations, cpu_operations),
+ ("NPU", n_npu_operations, npu_operations),
+ ):
+ print(f"{str_ops_type} operators = {n_ops:d} ({n_ops / n_total_operations:4.1%})", file=f)
+ if show_cpu_operations:
+ for op in ops:
+ print(
+ f" {str_ops_type}: {op.type} = {op.name}"
+ f" (inputs {format_tens_list(op.inputs)}, outputs {format_tens_list(op.outputs)})"
+ )
+
+ print("", file=f)
for mem_area, label in mem_area_labels:
bws = bandwidths[mem_area]
@@ -354,10 +352,25 @@ def print_performance_metrics_for_strat(
def print_performance_metrics(nng, arch, show_cpu_operations=False, verbose_weights=False, f=sys.stdout):
- n_passes = sum(len(sg.passes) for sg in nng.subgraphs)
- n_cascaded_passes = sum(len(sg.cascaded_passes) for sg in nng.subgraphs)
- n_operations = sum(len(ps.ops) for sg in nng.subgraphs for ps in sg.passes)
- cpu_operations = sum((ps.ops for sg in nng.subgraphs for ps in sg.passes if ps.placement == PassPlacement.Cpu), [])
+ cpu_operations = []
+ npu_operations = []
+ ir_only_ops = (
+ Op.Const,
+ Op.Placeholder,
+ Op.CustomNpuOp,
+ Op.SubgraphInput,
+ )
+
+ for sg in nng.subgraphs:
+ if sg.placement == PassPlacement.Cpu:
+ for op in sg.get_all_ops():
+ if op.type not in ir_only_ops:
+ cpu_operations.append(op)
+ elif sg.placement == PassPlacement.Npu:
+ for op in sg.get_all_ops():
+ if op.type not in ir_only_ops:
+ npu_operations.append(op)
+
weights_data = (
{
"original": nng.total_original_weights,
@@ -375,10 +388,8 @@ def print_performance_metrics(nng, arch, show_cpu_operations=False, verbose_weig
nng.bandwidths,
nng.batch_size,
nng.memory_used,
- n_passes,
- n_cascaded_passes,
- n_operations,
cpu_operations,
+ npu_operations,
show_cpu_operations,
weights_data,
f,
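As a rough illustration of the reworked collection loop in print_performance_metrics, the sketch below reproduces the placement-based split with minimal stand-in objects; the Subgraph and Op stubs are hypothetical and mirror only the attributes the new loop reads, not Vela's real classes.

```python
# Hypothetical stand-ins mirroring only what the new loop reads:
# sg.placement, sg.get_all_ops() and op.type. These are NOT Vela's real classes.
from collections import namedtuple

Op = namedtuple("Op", ["type", "name"])

class Subgraph:
    def __init__(self, placement, ops):
        self.placement = placement
        self.ops = ops

    def get_all_ops(self):
        return self.ops

# Ops that only exist in the intermediate representation and are never counted
IR_ONLY_OPS = {"Const", "Placeholder", "CustomNpuOp", "SubgraphInput"}

def split_operators(subgraphs):
    # Bucket every non-IR op by the placement of its subgraph
    cpu_operations = []
    npu_operations = []
    for sg in subgraphs:
        if sg.placement == "Cpu":
            target = cpu_operations
        elif sg.placement == "Npu":
            target = npu_operations
        else:
            continue
        for op in sg.get_all_ops():
            if op.type not in IR_ONLY_OPS:
                target.append(op)
    return cpu_operations, npu_operations

# Example: one NPU subgraph where the Const is skipped and two real ops are counted
npu_sg = Subgraph("Npu", [Op("Const", "weights"), Op("Conv2D", "conv1"), Op("Relu", "act1")])
cpu_ops, npu_ops = split_operators([npu_sg])
print(len(cpu_ops), len(npu_ops))  # 0 2
```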