Diffstat (limited to 'ethosu/vela/stats_writer.py')
-rw-r--r--  ethosu/vela/stats_writer.py  367
1 file changed, 367 insertions, 0 deletions
diff --git a/ethosu/vela/stats_writer.py b/ethosu/vela/stats_writer.py
new file mode 100644
index 00000000..c4b4cd9e
--- /dev/null
+++ b/ethosu/vela/stats_writer.py
@@ -0,0 +1,367 @@
+# Copyright (C) 2020 Arm Limited or its affiliates. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the License); you may
+# not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an AS IS BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+# Description:
+# Writes out per-pass and summary performance statistics to CSV files.
+
+import csv
+import sys
+
+import numpy as np
+
+from .nn_graph import MemArea, TensorPurpose, PassPlacement
+from .npu_performance import PassCycles, MacCount, BandwidthDirection
+from .numeric_util import round_up_to_int
+
+
+def write_summary_metrics_csv(nng, summary_filename, arch):
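+    """Write a one-row CSV summarising whole-network performance metrics for nng on the given architecture."""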
+    with open(summary_filename, "w") as f:
+        writer = csv.writer(f)
+
+        labels = [
+            "experiment",
+            "network",
+        ]
+
+        labels += (
+            ["accelerator_configuration", "system_config", "npu_clock", "sram_size"]
+            + [area.identifier_name() + "_bandwidth" for area in MemArea.all()]
+            + ["weights_storage_area", "feature_map_storage_area"]
+        )
+
+        labels += [
+            "inferences_per_second",
+            "batch_size",
+            "inference_time",
+            "passes_before_fusing",
+            "passes_after_fusing",
+        ]
+        labels += [area.identifier_name() + "_memory_used" for area in MemArea.all()]
+        labels += ["on_chip_flash_bits_per_element", "off_chip_flash_bits_per_element"]
+
+        for mem_area in MemArea.all():
+            labels += [
+                mem_area.identifier_name() + "_feature_map_read_bytes",
+                mem_area.identifier_name() + "_feature_map_write_bytes",
+                mem_area.identifier_name() + "_weight_read_bytes",
+                mem_area.identifier_name() + "_weight_write_bytes",
+                mem_area.identifier_name() + "_total_bytes",
+            ]
+
+        labels += ["nn_macs", "hardware_macs", "nn_tops", "hardware_tops"]
+
+        labels += ["cycles_" + kind.identifier_name() for kind in PassCycles.all()]
+
+        writer.writerow(labels)
+
+        data_items = [
+            "default",
+            nng.name,
+        ]
+
+        if arch:
+            data_items += (
+                [arch.accelerator_config, arch.system_config, arch.npu_clock, arch.sram_size / 1024]
+                + [arch.memory_bandwidths_per_second[mem_area] / 1000.0 / 1000 / 1000 for mem_area in MemArea.all()]
+                + [
+                    arch.tensor_storage_mem_area[TensorPurpose.Weights].display_name(),
+                    arch.tensor_storage_mem_area[TensorPurpose.FeatureMap].display_name(),
+                ]
+            )
+
+        midpoint_inference_time = nng.cycles[PassCycles.Total] / arch.npu_clock
+        midpoint_fps = 1 / midpoint_inference_time
+
+        n_passes = sum(len(sg.passes) for sg in nng.subgraphs)
+        n_cascaded_passes = sum(len(sg.cascaded_passes) for sg in nng.subgraphs)
+
+        data_items += [midpoint_fps, nng.batch_size, midpoint_inference_time, n_passes, n_cascaded_passes]
+        data_items += [nng.memory_used.get(mem_area, 0) / 1024.0 for mem_area in MemArea.all()]
+
+        data_items += [
+            nng.bits_per_element.get(MemArea.OnChipFlash, 0.0),
+            nng.bits_per_element.get(MemArea.OffChipFlash, 0.0),
+        ]
+
+        for mem_area in MemArea.all():
+            bws = nng.bandwidths[mem_area]
+            total_bw = np.sum(bws)
+            weight_bws = bws[TensorPurpose.Weights]
+            fm_bws = bws[TensorPurpose.FeatureMap]
+            data_items += [
+                fm_bws[BandwidthDirection.Read],
+                fm_bws[BandwidthDirection.Write],
+                weight_bws[BandwidthDirection.Read],
+                weight_bws[BandwidthDirection.Write],
+                total_bw,
+            ]
+
+        data_items += [
+            nng.macs[MacCount.NeuralNetworkMacs],
+            nng.macs[MacCount.HardwareMacs],
+            nng.macs[MacCount.NeuralNetworkMacs] * 2 * midpoint_fps / 1e12,
+            nng.macs[MacCount.HardwareMacs] * 2 * midpoint_fps / 1e12,
+        ]
+
+        data_items += [nng.cycles[kind] for kind in PassCycles.all()]
+
+        writer.writerow(data_items)
+
+
+def write_pass_metrics_csv(nng, pass_filename):
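+    """Write a per-pass CSV: one row per pass with cycle, MAC, bandwidth and SRAM usage estimates."""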
+
+    with open(pass_filename, "w") as f:
+        writer = csv.writer(f)
+
+        purpose_list = (
+            ("total", (TensorPurpose.Weights, TensorPurpose.FeatureMap)),
+            ("weights", (TensorPurpose.Weights,)),
+            ("feature_map", (TensorPurpose.FeatureMap,)),
+        )
+
+        direction_list = (
+            ("total", (BandwidthDirection.Read, BandwidthDirection.Write)),
+            ("read", (BandwidthDirection.Read,)),
+            ("write", (BandwidthDirection.Write,)),
+        )
+        bandwidth_names = []
+        bandwidth_indices = []
+        for mem_area in MemArea.all():
+            for purpose, purpose_candidates in purpose_list:
+                for direction, direction_candidates in direction_list:
+                    label = "bytes_%s_%s_%s" % (mem_area.identifier_name(), purpose, direction)
+                    bandwidth_names.append(label)
+                    bandwidth_indices.append((mem_area, purpose_candidates, direction_candidates))
+
+        all_macs = MacCount.all()
+        all_cycles = (
+            PassCycles.Total,
+            PassCycles.Dpu,
+            PassCycles.ElementWise,
+            PassCycles.Cpu,
+            PassCycles.SramAccess,
+            PassCycles.DramAccess,
+            PassCycles.OnChipFlashAccess,
+            PassCycles.OffChipFlashAccess,
+        )
+        writer.writerow(
+            [
+                "name",
+                "operators",
+                "placement",
+                "streaming_strategy",
+                "block_config_height",
+                "block_config_width",
+                "block_config_input_channels",
+                "block_config_output_channels",
+                "n_blocks_in_pass",
+            ]
+            + ["cycles_" + v.identifier_name() for v in all_cycles]
+            + [v.identifier_name() for v in all_macs]
+            + bandwidth_names
+            + ["sram_used"]
+        )
+
+        def write_subgraph(sg):
+            for cps in sg.cascaded_passes:
+                if cps.placement == PassPlacement.StartupInit:
+                    continue  # skip the dummy init pass
+
+                for ps in cps.passes:
+                    if len(ps.ops) == 1 and ps.ops[0].type == "NpuOp":
+                        # just treat this as a call, unroll it
+                        write_subgraph(ps.ops[0].attrs["subgraph"])
+                        continue
+                    stats = [ps.name, " ".join(op.type for op in ps.ops)]
+                    stats += [ps.placement.name]
+                    stats += [cps.strategy.name]
+                    stats += list(ps.block_config)
+                    stats += [ps.n_blocks]
+                    stats += [round_up_to_int(ps.cycles[v]) for v in all_cycles]
+                    stats += [round_up_to_int(ps.macs[v]) for v in all_macs]
+                    for indices in bandwidth_indices:
+                        res = 0
+                        i = indices[0]
+                        for j in indices[1]:
+                            for k in indices[2]:
+                                res += round_up_to_int(ps.bandwidths[i, j, k])
+                        stats.append(res)
+                    stats += [ps.sram_used]
+
+                    writer.writerow(stats)
+
+        write_subgraph(nng.get_root_subgraph())
+
+
+def print_performance_metrics_for_strat(
+    arch,
+    name,
+    cycles,
+    macs,
+    bandwidths,
+    batch_size,
+    memory_used,
+    num_passes,
+    num_cascaded_passes,
+    n_operations=0,
+    cpu_operations=[],
+    bits_per_element=None,
+    show_cpu_operations=False,
+    f=sys.stdout,
+):
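+    """Print a human-readable summary of memory usage, bandwidth, MAC and cycle estimates to stream f."""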
+
+    orig_mem_areas_labels = [(v, v.display_name()) for v in MemArea.all()]
+
+    midpoint_inference_time = cycles[PassCycles.Total] / arch.npu_clock
+    midpoint_fps = 1 / midpoint_inference_time
+
+    mem_area_labels = [
+        (mem_area, label) for mem_area, label in orig_mem_areas_labels if np.sum(bandwidths[mem_area]) > 0
+    ]
+
+    if name:
+        print("", file=f)
+        print("Network summary for", name, file=f)
+    print("Accelerator configuration %20s" % (arch.accelerator_config,), file=f)
+    print("System configuration %20s" % (arch.system_config,), file=f)
+    print("Accelerator clock %12d MHz" % (arch.npu_clock / 1e6,), file=f)
+    for mem_area, label in mem_area_labels:
+        print(
+            "Design peak %-25s %12.2f GB/s"
+            % (label + " bandwidth", arch.memory_bandwidths_per_second[mem_area] / 1000.0 / 1000 / 1000,),
+            file=f,
+        )
+
+    print(file=f)
+    for mem_area, label in mem_area_labels:
+        if mem_area not in memory_used:
+            continue
+
+        aug_label = label + " used"
+
+        extra = ""
+        if (mem_area == MemArea.OnChipFlash or mem_area == MemArea.OffChipFlash) and bits_per_element is not None:
+            extra = " (%.2f bits per element)" % (bits_per_element[mem_area],)
+
+        print("Total %-25s %12.2f KiB%s" % (aug_label, memory_used[mem_area] / 1024.0, extra), file=f)
+
+    print(file=f)
+    print("%d passes fused into %d" % (num_passes, num_cascaded_passes), file=f)
+
+    n_cpu_operations = len(cpu_operations)
+    if n_operations > 0:
+        print(
+            "%d/%d (%4.1f %%) operations falling back to the CPU"
+            % (n_cpu_operations, n_operations, n_cpu_operations / n_operations * 100),
+            file=f,
+        )
+
+    if show_cpu_operations:
+        for op in cpu_operations:
+
+            def format_tens_list(lst):
+                return " ".join(str(list(tens.shape)) for tens in lst)
+
+            print(
+                "CPU operation: %s, inputs %s, outputs %s"
+                % (op.type, format_tens_list(op.inputs), format_tens_list(op.outputs)),
+                file=f,
+            )
+
+        print("", file=f)
+
+    for mem_area, label in mem_area_labels:
+        bws = bandwidths[mem_area]
+        total_bw = np.sum(bws)
+        weight_bws = bws[TensorPurpose.Weights]
+        fm_bws = bws[TensorPurpose.FeatureMap]
+        aug_label = label + " bandwidth"
+        print(
+            "Average %-25s %12.2f GB/s" % (aug_label, total_bw * midpoint_fps / 1000.0 / 1000.0 / 1000.0,),
+            file=f,
+        )
+        print(
+            "Input %-25s %12.2f MB/batch"
+            % (aug_label, np.sum(fm_bws[BandwidthDirection.Read]) / 1000.0 / 1000.0,),
+            file=f,
+        )
+        print("Weight %-25s %12.2f MB/batch" % (aug_label, np.sum(weight_bws) / 1000.0 / 1000.0,), file=f)
+        print(
+            "Output %-25s %12.2f MB/batch"
+            % (aug_label, np.sum(fm_bws[BandwidthDirection.Write]) / 1000.0 / 1000.0,),
+            file=f,
+        )
+        print("Total %-25s %12.2f MB/batch" % (aug_label, total_bw / 1000.0 / 1000.0,), file=f)
+        print(
+            "Total %-25s per input %9.2f MB/inference (batch size %d)"
+            % (aug_label, total_bw / 1000.0 / 1000.0 / batch_size, batch_size),
+            file=f,
+        )
+        print(file=f)
+
+    print("Neural network macs %12d MACs/batch" % (macs[MacCount.NeuralNetworkMacs],), file=f)
+    print("Hardware macs %12d MACs/batch" % (macs[MacCount.HardwareMacs],), file=f)
+    print(
+        "Network Tops/s %12.2f Tops/s"
+        % (macs[MacCount.NeuralNetworkMacs] * 2 * midpoint_fps / 1e12),
+        file=f,
+    )
+    print(
+        "Hardware Tops/s %12.2f Tops/s"
+        % (macs[MacCount.HardwareMacs] * 2 * midpoint_fps / 1e12),
+        file=f,
+    )
+    print(file=f)
+
+    for kind in PassCycles.all():
+        aug_label = kind.display_name() + " cycles"
+        cyc = cycles[kind]
+        print("%-30s %12d cycles/batch" % (aug_label, cyc,), file=f)
+    print(file=f)
+
+    print(
+        "Batch Inference time %7.2f ms, %7.2f inferences/s (batch size %d)"
+        % (midpoint_inference_time * 1000, midpoint_fps, batch_size),
+        file=f,
+    )
+    print(file=f)
+
+
+def print_performance_metrics(nng, arch, show_cpu_operations=False, f=sys.stdout):
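+    """Print the performance summary for a scheduled network, including any operations that fall back to the CPU."""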
+    n_passes = sum(len(sg.passes) for sg in nng.subgraphs)
+    n_cascaded_passes = sum(len(sg.cascaded_passes) for sg in nng.subgraphs)
+    n_operations = sum(len(ps.ops) for sg in nng.subgraphs for ps in sg.passes)
+    cpu_operations = sum((ps.ops for sg in nng.subgraphs for ps in sg.passes if ps.placement == PassPlacement.Cpu), [])
+    return print_performance_metrics_for_strat(
+        arch,
+        nng.name,
+        nng.cycles,
+        nng.macs,
+        nng.bandwidths,
+        nng.batch_size,
+        nng.memory_used,
+        n_passes,
+        n_cascaded_passes,
+        n_operations,
+        cpu_operations,
+        nng.bits_per_element,
+        show_cpu_operations,
+        f,
+    )
+
+
+def write_human_friendly_metrics(nng, arch, filename):
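+    """Write the human-readable performance summary to the given file."""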
+    with open(filename, "w") as f:
+        print_performance_metrics(nng, arch, f=f)