# Copyright (C) 2020 Arm Limited or its affiliates. All rights reserved. # # SPDX-License-Identifier: Apache-2.0 # # Licensed under the Apache License, Version 2.0 (the License); you may # not use this file except in compliance with the License. # You may obtain a copy of the License at # # www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an AS IS BASIS, WITHOUT # WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # Description: # Writes out per-pass and summary performance statistics to CSV files. import csv import sys import numpy as np from .nn_graph import PassPlacement from .npu_performance import BandwidthDirection from .npu_performance import MacCount from .npu_performance import PassCycles from .numeric_util import round_up_to_int from .operation import Op from .tensor import MemArea from .tensor import TensorPurpose def mem_areas_to_report(): # Exclude SHRAM, as the SHRAM performance numbers only cover LUT usage return [area for area in MemArea.all() if area != MemArea.Shram] def write_summary_metrics_csv(nng, summary_filename, arch): with open(summary_filename, "w") as f: writer = csv.writer(f) mem_areas = mem_areas_to_report() labels = [ "experiment", "network", ] labels += ( ["accelerator_configuration", "system_config", "npu_clock", "sram_size"] + [area.identifier_name() + "_bandwidth" for area in mem_areas] + ["weights_storage_area", "feature_map_storage_area"] ) labels += [ "inferences_per_second", "batch_size", "inference_time", "passes_before_fusing", "passes_after_fusing", ] labels += [area.identifier_name() + "_memory_used" for area in mem_areas] labels += ["on_chip_flash_bits_per_element", "off_chip_flash_bits_per_element"] for mem_area in mem_areas: labels += [ mem_area.identifier_name() + "_feature_map_read_bytes", mem_area.identifier_name() + "_feature_map_write_bytes", mem_area.identifier_name() + "_weight_read_bytes", mem_area.identifier_name() + "_weight_write_bytes", mem_area.identifier_name() + "_total_bytes", ] labels += ["nn_macs", "hardware_macs", "nn_tops", "hardware_tops"] labels += ["cycles_" + kind.identifier_name() for kind in PassCycles.all()] writer.writerow(labels) data_items = [ "default", nng.name, ] if arch: data_items += ( [arch.accelerator_config, arch.system_config, arch.npu_clock, arch.sram_size / 1024] + [arch.memory_bandwidths_per_second[mem_area] / 1000.0 / 1000 / 1000 for mem_area in mem_areas] + [ arch.tensor_storage_mem_area[TensorPurpose.Weights].display_name(), arch.tensor_storage_mem_area[TensorPurpose.FeatureMap].display_name(), ] ) midpoint_inference_time = nng.cycles[PassCycles.Total] / arch.npu_clock if midpoint_inference_time > 0: midpoint_fps = 1 / midpoint_inference_time else: midpoint_fps = np.nan n_passes = sum(len(sg.passes) for sg in nng.subgraphs) n_cascaded_passes = sum(len(sg.cascaded_passes) for sg in nng.subgraphs) data_items += [midpoint_fps, nng.batch_size, midpoint_inference_time, n_passes, n_cascaded_passes] data_items += [nng.memory_used.get(mem_area, 0) / 1024.0 for mem_area in mem_areas] data_items += [ nng.bits_per_element.get(MemArea.OnChipFlash, 0.0), nng.bits_per_element.get(MemArea.OffChipFlash, 0.0), ] for mem_area in mem_areas: bws = nng.bandwidths[mem_area] total_bw = np.sum(bws) weight_bws = bws[TensorPurpose.Weights] fm_bws = bws[TensorPurpose.FeatureMap] data_items += [ fm_bws[BandwidthDirection.Read], fm_bws[BandwidthDirection.Write], weight_bws[BandwidthDirection.Read], weight_bws[BandwidthDirection.Write], total_bw, ] data_items += [ nng.macs[MacCount.NeuralNetworkMacs], nng.macs[MacCount.HardwareMacs], nng.macs[MacCount.NeuralNetworkMacs] * 2 * midpoint_fps / 1e12, nng.macs[MacCount.HardwareMacs] * 2 * midpoint_fps / 1e12, ] data_items += [nng.cycles[kind] for kind in PassCycles.all()] writer.writerow(data_items) def write_pass_metrics_csv(nng, pass_filename): with open(pass_filename, "w") as f: writer = csv.writer(f) purpose_list = ( ("total", (TensorPurpose.Weights, TensorPurpose.FeatureMap)), ("weights", (TensorPurpose.Weights,)), ("feature_map", (TensorPurpose.FeatureMap,)), ) direction_list = ( ("total", (BandwidthDirection.Read, BandwidthDirection.Write)), ("read", (BandwidthDirection.Read,)), ("write", (BandwidthDirection.Write,)), ) bandwidth_names = [] bandwidth_indices = [] for mem_area in mem_areas_to_report(): for purpose, purpose_candidates in purpose_list: for direction, direction_candidates in direction_list: label = "bytes_{}_{}_{}".format(mem_area.identifier_name(), purpose, direction) bandwidth_names.append(label) bandwidth_indices.append((mem_area, purpose_candidates, direction_candidates)) all_macs = MacCount.all() all_cycles = ( PassCycles.Total, PassCycles.Npu, PassCycles.Cpu, PassCycles.SramAccess, PassCycles.DramAccess, PassCycles.OnChipFlashAccess, PassCycles.OffChipFlashAccess, ) writer.writerow( [ "name", "operators", "placement", "streaming_strategy", "block_config_height", "block_config_width", "block_config_input_channels", "block_config_output_channels", "n_blocks_in_pass", ] + ["cycles_" + v.identifier_name() for v in all_cycles] + [v.identifier_name() for v in all_macs] + bandwidth_names + ["sram_used"] ) def write_subgraph(sg): for cps in sg.cascaded_passes: if cps.placement == PassPlacement.StartupInit: continue # skip the dummy init pass for ps in cps.passes: if len(ps.ops) == 1 and ps.ops[0].type == Op.CustomNpuOp: # just treat this as a call, unroll it write_subgraph(ps.ops[0].attrs["subgraph"]) continue stats = [ps.name, " ".join(op.type.name for op in ps.ops)] stats += [ps.placement.name] stats += [cps.strategy.name] stats += list(ps.block_config) stats += [ps.n_blocks] stats += [round_up_to_int(ps.cycles[v]) for v in all_cycles] stats += [round_up_to_int(ps.macs[v]) for v in all_macs] for indices in bandwidth_indices: res = 0 i = indices[0] for j in indices[1]: for k in indices[2]: res += round_up_to_int(ps.bandwidths[i, j, k]) stats.append(res) try: stats += [ps.sram_used] except AttributeError: stats += [0] writer.writerow(stats) write_subgraph(nng.get_root_subgraph()) def print_performance_metrics_for_strat( arch, name, cycles, macs, bandwidths, batch_size, memory_used, num_passes, num_cascaded_passes, n_operations=0, cpu_operations=[], bits_per_element=None, show_cpu_operations=False, f=sys.stdout, ): orig_mem_areas_labels = [(v, v.display_name()) for v in mem_areas_to_report()] midpoint_inference_time = cycles[PassCycles.Total] / arch.npu_clock if midpoint_inference_time > 0: midpoint_fps = 1 / midpoint_inference_time else: midpoint_fps = np.nan mem_area_labels = [ (mem_area, label) for mem_area, label in orig_mem_areas_labels if np.sum(bandwidths[mem_area]) > 0 ] if name: print("", file=f) print("Network summary for", name, file=f) print("Accelerator configuration {:20}".format(arch.accelerator_config), file=f) print("System configuration {:20}".format(arch.system_config), file=f) print("Accelerator clock {:12d} MHz".format(int(arch.npu_clock / 1e6)), file=f) for mem_area, label in mem_area_labels: print( "Design peak {:25} {:12.2f} GB/s".format( label + " bandwidth", arch.memory_bandwidths_per_second[mem_area] / 1000.0 / 1000 / 1000 ), file=f, ) print(file=f) for mem_area, label in mem_area_labels: if mem_area not in memory_used: continue aug_label = label + " used" extra = "" if (mem_area == MemArea.OnChipFlash or mem_area == MemArea.OffChipFlash) and bits_per_element is not None: extra = " ({:.2f} bits per element)".format(bits_per_element[mem_area]) print("Total {:25} {:12.2f} KiB{}".format(aug_label, memory_used[mem_area] / 1024.0, extra), file=f) print(file=f) print("{:d} passes fused into {:d}".format(num_passes, num_cascaded_passes), file=f) n_cpu_operations = len(cpu_operations) if n_operations > 0: print( "{:d}/{:d} ({:4.1%}) operations falling back to the CPU".format( n_cpu_operations, n_operations, n_cpu_operations / n_operations * 100 ), file=f, ) if show_cpu_operations: for op in cpu_operations: def format_tens_list(lst): return " ".join(str(list(tens.shape)) for tens in lst) print( "CPU operation: {} inputs {}, outputs {}".format( op.type, format_tens_list(op.inputs), format_tens_list(op.outputs) ), file=f, ) print("", file=f) for mem_area, label in mem_area_labels: bws = bandwidths[mem_area] total_bw = np.sum(bws) weight_bws = bws[TensorPurpose.Weights] fm_bws = bws[TensorPurpose.FeatureMap] aug_label = label + " bandwidth" print( "Average {:25} {:12.2f} GB/s".format(aug_label, total_bw * midpoint_fps / 1000.0 / 1000.0 / 1000.0), file=f, ) print( "Input {:25} {:12.2f} MB/batch".format( aug_label, np.sum(fm_bws[BandwidthDirection.Read]) / 1000.0 / 1000.0 ), file=f, ) print("Weight {:25} {:12.2f} MB/batch".format(aug_label, np.sum(weight_bws) / 1000.0 / 1000.0), file=f) print( "Output {:25} {:12.2f} MB/batch".format( aug_label, np.sum(fm_bws[BandwidthDirection.Write]) / 1000.0 / 1000.0 ), file=f, ) print("Total {:25} {:12.2f} MB/batch".format(aug_label, total_bw / 1000.0 / 1000.0), file=f) print( "Total {:25} per input {:9.2f} MB/inference (batch size {:d})".format( aug_label, total_bw / 1000.0 / 1000.0 / batch_size, batch_size ), file=f, ) print(file=f) print( "Neural network macs {:12d} MACs/batch".format(int(macs[MacCount.NeuralNetworkMacs])), file=f, ) print("Hardware macs {:12d} MACs/batch".format(int(macs[MacCount.HardwareMacs])), file=f) print( "Network Tops/s {:12.2f} Tops/s".format( macs[MacCount.NeuralNetworkMacs] * 2 * midpoint_fps / 1e12 ), file=f, ) print( "Hardware Tops/s {:12.2f} Tops/s".format( macs[MacCount.HardwareMacs] * 2 * midpoint_fps / 1e12 ), file=f, ) print(file=f) for kind in PassCycles.all(): aug_label = kind.display_name() + " cycles" cyc = cycles[kind] print("{:30} {:12d} cycles/batch".format(aug_label, int(cyc)), file=f) print(file=f) print( "Batch Inference time {:7.2f} ms, {:7.2f} inferences/s (batch size {:d})".format( midpoint_inference_time * 1000, midpoint_fps, batch_size ), file=f, ) print(file=f) def print_performance_metrics(nng, arch, show_cpu_operations=False, f=sys.stdout): n_passes = sum(len(sg.passes) for sg in nng.subgraphs) n_cascaded_passes = sum(len(sg.cascaded_passes) for sg in nng.subgraphs) n_operations = sum(len(ps.ops) for sg in nng.subgraphs for ps in sg.passes) cpu_operations = sum((ps.ops for sg in nng.subgraphs for ps in sg.passes if ps.placement == PassPlacement.Cpu), []) return print_performance_metrics_for_strat( arch, nng.name, nng.cycles, nng.macs, nng.bandwidths, nng.batch_size, nng.memory_used, n_passes, n_cascaded_passes, n_operations, cpu_operations, nng.bits_per_element, show_cpu_operations, f, ) def write_human_friendly_metrics(nng, arch, filename): f = open(filename, "w") print_performance_metrics(nng, arch, f=f)