From 79d07d2cbf1c5013ab40bb46a6ccd4c569966536 Mon Sep 17 00:00:00 2001
From: Tim Hall
Date: Mon, 27 Apr 2020 18:20:16 +0100
Subject: Add Vela codebase

 - Added modules ethosu.vela and ethosu.mlw_codec.
 - Added README and various configuration files.

Change-Id: I3690f8c8f5966306ecddaeb2793c30ca9c6e2eee
---
 ethosu/vela/stats_writer.py | 367 ++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 367 insertions(+)
 create mode 100644 ethosu/vela/stats_writer.py

diff --git a/ethosu/vela/stats_writer.py b/ethosu/vela/stats_writer.py
new file mode 100644
index 00000000..c4b4cd9e
--- /dev/null
+++ b/ethosu/vela/stats_writer.py
@@ -0,0 +1,367 @@
+# Copyright (C) 2020 Arm Limited or its affiliates. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the License); you may
+# not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an AS IS BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+# Description:
+# Writes out per-pass and summary performance statistics to CSV files.
+
+import numpy as np
+from .nn_graph import MemArea, TensorPurpose, PassPlacement
+from .npu_performance import PassCycles, MacCount, BandwidthDirection
+import csv
+from .numeric_util import round_up_to_int
+import sys
+
+
+def write_summary_metrics_csv(nng, summary_filename, arch):
+    with open(summary_filename, "w") as f:
+        writer = csv.writer(f)
+
+        labels = [
+            "experiment",
+            "network",
+        ]
+
+        labels += (
+            ["accelerator_configuration", "system_config", "npu_clock", "sram_size"]
+            + [area.identifier_name() + "_bandwidth" for area in MemArea.all()]
+            + ["weights_storage_area", "feature_map_storage_area"]
+        )
+
+        labels += [
+            "inferences_per_second",
+            "batch_size",
+            "inference_time",
+            "passes_before_fusing",
+            "passes_after_fusing",
+        ]
+        labels += [area.identifier_name() + "_memory_used" for area in MemArea.all()]
+        labels += ["on_chip_flash_bits_per_element", "off_chip_flash_bits_per_element"]
+
+        for mem_area in MemArea.all():
+            labels += [
+                mem_area.identifier_name() + "_feature_map_read_bytes",
+                mem_area.identifier_name() + "_feature_map_write_bytes",
+                mem_area.identifier_name() + "_weight_read_bytes",
+                mem_area.identifier_name() + "_weight_write_bytes",
+                mem_area.identifier_name() + "_total_bytes",
+            ]
+
+        labels += ["nn_macs", "hardware_macs", "nn_tops", "hardware_tops"]
+
+        labels += ["cycles_" + kind.identifier_name() for kind in PassCycles.all()]
+
+        writer.writerow(labels)
+
+        data_items = [
+            "default",
+            nng.name,
+        ]
+
+        if arch:
+            data_items += (
+                [arch.accelerator_config, arch.system_config, arch.npu_clock, arch.sram_size / 1024]
+                + [arch.memory_bandwidths_per_second[mem_area] / 1000.0 / 1000 / 1000 for mem_area in MemArea.all()]
+                + [
+                    arch.tensor_storage_mem_area[TensorPurpose.Weights].display_name(),
+                    arch.tensor_storage_mem_area[TensorPurpose.FeatureMap].display_name(),
+                ]
+            )
+
+            midpoint_inference_time = nng.cycles[PassCycles.Total] / arch.npu_clock
+            midpoint_fps = 1 / midpoint_inference_time
+
+        n_passes = sum(len(sg.passes) for sg in nng.subgraphs)
+        n_cascaded_passes = sum(len(sg.cascaded_passes) for sg in nng.subgraphs)
+
+        data_items += [midpoint_fps, nng.batch_size, midpoint_inference_time, n_passes, n_cascaded_passes]
+        data_items += [nng.memory_used.get(mem_area, 0) / 1024.0 for mem_area in MemArea.all()]
+
+        data_items += [
+            nng.bits_per_element.get(MemArea.OnChipFlash, 0.0),
+            nng.bits_per_element.get(MemArea.OffChipFlash, 0.0),
+        ]
+
+        for mem_area in MemArea.all():
+            bws = nng.bandwidths[mem_area]
+            total_bw = np.sum(bws)
+            weight_bws = bws[TensorPurpose.Weights]
+            fm_bws = bws[TensorPurpose.FeatureMap]
+            data_items += [
+                fm_bws[BandwidthDirection.Read],
+                fm_bws[BandwidthDirection.Write],
+                weight_bws[BandwidthDirection.Read],
+                weight_bws[BandwidthDirection.Write],
+                total_bw,
+            ]
+
+        data_items += [
+            nng.macs[MacCount.NeuralNetworkMacs],
+            nng.macs[MacCount.HardwareMacs],
+            nng.macs[MacCount.NeuralNetworkMacs] * 2 * midpoint_fps / 1e12,
+            nng.macs[MacCount.HardwareMacs] * 2 * midpoint_fps / 1e12,
+        ]
+
+        data_items += [nng.cycles[kind] for kind in PassCycles.all()]
+
+        writer.writerow(data_items)
+
+
+def write_pass_metrics_csv(nng, pass_filename):
+
+    with open(pass_filename, "w") as f:
+        writer = csv.writer(f)
+
+        purpose_list = (
+            ("total", (TensorPurpose.Weights, TensorPurpose.FeatureMap)),
+            ("weights", (TensorPurpose.Weights,)),
+            ("feature_map", (TensorPurpose.FeatureMap,)),
+        )
+
+        direction_list = (
+            ("total", (BandwidthDirection.Read, BandwidthDirection.Write)),
+            ("read", (BandwidthDirection.Read,)),
+            ("write", (BandwidthDirection.Write,)),
+        )
+        bandwidth_names = []
+        bandwidth_indices = []
+        for mem_area in MemArea.all():
+            for purpose, purpose_candidates in purpose_list:
+                for direction, direction_candidates in direction_list:
+                    label = "bytes_%s_%s_%s" % (mem_area.identifier_name(), purpose, direction)
+                    bandwidth_names.append(label)
+                    bandwidth_indices.append((mem_area, purpose_candidates, direction_candidates))
+
+        all_macs = MacCount.all()
+        all_cycles = (
+            PassCycles.Total,
+            PassCycles.Dpu,
+            PassCycles.ElementWise,
+            PassCycles.Cpu,
+            PassCycles.SramAccess,
+            PassCycles.DramAccess,
+            PassCycles.OnChipFlashAccess,
+            PassCycles.OffChipFlashAccess,
+        )
+        writer.writerow(
+            [
+                "name",
+                "operators",
+                "placement",
+                "streaming_strategy",
+                "block_config_height",
+                "block_config_width",
+                "block_config_input_channels",
+                "block_config_output_channels",
+                "n_blocks_in_pass",
+            ]
+            + ["cycles_" + v.identifier_name() for v in all_cycles]
+            + [v.identifier_name() for v in all_macs]
+            + bandwidth_names
+            + ["sram_used"]
+        )
+
+        def write_subgraph(sg):
+            for cps in sg.cascaded_passes:
+                if cps.placement == PassPlacement.StartupInit:
+                    continue  # skip the dummy init pass
+
+                for ps in cps.passes:
+                    if len(ps.ops) == 1 and ps.ops[0].type == "NpuOp":
+                        # just treat this as a call, unroll it
+                        write_subgraph(ps.ops[0].attrs["subgraph"])
+                        continue
+                    stats = [ps.name, " ".join(op.type for op in ps.ops)]
+                    stats += [ps.placement.name]
+                    stats += [cps.strategy.name]
+                    stats += list(ps.block_config)
+                    stats += [ps.n_blocks]
+                    stats += [round_up_to_int(ps.cycles[v]) for v in all_cycles]
+                    stats += [round_up_to_int(ps.macs[v]) for v in all_macs]
+                    for indices in bandwidth_indices:
+                        res = 0
+                        i = indices[0]
+                        for j in indices[1]:
+                            for k in indices[2]:
+                                res += round_up_to_int(ps.bandwidths[i, j, k])
+                        stats.append(res)
+                    stats += [ps.sram_used]
+
+                    writer.writerow(stats)
+
+        write_subgraph(nng.get_root_subgraph())
+
+
+def print_performance_metrics_for_strat(
+    arch,
+    name,
+    cycles,
+    macs,
+    bandwidths,
+    batch_size,
+    memory_used,
+    num_passes,
+    num_cascaded_passes,
+    n_operations=0,
+    cpu_operations=[],
+    bits_per_element=None,
+    show_cpu_operations=False,
+    f=sys.stdout,
+):
+
+    orig_mem_areas_labels = [(v, v.display_name()) for v in MemArea.all()]
+
+    midpoint_inference_time = cycles[PassCycles.Total] / arch.npu_clock
+    midpoint_fps = 1 / midpoint_inference_time
+
+    mem_area_labels = [
+        (mem_area, label) for mem_area, label in orig_mem_areas_labels if np.sum(bandwidths[mem_area]) > 0
+    ]
+
+    if name:
+        print("", file=f)
+        print("Network summary for", name, file=f)
+    print("Accelerator configuration %20s" % (arch.accelerator_config,), file=f)
+    print("System configuration %20s" % (arch.system_config,), file=f)
+    print("Accelerator clock %12d MHz" % (arch.npu_clock / 1e6,), file=f)
+    for mem_area, label in mem_area_labels:
+        print(
+            "Design peak %-25s %12.2f GB/s"
+            % (label + " bandwidth", arch.memory_bandwidths_per_second[mem_area] / 1000.0 / 1000 / 1000,),
+            file=f,
+        )
+
+    print(file=f)
+    for mem_area, label in mem_area_labels:
+        if not mem_area in memory_used:
+            continue
+
+        aug_label = label + " used"
+
+        extra = ""
+        if (mem_area == MemArea.OnChipFlash or mem_area == MemArea.OffChipFlash) and bits_per_element is not None:
+            extra = " (%.2f bits per element)" % (bits_per_element[mem_area],)
+
+        print("Total %-25s %12.2f KiB%s" % (aug_label, memory_used[mem_area] / 1024.0, extra), file=f)
+
+    print(file=f)
+    print("%d passes fused into %d" % (num_passes, num_cascaded_passes), file=f)
+
+    n_cpu_operations = len(cpu_operations)
+    if n_operations > 0:
+        print(
+            "%d/%d (%4.1f %%) operations falling back to the CPU"
+            % (n_cpu_operations, n_operations, n_cpu_operations / n_operations * 100),
+            file=f,
+        )
+
+    if show_cpu_operations:
+        for op in cpu_operations:
+
+            def format_tens_list(lst):
+                return " ".join(str(list(tens.shape)) for tens in lst)
+
+            print(
+                "CPU operation: %s, inputs %s, outputs %s"
+                % (op.type, format_tens_list(op.inputs), format_tens_list(op.outputs)),
+                file=f,
+            )

+        print("", file=f)
+
+    for mem_area, label in mem_area_labels:
+        bws = bandwidths[mem_area]
+        total_bw = np.sum(bws)
+        weight_bws = bws[TensorPurpose.Weights]
+        fm_bws = bws[TensorPurpose.FeatureMap]
+        aug_label = label + " bandwidth"
+        print(
+            "Average %-25s %12.2f GB/s" % (aug_label, total_bw * midpoint_fps / 1000.0 / 1000.0 / 1000.0,),
+            file=f,
+        )
+        print(
+            "Input %-25s %12.2f MB/batch"
+            % (aug_label, np.sum(fm_bws[BandwidthDirection.Read]) / 1000.0 / 1000.0,),
+            file=f,
+        )
+        print("Weight %-25s %12.2f MB/batch" % (aug_label, np.sum(weight_bws) / 1000.0 / 1000.0,), file=f)
+        print(
+            "Output %-25s %12.2f MB/batch"
+            % (aug_label, np.sum(fm_bws[BandwidthDirection.Write]) / 1000.0 / 1000.0,),
+            file=f,
+        )
+        print("Total %-25s %12.2f MB/batch" % (aug_label, total_bw / 1000.0 / 1000.0,), file=f)
+        print(
+            "Total %-25s per input %9.2f MB/inference (batch size %d)"
+            % (aug_label, total_bw / 1000.0 / 1000.0 / batch_size, batch_size),
+            file=f,
+        )
+        print(file=f)
+
+    print("Neural network macs %12d MACs/batch" % (macs[MacCount.NeuralNetworkMacs],), file=f)
+    print("Hardware macs %12d MACs/batch" % (macs[MacCount.HardwareMacs],), file=f)
+    print(
+        "Network Tops/s %12.2f Tops/s"
+        % (macs[MacCount.NeuralNetworkMacs] * 2 * midpoint_fps / 1e12),
+        file=f,
+    )
+    print(
+        "Hardware Tops/s %12.2f Tops/s"
+        % (macs[MacCount.HardwareMacs] * 2 * midpoint_fps / 1e12),
+        file=f,
+    )
+    print(file=f)
+
+    for kind in PassCycles.all():
+        aug_label = kind.display_name() + " cycles"
+        cyc = cycles[kind]
+        print("%-30s %12d cycles/batch" % (aug_label, cyc,), file=f)
+    print(file=f)
+
+    print(
+        "Batch Inference time %7.2f ms, %7.2f inferences/s (batch size %d)"
+        % (midpoint_inference_time * 1000, midpoint_fps, batch_size),
+        file=f,
+    )
+    print(file=f)
+
+
+def print_performance_metrics(nng, arch, show_cpu_operations=False, f=sys.stdout):
+    n_passes = sum(len(sg.passes) for sg in nng.subgraphs)
+    n_cascaded_passes = sum(len(sg.cascaded_passes) for sg in nng.subgraphs)
+    n_operations = sum(len(ps.ops) for sg in nng.subgraphs for ps in sg.passes)
+    cpu_operations = sum((ps.ops for sg in nng.subgraphs for ps in sg.passes if ps.placement == PassPlacement.Cpu), [])
+    return print_performance_metrics_for_strat(
+        arch,
+        nng.name,
+        nng.cycles,
+        nng.macs,
+        nng.bandwidths,
+        nng.batch_size,
+        nng.memory_used,
+        n_passes,
+        n_cascaded_passes,
+        n_operations,
+        cpu_operations,
+        nng.bits_per_element,
+        show_cpu_operations,
+        f,
+    )
+
+
+def write_human_friendly_metrics(nng, arch, filename):
+    with open(filename, "w") as f:
+        print_performance_metrics(nng, arch, f=f)
--
cgit v1.2.1
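
For reference, the module added by this patch exposes four reporting entry points: write_summary_metrics_csv, write_pass_metrics_csv, write_human_friendly_metrics and print_performance_metrics. The sketch below is a minimal, hypothetical driver and is not part of the patch; it assumes an nng graph object and an arch architecture object that already carry the cycles, macs, bandwidths and memory_used annotations these writers read, and the report_performance helper and the file-name prefix are illustrative only.

    # Hypothetical driver sketch -- not part of the patch above.
    # Assumes `nng` (an annotated graph) and `arch` (architecture features)
    # already carry the cycles, macs, bandwidths and memory_used annotations.
    from ethosu.vela import stats_writer

    def report_performance(nng, arch, prefix="network"):
        # Per-pass breakdown and whole-network summary as CSV files.
        stats_writer.write_pass_metrics_csv(nng, prefix + "_pass.csv")
        stats_writer.write_summary_metrics_csv(nng, prefix + "_summary.csv", arch)
        # Human-readable summary, written to a text file and echoed to stdout.
        stats_writer.write_human_friendly_metrics(nng, arch, prefix + "_summary.txt")
        stats_writer.print_performance_metrics(nng, arch, show_cpu_operations=True)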