From 89a8cdd5425521f68674ac23a78790f0f6dc98ed Mon Sep 17 00:00:00 2001 From: wilisa01 Date: Mon, 22 Aug 2022 16:13:06 +0000 Subject: MLBEDSW-6755: Add per-layer performance to CSV file Dump the current per-layer performance estimation information that appears on the terminal to a CSV file. Change-Id: I00e94168704be8c3c674c8779fb807ed28607ccd Signed-off-by: wilisa01 --- ethosu/vela/compiler_driver.py | 8 ++- ethosu/vela/npu_performance.py | 121 ++++++++++++++++++++++++++++------------- ethosu/vela/vela.py | 4 +- 3 files changed, 91 insertions(+), 42 deletions(-) (limited to 'ethosu/vela') diff --git a/ethosu/vela/compiler_driver.py b/ethosu/vela/compiler_driver.py index 1d8756b5..cace0f08 100644 --- a/ethosu/vela/compiler_driver.py +++ b/ethosu/vela/compiler_driver.py @@ -1,4 +1,4 @@ -# Copyright (C) 2020-2021 Arm Limited or its affiliates. All rights reserved. +# Copyright (C) 2020-2022 Arm Limited or its affiliates. All rights reserved. # # SPDX-License-Identifier: Apache-2.0 # @@ -149,7 +149,7 @@ def _check_schedule(nng, arch, scheduler_options): ) -def compiler_driver(nng, arch, options, scheduler_options, network_type): +def compiler_driver(nng, arch, options, scheduler_options, network_type, output_basename): assert verify_graph_health(nng) # Pre-optimisation operator tracking @@ -254,4 +254,6 @@ def compiler_driver(nng, arch, options, scheduler_options, network_type): cpu_tensor_alignment=options.cpu_tensor_alignment, ) - npu_performance.calc_new_performance_for_network(nng, arch, network_type, options.verbose_performance) + npu_performance.calc_new_performance_for_network( + nng, arch, network_type, options.verbose_performance, output_basename + ) diff --git a/ethosu/vela/npu_performance.py b/ethosu/vela/npu_performance.py index 6d99dea0..b6ebe11f 100644 --- a/ethosu/vela/npu_performance.py +++ b/ethosu/vela/npu_performance.py @@ -1,4 +1,4 @@ -# Copyright (C) 2020-2021 Arm Limited or its affiliates. All rights reserved. +# Copyright (C) 2020-2022 Arm Limited or its affiliates. All rights reserved. # # SPDX-License-Identifier: Apache-2.0 # @@ -20,6 +20,7 @@ # Called during scheduling to evaluate different proposals, as well as post-scheduling to provide a final performance # estimate. import copy +import csv from enum import auto from enum import IntEnum from typing import Optional @@ -759,6 +760,7 @@ def print_performance( macs: dict, cycles: dict, mem_usage: dict, + output_basename: str, ): if network_type == NetworkType.TFLite: nng_optype_to_input_op_type = tflite_optype_to_builtintype @@ -793,41 +795,86 @@ def print_performance( f"Name:" ) - for sched_op in sg.sched_ops: - # get source op name - sched_op_src_uid = DebugDatabase._optimisedUID[sched_op.parent_op][1] - if sched_op_src_uid == DebugDatabase.NULLREF: - src_op_type = None - else: - src_op_type = suid_inv_map[sched_op_src_uid].type - - src_op_name = nng_optype_to_input_op_type(src_op_type) - - max_macs = cycles[sched_op][PassCycles.Total] * arch.num_macs_per_cycle * arch.ncores - peak_sram = ( - mem_usage[sched_op] / nng.memory_used[MemArea.Sram] * 100 if MemArea.Sram in nng.memory_used else 0 - ) - print( - f" {src_op_name:20s}" - f" {sched_op.op_type:20s}" - f" {mem_usage[sched_op]:10.0f}" - f" ({peak_sram:6.2f}%)" - f" {cycles[sched_op][PassCycles.Total]:10.0f}" - f" ({cycles[sched_op][PassCycles.Total] / nng.cycles[PassCycles.Total] * 100:6.2f}%)" - f" [" - f" {cycles[sched_op][PassCycles.Npu]:10.0f}" - f" {cycles[sched_op][PassCycles.SramAccess]:10.0f}" - f" {cycles[sched_op][PassCycles.DramAccess]:10.0f}" - f" {cycles[sched_op][PassCycles.OnChipFlashAccess]:10.0f}" - f" {cycles[sched_op][PassCycles.OffChipFlashAccess]:10.0f}" - f" ]" - f" {macs[sched_op]:10d}" - f" ({macs[sched_op] / nng.macs * 100:6.2f}% / {macs[sched_op] / max_macs * 100:6.2f}%)" - f" {sched_op.name:s}" - ) - - -def calc_new_performance_for_network(nng: Graph, arch, network_type: NetworkType, verbose_performance: bool): + with open(output_basename + "_per-layer.csv", "w", encoding="UTF8") as f: + writer = csv.writer(f) + header = [ + f"{network_type.name}_operator", + "NNG_operator", + "SRAM_usage", + "Peak", + "Op_cycles", + "Network", + "NPU", + "SRAM_AC", + "DRAM_AC", + "OnFlash_AC", + "OffFlash_AC", + "MAC_count", + "Network", + "Util", + "Name", + ] + writer.writerow(header) + + for sched_op in sg.sched_ops: + # get source op name + sched_op_src_uid = DebugDatabase._optimisedUID[sched_op.parent_op][1] + if sched_op_src_uid == DebugDatabase.NULLREF: + src_op_type = None + else: + src_op_type = suid_inv_map[sched_op_src_uid].type + + src_op_name = nng_optype_to_input_op_type(src_op_type) + + max_macs = cycles[sched_op][PassCycles.Total] * arch.num_macs_per_cycle * arch.ncores + peak_sram = ( + mem_usage[sched_op] / nng.memory_used[MemArea.Sram] * 100 if MemArea.Sram in nng.memory_used else 0 + ) + print( + f" {src_op_name:20s}" + f" {sched_op.op_type:20s}" + f" {mem_usage[sched_op]:10.0f}" + f" ({peak_sram:6.2f}%)" + f" {cycles[sched_op][PassCycles.Total]:10.0f}" + f" ({cycles[sched_op][PassCycles.Total] / nng.cycles[PassCycles.Total] * 100:6.2f}%)" + f" [" + f" {cycles[sched_op][PassCycles.Npu]:10.0f}" + f" {cycles[sched_op][PassCycles.SramAccess]:10.0f}" + f" {cycles[sched_op][PassCycles.DramAccess]:10.0f}" + f" {cycles[sched_op][PassCycles.OnChipFlashAccess]:10.0f}" + f" {cycles[sched_op][PassCycles.OffChipFlashAccess]:10.0f}" + f" ]" + f" {macs[sched_op]:10d}" + f" ({macs[sched_op] / nng.macs * 100:6.2f}% / {macs[sched_op] / max_macs * 100:6.2f}%)" + f" {sched_op.name:s}" + ) + data = [ + f"{src_op_name}", + f"{sched_op.op_type}", + f"{mem_usage[sched_op]}", + f"{peak_sram}", + f"{cycles[sched_op][PassCycles.Total]}", + f"{cycles[sched_op][PassCycles.Total] / nng.cycles[PassCycles.Total]}", + f"{cycles[sched_op][PassCycles.Npu]}", + f"{cycles[sched_op][PassCycles.SramAccess]}", + f"{cycles[sched_op][PassCycles.DramAccess]}", + f"{cycles[sched_op][PassCycles.OnChipFlashAccess]}", + f"{cycles[sched_op][PassCycles.OffChipFlashAccess]}", + f"{macs[sched_op]}", + f"{macs[sched_op] / nng.macs}", + f"{macs[sched_op] / max_macs}", + f"{sched_op.name}", + ] + writer.writerow(x for x in data) + + +def calc_new_performance_for_network( + nng: Graph, + arch, + network_type: NetworkType, + verbose_performance: bool, + output_basename: str = "output/unnamed_network", +): total_bws = make_bandwidth_array() total_macs = 0 total_cycles = np.zeros(PassCycles.Size) @@ -886,4 +933,4 @@ def calc_new_performance_for_network(nng: Graph, arch, network_type: NetworkType nng.total_npu_encoded_weights = total_encoded_weight_size if verbose_performance: - print_performance(nng, arch, network_type, bws, macs, cycles, mem_usage) + print_performance(nng, arch, network_type, bws, macs, cycles, mem_usage, output_basename) diff --git a/ethosu/vela/vela.py b/ethosu/vela/vela.py index 1de437bb..a42b2188 100644 --- a/ethosu/vela/vela.py +++ b/ethosu/vela/vela.py @@ -1,4 +1,4 @@ -# Copyright (C) 2020-2021 Arm Limited or its affiliates. All rights reserved. +# Copyright (C) 2020-2022 Arm Limited or its affiliates. All rights reserved. # # SPDX-License-Identifier: Apache-2.0 # @@ -75,7 +75,7 @@ def process(input_name, enable_debug_db, arch, model_reader_options, compiler_op print("Model reading took %f s" % (stop - start)) start = time.time() - compiler_driver.compiler_driver(nng, arch, compiler_options, scheduler_options, network_type) + compiler_driver.compiler_driver(nng, arch, compiler_options, scheduler_options, network_type, output_basename) summary_csv_file = "{0}_summary_{1}.csv".format(output_basename, arch.system_config) stats_writer.write_summary_metrics_csv(nng, summary_csv_file, arch) -- cgit v1.2.1