From b801dda26bbcff8ec4f7967d60f38239fd16912b Mon Sep 17 00:00:00 2001 From: Ayaan Masood Date: Tue, 22 Feb 2022 11:28:55 +0000 Subject: MLBEDSW-5880 Fixed Vela verbose weight flag *Original weights and encoded NPU weight now report correct size instead of zero when running vela with --verbose-weights flag (Code to update the aforementioned attributes was missing) *Removed print references to unencoded NPU weight size Change-Id: I6d3e41c04cc46d24eeb54cab89818a35e5df27be Signed-off-by: Ayaan Masood --- ethosu/vela/npu_performance.py | 33 +++++++++++++++++++++++++++++++-- ethosu/vela/scheduler.py | 3 +-- ethosu/vela/stats_writer.py | 8 +------- 3 files changed, 33 insertions(+), 11 deletions(-) diff --git a/ethosu/vela/npu_performance.py b/ethosu/vela/npu_performance.py index 21b420bf..08967f49 100644 --- a/ethosu/vela/npu_performance.py +++ b/ethosu/vela/npu_performance.py @@ -22,6 +22,8 @@ import copy from enum import auto from enum import IntEnum +from typing import Set +from uuid import UUID import numpy as np @@ -31,11 +33,13 @@ from .architecture_features import Accelerator from .architecture_features import NpuBlockType from .architecture_features import SHRAMElements from .architecture_features import TensorFormat +from .nn_graph import Graph from .numeric_util import round_up from .operation import Kernel from .operation import Op from .scheduler import Schedule from .scheduler import SchedulerOperation +from .scheduler import SchedulerOpInfo from .shape4d import Shape4D from .tensor import BandwidthDirection from .tensor import MemArea @@ -725,16 +729,39 @@ def estimate_full_op_performance( return bws, macs, cycles_a -def calc_new_performance_for_network(nng, arch): +def calc_new_performance_for_network(nng: Graph, arch): total_bws = make_bandwidth_array() total_macs = 0 total_cycles = np.zeros(PassCycles.Size) + total_weight_size = 0 + total_encoded_weight_size = 0 + + # Store unique instances of original/encoded weight tensor uuids to prevent double counting of weights + original_weight_uuids: Set[UUID] = set() + encoded_npu_weight_uuids: Set[UUID] = set() for sg in nng.subgraphs: prev_op = None for sched_op in sg.sched_ops: - op_info = sg.schedule.cost_map[sched_op] + op_info: SchedulerOpInfo = sg.schedule.cost_map[sched_op] bws, macs, cycles = estimate_full_op_performance(arch, sg.schedule, sched_op, prev_op, op_info.block_config) + + # Tensors for calculating weight sizes + original_weight = sched_op.parent_op.weights + encoded_npu_weight = op_info.npu_weights_tensor + + # Save UUIDs of original_weight so only unique instances of tensors are used to calculate weights + if original_weight and (original_weight.equivalence_id not in original_weight_uuids): + + original_weight_uuids.add(original_weight.equivalence_id) + total_weight_size += original_weight.values.itemsize * original_weight.values.size + + # Save UUIDs of encoded_npu_weight so only unique instances of tensors are used to calculate weights + if encoded_npu_weight and (encoded_npu_weight.equivalence_id not in encoded_npu_weight_uuids): + + encoded_npu_weight_uuids.add(encoded_npu_weight) + total_encoded_weight_size += len(encoded_npu_weight.buffer) + total_bws += bws total_macs += macs total_cycles += cycles @@ -743,3 +770,5 @@ def calc_new_performance_for_network(nng, arch): nng.bandwidths = total_bws nng.macs = total_macs nng.cycles = total_cycles + nng.total_original_weights = total_weight_size + nng.total_npu_encoded_weights = total_encoded_weight_size diff --git a/ethosu/vela/scheduler.py b/ethosu/vela/scheduler.py index 8f2426c1..6b084593 100644 --- a/ethosu/vela/scheduler.py +++ b/ethosu/vela/scheduler.py @@ -339,7 +339,7 @@ class Scheduler: self.nng = nng self.sg = sg self.arch = arch - self.sched_ops: List(SchedulerOperation) = [] + self.sched_ops: List[SchedulerOperation] = [] self.max_schedule = None self.scheduler_options = options @@ -459,7 +459,6 @@ class Scheduler: def create_initial_schedule(self) -> Schedule: """Creates an initial schedule with no cascading or buffering of any kind""" schedule = Schedule(self.sg, "MAX") - for op in self.sched_ops: cost = op.create_scheduler_info(self.nng, op.ofm.shape) cost.cycles = self.estimate_op_performance(op, cost.block_config, op.ofm.shape.depth) diff --git a/ethosu/vela/stats_writer.py b/ethosu/vela/stats_writer.py index 86f531a8..d8a274b0 100644 --- a/ethosu/vela/stats_writer.py +++ b/ethosu/vela/stats_writer.py @@ -110,7 +110,6 @@ def write_summary_metrics_csv(nng, summary_filename, arch): data_items += [midpoint_fps, nng.batch_size, midpoint_inference_time, n_passes, n_cascaded_passes] data_items += [nng.memory_used.get(mem_area, 0) / 1024.0 for mem_area in mem_areas] data_items += [nng.total_original_weights] - data_items += [nng.total_npu_weights] data_items += [nng.total_npu_encoded_weights] for mem_area in mem_areas: @@ -325,7 +324,6 @@ def print_performance_metrics_for_strat( if weights_data: print(f"Original Weights Size {weights_data['original'] / 1024.0:12.2f} KiB", file=f) - print(f"NPU Weights Size {weights_data['npu'] / 1024.0:12.2f} KiB", file=f) print(f"NPU Encoded Weights Size {weights_data['npu_encoded'] / 1024.0:12.2f} KiB", file=f) print(file=f) @@ -372,11 +370,7 @@ def print_performance_metrics(nng, arch, show_cpu_operations=False, verbose_weig npu_operations.append(op) weights_data = ( - { - "original": nng.total_original_weights, - "npu": nng.total_npu_weights, - "npu_encoded": nng.total_npu_encoded_weights, - } + {"original": nng.total_original_weights, "npu_encoded": nng.total_npu_encoded_weights} if verbose_weights else None ) -- cgit v1.2.1