author    Tim Hall <tim.hall@arm.com>  2022-03-03 17:50:52 +0000
committer Tim Hall <tim.hall@arm.com>  2022-05-17 20:06:24 +0100
commit    c1be0873d9e28a21c7873793da896e6dd576292f (patch)
tree      8d8392a7ae2a821e6dbc9f048343f9ccbe9e5828
parent    3dae1b6088a469f1073222bf249d17c8cdf39dbf (diff)
MLBEDSW-6271: MLCE: Layer wise Utilization info from Vela
- Added support to print per-operator SRAM usage and performance information
- Added a new CLI option --verbose-performance to control this feature

Signed-off-by: Tim Hall <tim.hall@arm.com>
Change-Id: I368599b410e5d441d9804871fc51b7a1049d85b3
-rw-r--r--  ethosu/vela/architecture_features.py    1
-rw-r--r--  ethosu/vela/compiler_driver.py           4
-rw-r--r--  ethosu/vela/npu_performance.py         107
-rw-r--r--  ethosu/vela/scheduler.py                 4
-rw-r--r--  ethosu/vela/vela.py                      2
5 files changed, 110 insertions, 8 deletions
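
For context, --verbose-performance is a plain store_true flag (see the vela.py hunk at the end of this patch), so the report can be enabled from the command line or when calling the entry point programmatically. A minimal sketch, where the network file name is a placeholder and all other options are left at their defaults:

    from ethosu.vela import vela

    # "my_network.tflite" is a placeholder; --verbose-performance is the flag added by this patch.
    vela.main(args=["my_network.tflite", "--verbose-performance"])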
diff --git a/ethosu/vela/architecture_features.py b/ethosu/vela/architecture_features.py
index 08ff260..679947f 100644
--- a/ethosu/vela/architecture_features.py
+++ b/ethosu/vela/architecture_features.py
@@ -301,6 +301,7 @@ class ArchitectureFeatures:
self.num_elem_wise_units = accel_config.elem_units
self.num_macs_per_cycle = dpu_min_height * dpu_min_width * dpu_dot_product_width * dpu_min_ofm_channels
+ assert self.num_macs_per_cycle == accel_config.macs, f"{self.num_macs_per_cycle} != {accel_config.macs}"
# Max value in address offsets
self.max_address_offset = 1 << axi_port_address_width
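
The assert added above cross-checks the MACs-per-cycle figure derived from the DPU dimensions against the value recorded in the accelerator configuration. An illustrative check with hypothetical dimensions, not taken from any particular Ethos-U configuration:

    # Hypothetical DPU dimensions, chosen only to illustrate the new assert.
    dpu_min_height, dpu_min_width = 1, 2
    dpu_dot_product_width, dpu_min_ofm_channels = 8, 8
    num_macs_per_cycle = dpu_min_height * dpu_min_width * dpu_dot_product_width * dpu_min_ofm_channels
    # The patch asserts this equals accel_config.macs (128 in this made-up case).
    assert num_macs_per_cycle == 128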
diff --git a/ethosu/vela/compiler_driver.py b/ethosu/vela/compiler_driver.py
index 2715c8f..cec37ef 100644
--- a/ethosu/vela/compiler_driver.py
+++ b/ethosu/vela/compiler_driver.py
@@ -60,6 +60,7 @@ class CompilerOptions:
verbose_register_command_stream=False,
verbose_operators=False,
verbose_weights=False,
+ verbose_performance=False,
show_cpu_operations=False,
tensor_allocator=TensorAllocator.Greedy,
timing=False,
@@ -77,6 +78,7 @@ class CompilerOptions:
self.verbose_register_command_stream = verbose_register_command_stream
self.verbose_operators = verbose_operators
self.verbose_weights = verbose_weights
+ self.verbose_performance = verbose_performance
self.show_cpu_operations = show_cpu_operations
self.tensor_allocator = tensor_allocator
self.timing = timing
@@ -250,4 +252,4 @@ def compiler_driver(nng, arch, options, scheduler_options, network_type):
cpu_tensor_alignment=options.cpu_tensor_alignment,
)
- npu_performance.calc_new_performance_for_network(nng, arch)
+ npu_performance.calc_new_performance_for_network(nng, arch, network_type, options.verbose_performance)
diff --git a/ethosu/vela/npu_performance.py b/ethosu/vela/npu_performance.py
index 0c8a907..b7607e6 100644
--- a/ethosu/vela/npu_performance.py
+++ b/ethosu/vela/npu_performance.py
@@ -31,10 +31,14 @@ import numpy as np
from . import numeric_util
from .architecture_allocator import ArchitectureBlockConfig
from .architecture_features import Accelerator
+from .architecture_features import ArchitectureFeatures
from .architecture_features import NpuBlockType
from .architecture_features import SHRAMElements
from .architecture_features import TensorFormat
+from .debug_database import DebugDatabase
from .nn_graph import Graph
+from .nn_graph import NetworkType
+from .nn_graph import PassPlacement
from .numeric_util import round_up
from .numeric_util import round_up_to_int
from .operation import Kernel
@@ -46,6 +50,8 @@ from .shape4d import Shape4D
from .tensor import BandwidthDirection
from .tensor import MemArea
from .tensor import TensorPurpose
+from .tflite_mapping import optype_to_builtintype as tflite_optype_to_builtintype
+from .tosa_mapping import optype_to_tosa_op_type as tosa_optype_to_tosa_op_type
from .weight_compressor import WeightKey
@@ -736,7 +742,81 @@ def estimate_full_op_performance(
return bws, macs, cycles_a
-def calc_new_performance_for_network(nng: Graph, arch):
+def print_performance(
+ nng: Graph,
+ arch: ArchitectureFeatures,
+ network_type: NetworkType,
+ bws: dict,
+ macs: dict,
+ cycles: dict,
+ mem_usage: dict,
+):
+ if network_type == NetworkType.TFLite:
+ nng_optype_to_input_op_type = tflite_optype_to_builtintype
+ else:
+ nng_optype_to_input_op_type = tosa_optype_to_tosa_op_type
+
+ suid_inv_map = {v: k for k, v in DebugDatabase._sourceUID.items()}
+
+ for sg in nng.subgraphs:
+
+ if sg.placement != PassPlacement.Npu:
+ continue
+
+ print(f"\n{str('#') * 80}")
+ print(f"Performance for NPU Subgraph {sg.name}")
+ print(
+ f" {network_type.name + str(' Operator:'):20s}"
+ f" {str('NNG Operator:'):20s}"
+ f" {str('SRAM Usage'):>10s}"
+ f" ({str('Peak'):>6s}%):"
+ f"{str('Op Cycles'):>10s}"
+ f" ({str('Netwrk'):>6s}%)"
+ f" ["
+ f" {str('NPU'):>10s}"
+ f" {str('SRAM AC'):>10s}"
+ f" {str('DRAM AC'):>10s}"
+ f" {str('OnFlash AC'):>10s}"
+ f" {str('OffFlashAC'):>10s}"
+ f" ]:"
+ f"{str('MAC Count'):>10s}"
+ f" ({str('Netwrk'):>6s}% / {str('Util'):>6s}%):"
+ f"Name:"
+ )
+
+ for sched_op in sg.sched_ops:
+ # get source op name
+ sched_op_src_uid = DebugDatabase._optimisedUID[sched_op.parent_op][1]
+ if sched_op_src_uid == DebugDatabase.NULLREF:
+ src_op_type = None
+ else:
+ src_op_type = suid_inv_map[sched_op_src_uid].type
+
+ src_op_name = nng_optype_to_input_op_type(src_op_type)
+
+ max_macs = cycles[sched_op][PassCycles.Total] * arch.num_macs_per_cycle * arch.ncores
+
+ print(
+ f" {src_op_name:20s}"
+ f" {sched_op.op_type:20s}"
+ f" {mem_usage[sched_op]:10.0f}"
+ f" ({mem_usage[sched_op] / nng.memory_used[MemArea.Sram] * 100:6.2f}%)"
+ f" {cycles[sched_op][PassCycles.Total]:10.0f}"
+ f" ({cycles[sched_op][PassCycles.Total] / nng.cycles[PassCycles.Total] * 100:6.2f}%)"
+ f" ["
+ f" {cycles[sched_op][PassCycles.Npu]:10.0f}"
+ f" {cycles[sched_op][PassCycles.SramAccess]:10.0f}"
+ f" {cycles[sched_op][PassCycles.DramAccess]:10.0f}"
+ f" {cycles[sched_op][PassCycles.OnChipFlashAccess]:10.0f}"
+ f" {cycles[sched_op][PassCycles.OffChipFlashAccess]:10.0f}"
+ f" ]"
+ f" {macs[sched_op]:10d}"
+ f" ({macs[sched_op] / nng.macs * 100:6.2f}% / {macs[sched_op] / max_macs * 100:6.2f}%)"
+ f" {sched_op.name:s}"
+ )
+
+
+def calc_new_performance_for_network(nng: Graph, arch, network_type: NetworkType, verbose_performance: bool):
total_bws = make_bandwidth_array()
total_macs = 0
total_cycles = np.zeros(PassCycles.Size)
@@ -747,11 +827,25 @@ def calc_new_performance_for_network(nng: Graph, arch):
original_weight_uuids: Set[UUID] = set()
encoded_npu_weight_uuids: Set[UUID] = set()
+ bws = {}
+ macs = {}
+ cycles = {}
+ mem_usage = {}
+
for sg in nng.subgraphs:
prev_op = None
for sched_op in sg.sched_ops:
op_info: SchedulerOpInfo = sg.schedule.cost_map[sched_op]
- bws, macs, cycles = estimate_full_op_performance(arch, sg.schedule, sched_op, prev_op, op_info.block_config)
+ bws[sched_op], macs[sched_op], cycles[sched_op] = estimate_full_op_performance(
+ arch, sg.schedule, sched_op, prev_op, op_info.block_config
+ )
+
+ # get op sram usage
+ mem_usage[sched_op] = (
+ sg.schedule.memory_snapshot[op_info.time_index]
+ if op_info.time_index < len(sg.schedule.memory_snapshot)
+ else 0
+ )
# Tensors for calculating weight sizes
original_weight = sched_op.parent_op.weights
@@ -769,9 +863,9 @@ def calc_new_performance_for_network(nng: Graph, arch):
encoded_npu_weight_uuids.add(encoded_npu_weight.equivalence_id)
total_encoded_weight_size += len(encoded_npu_weight.buffer)
- total_bws += bws
- total_macs += macs
- total_cycles += cycles
+ total_bws += bws[sched_op]
+ total_macs += macs[sched_op]
+ total_cycles += cycles[sched_op]
prev_op = sched_op
nng.bandwidths = total_bws
@@ -779,3 +873,6 @@ def calc_new_performance_for_network(nng: Graph, arch):
nng.cycles = total_cycles
nng.total_original_weights = total_weight_size
nng.total_npu_encoded_weights = total_encoded_weight_size
+
+ if verbose_performance:
+ print_performance(nng, arch, network_type, bws, macs, cycles, mem_usage)
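
The percentage columns that print_performance() emits follow directly from the per-operator figures collected above. A minimal sketch of the arithmetic, using an illustrative helper that is not part of the patch:

    from ethosu.vela.npu_performance import PassCycles
    from ethosu.vela.tensor import MemArea

    def per_op_percentages(op_cycles, op_macs, op_sram, nng, arch):
        # SRAM usage relative to the network's peak SRAM footprint ("Peak %")
        peak_sram_pct = op_sram / nng.memory_used[MemArea.Sram] * 100
        # Share of the whole network's cycle count ("Netwrk %")
        network_cycle_pct = op_cycles[PassCycles.Total] / nng.cycles[PassCycles.Total] * 100
        # Theoretical MAC ceiling for the cycles this operator occupies
        max_macs = op_cycles[PassCycles.Total] * arch.num_macs_per_cycle * arch.ncores
        network_mac_pct = op_macs / nng.macs * 100  # share of total MACs ("Netwrk %")
        util_pct = op_macs / max_macs * 100         # MAC utilisation ("Util %")
        return peak_sram_pct, network_cycle_pct, network_mac_pct, util_pct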
diff --git a/ethosu/vela/scheduler.py b/ethosu/vela/scheduler.py
index bc05876..d65f1dc 100644
--- a/ethosu/vela/scheduler.py
+++ b/ethosu/vela/scheduler.py
@@ -1006,7 +1006,7 @@ class Scheduler:
options: SchedulerOptions,
):
default_schedule = self.sg.schedule
- npu_performance.calc_new_performance_for_network(self.nng, self.arch)
+ npu_performance.calc_new_performance_for_network(self.nng, self.arch, None, False)
default_tot_cycles = self.nng.cycles[npu_performance.PassCycles.Total]
default_dram_cycles = self.nng.cycles[npu_performance.PassCycles.DramAccess]
@@ -1069,7 +1069,7 @@ class Scheduler:
self.apply_schedule(self.sg.schedule)
self.use_fast_storage_for_feature_maps(self.sg.schedule, options.optimization_sram_limit)
- npu_performance.calc_new_performance_for_network(self.nng, self.arch)
+ npu_performance.calc_new_performance_for_network(self.nng, self.arch, None, False)
new_tot_cycles = self.nng.cycles[npu_performance.PassCycles.Total]
new_dram_cycles = self.nng.cycles[npu_performance.PassCycles.DramAccess]
diff --git a/ethosu/vela/vela.py b/ethosu/vela/vela.py
index efe8edb..2108e20 100644
--- a/ethosu/vela/vela.py
+++ b/ethosu/vela/vela.py
@@ -341,6 +341,7 @@ def main(args=None):
)
parser.add_argument("--verbose-operators", action="store_true", help="Verbose operator list")
parser.add_argument("--verbose-weights", action="store_true", help="Verbose weights information")
+ parser.add_argument("--verbose-performance", action="store_true", help="Verbose performance information")
parser.add_argument(
"--show-cpu-operations", action="store_true", help="Show the operations that fall back to the CPU"
)
@@ -476,6 +477,7 @@ def main(args=None):
verbose_register_command_stream=args.verbose_register_command_stream,
verbose_operators=args.verbose_operators,
verbose_weights=args.verbose_weights,
+ verbose_performance=args.verbose_performance,
show_cpu_operations=args.show_cpu_operations,
tensor_allocator=args.tensor_allocator,
timing=args.timing,
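
When the compiler is driven from Python rather than the CLI, the new field is just another keyword argument on CompilerOptions. A minimal sketch, assuming the remaining constructor arguments keep the defaults shown in the compiler_driver.py hunk above:

    from ethosu.vela.compiler_driver import CompilerOptions

    # verbose_performance defaults to False; setting it to True makes the
    # compiler driver print the per-operator report via npu_performance.
    options = CompilerOptions(verbose_performance=True)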