From 64556f32ff7bfca6036a6598034464b13b64a4ef Mon Sep 17 00:00:00 2001
From: Tim Hall
Date: Mon, 17 May 2021 22:57:46 +0100
Subject: MLBEDSW-4219: Add tensor allocation info to summary

 - Moved new tensor allocation info under --verbose-allocation flag
 - Tidied up and added histogram to --verbose-allocation print

Signed-off-by: Tim Hall
Change-Id: I76fb5187319aedf86f599f57b766220cafc17326
---
 ethosu/vela/greedy_allocation.py |  6 ++--
 ethosu/vela/nn_graph.py          |  1 -
 ethosu/vela/scheduler.py         |  1 -
 ethosu/vela/stats_writer.py      |  8 -----
 ethosu/vela/tensor_allocation.py | 78 ++++++++++++++++++++++++----------------
 5 files changed, 50 insertions(+), 44 deletions(-)

diff --git a/ethosu/vela/greedy_allocation.py b/ethosu/vela/greedy_allocation.py
index c68a507d..6f4f8019 100644
--- a/ethosu/vela/greedy_allocation.py
+++ b/ethosu/vela/greedy_allocation.py
@@ -58,7 +58,7 @@ class GreedyAllocator:
     def dealloc(self, lr_to_dealloc):
         self.current_allocs = [(start_addr, lr) for start_addr, lr in self.current_allocs if lr != lr_to_dealloc]
 
-    def allocate_live_ranges(self, verbose_allocation, alignment):
+    def allocate_live_ranges(self, alignment):
         lrs = set()
         for lr in self.live_ranges.lrs:
             lrs.add((lr.start_time, -lr.end_time, lr))
@@ -75,6 +75,6 @@ class GreedyAllocator:
         return self.memory_required
 
 
-def allocate_live_ranges(nng, arch, live_ranges, mem_area, alignment, verbose_allocation=False):
+def allocate_live_ranges(nng, arch, live_ranges, mem_area, alignment):
     g = GreedyAllocator(nng, arch, live_ranges, mem_area)
-    return g.allocate_live_ranges(verbose_allocation, alignment)
+    return g.allocate_live_ranges(alignment)

diff --git a/ethosu/vela/nn_graph.py b/ethosu/vela/nn_graph.py
index adc7904f..f810df0f 100644
--- a/ethosu/vela/nn_graph.py
+++ b/ethosu/vela/nn_graph.py
@@ -154,7 +154,6 @@ class Subgraph:
 
         self.memory_used = {}
         self.memory_used_per_type = {}
-        self.min_mem_usage = 0
 
     def __str__(self):
         return "<nng.Subgraph '%s', n_passes=%d, n_cascaded_passes=%d>" % (

diff --git a/ethosu/vela/scheduler.py b/ethosu/vela/scheduler.py
index 91bad46c..65d3313b 100644
--- a/ethosu/vela/scheduler.py
+++ b/ethosu/vela/scheduler.py
@@ -528,7 +528,6 @@ class DynamicProgrammingScheduler:
             strat_set.bws,
             self.nng.batch_size,
             memory_used,
-            self.sg.min_mem_usage,
             len(self.sg.passes),
             len(strat_set.strats),
         )
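The greedy_allocation.py hunks above only drop an unused verbose_allocation parameter, but for reviewers new to this file it may help to see the scheme GreedyAllocator implements. The sketch below is a minimal stand-in under stated assumptions, not vela's implementation: LiveRange here models only the fields the allocator reads, and greedy_allocate is a hypothetical simplification of GreedyAllocator.allocate_live_ranges. It visits ranges in (start_time, -end_time) order, the same sort key that allocate_live_ranges builds above, and places each range at the lowest aligned address that does not clash with any temporally overlapping range.

from dataclasses import dataclass
from typing import List


@dataclass
class LiveRange:  # simplified stand-in for vela's LiveRange
    start_time: int
    end_time: int
    size: int
    address: int = 0

    def overlaps(self, other: "LiveRange") -> bool:
        # Two ranges conflict only if they are live at the same time.
        return self.start_time <= other.end_time and other.start_time <= self.end_time


def greedy_allocate(lrs: List[LiveRange], alignment: int = 16) -> int:
    """Place each range at the lowest aligned address free of conflicts."""
    allocated: List[LiveRange] = []
    # Visit ranges by start time, longest-lived first on ties, mirroring
    # the (lr.start_time, -lr.end_time, lr) key used in the patch.
    for lr in sorted(lrs, key=lambda r: (r.start_time, -r.end_time)):
        addr = 0
        for other in sorted(allocated, key=lambda r: r.address):
            if not lr.overlaps(other):
                continue
            if addr + lr.size <= other.address:
                break  # the gap below `other` is big enough
            # Bump past the conflicting range and re-align.
            addr = max(addr, other.address + other.size)
            addr = (addr + alignment - 1) // alignment * alignment
        lr.address = addr
        allocated.append(lr)
    # The memory required is the highest end address ever used.
    return max((r.address + r.size for r in allocated), default=0)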
diff --git a/ethosu/vela/stats_writer.py b/ethosu/vela/stats_writer.py
index 18b8092e..fbc47f80 100644
--- a/ethosu/vela/stats_writer.py
+++ b/ethosu/vela/stats_writer.py
@@ -227,7 +227,6 @@ def print_performance_metrics_for_strat(
     bandwidths,
     batch_size,
     memory_used,
-    min_mem_usage,
     num_passes,
     num_cascaded_passes,
     n_operations=0,
@@ -270,11 +269,6 @@ def print_performance_metrics_for_strat(
         aug_label = label + " used"
 
         print(f"Total {aug_label:25} {memory_used[mem_area] / 1024.0:12.2f} KiB", file=f)
 
-        if mem_area == MemArea.Sram and min_mem_usage:
-            mem_used = memory_used[[mem_area for mem_area, label in mem_area_labels if "SRAM" in label][0]] / 1024.0
-            fraction = (mem_used - min_mem_usage / 1024.0) / (min_mem_usage / 1024.0)
-            print(f"Theoretical minimum SRAM usage{min_mem_usage/1024.0:23.2F} KiB", file=f)
-            print(f"Allocator overhead{100*fraction:35.2F} %", file=f)
     print(file=f)
     print(f"{num_passes:d} passes fused into {num_cascaded_passes:d}", file=f)
@@ -364,7 +358,6 @@ def print_performance_metrics(nng, arch, show_cpu_operations=False, verbose_weig
     n_cascaded_passes = sum(len(sg.cascaded_passes) for sg in nng.subgraphs)
     n_operations = sum(len(ps.ops) for sg in nng.subgraphs for ps in sg.passes)
     cpu_operations = sum((ps.ops for sg in nng.subgraphs for ps in sg.passes if ps.placement == PassPlacement.Cpu), [])
-    min_mem_usage = max(sg.min_mem_usage for sg in nng.subgraphs)
     weights_data = (
         {
             "original": nng.total_original_weights,
@@ -382,7 +375,6 @@ def print_performance_metrics(nng, arch, show_cpu_operations=False, verbose_weig
         nng.bandwidths,
         nng.batch_size,
         nng.memory_used,
-        min_mem_usage,
         n_passes,
         n_cascaded_passes,
         n_operations,

diff --git a/ethosu/vela/tensor_allocation.py b/ethosu/vela/tensor_allocation.py
index 7ffc6f3d..724c7c0d 100644
--- a/ethosu/vela/tensor_allocation.py
+++ b/ethosu/vela/tensor_allocation.py
@@ -120,37 +120,53 @@ def mark_sram_used_for_cascaded_passes(sg, lrs):
         ps.sram_used = sram_used
 
 
-def print_allocation(lrs, mem_area, mem_type_set, sg, verbose_allocation):
-    if verbose_allocation:
-        if mem_type_set == set((MemType.Permanent_NPU,)) or mem_type_set == set((MemType.Permanent_CPU,)):
-            print("allocation for", mem_area, "- constant tensors in", sg.placement.name, "subgraph(s)")
-        else:
-            print("allocation for", mem_area, "- non-constant tensors in Cpu and Npu subgraphs")
-        mem_usage = 0
-        for start_time, start, end, name, end_time in sorted(
-            (
-                lr.start_time,
-                tens.address,
-                tens.address + int(math.ceil(tens.storage_size())),
-                tens.name + " " + str(tens.purpose),
-                lr.end_time,
-            )
-            for tens, lr in lrs.ranges.items()
-        ):
-            name = name.replace("\x00", "")
-            print("%9d: %#12x - %#12x: %3d - %3d %s" % ((end - start), start, end, start_time, end_time, name))
-            mem_usage = max(mem_usage, end)
-        print("Memory usage: {} ({:#x}) bytes / {:.1f} KB".format(mem_usage, mem_usage, mem_usage / 1024))
-        print()
-
-
-def calculate_allocation_efficiency(lrs: List[LiveRange]):
-    size_at_time = [0] * (1 + max(lr.end_time for lr in lrs))
+def print_allocation(lrs, mem_area, mem_type_set, tensor_allocator, sg, actual_mem_usage_for_alloc):
+    print("\n" + "#" * 80)
+    sg_placement = (
+        sg.placement.name
+        if mem_type_set.intersection((MemType.Permanent_NPU, MemType.Permanent_CPU,))
+        else "Cpu and Npu"
+    )
+    print(
+        f"Tensor Allocation for mem_area {mem_area.name}, of mem_type_set ("
+        f'{", ".join(f"{mem_type.name}" for mem_type in mem_type_set)}'
+        f"), using allocator {tensor_allocator}, in {sg_placement} subgraph:"
+    )
+
+    memory_hist = memory_usage_histogram(lrs.lrs)
+    min_mem_usage_for_alloc = max(memory_hist)
+    print("Start Time - End Time: Start Addr - End Addr: Tensor Size: Memory Usage: Tensor Purpose: Tensor Name")
+    for start_time, end_time, size, start_addr, end_addr, purpose, name in sorted(
+        (lr.start_time, lr.end_time, lr.size, tens.address, tens.address + lr.size, tens.purpose, tens.name,)
+        for tens, lr in lrs.ranges.items()
+    ):
+        print(
+            f"{start_time:10d} - {end_time:10d}: {start_addr:#10x} - {end_addr:#10x}: {size:11d}:"
+            f" {memory_hist[start_time]:12d}: {purpose.display_name():15s}: {name:s}"
+        )
+
+    alloc_overhead_fraction = (actual_mem_usage_for_alloc - min_mem_usage_for_alloc) / min_mem_usage_for_alloc
+    print(
+        f"Allocation Peak Tensor Size: {min_mem_usage_for_alloc:9d} ({min_mem_usage_for_alloc:#10x})"
+        f" Bytes {min_mem_usage_for_alloc/1024.0:8.2f} KiB"
+    )
+    print(
+        f"Allocation Peak Memory Usage: {actual_mem_usage_for_alloc:9d} ({actual_mem_usage_for_alloc:#10x})"
+        f" Bytes {actual_mem_usage_for_alloc/1024.0:8.2f} KiB"
+    )
+    print(
+        f"Allocation Overhead: {actual_mem_usage_for_alloc-min_mem_usage_for_alloc:9d}"
+        f" Bytes ({100*alloc_overhead_fraction:.2f} %)"
+    )
+
+
+def memory_usage_histogram(lrs: List[LiveRange]):
+    histogram = [0] * (1 + max(lr.end_time for lr in lrs))
     for lr in lrs:
         for t in range(lr.start_time, lr.end_time + 1):
-            size_at_time[t] += lr.size
+            histogram[t] += lr.size
 
-    return max(size_at_time)
+    return histogram
 
 
 def allocate_tensors(
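The hunk above turns calculate_allocation_efficiency into memory_usage_histogram, which now backs both the per-tensor "Memory Usage" column and the "Allocation Peak Tensor Size" line. A small hand-checkable run of the same algorithm may help review; MockLR below is a hypothetical stand-in for vela's LiveRange, modelling only the three fields the function reads.

from typing import List, NamedTuple


class MockLR(NamedTuple):  # hypothetical stand-in for vela's LiveRange
    start_time: int
    end_time: int
    size: int


def memory_usage_histogram(lrs: List[MockLR]) -> List[int]:
    # Same algorithm as the patched function: add each range's size to
    # every time step it spans (bounds inclusive).
    histogram = [0] * (1 + max(lr.end_time for lr in lrs))
    for lr in lrs:
        for t in range(lr.start_time, lr.end_time + 1):
            histogram[t] += lr.size
    return histogram


lrs = [MockLR(0, 2, 100), MockLR(1, 3, 50), MockLR(3, 4, 200)]
assert memory_usage_histogram(lrs) == [100, 150, 150, 250, 200]
# The peak, 250 bytes at t=3, is the smallest footprint any allocator
# could achieve for these live ranges - the "Allocation Peak Tensor Size".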
@@ -180,7 +196,7 @@
     if lrs.ranges:
         tens_alloc = tensor_allocator
         if tens_alloc == TensorAllocator.Greedy:
-            total_sz = greedy_allocate_live_ranges(sg, arch, lrs, mem_area, cpu_tensor_alignment, verbose_allocation)
+            total_sz = greedy_allocate_live_ranges(sg, arch, lrs, mem_area, cpu_tensor_alignment)
             verify_allocation(lrs, cpu_tensor_alignment)
         elif tens_alloc == TensorAllocator.LinearAlloc:
             total_sz = linear_allocate_live_ranges(lrs, cpu_tensor_alignment)
@@ -207,10 +223,10 @@
             else:
                 sg.memory_used_per_type[mem_type] += total_sz
 
-        print_allocation(lrs, mem_area, mem_type_set, sg, verbose_allocation)
+        if verbose_allocation:
+            print_allocation(lrs, mem_area, mem_type_set, tensor_allocator, sg, total_sz)
 
         if mem_area == MemArea.Sram:
-            sg.min_mem_usage = calculate_allocation_efficiency(lrs.lrs)
            # Mark Sram usage for all subgraphs
             for sg_ in nng.subgraphs:
                 mark_sram_used_for_cascaded_passes(sg_, lrs)
--
cgit v1.2.1
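Finally, to make the new verbose summary concrete, this is the overhead arithmetic print_allocation performs, using the invented peak from the histogram example above and an assumed allocator footprint of 300 bytes (the total_sz passed in from allocate_tensors):

min_mem_usage_for_alloc = 250     # peak of the example histogram
actual_mem_usage_for_alloc = 300  # assumed allocator result (total_sz)

overhead = actual_mem_usage_for_alloc - min_mem_usage_for_alloc
fraction = overhead / min_mem_usage_for_alloc
print(f"Allocation Overhead: {overhead} Bytes ({100 * fraction:.2f} %)")
# -> Allocation Overhead: 50 Bytes (20.00 %)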