aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Tim Hall <tim.hall@arm.com> 2021-05-17 22:57:46 +0100
committer tim.hall <tim.hall@arm.com> 2021-05-21 15:42:06 +0000
commit 64556f32ff7bfca6036a6598034464b13b64a4ef (patch)
tree edf438352185ce88beddbd7f8a2ac62f459aacaa
parent 0fe2209156719ff0c17fab604dee3fef7d8b8fd7 (diff)
download ethos-u-vela-64556f32ff7bfca6036a6598034464b13b64a4ef.tar.gz
MLBEDSW-4219: Add tensor allocation info to summary
- Moved new tensor allocation info under --verbose-allocation flag - Tidied up and added histogram to --verbose-allocation print Signed-off-by: Tim Hall <tim.hall@arm.com> Change-Id: I76fb5187319aedf86f599f57b766220cafc17326
-rw-r--r-- ethosu/vela/greedy_allocation.py 6
-rw-r--r-- ethosu/vela/nn_graph.py 1
-rw-r--r-- ethosu/vela/scheduler.py 1
-rw-r--r-- ethosu/vela/stats_writer.py 8
-rw-r--r-- ethosu/vela/tensor_allocation.py 78
5 files changed, 50 insertions, 44 deletions
diff --git a/ethosu/vela/greedy_allocation.py b/ethosu/vela/greedy_allocation.py
index c68a507d..6f4f8019 100644
--- a/ethosu/vela/greedy_allocation.py
+++ b/ethosu/vela/greedy_allocation.py
@@ -58,7 +58,7 @@ class GreedyAllocator:
def dealloc(self, lr_to_dealloc):
self.current_allocs = [(start_addr, lr) for start_addr, lr in self.current_allocs if lr != lr_to_dealloc]
- def allocate_live_ranges(self, verbose_allocation, alignment):
+ def allocate_live_ranges(self, alignment):
lrs = set()
for lr in self.live_ranges.lrs:
lrs.add((lr.start_time, -lr.end_time, lr))
@@ -75,6 +75,6 @@ class GreedyAllocator:
return self.memory_required
-def allocate_live_ranges(nng, arch, live_ranges, mem_area, alignment, verbose_allocation=False):
+def allocate_live_ranges(nng, arch, live_ranges, mem_area, alignment):
g = GreedyAllocator(nng, arch, live_ranges, mem_area)
- return g.allocate_live_ranges(verbose_allocation, alignment)
+ return g.allocate_live_ranges(alignment)
diff --git a/ethosu/vela/nn_graph.py b/ethosu/vela/nn_graph.py
index adc7904f..f810df0f 100644
--- a/ethosu/vela/nn_graph.py
+++ b/ethosu/vela/nn_graph.py
@@ -154,7 +154,6 @@ class Subgraph:
self.memory_used = {}
self.memory_used_per_type = {}
- self.min_mem_usage = 0
def __str__(self):
return "<nng.Subgraph '%s', n_passes=%d, n_cascaded_passes=%d>" % (
diff --git a/ethosu/vela/scheduler.py b/ethosu/vela/scheduler.py
index 91bad46c..65d3313b 100644
--- a/ethosu/vela/scheduler.py
+++ b/ethosu/vela/scheduler.py
@@ -528,7 +528,6 @@ class DynamicProgrammingScheduler:
strat_set.bws,
self.nng.batch_size,
memory_used,
- self.sg.min_mem_usage,
len(self.sg.passes),
len(strat_set.strats),
)
diff --git a/ethosu/vela/stats_writer.py b/ethosu/vela/stats_writer.py
index 18b8092e..fbc47f80 100644
--- a/ethosu/vela/stats_writer.py
+++ b/ethosu/vela/stats_writer.py
@@ -227,7 +227,6 @@ def print_performance_metrics_for_strat(
bandwidths,
batch_size,
memory_used,
- min_mem_usage,
num_passes,
num_cascaded_passes,
n_operations=0,
@@ -270,11 +269,6 @@ def print_performance_metrics_for_strat(
aug_label = label + " used"
print(f"Total {aug_label:25} {memory_used[mem_area] / 1024.0:12.2f} KiB", file=f)
- if mem_area == MemArea.Sram and min_mem_usage:
- mem_used = memory_used[[mem_area for mem_area, label in mem_area_labels if "SRAM" in label][0]] / 1024.0
- fraction = (mem_used - min_mem_usage / 1024.0) / (min_mem_usage / 1024.0)
- print(f"Theoretical minimum SRAM usage{min_mem_usage/1024.0:23.2F} KiB", file=f)
- print(f"Allocator overhead{100*fraction:35.2F} %", file=f)
print(file=f)
print(f"{num_passes:d} passes fused into {num_cascaded_passes:d}", file=f)
@@ -364,7 +358,6 @@ def print_performance_metrics(nng, arch, show_cpu_operations=False, verbose_weig
n_cascaded_passes = sum(len(sg.cascaded_passes) for sg in nng.subgraphs)
n_operations = sum(len(ps.ops) for sg in nng.subgraphs for ps in sg.passes)
cpu_operations = sum((ps.ops for sg in nng.subgraphs for ps in sg.passes if ps.placement == PassPlacement.Cpu), [])
- min_mem_usage = max(sg.min_mem_usage for sg in nng.subgraphs)
weights_data = (
{
"original": nng.total_original_weights,
@@ -382,7 +375,6 @@ def print_performance_metrics(nng, arch, show_cpu_operations=False, verbose_weig
nng.bandwidths,
nng.batch_size,
nng.memory_used,
- min_mem_usage,
n_passes,
n_cascaded_passes,
n_operations,
diff --git a/ethosu/vela/tensor_allocation.py b/ethosu/vela/tensor_allocation.py
index 7ffc6f3d..724c7c0d 100644
--- a/ethosu/vela/tensor_allocation.py
+++ b/ethosu/vela/tensor_allocation.py
@@ -120,37 +120,53 @@ def mark_sram_used_for_cascaded_passes(sg, lrs):
ps.sram_used = sram_used
-def print_allocation(lrs, mem_area, mem_type_set, sg, verbose_allocation):
- if verbose_allocation:
- if mem_type_set == set((MemType.Permanent_NPU,)) or mem_type_set == set((MemType.Permanent_CPU,)):
- print("allocation for", mem_area, "- constant tensors in", sg.placement.name, "subgraph(s)")
- else:
- print("allocation for", mem_area, "- non-constant tensors in Cpu and Npu subgraphs")
- mem_usage = 0
- for start_time, start, end, name, end_time in sorted(
- (
- lr.start_time,
- tens.address,
- tens.address + int(math.ceil(tens.storage_size())),
- tens.name + " " + str(tens.purpose),
- lr.end_time,
- )
- for tens, lr in lrs.ranges.items()
- ):
- name = name.replace("\x00", "")
- print("%9d: %#12x - %#12x: %3d - %3d %s" % ((end - start), start, end, start_time, end_time, name))
- mem_usage = max(mem_usage, end)
- print("Memory usage: {} ({:#x}) bytes / {:.1f} KB".format(mem_usage, mem_usage, mem_usage / 1024))
- print()
-
-
-def calculate_allocation_efficiency(lrs: List[LiveRange]):
- size_at_time = [0] * (1 + max(lr.end_time for lr in lrs))
+def print_allocation(lrs, mem_area, mem_type_set, tensor_allocator, sg, actual_mem_usage_for_alloc):
+ print("\n" + "#" * 80)
+ sg_placement = (
+ sg.placement.name
+ if mem_type_set.intersection((MemType.Permanent_NPU, MemType.Permanent_CPU,))
+ else "Cpu and Npu"
+ )
+ print(
+ f"Tensor Allocation for mem_area {mem_area.name}, of mem_type_set ("
+ f'{", ".join(f"{mem_type.name}" for mem_type in mem_type_set)}'
+ f"), using allocator {tensor_allocator}, in {sg_placement} subgraph:"
+ )
+
+ memory_hist = memory_usage_histogram(lrs.lrs)
+ min_mem_usage_for_alloc = max(memory_hist)
+ print("Start Time - End Time: Start Addr - End Addr: Tensor Size: Memory Usage: Tensor Purpose: Tensor Name")
+ for start_time, end_time, size, start_addr, end_addr, purpose, name in sorted(
+ (lr.start_time, lr.end_time, lr.size, tens.address, tens.address + lr.size, tens.purpose, tens.name,)
+ for tens, lr in lrs.ranges.items()
+ ):
+ print(
+ f"{start_time:10d} - {end_time:10d}: {start_addr:#10x} - {end_addr:#10x}: {size:11d}:"
+ f" {memory_hist[start_time]:12d}: {purpose.display_name():15s}: {name:s}"
+ )
+
+ alloc_overhead_fraction = (actual_mem_usage_for_alloc - min_mem_usage_for_alloc) / min_mem_usage_for_alloc
+ print(
+ f"Allocation Peak Tensor Size: {min_mem_usage_for_alloc:9d} ({min_mem_usage_for_alloc:#10x})"
+ f" Bytes {min_mem_usage_for_alloc/1024.0:8.2f} KiB"
+ )
+ print(
+ f"Allocation Peak Memory Usage: {actual_mem_usage_for_alloc:9d} ({actual_mem_usage_for_alloc:#10x})"
+ f" Bytes {actual_mem_usage_for_alloc/1024.0:8.2f} KiB"
+ )
+ print(
+ f"Allocation Overhead: {actual_mem_usage_for_alloc-min_mem_usage_for_alloc:9d}"
+ f" Bytes ({100*alloc_overhead_fraction:.2f} %)"
+ )
+
+
+def memory_usage_histogram(lrs: List[LiveRange]):
+ histogram = [0] * (1 + max(lr.end_time for lr in lrs))
for lr in lrs:
for t in range(lr.start_time, lr.end_time + 1):
- size_at_time[t] += lr.size
+ histogram[t] += lr.size
- return max(size_at_time)
+ return histogram
def allocate_tensors(
@@ -180,7 +196,7 @@ def allocate_tensors(
if lrs.ranges:
tens_alloc = tensor_allocator
if tens_alloc == TensorAllocator.Greedy:
- total_sz = greedy_allocate_live_ranges(sg, arch, lrs, mem_area, cpu_tensor_alignment, verbose_allocation)
+ total_sz = greedy_allocate_live_ranges(sg, arch, lrs, mem_area, cpu_tensor_alignment)
verify_allocation(lrs, cpu_tensor_alignment)
elif tens_alloc == TensorAllocator.LinearAlloc:
total_sz = linear_allocate_live_ranges(lrs, cpu_tensor_alignment)
@@ -207,10 +223,10 @@ def allocate_tensors(
else:
sg.memory_used_per_type[mem_type] += total_sz
- print_allocation(lrs, mem_area, mem_type_set, sg, verbose_allocation)
+ if verbose_allocation:
+ print_allocation(lrs, mem_area, mem_type_set, tensor_allocator, sg, total_sz)
if mem_area == MemArea.Sram:
- sg.min_mem_usage = calculate_allocation_efficiency(lrs.lrs)
# Mark Sram usage for all subgraphs
for sg_ in nng.subgraphs:
mark_sram_used_for_cascaded_passes(sg_, lrs)