diff options
author | Louis Verhaard <louis.verhaard@arm.com> | 2021-03-30 10:18:28 +0200 |
---|---|---|
committer | Louis Verhaard <louis.verhaard@arm.com> | 2021-03-30 16:15:11 +0200 |
commit | 226ecaf4561f421206d1593eac0fa57dd56db82e (patch) | |
tree | 539c69b7f61d0d9452ec1554c861677dfb6a480e /ethosu/vela/tensor_allocation.py | |
parent | 3438c929528583bc019055ad7057c08271b0cee7 (diff) | |
download | ethos-u-vela-226ecaf4561f421206d1593eac0fa57dd56db82e.tar.gz |
Performance improvement in tensor allocation
- Tensor allocation verification was O(N^2), is now closer to O(N)
- Removed a sort in HillClimb allocator
Change-Id: I286a269881490c485cc2b0eeab3b1ecffa8f3df0
Signed-off-by: Louis Verhaard <louis.verhaard@arm.com>
Diffstat (limited to 'ethosu/vela/tensor_allocation.py')
-rw-r--r-- | ethosu/vela/tensor_allocation.py | 38 |
1 file changed, 19 insertions, 19 deletions
diff --git a/ethosu/vela/tensor_allocation.py b/ethosu/vela/tensor_allocation.py index b2ea7de6..0ad30e5f 100644 --- a/ethosu/vela/tensor_allocation.py +++ b/ethosu/vela/tensor_allocation.py @@ -17,6 +17,7 @@ # Wrapping function to do tensor address allocation. That is, assigning addresses to tensors based on what has been # worked out from the allowable overlaps that are calculated by the live range analysis. import math +from typing import List import numpy as np @@ -25,6 +26,7 @@ from . import live_range from . import numeric_util from .errors import AllocationError from .greedy_allocation import allocate_live_ranges as greedy_allocate_live_ranges +from .live_range import LiveRange from .live_range import LiveRangeGraph from .nn_graph import TensorAllocator from .tensor import MemArea @@ -65,14 +67,10 @@ def linear_allocate_live_ranges(live_ranges, alloc_granularity=Tensor.Allocation def hillclimb_allocate_live_ranges(live_ranges: LiveRangeGraph, alloc_granularity: int) -> int: # Allocates using the hill climb allocator - lr_set = {(lr.start_time, lr.end_time, lr) for lr in live_ranges.ranges.values()} - lr_list = [lr for _, _, lr in lr_set] - lr_list.sort() - - addresses = hillclimb_allocation.allocate_live_ranges(lr_list) + addresses = hillclimb_allocation.allocate_live_ranges(live_ranges.lrs) # The result is a list containing the allocated addresses total_sz = 0 - for lr, address in zip(lr_list, addresses): + for lr, address in zip(live_ranges.lrs, addresses): total_sz = max(total_sz, address + lr.size) lr.set_address(address) verify_allocation(live_ranges, alloc_granularity) @@ -80,7 +78,7 @@ def hillclimb_allocate_live_ranges(live_ranges: LiveRangeGraph, alloc_granularit def verify_alignment(live_ranges: LiveRangeGraph, alignment: int): - for lr in live_ranges.ranges.values(): + for lr in live_ranges.lrs: for tens in lr.tensors: if not all(op and op.run_on_npu for op in tens.ops + tens.consumer_list): # This is a CPU tensor, verify alignment @@ -89,12 
+87,16 @@ def verify_alignment(live_ranges: LiveRangeGraph, alignment: int): def verify_allocation(live_ranges: LiveRangeGraph, alignment: int): - lrs = list(live_ranges.ranges.values()) - for n in lrs: - verify_alignment(live_ranges, alignment) - - for m in lrs: - if n != m and n.overlaps_ranges(m): + verify_alignment(live_ranges, alignment) + nr_time_slots = 1 + max(lr.end_time for lr in live_ranges.lrs) + # Contains active live ranges at each timestamp + lrs_at_time = [[] for i in range(nr_time_slots)] + for lr in live_ranges.lrs: + for t in range(lr.start_time, lr.end_time + 1): + lrs_at_time[t].append(lr) + for t in range(nr_time_slots): + for ix, n in enumerate(lrs_at_time[t]): + for m in lrs_at_time[t][ix + 1 :]: overlap, tens_n, tens_m = n.overlaps_address(m) if overlap and not (tens_n.equivalent(tens_m) and tens_n.address == tens_m.address): raise AllocationError( @@ -142,11 +144,9 @@ def print_allocation(lrs, mem_area, mem_type_set, sg, verbose_allocation): print() -def calculate_allocation_efficiency(lrs): - lr_set = set(lrs.ranges.values()) - - size_at_time = [0] * (1 + max(lr.end_time for lr in lr_set)) - for lr in lr_set: +def calculate_allocation_efficiency(lrs: List[LiveRange]): + size_at_time = [0] * (1 + max(lr.end_time for lr in lrs)) + for lr in lrs: for t in range(lr.start_time, lr.end_time + 1): size_at_time[t] += lr.size @@ -210,7 +210,7 @@ def allocate_tensors( print_allocation(lrs, mem_area, mem_type_set, sg, verbose_allocation) if mem_area == MemArea.Sram: - sg.min_mem_usage = calculate_allocation_efficiency(lrs) + sg.min_mem_usage = calculate_allocation_efficiency(lrs.lrs) # Mark Sram usage for all subgraphs for sg_ in nng.subgraphs: mark_sram_used_for_cascaded_passes(sg_, lrs) |