diff options
Diffstat (limited to 'ethosu')
-rw-r--r-- | ethosu/vela/scheduler.py | 52 |
1 files changed, 44 insertions, 8 deletions
diff --git a/ethosu/vela/scheduler.py b/ethosu/vela/scheduler.py index 021bcc9e..fbe2e169 100644 --- a/ethosu/vela/scheduler.py +++ b/ethosu/vela/scheduler.py @@ -1281,9 +1281,9 @@ class Scheduler: for tens in lr.tensors: competing_tens_access[tens] = 0 - sz = len(competing_lrs) - # All lrs and their tensors have been handled if sz is zero, we may thus return - if sz == 0: + competing_lrs_sz = len(competing_lrs) + # All lrs and their tensors have been handled if competing_lrs_sz is zero, we may thus return + if competing_lrs_sz == 0: return # Estimate element access for all tensors that are competing for a place in fast-storage. @@ -1307,16 +1307,52 @@ class Scheduler: competing_tens_access[tens] += access.ofm_write competing_lrs = sorted(competing_lrs, key=lambda lr: (lr.start_time, lr.end_time + 1, lr.size)) + + # Remove lrs that have a live range that is too long compared to others. + # They are causing problems for the HillClimb Allocator when it has to + # change the allocation indices, in order to fit all the allocations into SRAM. + # This problem only occurs in larger networks with complex graphs. + # + # Limit the number of items for allocate_component to work with max MAX_EXHAUSTIVE_ITEMS + # at a time. Too many will give too long compilation time + # + # Too long is currently decided to be (based on experience, analyzing many networks): + # Compare lr at position i with lr at position i + MAX_EXHAUSTIVE_ITEMS. + # If the end time of lr at position i exceeds the other end time by more than MAX_EXHAUSTIVE_LIFE_RANGE then do not include lr at position i.
+ if competing_lrs_sz > FastStorageComponentAllocator.MAX_EXHAUSTIVE_ITEMS: + # create a copy of the original list to iterate over because the original version is modified in-loop + competing_lrs_copy = competing_lrs.copy() + for i, lr in enumerate(competing_lrs_copy): + lr_time = lr.end_time - lr.start_time + if lr_time < FastStorageComponentAllocator.MAX_EXHAUSTIVE_LIFE_RANGE: + # Skip small ranges + continue + + # Compare current lr with lr at position i + MAX_EXHAUSTIVE_ITEMS + cmp_pos = min(i + FastStorageComponentAllocator.MAX_EXHAUSTIVE_ITEMS, competing_lrs_sz - 1) + + # Compare end times, with a margin of MAX_EXHAUSTIVE_LIFE_RANGE + if ( + lr.end_time + > competing_lrs_copy[cmp_pos].end_time + FastStorageComponentAllocator.MAX_EXHAUSTIVE_LIFE_RANGE + ): + # Current lr live time stands out, remove it. No use adding it to the + # evicted_fms list since the lr should not be included in the fast storage allocation + FastStorageComponentAllocator.evict(lr, max_mem_usage, self.scratched_fms) + competing_lrs.remove(lr) + start = 0 - start_time = competing_lrs[0].start_time end_time = competing_lrs[0].end_time + competing_lrs_sz = len(competing_lrs) component_allocator = FastStorageComponentAllocator(base_mem_usage, max_mem_usage, staging_limit) # Build up components and then allocate each separately for i, lr in enumerate(competing_lrs): - if lr.start_time <= end_time and i - start < component_allocator.MAX_EXHAUSTIVE_LIFE_RANGE: - start_time = min(start_time, lr.start_time) + nbr_items = i - start + if lr.start_time <= end_time and (nbr_items < FastStorageComponentAllocator.MAX_EXHAUSTIVE_ITEMS): end_time = max(end_time, lr.end_time) else: + # Number of items reached the maximum, or the current lr's start time + # does not overlap with previous lr's end time component_allocator.allocate_component( component_allocator, competing_lrs[start:i], @@ -1328,11 +1364,10 @@ class Scheduler: self.evicted_fms, ) start = i - start_time = lr.start_time end_time = lr.end_time
component_allocator.allocate_component( component_allocator, - competing_lrs[start:sz], + competing_lrs[start:competing_lrs_sz], max_mem_usage, base_mem_usage, staging_limit, @@ -1446,6 +1481,7 @@ def _update_tensor_allocation(nng: Graph, arch: ArchitectureFeatures, options): class FastStorageComponentAllocator: MAX_EXHAUSTIVE_LIFE_RANGE = 20 + MAX_EXHAUSTIVE_ITEMS = 20 def __init__(self, base_mem_usage, max_mem_usage, staging_limit): self.base_mem_usage = base_mem_usage |