aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJohan Alfvén <johan.alfven@arm.com>2022-10-07 18:03:48 +0200
committerJohan Alfvén <johan.alfven@arm.com>2022-10-26 10:11:09 +0200
commit3a6325f667bfdfce3031b2e6f92ef72759248c99 (patch)
treeb156fe4ba0c071c278027107e1f6680c96e49b29
parent0f2e59f8bff0ca68794db1406e1264531da1d3a5 (diff)
downloadethos-u-vela-3a6325f667bfdfce3031b2e6f92ef72759248c99.tar.gz
MLBEDSW-6984: Optimize fast storage for feature maps
- Remove very long live ranges that are standing out compared to its neighbors. This can be seen on large networks with complex structure. If they are chosen instead of shorter live ranges, it will be difficult for the HillClimb Allocator to find a perfect fit in the final allocation. Signed-off-by: Johan Alfven <johan.alfven@arm.com> Change-Id: I6cf23adfdc06c1e93e12e9cf816453d940ff31f7
-rw-r--r--ethosu/vela/scheduler.py52
1 files changed, 44 insertions, 8 deletions
diff --git a/ethosu/vela/scheduler.py b/ethosu/vela/scheduler.py
index 021bcc9e..fbe2e169 100644
--- a/ethosu/vela/scheduler.py
+++ b/ethosu/vela/scheduler.py
@@ -1281,9 +1281,9 @@ class Scheduler:
for tens in lr.tensors:
competing_tens_access[tens] = 0
- sz = len(competing_lrs)
- # All lrs and their tensors have been handled if sz is zero, we may thus return
- if sz == 0:
+ competing_lrs_sz = len(competing_lrs)
+ # All lrs and their tensors have been handled if competing_lrs_sz is zero, we may thus return
+ if competing_lrs_sz == 0:
return
# Estimate element access for all tensors that are competing for a place in fast-storage.
@@ -1307,16 +1307,52 @@ class Scheduler:
competing_tens_access[tens] += access.ofm_write
competing_lrs = sorted(competing_lrs, key=lambda lr: (lr.start_time, lr.end_time + 1, lr.size))
+
+ # Remove lrs that have a live range that is too long compared to others.
+ # They are causing problems for the HillClimb Allocator when it has to
+ # change the allocation indices, in order to fit all the allocations into SRAM.
+ # This problem only occur in larger networks with complex graphs.
+ #
+ # Limit the number of items for allocate_component to work with max MAX_EXHAUSTIVE_ITEMS
+ # at the time. Too many will give too long compilation time
+ #
+ # Too long is currently decided to be (based on experience, analyzing many networks):
+ # Compare lr at postion i with lr at position i + MAX_EXHAUSTIVE_ITEMS.
+ # If end time differs by at least MAX_EXHAUSTIVE_LIFE_RANGE then do not include lr at position i.
+ if competing_lrs_sz > FastStorageComponentAllocator.MAX_EXHAUSTIVE_ITEMS:
+ # create a copy of the original list to iterate over because the original version is modified in-loop
+ competing_lrs_copy = competing_lrs.copy()
+ for i, lr in enumerate(competing_lrs_copy):
+ lr_time = lr.end_time - lr.start_time
+ if lr_time < FastStorageComponentAllocator.MAX_EXHAUSTIVE_LIFE_RANGE:
+ # Skip small ranges
+ continue
+
+ # Compare current lr with lr at position lr + MAX_EXHAUSTIVE_ITEMS
+ cmp_pos = min(i + FastStorageComponentAllocator.MAX_EXHAUSTIVE_ITEMS, competing_lrs_sz - 1)
+
+ # Compare end times + plus a margin by MAX_EXHAUSTIVE_LIFE_RANGE
+ if (
+ lr.end_time
+ > competing_lrs_copy[cmp_pos].end_time + FastStorageComponentAllocator.MAX_EXHAUSTIVE_LIFE_RANGE
+ ):
+ # Current lr live time stands out, remove it. No use adding it to the
+ # evicted_fms list since the lr should not be included in the fast storage allocation
+ FastStorageComponentAllocator.evict(lr, max_mem_usage, self.scratched_fms)
+ competing_lrs.remove(lr)
+
start = 0
- start_time = competing_lrs[0].start_time
end_time = competing_lrs[0].end_time
+ competing_lrs_sz = len(competing_lrs)
component_allocator = FastStorageComponentAllocator(base_mem_usage, max_mem_usage, staging_limit)
# Build up components and then allocate each separately
for i, lr in enumerate(competing_lrs):
- if lr.start_time <= end_time and i - start < component_allocator.MAX_EXHAUSTIVE_LIFE_RANGE:
- start_time = min(start_time, lr.start_time)
+ nbr_items = i - start
+ if lr.start_time <= end_time and (nbr_items < FastStorageComponentAllocator.MAX_EXHAUSTIVE_ITEMS):
end_time = max(end_time, lr.end_time)
else:
+ # Number items reached max items or current lr's start time
+ # does not overlap with previous lr's end time
component_allocator.allocate_component(
component_allocator,
competing_lrs[start:i],
@@ -1328,11 +1364,10 @@ class Scheduler:
self.evicted_fms,
)
start = i
- start_time = lr.start_time
end_time = lr.end_time
component_allocator.allocate_component(
component_allocator,
- competing_lrs[start:sz],
+ competing_lrs[start:competing_lrs_sz],
max_mem_usage,
base_mem_usage,
staging_limit,
@@ -1446,6 +1481,7 @@ def _update_tensor_allocation(nng: Graph, arch: ArchitectureFeatures, options):
class FastStorageComponentAllocator:
MAX_EXHAUSTIVE_LIFE_RANGE = 20
+ MAX_EXHAUSTIVE_ITEMS = 20
def __init__(self, base_mem_usage, max_mem_usage, staging_limit):
self.base_mem_usage = base_mem_usage