author      Johan Alfven <johan.alfven@arm.com>         2023-02-28 09:03:03 +0100
committer   Fredrik Svedberg <fredrik.svedberg@arm.com> 2023-03-13 08:40:35 +0000
commit      6e281afe19ea0cd9dba2cecfb73050c18f29d242 (patch)
tree        7741feb7a6ac2f5d7822be8dc46b43f0589aca53
parent      c72cac8e8beb6bd52bdf6a41e6f7182b5167ee5d (diff)
download    ethos-u-vela-6e281afe19ea0cd9dba2cecfb73050c18f29d242.tar.gz
MLBEDSW-7393: MLCE: Optimize compile time for large networks
- There is a problem with large networks containing many NPU subgraphs:
  scheduling takes too long since the snapshot memory calculation always
  does a complete update for the full graph.
- A complete run is still needed at the end to calculate all the time
  indexes correctly. However, when scheduling an NPU subgraph it is enough
  to extract live ranges for the current schedule and its operators.

Change-Id: Iccb7d6728119c1428ad0b45a2ac34e92158c15bd
Signed-off-by: Johan Alfven <johan.alfven@arm.com>
-rw-r--r--  ethosu/vela/live_range.py    4
-rw-r--r--  ethosu/vela/scheduler.py    45
2 files changed, 38 insertions, 11 deletions
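
The core idea of the patch, sketched below as a toy model: while scheduling, each NPU subgraph only extracts live ranges for its own schedule, and a single full-graph pass afterwards rebuilds the memory snapshot with globally consistent time indexes. This is an illustrative sketch only; Subgraph, collect_live_ranges and schedule_all are placeholders, not vela's API.

from typing import List


class Subgraph:
    """Stand-in for an NPU subgraph; holds per-op tensor sizes and a snapshot."""

    def __init__(self, name: str, tensor_sizes: List[int]):
        self.name = name
        self.tensor_sizes = tensor_sizes
        self.memory_snapshot: List[int] = []


def collect_live_ranges(subgraphs: List[Subgraph]) -> List[int]:
    """Rough stand-in for live range extraction: one usage value per op."""
    usage: List[int] = []
    for sg in subgraphs:
        usage.extend(sg.tensor_sizes)
    return usage


def schedule_all(subgraphs: List[Subgraph]) -> None:
    for sg in subgraphs:
        # Old behaviour: a full-graph extraction here for every subgraph being
        # scheduled, i.e. work that grows quadratically with the network size.
        # New behaviour: only look at the subgraph currently being scheduled.
        sg.memory_snapshot = collect_live_ranges([sg])

    # One complete pass at the end, so every subgraph gets a snapshot with
    # globally consistent time indexes (the role played by
    # _update_memory_snapshot_for_all_npu_graphs in the patch below).
    full_usage = collect_live_ranges(subgraphs)
    for sg in subgraphs:
        sg.memory_snapshot = full_usage


if __name__ == "__main__":
    graphs = [Subgraph("sg0", [16, 32]), Subgraph("sg1", [64, 8, 8])]
    schedule_all(graphs)
    print(max(graphs[0].memory_snapshot, default=0))  # peak usage across the graph: 64
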
diff --git a/ethosu/vela/live_range.py b/ethosu/vela/live_range.py
index 6a2a04ac..05e481e0 100644
--- a/ethosu/vela/live_range.py
+++ b/ethosu/vela/live_range.py
@@ -251,7 +251,7 @@ def extract_live_ranges_from_cascaded_passes(
# If the primary-op is an NpuOp that means this is where an Npu subgraph
# is called. Go into said subgraph and extract live ranges before continuing.
# Use default allocation alignment of 16 for Npu tensors
- lr_graph = _extract_live_ranges_from_schedule(
+ lr_graph = extract_live_ranges_from_schedule(
op_subgraph, target_mem_area, target_mem_type_set, lr_graph
)
else:
@@ -316,7 +316,7 @@ def create_linear_live_range_graph(sg, target_mem_area, target_mem_type_set, lr_
return lr_graph
-def _extract_live_ranges_from_schedule(sg, target_mem_area, target_mem_type_set, lr_graph):
+def extract_live_ranges_from_schedule(sg, target_mem_area, target_mem_type_set, lr_graph):
time_for_cascade = {}
for sched_op in sg.sched_ops:
op_info = sg.schedule.cost_map[sched_op]
diff --git a/ethosu/vela/scheduler.py b/ethosu/vela/scheduler.py
index eeed44fd..a50f262e 100644
--- a/ethosu/vela/scheduler.py
+++ b/ethosu/vela/scheduler.py
@@ -537,12 +537,11 @@ class Scheduler:
# Collect live ranges from tensors
lr_graph = live_range.LiveRangeGraph()
for mem_area, mem_type_set in memories_list:
- live_range.extract_live_ranges_from_cascaded_passes(
- self.nng.get_root_subgraph(),
+ live_range.extract_live_ranges_from_schedule(
+ self.sg,
mem_area,
mem_type_set,
lr_graph,
- Tensor.AllocationQuantum,
)
# Populate time-array with memory used by live ranges
@@ -1128,12 +1127,11 @@ class Scheduler:
memories_list = [(self.arch.fast_storage_mem_area, set((MemType.Scratch, MemType.Scratch_fast)))]
lr_graph = live_range.LiveRangeGraph()
for mem_area, mem_type_set in memories_list:
- live_range.extract_live_ranges_from_cascaded_passes(
- self.nng.get_root_subgraph(),
+ live_range.extract_live_ranges_from_schedule(
+ self.sg,
mem_area,
mem_type_set,
lr_graph,
- Tensor.AllocationQuantum,
)
# Find the relation between the sched_op and the buffering tensor
@@ -1248,12 +1246,11 @@ class Scheduler:
memories_list = [(fast_storage_mem_area, set((MemType.Scratch, MemType.Scratch_fast)))]
lr_graph = live_range.LiveRangeGraph()
for mem_area, mem_type_set in memories_list:
- live_range.extract_live_ranges_from_cascaded_passes(
- self.nng.get_root_subgraph(),
+ live_range.extract_live_ranges_from_schedule(
+ self.sg,
mem_area,
mem_type_set,
lr_graph,
- Tensor.AllocationQuantum,
)
max_mem_usage = lr_graph.get_temporal_memory_usage(fast_storage_mem_area)
@@ -1452,6 +1449,33 @@ class Scheduler:
print(f"\t\t{i}: {cascade.start} -> {cascade.end}, size: {cascade.mem_usage}")
+def _update_memory_snapshot_for_all_npu_graphs(nng: Graph, arch: ArchitectureFeatures, schedulers):
+ mem_area = arch.fast_storage_mem_area
+ mem_type_set = set((MemType.Scratch, MemType.Scratch_fast))
+
+ # Collect live ranges for the full graph
+ # extract_live_ranges_from_cascaded_passes will start from the root sg and
+ # all sub graphs/cascaded passes will be visited and the correct time_index
+ # will be set for all the tensors.
+ lr_graph = live_range.LiveRangeGraph()
+ live_range.extract_live_ranges_from_cascaded_passes(
+ nng.get_root_subgraph(),
+ mem_area,
+ mem_type_set,
+ lr_graph,
+ Tensor.AllocationQuantum,
+ )
+ # Populate time-array with memory used by live ranges
+ temporal_usage = lr_graph.get_temporal_memory_usage(arch.fast_storage_mem_area)
+
+ # Update snapshot for all the npu sub graphs
+ # Not needed for the scheduler any longer but npu_performance
+ # is using this information so it must have the correct state
+ for sg in schedulers:
+ sg.schedule.memory_snapshot = temporal_usage
+ sg.schedule.fast_storage_peak_usage = max(temporal_usage, default=0)
+
+
def _update_tensor_allocation(nng: Graph, arch: ArchitectureFeatures, options):
"""
Creates live ranges and runs tensor allocator for the current schedule
@@ -1652,5 +1676,8 @@ def schedule_passes(nng: Graph, arch: ArchitectureFeatures, options, scheduler_o
if scheduler_options.verbose_schedule:
scheduler.print_schedule(sg.schedule)
+ # Make a full live range calculation starting from the root sg
+ _update_memory_snapshot_for_all_npu_graphs(nng, arch, schedulers)
+
# Evaluate schedule
_update_tensor_allocation(nng, arch, options)
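
For context, a minimal sketch of what a temporal memory usage array like the one the patch reads via get_temporal_memory_usage represents: for each time index, the sum of the sizes of all live ranges alive at that time. The function below uses assumed semantics and names (temporal_memory_usage, (start, end, size) triples), not the real LiveRangeGraph API.

from typing import List, Tuple


def temporal_memory_usage(live_ranges: List[Tuple[int, int, int]]) -> List[int]:
    """live_ranges: (start_time, end_time, size) triples; returns usage per time index."""
    if not live_ranges:
        return []
    end = max(e for _, e, _ in live_ranges)
    usage = [0] * (end + 1)
    for start, stop, size in live_ranges:
        for t in range(start, stop + 1):
            usage[t] += size
    return usage


# Peak usage, analogous to fast_storage_peak_usage in the patch above
ranges = [(0, 2, 64), (1, 3, 32), (2, 2, 16)]
snapshot = temporal_memory_usage(ranges)
print(snapshot)                   # [64, 96, 112, 32]
print(max(snapshot, default=0))   # 112
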