From 6e281afe19ea0cd9dba2cecfb73050c18f29d242 Mon Sep 17 00:00:00 2001
From: Johan Alfven
Date: Tue, 28 Feb 2023 09:03:03 +0100
Subject: MLBEDSW-7393: MLCE: Optimize compile time for large networks

- There is a problem with large networks containing many NPU subgraphs.
  Scheduling takes too long because the snapshot memory calculation
  always does a complete update for the full graph.
- A complete run is needed at the end to calculate all the time indexes
  correctly. However, when scheduling an NPU subgraph it is enough to
  extract live ranges for the current schedule and its operators.

Change-Id: Iccb7d6728119c1428ad0b45a2ac34e92158c15bd
Signed-off-by: Johan Alfven
---
 ethosu/vela/live_range.py |  4 ++--
 ethosu/vela/scheduler.py  | 45 ++++++++++++++++++++++++++++++++++++---------
 2 files changed, 38 insertions(+), 11 deletions(-)

diff --git a/ethosu/vela/live_range.py b/ethosu/vela/live_range.py
index 6a2a04ac..05e481e0 100644
--- a/ethosu/vela/live_range.py
+++ b/ethosu/vela/live_range.py
@@ -251,7 +251,7 @@ def extract_live_ranges_from_cascaded_passes(
                 # If the primary-op is an NpuOp that means this is where an Npu subgraph
                 # is called. Go into said subgraph and extract live ranges before continuing.
                 # Use default allocation alignment of 16 for Npu tensors
-                lr_graph = _extract_live_ranges_from_schedule(
+                lr_graph = extract_live_ranges_from_schedule(
                     op_subgraph, target_mem_area, target_mem_type_set, lr_graph
                 )
             else:
@@ -316,7 +316,7 @@ def create_linear_live_range_graph(sg, target_mem_area, target_mem_type_set, lr_
     return lr_graph


-def _extract_live_ranges_from_schedule(sg, target_mem_area, target_mem_type_set, lr_graph):
+def extract_live_ranges_from_schedule(sg, target_mem_area, target_mem_type_set, lr_graph):
     time_for_cascade = {}
     for sched_op in sg.sched_ops:
         op_info = sg.schedule.cost_map[sched_op]
diff --git a/ethosu/vela/scheduler.py b/ethosu/vela/scheduler.py
index eeed44fd..a50f262e 100644
--- a/ethosu/vela/scheduler.py
+++ b/ethosu/vela/scheduler.py
@@ -537,12 +537,11 @@ class Scheduler:
         # Collect live ranges from tensors
         lr_graph = live_range.LiveRangeGraph()
         for mem_area, mem_type_set in memories_list:
-            live_range.extract_live_ranges_from_cascaded_passes(
-                self.nng.get_root_subgraph(),
+            live_range.extract_live_ranges_from_schedule(
+                self.sg,
                 mem_area,
                 mem_type_set,
                 lr_graph,
-                Tensor.AllocationQuantum,
             )

         # Populate time-array with memory used by live ranges
@@ -1128,12 +1127,11 @@ class Scheduler:
         memories_list = [(self.arch.fast_storage_mem_area, set((MemType.Scratch, MemType.Scratch_fast)))]
         lr_graph = live_range.LiveRangeGraph()
         for mem_area, mem_type_set in memories_list:
-            live_range.extract_live_ranges_from_cascaded_passes(
-                self.nng.get_root_subgraph(),
+            live_range.extract_live_ranges_from_schedule(
+                self.sg,
                 mem_area,
                 mem_type_set,
                 lr_graph,
-                Tensor.AllocationQuantum,
             )

         # Find the relation between the sched_op and the buffering tensor
@@ -1248,12 +1246,11 @@ class Scheduler:
         memories_list = [(fast_storage_mem_area, set((MemType.Scratch, MemType.Scratch_fast)))]
         lr_graph = live_range.LiveRangeGraph()
         for mem_area, mem_type_set in memories_list:
-            live_range.extract_live_ranges_from_cascaded_passes(
-                self.nng.get_root_subgraph(),
+            live_range.extract_live_ranges_from_schedule(
+                self.sg,
                 mem_area,
                 mem_type_set,
                 lr_graph,
-                Tensor.AllocationQuantum,
             )

         max_mem_usage = lr_graph.get_temporal_memory_usage(fast_storage_mem_area)
@@ -1452,6 +1449,33 @@ class Scheduler:
             print(f"\t\t{i}: {cascade.start} -> {cascade.end}, size: {cascade.mem_usage}")


+def _update_memory_snapshot_for_all_npu_graphs(nng: Graph, arch: ArchitectureFeatures, schedulers):
+    mem_area = arch.fast_storage_mem_area
+    mem_type_set = set((MemType.Scratch, MemType.Scratch_fast))
+
+    # Collect live ranges for the full graph
+    # extract_live_ranges_from_cascaded_passes will start from the root sg and
+    # all sub graphs/cascaded passes will be visited and the correct time_index
+    # will be set for all the tensors.
+    lr_graph = live_range.LiveRangeGraph()
+    live_range.extract_live_ranges_from_cascaded_passes(
+        nng.get_root_subgraph(),
+        mem_area,
+        mem_type_set,
+        lr_graph,
+        Tensor.AllocationQuantum,
+    )
+    # Populate time-array with memory used by live ranges
+    temporal_usage = lr_graph.get_temporal_memory_usage(arch.fast_storage_mem_area)
+
+    # Update snapshot for all the npu sub graphs
+    # Not needed for the scheduler any longer but npu_performance
+    # is using this information so it must have the correct state
+    for sg in schedulers:
+        sg.schedule.memory_snapshot = temporal_usage
+        sg.schedule.fast_storage_peak_usage = max(temporal_usage, default=0)
+
+
 def _update_tensor_allocation(nng: Graph, arch: ArchitectureFeatures, options):
     """
     Creates live ranges and runs tensor allocator for the current schedule
@@ -1652,5 +1676,8 @@ def schedule_passes(nng: Graph, arch: ArchitectureFeatures, options, scheduler_o
             if scheduler_options.verbose_schedule:
                 scheduler.print_schedule(sg.schedule)

+    # Make a full live range calculation starting from the root sg
+    _update_memory_snapshot_for_all_npu_graphs(nng, arch, schedulers)
+
     # Evaluate schedule
     _update_tensor_allocation(nng, arch, options)
--
cgit v1.2.1
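
The core idea of the change, as stated in the commit message, is that scheduling an individual NPU subgraph only needs live ranges from that subgraph, while a single full-graph pass at the end keeps the time indexes and memory snapshot globally consistent for npu_performance. The standalone sketch below illustrates that pattern only; ToySubgraph, extract_subgraph_ranges, extract_full_graph_ranges and schedule_all are hypothetical illustration names, not the vela API.

# Illustrative sketch only: per-subgraph live-range extraction during
# scheduling, plus one full-graph pass at the end. Names are hypothetical
# stand-ins, not the vela implementation.
from collections import defaultdict
from typing import Dict, List, Tuple


class ToySubgraph:
    """Stand-in for an NPU subgraph: each op is (start_time, end_time, mem)."""

    def __init__(self, name: str, ops: List[Tuple[int, int, int]]):
        self.name = name
        self.ops = ops


def extract_subgraph_ranges(sg: ToySubgraph) -> Dict[int, int]:
    # Cheap path: cost is proportional to this subgraph only.
    usage: Dict[int, int] = defaultdict(int)
    for start, end, mem in sg.ops:
        for t in range(start, end + 1):
            usage[t] += mem
    return usage


def extract_full_graph_ranges(subgraphs: List[ToySubgraph]) -> Dict[int, int]:
    # Expensive path: visits every subgraph so all time indexes line up globally.
    usage: Dict[int, int] = defaultdict(int)
    for sg in subgraphs:
        for start, end, mem in sg.ops:
            for t in range(start, end + 1):
                usage[t] += mem
    return usage


def schedule_all(subgraphs: List[ToySubgraph]) -> None:
    # While scheduling each subgraph, only its own live ranges are needed,
    # which is the cheap path the patch switches the scheduler to.
    for sg in subgraphs:
        local_usage = extract_subgraph_ranges(sg)
        print(f"{sg.name}: local peak usage {max(local_usage.values(), default=0)}")

    # One full-graph pass at the end restores the globally consistent
    # snapshot that later stages (e.g. performance estimation) still rely on.
    snapshot = extract_full_graph_ranges(subgraphs)
    print("global peak usage", max(snapshot.values(), default=0))


if __name__ == "__main__":
    graphs = [
        ToySubgraph("npu_sg0", [(0, 2, 64), (1, 3, 32)]),
        ToySubgraph("npu_sg1", [(4, 5, 128)]),
    ]
    schedule_all(graphs)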