diff options
Diffstat (limited to 'ethosu/vela/scheduler.py')
-rw-r--r-- | ethosu/vela/scheduler.py | 75 |
1 files changed, 45 insertions, 30 deletions
diff --git a/ethosu/vela/scheduler.py b/ethosu/vela/scheduler.py index cbd7ce44..6e2cd4a6 100644 --- a/ethosu/vela/scheduler.py +++ b/ethosu/vela/scheduler.py @@ -31,6 +31,8 @@ from typing import Optional from typing import Tuple from typing import TYPE_CHECKING +from .utils import progress_print + # Import needed for Type annotations. Only import for Type checking to avoid run-time errors due to cyclic import. if TYPE_CHECKING: from .npu_performance import CycleCost @@ -148,10 +150,12 @@ class SchedulerOptions: optimization_strategy, sram_target, verbose_schedule, + verbose_progress=False, ): self.optimization_strategy = optimization_strategy self.optimization_sram_limit = sram_target self.verbose_schedule = verbose_schedule + self.verbose_progress = verbose_progress def __str__(self) -> str: return f"{type(self).__name__}: {str(self.__dict__)}" @@ -531,7 +535,9 @@ class Scheduler: def create_initial_schedule(self) -> Schedule: """Creates an initial schedule with no cascading or buffering of any kind""" schedule = Schedule(self.sg, "MAX") - for op in self.sched_ops: + verbose_progress = self.scheduler_options.verbose_progress + for index, op in enumerate(self.sched_ops): + progress_print(verbose_progress, "Processing SchedulerOp", index, self.sched_ops) cost = op.create_scheduler_info(self.nng, op.ofm.shape) cost.cycles = self.estimate_op_performance(op, cost.block_config, op.ofm.shape.depth) schedule.cost_map[op] = cost @@ -540,16 +546,12 @@ class Scheduler: def update_op_memory_snapshot(self, schedule: Schedule): memories_list = [(self.arch.fast_storage_mem_area, set((MemType.Scratch, MemType.Scratch_fast)))] - + verbose_progress = self.scheduler_options.verbose_progress + progress_print(verbose_progress, "") # Collect live ranges from tensors lr_graph = live_range.LiveRangeGraph() for mem_area, mem_type_set in memories_list: - live_range.extract_live_ranges_from_schedule( - self.sg, - mem_area, - mem_type_set, - lr_graph, - ) + live_range.extract_live_ranges_from_schedule(self.sg, mem_area, mem_type_set, lr_graph, verbose_progress) # Populate time-array with memory used by live ranges temporal_usage = lr_graph.get_temporal_memory_usage(self.arch.fast_storage_mem_area) @@ -607,9 +609,10 @@ class Scheduler: def propose_schedule_buffering(self, ref_schedule: Schedule, staging_limit_bytes): """Create a buffered schedule""" buffered_schedule = Schedule(self.sg, f"{ref_schedule.label}_BUFFERED") - + verbose_progress = self.scheduler_options.verbose_progress prev_op = None - for sched_op in self.sched_ops: + for index, sched_op in enumerate(self.sched_ops): + progress_print(verbose_progress, "Processing SchedulerOp", index, self.sched_ops) if sched_op not in ref_schedule.cost_map: # sched_op is not part of this sub-schedule - skip continue @@ -871,10 +874,11 @@ class Scheduler: next operators stride""" min_schedule = Schedule(self.sg, "MIN") cost_map = min_schedule.cost_map - + verbose_progress = self.scheduler_options.verbose_progress # Keep track of the previous Op - which consumes the current Op's OFM prev_op: Optional[SchedulerOperation] = None - for sched_op in reversed(self.sched_ops): + for index, sched_op in enumerate(reversed(self.sched_ops)): + progress_print(verbose_progress, "Processing SchedulerOp", index, self.sched_ops) min_stripe_height = prev_op.kernel.stride.y if prev_op else 1 min_stripe = sched_op.ofm.shape.with_height(min_stripe_height) @@ -968,13 +972,15 @@ class Scheduler: return peak_mem_usage def build_cascades_for_min_schedule(self, min_schedule: Schedule, max_template: Schedule, memory_limit: int): + verbose_progress = self.scheduler_options.verbose_progress # Update memory snapshot self.sg.schedule = min_schedule self.update_op_memory_snapshot(min_schedule) # Calculate residual memory for Min schedule non_local_mem_usage = {} - for sched_op in self.sched_ops: + for index, sched_op in enumerate(self.sched_ops): + progress_print(verbose_progress, "Processing SchedulerOp", index, self.sched_ops) time_index = min_schedule.cost_map[sched_op].time_index if self.arch.is_spilling_enabled(): @@ -1089,13 +1095,16 @@ class Scheduler: options: SchedulerOptions, ) -> Schedule: """Extracts sub-schedules based on the cascades and optimizes them and applies them to the final schedule""" + verbose_progress = options.verbose_progress sram_limit = options.optimization_sram_limit if max_sched.fast_storage_peak_usage < sram_limit and not self.arch.is_spilling_enabled(): # Maximum performance schedule fits within the SRAM target return max_sched # Iterate over a copy of the cascades since they may change during the loop - for cascade_info in list(schedule.cascades.values()): + cascades = list(schedule.cascades.values()) + for index, cascade_info in enumerate(cascades): + progress_print(verbose_progress, "Processing cascade", index, cascades) # Optimize the sub-schedule in this cascade opt_sub_schedule = self.optimize_sub_schedule(cascade_info, schedule, max_template, sram_limit) if opt_sub_schedule: @@ -1119,6 +1128,7 @@ class Scheduler: min_schedule: Schedule, options: SchedulerOptions, ): + verbose_progress = options.verbose_progress default_schedule = self.sg.schedule npu_performance.calc_new_performance_for_network(self.nng, self.arch, None, False) default_tot_cycles = self.nng.cycles[npu_performance.PassCycles.Total] @@ -1135,12 +1145,7 @@ class Scheduler: memories_list = [(self.arch.fast_storage_mem_area, set((MemType.Scratch, MemType.Scratch_fast)))] lr_graph = live_range.LiveRangeGraph() for mem_area, mem_type_set in memories_list: - live_range.extract_live_ranges_from_schedule( - self.sg, - mem_area, - mem_type_set, - lr_graph, - ) + live_range.extract_live_ranges_from_schedule(self.sg, mem_area, mem_type_set, lr_graph, verbose_progress) # Find the relation between the sched_op and the buffering tensor weight_ops = {} @@ -1416,7 +1421,9 @@ class Scheduler: print(f"\t\t{i}: {cascade.start} -> {cascade.end}, size: {cascade.mem_usage}") -def _update_memory_snapshot_for_all_npu_graphs(nng: Graph, arch: ArchitectureFeatures, schedulers): +def _update_memory_snapshot_for_all_npu_graphs( + nng: Graph, arch: ArchitectureFeatures, schedulers, verbose_progress: bool = False +): mem_area = arch.fast_storage_mem_area mem_type_set = set((MemType.Scratch, MemType.Scratch_fast)) @@ -1426,11 +1433,7 @@ def _update_memory_snapshot_for_all_npu_graphs(nng: Graph, arch: ArchitectureFea # will be set for all the tensors. lr_graph = live_range.LiveRangeGraph() live_range.extract_live_ranges_from_cascaded_passes( - nng.get_root_subgraph(), - mem_area, - mem_type_set, - lr_graph, - Tensor.AllocationQuantum, + nng.get_root_subgraph(), mem_area, mem_type_set, lr_graph, Tensor.AllocationQuantum, verbose_progress ) # Populate time-array with memory used by live ranges temporal_usage = lr_graph.get_temporal_memory_usage(arch.fast_storage_mem_area) @@ -1471,6 +1474,7 @@ def _update_tensor_allocation(nng: Graph, arch: ArchitectureFeatures, options): mem_type_set, tensor_allocator=options.tensor_allocator, verbose_allocation=options.verbose_allocation, + verbose_progress=options.verbose_progress, cpu_tensor_alignment=options.cpu_tensor_alignment, hillclimb_max_iterations=options.hillclimb_max_iterations, ) @@ -1570,14 +1574,17 @@ class FastStorageComponentAllocator: def schedule_passes(nng: Graph, arch: ArchitectureFeatures, options, scheduler_options: SchedulerOptions): """Entry point for the Scheduler""" + verbose_progress = scheduler_options.verbose_progress # Initialize CPU subgraphs schedulers = dict() # Initialize schedulers with max schedule. Only schedule NPU subgraphs - for sg in nng.subgraphs: + for sg_idx, sg in enumerate(nng.subgraphs): + progress_print(verbose_progress, "Processing subgraph", sg_idx, nng.subgraphs) if sg.placement != PassPlacement.Npu: # Create cascaded passes for CPU Ops cascaded_passes = [] - for idx, ps in enumerate(sg.passes): + for pass_idx, ps in enumerate(sg.passes): + progress_print(verbose_progress, "Creating cascaded passes for CPU op", pass_idx, sg.passes) cps = CascadedPass( ps.name, SchedulingStrategy.WeightStream, @@ -1589,7 +1596,7 @@ def schedule_passes(nng: Graph, arch: ArchitectureFeatures, options, scheduler_o False, ) - cps.time = idx + cps.time = pass_idx ps.cascade = cps cascaded_passes.append(cps) @@ -1599,6 +1606,7 @@ def schedule_passes(nng: Graph, arch: ArchitectureFeatures, options, scheduler_o scheduler = Scheduler(nng, sg, arch, scheduler_options) schedulers[sg] = scheduler + progress_print(verbose_progress, "Creating scheduler representation") scheduler.create_scheduler_representation(arch) sg.sched_ops = scheduler.sched_ops @@ -1606,6 +1614,7 @@ def schedule_passes(nng: Graph, arch: ArchitectureFeatures, options, scheduler_o max_schedule_template = scheduler.create_initial_schedule() scheduler.max_schedule = max_schedule_template + progress_print(verbose_progress, "Creating optimised max schedule") # Create the optimimised Max schedule sg.schedule = max_schedule_template scheduler.update_op_memory_snapshot(max_schedule_template) @@ -1613,6 +1622,7 @@ def schedule_passes(nng: Graph, arch: ArchitectureFeatures, options, scheduler_o sg.schedule = opt_max_schedule scheduler.update_op_memory_snapshot(opt_max_schedule) + progress_print(verbose_progress, "Creating minimal schedule") # Create Min schedule min_schedule = scheduler.propose_minimal_schedule() initial_sram_limit = scheduler_options.optimization_sram_limit @@ -1620,11 +1630,13 @@ def schedule_passes(nng: Graph, arch: ArchitectureFeatures, options, scheduler_o initial_sram_limit = scheduler.min_memory_req # Build cascades for Min schedule + progress_print(verbose_progress, "Building cascades for minimal schedule") scheduler.build_cascades_for_min_schedule(min_schedule, max_schedule_template, initial_sram_limit) sg.schedule = min_schedule scheduler.update_op_memory_snapshot(min_schedule) if scheduler_options.optimization_strategy == OptimizationStrategy.Performance: + progress_print(verbose_progress, "Creating schedule optimized for performance") # Create an optimized schedule sg.schedule = scheduler.optimize_schedule( min_schedule, opt_max_schedule, max_schedule_template, scheduler_options @@ -1635,6 +1647,7 @@ def schedule_passes(nng: Graph, arch: ArchitectureFeatures, options, scheduler_o scheduler.use_fast_storage_for_feature_maps(sg.schedule, scheduler_options.optimization_sram_limit) if scheduler_options.optimization_strategy == OptimizationStrategy.Performance and scheduler.evicted_fms: + progress_print(verbose_progress, "Optimizing weight buffering size") # It might be possible to gain performance by reducing # weight buffer size and instead fit fms in fast storage scheduler.optimize_weight_buffering_size(min_schedule, scheduler_options) @@ -1642,8 +1655,10 @@ def schedule_passes(nng: Graph, arch: ArchitectureFeatures, options, scheduler_o if scheduler_options.verbose_schedule: scheduler.print_schedule(sg.schedule) + progress_print(verbose_progress, "Update memory snapshot for all NPU graphs") # Make a full live range calculation starting from the root sg - _update_memory_snapshot_for_all_npu_graphs(nng, arch, schedulers) + _update_memory_snapshot_for_all_npu_graphs(nng, arch, schedulers, verbose_progress) + progress_print(verbose_progress, "Update tensor allocation") # Evaluate schedule _update_tensor_allocation(nng, arch, options) |