aboutsummaryrefslogtreecommitdiff
path: root/ethosu/vela/scheduler.py
diff options
context:
space:
mode:
Diffstat (limited to 'ethosu/vela/scheduler.py')
-rw-r--r--ethosu/vela/scheduler.py75
1 files changed, 45 insertions, 30 deletions
diff --git a/ethosu/vela/scheduler.py b/ethosu/vela/scheduler.py
index cbd7ce44..6e2cd4a6 100644
--- a/ethosu/vela/scheduler.py
+++ b/ethosu/vela/scheduler.py
@@ -31,6 +31,8 @@ from typing import Optional
from typing import Tuple
from typing import TYPE_CHECKING
+from .utils import progress_print
+
# Import needed for Type annotations. Only import for Type checking to avoid run-time errors due to cyclic import.
if TYPE_CHECKING:
from .npu_performance import CycleCost
@@ -148,10 +150,12 @@ class SchedulerOptions:
optimization_strategy,
sram_target,
verbose_schedule,
+ verbose_progress=False,
):
self.optimization_strategy = optimization_strategy
self.optimization_sram_limit = sram_target
self.verbose_schedule = verbose_schedule
+ self.verbose_progress = verbose_progress
def __str__(self) -> str:
return f"{type(self).__name__}: {str(self.__dict__)}"
@@ -531,7 +535,9 @@ class Scheduler:
def create_initial_schedule(self) -> Schedule:
"""Creates an initial schedule with no cascading or buffering of any kind"""
schedule = Schedule(self.sg, "MAX")
- for op in self.sched_ops:
+ verbose_progress = self.scheduler_options.verbose_progress
+ for index, op in enumerate(self.sched_ops):
+ progress_print(verbose_progress, "Processing SchedulerOp", index, self.sched_ops)
cost = op.create_scheduler_info(self.nng, op.ofm.shape)
cost.cycles = self.estimate_op_performance(op, cost.block_config, op.ofm.shape.depth)
schedule.cost_map[op] = cost
@@ -540,16 +546,12 @@ class Scheduler:
def update_op_memory_snapshot(self, schedule: Schedule):
memories_list = [(self.arch.fast_storage_mem_area, set((MemType.Scratch, MemType.Scratch_fast)))]
-
+ verbose_progress = self.scheduler_options.verbose_progress
+ progress_print(verbose_progress, "")
# Collect live ranges from tensors
lr_graph = live_range.LiveRangeGraph()
for mem_area, mem_type_set in memories_list:
- live_range.extract_live_ranges_from_schedule(
- self.sg,
- mem_area,
- mem_type_set,
- lr_graph,
- )
+ live_range.extract_live_ranges_from_schedule(self.sg, mem_area, mem_type_set, lr_graph, verbose_progress)
# Populate time-array with memory used by live ranges
temporal_usage = lr_graph.get_temporal_memory_usage(self.arch.fast_storage_mem_area)
@@ -607,9 +609,10 @@ class Scheduler:
def propose_schedule_buffering(self, ref_schedule: Schedule, staging_limit_bytes):
"""Create a buffered schedule"""
buffered_schedule = Schedule(self.sg, f"{ref_schedule.label}_BUFFERED")
-
+ verbose_progress = self.scheduler_options.verbose_progress
prev_op = None
- for sched_op in self.sched_ops:
+ for index, sched_op in enumerate(self.sched_ops):
+ progress_print(verbose_progress, "Processing SchedulerOp", index, self.sched_ops)
if sched_op not in ref_schedule.cost_map:
# sched_op is not part of this sub-schedule - skip
continue
@@ -871,10 +874,11 @@ class Scheduler:
next operators stride"""
min_schedule = Schedule(self.sg, "MIN")
cost_map = min_schedule.cost_map
-
+ verbose_progress = self.scheduler_options.verbose_progress
# Keep track of the previous Op - which consumes the current Op's OFM
prev_op: Optional[SchedulerOperation] = None
- for sched_op in reversed(self.sched_ops):
+ for index, sched_op in enumerate(reversed(self.sched_ops)):
+ progress_print(verbose_progress, "Processing SchedulerOp", index, self.sched_ops)
min_stripe_height = prev_op.kernel.stride.y if prev_op else 1
min_stripe = sched_op.ofm.shape.with_height(min_stripe_height)
@@ -968,13 +972,15 @@ class Scheduler:
return peak_mem_usage
def build_cascades_for_min_schedule(self, min_schedule: Schedule, max_template: Schedule, memory_limit: int):
+ verbose_progress = self.scheduler_options.verbose_progress
# Update memory snapshot
self.sg.schedule = min_schedule
self.update_op_memory_snapshot(min_schedule)
# Calculate residual memory for Min schedule
non_local_mem_usage = {}
- for sched_op in self.sched_ops:
+ for index, sched_op in enumerate(self.sched_ops):
+ progress_print(verbose_progress, "Processing SchedulerOp", index, self.sched_ops)
time_index = min_schedule.cost_map[sched_op].time_index
if self.arch.is_spilling_enabled():
@@ -1089,13 +1095,16 @@ class Scheduler:
options: SchedulerOptions,
) -> Schedule:
"""Extracts sub-schedules based on the cascades and optimizes them and applies them to the final schedule"""
+ verbose_progress = options.verbose_progress
sram_limit = options.optimization_sram_limit
if max_sched.fast_storage_peak_usage < sram_limit and not self.arch.is_spilling_enabled():
# Maximum performance schedule fits within the SRAM target
return max_sched
# Iterate over a copy of the cascades since they may change during the loop
- for cascade_info in list(schedule.cascades.values()):
+ cascades = list(schedule.cascades.values())
+ for index, cascade_info in enumerate(cascades):
+ progress_print(verbose_progress, "Processing cascade", index, cascades)
# Optimize the sub-schedule in this cascade
opt_sub_schedule = self.optimize_sub_schedule(cascade_info, schedule, max_template, sram_limit)
if opt_sub_schedule:
@@ -1119,6 +1128,7 @@ class Scheduler:
min_schedule: Schedule,
options: SchedulerOptions,
):
+ verbose_progress = options.verbose_progress
default_schedule = self.sg.schedule
npu_performance.calc_new_performance_for_network(self.nng, self.arch, None, False)
default_tot_cycles = self.nng.cycles[npu_performance.PassCycles.Total]
@@ -1135,12 +1145,7 @@ class Scheduler:
memories_list = [(self.arch.fast_storage_mem_area, set((MemType.Scratch, MemType.Scratch_fast)))]
lr_graph = live_range.LiveRangeGraph()
for mem_area, mem_type_set in memories_list:
- live_range.extract_live_ranges_from_schedule(
- self.sg,
- mem_area,
- mem_type_set,
- lr_graph,
- )
+ live_range.extract_live_ranges_from_schedule(self.sg, mem_area, mem_type_set, lr_graph, verbose_progress)
# Find the relation between the sched_op and the buffering tensor
weight_ops = {}
@@ -1416,7 +1421,9 @@ class Scheduler:
print(f"\t\t{i}: {cascade.start} -> {cascade.end}, size: {cascade.mem_usage}")
-def _update_memory_snapshot_for_all_npu_graphs(nng: Graph, arch: ArchitectureFeatures, schedulers):
+def _update_memory_snapshot_for_all_npu_graphs(
+ nng: Graph, arch: ArchitectureFeatures, schedulers, verbose_progress: bool = False
+):
mem_area = arch.fast_storage_mem_area
mem_type_set = set((MemType.Scratch, MemType.Scratch_fast))
@@ -1426,11 +1433,7 @@ def _update_memory_snapshot_for_all_npu_graphs(nng: Graph, arch: ArchitectureFea
# will be set for all the tensors.
lr_graph = live_range.LiveRangeGraph()
live_range.extract_live_ranges_from_cascaded_passes(
- nng.get_root_subgraph(),
- mem_area,
- mem_type_set,
- lr_graph,
- Tensor.AllocationQuantum,
+ nng.get_root_subgraph(), mem_area, mem_type_set, lr_graph, Tensor.AllocationQuantum, verbose_progress
)
# Populate time-array with memory used by live ranges
temporal_usage = lr_graph.get_temporal_memory_usage(arch.fast_storage_mem_area)
@@ -1471,6 +1474,7 @@ def _update_tensor_allocation(nng: Graph, arch: ArchitectureFeatures, options):
mem_type_set,
tensor_allocator=options.tensor_allocator,
verbose_allocation=options.verbose_allocation,
+ verbose_progress=options.verbose_progress,
cpu_tensor_alignment=options.cpu_tensor_alignment,
hillclimb_max_iterations=options.hillclimb_max_iterations,
)
@@ -1570,14 +1574,17 @@ class FastStorageComponentAllocator:
def schedule_passes(nng: Graph, arch: ArchitectureFeatures, options, scheduler_options: SchedulerOptions):
"""Entry point for the Scheduler"""
+ verbose_progress = scheduler_options.verbose_progress
# Initialize CPU subgraphs
schedulers = dict()
# Initialize schedulers with max schedule. Only schedule NPU subgraphs
- for sg in nng.subgraphs:
+ for sg_idx, sg in enumerate(nng.subgraphs):
+ progress_print(verbose_progress, "Processing subgraph", sg_idx, nng.subgraphs)
if sg.placement != PassPlacement.Npu:
# Create cascaded passes for CPU Ops
cascaded_passes = []
- for idx, ps in enumerate(sg.passes):
+ for pass_idx, ps in enumerate(sg.passes):
+ progress_print(verbose_progress, "Creating cascaded passes for CPU op", pass_idx, sg.passes)
cps = CascadedPass(
ps.name,
SchedulingStrategy.WeightStream,
@@ -1589,7 +1596,7 @@ def schedule_passes(nng: Graph, arch: ArchitectureFeatures, options, scheduler_o
False,
)
- cps.time = idx
+ cps.time = pass_idx
ps.cascade = cps
cascaded_passes.append(cps)
@@ -1599,6 +1606,7 @@ def schedule_passes(nng: Graph, arch: ArchitectureFeatures, options, scheduler_o
scheduler = Scheduler(nng, sg, arch, scheduler_options)
schedulers[sg] = scheduler
+ progress_print(verbose_progress, "Creating scheduler representation")
scheduler.create_scheduler_representation(arch)
sg.sched_ops = scheduler.sched_ops
@@ -1606,6 +1614,7 @@ def schedule_passes(nng: Graph, arch: ArchitectureFeatures, options, scheduler_o
max_schedule_template = scheduler.create_initial_schedule()
scheduler.max_schedule = max_schedule_template
+ progress_print(verbose_progress, "Creating optimised max schedule")
# Create the optimimised Max schedule
sg.schedule = max_schedule_template
scheduler.update_op_memory_snapshot(max_schedule_template)
@@ -1613,6 +1622,7 @@ def schedule_passes(nng: Graph, arch: ArchitectureFeatures, options, scheduler_o
sg.schedule = opt_max_schedule
scheduler.update_op_memory_snapshot(opt_max_schedule)
+ progress_print(verbose_progress, "Creating minimal schedule")
# Create Min schedule
min_schedule = scheduler.propose_minimal_schedule()
initial_sram_limit = scheduler_options.optimization_sram_limit
@@ -1620,11 +1630,13 @@ def schedule_passes(nng: Graph, arch: ArchitectureFeatures, options, scheduler_o
initial_sram_limit = scheduler.min_memory_req
# Build cascades for Min schedule
+ progress_print(verbose_progress, "Building cascades for minimal schedule")
scheduler.build_cascades_for_min_schedule(min_schedule, max_schedule_template, initial_sram_limit)
sg.schedule = min_schedule
scheduler.update_op_memory_snapshot(min_schedule)
if scheduler_options.optimization_strategy == OptimizationStrategy.Performance:
+ progress_print(verbose_progress, "Creating schedule optimized for performance")
# Create an optimized schedule
sg.schedule = scheduler.optimize_schedule(
min_schedule, opt_max_schedule, max_schedule_template, scheduler_options
@@ -1635,6 +1647,7 @@ def schedule_passes(nng: Graph, arch: ArchitectureFeatures, options, scheduler_o
scheduler.use_fast_storage_for_feature_maps(sg.schedule, scheduler_options.optimization_sram_limit)
if scheduler_options.optimization_strategy == OptimizationStrategy.Performance and scheduler.evicted_fms:
+ progress_print(verbose_progress, "Optimizing weight buffering size")
# It might be possible to gain performance by reducing
# weight buffer size and instead fit fms in fast storage
scheduler.optimize_weight_buffering_size(min_schedule, scheduler_options)
@@ -1642,8 +1655,10 @@ def schedule_passes(nng: Graph, arch: ArchitectureFeatures, options, scheduler_o
if scheduler_options.verbose_schedule:
scheduler.print_schedule(sg.schedule)
+ progress_print(verbose_progress, "Update memory snapshot for all NPU graphs")
# Make a full live range calculation starting from the root sg
- _update_memory_snapshot_for_all_npu_graphs(nng, arch, schedulers)
+ _update_memory_snapshot_for_all_npu_graphs(nng, arch, schedulers, verbose_progress)
+ progress_print(verbose_progress, "Update tensor allocation")
# Evaluate schedule
_update_tensor_allocation(nng, arch, options)