 ethosu/vela/compiler_driver.py |  3 +++
 ethosu/vela/scheduler.py       | 23 +++++++++++++++++------
 2 files changed, 20 insertions(+), 6 deletions(-)
diff --git a/ethosu/vela/compiler_driver.py b/ethosu/vela/compiler_driver.py
index a2b20e47..7b1ea213 100644
--- a/ethosu/vela/compiler_driver.py
+++ b/ethosu/vela/compiler_driver.py
@@ -174,6 +174,9 @@ def compiler_driver(nng, arch, options, scheduler_options):
     # block config, and calc and pack the scales and biases
     weight_compressor.update_pass_weight_and_scale_tensors(nng, arch)
 
+    if not scheduler_options.keep_scale_placement:
+        scheduler.move_scales_to_fast_storage(nng, arch)
+
     # LiveRanges for constant tensors for all Npu subgraphs
     permanent_storage = arch.permanent_storage_mem_area
     lr_graph_flash = live_range.LiveRangeGraph()
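
This hunk moves the relocation into the driver: it now runs once over all
subgraphs, immediately after update_pass_weight_and_scale_tensors() has packed
the scales and biases (so bias tensors have their final storage size) and
before the flash live ranges for constants are built. A paraphrased ordering
sketch; compile_subgraphs and its body are simplified stand-ins for the real
driver flow, not verbatim code:

    from ethosu.vela import live_range, scheduler, weight_compressor

    def compile_subgraphs(nng, arch, scheduler_options):
        # 1. Scheduling decides cascading/IFM streaming per subgraph
        scheduler.schedule_passes(nng, arch, scheduler_options)
        # 2. Pack scales and biases; only now are bias sizes final
        weight_compressor.update_pass_weight_and_scale_tensors(nng, arch)
        # 3. Relocate biases once, for every subgraph, unless disabled
        if not scheduler_options.keep_scale_placement:
            scheduler.move_scales_to_fast_storage(nng, arch)
        # 4. Constant live ranges are then computed with the clones in place
        lr_graph_flash = live_range.LiveRangeGraph()
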
diff --git a/ethosu/vela/scheduler.py b/ethosu/vela/scheduler.py
index 889bd06b..d6e890ab 100644
--- a/ethosu/vela/scheduler.py
+++ b/ethosu/vela/scheduler.py
@@ -1021,10 +1021,25 @@ class DynamicProgrammingScheduler:
         # in use_fast_storage_for_feature_maps
         self.sg.scheduling_info["feature_map_rewrites"] = fast_storage_tensor_rewrites
 
-    def move_scales_to_fast_storage(self, sg, arch):
+
+def move_scales_to_fast_storage(nng, arch):
+    for sg in nng.subgraphs:
         # IFM streamed ops reads bias tensors several times, move these to fast storage
         for cp in sg.cascaded_passes:
             if cp.strategy == SchedulingStrategy.IfmStream:
+                # Calculate SRAM usage
+                new_size = 0
+                all_tens = []
+                for ps in cp.passes:
+                    pass_tens = np.array([ps.ifm_tensor, ps.ifm2_tensor, ps.ofm_tensor, ps.weight_tensor])
+                    pass_tens = np.append(pass_tens, ps.intermediates)
+                    for tens in pass_tens:
+                        if tens and tens.mem_area == MemArea.Sram and tens not in all_tens:
+                            all_tens.append(tens)
+                            new_size += tens.storage_size()
+
+                cp.sram_used = new_size
+
                 for ps in cp.passes:
                     if ps.scale_tensor:
                         tens = ps.scale_tensor
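
The new block above recomputes cp.sram_used from scratch: it walks every pass
in the cascade, collects each distinct SRAM-resident tensor once (IFMs, OFMs,
weights and intermediates can be shared between passes), and sums their
storage sizes. A standalone sketch of that dedup-and-sum pattern with
simplified stand-ins (Tensor and Pass here are hypothetical, not Vela's
classes):

    from dataclasses import dataclass, field
    from typing import List, Optional

    @dataclass
    class Tensor:
        name: str
        size: int
        in_sram: bool = True

    @dataclass
    class Pass:
        tensors: List[Optional[Tensor]] = field(default_factory=list)

    def cascade_sram_usage(passes: List[Pass]) -> int:
        # Count each SRAM tensor once, even when several passes share it
        seen, total = set(), 0
        for ps in passes:
            for tens in ps.tensors:
                if tens and tens.in_sram and id(tens) not in seen:
                    seen.add(id(tens))
                    total += tens.size
        return total

    ifm = Tensor("ifm", 1024)
    mid = Tensor("mid", 2048)
    # mid is the OFM of pass 0 and the IFM of pass 1: counted once
    print(cascade_sram_usage([Pass([ifm, mid]), Pass([mid, Tensor("ofm", 512)])]))  # 3584

The diff itself uses a linear "tens not in all_tens" test, which is fine for
the handful of tensors in a cascade; the set above is the same idea with
O(1) lookup.
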
@@ -1037,10 +1052,9 @@ class DynamicProgrammingScheduler:
                         new_tens = tens.clone_into_fast_storage(arch)
                         new_tens.consumer_list = tens.consumer_list.copy()
                         new_tens.purpose = TensorPurpose.FSBias
-                        new_tens.element_size_bytes = 10
                         new_tens_size = new_tens.storage_size()
-                        if (cp.sram_used + new_tens_size) <= self.sram_limit:
+                        if (cp.sram_used + new_tens_size) <= arch.sram_size:
                             # Create DMA cmd
                             dma_cmd = Operation(Op.DMA, tens.ops[0].name + "_dma")
                             dma_cmd.inputs = [tens]
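
This hunk drops the element_size_bytes override and checks the clone against
the architecture's total SRAM (arch.sram_size) instead of the scheduler
instance's self.sram_limit, using the freshly recomputed cp.sram_used. A
minimal, self-contained sketch of the clone-and-DMA pattern; Tensor and
Operation here are simplified stand-ins, not Vela's classes:

    class Tensor:
        def __init__(self, name, size, mem_area):
            self.name, self.size, self.mem_area = name, size, mem_area
            self.ops = []            # operations producing this tensor
            self.consumer_list = []  # operations reading this tensor

    class Operation:
        def __init__(self, kind, name):
            self.kind, self.name = kind, name
            self.inputs, self.outputs = [], []

    def promote_bias(tens, cp_sram_used, sram_size):
        clone = Tensor(tens.name + "_fast", tens.size, "Sram")
        if cp_sram_used + clone.size > sram_size:
            return None  # does not fit: leave the bias where it is
        # An explicit DMA op reads the original tensor and produces the clone
        dma = Operation("DMA", tens.name + "_dma")
        dma.inputs = [tens]
        dma.outputs = [clone]
        clone.ops = [dma]
        # Consumers move to the clone, as in the diff's consumer_list.copy()
        clone.consumer_list = list(tens.consumer_list)
        return clone
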
@@ -1082,9 +1096,6 @@ def schedule_passes(nng, arch, options: SchedulerOptions):
 
         dps.apply_result(strat_set, arch)
 
-        if not options.keep_scale_placement:
-            dps.move_scales_to_fast_storage(sg, arch)
-
         if options.verbose_schedule:
             sg.print_cascaded_passes()