diff options
author | Andreas Nevalainen <andreas.nevalainen@arm.com> | 2020-11-19 11:27:50 +0100 |
---|---|---|
committer | patrik.gustavsson <patrik.gustavsson@arm.com> | 2020-11-23 11:51:08 +0000 |
commit | 27d36f003d35413beb51c1de8f33259ddeca7543 (patch) | |
tree | a386f0a183ccd2896d6630146bcac554a1828feb | |
parent | 43f8f6424cb942f27599258607ea36c9a852f5ef (diff) | |
download | ethos-u-vela-27d36f003d35413beb51c1de8f33259ddeca7543.tar.gz |
MLBEDSW-3468: Move of scale tensors to SRAM after weight compressor
After the weight compressor runs, weights have their correct sizes. Placing the
move of scale tensors after the weight compressor therefore gives a more accurate
estimate of the SRAM available for scale tensors.
Change-Id: I4571780180778ef43e943c4e98048e17d6f33580
Signed-off-by: Andreas Nevalainen <andreas.nevalainen@arm.com>
-rw-r--r-- | ethosu/vela/compiler_driver.py | 3 | ||||
-rw-r--r-- | ethosu/vela/scheduler.py | 23 |
2 files changed, 20 insertions, 6 deletions
diff --git a/ethosu/vela/compiler_driver.py b/ethosu/vela/compiler_driver.py index a2b20e47..7b1ea213 100644 --- a/ethosu/vela/compiler_driver.py +++ b/ethosu/vela/compiler_driver.py @@ -174,6 +174,9 @@ def compiler_driver(nng, arch, options, scheduler_options): # block config, and calc and pack the scales and biases weight_compressor.update_pass_weight_and_scale_tensors(nng, arch) + if not scheduler_options.keep_scale_placement: + scheduler.move_scales_to_fast_storage(nng, arch) + # LiveRanges for constant tensors for all Npu subgraphs permanent_storage = arch.permanent_storage_mem_area lr_graph_flash = live_range.LiveRangeGraph() diff --git a/ethosu/vela/scheduler.py b/ethosu/vela/scheduler.py index 889bd06b..d6e890ab 100644 --- a/ethosu/vela/scheduler.py +++ b/ethosu/vela/scheduler.py @@ -1021,10 +1021,25 @@ class DynamicProgrammingScheduler: # in use_fast_storage_for_feature_maps self.sg.scheduling_info["feature_map_rewrites"] = fast_storage_tensor_rewrites - def move_scales_to_fast_storage(self, sg, arch): + +def move_scales_to_fast_storage(nng, arch): + for sg in nng.subgraphs: # IFM streamed ops reads bias tensors several times, move these to fast storage for cp in sg.cascaded_passes: if cp.strategy == SchedulingStrategy.IfmStream: + # Calculate SRAM usage + new_size = 0 + all_tens = [] + for ps in cp.passes: + pass_tens = np.array([ps.ifm_tensor, ps.ifm2_tensor, ps.ofm_tensor, ps.weight_tensor]) + pass_tens = np.append(pass_tens, ps.intermediates) + for tens in pass_tens: + if tens and tens.mem_area == MemArea.Sram and tens not in all_tens: + all_tens.append(tens) + new_size += tens.storage_size() + + cp.sram_used = new_size + for ps in cp.passes: if ps.scale_tensor: tens = ps.scale_tensor @@ -1037,10 +1052,9 @@ class DynamicProgrammingScheduler: new_tens = tens.clone_into_fast_storage(arch) new_tens.consumer_list = tens.consumer_list.copy() new_tens.purpose = TensorPurpose.FSBias - new_tens.element_size_bytes = 10 new_tens_size = new_tens.storage_size() - 
if (cp.sram_used + new_tens_size) <= self.sram_limit: + if (cp.sram_used + new_tens_size) <= arch.sram_size: # Create DMA cmd dma_cmd = Operation(Op.DMA, tens.ops[0].name + "_dma") dma_cmd.inputs = [tens] @@ -1082,9 +1096,6 @@ def schedule_passes(nng, arch, options: SchedulerOptions): dps.apply_result(strat_set, arch) - if not options.keep_scale_placement: - dps.move_scales_to_fast_storage(sg, arch) - if options.verbose_schedule: sg.print_cascaded_passes() |