diff options
author | Tim Hall <tim.hall@arm.com> | 2021-05-27 18:49:40 +0100 |
---|---|---|
committer | Tim Hall <tim.hall@arm.com> | 2021-05-27 18:57:39 +0100 |
commit | d8339a75c9b655c0507e34238078fdad068b4023 (patch) | |
tree | 36a14726b30760169a83c0356803b480992fade8 /ethosu/vela/npu_serialisation.py | |
parent | 64556f32ff7bfca6036a6598034464b13b64a4ef (diff) | |
download | ethos-u-vela-d8339a75c9b655c0507e34238078fdad068b4023.tar.gz |
MLBEDSW-4034: New Scheduler Size or Performance Optimisation
- Merged dev/scheduler at 83639f90e8c828f70de6e29142355a940224959b
Signed-off-by: Tim Hall <tim.hall@arm.com>
Change-Id: I0050529d4b42da93768c7264296434dd877fb5b4
Diffstat (limited to 'ethosu/vela/npu_serialisation.py')
-rw-r--r-- | ethosu/vela/npu_serialisation.py | 46 |
1 file changed, 17 insertions, 29 deletions
diff --git a/ethosu/vela/npu_serialisation.py b/ethosu/vela/npu_serialisation.py index ad4d29ca..39a7f21f 100644 --- a/ethosu/vela/npu_serialisation.py +++ b/ethosu/vela/npu_serialisation.py @@ -42,10 +42,8 @@ def make_memory_tensor(name, mem_area, mem_type, sz, want_values, arch): def copy_compressed_values_to_memory_tensor(memory_tensor, src_tensor): start_addr = src_tensor.address - for compressed_values in src_tensor.compressed_values: - end_addr = start_addr + len(compressed_values) - memory_tensor.values[start_addr:end_addr] = compressed_values - start_addr = end_addr + end_addr = src_tensor.address + src_tensor.storage_size() + memory_tensor.values[start_addr:end_addr] = src_tensor.buffer.copy() def copy_ifm_values_to_memory_tensor(memory_tensor, src_tensor): @@ -94,31 +92,21 @@ def serialise_npu_subgraph_into_tensors(sg, arch, scratch_tens, scratch_fast_ten sg.scratch_fast_tensor = scratch_fast_tens sg.scratch_fast_tensor.shape[0] = 0 - for cps in sg.cascaded_passes: - for ps in cps.passes: - if ps.placement == PassPlacement.Npu: - if ps.weight_tensor is not None: - # For DMA ops, ps.weight_tensor is referring to the SRAM weight tensor and therefore the address - # is pointing at the destination address of where the weights should be placed in SRAM. - # This ensures that the Flash weight tensor is used instead and thus gets the correct address. 
- if ps.weight_tensor.ops[0].type == Op.DMA: - copy_compressed_values_to_memory_tensor(sg.flash_tensor, ps.weight_tensor.ops[0].inputs[0]) - else: - copy_compressed_values_to_memory_tensor(sg.flash_tensor, ps.weight_tensor) - - if ps.scale_tensor.ops[0].type == Op.DMA: - copy_compressed_values_to_memory_tensor(sg.flash_tensor, ps.scale_tensor.ops[0].inputs[0]) - else: - copy_compressed_values_to_memory_tensor(sg.flash_tensor, ps.scale_tensor) - - if ps.lut_tensor is not None: - copy_ifm_values_to_memory_tensor(sg.flash_tensor, ps.lut_tensor) - if ps.ifm_tensor is not None and ps.ifm_tensor.mem_type not in (MemType.Scratch, MemType.Scratch_fast): - copy_ifm_values_to_memory_tensor(sg.flash_tensor, ps.ifm_tensor) - if ps.ifm2_tensor is not None and ( - ps.ifm2_tensor.mem_type not in (MemType.Scratch, MemType.Scratch_fast) - ): - copy_ifm_values_to_memory_tensor(sg.flash_tensor, ps.ifm2_tensor) + for sched_op in sg.sched_ops: + ifm_tensor, ifm2_tensor, _, _, _ = sched_op.parent_op.get_ifm_ifm2_weights_biases_ofm() + + op_info = sg.schedule.cost_map[sched_op] + if op_info.npu_weights_tensor: + copy_compressed_values_to_memory_tensor(sg.flash_tensor, op_info.npu_weights_tensor) + + if ifm_tensor and ifm_tensor.mem_type not in (MemType.Scratch, MemType.Scratch_fast): + copy_ifm_values_to_memory_tensor(sg.flash_tensor, ifm_tensor) + if ifm2_tensor and (ifm2_tensor.mem_type not in (MemType.Scratch, MemType.Scratch_fast)): + copy_ifm_values_to_memory_tensor(sg.flash_tensor, ifm2_tensor) + + if sched_op.parent_op.activation_lut: + copy_ifm_values_to_memory_tensor(sg.flash_tensor, sched_op.parent_ps.lut_tensor) + sg.command_stream_tensor = make_memory_tensor( sg.name + "_command_stream", flash_area, MemType.Permanent_CPU, command_stream_size_bytes, True, arch ) |