MLBEDSW-6260: Add support for using DMA to copy feature maps

- Reshape ops can be bypassed, so there is no need for the NPU to process them. However, there are use cases where the IFM must be preserved, so a memcpy is needed. This is implemented by an AvgPool.
- In order to reduce the cost of the AvgPool, the IFM can instead be copied by DMA. This is faster, and it can also be turned into a real NOP in cases where the IFM and the OFM can use the same memory space.
- Added a new memcpy op. Only the NHWC format is supported, since DMA cannot change the format on the fly.
- Allow the OFM to reuse the IFM for the memcpy op.
- Make sure the DMA copy size is 16-byte aligned.

Change-Id: I3605a48d47646ff60d2bb3644dd3a23f872235a7
Signed-off-by: Johan Alfven <johan.alfven@arm.com>
author: Johan Alfven <johan.alfven@arm.com> 2023-02-02 09:07:48 +0100
committer: Johan Alfven <johan.alfven@arm.com> 2023-03-14 11:00:58 +0100
commit: 90724965751e882c58de74a044cc7adab307bc55 (patch)
tree: 425ccea87487b66ca298a801b298fbf8567f86d9 /ethosu/vela/high_level_command_to_npu_op.py
parent: bb9885190f5f7ea959f171b38ee1dd44d3e1e75e (diff)
download: ethos-u-vela-90724965751e882c58de74a044cc7adab307bc55.tar.gz
1 files changed, 6 insertions, 1 deletions
diff --git a/ethosu/vela/high_level_command_to_npu_op.py b/ethosu/vela/high_level_command_to_npu_op.py
index 2c62c6f7..7634fe1f 100644
--- a/ethosu/vela/high_level_command_to_npu_op.py
+++ b/ethosu/vela/high_level_command_to_npu_op.py
@@ -54,6 +54,7 @@ from .ethos_u55_regs.ethos_u55_regs import resampling_mode
 from .high_level_command_stream import Box
 from .high_level_command_stream import Command
 from .high_level_command_stream import DMA
+from .high_level_command_stream import NOP
 from .high_level_command_stream import NpuStripe
 from .numeric_util import quantise_float32
 from .numeric_util import round_up
@@ -627,7 +628,8 @@ def create_dma_op(cmd: DMA, arch: ArchitectureFeatures) -> NpuDmaOperation:
     else:
         src_addr = cmd.in_tensor.address_for_coordinate(cmd.box.start_coord)
         dest_addr = cmd.out_tensor.address_for_coordinate(cmd.box.start_coord)
-        sz = cmd.in_tensor.address_for_coordinate(cmd.box.end_coord, is_top_box=True) - src_addr
+        # DMA must use 16 bytes alignment (tensors are always aligned but the sz calculation uses actual size)
+        sz = round_up(cmd.in_tensor.address_for_coordinate(cmd.box.end_coord, is_top_box=True) - src_addr, 16)
     src = NpuAddressRange(src_region, int(src_addr), int(sz))
     dest = NpuAddressRange(dest_region, int(dest_addr), int(sz))
     return NpuDmaOperation(src, dest)
@@ -663,6 +665,9 @@ def generate_register_command_stream_for_sg(nng, sg, arch, verbose=False):
     for cmd in sg.high_level_command_stream:
         if isinstance(cmd, NpuStripe) and cmd.ps.npu_block_type == NpuBlockType.Default:
             print("Warning: Skipping register command stream generation for", cmd.ps)
+        elif isinstance(cmd, NOP):
+            # NOP should not generate anything
+            continue
         else:
             npu_op = convert_command_to_npu_op(cmd, arch)
             npu_op_list.append(npu_op)
author	Johan Alfven <johan.alfven@arm.com>	2023-02-02 09:07:48 +0100
committer	Johan Alfven <johan.alfven@arm.com>	2023-03-14 11:00:58 +0100
commit	90724965751e882c58de74a044cc7adab307bc55 (patch)
tree	425ccea87487b66ca298a801b298fbf8567f86d9 /ethosu/vela/high_level_command_to_npu_op.py
parent	bb9885190f5f7ea959f171b38ee1dd44d3e1e75e (diff)
download	ethos-u-vela-90724965751e882c58de74a044cc7adab307bc55.tar.gz