MLBEDSW-6260: Add support for using DMA to copy feature maps

- Reshape ops can be bypassed and there is no need for the NPU to process them. There are use cases when the IFM must be preserved, so a memcpy is needed. This is implemented by an AvgPool.
- In order to reduce the cost of the AvgPool, the IFM can be copied by DMA. This is faster, and it can also be turned into a real NOP in cases where the IFM and the OFM can use the same memory space.
- Added new memcpy op. Only NHWC format is supported since DMA cannot change the format on the fly.
- Allow ofm to reuse ifm for memcpy op
- Make sure the DMA copy size is 16-byte aligned

Change-Id: I3605a48d47646ff60d2bb3644dd3a23f872235a7
Signed-off-by: Johan Alfven <johan.alfven@arm.com>
author: Johan Alfven <johan.alfven@arm.com> 2023-02-02 09:07:48 +0100
committer: Johan Alfven <johan.alfven@arm.com> 2023-03-14 11:00:58 +0100
commit: 90724965751e882c58de74a044cc7adab307bc55 (patch)
tree: 425ccea87487b66ca298a801b298fbf8567f86d9 /ethosu/vela/npu_performance.py
parent: bb9885190f5f7ea959f171b38ee1dd44d3e1e75e (diff)
download: ethos-u-vela-90724965751e882c58de74a044cc7adab307bc55.tar.gz
1 files changed, 25 insertions, 7 deletions
diff --git a/ethosu/vela/npu_performance.py b/ethosu/vela/npu_performance.py
index 967a7ac0..80011244 100644
--- a/ethosu/vela/npu_performance.py
+++ b/ethosu/vela/npu_performance.py
@@ -1,4 +1,4 @@
-# SPDX-FileCopyrightText: Copyright 2020-2022 Arm Limited and/or its affiliates <open-source-office@arm.com>
+# SPDX-FileCopyrightText: Copyright 2020-2023 Arm Limited and/or its affiliates <open-source-office@arm.com>
 #
 # SPDX-License-Identifier: Apache-2.0
 #
@@ -472,6 +472,10 @@ def measure_cycle_cost(arch, op_type: Op, faf_type: Op, query: PerformanceQuery)
             _estimate_output_cycles_per_element(arch, op_type, faf_type, query)
             * Shape4D.round_up(query.ofm_shape, ofm_rounding).elements()
         )
+    # DMA cycle calculation
+    elif query.npu_block_type == NpuBlockType.Dma:
+        # Op cycles are 0 since this is not an actual NPU op
+        cycles.op_cycles = 0
     else:
         assert False
 
@@ -541,6 +545,10 @@ def measure_element_access(arch, query: PerformanceQuery):
                 elif query.ifm2_bits > 8:
                     # ifm2 is a non 8-bit scalar
                     access.ifm_read[1] = Shape4D.round_up(query.ifm2_shape, ifm_rounding).elements()
+    # DMA
+    elif query.npu_block_type == NpuBlockType.Dma:
+        # Return empty access since this is not an actual NPU op
+        return access
     # Unknown
     else:
         assert False
@@ -646,18 +654,28 @@ def estimate_full_op_performance(
 
     # LUT Transfer
     parent_op = op.parent_op
-    lut_transfer_cycles = 0
+    dma_transfer_cycles = 0
     if parent_op.activation_lut:
         lut_tensor = [tens for tens in parent_op.inputs if tens.purpose == TensorPurpose.LUT][0]
         src_tensor = lut_tensor.src_tensor
         if src_tensor and lut_tensor.mem_area != src_tensor.mem_area:
             bw = src_tensor.storage_size()
-            lut_transfer_cycles = measure_mem2mem_cycles(arch, src_tensor.mem_area, lut_tensor.mem_area, bw)
+            dma_transfer_cycles += measure_mem2mem_cycles(arch, src_tensor.mem_area, lut_tensor.mem_area, bw)
 
             bws[src_tensor.mem_area][lut_tensor.purpose][BandwidthDirection.Read] += bw
             # LUT read from SHRAM TODO remove?
             scaled_bws[lut_tensor.mem_area][lut_tensor.purpose][BandwidthDirection.Read] += bw
 
+    # DMA Transfer
+    if parent_op.type == Op.Memcpy:
+        src_tensor = parent_op.ifm
+        dst_tensor = parent_op.ofm
+        if src_tensor.mem_area != dst_tensor.mem_area:
+            bw = src_tensor.storage_size()
+            dma_transfer_cycles += measure_mem2mem_cycles(arch, src_tensor.mem_area, dst_tensor.mem_area, bw)
+            bws[src_tensor.mem_area][src_tensor.purpose][BandwidthDirection.Read] += bw
+            bws[dst_tensor.mem_area][src_tensor.purpose][BandwidthDirection.Write] += bw
+
     if cost.npu_weights_tensor and cost.buffered_weight_tensors:
         # DMA Weight Transfer
         sz = 0
@@ -690,11 +708,11 @@ def estimate_full_op_performance(
                 cycles.op_cycles + cost.full_weight_transfer_cycles - min(ws_first_transfer_cycles, slack_cycles)
             )
 
-        # Add cycles for LUT Transfer
-        cycles_a[PassCycles.Npu] += lut_transfer_cycles
+        # Add cycles for LUT + memcpy op Transfer
+        cycles_a[PassCycles.Npu] += dma_transfer_cycles
     else:
-        # Add cycles for LUT Transfer
-        cycles_a[PassCycles.Npu] += max(lut_transfer_cycles - slack_cycles, 0)
+        # Add cycles for LUT + memcpy op Transfer
+        cycles_a[PassCycles.Npu] += max(dma_transfer_cycles - slack_cycles, 0)
 
     # OFM write
     ofm = op.parent_op.ofm
author	Johan Alfven <johan.alfven@arm.com>	2023-02-02 09:07:48 +0100
committer	Johan Alfven <johan.alfven@arm.com>	2023-03-14 11:00:58 +0100
commit	90724965751e882c58de74a044cc7adab307bc55 (patch)
tree	425ccea87487b66ca298a801b298fbf8567f86d9 /ethosu/vela/npu_performance.py
parent	bb9885190f5f7ea959f171b38ee1dd44d3e1e75e (diff)
download	ethos-u-vela-90724965751e882c58de74a044cc7adab307bc55.tar.gz