diff options
author | Johan Alfven <johan.alfven@arm.com> | 2023-02-02 09:07:48 +0100 |
---|---|---|
committer | Johan Alfven <johan.alfven@arm.com> | 2023-03-14 11:00:58 +0100 |
commit | 90724965751e882c58de74a044cc7adab307bc55 (patch) | |
tree | 425ccea87487b66ca298a801b298fbf8567f86d9 /ethosu/vela/npu_performance.py | |
parent | bb9885190f5f7ea959f171b38ee1dd44d3e1e75e (diff) | |
download | ethos-u-vela-90724965751e882c58de74a044cc7adab307bc55.tar.gz |
MLBEDSW-6260: Add support for using DMA to copy feature maps
- Reshape ops can be bypassed, so they do not need to be processed by the NPU.
However, there are use cases where the IFM must be preserved, so a memcpy is needed.
This is implemented by an AvgPool.
- In order to reduce the cost of the AvgPool, the IFM can be copied by DMA instead.
This is faster, and it can also be turned into a real NOP in cases where
the IFM and the OFM can use the same memory space.
- Added a new memcpy op. Only the NHWC format is supported since DMA cannot change
the format on the fly.
- Allow the OFM to reuse the IFM for the memcpy op
- Make sure the DMA copy size is 16-byte aligned
Change-Id: I3605a48d47646ff60d2bb3644dd3a23f872235a7
Signed-off-by: Johan Alfven <johan.alfven@arm.com>
Diffstat (limited to 'ethosu/vela/npu_performance.py')
-rw-r--r-- | ethosu/vela/npu_performance.py | 32 |
1 files changed, 25 insertions, 7 deletions
diff --git a/ethosu/vela/npu_performance.py b/ethosu/vela/npu_performance.py index 967a7ac0..80011244 100644 --- a/ethosu/vela/npu_performance.py +++ b/ethosu/vela/npu_performance.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright 2020-2022 Arm Limited and/or its affiliates <open-source-office@arm.com> +# SPDX-FileCopyrightText: Copyright 2020-2023 Arm Limited and/or its affiliates <open-source-office@arm.com> # # SPDX-License-Identifier: Apache-2.0 # @@ -472,6 +472,10 @@ def measure_cycle_cost(arch, op_type: Op, faf_type: Op, query: PerformanceQuery) _estimate_output_cycles_per_element(arch, op_type, faf_type, query) * Shape4D.round_up(query.ofm_shape, ofm_rounding).elements() ) + # DMA cycle calculation + elif query.npu_block_type == NpuBlockType.Dma: + # Return 0 since this is not an actual NPU op + cycles.op_cycles = 0 else: assert False @@ -541,6 +545,10 @@ def measure_element_access(arch, query: PerformanceQuery): elif query.ifm2_bits > 8: # ifm2 is a non 8-bit scalar access.ifm_read[1] = Shape4D.round_up(query.ifm2_shape, ifm_rounding).elements() + # DMA + elif query.npu_block_type == NpuBlockType.Dma: + # Return empty access since this is not an actual NPU op + return access # Unknown else: assert False @@ -646,18 +654,28 @@ def estimate_full_op_performance( # LUT Transfer parent_op = op.parent_op - lut_transfer_cycles = 0 + dma_transfer_cycles = 0 if parent_op.activation_lut: lut_tensor = [tens for tens in parent_op.inputs if tens.purpose == TensorPurpose.LUT][0] src_tensor = lut_tensor.src_tensor if src_tensor and lut_tensor.mem_area != src_tensor.mem_area: bw = src_tensor.storage_size() - lut_transfer_cycles = measure_mem2mem_cycles(arch, src_tensor.mem_area, lut_tensor.mem_area, bw) + dma_transfer_cycles += measure_mem2mem_cycles(arch, src_tensor.mem_area, lut_tensor.mem_area, bw) bws[src_tensor.mem_area][lut_tensor.purpose][BandwidthDirection.Read] += bw # LUT read from SHRAM TODO remove? 
scaled_bws[lut_tensor.mem_area][lut_tensor.purpose][BandwidthDirection.Read] += bw + # DMA Transfer + if parent_op.type == Op.Memcpy: + src_tensor = parent_op.ifm + dst_tensor = parent_op.ofm + if src_tensor.mem_area != dst_tensor.mem_area: + bw = src_tensor.storage_size() + dma_transfer_cycles += measure_mem2mem_cycles(arch, src_tensor.mem_area, dst_tensor.mem_area, bw) + bws[src_tensor.mem_area][src_tensor.purpose][BandwidthDirection.Read] += bw + bws[dst_tensor.mem_area][src_tensor.purpose][BandwidthDirection.Write] += bw + if cost.npu_weights_tensor and cost.buffered_weight_tensors: # DMA Weight Transfer sz = 0 @@ -690,11 +708,11 @@ def estimate_full_op_performance( cycles.op_cycles + cost.full_weight_transfer_cycles - min(ws_first_transfer_cycles, slack_cycles) ) - # Add cycles for LUT Transfer - cycles_a[PassCycles.Npu] += lut_transfer_cycles + # Add cycles for LUT + mempcy op Transfer + cycles_a[PassCycles.Npu] += dma_transfer_cycles else: - # Add cycles for LUT Transfer - cycles_a[PassCycles.Npu] += max(lut_transfer_cycles - slack_cycles, 0) + # Add cycles for LUT + mempcy op Transfer + cycles_a[PassCycles.Npu] += max(dma_transfer_cycles - slack_cycles, 0) # OFM write ofm = op.parent_op.ofm |