ethosu/vela/insert_dma.py
# Copyright (C) 2020 Arm Limited or its affiliates. All rights reserved.
#
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the License); you may
# not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an AS IS BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


# Description:
# Insert DMA operations into the graph for transferring weights.

from .nn_graph import Operation, MemArea, TensorPurpose, NpuBlockType
from . import rewrite_graph


def insert_dma_cmd(op, arch):
    if op.type == "DMA":
        return op  # Already rewritten
    for idx, tens in enumerate(op.inputs):
        if tens.mem_area in (MemArea.Dram, MemArea.OffChipFlash) and tens.mem_area != arch.fast_storage_mem_area:
            if tens.purpose == TensorPurpose.Weights:
                only_vector_product_consumers = True
                for oper in tens.consumers():
                    if oper is None or oper.attrs.get("npu_block_type") != NpuBlockType.VectorProduct:
                        only_vector_product_consumers = False
                        break

                # Vector products have no need for DMA; their weight tensors are only
                # read once and can stay in flash. Other operations re-read their
                # tensors, which is better done from SRAM.
                if not only_vector_product_consumers:
                    # Insert a DMA command here, as well as a new tensor situated in SRAM of the same size.
                    new_tens = tens.clone_into_fast_storage(arch)
                    dma_cmd = Operation("DMA", tens.ops[0].name + "_dma")
                    dma_cmd.inputs = [tens]
                    dma_cmd.outputs = [new_tens]
                    dma_cmd.attrs["source"] = tens.mem_area
                    dma_cmd.attrs["destination"] = new_tens.mem_area
                    dma_cmd.run_on_npu = True
                    new_tens.ops = [dma_cmd]
                    op.inputs[idx] = new_tens
    return op
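
# Worked example (illustration only; hypothetical tensor and op names, not
# taken from a real graph) of the decision rule in insert_dma_cmd():
#
#   fc_weights   in OffChipFlash, consumed only by VectorProduct ops
#       -> left in place: a fully connected layer reads its weights once,
#          so they can be streamed straight from flash.
#
#   conv_weights in OffChipFlash, consumed by a Convolution op
#       -> rewritten: convolutions re-read their weights, so a "DMA" op is
#          inserted that copies them into arch.fast_storage_mem_area and the
#          consumer is repointed at the fast-storage clone.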


def insert_dma_commands(nng, arch, verbose_graph=False):
    for idx, sg in enumerate(nng.subgraphs):
        nng.subgraphs[idx] = rewrite_graph.rewrite_graph_pre_order(sg, arch, [], [insert_dma_cmd])
    if verbose_graph:
        nng.print_graph()
    return nng
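

if __name__ == "__main__":  # pragma: no cover
    # Minimal self-contained sketch of the splice pattern used in
    # insert_dma_cmd(), using hypothetical stand-in classes rather than
    # Vela's real Tensor/Operation types (which need an arch object and a
    # full graph). It demonstrates only the rewiring, not the pass itself.
    class _Op:
        def __init__(self, type_, name):
            self.type = type_
            self.name = name
            self.inputs = []
            self.outputs = []
            self.attrs = {}

    class _Tens:
        def __init__(self, name):
            self.name = name
            self.ops = []  # producing operations

    weights = _Tens("weights")  # imagine this placed in OffChipFlash
    conv = _Op("Conv2D", "conv1")
    conv.inputs = [weights]

    # The splice: clone the tensor into "fast storage", produce the clone
    # with a DMA op, then repoint the consumer at the clone.
    weights_sram = _Tens("weights_sram")
    dma = _Op("DMA", "weights_dma")
    dma.inputs = [weights]
    dma.outputs = [weights_sram]
    weights_sram.ops = [dma]  # the DMA op is the clone's sole producer
    conv.inputs[0] = weights_sram

    assert conv.inputs[0].ops[0].type == "DMA"
    print("conv1 now reads", conv.inputs[0].name, "via", dma.name)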