path: root/ethosu/vela/insert_dma.py
# Copyright (C) 2020 Arm Limited or its affiliates. All rights reserved.
#
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the License); you may
# not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an AS IS BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Description:
# Insert DMA operations into the graph for transferring weights, LUTs and
# oversized elementwise inputs to fast storage.
from . import rewrite_graph
from .operation import NpuBlockType
from .operation import Op
from .operation import Operation
from .tensor import MemArea
from .tensor import MemType
from .tensor import TensorPurpose
from .weight_compressor import compress_weights


def weights_fit_sram(arch, op, tens, nng):
    # Compresses weights and checks if they fit in SRAM
    if tens.purpose != TensorPurpose.Weights:
        return True

    min_weight_size = 0
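    # Rough size (in elements) of the smallest weight slice spanning arch.OFMSplitDepth
    # output channels; this is the unit that gets double buffered in the estimate below.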
    if len(tens.shape) == 4:
        min_weight_size = tens.shape[0] * tens.shape[1] * tens.shape[2] * arch.OFMSplitDepth
    elif len(tens.shape) == 2:
        min_weight_size = tens.shape[0] * arch.OFMSplitDepth

    compress_weights(arch, nng, tens, op.type.npu_block_type, 16, 16, op.get_dilation_h_w())

    # The compressed weights need to fit in SRAM as a double buffer
    worst_buffer_size = tens.compression_scale_for_worst_weight_stream * min_weight_size * 2
    if worst_buffer_size > arch.sram_size:
        print(
            "Weights {} are too big to be DMAed to SRAM; estimated minimum size is {} bytes".format(
                tens.name, worst_buffer_size
            )
        )
        return False
    return True
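

# Illustrative example of the double-buffer estimate above (numbers are made up):
# a 4D weight tensor of shape [3, 3, 64, 128] with arch.OFMSplitDepth == 16 gives
# min_weight_size = 3 * 3 * 64 * 16 = 9216. With an assumed worst-case compression
# scale of 0.5 the double-buffered size is 9216 * 0.5 * 2 = 9216 bytes, which is
# then compared against arch.sram_size.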


def insert_dma_cmd(op, arch, nng):
    if op.type == Op.DMA or not op.run_on_npu:
        return op

    is_lut_used = any(inp.purpose == TensorPurpose.LUT for inp in op.inputs)
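    # Estimate of the SHRAM available to a single input feature map once the
    # output banks are reserved; an elementwise input larger than this is a
    # candidate for DMA further down.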
    max_ifm_shram_avail = (
        (arch.available_shram_banks(is_lut_used) - arch.shram_reserved_output_banks) * arch.shram_bank_size // 2
    )

    for idx, tens in enumerate(op.inputs):

        if tens.mem_type not in (MemType.Scratch, MemType.Scratch_fast):
            # Tensor is in permanent storage
            # Only if permanent storage differs from fast storage is there any point in moving the data
            if (
                tens.mem_area in (MemArea.Dram, MemArea.OffChipFlash)
                and arch.permanent_storage_mem_area != arch.fast_storage_mem_area
            ) or tens.purpose == TensorPurpose.LUT:
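                # Candidates for DMA: weights and LUTs, plus feature maps that are
                # broadcast inputs to a binary elementwise op (shape differs from the
                # output) and are too large to keep resident in SHRAM.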
                if tens.purpose in (TensorPurpose.Weights, TensorPurpose.LUT) or (
                    tens.purpose == TensorPurpose.FeatureMap
                    and op.type.is_binary_elementwise_op()
                    and tens.shape != []
                    and tens.shape != op.outputs[0].shape
                    and tens.storage_size() > max_ifm_shram_avail
                ):
                    only_vector_product_consumers = True
                    for oper in tens.consumers():
                        if oper is None or oper.type.npu_block_type != NpuBlockType.VectorProduct:
                            only_vector_product_consumers = False
                            break

                    # Vector products have no need for DMA: their weights are only read once and can stay in flash.
                    # Other operations re-read their tensors, which is better done from SRAM.
                    # LUTs must be placed in the last 2 blocks of SHRAM.
                    if (
                        not only_vector_product_consumers and weights_fit_sram(arch, op, tens, nng)
                    ) or tens.purpose == TensorPurpose.LUT:
                        # Insert a DMA command here, together with a new tensor of the same size placed in fast storage (SRAM).
                        new_tens = tens.clone_into_fast_storage(arch)
                        dma_cmd = Operation(Op.DMA, tens.ops[0].name + "_dma")
                        dma_cmd.inputs = [tens]
                        dma_cmd.set_output_tensor(new_tens)
                        dma_cmd.attrs["source"] = tens.mem_area
                        dma_cmd.attrs["destination"] = new_tens.mem_area
                        dma_cmd.run_on_npu = True
                        if tens.purpose == TensorPurpose.LUT:
                            new_tens.mem_area = MemArea.Shram
                        op.inputs[idx] = new_tens
    return op
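

# The rewrite performed by insert_dma_cmd, sketched for a single weight tensor
# (illustrative only):
#
#   before:  weights (Flash/DRAM) ------------------------------------> Conv
#   after:   weights (Flash/DRAM) --> DMA --> weights clone (SRAM) ----> Conv
#
# The consuming op's input is redirected to the SRAM clone, and the original
# tensor becomes the input of the new DMA operation.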


def insert_dma_commands(nng, arch, verbose_graph=False):
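    # Graph optimisation pass: rewrite each subgraph in pre-order, letting
    # insert_dma_cmd() insert DMA operations where an input needs to be moved
    # to fast storage.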

    for idx, sg in enumerate(nng.subgraphs):
        nng.subgraphs[idx] = rewrite_graph.rewrite_graph_pre_order(nng, sg, arch, [], [insert_dma_cmd])
    if verbose_graph:
        nng.print_graph()
    return nng
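

# Minimal usage sketch (assumed call pattern; in the real tool this pass is run
# by the Vela compiler driver during graph optimisation):
#
#     from ethosu.vela.insert_dma import insert_dma_commands
#
#     nng = insert_dma_commands(nng, arch, verbose_graph=True)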