# ethosu/vela/shared_buffer_allocation.py

# Copyright (C) 2020 Arm Limited or its affiliates. All rights reserved.
#
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the License); you may
# not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an AS IS BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


# Description:
# Shared buffer allocation works out how to allocate the Ethos-U55 shared buffer for a given pass.

import numpy as np
from .nn_graph import NpuBlockType
from .numeric_util import round_up_divide, round_up
from .architecture_features import Block, Kernel, SHRAMElements, SharedBufferArea, ArchitectureFeatures
from . import pass_packing


class SharedBufferAllocation:
    """Bank bookkeeping for the shared buffer areas used by a single pass.

    ``banks_required`` and ``bank_locations`` are arrays indexed by
    ``SharedBufferArea``: the first holds how many banks each area needs, the
    second the bank at which each area starts (computed by ``is_valid``).
    """

    def __init__(self, arch, ps):
        """Gather kernel, IFM and element-type information for pass ``ps``.

        ``arch`` supplies the reserved weight/output bank counts and the IFM
        block depth calculation; ``ps`` supplies the primary op and its
        IFM/IFM2/weight/OFM tensors.
        """
        self.arch = arch

        self.bank_locations = np.zeros(SharedBufferArea.Size)
        self.banks_required = np.zeros(SharedBufferArea.Size)

        ifm_tensor, ifm2_tensor, weight_tensor, ofm_tensor = ps.get_primary_op_ifm_ifm2_weights_ofm()

        # Defaults used when the pass has no primary op
        strides = (1, 1, 1, 1)
        dilation = (1, 1, 1, 1)
        self.kernel = Kernel(1, 1)
        is_elementwise = ps.npu_block_type == NpuBlockType.ElementWise

        primary_op = ps.primary_op
        if primary_op:
            op_attrs = primary_op.attrs
            strides = op_attrs.get("strides", strides)
            dilation = op_attrs.get("dilation", dilation)

            if not weight_tensor:
                # No weights: kernel size comes from the op attributes
                kernel_height = op_attrs.get("filter_height", 1)
                kernel_width = op_attrs.get("filter_width", 1)
            elif primary_op.type != "FullyConnectedAct":
                # Kernel size is read from the first two weight dimensions
                kernel_height = weight_tensor.shape[0]
                kernel_width = weight_tensor.shape[1]
            else:
                # FullyConnectedAct keeps a 1x1 kernel even though it has weights
                kernel_height = 1
                kernel_width = 1

            self.kernel = Kernel(kernel_width, kernel_height, strides[2], strides[1], dilation[2], dilation[1])

        equal_depth_block_types = (NpuBlockType.ConvolutionDepthWise, NpuBlockType.Pooling)
        self.is_equal_depth_op = is_elementwise or ps.npu_block_type in equal_depth_block_types
        self.strides = strides

        self.use_accumulator_element = SHRAMElements.Acc32
        self.use_ifm_element = SHRAMElements.IFM8_Elementwise if is_elementwise else SHRAMElements.IFM8

        self.ifm_bits = 0
        self.ifm_depth = 0
        if ifm_tensor:
            self.ifm_bits = ifm_tensor.dtype.size_in_bits()

            # Elementwise operator with scalar in ifm, use ifm2 depth
            scalar_ifm = ifm_tensor.shape == [] and is_elementwise
            depth_source = ifm2_tensor if scalar_ifm else ifm_tensor
            self.ifm_depth = depth_source.shape[-1]

            if self.ifm_bits != 16:
                assert self.ifm_bits == 8, "Unexpected IFM bitdepth"
            else:
                self.use_accumulator_element = SHRAMElements.Acc40
                # Step from the 8-bit IFM element kind to the one that follows it;
                # the assert checks that this is the matching 16-bit kind
                self.use_ifm_element = self.use_ifm_element + 1
                assert (self.use_ifm_element == SHRAMElements.IFM16) or (
                    self.use_ifm_element == SHRAMElements.IFM16_Elementwise
                )

        self.ifm_block_depth = arch.calc_ifm_block_depth(self.ifm_depth, self.ifm_bits)
        self.ofm_tensor = ofm_tensor

        # Weight and OFM areas take the architecture's reserved bank counts
        self.banks_required[SharedBufferArea.Weights] = arch.shram_reserved_weight_banks
        self.banks_required[SharedBufferArea.OFM] = arch.shram_reserved_output_banks

    def is_valid(self):
        """Recompute ``bank_locations`` and report whether the layout fits.

        The layout fits when the end of the IFM area does not pass the start
        of the accumulator area.
        """
        # Each area starts where the previous ones end; the first area stays at bank zero
        cumulative_banks = np.cumsum(self.banks_required)
        self.bank_locations[1:] = cumulative_banks[:-1]

        # The accumulator area is instead placed against the end of the buffer
        accumulator_banks = self.banks_required[SharedBufferArea.Accumulators]
        self.bank_locations[SharedBufferArea.Accumulators] = self.arch.shram_total_banks - accumulator_banks

        ifm_start = self.bank_locations[SharedBufferArea.IFM]
        ifm_end = ifm_start + self.banks_required[SharedBufferArea.IFM]
        return ifm_end <= self.bank_locations[SharedBufferArea.Accumulators]

    def try_block(self, ofm_block: Block):
        """Try ``ofm_block`` as the OFM block shape for this allocation.

        Updates the IFM and accumulator bank counts and the bank layout.
        Returns ``(ofm height, ofm width, ifm depth, ofm depth)`` when the
        layout is valid, or ``None`` when either block has no configuration or
        the layout does not fit.
        """
        # Equal-depth ops take the IFM block depth from the OFM block
        if self.is_equal_depth_op:
            depth = ofm_block.depth
        else:
            depth = self.ifm_block_depth

        # IFM block configuration
        ifm_block = self.arch.get_ifm_block_size(depth, ofm_block, self.kernel)
        ifm_config = self.arch.get_block_config(ifm_block.width, ifm_block.height, ifm_block.depth)
        if ifm_config is None:
            return None

        # OFM block configuration
        ofm_config = self.arch.get_block_config(ofm_block.width, ofm_block.height, ofm_block.depth)
        if ofm_config is None:
            return None

        # Record the bank demand of the IFM and accumulator areas
        self.banks_required[SharedBufferArea.IFM] = ifm_config.banks[self.use_ifm_element]
        self.banks_required[SharedBufferArea.Accumulators] = ofm_config.banks[self.use_accumulator_element]

        # is_valid() lays the banks out and tells us whether they fit
        if self.is_valid():
            return (ofm_block.height, ofm_block.width, ifm_block.depth, ofm_block.depth)

        return None

    def generate_used_mask(self, active_set):
        """Return an int64 array with one entry per bank, set to 1 for every
        bank covered by the areas in ``active_set`` and 0 elsewhere."""
        mask = np.zeros(self.arch.shram_total_banks, dtype=np.int64)
        for area in active_set:
            first_bank = int(self.bank_locations[area])
            bank_count = int(self.banks_required[area])
            mask[first_bank : first_bank + bank_count] = 1
        return mask

    def is_compatible(first, second):
        """See if the bank allocations of two convolutions are compatible,
        so that they can run back-to-back without a fence in between"""

        # Banks used by the first op's OFM/accumulator areas must not overlap
        # the banks used by the second op's IFM/weight areas
        first_areas = {SharedBufferArea.OFM, SharedBufferArea.Accumulators}
        second_areas = {SharedBufferArea.IFM, SharedBufferArea.Weights}

        first_banks = first.generate_used_mask(first_areas)
        second_banks = second.generate_used_mask(second_areas)

        overlapping_banks = np.sum(first_banks & second_banks)
        return not overlapping_banks


def shared_buffer_allocation_for_pass_and_block_config(arch, ps, block_config):
    """Build the allocation for pass ``ps`` using a specific block config.

    ``block_config`` is indexed as (ofm height, ofm width, ifm depth, ofm depth),
    the same layout ``SharedBufferAllocation.try_block`` returns. Returns the
    allocation when the block fits, otherwise ``None``.
    """
    allocation = SharedBufferAllocation(arch, ps)

    # The requested IFM depth must match the computed one unless the op is an equal-depth op
    assert (allocation.ifm_block_depth == block_config[2]) or allocation.is_equal_depth_op

    # Block takes (width, height, depth) whereas block_config stores height first
    ofm_block = Block(block_config[1], block_config[0], block_config[3])
    return allocation if allocation.try_block(ofm_block) else None


def find_block_configs_suitable_for_pass_and_shared_buffer(arch, ps):
    """Return every block config that fits the shared buffer for pass ``ps``.

    When ``arch.override_block_config`` is set, only that block is tried and
    it must be usable. Otherwise OFM block shapes are enumerated in steps of
    the OFM micro-block, and each shape accepted by
    ``SharedBufferAllocation.try_block`` is collected. At least one config
    must be found.
    """
    allocation = SharedBufferAllocation(arch, ps)

    forced_block = arch.override_block_config
    if forced_block:
        forced_config = allocation.try_block(forced_block)
        assert forced_config, "Block config override cannot be used"
        return [forced_config]

    # Constrain the search space if the OFM is smaller than the max block size
    # - Add other block search constraints here if required
    ofm_shape = allocation.ofm_tensor.shape
    if len(ofm_shape) == 2:
        # Two-dimensional OFM: the first dimension bounds both width and height
        max_block_height = max_block_width = ofm_shape[0]
    else:
        max_block_width = ofm_shape[-2]
        max_block_height = ofm_shape[-3]

    # Common block depth
    max_block_depth = ofm_shape[-1]

    # Constrain to valid ranges before search
    block_limit = arch.ofm_block_max
    max_block_width = min(block_limit.width, max_block_width)
    max_block_height = min(block_limit.height, max_block_height)
    max_block_depth = min(block_limit.depth, max_block_depth)

    # Candidates are multiples of the micro-block size, up to the first multiple
    # that reaches or exceeds the limit
    ublock = arch.ofm_ublock
    candidate_widths = range(ublock.width, max_block_width + ublock.width, ublock.width)
    candidate_heights = range(ublock.height, max_block_height + ublock.height, ublock.height)
    candidate_depths = range(ublock.depth, max_block_depth + ublock.depth, ublock.depth)
    split_depth = ArchitectureFeatures.OFMSplitDepth

    found_configs = []
    # Try a range of block shapes against this pass
    for width in candidate_widths:
        for height in candidate_heights:
            for depth in candidate_depths:
                # OFM block depth has the constraint that if it causes the OFM to be
                # split, it must be a multiple of the OFM split size
                if depth < max_block_depth and depth % split_depth != 0:
                    continue
                candidate = allocation.try_block(Block(width, height, depth))
                if candidate:
                    found_configs.append(candidate)

    assert len(found_configs) > 0
    return found_configs