# SPDX-FileCopyrightText: Copyright 2021-2022 Arm Limited and/or its affiliates <open-source-office@arm.com>
#
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the License); you may
# not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an AS IS BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Description:
# Common functions and definitions used during the graph optimization.
from typing import Tuple

import numpy as np

from . import lut
from .architecture_features import Accelerator
from .data_type import DataType
from .debug_database import DebugDatabase
from .errors import UnsupportedFeatureError
from .errors import VelaError
from .operation import Op
from .operation_util import create_avgpool_nop
from .shape4d import Shape4D
from .tensor import create_const_tensor
from .tensor import QuantizationParameters

memory_only_ops = (
    Op.Reshape,
    Op.QuantizedReshape,
    Op.Squeeze,
    Op.ExpandDims,
    Op.Identity,
)

# Ops that depend on the original ifm tensor shape not being changed
# by the bypass memory op function
original_ifm_shape_ops = (Op.Mean,)


def _avoid_nhcwb16_for_concat(tens):
    # If the axis corresponds to the C-dimension, NHCWB16 can only be used in the output if all the concat_start's
    # are a multiple of 16. This is because only then will the address offset for the ofm be 16 byte aligned for
    # all operations. For other values of axis the address offsets will be 16 byte aligned, as they are all based
    # on c = 0 and those addresses are always 16 byte aligned due to the NHCWB16 format.
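    # Illustrative example (hypothetical depths): concatenating tensors of depth 24 and 8 along C gives the
    # second concat write a write_offset.depth of 24, which is not a multiple of 16, so NHCWB16 is avoided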
    return any(op.write_offset.depth % 16 != 0 for op in tens.ops if op.write_offset is not None)


def _avoid_nhcwb16_for_split(tens):
    # If the read offset is not a multiple of 16 in the C-dimension, NHCWB16 needs to be avoided in the input

    # Return True if NHCWB16 needs to be avoided
    def offset_not_aligned(read_offset):
        return read_offset is not None and (read_offset.depth % 16) != 0

    for cons_op in tens.consumer_list:
        if cons_op.ifm == tens:
            if offset_not_aligned(cons_op.read_offsets[0]):
                return True
        if cons_op.ifm2 is not None and cons_op.ifm2 == tens:
            if offset_not_aligned(cons_op.read_offsets[1]):
                return True
    return False


def _avoid_nhcwb16_for_shapes(tens):
    # check all producers/consumers to see if any op shape is preventing NHCWB16
    for cons_op in tens.consumer_list:
        if cons_op.ifm == tens:
            cons_op_shape = cons_op.ifm_shapes[0]
        elif cons_op.type.is_binary_elementwise_op() and cons_op.ifm2 == tens:
            cons_op_shape = cons_op.ifm_shapes[1]
        else:
            assert False
        if Shape4D(tens.shape) != cons_op_shape:
            return True

    for prod_op in tens.ops:
        if Shape4D(tens.shape) != prod_op.ofm_shapes[0]:
            return True

    return False


# Check if the non-linear format (NHCWB16) can be used
def check_format_restrictions(tens, arch):
    if len(tens.ops) < 1:
        return
    if tens.ops[0].type in (Op.Placeholder, Op.SubgraphInput, Op.Const) or any(
        cons is None for cons in tens.consumer_list
    ):
        return

    # Check if any of the producers/consumers is run on CPU
    if not all(cons.run_on_npu for cons in tens.consumer_list):
        return
    if not all(prod.run_on_npu for prod in tens.ops):
        return

    # "Concat" ofm exception:
    if _avoid_nhcwb16_for_concat(tens):
        return

    # "Split" ifm exception:
    if _avoid_nhcwb16_for_split(tens):
        return

    # Shape checking: check that all producers/consumers are NHCWB16 compatible with tens.shape
    if _avoid_nhcwb16_for_shapes(tens):
        return

    # Resize bilinear half pixel center implementation requires OFM with linear format to
    # allow stride modification in H/W dimensions.
    for op in tens.ops:
        if op.original_type == Op.ResizeBilinear and op.type == Op.DepthwiseConv2DBias:
            return

    for op in tens.consumer_list:
        if op.type == Op.ReduceSum and (
            tens.dtype == DataType.int32 or arch.accelerator_config == Accelerator.Ethos_U65_512
        ):
            # ReduceSum requires NHWC input
            return
        if op.type == Op.Reshape:
            # Using NHCWB16 format for a no-op reshape is only an option if subsequent
            # consumers do not also need to perform a reshape or if the OFM is going to
            # be processed by CPU operations. No-op reshape consumers with empty lists
            # (those that have no consumers, or null-consumers used as list terminators)
            # must use normal NHWC output.

            def incompatible_consumers(oper):
                if oper and oper.type == Op.Reshape:
                    for consumer in oper.outputs[0].consumer_list:
                        yield from incompatible_consumers(consumer)
                yield not oper or not oper.run_on_npu

            if not any(incompatible_consumers(op)):

                def get_rewrites(oper):
                    if oper and oper.type == Op.Reshape:
                        for consumer in oper.outputs[0].consumer_list:
                            yield from get_rewrites(consumer)
                        yield oper

                # Detect no-op reshapes by comparing their full input and output tensor shapes.
                inshape = op.ifm_shapes[0]
                compatible_shape = [(inshape == oper.ofm_shapes[0]) for oper in get_rewrites(op)]
                if not (compatible_shape and all(compatible_shape)):
                    return
            else:
                return

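    # No restriction applies, so the tensor does not need the linear format and NHCWB16 may be used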
    tens.needs_linear_format = False


def calc_explicit_padding(input_size, stride, filter_size, pad_before, pad_after) -> Tuple[int, int]:
    """
    Based on explicit padding provided in a PAD operation, returns the corresponding hardware padding
    that provides equivalent results.
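
    Illustrative example: calc_explicit_padding(input_size=10, stride=2, filter_size=3, pad_before=1, pad_after=1)
    returns (1, 0): the needed total padding of 1 is already provided by pad_before, so the requested
    trailing padding is adjusted down to 0.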
    """
    total_padding = needed_total_padding(input_size, stride, filter_size)

    # The bottom/right padding might need downward adjustment depending on stride/input size
    total_minus_before = total_padding - pad_before
    output_pad_after = pad_after
    while output_pad_after > 0 and output_pad_after % stride != total_minus_before % stride:
        output_pad_after -= 1
    return pad_before, output_pad_after


def needed_total_padding(input_size, stride, filter_size):
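    # Total padding needed so that ceil(input_size / stride) output elements can be produced with the given
    # filter size, e.g. (illustrative) input_size=224, stride=2, filter_size=3 gives a total padding of 1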
    out_size = (input_size + stride - 1) // stride
    needed_input = (out_size - 1) * stride + filter_size
    total_padding = max(0, needed_input - input_size)
    return total_padding


# Set input/output tensor equivalence to the same id for memory only operations
def set_tensor_equivalence(op, arch, nng):
    if op.type in memory_only_ops:
        eid = op.outputs[0].equivalence_id
        for inp in op.inputs:
            inp.equivalence_id = eid
    return op


def set_ifm_ofm_op_shapes(op, arch, nng):
    if op.run_on_npu and op.type.needs_shapes():
        if op.ifm_shapes or op.ofm_shapes:
            # Shapes already set
            return op
        op.set_ifm_ofm_shapes()
    return op


def bypass_need_to_keep_ofm_shape(op):
    # Check if the ifm must be replaced by the ofm (the rank is changed, or an op that follows
    # must have the original ifm shape)
    ifm_replaced_by_ofm = any(
        ofm_cons is not None and ofm_cons.type in original_ifm_shape_ops for ofm_cons in op.ofm.consumer_list
    ) or len(op.ifm.shape) != len(op.ofm.shape)
    return ifm_replaced_by_ofm


def bypass_memory_only_ops(op):
    assert op.type in memory_only_ops
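    # Remove the memory only op by making producers and consumers share one tensor: either the ofm replaces
    # the ifm (when the ofm or its shape must be kept), or the ifm replaces the ofm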
    ofm = op.ofm
    ifm = op.ifm

    # Check if ifm/ofm are network ifm/ofm
    ifm_is_sg_ifm = ifm.ops[0].type in (Op.Placeholder, Op.SubgraphInput, Op.Const)
    ifm_is_sg_ofm = any(ifm_cons is None for ifm_cons in ifm.consumer_list)
    ofm_is_sg_ofm = any(ofm_cons is None for ofm_cons in ofm.consumer_list)
    # Check if the ifm is produced, or the ofm consumed, by the CPU
    ifm_is_cpu_produced = any(ifm_prod is not None and not ifm_prod.run_on_npu for ifm_prod in op.ifm.ops)
    ofm_is_cpu_consumed = any(ofm_cons is not None and not ofm_cons.run_on_npu for ofm_cons in op.ofm.consumer_list)

    # This case should be handled prior to this function
    assert not ((ifm_is_sg_ifm or ifm_is_sg_ofm or ifm_is_cpu_produced) and (ofm_is_sg_ofm or ofm_is_cpu_consumed))

    if (ifm.shape != ofm.shape) and (ofm_is_sg_ofm or ofm_is_cpu_consumed or bypass_need_to_keep_ofm_shape(op)):
        # Bypassed by replacing ifm with ofm
        ofm.ops = []
        for prev_op in ifm.ops:
            prev_op.outputs = [ofm]
            ofm.ops.append(prev_op)

        # All ifm consumers need to use ofm as input
        for ifm_cons in ifm.consumer_list:
            for ifm_idx, cons_ifm in enumerate(ifm_cons.inputs):
                if cons_ifm == ifm:
                    ifm_cons.set_input_tensor(ofm, ifm_idx)
    else:
        # Bypassed by replacing ofm with ifm
        for cons in ofm.consumer_list:
            for ifm_idx, cons_ifm in enumerate(cons.inputs):
                if cons_ifm == ofm:
                    cons.set_input_tensor(ifm, ifm_idx)


def move_splitsliceread_to_consumer(op, cons_op):
    assert op.type == Op.SplitSliceRead
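    # Propagate the read offset/shape of the SplitSliceRead to the consumer and connect the consumer directly
    # to the SplitSliceRead's ifm, so that the intermediate ofm tensor is no longer used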

    if cons_op.ifm == op.ofm:
        cons_op.read_offsets[0] = op.read_offsets[0]
        cons_op.read_shapes[0] = op.read_shapes[0]
        cons_op.set_input_tensor(op.ifm, cons_op.type.info.indices.ifms[0])
        cons_op.ifm_shapes[0] = op.ifm_shapes[0]
    elif cons_op.type.is_binary_elementwise_op() and cons_op.ifm2 == op.ofm:
        cons_op.read_offsets[1] = op.read_offsets[0]
        cons_op.read_shapes[1] = op.read_shapes[0]
        cons_op.set_input_tensor(op.ifm, cons_op.type.info.indices.ifms[1])
        cons_op.ifm_shapes[1] = op.ifm_shapes[0]
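    # Disconnect the SplitSliceRead op and its ofm from the graph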
    op.ofm.consumer_list.remove(cons_op)
    op.ofm.ops = []
    op.ifm.consumer_list.remove(op)


def check_memory_only_removed(op, arch):
    if op.run_on_npu and op.type in memory_only_ops:
        # Memory only operators should have been removed
        raise VelaError(f"Memory only {op.type} op {op} expected to have been removed, still remains")


def record_optimised(op, arch):
    if op.type not in (Op.Const, Op.Placeholder):
        DebugDatabase.add_optimised(op, op)


def insert_copy_op_before_op(op):
    # Create an avg_pool nop op with the ifm as input
    tens = op.ifm
    copy_tens = tens.clone()
    copy_op = create_avgpool_nop(f"{tens.name}_avgpool")
    copy_op.add_input_tensor(tens)
    copy_op.set_output_tensor(copy_tens)
    copy_op.set_ifm_ofm_shapes()

    op.set_input_tensor(copy_tens, 0)

    DebugDatabase.add_optimised(op, copy_op)


def insert_copy_op_after_tens(tens):
    tens_cons_list_copy = tens.consumer_list.copy()

    # Create an avg_pool nop op with the ifm as input
    copy_tens = tens.clone()
    copy_op = create_avgpool_nop(tens.name + "_avgpool")
    copy_op.add_input_tensor(tens)
    copy_op.set_output_tensor(copy_tens)
    copy_op.set_ifm_ofm_shapes()
    copy_op.run_on_npu = True

    # Redirect the consumers of tens to use copy_tens as input
    for tens_cons in tens_cons_list_copy:
        if tens_cons is not None:
            for ifm_idx, cons_inp in enumerate(tens_cons.inputs):
                if cons_inp == tens:
                    tens_cons.set_input_tensor(copy_tens, ifm_idx)

    DebugDatabase.add_optimised(tens.ops[0], copy_op)


def fix_sg_input_output(op, arch, nng):
    if not op.run_on_npu or op.type not in memory_only_ops:
        return op

    # When a memory only operator is removed, one of its tensors is removed as well.
    # But in order to do this, the removed tensor cannot be an output of the sg;
    # this needs to be fixed prior to the removal.
    # The solution is to add an avgpool NOP to maintain the original tensor.
    # This is also valid when the memory only op's ifm is produced by,
    # or its ofm consumed by, the CPU.

    # Rare case: original_ifm_shape_ops contains ops that depend on the
    # original ifm tensor shape not being changed by the bypass memory
    # function. If the memory only op's ifm is a subgraph ifm, is cpu produced,
    # or has multiple consumers, then an avgpool NOP needs to be inserted
    # before the original_ifm_shape_ops; note that the NOP is only inserted
    # before original_ifm_shape_ops. The same applies when the memory only
    # op changes the rank between the IFM and OFM.
    #
    # Below is an example showing the case when there is a need for an AVG NOP
    # when RESHAPE is bypassed by replacing IFM with OFM.
    #
    #                Converts to           And in bypass_memory
    #                    --->                     --->
    #    -----ADD-----           -----ADD-----           -----ADD-----
    #    |           |           |           |           |           |
    # 1x6x6x10    1x6x6x10   1x6x6x10    1x6x6x10    1x6x6x10    1x6x6x10
    #  RESHAPE      MEAN     AVG POOL      MEAN      AVG POOL      MEAN
    #    |                       |           |           |
    # 1x20x3x6               1x6x6x10                1x20x3x6
    #   MEAN                  RESHAPE                  MEAN
    #                            |
    #                        1x20x3x6
    #                          MEAN
    ifm_has_multiple_cons = len(op.ifm.consumer_list) > 1

    # Check if operator ifm/ofm are sg ifm/ofm
    ifm_is_sg_ifm = op.ifm.ops[0].type in (Op.Placeholder, Op.SubgraphInput, Op.Const)
    ifm_is_sg_ofm = any(ifm_cons is None for ifm_cons in op.ifm.consumer_list)
    ofm_is_sg_ofm = any(ofm_cons is None for ofm_cons in op.ofm.consumer_list)
    # Check if the ifm is produced, or the ofm consumed, by the CPU
    ifm_is_cpu_produced = any(ifm_prod is not None and not ifm_prod.run_on_npu for ifm_prod in op.ifm.ops)
    ofm_is_cpu_consumed = any(ofm_cons is not None and not ofm_cons.run_on_npu for ofm_cons in op.ofm.consumer_list)

    if bypass_need_to_keep_ofm_shape(op):
        # The bypass needs to keep the OFM shape
        if ifm_has_multiple_cons:
            # Rare case:
            # The IFM needs to persist due to multiple consumers, so a copy op is needed;
            # the OFM will replace the IFM for the memory only op
            insert_copy_op_before_op(op)
        elif not (ofm_is_sg_ofm or ofm_is_cpu_consumed):
            # The IFM has only one consumer and the OFM is neither a subgraph output nor cpu consumed,
            # so it is safe to replace ifm.shape with ofm.shape.
            # The IFM can then replace the OFM for the memory only op and no copy op is needed
            op.ifm.shape = op.ofm.shape

    # Special case when the OFM is sg_ofm or cpu_consumed
    if (ifm_is_sg_ofm or ifm_is_sg_ifm or ifm_is_cpu_produced) and (ofm_is_sg_ofm or ofm_is_cpu_consumed):
        # Both the ifm and ofm need to persist, but only the ifm needs a copy, in order to remove the memory only operator.
        insert_copy_op_after_tens(op.ifm)

    return op


def convert_depthwise_to_conv(op, arch, nng):
    # Depthwise is equivalent to a single conv2d if the ifm depth is 1 and
    # the ofm depth equals the depth multiplier.
    # If those conditions are true, then we can perform a simple
    # switch of the operator type (and weight order)

    if op.type == Op.DepthwiseConv2DBias and (op.attrs["depth_multiplier"] != 1):
        ifm_shape = op.ifm_shapes[0]
        weight_tensor = op.inputs[1]
        ofm_shape = op.ofm_shapes[0]
        if (ifm_shape.depth == 1) and (ofm_shape.depth == op.attrs["depth_multiplier"]):
            # Change op type to Conv2d
            op.type = Op.Conv2DBias
            del op.attrs["channel_multiplier"]
            del op.attrs["depth_multiplier"]

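            # Swap the last two weight axes to change from the depthwise weight ordering to the Conv2D weight ordering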
            weight_tensor.values = np.transpose(weight_tensor.values, (0, 1, 3, 2))
            weight_tensor.set_all_shapes(list(weight_tensor.values.shape))
            DebugDatabase.add_optimised(op, op)
        else:
            raise UnsupportedFeatureError(
                f"Unsupported 'DEPTHWISE_CONV_2D' with depth_multiplier = {op.attrs['depth_multiplier']},"
                f" ifm channels = {ifm_shape.depth}, ofm channels = {ofm_shape.depth}",
            )
    return op


def convert_to_lut(op, lut_values, lut_name):
    # Rewrite the operation as an Add with scalar 0 plus a LUT activation
    ifm = op.inputs[0]
    if ifm is None:
        return op
    assert ifm.dtype.size_in_bytes() == 1
    op.type = Op.Add
    op.name = op.name + "_lut_" + lut_name
    # Mark as no-op to enable potential fusing optimizations
    op.attrs["is_nop"] = True
    # Create an input tensor containing scalar zero
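    # The scalar encodes the real value 0 (value 0 with zero_point 0), so the Add leaves the data unchanged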
    quantization = QuantizationParameters(0.0, 255.0)
    quantization.scale_f32 = ifm.quantization.scale_f32
    quantization.zero_point = 0
    tens = create_const_tensor(op.inputs[0].name + "_scalar0", [], ifm.dtype, [0], np.uint8, quantization=quantization)
    op.add_input_tensor(tens)
    op.ifm_shapes.append(Shape4D(tens.shape))  # TODO no shape?

    # The LUT must be applied without any preceding rescaling (the LUT itself performs the rescale),
    # so even if the OFM has a different scale than the IFM, the generated OFM scale instructions
    # should be the same as for the IFM
    op.forced_output_quantization = ifm.quantization
    lut_tensor = lut.create_lut_tensor(op.name + "_values", lut_values, DataType.int8)
    op.set_activation_lut(lut_tensor)
    op.set_ifm_ofm_shapes()
    DebugDatabase.add_optimised(op, op)
    return op