# Copyright (C) 2021 Arm Limited or its affiliates. All rights reserved.
#
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the License); you may
# not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an AS IS BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Description:
# Early optimisation of the TOSA based network graph, using the rewrite_graph module to do the traversal of the graph.
import numpy as np

from . import rewrite_graph
from .api import NpuRoundingMode
from .data_type import DataType
from .debug_database import DebugDatabase
from .graph_optimiser_util import bypass_memory_only_ops
from .graph_optimiser_util import calc_explicit_padding
from .graph_optimiser_util import convert_depthwise_to_conv
from .graph_optimiser_util import convert_to_lut
from .graph_optimiser_util import move_splitsliceread_to_consumer
from .graph_optimiser_util import needed_total_padding
from .graph_optimiser_util import set_ifm_ofm_op_shapes
from .graph_optimiser_util import set_tensor_equivalence
from .operation import ExplicitScaling
from .operation import Op
from .operation_util import create_add_nop
from .operation_util import create_avgpool_nop
from .operation_util import create_pad_nop
from .shape4d import Shape4D
from .tensor import create_const_tensor
from .tensor import create_equivalence_id
from .tensor import shape_num_elements
from .tensor import Tensor


def replace_rescale_with_avg_pool(rescale_op):
    assert rescale_op.type == Op.Rescale

    avgpool_op = create_avgpool_nop(rescale_op.name + "_avgpool")
    rescale_op_clone = rescale_op.clone()
    op = rescale_op
    op.attrs = avgpool_op.attrs.copy()
    op.type = Op.AvgPool
    DebugDatabase.add_optimised(rescale_op_clone, op)

    return op


def calc_skirt(kernel, input_shape, explicit_padding):
    k_w, k_h = kernel.dilated_wh()
    s_x, s_y = kernel.stride
    ypad = needed_total_padding(int(input_shape.height), int(s_y), int(k_h))
    xpad = needed_total_padding(int(input_shape.width), int(s_x), int(k_w))

    top, left, bottom, right = explicit_padding
    top_pad, bottom_pad = calc_explicit_padding(int(input_shape.height), int(s_y), int(k_h), int(top), int(bottom))
    left_pad, right_pad = calc_explicit_padding(int(input_shape.width), int(s_x), int(k_w), int(left), int(right))

    padding = (top_pad, left_pad, bottom_pad, right_pad)
    skirt = (top_pad, left_pad, ypad - top_pad, xpad - left_pad)
    return padding, skirt


def add_padding_fields(op, arch, nng):
    if op.run_on_npu:
        if "explicit_padding" in op.attrs:
            input_shape = op.ifm_shapes[0]

            if op.type == Op.Conv2DBackpropInputSwitchedBias:
                # TODO not yet supported, but there will be need for separate handling
                assert False
            else:
                padding, skirt = calc_skirt(op.kernel, input_shape, op.attrs.get("explicit_padding"))

            op.attrs["explicit_padding"] = padding
            op.attrs["skirt"] = skirt

    return op


# Counts leading zeroes for a (int32)
def count_leading_zeros(a):
    lz = int(32)
    if a != 0:
        mask = 1 << (32 - 1)
        lz = 0
        while (mask & a) == 0:
            mask = mask >> 1
            lz = lz + 1
    return lz


def calc_scaling_avgpool(op, arch, nng):
    if op.type == Op.AvgPool:
        top, left, _, _ = op.attrs["explicit_padding"]
        # TODO Only support for when global scaling can be used.
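        # ("global scaling" here meaning a single scale/shift pair applied to the whole OFM)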
        # That is when there is no padding
        assert top == 0 and left == 0

        assert op.explicit_scaling is None
        multiplier = []
        shift = []

        kernel_wh = op.kernel.elements_wh()
        k = 32 - count_leading_zeros(kernel_wh - 1)
        numerator = np.int64(((1 << 30) + 1) << k)
        multiplier.append(numerator // kernel_wh)
        shift.append(30 + k)

        op.rounding_mode = NpuRoundingMode.NATURAL
        op.explicit_scaling = ExplicitScaling(False, shift, multiplier)
    return op


def remove_const_transpose(op, arch, nng):
    if op.type == Op.Transpose:
        removed = False
        if len(op.ifm.ops) == 1:
            prev_op = op.ifm.ops[0]
            if prev_op.type == Op.Const:
                # Transpose the Tensor and data and remove Transpose
                # TODO move to Tensor?
                reorder = op.attrs["perms"]
                shape = op.ifm.shape.copy()
                tens = op.ifm

                tens.shape = [shape[idx] for idx in reorder]
                tens.bandwidth_shape = tens.shape
                tens.storage_shape = tens.shape

                if tens.values is not None:
                    tens.values = tens.values.transpose(reorder)

                op.ofm.values = tens.values
                # Bypass the Transpose op
                prev_op.set_output_tensor(op.ofm)
                DebugDatabase.add_optimised(op, prev_op)
                removed = True

        if not removed:
            print("Warning: Cannot remove Transpose, and handling of Transpose is not supported")
            assert False

    return op


def insert_add_copy_for_const(op, ifm_ofm_shape):
    assert op.type == Op.Const
    ofm = op.ofm
    copy_tens = ofm.clone()
    op.set_output_tensor(copy_tens)

    name = ofm.name + "_add"
    ifm2 = create_const_tensor(
        name + "_zero_scalar",
        [1],
        copy_tens.dtype,
        [0],
        copy_tens.dtype.as_numpy_type(),
        quantization=copy_tens.quantization,
    )
    copy_op = create_add_nop(name)
    copy_op.add_input_tensor(copy_tens)
    copy_op.add_input_tensor(ifm2)
    copy_op.set_output_tensor(ofm)
    copy_op.ifm_shapes.append(ifm_ofm_shape)
    copy_op.ifm_shapes.append(Shape4D(ifm2.shape))
    copy_op.ofm_shapes.append(ifm_ofm_shape)
    copy_op.run_on_npu = True

    DebugDatabase.add_optimised(op, copy_op)


# TODO can we change to add for both TFLite and TOSA?
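# Note (illustrative sketch of the rewrite done by insert_add_copy_op_after_tens below):
# an identity ADD (tens + zero scalar) is inserted between `tens` and its consumers, so
# that `tens` itself is kept (e.g. as subgraph output) while the consumers are rewired
# to the copy:
#
#   before:  producer -> tens -> [consumers]
#   after:   producer -> tens -> ADD(tens, 0) -> copy_tens -> [consumers]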
def insert_add_copy_op_after_tens(tens, ifm_ofm_shape):
    tens_cons_list_copy = tens.consumer_list.copy()
    copy_tens = tens.clone()

    name = tens.name + "_add"
    ifm2 = create_const_tensor(
        name + "_zero_scalar",
        [1],
        copy_tens.dtype,
        [0],
        copy_tens.dtype.as_numpy_type(),
        quantization=copy_tens.quantization,
    )
    copy_op = create_add_nop(name)
    copy_op.add_input_tensor(tens)
    copy_op.add_input_tensor(ifm2)
    copy_op.set_output_tensor(copy_tens)
    copy_op.ifm_shapes.append(ifm_ofm_shape)
    copy_op.ifm_shapes.append(Shape4D(ifm2.shape))
    copy_op.ofm_shapes.append(ifm_ofm_shape)
    copy_op.run_on_npu = True

    # Set copy_ifm consumers
    for tens_cons in tens_cons_list_copy:
        if tens_cons is not None:
            for ifm_idx, cons_inp in enumerate(tens_cons.inputs):
                if cons_inp == tens:
                    tens_cons.set_input_tensor(copy_tens, ifm_idx)

    DebugDatabase.add_optimised(tens.ops[0], copy_op)


def get_shape_for_copy_op(shape):
    # remove dimensions that are set to 1
    new_shape = []
    for dim in shape:
        if dim != 1:
            new_shape.append(dim)
    if not new_shape:
        new_shape = [1]

    rank = len(new_shape)
    if rank > 3:
        # Reshape so that batch becomes 1, by moving elements to H dimension
        n = rank - 2
        h = 1
        for i in range(n):
            h *= shape[i]

        new_shape = Shape4D(new_shape[n:]).with_height(h)
    else:
        new_shape = Shape4D(new_shape)

    return new_shape


def fix_sg_input_output_tosa(op, arch, nng):
    if op.type == Op.Const and any(ofm_cons is None for ofm_cons in op.ofm.consumer_list):
        # Const operator with sg output, insert copy op before the ofm
        new_shape = get_shape_for_copy_op(op.ofm.shape.copy())
        insert_add_copy_for_const(op, new_shape)
    elif op.run_on_npu and op.type in (Op.Reshape, Op.Identity):
        # For the Reshape operators we want to remove, tensors are removed.
        # But in order to do this, they cannot be outputs of the sg;
        # this needs to be fixed prior to the removal.
        # Solution is to add a copy op, to maintain the original tensor.
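        # (For example: if a Reshape's ifm is also a subgraph output, the ifm gets an
        # ADD-based copy so the Reshape can still be bypassed without losing that tensor.)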
        # This is also valid when the reshape ifm/ofm is produced respectively
        # consumed by the CPU.

        # Check if operator ifm/ofm are sg ifm/ofm
        ifm_is_sg_ifm = op.ifm.ops[0].type in (Op.Placeholder, Op.SubgraphInput, Op.Const)
        ifm_is_sg_ofm = any(ifm_cons is None for ifm_cons in op.ifm.consumer_list)
        ofm_is_sg_ofm = any(ofm_cons is None for ofm_cons in op.ofm.consumer_list)
        # Check if ifm/ofm is produced respectively consumed by CPU
        ifm_is_cpu_produced = any(ifm_prod is not None and not ifm_prod.run_on_npu for ifm_prod in op.ifm.ops)
        ofm_is_cpu_consumed = any(ofm_cons is not None and not ofm_cons.run_on_npu for ofm_cons in op.ofm.consumer_list)

        if (ifm_is_sg_ofm or ifm_is_sg_ifm or ifm_is_cpu_produced) and (ofm_is_sg_ofm or ofm_is_cpu_consumed):
            # Both ifm and ofm need to persist, but only the ifm needs a copy, in order to remove the operator
            # Decide on ifm/ofm shapes for the copy op based on ifm
            new_shape = get_shape_for_copy_op(op.ifm.shape.copy())
            insert_add_copy_op_after_tens(op.ifm, new_shape)
    return op


def create_add_for_concat(concat_op, name, ifm, ifm_shape: Shape4D, write_offset: Shape4D):
    """Creates an add op for the given concat op/input feature map"""
    ofm = concat_op.ofm
    ifm2 = create_const_tensor(
        name + "_zero_scalar", [1], ofm.dtype, [0], ofm.dtype.as_numpy_type(), quantization=ofm.quantization
    )
    add_op = create_add_nop(name)

    add_op.inputs = [ifm, ifm2]
    add_op.outputs = [ofm]
    add_op.write_offset = write_offset
    add_op.write_shape = ifm_shape
    ofm.ops.append(add_op)
    DebugDatabase.add_optimised(concat_op, add_op)
    add_op.ifm_shapes.append(ifm_shape)
    add_op.ifm_shapes.append(Shape4D(ifm2.shape))
    add_op.ofm_shapes.append(concat_op.ofm_shapes[0])
    add_op.memory_function = Op.ConcatSliceWrite

    return add_op


# TODO Could be further optimized checking the type of the consumer,
# rather than just mimicking the TFLite behaviour depending on type.
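# remove_splitsliceread below either moves the read offset/shape onto the single NPU
# consumer (when the consumer type, shapes and dtype allow it), or materialises the
# slice with an ADD of a zero scalar.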
# TOSA bool_t not considered yet
def remove_splitsliceread(op, arch):
    if op.type == Op.SplitSliceRead:
        # Check if it is possible to put the SplitSliceRead on the tensor consumer, or if an add needs to be inserted
        if (
            len(op.ofm.consumer_list) == 1
            and op.ofm.consumer_list[0] is not None
            and op.ofm.consumer_list[0].run_on_npu
            and op.ofm.consumer_list[0].type != Op.Reshape
            and op.ofm_shapes[0] == Shape4D.from_list(op.ofm.shape)
            and op.ofm.dtype in (DataType.uint8, DataType.int8, DataType.int16)
        ):
            # SplitSliceRead can be performed by tensor consumer
            cons_op = op.ofm.consumer_list[0]
            move_splitsliceread_to_consumer(op, cons_op)
        else:
            name = op.name + "_add"
            ofm = op.ofm
            ifm2 = create_const_tensor(
                name + "_zero_scalar", [1], ofm.dtype, [0], ofm.dtype.as_numpy_type(), quantization=ofm.quantization
            )
            add_op = create_add_nop(name)
            add_op.inputs = [op.ifm, ifm2]
            add_op.outputs = [ofm]
            op.ofm.ops.remove(op)
            op.ofm.ops.append(add_op)
            add_op.ifm_shapes.append(op.ifm_shapes[0])
            add_op.ifm_shapes.append(Shape4D(ifm2.shape))
            add_op.ofm_shapes.append(op.ofm_shapes[0])
            add_op.read_offsets[0] = op.read_offsets[0]
            add_op.read_shapes[0] = op.read_shapes[0]

            op.ifm.consumer_list.remove(op)
            DebugDatabase.add_optimised(op, add_op)


def rewrite_concat(op):
    if not op.run_on_npu or not op.type == Op.Concat:
        return

    offset = 0
    inputs = op.inputs
    axis_4D = op.attrs["axis4D"]

    for idx, inp in enumerate(inputs):
        write_offset = [0, 0, 0, 0]
        write_offset[axis_4D] = offset
        concat_end = offset + op.ifm_shapes[idx][axis_4D]
        create_add_for_concat(
            op, op.name + str(idx) + "_add", inp, op.ifm_shapes[idx], Shape4D.from_list(write_offset)
        )
        offset = concat_end
    assert op.ofm_shapes[0][axis_4D] == offset


def remove_memory_ops(op, arch):
    if op.run_on_npu and op.type in (Op.Reshape, Op.Identity):
        bypass_memory_only_ops(op)


def rewrite_activation(op, arch, nng):
    if op.type not in (Op.ReluN, Op.Clamp):
        return op

    ifm = op.ifm
    zp = ifm.quantization.zero_point if ifm.quantization.zero_point else 0
    if op.ofm.quantization.zero_point is None:
        op.ofm.quantization.zero_point = zp

    if op.type == Op.Clamp:
        op.attrs["min"] = op.attrs["min_int"] - zp
        op.attrs["max"] = op.attrs["max_int"] - zp
    elif op.type == Op.ReluN:
        op.attrs["max"] = op.attrs["max_int"] - zp

    return op


def rewrite_rescale(op, arch, nng):
    if op.type == Op.Rescale:
        ifm = op.ifm
        ofm = op.ofm

        # some error checking
        assert len(ifm.ops) == 1
        prev_op = ifm.ops[0]

        # TODO currently not supported
        assert len(ifm.consumer_list) == 1

        input_zp = op.attrs["input_zp"]
        output_zp = op.attrs["output_zp"]
        multiplier = op.attrs["multiplier"]
        shift = op.attrs["shift"]
        scale32 = op.attrs["scale32"]
        double_round = op.attrs["double_round"]
        per_channel = op.attrs["per_channel"]

        assert ifm.dtype in (DataType.uint8, DataType.int8, DataType.int32)
        assert ifm.dtype in (DataType.uint8, DataType.int8) or input_zp == 0
        assert ofm.dtype in (DataType.uint8, DataType.int8) or output_zp == 0
        assert (scale32 and ifm.dtype != DataType.int48) or (not scale32 and not double_round)

        # Check that input tensor has the same zp or no zp
        ifm_zp = ifm.quantization.zero_point
        if ifm_zp is not None and ifm_zp != input_zp:
            print("Error (fuse_rescale): zp of tensors producer/consumer differs unexpectedly")
            assert False
        ifm.quantization.zero_point = input_zp
        ofm.quantization.zero_point = output_zp
        for s, m in zip(shift, multiplier):
            # TODO these are the TOSA limitations
            assert m >= 0
            assert 2 <= s <= 62
            # TODO these are the HW limitations
            assert 0 <= s < (1 << 6)
        explicit_scaling = ExplicitScaling(per_channel, shift, multiplier)
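
        # double_round is only valid together with scale32 (asserted above); when set it
        # maps to the TFLite rounding mode on the NPU, otherwise natural rounding is used.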
        if double_round and scale32:
            rounding_mode = NpuRoundingMode.TFL
        else:
            rounding_mode = NpuRoundingMode.NATURAL

        if prev_op.type.is_depthwise_conv2d_op() or prev_op.type.is_conv2d_op() or prev_op.type == Op.FullyConnected:
            assert len(multiplier) == len(shift) == len(prev_op.bias.values)

            if ifm.dtype == DataType.int32 and per_channel:
                prev_op.explicit_scaling = explicit_scaling
                prev_op.rounding_mode = rounding_mode

                # Bypass op
                prev_op.set_output_tensor(ofm)
                DebugDatabase.add_optimised(op, prev_op)
                return op
            else:
                print("Warning, unsupported fusing of TOSA Rescale previous operator is of type:", prev_op.type)
                assert False
        # TODO which are the cases we need to and can do standalone Rescale?
        # TODO should we try to identify a conversion uint8<->int8 accomplished by 2 RESCALE ops?
        # origin might be TFLite op QUANTIZE, should we look to see if they can be translated to QUANTIZE?
        # limited to these at the moment:
        elif (
            (ifm.dtype == DataType.int8 and ofm.dtype == DataType.int8)
            or (ifm.dtype == DataType.uint8 and ofm.dtype == DataType.int8)
            or (ifm.dtype == DataType.int8 and ofm.dtype == DataType.uint8)
        ):
            # Create NOP performing the RESCALE
            avgpool_op = replace_rescale_with_avg_pool(op)
            avgpool_op.rounding_mode = rounding_mode

            if per_channel:
                # TODO
                avgpool_op.explicit_scaling = explicit_scaling
                print("Warning, unsupported TOSA Rescale")
                assert False
            else:
                avgpool_op.explicit_scaling = explicit_scaling
        else:
            print("Warning, unsupported fusing of TOSA Rescale previous operator is of type:", prev_op.type)
            assert False
    return op


def convert_pad_in_width(op):
    """
    Rewrites PAD operator to an add that copies the IFM to the OFM
    + up to 4 add operators that fill the OFM with zeros at the borders.
    """
    assert op.type == Op.Pad
    assert op.ifm_shapes[0] is not None and op.ofm_shapes[0] is not None
    ifm = op.ifm
    ofm = op.ofm
    ifm_shape = op.ifm_shapes[0]
    ofm.ops = []
    ofm_shape = op.ofm_shapes[0]

    padding = op.inputs[1].values
    left, right = padding[-2]

    # Add op that copies IFM to the right place inside the OFM
    shp0 = Shape4D(0, 0, 0, 0)
    add_op = create_add_for_concat(op, op.name + "_main", ifm, ifm_shape, shp0.with_width(left))
    add_op.activation = op.activation

    quant = ofm.quantization
    pad_value = ifm.quantization.zero_point
    ifm.quantization.zero_point = 0
    if left > 0:
        shape = Shape4D(1, ifm_shape.height, left, ofm_shape.depth)
        zero_tens = create_const_tensor(
            op.name + "_left", shape.as_list(), ofm.dtype, shape.elements() * [pad_value], np.uint8, quantization=quant
        )
        zero_tens.equivalence_id = create_equivalence_id(tuple(zero_tens.values))
        create_add_for_concat(op, op.name + "_left", zero_tens, shape, shp0)
    if right > 0:
        shape = Shape4D(1, ifm_shape.height, right, ofm_shape.depth)
        zero_tens = create_const_tensor(
            op.name + "_right", shape.as_list(), ofm.dtype, shape.elements() * [pad_value], np.uint8, quantization=quant
        )
        zero_tens.equivalence_id = create_equivalence_id(tuple(zero_tens.values))
        create_add_for_concat(op, op.name + "_right", zero_tens, shape, shp0.with_width(ofm_shape.width - right))

    op.type = Op.ConcatTFLite
    return add_op


def convert_table_to_lut(op, arch, nng):
    # Converts table op to a no-op + LUT
    if op.type is not Op.Table:
        return op

    table = op.inputs[1]
    op.inputs.remove(table)
    op.set_ifm_ofm_shapes()

    return convert_to_lut(op, table.values, "table")


def decompose_elem_tensors_hwc(op):
    """
    Decomposes elementwise op if any of the ifm(s)/ofm are too large in any dimension to be handled by the NPU
    """
    max_t_size = 65535
    ofm_shape = op.write_shape if op.write_shape is not None else op.ofm_shapes[0]
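    # As for write_shape above, the read shapes take precedence over ifm_shapes because
    # the op may already be a partial cut produced by decomp_dims_elementwise.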
    ifm_shape = op.read_shapes[0] if op.read_shapes[0] is not None else op.ifm_shapes[0]
    ifm2_shape = op.ifm_shapes[1] if op.ifm_shapes[1] else None
    ifm2_shape = op.read_shapes[1] if op.read_shapes[1] is not None else ifm2_shape
    limit_shape = Shape4D(1, max_t_size, max_t_size, max_t_size)

    if any(dim_size > max_t_size for dim_size in ofm_shape.as_list()):
        ofm_split = ofm_shape.floordiv_const(max_t_size).add(1, 1, 1, 1)

        for height in range(ofm_split.height):
            for width in range(ofm_split.width):
                for depth in range(ofm_split.depth):
                    ofm_offset = Shape4D(0, height * max_t_size, width * max_t_size, depth * max_t_size)
                    ofm_part_shape = ofm_shape.clip(ofm_offset, limit_shape)
                    ofm_cut = (ofm_offset, ofm_part_shape)

                    ifm_d = depth * max_t_size if ifm_shape.depth == ofm_shape.depth else 0
                    ifm_w = width * max_t_size if ifm_shape.width == ofm_shape.width else 0
                    ifm_h = height * max_t_size if ifm_shape.height == ofm_shape.height else 0
                    ifm_offset = Shape4D(0, ifm_h, ifm_w, ifm_d)
                    ifm_part_shape = ifm_shape.clip(ifm_offset, limit_shape)
                    ifm_cut = (ifm_offset, ifm_part_shape)

                    if ifm2_shape is not None:
                        ifm2_d = depth * max_t_size if ifm2_shape.depth == ofm_shape.depth else 0
                        ifm2_w = width * max_t_size if ifm2_shape.width == ofm_shape.width else 0
                        ifm2_h = height * max_t_size if ifm2_shape.height == ofm_shape.height else 0
                        ifm2_offset = Shape4D(0, ifm2_h, ifm2_w, ifm2_d)
                        ifm2_part_shape = ifm2_shape.clip(ifm2_offset, limit_shape)
                        ifm2_cut = (ifm2_offset, ifm2_part_shape)
                    else:
                        ifm2_cut = (None, None)

                    create_elem_part_op(op, ifm_cut, ifm2_cut, ofm_cut)
        op.ofm.ops.remove(op)
        op.ifm.consumer_list.remove(op)
        if op.ifm2 is not None:
            op.ifm2.consumer_list.remove(op)
    return


def create_elem_part_op(op, ifm_cut, ifm2_cut, ofm_cut):
    part_op = op.clone()
    ifm_read_offset = op.read_offsets[0] if op.read_offsets[0] is not None else Shape4D(0, 0, 0, 0)
    ofm_write_offset = op.write_offset if op.write_offset is not None else Shape4D(0, 0, 0, 0)
    ifm_offset, ifm_shape = ifm_cut
    ofm_offset, ofm_shape = ofm_cut

    part_op.read_offsets[0] = ifm_read_offset + ifm_offset
    part_op.read_shapes[0] = ifm_shape
    part_op.write_offset = ofm_write_offset + ofm_offset
    part_op.write_shape = ofm_shape
    part_op.ifm_shapes = op.ifm_shapes.copy()
    part_op.ofm_shapes = op.ofm_shapes.copy()

    part_op.ifm.consumer_list.append(part_op)
    op.ofm.ops.append(part_op)

    # ifm2_cut is either None (no second input) or an (offset, shape) tuple, possibly (None, None)
    ifm2_offset, ifm2_shape = ifm2_cut if ifm2_cut is not None else (None, None)
    if ifm2_offset:
        ifm2_read_offset = op.read_offsets[1] if op.read_offsets[1] is not None else Shape4D(0, 0, 0, 0)
        part_op.read_offsets[1] = ifm2_read_offset + ifm2_offset
        part_op.read_shapes[1] = ifm2_shape
        part_op.ifm2.consumer_list.append(part_op)

    return part_op


def get_nhwc_stride(shape):
    stride_x = shape.depth
    stride_y = shape.width * stride_x
    stride_n = shape.height * stride_y
    return Shape4D(stride_n, stride_y, stride_x, 1)


def pad_to_rank(shape, rank):
    """
    Pads a shape to the given rank
    """
    while len(shape) < rank:
        shape = [1] + shape

    return shape


def get_elem_shapes_removed_singles(op):
    """
    Returns the shapes of ifm(s)/ofm after removing all the dimensions that are 1 for all ifm(s)/ofm
    """
    binary = op.ifm2 is not None
    ofm_shape = op.ofm_shapes[0].as_list() if len(op.ofm_shapes) > 0 else op.ofm.shape
    ifm_shape = op.ifm_shapes[0].as_list() if len(op.ifm_shapes) > 0 else op.ifm.shape
    if binary:
        ifm2_shape = op.ifm_shapes[1].as_list() if len(op.ofm_shapes) else op.ifm2.shape

    rank = max(len(ofm_shape), len(ifm_shape), len(ifm2_shape) if binary else 0)
    ofm_shape = pad_to_rank(ofm_shape, rank)
    ifm_shape = pad_to_rank(ifm_shape, rank)
    if binary:
        ifm2_shape = pad_to_rank(ifm2_shape, rank)

    new_ofm_shape = []
    new_ifm_shape = []
    new_ifm2_shape = []
    for idx in range(rank):
        if ofm_shape[idx] != 1:
            new_ofm_shape.append(ofm_shape[idx])
            new_ifm_shape.append(ifm_shape[idx])
            if binary:
                new_ifm2_shape.append(ifm2_shape[idx])

    if new_ofm_shape == []:
        new_ofm_shape = [1]
        new_ifm_shape = [1]
        new_ifm2_shape = [1] if binary else None

    return new_ofm_shape, new_ifm_shape, new_ifm2_shape


def decomp_dims_elementwise(op):
    """
    Decompose elementwise ops with Rank > 3 (H,W,D).
    If Rank > 3, all the dimensions above H are viewed as the N dimension and the
    elementwise operation is decomposed into N (of the ofm) elementwise operations,
    reading and writing with offsets from/to the ifm(s)/ofm.
    Note: broadcast needs to be handled for binary elementwise ops, and TOSA allows
    broadcast by both ifm and ifm2.
    """
    ifm = op.ifm
    ifm2 = op.ifm2
    ofm = op.ofm
    binary = op.ifm2 is not None

    # Remove dimensions that are all 1
    new_ofm_shape, new_ifm_shape, new_ifm2_shape = get_elem_shapes_removed_singles(op)
    rank = len(new_ofm_shape)

    if rank > 3:
        n = rank - 3
        ofm_decomp_shape = Shape4D(new_ofm_shape[0:n])
        ofm_decomp_stride = get_nhwc_stride(ofm_decomp_shape)
        ofm_part_shape = Shape4D(new_ofm_shape[n:])
        op.ofm_shapes.append(Shape4D([ofm_decomp_shape.elements()] + new_ofm_shape[n:]))

        if binary:
            ifm_decomp_shape = Shape4D(new_ifm_shape[0:n])
            ifm2_decomp_shape = Shape4D(new_ifm2_shape[0:n])
            ifm_decomp_stride = get_nhwc_stride(ifm_decomp_shape)
            ifm2_decomp_stride = get_nhwc_stride(ifm2_decomp_shape)
            ifm_part_shape = Shape4D(new_ifm_shape[n:])
            ifm2_part_shape = Shape4D(new_ifm2_shape[n:])

            op.ifm_shapes.append(Shape4D([ifm_decomp_shape.elements()] + new_ifm_shape[n:]))
            op.ifm_shapes.append(Shape4D([ifm2_decomp_shape.elements()] + new_ifm2_shape[n:]))
        else:
            op.ifm_shapes.append(Shape4D([ofm_decomp_shape.elements()] + new_ofm_shape[n:]))

        op_list = []
        for height in range(ofm_decomp_shape.height):
            for width in range(ofm_decomp_shape.width):
                for depth in range(ofm_decomp_shape.depth):
                    ofm_offset = Shape4D(0, height, width, depth)
                    ofm_offset = Shape4D(ofm_offset.dot_prod(ofm_decomp_stride), 0, 0, 0)
                    ofm_cut = (ofm_offset, ofm_part_shape)

                    if binary:
                        ifm_d = depth if ifm_decomp_shape.depth == ofm_decomp_shape.depth else 0
                        ifm_w = width if ifm_decomp_shape.width == ofm_decomp_shape.width else 0
                        ifm_h = height if ifm_decomp_shape.height == ofm_decomp_shape.height else 0
                        ifm_offset = Shape4D(0, ifm_h, ifm_w, ifm_d)
                        ifm_offset = Shape4D(ifm_offset.dot_prod(ifm_decomp_stride), 0, 0, 0)
                        ifm_cut = (ifm_offset, ifm_part_shape)

                        ifm2_d = depth if ifm2_decomp_shape.depth == ofm_decomp_shape.depth else 0
                        ifm2_w = width if ifm2_decomp_shape.width == ofm_decomp_shape.width else 0
                        ifm2_h = height if ifm2_decomp_shape.height == ofm_decomp_shape.height else 0
                        ifm2_offset = Shape4D(0, ifm2_h, ifm2_w, ifm2_d)
                        ifm2_offset = Shape4D(ifm2_offset.dot_prod(ifm2_decomp_stride), 0, 0, 0)
                        ifm2_cut = (ifm2_offset, ifm2_part_shape)

                        op_list.append(create_elem_part_op(op, ifm_cut, ifm2_cut, ofm_cut))
                    else:
                        op_list.append(create_elem_part_op(op, ofm_cut, None, ofm_cut))

        ofm.ops.remove(op)
        ifm.consumer_list.remove(op)
        if binary:
            ifm2.consumer_list.remove(op)
        return op_list
    else:
        op.ofm_shapes.append(Shape4D(new_ofm_shape))
        op.ifm_shapes.append(Shape4D(new_ifm_shape))
        op.ifm_shapes.append(Shape4D(new_ifm2_shape))

    return [op]


def decomp_elementwise(tens, arch, nng):
    """
    Decompose elementwise ops with Rank > 3 (H,W,C).
    Decompose tensors that exceed the NPU max size
    """
    tens_ops = tens.ops.copy()
    for op in tens_ops:
        if op.type.is_elementwise_op():
            decomp_list = decomp_dims_elementwise(op)
            for part_op in decomp_list:
                decompose_elem_tensors_hwc(part_op)
    return tens


def reshape_concat_shape(shape, rank, axis):
    new_h = 1
    for i in range(axis):
        new_h *= shape[i]
    new_c = 1
    for i in range(axis + 1, rank):
        new_c *= shape[i]
    if axis == (rank - 1):
        new_shape = [new_h, shape[axis], 1]
    else:
        new_shape = [new_h, shape[axis], new_c]
    return new_shape


def reshape_concat(op):
    """
    Reshapes concat ops with Rank > 3 (H,W,C).
    """
    ofm = op.ofm
    rank = len(ofm.shape)
    axis = op.attrs["axis"]
    if axis < 0:
        axis += rank

    if rank > 3:
        # Reshape so that the axis to be concatenated over is the W dimension
        # Reshape inputs
        for inp in op.inputs:
            new_shape = reshape_concat_shape(inp.shape, rank, axis)
            op.ifm_shapes.append(Shape4D(new_shape))
        # Reshape output
        new_shape = reshape_concat_shape(ofm.shape, rank, axis)
        op.ofm_shapes.append(Shape4D(new_shape))
        op.attrs["axis4D"] = 2
    else:
        for inp in op.inputs:
            op.ifm_shapes.append(Shape4D(inp.shape))
        op.ofm_shapes.append(Shape4D(ofm.shape))
        op.attrs["axis4D"] = axis + (4 - rank)


def decomp_rewrite_concat(tens, arch, nng):
    """
    Decompose concat ops with Rank > 3 (H,W,C).
    Rewrite of concat to elementwise operations
    """
    if len(tens.ops) == 1 and tens.ops[0].type == Op.Concat:
        op = tens.ops[0]

        reshape_concat(op)
        rewrite_concat(op)

        op.ofm.ops.remove(op)
        for inp in op.inputs:
            inp.consumer_list.remove(op)

    return tens


def decomp_rewrite_pad(op, arch):
    """
    Decomposition of pad to elementwise operations.
    For each dimension that needs padding:
    - Create a new PAD operator for each dimension to be padded.
      Ifm/ofm are reshaped so that the width dimension is the one to be padded (rank for each is 3).
    - Rewrite the new PAD operator so there is:
      - 1 Add operator for copying the data
      - 1 Add operator for each left/right to be padded
    """
    # TODO several things would be possible to optimize
    # For instance there are cases when it should be possible to pad 2
    # dimensions at the same time.
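    # Illustrative example (hypothetical shapes): padding [[1, 1], [2, 2], [0, 0]] on an
    # ifm of shape [4, 6, 8] gives two PAD ops, one per padded dimension; each is reshaped
    # so that the padded dimension becomes W and is then rewritten by convert_pad_in_width
    # into an ADD that copies the data plus left/right zero-fill ADDs.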
    if op.type == Op.Pad:
        ofm_elements = shape_num_elements(op.ofm.shape)
        padding = op.inputs[1].values

        rank = len(op.ifm.shape)
        next_ifm = op.ifm
        next_ifm_shape = next_ifm.shape.copy()

        first_pad_rewrite_op = None
        ifm_quant = op.ifm.quantization.clone()

        for dim in range(padding.shape[0]):
            # Check if padding is to be applied in this dimension
            dim_pad = padding[dim]
            if not (dim_pad == 0).all():
                # Reshape so that the width dimension is the one to be padded
                new_ifm_shape = reshape_concat_shape(next_ifm_shape, rank, dim)
                new_pad_input = np.zeros((4, 2), dtype=np.int32)
                new_pad_input[2] = dim_pad

                pad_op = create_pad_nop(f"{op.name}_dim_{dim}")
                pad_op.add_input_tensor(next_ifm)
                new_pad_tens = op.inputs[1].clone(f"_dim_{dim}")

                name = op.inputs[1].name + f"_dim_{dim}"
                new_pad_tens = create_const_tensor(
                    name, list(new_pad_input.shape), DataType.int32, new_pad_input, np.int32
                )
                pad_op.add_input_tensor(new_pad_tens)

                new_ofm_shape = new_ifm_shape.copy()
                new_ofm_shape[-2] = new_ofm_shape[-2] + dim_pad.sum()
                next_ifm_shape[dim] = next_ifm_shape[dim] + dim_pad.sum()

                if Shape4D(new_ofm_shape).elements() == ofm_elements:
                    # Last one, use op.ofm
                    ofm = op.ofm
                else:
                    # add a new ofm Tensor
                    ofm = Tensor(new_ofm_shape, op.ofm.dtype, f"{pad_op.name}_tens")
                    ofm.quantization = ifm_quant.clone()

                pad_op.set_output_tensor(ofm)
                pad_op.ifm_shapes.append(Shape4D(new_ifm_shape))
                pad_op.ofm_shapes.append(Shape4D(new_ofm_shape))
                DebugDatabase.add_optimised(op, pad_op)
                next_ifm = ofm

                # Rewrite the pad op
                converted_pad_op = convert_pad_in_width(pad_op)
                first_pad_rewrite_op = converted_pad_op
            else:
                # Change to Identity operation (will be removed)
                op.type = Op.Identity

        if first_pad_rewrite_op:
            assert op.ofm.shape == next_ifm_shape
            for inp in op.inputs:
                inp.consumer_list.remove(op)
            return first_pad_rewrite_op

    return op


def fixup_quantization(op, arch, nng):
    if op.ifm and op.ifm.quantization.zero_point is None:
        op.ifm.quantization.zero_point = 0
    if op.ifm2 and op.ifm2.quantization.zero_point is None:
        op.ifm2.quantization.zero_point = 0
    if not op.forced_output_quantization:
        if op.ofm and op.ofm.quantization and op.ofm.quantization.zero_point is None:
            op.ofm.quantization.zero_point = 0
    return op


def supported_operator_check(op, arch, nng):
    op.run_on_npu = arch.tosa_supported_operators.is_operator_supported(op)
    assert op.run_on_npu or op.type in (Op.Placeholder, Op.SubgraphInput, Op.Const)
    return op


def tosa_optimise_graph(nng, arch):
    # TODO the supported operator checking needs to be split into semantic and HW checks
    for idx, sg in enumerate(nng.subgraphs):
        nng.subgraphs[idx] = rewrite_graph.rewrite_graph_pre_order(
            nng,
            sg,
            arch,
            [],
            [supported_operator_check],
            rewrite_unsupported=False,
        )

    # Decomposing and rewrite of concat
    for idx, sg in enumerate(nng.subgraphs):
        nng.subgraphs[idx] = rewrite_graph.rewrite_graph_pre_order(
            nng, sg, arch, [decomp_rewrite_concat], [], rewrite_unsupported=False
        )

    # Decomposing of pad
    for idx, sg in enumerate(nng.subgraphs):
        rewrite_graph.visit_graph_post_order(sg.output_tensors, arch, [], [decomp_rewrite_pad])
        sg.refresh_after_modification()

    # Handle sg input output
    for idx, sg in enumerate(nng.subgraphs):
        nng.subgraphs[idx] = rewrite_graph.rewrite_graph_pre_order(
            nng,
            sg,
            arch,
            [],
            [fix_sg_input_output_tosa],
            rewrite_unsupported=True,
        )

    # Removal of reshapes
    for sg in nng.subgraphs:
        rewrite_graph.visit_graph_post_order(sg.output_tensors, arch, [], [remove_memory_ops])
        sg.refresh_after_modification()

    # Decomposing of elementwise
    for idx, sg in enumerate(nng.subgraphs):
        nng.subgraphs[idx] = rewrite_graph.rewrite_graph_pre_order(
            nng, sg, arch, [decomp_elementwise], [], rewrite_unsupported=False
        )

    for idx, sg in enumerate(nng.subgraphs):
        nng.subgraphs[idx] = rewrite_graph.rewrite_graph_pre_order(
            nng,
            sg,
            arch,
            [],
            [set_ifm_ofm_op_shapes],
            rewrite_unsupported=False,
        )

    # Removal of Transpose
    for idx, sg in enumerate(nng.subgraphs):
        nng.subgraphs[idx] = rewrite_graph.rewrite_graph_pre_order(
            nng,
            sg,
            arch,
            [],
            [remove_const_transpose],
            rewrite_unsupported=False,
        )

    # TODO, when and where to best handle calc_scaling_avgpool
    for idx, sg in enumerate(nng.subgraphs):
        nng.subgraphs[idx] = rewrite_graph.rewrite_graph_pre_order(
            nng,
            sg,
            arch,
            [],
            [calc_scaling_avgpool],
            rewrite_unsupported=False,
        )

    # Rewrite Operators step
    op_rewrite_list = [set_tensor_equivalence, rewrite_rescale, convert_depthwise_to_conv, convert_table_to_lut]
    for idx, sg in enumerate(nng.subgraphs):
        nng.subgraphs[idx] = rewrite_graph.rewrite_graph_pre_order(
            nng,
            sg,
            arch,
            [],
            op_rewrite_list,
            rewrite_unsupported=False,
        )

    # Post-processing step 1
    for idx, sg in enumerate(nng.subgraphs):
        nng.subgraphs[idx] = rewrite_graph.rewrite_graph_pre_order(
            nng,
            sg,
            arch,
            [],
            [rewrite_activation, add_padding_fields],
        )

    # Removal of SplitSliceRead, needs to be done after optimisation has been performed,
    # since ifm/ofm_shapes are of importance to this function
    for sg in nng.subgraphs:
        rewrite_graph.visit_graph_post_order(sg.output_tensors, arch, [], [remove_splitsliceread])
        sg.refresh_after_modification()

    # Post-processing step 2
    for idx, sg in enumerate(nng.subgraphs):
        nng.subgraphs[idx] = rewrite_graph.rewrite_graph_pre_order(
            nng,
            sg,
            arch,
            [],
            [fixup_quantization],
        )

    return nng
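

# Usage sketch (assumption, not part of this module): the TOSA reader produces the `nng`
# graph and `arch` describes the target NPU configuration; the compiler driver then runs
#
#     nng = tosa_optimise_graph(nng, arch)
#
# before scheduling and tensor allocation take place.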