# Copyright (C) 2020-2021 Arm Limited or its affiliates. All rights reserved.
#
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the License); you may
# not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an AS IS BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Description:
# Packs a subgraph with Neural Network Operations into Passes. Each Pass has one or more Operations.
import collections
import enum

from .debug_database import DebugDatabase
from .nn_graph import Pass
from .nn_graph import PassPlacement
from .operation import NpuBlockType
from .operation import Op
from .operation_util import create_avgpool_nop
from .tensor import TensorPurpose


class PassFlags(enum.Flag):
    Empty = 0
    Main = 1
    Post = 2
    Mac = 4
    ElementWise = 8
    Npu = 16
    Cpu = 32
    StartupInit = 64
    MemoryOnly = 128
    PostFusingLimited = 256


mac_main_ops = set(
    (
        # convolutions
        Op.Conv2DBias,
        Op.Conv2D,
        Op.QuantizedConv2D,
        Op.Conv2DBackpropInputSwitchedBias,
        # depth-wise convolutions
        Op.DepthwiseConv2DBias,
        # FC layers
        Op.QuantizedMatMul,
        Op.MatMul,
        Op.FullyConnected,
        # RNN/LSTM/GRU
        Op.BlockLSTM,
        # pooling
        Op.QuantizedMaxPool,
        Op.QuantizedAvgPool,
        Op.AvgPool,
        Op.MaxPool,
        Op.ReduceSum,
        # resizing
        Op.ResizeBilinear,
    )
)

binary_elem_wise_main_ops = Op.op_set(Op.is_binary_elementwise_op)

unary_elem_wise_main_ops = Op.op_set(Op.is_unary_elementwise_op)

elem_wise_main_ops = binary_elem_wise_main_ops | unary_elem_wise_main_ops

activation_ops = Op.op_set(Op.is_relu_op)
npu_post_ops = activation_ops

npu_post_fuse_limited_ops = set(
    # Set of post operators that should not be fused with main/elementwise ops
    (Op.Sigmoid, Op.Tanh, Op.Quantize)
)

elem_wise_ops = elem_wise_main_ops | activation_ops | set((Op.Sigmoid, Op.Tanh))


quantization_ops = set((Op.Dequantize, Op.Max, Op.Min))
cpu_ops = set((Op.Softmax, Op.LRN, Op.Shape, Op.Pad, Op.AddN)) | quantization_ops

startup_init_ops = set((Op.Const, Op.Placeholder, Op.SubgraphInput))
memory_only_ops = set(
    (
        Op.Squeeze,
        Op.Reshape,
        Op.QuantizedReshape,
        Op.ExpandDims,
    )
)


test_sequence = [
    (
        # ops_set
        npu_post_ops,
        # incompatible_pack_flags
        PassFlags.Cpu | PassFlags.MemoryOnly | PassFlags.Main,
        # flags_to_set
        PassFlags.Npu | PassFlags.Post,
        # flags_to_clear
        PassFlags.Empty,
    ),
    (
        # ops_set
        npu_post_fuse_limited_ops,
        # incompatible_pack_flags
        PassFlags.Cpu | PassFlags.MemoryOnly | PassFlags.Main | PassFlags.PostFusingLimited,
        # flags_to_set
        PassFlags.Npu | PassFlags.PostFusingLimited,
        # flags_to_clear
        PassFlags.Empty,
    ),
    (
        # ops_set
        mac_main_ops,
        # incompatible_pack_flags
        PassFlags.Cpu | PassFlags.MemoryOnly | PassFlags.ElementWise | PassFlags.Main | PassFlags.PostFusingLimited,
        # flags_to_set
        PassFlags.Npu | PassFlags.Mac | PassFlags.Main,
        # flags_to_clear
        PassFlags.Empty,
    ),
    (
        # ops_set
        elem_wise_main_ops,
        # incompatible_pack_flags
        PassFlags.Cpu | PassFlags.MemoryOnly | PassFlags.Mac | PassFlags.Main | PassFlags.PostFusingLimited,
        # flags_to_set
        PassFlags.Npu | PassFlags.ElementWise | PassFlags.Main,
        # flags_to_clear
        PassFlags.Empty,
    ),
    (
        # ops_set
        startup_init_ops,
        # incompatible_pack_flags
        PassFlags.Npu | PassFlags.Cpu | PassFlags.MemoryOnly,
        # flags_to_set
        PassFlags.StartupInit | PassFlags.Main,
        # flags_to_clear
        PassFlags.Empty,
    ),
    (
        # ops_set
        memory_only_ops,
        # incompatible_pack_flags
        PassFlags.Npu | PassFlags.Cpu,
        # flags_to_set
        PassFlags.MemoryOnly | PassFlags.Main,
        # flags_to_clear
        PassFlags.Empty,
    ),
    (
        # ops_set
        cpu_ops,
        # incompatible_pack_flags
        PassFlags.Npu | PassFlags.MemoryOnly | PassFlags.Main,
        # flags_to_set
        PassFlags.Cpu | PassFlags.Main,
        # flags_to_clear
        PassFlags.Empty,
    ),
    (
        # This last one is a fallback for unrecognised operations
        # ops_set
        None,
        # incompatible_pack_flags
        PassFlags.Npu | PassFlags.MemoryOnly | PassFlags.Main,
        # flags_to_set
        PassFlags.Cpu | PassFlags.Main,
        # flags_to_clear
        PassFlags.Empty,
    ),
]

# Some sanity checking
for (operation_set, incompatible_pack_flags, flags_to_set, flags_to_clear) in test_sequence:
    assert not flags_to_clear & flags_to_set
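
# Illustration: build_pass() below walks operations backwards from an output
# tensor and matches each one against test_sequence in order. For a Conv2D
# followed by a fused ReLU, the ReLU is visited first and matches npu_post_ops
# (setting Npu | Post); the Conv2D then matches mac_main_ops (setting
# Npu | Mac | Main). Neither entry's incompatible_pack_flags intersect the
# flags already set, so both operations end up in the same Pass, with the
# Conv2D as its primary_op.
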
def pack_into_passes(nng, arch, verbose_packing=False):
    def visit_op(op, ignored):
        visit_op_refcount[op] += 1

        if visit_op_refcount[op] == 1:  # First-time visit, go and fix up unused output tensors
            for tens in op.outputs:
                if len(tens.consumers()) == 0:
                    visit_op_refcount[op] += 1

        assert visit_op_refcount[op] <= len(op.outputs)
        if visit_op_refcount[op] == len(op.outputs):
            if op.type in startup_init_ops:
                startup_list.append(op)
            else:
                ofm_tensor = op.ofm
                if ofm_tensor is None:
                    ofm_tensor = op.outputs[0]
                ofm_shape = op.ofm_shapes[0] if op.run_on_npu else None

                build_pass((op,), ofm_tensor, ofm_shape)

    def build_pass(start_ops_to_process, ofm_tensor=None, ofm_shape=None):
        reverse_ops_list = []
        curr_flags = PassFlags.Empty
        npu_block_type = NpuBlockType.Default

        reverse_intermediates = []
        input_set = set()
        ifm_tensor = None
        primary_op = None
        ifm_shapes = None

        to_process = collections.deque()
        for start_op in start_ops_to_process:
            to_process.append((start_op, None))

        while to_process:
            curr_op, tens = to_process.popleft()

            if curr_op in reverse_ops_list:
                continue

            for operation_set, incompatible_pack_flags, flags_to_set, flags_to_clear in test_sequence:
                if operation_set is None or curr_op.type in operation_set:
                    if not (curr_flags & incompatible_pack_flags):
                        if flags_to_set & PassFlags.Npu:
                            if not curr_op.run_on_npu:
                                continue

                        reverse_ops_list.append(curr_op)
                        new_block_type = curr_op.type.npu_block_type
                        if new_block_type != NpuBlockType.Default:
                            assert npu_block_type == NpuBlockType.Default
                            npu_block_type = new_block_type  # Only one major block type per pass
                            assert primary_op is None
                            primary_op = curr_op

                        curr_flags &= ~flags_to_clear
                        curr_flags |= flags_to_set

                        if flags_to_set & PassFlags.Npu:
                            if flags_to_set & (
                                PassFlags.Mac | PassFlags.ElementWise | PassFlags.Post | PassFlags.PostFusingLimited
                            ):
                                assert len(curr_op.inputs) >= 1
                                ifm_tensor = curr_op.ifm
                                ifm_shapes = curr_op.ifm_shapes.copy()
                                assert ifm_tensor is not None, "IFM missing in {}".format(curr_op)
                                assert ifm_tensor.purpose == TensorPurpose.FeatureMap

                        if operation_set is None:
                            print("Warning:", curr_op.type, "operation is unknown or unsupported, placing on CPU")

                        for inp in reversed(curr_op.inputs):
                            if inp is None:
                                continue
                            if can_pack(inp, curr_op):
                                to_process.append((inp.ops[0], inp))
                            else:
                                input_set.add(inp)

                        break

            else:
                # This operation is not compatible with already packed operations, just register the tensor as an input
                assert tens is not None
                input_set.add(tens)
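
        # At this point reverse_ops_list holds every op packed into this pass,
        # in reverse order of execution. As an informal example, a pass built
        # from a single Op.Add carries Npu | ElementWise | Main here, while a
        # lone Op.Sigmoid carries only Npu | PostFusingLimited; the fixup below
        # defaults such mac-less passes to the element wise unit.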
        if curr_flags & PassFlags.Npu and not curr_flags & (PassFlags.ElementWise | PassFlags.Mac):
            # Make the choice that if we don't have a mac operation, the ambidextrous operations go on the
            # element wise unit
            curr_flags |= PassFlags.ElementWise

        is_element_wise = True
        for op in reverse_ops_list:
            if op.type not in elem_wise_ops:
                is_element_wise = False
                break

        placement = PassPlacement.Unknown
        if curr_flags & PassFlags.Npu:
            assert placement == PassPlacement.Unknown
            placement = PassPlacement.Npu
        if curr_flags & PassFlags.Cpu:
            assert placement == PassPlacement.Unknown
            placement = PassPlacement.Cpu
        if curr_flags & PassFlags.MemoryOnly:
            assert placement == PassPlacement.Unknown
            placement = PassPlacement.MemoryOnly
        if curr_flags & PassFlags.StartupInit:
            assert placement == PassPlacement.Unknown
            placement = PassPlacement.StartupInit
        assert placement != PassPlacement.Unknown

        ops_list = list(reversed(reverse_ops_list))
        intermediates = list(reversed(reverse_intermediates))

        if primary_op is None:
            primary_op = create_primary_op(ops_list)
            if primary_op is not None:
                visit_tensor_refcount[primary_op.inputs[0]] += 1
                npu_block_type = primary_op.type.npu_block_type
                for input_tens in primary_op.inputs:
                    if input_tens not in input_set:
                        input_set.add(input_tens)

        ordered_input_list = []
        # Keep LUTs in a separate list and add them as inputs at the end,
        # to avoid them accidentally being assigned as ifm or ifm2
        lut_list = []
        input_refcounts = collections.defaultdict(int)
        input_ops_list = ops_list.copy()

        # Check primary_op first
        if primary_op is not None:
            for inp in primary_op.inputs:
                if inp is None:
                    continue
                add_input_list(inp, input_set, input_refcounts, lut_list, ordered_input_list)
            input_ops_list.remove(primary_op)

        # Check the rest of the list
        for op in input_ops_list:
            for inp in op.inputs:
                add_input_list(inp, input_set, input_refcounts, lut_list, ordered_input_list)

        name = ops_list[0].name
        ps = Pass(name, placement, is_element_wise, npu_block_type)
        ps.ops = ops_list
        ps.primary_op = primary_op
        ps.inputs = ordered_input_list
        ps.intermediates = intermediates
        ps.outputs = list(ops_list[-1].outputs)

        # ElementWise operation, 2 IFMs
        if ps.primary_op and ps.primary_op.type in binary_elem_wise_main_ops:
            ps.ifm_tensor = ps.inputs[0]
            ps.ifm2_tensor = ps.inputs[-1]

            if len(ps.inputs) > 2:
                ps.ifm_tensor = ps.inputs[-2]

            # Get the corresponding ifm_shapes
            for op in input_ops_list + [primary_op]:
                if op.run_on_npu:
                    if ps.ifm_tensor == op.ifm:
                        ps.ifm_shapes.append(op.ifm_shapes[0])
                    elif ps.ifm_tensor == op.ifm2:
                        ps.ifm_shapes.append(op.ifm_shapes[1])

                    if ps.ifm2_tensor == op.ifm:
                        ps.ifm_shapes.append(op.ifm_shapes[0])
                    elif ps.ifm2_tensor == op.ifm2:
                        ps.ifm_shapes.append(op.ifm_shapes[1])
        else:
            ps.ifm_tensor = ifm_tensor
            ps.ifm2_tensor = None
            if ps.primary_op is not None and ps.primary_op.run_on_npu:
                ps.ifm_shapes.append(ifm_shapes[0])

        ps.ofm_tensor = ofm_tensor
        ps.ofm_shapes.append(ofm_shape)
        assert ps.placement != PassPlacement.Npu or ps.ofm_tensor is not None
        ps.weight_tensor = ps.get_primary_op_ifm_weights()[1]
        ps.scale_tensor = ps.get_primary_op_ifm_weights_biases_ofm()[2]
        ps.lut_tensor = ps.get_primary_op_lut()
        ps.inputs.extend(lut_list)

        for op in ps.ops:
            op.scheduled_pass = ps

        reverse_pass_list.append(ps)

        for inp, refcount in input_refcounts.items():
            for _ in range(refcount):
                visit_tensor(inp)

        return ps

    def visit_tensor(tens):
        visit_tensor_refcount[tens] += 1
        assert visit_tensor_refcount[tens] <= len(tens.consumers())
        if visit_tensor_refcount[tens] == len(tens.consumers()):
            for op in reversed(tens.ops):
                visit_op(op, tens)
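
    # When a pass consists only of post ops (e.g. a lone ReLU placed on the
    # NPU), there is no mac or elementwise op to act as primary_op. In that
    # case create_primary_op() below synthesises a 1x1 AvgPool that behaves
    # as a no-op pass-through and fuses the post op onto it.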
    def create_primary_op(op_list):
        if any(op.type in (npu_post_ops | npu_post_fuse_limited_ops) and op.run_on_npu for op in op_list):
            # Configure a 1x1 AvgPool and attach the op onto it
            op = op_list[0]
            inp = op.inputs[0]

            avgpool_op = create_avgpool_nop(op.name + "_avgpool")
            avgpool_op.add_input_tensor(inp)
            avgpool_out = inp.clone("_avgpooled")
            avgpool_out.consumer_list.append(op)
            avgpool_op.set_output_tensor(avgpool_out)
            avgpool_op.ifm_shapes = op.ifm_shapes.copy()
            avgpool_op.ofm_shapes = op.ofm_shapes.copy()
            avgpool_op.read_offsets = op.read_offsets.copy()
            avgpool_op.read_shapes = op.read_shapes.copy()

            op.inputs[0] = avgpool_out
            op_list.insert(0, avgpool_op)

            DebugDatabase.add_optimised(op, avgpool_op)
            return avgpool_op

        return None

    def can_pack(inp, curr_op):
        if len(inp.ops) == 1:
            next_op = inp.ops[0]
            for outp in next_op.outputs:
                consumers = outp.consumers()
                if len(consumers) > 1 or (len(consumers) == 1 and consumers[0] != curr_op):
                    return False

            # There cannot be any reshaping between next_op ofm and the corresponding curr_op ifm
            if len(curr_op.ifm_shapes) != 0 and len(next_op.ofm_shapes) != 0:
                if inp == curr_op.ifm and next_op.ofm_shapes[0] != curr_op.ifm_shapes[0]:
                    return False
                elif (
                    curr_op.ifm2 is not None and inp == curr_op.ifm2 and next_op.ofm_shapes[0] != curr_op.ifm_shapes[1]
                ):
                    return False
        else:
            return False

        return True

    def add_input_list(inp_to_add, inp_set, inp_refcnts, lut_list, ordered_inp_list):
        if inp_to_add in inp_set:
            if inp_refcnts[inp_to_add] == 0:
                if inp_to_add.purpose == TensorPurpose.LUT:
                    lut_list.append(inp_to_add)
                else:
                    ordered_inp_list.append(inp_to_add)

            inp_refcnts[inp_to_add] += 1

    for sg in nng.subgraphs:
        reverse_pass_list = []
        visit_op_refcount = collections.defaultdict(int)
        visit_tensor_refcount = collections.defaultdict(int)

        startup_list = []

        for tens in sg.output_tensors:
            visit_tensor(tens)

        if startup_list:
            startup_ps = build_pass(startup_list)
            startup_ps.outputs = [op.outputs[0] for op in startup_list]  # Need to fix up the outputs
            startup_ps.name = "startup_weight_initialisation"

        # Graphs with both CPU and NPU ops might not have an optimal order in
        # the pass list due to how the graph is traversed (depth first search).
        # This can result in more context switching between CPU and NPU.
        # Try to optimize this by moving/grouping CPU ops where that is possible.
        # Criteria for a CPU pass to be moved:
        #
        # 1) CPU passes that only depend on sg.input_tensors can be
        #    moved to the top of the list.
        #
        # 2) A CPU pass X may be grouped together with a CPU pass Y if no
        #    pass between X and Y consumes the output of pass X, and no
        #    pass between them is MemoryOnly.
        #
        # Criterion 2 tries to move as many CPU passes as possible towards
        # the bottom of the list.

        pass_list_top = []
        pass_list = []

        # Filter out early passes from the rest
        for ps in list(reversed(reverse_pass_list)):
            if startup_ps == ps:
                # The startup pass belongs at the top
                pass_list_top.insert(0, ps)
                continue

            if (
                ps.placement == PassPlacement.Cpu
                and ps.ops[0].ifm in sg.input_tensors
                and (ps.ops[0].ifm2 in sg.input_tensors or ps.ops[0].ifm2 is None)
            ):
                # This CPU pass only depends on sg.input_tensors
                pass_list_top.append(ps)
            else:
                # Add the pass to the list that will be sorted in the next step
                pass_list.append(ps)

        # Sort the rest of the list based on criterion 2.
        # Search from the bottom of the list; when a CPU pass is found,
        # search forward in the list and see if it is possible to join another CPU pass.
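        # Informal example: given pass_list [NPU0, CPU0, NPU1, CPU1] where NPU1
        # does not consume CPU0's output, CPU0 is moved down next to CPU1,
        # yielding [NPU0, NPU1, CPU0, CPU1] and one fewer CPU/NPU context switch.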
        last_idx = len(pass_list) - 1
        for cpu_ps in reversed(pass_list):
            if cpu_ps.placement != PassPlacement.Cpu:
                continue
            # CPU pass found, search forward and move the pass if possible
            idx = pass_list.index(cpu_ps)
            for next_ps in pass_list[idx + 1 :]:
                if next_ps.placement == PassPlacement.Cpu:
                    # It is possible to move the CPU pass
                    pass_list.remove(cpu_ps)
                    insert_index = pass_list.index(next_ps)
                    pass_list.insert(insert_index, cpu_ps)
                    break

                if (
                    cpu_ps.ops[0].ofm in [next_ps.ops[0].ifm, next_ps.ops[0].ifm2]
                    or next_ps.placement == PassPlacement.MemoryOnly
                ):
                    # Not possible to move
                    break

                if pass_list.index(next_ps) == last_idx:
                    # Last element, ok to move the CPU pass
                    pass_list.remove(cpu_ps)
                    pass_list.append(cpu_ps)
                    break

        pass_list_top.extend(pass_list)

        sg.passes = pass_list_top
        sg.build_pass_links()

    if verbose_packing:
        nng.print_passes()

    return nng
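
# Typical use (sketch; nng and arch are the Graph and ArchitectureFeatures
# objects built earlier by the compiler driver, not constructed in this module):
#
#     nng = pack_into_passes(nng, arch, verbose_packing=True)
#     for sg in nng.subgraphs:
#         for ps in sg.passes:
#             print(ps.name, ps.placement, len(ps.ops))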