From e3b1b91c450129308a3a1d466a2f2876a5a872b1 Mon Sep 17 00:00:00 2001
From: Patrik Gustavsson
Date: Tue, 9 Feb 2021 15:38:46 +0100
Subject: MLBEDSW-3774 Remove SplitSliceRead

Removed SplitSliceRead from subgraph during graph optimisation.

Signed-off-by: Patrik Gustavsson
Change-Id: I9315d4c2a6767828dd2b4e66823d73b10ebee99c
---
 ethosu/vela/graph_optimiser.py                     | 51 ++++++++++++++++++++--
 ethosu/vela/high_level_command_stream.py           |  4 +-
 ethosu/vela/high_level_command_stream_generator.py | 32 ++------------
 ethosu/vela/operation.py                           |  3 ++
 ethosu/vela/pass_packing.py                        | 16 +------
 5 files changed, 57 insertions(+), 49 deletions(-)

diff --git a/ethosu/vela/graph_optimiser.py b/ethosu/vela/graph_optimiser.py
index eb93106e..50368b86 100644
--- a/ethosu/vela/graph_optimiser.py
+++ b/ethosu/vela/graph_optimiser.py
@@ -164,10 +164,8 @@ def rewrite_split_ops(tens, arch, nng):
             # If start offset is not a multiple of 16 in the C-dimension, NHCWB16 need to be avoided in the input
             if (offset_start[-1] % 16) != 0:
                 inp.avoid_NHCWB16 = True
-        else:
-            offset_start = full_shape(4, offset_start, 0)

-        new_op.attrs["split_start"] = offset_start
+        new_op.read_offsets[0] = Shape4D.from_list(offset_start, 0)
         new_op.run_on_npu = True
         new_op.set_output_tensor(tens)
         new_op.ifm_shapes.append(Shape4D(inp.shape))
@@ -177,6 +175,45 @@ def rewrite_split_ops(tens, arch, nng):
     return tens


+def remove_SplitSliceRead(op, arch):
+
+    if op.type == Op.SplitSliceRead:
+        # Check if it is possible to put the SplitSliceRead on the tensor consumer, or if an avgpool need to be inserted
+        if (
+            len(op.ofm.consumer_list) == 1
+            and op.ofm.consumer_list[0] is not None
+            and op.ofm.consumer_list[0].run_on_npu
+            and op.ofm.consumer_list[0].type != Op.Reshape
+            and op.ofm_shapes[0] == Shape4D.from_list(op.ofm.shape)
+        ):
+            # SplitSliceRead can be performed by tensor consumer
+            cons_op = op.ofm.consumer_list[0]
+            if cons_op.ifm == op.ofm:
+                cons_op.read_offsets[0] = op.read_offsets[0]
+                cons_op.set_input_tensor(op.ifm, cons_op.type.info.indices.ifms[0])
+                cons_op.ifm_shapes[0] = op.ifm_shapes[0]
+            elif cons_op.type.is_binary_elementwise_op() and cons_op.ifm2 == op.ofm:
+                cons_op.read_offsets[1] = op.read_offsets[0]
+                cons_op.set_input_tensor(op.ifm, cons_op.type.info.indices.ifms[1])
+                cons_op.ifm_shapes[1] = op.ifm_shapes[0]
+
+            op.ofm.consumer_list.remove(cons_op)
+            op.ofm.ops = []
+            op.ifm.consumer_list.remove(op)
+        else:
+            avgpool_op = create_avgpool_nop(op.name + "_avgpool")
+            avgpool_op.add_input_tensor(op.ifm)
+            avgpool_op.outputs = [op.ofm]
+            op.ofm.ops.remove(op)
+            op.ofm.ops.append(avgpool_op)
+            avgpool_op.ifm_shapes.append(op.ifm_shapes[0])
+            avgpool_op.ofm_shapes.append(op.ofm_shapes[0])
+            avgpool_op.read_offsets[0] = op.read_offsets[0]
+
+            op.ifm.consumer_list.remove(op)
+            DebugDatabase.add_optimised(op, avgpool_op)
+
+
 def insert_copy_op_after_tens(tens):

     tens_cons_list_copy = tens.consumer_list.copy()
@@ -202,7 +239,7 @@ def fix_sg_input_output(op, arch, nng):
     if not op.run_on_npu or op.type != Op.Reshape:
         return op

-    # For the memory operators we want to remove, tensors are removed.
+    # For the Reshape operators we want to remove, tensors are removed.
     # But in order to to do this, they cannot be outputs of the sg,
     # this need to be fixed prior to the removal.
     # Solution is to add a avgpool NOP, to maintain the original tensor.
@@ -1295,6 +1332,12 @@ def optimise_graph_a(nng, arch, verbose_graph=False):
            [fuse_activation_function_with_prev, optimise_pad, add_padding_fields],
        )

+    # Removal of SplitSliceRead, need to be done after optimisation has been performed,
+    # since ifm/ofm_shapes are of importance to this function
+    for sg in nng.subgraphs:
+        rewrite_graph.visit_graph_post_order(sg.output_tensors, arch, [], [remove_SplitSliceRead])
+        sg.refresh_after_modification()
+
     # Post-optimisation operator debug tracing, and checking that no undesired reshapes are left in the graph
     for sg in nng.subgraphs:
         rewrite_graph.visit_graph_post_order(sg.output_tensors, arch, [], [check_reshapes, _record_optimised])
diff --git a/ethosu/vela/high_level_command_stream.py b/ethosu/vela/high_level_command_stream.py
index c25c023e..0ce8fac2 100644
--- a/ethosu/vela/high_level_command_stream.py
+++ b/ethosu/vela/high_level_command_stream.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2020 Arm Limited or its affiliates. All rights reserved.
+# Copyright (C) 2020-2021 Arm Limited or its affiliates. All rights reserved.
 #
 # SPDX-License-Identifier: Apache-2.0
 #
@@ -41,7 +41,7 @@ class Box:
         npu_block_type: NpuBlockType,
         concat_axis: int = 0,
         concat_offset: int = 0,
-        split_offset: int = None,
+        split_offset: Shape4D = None,
         k_height: int = 1,
         upscaling_factor: int = 1,
     ):
diff --git a/ethosu/vela/high_level_command_stream_generator.py b/ethosu/vela/high_level_command_stream_generator.py
index 97b42aeb..c2027e0f 100644
--- a/ethosu/vela/high_level_command_stream_generator.py
+++ b/ethosu/vela/high_level_command_stream_generator.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2020 Arm Limited or its affiliates. All rights reserved.
+# Copyright (C) 2020-2021 Arm Limited or its affiliates. All rights reserved.
 #
 # SPDX-License-Identifier: Apache-2.0
 #
@@ -38,46 +38,20 @@ def dma_if_necessary(ps, box, tensor):
         yield DMA(ps, in_tensor, tensor, box)


-def match_tensor(source, derived):
-    if source == derived:
-        return True
-    ops = derived.ops
-    return ops != [] and len(ops) == 1 and ops[0].type == Op.SplitSliceRead and source == ops[0].inputs[0]
-
-
 def generate_high_level_command_stream_for_pass(strat, passes, block_configs, idx):
     is_first = idx == 0
     is_last = idx == len(passes) - 1
     ps = passes[idx]
     block_config = block_configs[idx]
     npu_block_type = ps.npu_block_type
-    split_offsets = [None, None]  # offset for [ifm, ifm2]
+    split_offsets = list(ps.primary_op.read_offsets)  # offset for [ifm, ifm2]

     if ps.ifm_tensor is not None and ps.ifm2_tensor is not None and npu_block_type == NpuBlockType.ElementWise:
         # Ensure correct ifm and ifm2 order
-        if match_tensor(ps.inputs[0], ps.primary_op.inputs[1]) and match_tensor(ps.inputs[1], ps.primary_op.inputs[0]):
+        if ps.inputs[0] == ps.primary_op.inputs[1] and ps.inputs[1] == ps.primary_op.inputs[0]:
             ps.ifm_tensor, ps.ifm2_tensor = ps.ifm2_tensor, ps.ifm_tensor
             ps.ifm_shapes[0], ps.ifm_shapes[1] = ps.ifm_shapes[1], ps.ifm_shapes[0]

-        for op in ps.ops:
-            if op.type == Op.SplitSliceRead:
-                ps.primary_op.memory_function = op.type
-                assert len(op.inputs) == 1
-                if match_tensor(ps.ifm_tensor, op.inputs[0]):
-                    split_offsets[0] = op.attrs["split_start"]
-                elif match_tensor(ps.ifm2_tensor, op.inputs[0]):
-                    split_offsets[1] = op.attrs["split_start"]
-                else:
-                    assert False
-    else:
-        ifm_idx = 0
-        for op in ps.ops:
-            if op.type == Op.SplitSliceRead:
-                assert ifm_idx < 2
-                split_offsets[ifm_idx] = op.attrs["split_start"]
-                ps.primary_op.memory_function = op.type
-                ifm_idx += 1
-
     ifm_tensor = ps.ifm_tensor
     ifm_shape = None
     if ifm_tensor.shape != []:
diff --git a/ethosu/vela/operation.py b/ethosu/vela/operation.py
index b297bed0..16431be7 100644
--- a/ethosu/vela/operation.py
+++ b/ethosu/vela/operation.py
@@ -418,6 +418,7 @@ class Operation:
         "ifm_shapes",
         "ofm_shapes",
         "rescale",
+        "read_offsets",
     )

     def __init__(self, op_type: Op, name: str):
@@ -444,6 +445,7 @@ class Operation:
         # If not none: contains rescale to be used as output scaling
         # (which overrides the ofm tensor's scale)
         self.rescale = None
+        self.read_offsets: List[Shape4D] = [None, None]  # offset for [ifm, ifm2]

     def clone(self, suffix="_clone"):
         res = Operation(self.type, self.name + suffix)
@@ -458,6 +460,7 @@ class Operation:
         res.forced_output_quantization = self.forced_output_quantization
         res.scheduled_pass = self.scheduled_pass
         res.op_index = None  # not relevant as not part of input network
+        res.read_offsets = list(self.read_offsets)

         return res

diff --git a/ethosu/vela/pass_packing.py b/ethosu/vela/pass_packing.py
index b52b159e..281c0932 100644
--- a/ethosu/vela/pass_packing.py
+++ b/ethosu/vela/pass_packing.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2020 Arm Limited or its affiliates. All rights reserved.
+# Copyright (C) 2020-2021 Arm Limited or its affiliates. All rights reserved.
 #
 # SPDX-License-Identifier: Apache-2.0
 #
@@ -42,8 +42,6 @@ class PassFlags(enum.Flag):
     PostFusingLimited = 8192


-npu_pre_ops = set((Op.SplitSliceRead,))
-
 mac_main_ops = set(
     (
         # convolutions
@@ -146,16 +144,6 @@ test_sequence = [
         # flags_to_clear
         PassFlags.Empty,
     ),
-    (
-        # ops_set
-        npu_pre_ops,
-        # incompatible_pack_flags
-        PassFlags.Cpu | PassFlags.MemoryOnly,
-        # flags_to_set
-        PassFlags.Npu | PassFlags.Mac | PassFlags.Pre | PassFlags.ElementWise,
-        # flags_to_clear
-        PassFlags.Empty,
-    ),
     (
         # ops_set
         npu_dma_ops,
@@ -437,7 +425,7 @@ def pack_into_passes(nng, arch, verbose_packing=False):
         visit_op(op, tens)

     def create_primary_op(op_list):
-        if any(op.type in (npu_pre_ops | npu_post_ops | npu_post_fuse_limited_ops) and op.run_on_npu for op in op_list):
+        if any(op.type in (npu_post_ops | npu_post_fuse_limited_ops) and op.run_on_npu for op in op_list):
             # Configure a 1x1 AvgPool and attach the op onto it
             op = op_list[0]
             inp = op.inputs[0]
--
cgit v1.2.1
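
For readers outside the Vela codebase, the standalone sketch below illustrates the idea behind this patch. It uses simplified, hypothetical stand-in classes (SimpleTensor and SimpleOp are not the real ethosu.vela Tensor/Operation API) to show how a SplitSliceRead's start offset can be recorded in a per-input read_offsets list and folded into its single consumer, with a 1x1 AvgPool NOP inserted as a fallback when the consumer cannot absorb the read. It is a minimal sketch under those assumptions, not the actual pass.

# Simplified stand-ins for illustration only; not the real ethosu.vela API.
from dataclasses import dataclass, field
from typing import List, Optional, Tuple


@dataclass
class SimpleTensor:
    name: str
    shape: List[int]
    ops: list = field(default_factory=list)             # producing ops
    consumer_list: list = field(default_factory=list)   # consuming ops


@dataclass
class SimpleOp:
    op_type: str
    name: str
    inputs: List[SimpleTensor] = field(default_factory=list)
    outputs: List[SimpleTensor] = field(default_factory=list)
    # Per-input read offset, indexed [ifm, ifm2], mirroring Operation.read_offsets in the patch.
    read_offsets: List[Optional[Tuple[int, int, int, int]]] = field(default_factory=lambda: [None, None])
    run_on_npu: bool = True

    @property
    def ifm(self):
        return self.inputs[0] if self.inputs else None

    @property
    def ofm(self):
        return self.outputs[0] if self.outputs else None


def remove_split_slice_read(op: SimpleOp) -> None:
    """Fold a SplitSliceRead into its single consumer, or replace it with an
    AvgPool NOP that carries the read offset (simplified version of the pass)."""
    if op.op_type != "SplitSliceRead":
        return

    consumers = op.ofm.consumer_list
    if len(consumers) == 1 and consumers[0].run_on_npu:
        # The consumer reads the sliced region of the original input directly.
        cons = consumers[0]
        cons.read_offsets[0] = op.read_offsets[0]
        cons.inputs[0] = op.ifm
        op.ifm.consumer_list.remove(op)
        op.ifm.consumer_list.append(cons)
        op.ofm.consumer_list.remove(cons)
        op.ofm.ops = []
    else:
        # Fallback: insert a 1x1 AvgPool NOP that performs the offset read and
        # keeps producing the tensor the remaining consumers expect.
        nop = SimpleOp("AvgPool", op.name + "_avgpool", inputs=[op.ifm], outputs=[op.ofm])
        nop.read_offsets[0] = op.read_offsets[0]
        op.ofm.ops = [nop]
        op.ifm.consumer_list.remove(op)
        op.ifm.consumer_list.append(nop)


# Tiny demo: a slice read at offset (0, 2, 0, 0) folded into a Conv2D consumer.
ifm = SimpleTensor("ifm", [1, 8, 8, 16])
sliced = SimpleTensor("sliced", [1, 4, 8, 16])
ofm = SimpleTensor("ofm", [1, 4, 8, 16])

read = SimpleOp("SplitSliceRead", "slice_read", inputs=[ifm], outputs=[sliced])
read.read_offsets[0] = (0, 2, 0, 0)
conv = SimpleOp("Conv2D", "conv", inputs=[sliced], outputs=[ofm])

ifm.consumer_list.append(read)
sliced.ops.append(read)
sliced.consumer_list.append(conv)

remove_split_slice_read(read)
print(conv.inputs[0].name, conv.read_offsets[0])  # -> ifm (0, 2, 0, 0)

In the actual pass in the patch, the consumer must additionally be an NPU operator other than Reshape with a matching ofm shape, binary elementwise consumers can take the offset on ifm2, and DebugDatabase.add_optimised records the substitution.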