diff options
author | Patrik Gustavsson <patrik.gustavsson@arm.com> | 2021-02-09 15:38:46 +0100 |
---|---|---|
committer | Patrik Gustavsson <patrik.gustavsson@arm.com> | 2021-02-11 14:38:56 +0100 |
commit | e3b1b91c450129308a3a1d466a2f2876a5a872b1 (patch) | |
tree | 77c6fe5cc8836dc3aac505efdbf78cee433bd398 /ethosu/vela | |
parent | 138d47f5a3e87d294b3714ae799ccad8ac9111bd (diff) | |
download | ethos-u-vela-e3b1b91c450129308a3a1d466a2f2876a5a872b1.tar.gz |
MLBEDSW-3774 Remove SplitSliceRead
Removed SplitSliceRead from subgraph during
graph optimisation.
Signed-off-by: Patrik Gustavsson <patrik.gustavsson@arm.com>
Change-Id: I9315d4c2a6767828dd2b4e66823d73b10ebee99c
Diffstat (limited to 'ethosu/vela')
| mode | file | lines changed |
|---|---|---|
| -rw-r--r-- | ethosu/vela/graph_optimiser.py | 51 |
| -rw-r--r-- | ethosu/vela/high_level_command_stream.py | 4 |
| -rw-r--r-- | ethosu/vela/high_level_command_stream_generator.py | 32 |
| -rw-r--r-- | ethosu/vela/operation.py | 3 |
| -rw-r--r-- | ethosu/vela/pass_packing.py | 16 |
5 files changed, 57 insertions, 49 deletions
diff --git a/ethosu/vela/graph_optimiser.py b/ethosu/vela/graph_optimiser.py index eb93106e..50368b86 100644 --- a/ethosu/vela/graph_optimiser.py +++ b/ethosu/vela/graph_optimiser.py @@ -164,10 +164,8 @@ def rewrite_split_ops(tens, arch, nng): # If start offset is not a multiple of 16 in the C-dimension, NHCWB16 need to be avoided in the input if (offset_start[-1] % 16) != 0: inp.avoid_NHCWB16 = True - else: - offset_start = full_shape(4, offset_start, 0) - new_op.attrs["split_start"] = offset_start + new_op.read_offsets[0] = Shape4D.from_list(offset_start, 0) new_op.run_on_npu = True new_op.set_output_tensor(tens) new_op.ifm_shapes.append(Shape4D(inp.shape)) @@ -177,6 +175,45 @@ def rewrite_split_ops(tens, arch, nng): return tens +def remove_SplitSliceRead(op, arch): + + if op.type == Op.SplitSliceRead: + # Check if it is possible to put the SplitSliceRead on the tensor consumer, or if an avgpool need to be inserted + if ( + len(op.ofm.consumer_list) == 1 + and op.ofm.consumer_list[0] is not None + and op.ofm.consumer_list[0].run_on_npu + and op.ofm.consumer_list[0].type != Op.Reshape + and op.ofm_shapes[0] == Shape4D.from_list(op.ofm.shape) + ): + # SplitSliceRead can be performed by tensor consumer + cons_op = op.ofm.consumer_list[0] + if cons_op.ifm == op.ofm: + cons_op.read_offsets[0] = op.read_offsets[0] + cons_op.set_input_tensor(op.ifm, cons_op.type.info.indices.ifms[0]) + cons_op.ifm_shapes[0] = op.ifm_shapes[0] + elif cons_op.type.is_binary_elementwise_op() and cons_op.ifm2 == op.ofm: + cons_op.read_offsets[1] = op.read_offsets[0] + cons_op.set_input_tensor(op.ifm, cons_op.type.info.indices.ifms[1]) + cons_op.ifm_shapes[1] = op.ifm_shapes[0] + + op.ofm.consumer_list.remove(cons_op) + op.ofm.ops = [] + op.ifm.consumer_list.remove(op) + else: + avgpool_op = create_avgpool_nop(op.name + "_avgpool") + avgpool_op.add_input_tensor(op.ifm) + avgpool_op.outputs = [op.ofm] + op.ofm.ops.remove(op) + op.ofm.ops.append(avgpool_op) + 
avgpool_op.ifm_shapes.append(op.ifm_shapes[0]) + avgpool_op.ofm_shapes.append(op.ofm_shapes[0]) + avgpool_op.read_offsets[0] = op.read_offsets[0] + + op.ifm.consumer_list.remove(op) + DebugDatabase.add_optimised(op, avgpool_op) + + def insert_copy_op_after_tens(tens): tens_cons_list_copy = tens.consumer_list.copy() @@ -202,7 +239,7 @@ def fix_sg_input_output(op, arch, nng): if not op.run_on_npu or op.type != Op.Reshape: return op - # For the memory operators we want to remove, tensors are removed. + # For the Reshape operators we want to remove, tensors are removed. # But in order to to do this, they cannot be outputs of the sg, # this need to be fixed prior to the removal. # Solution is to add a avgpool NOP, to maintain the original tensor. @@ -1295,6 +1332,12 @@ def optimise_graph_a(nng, arch, verbose_graph=False): [fuse_activation_function_with_prev, optimise_pad, add_padding_fields], ) + # Removal of SplitSliceRead, need to be done after optimisation has been performed, + # since ifm/ofm_shapes are of importance to this function + for sg in nng.subgraphs: + rewrite_graph.visit_graph_post_order(sg.output_tensors, arch, [], [remove_SplitSliceRead]) + sg.refresh_after_modification() + # Post-optimisation operator debug tracing, and checking that no undesired reshapes are left in the graph for sg in nng.subgraphs: rewrite_graph.visit_graph_post_order(sg.output_tensors, arch, [], [check_reshapes, _record_optimised]) diff --git a/ethosu/vela/high_level_command_stream.py b/ethosu/vela/high_level_command_stream.py index c25c023e..0ce8fac2 100644 --- a/ethosu/vela/high_level_command_stream.py +++ b/ethosu/vela/high_level_command_stream.py @@ -1,4 +1,4 @@ -# Copyright (C) 2020 Arm Limited or its affiliates. All rights reserved. +# Copyright (C) 2020-2021 Arm Limited or its affiliates. All rights reserved. 
# # SPDX-License-Identifier: Apache-2.0 # @@ -41,7 +41,7 @@ class Box: npu_block_type: NpuBlockType, concat_axis: int = 0, concat_offset: int = 0, - split_offset: int = None, + split_offset: Shape4D = None, k_height: int = 1, upscaling_factor: int = 1, ): diff --git a/ethosu/vela/high_level_command_stream_generator.py b/ethosu/vela/high_level_command_stream_generator.py index 97b42aeb..c2027e0f 100644 --- a/ethosu/vela/high_level_command_stream_generator.py +++ b/ethosu/vela/high_level_command_stream_generator.py @@ -1,4 +1,4 @@ -# Copyright (C) 2020 Arm Limited or its affiliates. All rights reserved. +# Copyright (C) 2020-2021 Arm Limited or its affiliates. All rights reserved. # # SPDX-License-Identifier: Apache-2.0 # @@ -38,46 +38,20 @@ def dma_if_necessary(ps, box, tensor): yield DMA(ps, in_tensor, tensor, box) -def match_tensor(source, derived): - if source == derived: - return True - ops = derived.ops - return ops != [] and len(ops) == 1 and ops[0].type == Op.SplitSliceRead and source == ops[0].inputs[0] - - def generate_high_level_command_stream_for_pass(strat, passes, block_configs, idx): is_first = idx == 0 is_last = idx == len(passes) - 1 ps = passes[idx] block_config = block_configs[idx] npu_block_type = ps.npu_block_type - split_offsets = [None, None] # offset for [ifm, ifm2] + split_offsets = list(ps.primary_op.read_offsets) # offset for [ifm, ifm2] if ps.ifm_tensor is not None and ps.ifm2_tensor is not None and npu_block_type == NpuBlockType.ElementWise: # Ensure correct ifm and ifm2 order - if match_tensor(ps.inputs[0], ps.primary_op.inputs[1]) and match_tensor(ps.inputs[1], ps.primary_op.inputs[0]): + if ps.inputs[0] == ps.primary_op.inputs[1] and ps.inputs[1] == ps.primary_op.inputs[0]: ps.ifm_tensor, ps.ifm2_tensor = ps.ifm2_tensor, ps.ifm_tensor ps.ifm_shapes[0], ps.ifm_shapes[1] = ps.ifm_shapes[1], ps.ifm_shapes[0] - for op in ps.ops: - if op.type == Op.SplitSliceRead: - ps.primary_op.memory_function = op.type - assert len(op.inputs) == 1 - if 
match_tensor(ps.ifm_tensor, op.inputs[0]): - split_offsets[0] = op.attrs["split_start"] - elif match_tensor(ps.ifm2_tensor, op.inputs[0]): - split_offsets[1] = op.attrs["split_start"] - else: - assert False - else: - ifm_idx = 0 - for op in ps.ops: - if op.type == Op.SplitSliceRead: - assert ifm_idx < 2 - split_offsets[ifm_idx] = op.attrs["split_start"] - ps.primary_op.memory_function = op.type - ifm_idx += 1 - ifm_tensor = ps.ifm_tensor ifm_shape = None if ifm_tensor.shape != []: diff --git a/ethosu/vela/operation.py b/ethosu/vela/operation.py index b297bed0..16431be7 100644 --- a/ethosu/vela/operation.py +++ b/ethosu/vela/operation.py @@ -418,6 +418,7 @@ class Operation: "ifm_shapes", "ofm_shapes", "rescale", + "read_offsets", ) def __init__(self, op_type: Op, name: str): @@ -444,6 +445,7 @@ class Operation: # If not none: contains rescale to be used as output scaling # (which overrides the ofm tensor's scale) self.rescale = None + self.read_offsets: List[Shape4D] = [None, None] # offset for [ifm, ifm2] def clone(self, suffix="_clone"): res = Operation(self.type, self.name + suffix) @@ -458,6 +460,7 @@ class Operation: res.forced_output_quantization = self.forced_output_quantization res.scheduled_pass = self.scheduled_pass res.op_index = None # not relevant as not part of input network + res.read_offsets = list(self.read_offsets) return res diff --git a/ethosu/vela/pass_packing.py b/ethosu/vela/pass_packing.py index b52b159e..281c0932 100644 --- a/ethosu/vela/pass_packing.py +++ b/ethosu/vela/pass_packing.py @@ -1,4 +1,4 @@ -# Copyright (C) 2020 Arm Limited or its affiliates. All rights reserved. +# Copyright (C) 2020-2021 Arm Limited or its affiliates. All rights reserved. 
# # SPDX-License-Identifier: Apache-2.0 # @@ -42,8 +42,6 @@ class PassFlags(enum.Flag): PostFusingLimited = 8192 -npu_pre_ops = set((Op.SplitSliceRead,)) - mac_main_ops = set( ( # convolutions @@ -148,16 +146,6 @@ test_sequence = [ ), ( # ops_set - npu_pre_ops, - # incompatible_pack_flags - PassFlags.Cpu | PassFlags.MemoryOnly, - # flags_to_set - PassFlags.Npu | PassFlags.Mac | PassFlags.Pre | PassFlags.ElementWise, - # flags_to_clear - PassFlags.Empty, - ), - ( - # ops_set npu_dma_ops, # incompatible_pack_flags PassFlags.Cpu | PassFlags.MemoryOnly, @@ -437,7 +425,7 @@ def pack_into_passes(nng, arch, verbose_packing=False): visit_op(op, tens) def create_primary_op(op_list): - if any(op.type in (npu_pre_ops | npu_post_ops | npu_post_fuse_limited_ops) and op.run_on_npu for op in op_list): + if any(op.type in (npu_post_ops | npu_post_fuse_limited_ops) and op.run_on_npu for op in op_list): # Configure a 1x1 AvgPool and attach the op onto it op = op_list[0] inp = op.inputs[0] |