From 72c6a2414205e033279f80b622cdf479c05a4f5b Mon Sep 17 00:00:00 2001
From: Raul Farkas
Date: Thu, 16 Mar 2023 16:38:05 +0000
Subject: MLBEDSW-6343: Remove op_index constraint

Remove op_index constraint and force linear format for all Conv2D that
have strides that can be optimised.

Change-Id: Idef3508ab074ea9abeacac030eaaa15a00ad1211
Signed-off-by: Raul Farkas
---
 ethosu/vela/extract_npu_subgraphs.py  |  4 ++--
 ethosu/vela/graph_optimiser_util.py   |  7 +++++--
 ethosu/vela/scheduler.py              | 10 +++++-----
 ethosu/vela/tensor.py                 | 12 +++++++++---
 ethosu/vela/tflite_graph_optimiser.py | 21 ++++++++++++---------
 5 files changed, 33 insertions(+), 21 deletions(-)

diff --git a/ethosu/vela/extract_npu_subgraphs.py b/ethosu/vela/extract_npu_subgraphs.py
index bf637b83..5e9a5b53 100644
--- a/ethosu/vela/extract_npu_subgraphs.py
+++ b/ethosu/vela/extract_npu_subgraphs.py
@@ -1,4 +1,4 @@
-# SPDX-FileCopyrightText: Copyright 2020-2022 Arm Limited and/or its affiliates
+# SPDX-FileCopyrightText: Copyright 2020-2023 Arm Limited and/or its affiliates
 #
 # SPDX-License-Identifier: Apache-2.0
 #
@@ -250,7 +250,7 @@ def extract_subgraph(nng, orig_sg, arch):
     for tens in curr_sg.output_tensors:
         # ofm can depend on multiple ops. These ops can be divided into different NPU
         # nodes due to CPU nodes. If that is the case the ofm must be NHWC.
-        tens.needs_linear_format = True
+        tens.force_linear_format = True
 
     return new_subgraphs
 
diff --git a/ethosu/vela/graph_optimiser_util.py b/ethosu/vela/graph_optimiser_util.py
index 8b24eaf9..e1341d82 100644
--- a/ethosu/vela/graph_optimiser_util.py
+++ b/ethosu/vela/graph_optimiser_util.py
@@ -30,6 +30,7 @@ from .operation import Op
 from .shape4d import Shape4D
 from .tensor import create_const_tensor
 from .tensor import QuantizationParameters
+from .tensor import Tensor
 
 memory_only_ops = (
     Op.Reshape,
@@ -90,7 +91,9 @@ def _avoid_nhcwb16_for_memory_only(tens):
 
 
 # Check if non linear format can be used
-def check_format_restrictions(tens, arch):
+def check_format_restrictions(tens: Tensor, arch):
+    if tens.force_linear_format:
+        return
     if len(tens.ops) < 1:
         return
     if tens.ops[0].type in (Op.Placeholder, Op.SubgraphInput, Op.Const) or any(
@@ -161,7 +164,7 @@ def check_format_restrictions(tens, arch):
     else:
         return
 
-    tens.needs_linear_format = False
+    tens.force_linear_format = False
 
 
 def calc_explicit_padding(input_size, stride, filter_size, pad_before, pad_after) -> Tuple[int, int]:
diff --git a/ethosu/vela/scheduler.py b/ethosu/vela/scheduler.py
index 83e19bc6..6fcb6c1d 100644
--- a/ethosu/vela/scheduler.py
+++ b/ethosu/vela/scheduler.py
@@ -467,11 +467,11 @@ class Scheduler:
             if output in self.sg.output_tensors or output.purpose != TensorPurpose.FeatureMap:
                 continue
 
-            if output.needs_linear_format:
+            if output.use_linear_format:
                 continue
 
             if self.avoid_nhcwb16_for_ofm(output, ps, arch):
-                output.needs_linear_format = True
+                output.force_linear_format = True
                 continue
 
             output.set_format(TensorFormat.NHCWB16, arch)
@@ -504,11 +504,11 @@
             if ps.ofm_tensor in self.sg.output_tensors:
                 # This Op produces a subgraph output
                 op.requires_full_ofm = True
-            if ps.ifm_tensor.needs_linear_format:
+            if ps.ifm_tensor.use_linear_format:
                 op.requires_full_ifm = True
-            if ps.ifm2_tensor and ps.ifm2_tensor.needs_linear_format:
+            if ps.ifm2_tensor and ps.ifm2_tensor.use_linear_format:
                 op.requires_full_ifm2 = True
-            if ps.ofm_tensor.needs_linear_format or ps.primary_op.memory_function == Op.ConcatSliceWrite:
+            if ps.ofm_tensor.use_linear_format or ps.primary_op.memory_function == Op.ConcatSliceWrite:
                 op.requires_full_ofm = True
             if len(ps.primary_op.outputs) > 1 or len(ps.primary_op.outputs[0].consumer_list) > 1:
                 # Op has multiple outputs or consumers - requires full OFM
diff --git a/ethosu/vela/tensor.py b/ethosu/vela/tensor.py
index 86306cad..51c7592e 100644
--- a/ethosu/vela/tensor.py
+++ b/ethosu/vela/tensor.py
@@ -397,7 +397,7 @@ class Tensor:
         "block_traversal",
         "equivalence_id",
         "src_tensor",
-        "needs_linear_format",
+        "force_linear_format",
         "ifm_write_protected",
     )
     AllocationQuantum = 16
@@ -444,12 +444,18 @@
         self.quantization: Optional[QuantizationParameters] = None
         self.block_traversal: TensorBlockTraversal = TensorBlockTraversal.Default
 
-        self.needs_linear_format = True
+        # Keep track of whether the linear format should be enforced
+        self.force_linear_format: Optional[bool] = None
         self.ifm_write_protected = False
 
         # Reference to parent-tensor if this tensor is a clone
         self.src_tensor: Optional[Tensor] = None
 
+    @property
+    def use_linear_format(self) -> bool:
+        """Return whether the tensor should use linear format or not."""
+        return self.force_linear_format in (True, None)
+
     @property
     def original_shape(self):
         return self._original_shape
@@ -545,7 +551,7 @@
         if shape_len > 4:
             return
 
-        assert not (self.needs_linear_format and fmt == TensorFormat.NHCWB16)
+        assert not (self.use_linear_format and fmt == TensorFormat.NHCWB16)
         self.storage_rounding_quantum = arch.storage_rounding_quantums[self.format]
         self.storage_rounding_quantum = tuple(self.storage_rounding_quantum[-shape_len:])
         self.brick_size = arch.brick_sizes[self.format]
diff --git a/ethosu/vela/tflite_graph_optimiser.py b/ethosu/vela/tflite_graph_optimiser.py
index 44f5d6ae..518b6db0 100644
--- a/ethosu/vela/tflite_graph_optimiser.py
+++ b/ethosu/vela/tflite_graph_optimiser.py
@@ -942,19 +942,21 @@ def reorder_depthwise_weights(op, arch, nng):
     return op
 
 
-def fixup_strided_conv(op, arch, nng):
+def fixup_strided_conv(op: Operation, arch, nng):
+    """Optimize or fixup strided Conv2DBias
+    Optimization:
+        Reduce, when possible, the Conv2DBias stride from 2 to 1 by re-shaping
+        both IFM and filter.
+
+    Fixup:
+        Introduce software support for Conv2DBias with stride_width = 4 by
+        reducing it to 1 when possible by re-shaping both IFM and filter.
+    """
     if op.type != Op.Conv2DBias:
         return op
     stride_x, stride_y = op.get_kernel_stride()
     weight_tensor = op.weights
     ifm_shape = op.ifm_shapes[0]
-
-    # Do not optimize if op is not the first in the network and stride is
-    # supported by the hardware
-    if op.op_index != 0 and stride_x < 4:
-        return op
-    op.ifm.needs_linear_format = True
-
     if (
         (stride_x == 2 or stride_x == 4)
         and ifm_shape.depth <= 4
@@ -1004,6 +1006,7 @@
 
         stride_x = 1
         op.attrs.update({"stride_w": stride_x, "stride_h": stride_y, "strides": (1, stride_y, stride_x, 1)})
+        op.ifm.force_linear_format = True
 
     return op
 
@@ -2125,7 +2128,6 @@
         convert_prelu,
         convert_mul_max_to_abs_or_lrelu,
         convert_lrelu,
-        fixup_strided_conv,
        convert_hardswish_to_lut,
         rewrite_fully_connected_input,
         convert_batched_fc_shape,
@@ -2139,6 +2141,7 @@
         convert_tanh_sigmoid_to_lut,
         replace_pad_by_hw_pad,
         fixup_dilation_gt2,
+        fixup_strided_conv,
     ]
 
     for idx, sg in enumerate(nng.subgraphs):
-- 
cgit v1.2.1
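
Background to the rewrite (illustration, not part of the commit): fixup_strided_conv rests on the
equivalence that a stride-2 convolution over an NHWC IFM equals a stride-1 convolution after packing
pairs of horizontally adjacent columns into channels and repacking the filter the same way. That is
presumably also why the pass now forces linear (NHWC) format on the IFM: the repacking is only a
cheap re-view of a linearly stored buffer. Below is a minimal NumPy sketch of the equivalence; it is
not Vela code, and it assumes an even IFM width, an even kernel width and VALID padding, whereas the
real pass also zero-pads odd kernel widths and handles stride 4.

    # Minimal NumPy sketch of the stride-2 -> stride-1 rewrite (assumptions as stated above).
    import numpy as np


    def conv2d(ifm, weights, stride_x):
        """Naive HWC convolution, VALID padding, stride 1 in y."""
        h, w, c = ifm.shape
        kh, kw, _, oc = weights.shape
        out_h = h - kh + 1
        out_w = (w - kw) // stride_x + 1
        ofm = np.zeros((out_h, out_w, oc))
        for y in range(out_h):
            for x in range(out_w):
                patch = ifm[y : y + kh, x * stride_x : x * stride_x + kw, :]
                ofm[y, x] = np.tensordot(patch, weights, axes=([0, 1, 2], [0, 1, 2]))
        return ofm


    rng = np.random.default_rng(0)
    ifm = rng.standard_normal((8, 12, 3))        # H x W x C, even W
    weights = rng.standard_normal((2, 4, 3, 5))  # kh x kw x C x OC, even kw

    reference = conv2d(ifm, weights, stride_x=2)

    # Pack width pairs into channels: (H, W, C) -> (H, W/2, 2C). On a linearly
    # (NHWC) stored buffer this reshape is just a re-view, no data movement.
    h, w, c = ifm.shape
    kh, kw, _, oc = weights.shape
    ifm_packed = ifm.reshape(h, w // 2, 2 * c)
    weights_packed = weights.reshape(kh, kw // 2, 2 * c, oc)

    # Stride 1 over the packed tensors reproduces the stride-2 OFM.
    optimised = conv2d(ifm_packed, weights_packed, stride_x=1)
    assert np.allclose(reference, optimised)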