From 72c6a2414205e033279f80b622cdf479c05a4f5b Mon Sep 17 00:00:00 2001
From: Raul Farkas
Date: Thu, 16 Mar 2023 16:38:05 +0000
Subject: MLBEDSW-6343: Remove op_index constraint

Remove op_index constraint and force linear format for all Conv2D that
have strides that can be optimised.

Change-Id: Idef3508ab074ea9abeacac030eaaa15a00ad1211
Signed-off-by: Raul Farkas
---
 ethosu/vela/extract_npu_subgraphs.py  |  4 ++--
 ethosu/vela/graph_optimiser_util.py   |  7 +++++--
 ethosu/vela/scheduler.py              | 10 +++++-----
 ethosu/vela/tensor.py                 | 12 +++++++++---
 ethosu/vela/tflite_graph_optimiser.py | 21 ++++++++++++---------
 5 files changed, 33 insertions(+), 21 deletions(-)

diff --git a/ethosu/vela/extract_npu_subgraphs.py b/ethosu/vela/extract_npu_subgraphs.py
index bf637b83..5e9a5b53 100644
--- a/ethosu/vela/extract_npu_subgraphs.py
+++ b/ethosu/vela/extract_npu_subgraphs.py
@@ -1,4 +1,4 @@
-# SPDX-FileCopyrightText: Copyright 2020-2022 Arm Limited and/or its affiliates
+# SPDX-FileCopyrightText: Copyright 2020-2023 Arm Limited and/or its affiliates
 #
 # SPDX-License-Identifier: Apache-2.0
 #
@@ -250,7 +250,7 @@ def extract_subgraph(nng, orig_sg, arch):
     for tens in curr_sg.output_tensors:
         # ofm can depend on multiple ops. These ops can be divided into different NPU
         # nodes due to CPU nodes. If that is the case the ofm must be NHWC.
-        tens.needs_linear_format = True
+        tens.force_linear_format = True
 
     return new_subgraphs
 
diff --git a/ethosu/vela/graph_optimiser_util.py b/ethosu/vela/graph_optimiser_util.py
index 8b24eaf9..e1341d82 100644
--- a/ethosu/vela/graph_optimiser_util.py
+++ b/ethosu/vela/graph_optimiser_util.py
@@ -30,6 +30,7 @@ from .operation import Op
 from .shape4d import Shape4D
 from .tensor import create_const_tensor
 from .tensor import QuantizationParameters
+from .tensor import Tensor
 
 memory_only_ops = (
     Op.Reshape,
@@ -90,7 +91,9 @@ def _avoid_nhcwb16_for_memory_only(tens):
 
 
 # Check if non linear format can be used
-def check_format_restrictions(tens, arch):
+def check_format_restrictions(tens: Tensor, arch):
+    if tens.force_linear_format:
+        return
     if len(tens.ops) < 1:
         return
     if tens.ops[0].type in (Op.Placeholder, Op.SubgraphInput, Op.Const) or any(
@@ -161,7 +164,7 @@ def check_format_restrictions(tens, arch):
     else:
         return
 
-    tens.needs_linear_format = False
+    tens.force_linear_format = False
 
 
 def calc_explicit_padding(input_size, stride, filter_size, pad_before, pad_after) -> Tuple[int, int]:
diff --git a/ethosu/vela/scheduler.py b/ethosu/vela/scheduler.py
index 83e19bc6..6fcb6c1d 100644
--- a/ethosu/vela/scheduler.py
+++ b/ethosu/vela/scheduler.py
@@ -467,11 +467,11 @@ class Scheduler:
             if output in self.sg.output_tensors or output.purpose != TensorPurpose.FeatureMap:
                 continue
 
-            if output.needs_linear_format:
+            if output.use_linear_format:
                 continue
 
             if self.avoid_nhcwb16_for_ofm(output, ps, arch):
-                output.needs_linear_format = True
+                output.force_linear_format = True
                 continue
 
             output.set_format(TensorFormat.NHCWB16, arch)
@@ -504,11 +504,11 @@
             if ps.ofm_tensor in self.sg.output_tensors:
                 # This Op produces a subgraph output
                 op.requires_full_ofm = True
-            if ps.ifm_tensor.needs_linear_format:
+            if ps.ifm_tensor.use_linear_format:
                 op.requires_full_ifm = True
-            if ps.ifm2_tensor and ps.ifm2_tensor.needs_linear_format:
+            if ps.ifm2_tensor and ps.ifm2_tensor.use_linear_format:
                 op.requires_full_ifm2 = True
-            if ps.ofm_tensor.needs_linear_format or ps.primary_op.memory_function == Op.ConcatSliceWrite:
+            if ps.ofm_tensor.use_linear_format or ps.primary_op.memory_function == Op.ConcatSliceWrite:
                 op.requires_full_ofm = True
             if len(ps.primary_op.outputs) > 1 or len(ps.primary_op.outputs[0].consumer_list) > 1:
                 # Op has multiple outputs or consumers - requires full OFM
diff --git a/ethosu/vela/tensor.py b/ethosu/vela/tensor.py
index 86306cad..51c7592e 100644
--- a/ethosu/vela/tensor.py
+++ b/ethosu/vela/tensor.py
@@ -397,7 +397,7 @@ class Tensor:
         "block_traversal",
         "equivalence_id",
         "src_tensor",
-        "needs_linear_format",
+        "force_linear_format",
         "ifm_write_protected",
     )
     AllocationQuantum = 16
@@ -444,12 +444,18 @@
         self.quantization: Optional[QuantizationParameters] = None
         self.block_traversal: TensorBlockTraversal = TensorBlockTraversal.Default
 
-        self.needs_linear_format = True
+        # Keep track of whether the linear format should be enforced
+        self.force_linear_format: Optional[bool] = None
         self.ifm_write_protected = False
 
         # Reference to parent-tensor if this tensor is a clone
         self.src_tensor: Optional[Tensor] = None
 
+    @property
+    def use_linear_format(self) -> bool:
+        """Return whether the tensor should use linear format or not."""
+        return self.force_linear_format in (True, None)
+
     @property
     def original_shape(self):
         return self._original_shape
@@ -545,7 +551,7 @@
         if shape_len > 4:
             return
 
-        assert not (self.needs_linear_format and fmt == TensorFormat.NHCWB16)
+        assert not (self.use_linear_format and fmt == TensorFormat.NHCWB16)
         self.storage_rounding_quantum = arch.storage_rounding_quantums[self.format]
         self.storage_rounding_quantum = tuple(self.storage_rounding_quantum[-shape_len:])
         self.brick_size = arch.brick_sizes[self.format]
diff --git a/ethosu/vela/tflite_graph_optimiser.py b/ethosu/vela/tflite_graph_optimiser.py
index 44f5d6ae..518b6db0 100644
--- a/ethosu/vela/tflite_graph_optimiser.py
+++ b/ethosu/vela/tflite_graph_optimiser.py
@@ -942,19 +942,21 @@ def reorder_depthwise_weights(op, arch, nng):
     return op
 
 
-def fixup_strided_conv(op, arch, nng):
+def fixup_strided_conv(op: Operation, arch, nng):
+    """Optimize or fixup strided Conv2DBias
+    Optimization:
+        Reduce, when possible, the Conv2DBias stride from 2 to 1 by re-shaping
+        both IFM and filter.
+
+    Fixup:
+        Introduce software support for Conv2DBias with stride_width = 4 by
+        reducing it to 1 when possible by re-shaping both IFM and filter.
+    """
     if op.type != Op.Conv2DBias:
         return op
     stride_x, stride_y = op.get_kernel_stride()
     weight_tensor = op.weights
     ifm_shape = op.ifm_shapes[0]
-
-    # Do not optimize if op is not the first in the network and stride is
-    # supported by the hardware
-    if op.op_index != 0 and stride_x < 4:
-        return op
-    op.ifm.needs_linear_format = True
-
     if (
         (stride_x == 2 or stride_x == 4)
         and ifm_shape.depth <= 4
@@ -1004,6 +1006,7 @@
 
         stride_x = 1
         op.attrs.update({"stride_w": stride_x, "stride_h": stride_y, "strides": (1, stride_y, stride_x, 1)})
+        op.ifm.force_linear_format = True
 
     return op
 
@@ -2125,7 +2128,6 @@
         convert_prelu,
         convert_mul_max_to_abs_or_lrelu,
         convert_lrelu,
-        fixup_strided_conv,
        convert_hardswish_to_lut,
         rewrite_fully_connected_input,
         convert_batched_fc_shape,
@@ -2139,6 +2141,7 @@
         convert_tanh_sigmoid_to_lut,
         replace_pad_by_hw_pad,
         fixup_dilation_gt2,
+        fixup_strided_conv,
     ]
 
     for idx, sg in enumerate(nng.subgraphs):
-- 
cgit v1.2.1
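
Background to the rewrite (illustration, not part of the commit): fixup_strided_conv rests on the
equivalence that a stride-2 convolution over an NHWC IFM equals a stride-1 convolution after packing
pairs of horizontally adjacent columns into channels and repacking the filter the same way. That is
presumably also why the pass now forces linear (NHWC) format on the IFM: the repacking is only a
cheap re-view of a linearly stored buffer. Below is a minimal NumPy sketch of the equivalence; it is
not Vela code, and it assumes an even IFM width, an even kernel width and VALID padding, whereas the
real pass also zero-pads odd kernel widths and handles stride 4.

    # Minimal NumPy sketch of the stride-2 -> stride-1 rewrite (assumptions as stated above).
    import numpy as np


    def conv2d(ifm, weights, stride_x):
        """Naive HWC convolution, VALID padding, stride 1 in y."""
        h, w, c = ifm.shape
        kh, kw, _, oc = weights.shape
        out_h = h - kh + 1
        out_w = (w - kw) // stride_x + 1
        ofm = np.zeros((out_h, out_w, oc))
        for y in range(out_h):
            for x in range(out_w):
                patch = ifm[y : y + kh, x * stride_x : x * stride_x + kw, :]
                ofm[y, x] = np.tensordot(patch, weights, axes=([0, 1, 2], [0, 1, 2]))
        return ofm


    rng = np.random.default_rng(0)
    ifm = rng.standard_normal((8, 12, 3))        # H x W x C, even W
    weights = rng.standard_normal((2, 4, 3, 5))  # kh x kw x C x OC, even kw

    reference = conv2d(ifm, weights, stride_x=2)

    # Pack width pairs into channels: (H, W, C) -> (H, W/2, 2C). On a linearly
    # (NHWC) stored buffer this reshape is just a re-view, no data movement.
    h, w, c = ifm.shape
    kh, kw, _, oc = weights.shape
    ifm_packed = ifm.reshape(h, w // 2, 2 * c)
    weights_packed = weights.reshape(kh, kw // 2, 2 * c, oc)

    # Stride 1 over the packed tensors reproduces the stride-2 OFM.
    optimised = conv2d(ifm_packed, weights_packed, stride_x=1)
    assert np.allclose(reference, optimised)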