From 9070f0f1d9ee0fbf2cc3ee62a60f9b600bd62055 Mon Sep 17 00:00:00 2001 From: Johan Alfven Date: Tue, 7 Feb 2023 13:01:03 +0100 Subject: MLBEDSW-7316: Fix crash for networks with resource variables - The problem was that networks with resource variables have not been thought of. The major problem was the graph traversal where these ops were not visited resulting in an empty subgraph that resulted in the crash. - Fixed the problem by attaching virtual tensors to the ops simulating subgraph output. These tensors are only used to get the graph traversal to work. - Fixed serializing of attribute container and shared_name - Fixed subgraph index for operator CallOnce - All resource variable ops are pushed to the CPU Change-Id: I815f9c81baf7a3fbb686e895980b462f58208b6e Signed-off-by: Johan Alfven --- ethosu/vela/live_range.py | 2 ++ ethosu/vela/mark_tensors.py | 4 +++- ethosu/vela/nn_graph.py | 6 +++++- ethosu/vela/pass_packing.py | 15 ++++++++++----- ethosu/vela/tensor.py | 11 ++++++++++- ethosu/vela/tflite_reader.py | 26 +++++++++++++++++++++++++- ethosu/vela/tflite_writer.py | 13 ++++++++++++- 7 files changed, 67 insertions(+), 10 deletions(-) (limited to 'ethosu') diff --git a/ethosu/vela/live_range.py b/ethosu/vela/live_range.py index b18afecc..6a2a04ac 100644 --- a/ethosu/vela/live_range.py +++ b/ethosu/vela/live_range.py @@ -155,6 +155,8 @@ class LiveRangeGraph: def tensor_should_be_ignored(tens, target_mem_area, target_mem_type_set): + if tens.purpose == TensorPurpose.Virtual: + return True if target_mem_area is None or target_mem_type_set is None: return False if tens.mem_area != target_mem_area or tens.mem_type not in target_mem_type_set: diff --git a/ethosu/vela/mark_tensors.py b/ethosu/vela/mark_tensors.py index 64cc7883..4b5bf1dc 100644 --- a/ethosu/vela/mark_tensors.py +++ b/ethosu/vela/mark_tensors.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright 2020-2021 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2023 Arm Limited 
and/or its affiliates # # SPDX-License-Identifier: Apache-2.0 # @@ -41,6 +41,8 @@ def mark_purpose(tens, arch, purpose): # Sets tensor's purpose, format, mem_area and mem_type if tens.purpose == TensorPurpose.Unknown: tens.purpose = purpose + elif tens.purpose == TensorPurpose.Virtual: + return elif tens.purpose not in (purpose, TensorPurpose.LUT): assert 0, "Cannot resolve tensor purpose {} and {} for tensor {}".format(tens.purpose, purpose, tens) diff --git a/ethosu/vela/nn_graph.py b/ethosu/vela/nn_graph.py index 846632df..a43aac2a 100644 --- a/ethosu/vela/nn_graph.py +++ b/ethosu/vela/nn_graph.py @@ -149,7 +149,11 @@ class Subgraph: def __init__(self, name="", placement=PassPlacement.Cpu): self.output_tensors = [] self.input_tensors = [] - self.original_inputs = [] # Preserve the original input order + # Preserve the original input order + self.original_inputs = [] + # Attach virtual outputs to resource variables op + # in order to be able to traverse the graph correctly + self.virtual_outputs = [] self.passes = [] self.cascaded_passes = [] self.name = name diff --git a/ethosu/vela/pass_packing.py b/ethosu/vela/pass_packing.py index 5c0d8ebe..6049366f 100644 --- a/ethosu/vela/pass_packing.py +++ b/ethosu/vela/pass_packing.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright 2020-2022 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2023 Arm Limited and/or its affiliates # # SPDX-License-Identifier: Apache-2.0 # @@ -469,6 +469,8 @@ def pack_into_passes(nng, arch, verbose_packing=False): # # 1) CPU passes that only depends on sg.input_tensor can be # moved to the top of the list. + # ResourceVariables ops like VarHandle, ReadVariable, CallOnce + # can also be moved to the top of list. 
# # 2) A CPU pass X is allowed to be grouped together with CPU pass Y # if there is no NPU pass between pass X and pass Y that depends @@ -487,17 +489,20 @@ def pack_into_passes(nng, arch, verbose_packing=False): pass_list_top.insert(0, ps) continue - if ( - ps.placement == PassPlacement.Cpu - and ps.ops[0].ifm in sg.input_tensors + if ps.placement == PassPlacement.Cpu and ( + ps.ops[0].ifm in sg.input_tensors and (ps.ops[0].ifm2 in sg.input_tensors or ps.ops[0].ifm2 is None) + or (ps.ops[0].type in (Op.VarHandle, Op.ReadVariable, Op.CallOnce)) ): - # This CPU pass only depends on sg.input_tensors + # This CPU pass only depends on sg.input_tensors or resource variable pass_list_top.append(ps) else: # Add pass to the list that will be sorted in the next step pass_list.append(ps) + # Sort ops by op_index (same call order as in the original graph) + pass_list_top = sorted(pass_list_top, key=lambda ps: -1 if ps.ops[0].op_index is None else ps.ops[0].op_index) + # Sort the rest of the list based on critera 2. # Search from bottom of list and when a CPU pass is found # search forward in the list and see if it is possible to join another CPU pass. 
diff --git a/ethosu/vela/tensor.py b/ethosu/vela/tensor.py index 6a95bad4..008cd05e 100644 --- a/ethosu/vela/tensor.py +++ b/ethosu/vela/tensor.py @@ -114,7 +114,8 @@ class TensorPurpose(enum.IntFlag): ScratchFast = 4 LUT = 5 FSBias = 6 - Size = 7 + Virtual = 7 + Size = 8 def display_name(self) -> str: return ("Unknown", "Weights", "FeatureMap", "Scratch", "ScratchFast", "LUT", "FastStorageBias", "Size")[ @@ -297,6 +298,14 @@ class QuantizationParameters: return False +def create_virtual_tensor( + name: str, +): + virtual_tensor = Tensor([], DataType.int8, name) + virtual_tensor.purpose = TensorPurpose.Virtual + return virtual_tensor + + def create_const_tensor( name: str, shape: Shape, diff --git a/ethosu/vela/tflite_reader.py b/ethosu/vela/tflite_reader.py index 80f36457..2325ff65 100644 --- a/ethosu/vela/tflite_reader.py +++ b/ethosu/vela/tflite_reader.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright 2020-2022 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2023 Arm Limited and/or its affiliates # # SPDX-License-Identifier: Apache-2.0 # @@ -32,6 +32,7 @@ from .reader_util import align_tensor_indices_to_nng from .reader_util import clone_and_reshape_tensor from .reader_util import decode_str from .reader_util import fixup_tensors +from .tensor import create_virtual_tensor from .tensor import QuantizationParameters from .tensor import Tensor from .tflite.BuiltinOperator import BuiltinOperator @@ -51,6 +52,7 @@ class TFLiteSubgraph: for idx in range(subgraph.TensorsLength()): self.tensors.append(self.parse_tensor(subgraph.Tensors(idx))) + self.virtual_outputs = [] for idx in range(subgraph.OperatorsLength()): self.parse_operator(idx, subgraph.Operators(idx)) @@ -58,6 +60,8 @@ class TFLiteSubgraph: self.inputs = self.get_tensors_from_indices_remove_duplicates(subgraph.InputsAsNumpy(), "input") fixup_tensors(self.inputs, self.tensors) + self.outputs.extend(self.virtual_outputs) + def get_tensors_from_indices_remove_duplicates(self, 
indices, warning_str): tensors = [] for idx in indices: @@ -131,6 +135,21 @@ class TFLiteSubgraph: for out in op.outputs: out.ops = [op] + if op_type in (Op.AssignVariable, Op.CallOnce): + # All graph traversals are based on depth-first and the starting + # points are the subgraph output tensors. Because of this, operators + # like AssignVariable and CallOnce will not be visited when the + # graph is traversed and the ops are never handled. In order to + # fix that, the code base will have to be changed in several places. + # Until then this workaround is applied. A virtual output is added + # both to the operator and to the subgraph. By doing this the full + # graph is traversed correctly. The tensor is not used for anything + # else. + op.name = f"{op_type}_{op_index}" + tens = create_virtual_tensor(op.name) + op.set_output_tensor(tens) + self.virtual_outputs.append(tens) + if op.type.is_depthwise_conv2d_op() or op.type.is_conv2d_op() or op.type == Op.FullyConnected: if inputs[1].values is not None: if op.type == Op.FullyConnected: @@ -156,6 +175,10 @@ class TFLiteSubgraph: self.graph.nng.subgraphs[cond_subgraph_index], self.graph.nng.subgraphs[body_subgraph_index], ) + if op_type == Op.CallOnce: + # Attach the actual nng subgraphs to the op + init_subgraph_index = op.attrs["init_subgraph_index"] + op.attrs["subgraph"] = (self.graph.nng.subgraphs[init_subgraph_index],) if op_type == Op.Reshape and "new_shape" not in op.attrs: # Reshape should have an attrib "new_shape" but if it is missing, add it based on the output shape @@ -250,6 +273,7 @@ class TFLiteGraph: sg.name = tflite_sg.name sg.original_inputs = tflite_sg.inputs # Preserve the original input order sg.output_tensors = tflite_sg.outputs + sg.virtual_outputs = tflite_sg.virtual_outputs parsing_step = "parsing metadata length" # Preserve the original metadata diff --git a/ethosu/vela/tflite_writer.py b/ethosu/vela/tflite_writer.py index e527cd4d..32982298 100644 --- a/ethosu/vela/tflite_writer.py +++ 
b/ethosu/vela/tflite_writer.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright 2020-2022 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2023 Arm Limited and/or its affiliates # # SPDX-License-Identifier: Apache-2.0 # @@ -321,6 +321,10 @@ class TFLiteSerialiser: attrs["dilation_w_factor"] = attrs["dilation"][2] if "channel_multiplier" in attrs: attrs["depth_multiplier"] = attrs["channel_multiplier"] + if "container" in attrs: + attrs["container"] = builder.CreateString(attrs["container"]) + if "shared_name" in attrs: + attrs["shared_name"] = builder.CreateString(attrs["shared_name"]) attrs["fused_activation_function"] = op.activation.op_type if op.activation is not None else None builtin_opt_offset, custom_opt_offset = opt_serializer.serialize(builder, attrs) @@ -362,6 +366,13 @@ class TFLiteSerialiser: # to an op. tensor_set = set(sg.original_inputs) + # Remove any virtual outputs since they are only used internally when + # traversing the graph. + for tens in sg.virtual_outputs: + tens.ops[0].outputs = [] + if tens in sg.output_tensors: + sg.output_tensors.remove(tens) + # Add the tensors from all valid ops, as well as the tensors from placeholder ops # This allows us to serialise tensors which arent attached to any specific ops, # e.g. due to an empty graph containing no ops -- cgit v1.2.1