4 files changed, 97 insertions, 53 deletions
diff --git a/ethosu/vela/compiler_driver.py b/ethosu/vela/compiler_driver.py
index cace0f08..61a3b0b1 100644
--- a/ethosu/vela/compiler_driver.py
+++ b/ethosu/vela/compiler_driver.py
@@ -233,7 +233,10 @@ def compiler_driver(nng, arch, options, scheduler_options, network_type, output_
             sg, arch, scratch_tens, scratch_fast_tens, flash_tens
         )
 
-    npu_serialisation.rewrite_npu_call_ops(root_sg, arch)
+    # Create list of CPU subgraphs with same order as the list of all subgraphs
+    cpu_subgraphs = [sg for sg in nng.subgraphs if sg.placement == PassPlacement.Cpu]
+    for sg in cpu_subgraphs:
+        npu_serialisation.rewrite_npu_call_ops(sg, arch)
 
     # Set Scratch and Fast_scratch Tensor size
     if scratch_tens is not None:
diff --git a/ethosu/vela/live_range.py b/ethosu/vela/live_range.py
index e683f9f5..9b6fe63d 100644
--- a/ethosu/vela/live_range.py
+++ b/ethosu/vela/live_range.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2020 Arm Limited or its affiliates. All rights reserved.
+# Copyright (C) 2020-2022 Arm Limited or its affiliates. All rights reserved.
 #
 # SPDX-License-Identifier: Apache-2.0
 #
@@ -224,18 +224,24 @@ def extract_live_ranges_from_cascaded_passes(
             rng = lr_graph.get_or_create_range(tens, cpu_tensor_alignment)
             rng.mark_usage(time_for_pass)
 
-        cps_primary_op = cps.passes[0].primary_op
-
-        if (
-            cps_primary_op
-            and cps_primary_op.type == Op.CustomNpuOp
-            and MemType.Permanent_CPU not in target_mem_type_set
-        ):
-            # If the primary-op is an NpuOp that means this is where an Npu subgraph
-            # is called. Go into said subgraph and extract live ranges before continuing.
-            # Use default allocation alignment of 16 for Npu tensors
-            npu_sg = cps_primary_op.attrs["subgraph"]
-            lr_graph = _extract_live_ranges_from_schedule(npu_sg, target_mem_area, target_mem_type_set, lr_graph)
+        op_subgraph = cps.passes[0].ops[0].attrs.get("subgraph", None)
+        op_type = cps.passes[0].ops[0].type
+
+        if op_subgraph is not None and MemType.Permanent_CPU not in target_mem_type_set:
+            if op_type == Op.CustomNpuOp:
+                # If the first op is a CustomNpuOp that means this is where an Npu subgraph
+                # is called. Go into said subgraph and extract live ranges before continuing.
+                # Use default allocation alignment of 16 for Npu tensors
+                lr_graph = _extract_live_ranges_from_schedule(
+                    op_subgraph, target_mem_area, target_mem_type_set, lr_graph
+                )
+            else:
+                # The op has one or more subgraphs in it (a typical op is the While op)
+                # Go into all subgraphs and extract live ranges before continuing.
+                for op_sg in op_subgraph:
+                    lr_graph = extract_live_ranges_from_cascaded_passes(
+                        op_sg, target_mem_area, target_mem_type_set, lr_graph, cpu_tensor_alignment
+                    )
             # Set the new time after handling the Npu subgraph
             time_for_pass = lr_graph.current_time
             cps.time = time_for_pass
diff --git a/ethosu/vela/tflite_reader.py b/ethosu/vela/tflite_reader.py
index 8dc5efe1..fa90ad9e 100644
--- a/ethosu/vela/tflite_reader.py
+++ b/ethosu/vela/tflite_reader.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2020-2021 Arm Limited or its affiliates. All rights reserved.
+# Copyright (C) 2020-2022 Arm Limited or its affiliates. All rights reserved.
 #
 # SPDX-License-Identifier: Apache-2.0
 #
@@ -147,6 +147,15 @@ class TFLiteSubgraph:
         if opt_serializer is not None:
             op.attrs = opt_serializer.deserialize(op_data)
 
+            if op_type == Op.While:
+                # Attach the actual nng subgraphs to the op
+                cond_subgraph_index = op.attrs["cond_subgraph_index"]
+                body_subgraph_index = op.attrs["body_subgraph_index"]
+                op.attrs["subgraph"] = (
+                    self.graph.nng.subgraphs[cond_subgraph_index],
+                    self.graph.nng.subgraphs[body_subgraph_index],
+                )
+
             if op_type == Op.Reshape and "new_shape" not in op.attrs:
                 # Reshape should have an attrib "new_shape" but if it is missing, add it based on the output shape
                 op.attrs["new_shape"] = outputs[0].shape
@@ -223,16 +232,23 @@ class TFLiteGraph:
 
             parsing_step = "parsing subgraphs length"
             self.subgraphs = []
+
+            # Pre-allocate nng subgraphs - needed when parsing an operator and the operator
+            # has subgraph attributes.
+            self.nng = Graph(self.name, self.batch_size)
+            for idx in range(model.SubgraphsLength()):
+                sg = Subgraph()
+                self.nng.subgraphs.append(sg)
+
             for idx in range(model.SubgraphsLength()):
                 parsing_step = f"parsing subgraph {idx}"
                 self.subgraphs.append(TFLiteSubgraph(self, model.Subgraphs(idx)))
 
-            self.nng = Graph(self.name, self.batch_size)
-            for tflite_sg in self.subgraphs:
-                sg = Subgraph(tflite_sg.name)
+            for idx, tflite_sg in enumerate(self.subgraphs):
+                sg = self.nng.subgraphs[idx]
+                sg.name = tflite_sg.name
                 sg.original_inputs = tflite_sg.inputs  # Preserve the original input order
                 sg.output_tensors = tflite_sg.outputs
-                self.nng.subgraphs.append(sg)
 
             parsing_step = "parsing metadata length"
             # Preserve the original metadata
diff --git a/ethosu/vela/tflite_writer.py b/ethosu/vela/tflite_writer.py
index 7aab01f2..ce53f9b1 100644
--- a/ethosu/vela/tflite_writer.py
+++ b/ethosu/vela/tflite_writer.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2020-2021 Arm Limited or its affiliates. All rights reserved.
+# Copyright (C) 2020-2022 Arm Limited or its affiliates. All rights reserved.
 #
 # SPDX-License-Identifier: Apache-2.0
 #
@@ -71,13 +71,19 @@ def make_vector(v):
 
 
 class TFLiteSerialiser:
+
+    BUF_IDX_SCRATCH = 0  # Always assign scratch to buffer 0
+    BUF_IDX_SCRATCH_FAST = 1  # Always assign scratch_fast to buffer 1
+    BUF_IDX_START = 2  # Unique buffer id for every tensor in all subgraphs
+
     def __init__(self, nng):
         self.builder = flatbuffers.Builder(0)
         self.nng = nng
 
-        self.scratch_buf_id = 0  # Always assign scratch to buffer 0
-        self.scratch_fast_buf_id = 1  # Always assign scratch_fast to buffer 1
+        self.buf_idx = TFLiteSerialiser.BUF_IDX_START
         self.buffers_to_write = []  # have an empty array there
+        self.tensor_map_all = []  # Keep track of all subgraphs
+        self.tensor_map_sg = []  # Keep track of one subgraph
 
         self.ops_to_ignore = (Op.Const, Op.Placeholder, Op.SubgraphInput)
 
@@ -154,22 +160,20 @@ class TFLiteSerialiser:
 
         buffer_map = {}
 
-        buf_idx = 2
-
         for tens in tensors:
             # Set buffer ids depending on allocation
             if tens.is_allocated_in_tensor_arena(scratch_tensor_mem_area):
-                buffer_map[tens] = self.scratch_buf_id
+                buffer_map[tens] = TFLiteSerialiser.BUF_IDX_SCRATCH
             elif tens.mem_type == MemType.Scratch_fast:
                 # For Scratch_fast when not co-allocated with scratch in the TensorArena:
-                buffer_map[tens] = self.scratch_fast_buf_id
+                buffer_map[tens] = TFLiteSerialiser.BUF_IDX_SCRATCH_FAST
             else:
-                buffer_map[tens] = buf_idx
-                buf_idx += 1
+                buffer_map[tens] = self.buf_idx
+                self.buf_idx += 1
 
-        # Initialize buffers_to_write to a length equal to number of buffers so
+        # Initialize/extend buffers_to_write to a length equal to number of buffers so
         # they can be appended at the correct index during tensor serialization
-        self.buffers_to_write = [None] * (buf_idx)
+        self.buffers_to_write += [None] * (self.buf_idx)
 
         return buffer_map
 
@@ -281,13 +285,13 @@ class TFLiteSerialiser:
         builder = self.builder
 
         inputs_offset = self.write_int_vector(
-            [self.tensor_map[tens] if tens in self.tensor_map else -1 for tens in op.inputs]
+            [self.tensor_map_sg[tens] if tens in self.tensor_map_sg else -1 for tens in op.inputs]
         )
         outputs_offset = self.write_int_vector(
-            [self.tensor_map[tens] for tens in op.outputs if tens in self.tensor_map]
+            [self.tensor_map_sg[tens] for tens in op.outputs if tens in self.tensor_map_sg]
         )
         intermediates_offset = self.write_int_vector(
-            [self.tensor_map[tens] for tens in op.intermediates if tens in self.tensor_map]
+            [self.tensor_map_sg[tens] for tens in op.intermediates if tens in self.tensor_map_sg]
         )
 
         if op.type == Op.Custom:
@@ -331,9 +335,8 @@ class TFLiteSerialiser:
         Operator.OperatorAddMutatingVariableInputs(builder, mutating_variable_inputs_offset)
         return Operator.OperatorEnd(builder)
 
-    def serialise_subgraph(self, sg):
+    def serialise_subgraph(self, sg, name):
         builder = self.builder
-        tensor_set = set()
         all_ops = []
         placeholder_ops = []
 
@@ -344,6 +347,14 @@ class TFLiteSerialiser:
                 elif op.type == Op.Placeholder:
                     placeholder_ops.append(op)
 
+        # Make sure all original tensors are written back, special case for Ops
+        # with connected subgraphs. Even though not all inputs are used,
+        # the reference kernel expects all inputs to be in the tflite file.
+        # Since we traverse the graph starting from the outputs, the outputs
+        # are always added, but an input that is not referenced by any op
+        # would otherwise be left out.
+        tensor_set = set(sg.original_inputs)
+
         # Add the tensors from all valid ops, as well as the tensors from placeholder ops
         # This allows us to serialise tensors which arent attached to any specific ops,
         # e.g. due to an empty graph containing no ops
@@ -362,18 +373,19 @@ class TFLiteSerialiser:
             assert len(scratch_tensors) == 1, "Multiple scratch tensors"
             scratch_tensor = scratch_tensors[0]
 
-        self.tensor_map = {tens: idx for idx, tens in enumerate(all_tensors)}
+        self.tensor_map_sg = {tens: idx for idx, tens in enumerate(all_tensors)}
         self.buffer_map = self.assign_buffers_to_tensors(all_tensors, scratch_tensor)
+        self.tensor_map_all.append(self.tensor_map_sg)
 
         tensors_offset = self.write_offset_vector([self.serialise_tensor(tens) for tens in all_tensors])
 
         # Make sure the input_tensors haven't been modified
         assert all(inp in sg.original_inputs for inp in sg.input_tensors)
-        inputs = [self.tensor_map[tens] for tens in sg.original_inputs if tens in self.tensor_map]
+        inputs = [self.tensor_map_sg[tens] for tens in sg.original_inputs if tens in self.tensor_map_sg]
 
         inputs_offset = self.write_int_vector(inputs)
         outputs_offset = self.write_int_vector(
-            [self.tensor_map[tens] for tens in sg.output_tensors if tens in self.tensor_map]
+            [self.tensor_map_sg[tens] for tens in sg.output_tensors if tens in self.tensor_map_sg]
         )
 
         operators_offset = self.write_offset_vector([self.serialise_operator(op) for op in all_ops])
@@ -384,6 +396,7 @@ class TFLiteSerialiser:
         SubGraph.SubGraphAddOutputs(builder, outputs_offset)
 
         SubGraph.SubGraphAddOperators(builder, operators_offset)
+        SubGraph.SubGraphAddName(builder, name)
 
         return SubGraph.SubGraphEnd(builder)
 
@@ -427,26 +440,32 @@ class TFLiteSerialiser:
 
         description = builder.CreateString("Vela Optimised")
 
-        subgraph_offset = self.write_offset_vector([self.serialise_subgraph(sg) for sg in self.subgraphs_to_write])
+        subgraph_offset = self.write_offset_vector(
+            [self.serialise_subgraph(sg, builder.CreateString(sg.name)) for sg in self.subgraphs_to_write]
+        )
 
         # Fill the metadata buffer
         version = np.int32(0)
-        subgraph_idx = np.int32(len(self.subgraphs_to_write))  # Only 1 supported currently
-        nbr_tensors = np.int32(len(self.tensor_map))
+        subgraph_idx = np.int32(len(self.subgraphs_to_write))
+
+        nbr_tensors_all = np.sum([len(tensor_map_sg) for tensor_map_sg in self.tensor_map_all], dtype=np.int32)
+
+        offlineAlloc = [version, subgraph_idx, nbr_tensors_all]
 
         if not any([name == b"OfflineMemoryAllocation" for name, _ in self.nng.metadata]):
-            # An offset of -1 indicates that the tensor will be allocated online by Tensorflow Lite Micro
-            offsets = [np.int32(-1)] * nbr_tensors
-
-            # Ensure that the order of the offsets match the order of the tensors
-            for tens, idx in self.tensor_map.items():
-                # Set offsets for tensor allocated in Tensor Arena or in the scratch_fast area
-                if tens.mem_type in (MemType.Scratch, MemType.Scratch_fast):
-                    offsets[idx] = np.int32(tens.address) if tens.address is not None else np.int32(0)
-
-            self.nng.metadata.append(
-                ("OfflineMemoryAllocation", np.array([version, subgraph_idx, nbr_tensors] + offsets))
-            )
+            for tensor_map_sg in self.tensor_map_all:
+                nbr_tensors_sg = np.int32(len(tensor_map_sg))
+                # An offset of -1 indicates that the tensor will be allocated online by Tensorflow Lite Micro
+                offsets = [np.int32(-1)] * nbr_tensors_sg
+                # Ensure that the order of the offsets match the order of the tensors
+                for tens, idx in tensor_map_sg.items():
+                    # Set offsets for tensor allocated in Tensor Arena or in the scratch_fast area
+                    if tens.mem_type in (MemType.Scratch, MemType.Scratch_fast):
+                        offsets[idx] = np.int32(tens.address) if tens.address is not None else np.int32(0)
+
+                offlineAlloc += offsets
+
+            self.nng.metadata.append(("OfflineMemoryAllocation", np.array(offlineAlloc)))
 
         metadata_list = []
         for name, buffer in self.nng.metadata: