author    Johan Alfvén <johan.alfven@arm.com>    2022-09-05 09:39:47 +0200
committer Johan Alfvén <johan.alfven@arm.com>    2022-10-19 13:37:45 +0200
commit    673683bb828cd552f1970922e3c61079607332b2 (patch)
tree      02e6ca41621ca7ec32d7eb6f36cb755b8da14963
parent    d3d81b3ce138a48c0cddad7eb12710e26dad653e (diff)
download  ethos-u-vela-673683bb828cd552f1970922e3c61079607332b2.tar.gz
MLBEDSW-6880: Add support for multiple subgraphs
- Vela failed to compile networks with multiple subgraphs because only
  cascaded passes in the root subgraph were used when extracting the live
  ranges. The fix is to also extract live ranges from the subgraphs of ops
  that have connected subgraphs.
- The tflite_writer did not handle multiple subgraphs correctly, resulting
  in corrupt buffer data in the optimized tflite file. The buffer index must
  be unique for every tensor across all subgraphs.
- Added support for multiple subgraphs in the OfflineMemoryAllocation
  metadata.

This change does not alter the behavior for networks with a single subgraph.

Signed-off-by: Johan Alfven <johan.alfven@arm.com>
Change-Id: I2328dfc1f07e2e4faf43a75423ea95423096ffa3
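For context, multiple subgraphs typically enter a tflite file through
control-flow ops. A minimal sketch of how such a model is produced, assuming
a recent TensorFlow install (the Counter module and all names below are
illustrative, not part of this change):

    import tensorflow as tf

    class Counter(tf.Module):
        @tf.function(input_signature=[tf.TensorSpec([], tf.int32)])
        def __call__(self, n):
            i = tf.constant(0)
            # cond and body each become a separate subgraph in the .tflite
            # file, referenced by index from a WHILE op in the main subgraph
            (i,) = tf.while_loop(lambda i: i < n, lambda i: (i + 1,), (i,))
            return i

    m = Counter()
    converter = tf.lite.TFLiteConverter.from_concrete_functions(
        [m.__call__.get_concrete_function()], m
    )
    tflite_model = converter.convert()  # main + cond + body subgraphs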
-rw-r--r--  ethosu/vela/compiler_driver.py |  5
-rw-r--r--  ethosu/vela/live_range.py      | 32
-rw-r--r--  ethosu/vela/tflite_reader.py   | 26
-rw-r--r--  ethosu/vela/tflite_writer.py   | 87
4 files changed, 97 insertions(+), 53 deletions(-)
diff --git a/ethosu/vela/compiler_driver.py b/ethosu/vela/compiler_driver.py
index cace0f08..61a3b0b1 100644
--- a/ethosu/vela/compiler_driver.py
+++ b/ethosu/vela/compiler_driver.py
@@ -233,7 +233,10 @@ def compiler_driver(nng, arch, options, scheduler_options, network_type, output_
sg, arch, scratch_tens, scratch_fast_tens, flash_tens
)
- npu_serialisation.rewrite_npu_call_ops(root_sg, arch)
+ # Create list of CPU subgraphs with same order as the list of all subgraphs
+ cpu_subgraphs = [sg for sg in nng.subgraphs if sg.placement == PassPlacement.Cpu]
+ for sg in cpu_subgraphs:
+ npu_serialisation.rewrite_npu_call_ops(sg, arch)
# Set Scratch and Fast_scratch Tensor size
if scratch_tens is not None:
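A self-contained toy of the iteration change in this hunk, using stand-in
classes rather than the real Vela types: previously only the root subgraph had
its NPU call ops rewritten, now every CPU-placed subgraph is processed, in the
same order as they appear in nng.subgraphs:

    from enum import Enum, auto

    class PassPlacement(Enum):
        Cpu = auto()
        Npu = auto()

    subgraphs = [
        ("main", PassPlacement.Cpu),
        ("while_cond", PassPlacement.Cpu),
        ("while_body", PassPlacement.Cpu),
        ("npu_stream", PassPlacement.Npu),
    ]

    cpu_subgraphs = [sg for sg in subgraphs if sg[1] == PassPlacement.Cpu]
    for name, _ in cpu_subgraphs:
        print(f"rewrite_npu_call_ops({name})")  # stand-in for the real rewrite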
diff --git a/ethosu/vela/live_range.py b/ethosu/vela/live_range.py
index e683f9f5..9b6fe63d 100644
--- a/ethosu/vela/live_range.py
+++ b/ethosu/vela/live_range.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2020 Arm Limited or its affiliates. All rights reserved.
+# Copyright (C) 2020-2022 Arm Limited or its affiliates. All rights reserved.
#
# SPDX-License-Identifier: Apache-2.0
#
@@ -224,18 +224,24 @@ def extract_live_ranges_from_cascaded_passes(
rng = lr_graph.get_or_create_range(tens, cpu_tensor_alignment)
rng.mark_usage(time_for_pass)
- cps_primary_op = cps.passes[0].primary_op
-
- if (
- cps_primary_op
- and cps_primary_op.type == Op.CustomNpuOp
- and MemType.Permanent_CPU not in target_mem_type_set
- ):
- # If the primary-op is an NpuOp that means this is where an Npu subgraph
- # is called. Go into said subgraph and extract live ranges before continuing.
- # Use default allocation alignment of 16 for Npu tensors
- npu_sg = cps_primary_op.attrs["subgraph"]
- lr_graph = _extract_live_ranges_from_schedule(npu_sg, target_mem_area, target_mem_type_set, lr_graph)
+ op_subgraph = cps.passes[0].ops[0].attrs.get("subgraph", None)
+ op_type = cps.passes[0].ops[0].type
+
+ if op_subgraph is not None and MemType.Permanent_CPU not in target_mem_type_set:
+ if op_type == Op.CustomNpuOp:
+ # If the primary-op is an NpuOp that means this is where an Npu subgraph
+ # is called. Go into said subgraph and extract live ranges before continuing.
+ # Use default allocation alignment of 16 for Npu tensors
+ lr_graph = _extract_live_ranges_from_schedule(
+ op_subgraph, target_mem_area, target_mem_type_set, lr_graph
+ )
+ else:
+ # The op has one or more subgraphs in it (a typical op is the While op)
+ # Go into all subgraphs and extract live ranges before continuing.
+ for op_sg in op_subgraph:
+ lr_graph = extract_live_ranges_from_cascaded_passes(
+ op_sg, target_mem_area, target_mem_type_set, lr_graph, cpu_tensor_alignment
+ )
# Set the new time after handling the Npu subgraph
time_for_pass = lr_graph.current_time
cps.time = time_for_pass
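A minimal, runnable sketch of the recursion this hunk introduces, with
dict-based stand-ins for the real Op/Subgraph classes: extraction descends
into the single NPU subgraph of a CustomNpuOp and into every subgraph attached
to a CPU op such as While (its cond/body pair), so all tensors share one
live-range timeline:

    def extract_live_ranges(sg, lr_graph, indent=""):
        for op in sg["ops"]:
            print(f"{indent}visit {op['type']} in {sg['name']}")
            connected = op.get("subgraph")
            if connected is None:
                continue
            if op["type"] == "CustomNpuOp":
                # single NPU subgraph, handled by the schedule-based extractor
                extract_live_ranges(connected, lr_graph, indent + "  ")
            else:
                # tuple of CPU subgraphs, e.g. (cond, body) of a While op
                for sub in connected:
                    extract_live_ranges(sub, lr_graph, indent + "  ")
        return lr_graph

    cond = {"name": "while_cond", "ops": [{"type": "Less"}]}
    body = {"name": "while_body", "ops": [{"type": "Add"}]}
    main = {"name": "main", "ops": [{"type": "While", "subgraph": (cond, body)}]}
    extract_live_ranges(main, lr_graph={})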
diff --git a/ethosu/vela/tflite_reader.py b/ethosu/vela/tflite_reader.py
index 8dc5efe1..fa90ad9e 100644
--- a/ethosu/vela/tflite_reader.py
+++ b/ethosu/vela/tflite_reader.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2020-2021 Arm Limited or its affiliates. All rights reserved.
+# Copyright (C) 2020-2022 Arm Limited or its affiliates. All rights reserved.
#
# SPDX-License-Identifier: Apache-2.0
#
@@ -147,6 +147,15 @@ class TFLiteSubgraph:
if opt_serializer is not None:
op.attrs = opt_serializer.deserialize(op_data)
+ if op_type == Op.While:
+ # Attach the actual nng subgraphs to the op
+ cond_subgraph_index = op.attrs["cond_subgraph_index"]
+ body_subgraph_index = op.attrs["body_subgraph_index"]
+ op.attrs["subgraph"] = (
+ self.graph.nng.subgraphs[cond_subgraph_index],
+ self.graph.nng.subgraphs[body_subgraph_index],
+ )
+
if op_type == Op.Reshape and "new_shape" not in op.attrs:
# Reshape should have an attrib "new_shape" but if it is missing, add it based on the output shape
op.attrs["new_shape"] = outputs[0].shape
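The cond/body indices come from the WHILE operator's builtin options in the
flatbuffer. A hedged sketch of reading them with the flatbuffers-generated
schema code that Vela vendors under ethosu/vela/tflite/ (the helper function
and its wiring are illustrative, not Vela's actual reader):

    from ethosu.vela.tflite.WhileOptions import WhileOptions

    def while_subgraph_indices(op_data):
        # op_data is a generated tflite Operator; for WHILE ops its builtin
        # options union holds a WhileOptions table
        union = op_data.BuiltinOptions()
        opts = WhileOptions()
        opts.Init(union.Bytes, union.Pos)
        return opts.CondSubgraphIndex(), opts.BodySubgraphIndex()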
@@ -223,16 +232,23 @@ class TFLiteGraph:
parsing_step = "parsing subgraphs length"
self.subgraphs = []
+
+ # Pre-allocate nng subgraphs - needed when parsing an operator and the operator
+ # has subgraph attributes.
+ self.nng = Graph(self.name, self.batch_size)
+ for idx in range(model.SubgraphsLength()):
+ sg = Subgraph()
+ self.nng.subgraphs.append(sg)
+
for idx in range(model.SubgraphsLength()):
parsing_step = f"parsing subgraph {idx}"
self.subgraphs.append(TFLiteSubgraph(self, model.Subgraphs(idx)))
- self.nng = Graph(self.name, self.batch_size)
- for tflite_sg in self.subgraphs:
- sg = Subgraph(tflite_sg.name)
+ for idx, tflite_sg in enumerate(self.subgraphs):
+ sg = self.nng.subgraphs[idx]
+ sg.name = tflite_sg.name
sg.original_inputs = tflite_sg.inputs # Preserve the original input order
sg.output_tensors = tflite_sg.outputs
- self.nng.subgraphs.append(sg)
parsing_step = "parsing metadata length"
# Preserve the original metadata
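A toy illustration, in plain Python with hypothetical names, of the two-pass
construction above: allocating every nng subgraph before parsing any of them
lets a While op parsed in subgraph 0 resolve a forward reference to subgraph
2, which has not been parsed yet:

    class Subgraph:
        def __init__(self):
            self.name = None

    raw_subgraphs = ["main", "while_cond", "while_body"]  # stand-in for flatbuffer data

    nng_subgraphs = [Subgraph() for _ in raw_subgraphs]   # pass 1: pre-allocate
    for idx, raw in enumerate(raw_subgraphs):             # pass 2: parse and fill
        # while parsing "main", an op may already take references such as
        # (nng_subgraphs[1], nng_subgraphs[2]) even though those are still empty
        nng_subgraphs[idx].name = raw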
diff --git a/ethosu/vela/tflite_writer.py b/ethosu/vela/tflite_writer.py
index 7aab01f2..ce53f9b1 100644
--- a/ethosu/vela/tflite_writer.py
+++ b/ethosu/vela/tflite_writer.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2020-2021 Arm Limited or its affiliates. All rights reserved.
+# Copyright (C) 2020-2022 Arm Limited or its affiliates. All rights reserved.
#
# SPDX-License-Identifier: Apache-2.0
#
@@ -71,13 +71,19 @@ def make_vector(v):
class TFLiteSerialiser:
+
+ BUF_IDX_SCRATCH = 0 # Always assign scratch to buffer 0
+ BUF_IDX_SCRATCH_FAST = 1 # Always assign scratch_fast to buffer 1
+ BUF_IDX_START = 2 # Unique buffer id for every tensor in all subgraphs
+
def __init__(self, nng):
self.builder = flatbuffers.Builder(0)
self.nng = nng
- self.scratch_buf_id = 0 # Always assign scratch to buffer 0
- self.scratch_fast_buf_id = 1 # Always assign scratch_fast to buffer 1
+ self.buf_idx = TFLiteSerialiser.BUF_IDX_START
self.buffers_to_write = [] # have an empty array there
+ self.tensor_map_all = [] # Keep track of all subgraphs
+ self.tensor_map_sg = [] # Keep track of one subgraph
self.ops_to_ignore = (Op.Const, Op.Placeholder, Op.SubgraphInput)
@@ -154,22 +160,20 @@ class TFLiteSerialiser:
buffer_map = {}
- buf_idx = 2
-
for tens in tensors:
# Set buffer ids depending on allocation
if tens.is_allocated_in_tensor_arena(scratch_tensor_mem_area):
- buffer_map[tens] = self.scratch_buf_id
+ buffer_map[tens] = TFLiteSerialiser.BUF_IDX_SCRATCH
elif tens.mem_type == MemType.Scratch_fast:
# For Scratch_fast when not co-allocated with scratch in the TensorArena:
- buffer_map[tens] = self.scratch_fast_buf_id
+ buffer_map[tens] = TFLiteSerialiser.BUF_IDX_SCRATCH_FAST
else:
- buffer_map[tens] = buf_idx
- buf_idx += 1
+ buffer_map[tens] = self.buf_idx
+ self.buf_idx += 1
- # Initialize buffers_to_write to a length equal to number of buffers so
+ # Initialize/extend buffers_to_write to a length equal to number of buffers so
# they can be appended at the correct index during tensor serialization
- self.buffers_to_write = [None] * (buf_idx)
+ self.buffers_to_write += [None] * (self.buf_idx)
return buffer_map
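A self-contained sketch of the buffer-numbering scheme above, with stand-in
names rather than the real TFLiteSerialiser: scratch and scratch_fast always
map to buffers 0 and 1, while every other tensor in every subgraph draws a
fresh index from a counter that is never reset between subgraphs, so tensors
in different subgraphs can never alias the same buffer:

    BUF_IDX_SCRATCH = 0
    BUF_IDX_SCRATCH_FAST = 1
    BUF_IDX_START = 2

    class BufferAssigner:
        def __init__(self):
            self.buf_idx = BUF_IDX_START

        def assign(self, tensors):
            buffer_map = {}
            for name, kind in tensors:  # kind: "arena", "fast" or "const"
                if kind == "arena":
                    buffer_map[name] = BUF_IDX_SCRATCH
                elif kind == "fast":
                    buffer_map[name] = BUF_IDX_SCRATCH_FAST
                else:
                    buffer_map[name] = self.buf_idx  # unique across subgraphs
                    self.buf_idx += 1
            return buffer_map

    a = BufferAssigner()
    print(a.assign([("in", "arena"), ("w0", "const")]))  # {'in': 0, 'w0': 2}
    print(a.assign([("w1", "const"), ("w2", "const")]))  # {'w1': 3, 'w2': 4}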
@@ -281,13 +285,13 @@ class TFLiteSerialiser:
builder = self.builder
inputs_offset = self.write_int_vector(
- [self.tensor_map[tens] if tens in self.tensor_map else -1 for tens in op.inputs]
+ [self.tensor_map_sg[tens] if tens in self.tensor_map_sg else -1 for tens in op.inputs]
)
outputs_offset = self.write_int_vector(
- [self.tensor_map[tens] for tens in op.outputs if tens in self.tensor_map]
+ [self.tensor_map_sg[tens] for tens in op.outputs if tens in self.tensor_map_sg]
)
intermediates_offset = self.write_int_vector(
- [self.tensor_map[tens] for tens in op.intermediates if tens in self.tensor_map]
+ [self.tensor_map_sg[tens] for tens in op.intermediates if tens in self.tensor_map_sg]
)
if op.type == Op.Custom:
@@ -331,9 +335,8 @@ class TFLiteSerialiser:
Operator.OperatorAddMutatingVariableInputs(builder, mutating_variable_inputs_offset)
return Operator.OperatorEnd(builder)
- def serialise_subgraph(self, sg):
+ def serialise_subgraph(self, sg, name):
builder = self.builder
- tensor_set = set()
all_ops = []
placeholder_ops = []
@@ -344,6 +347,14 @@ class TFLiteSerialiser:
elif op.type == Op.Placeholder:
placeholder_ops.append(op)
+ # Make sure all original tensors are written back, special case for Ops
+ # with connected subgraphs. Even though not all inputs are used,
+ # the reference kernel expects all inputs to be in the tflite file.
+ # Since we traverse the graph starting with all outputs they are
+ # always added but if an input is not referenced it will not be added
+ # to an op.
+ tensor_set = set(sg.original_inputs)
+
# Add the tensors from all valid ops, as well as the tensors from placeholder ops
# This allows us to serialise tensors which arent attached to any specific ops,
# e.g. due to an empty graph containing no ops
@@ -362,18 +373,19 @@ class TFLiteSerialiser:
assert len(scratch_tensors) == 1, "Multiple scratch tensors"
scratch_tensor = scratch_tensors[0]
- self.tensor_map = {tens: idx for idx, tens in enumerate(all_tensors)}
+ self.tensor_map_sg = {tens: idx for idx, tens in enumerate(all_tensors)}
self.buffer_map = self.assign_buffers_to_tensors(all_tensors, scratch_tensor)
+ self.tensor_map_all.append(self.tensor_map_sg)
tensors_offset = self.write_offset_vector([self.serialise_tensor(tens) for tens in all_tensors])
# Make sure the input_tensors haven't been modified
assert all(inp in sg.original_inputs for inp in sg.input_tensors)
- inputs = [self.tensor_map[tens] for tens in sg.original_inputs if tens in self.tensor_map]
+ inputs = [self.tensor_map_sg[tens] for tens in sg.original_inputs if tens in self.tensor_map_sg]
inputs_offset = self.write_int_vector(inputs)
outputs_offset = self.write_int_vector(
- [self.tensor_map[tens] for tens in sg.output_tensors if tens in self.tensor_map]
+ [self.tensor_map_sg[tens] for tens in sg.output_tensors if tens in self.tensor_map_sg]
)
operators_offset = self.write_offset_vector([self.serialise_operator(op) for op in all_ops])
@@ -384,6 +396,7 @@ class TFLiteSerialiser:
SubGraph.SubGraphAddOutputs(builder, outputs_offset)
SubGraph.SubGraphAddOperators(builder, operators_offset)
+ SubGraph.SubGraphAddName(builder, name)
return SubGraph.SubGraphEnd(builder)
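A toy showing why tensor_set is seeded with sg.original_inputs in this hunk:
collecting tensors only from op inputs and outputs drops a subgraph input that
no op consumes, yet the reference kernels still expect it in the file (names
and structure here are illustrative):

    ops = [{"inputs": ["x"], "outputs": ["y"]}]  # "flag" is never consumed
    original_inputs = ["x", "flag"]

    tensor_set = set()                           # old behaviour
    for op in ops:
        tensor_set.update(op["inputs"] + op["outputs"])
    assert "flag" not in tensor_set              # input would be lost

    tensor_set = set(original_inputs)            # fixed behaviour
    for op in ops:
        tensor_set.update(op["inputs"] + op["outputs"])
    assert "flag" in tensor_set                  # serialised as expected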
@@ -427,26 +440,32 @@ class TFLiteSerialiser:
description = builder.CreateString("Vela Optimised")
- subgraph_offset = self.write_offset_vector([self.serialise_subgraph(sg) for sg in self.subgraphs_to_write])
+ subgraph_offset = self.write_offset_vector(
+ [self.serialise_subgraph(sg, builder.CreateString(sg.name)) for sg in self.subgraphs_to_write]
+ )
# Fill the metadata buffer
version = np.int32(0)
- subgraph_idx = np.int32(len(self.subgraphs_to_write)) # Only 1 supported currently
- nbr_tensors = np.int32(len(self.tensor_map))
+ subgraph_idx = np.int32(len(self.subgraphs_to_write))
+
+ nbr_tensors_all = np.sum([len(tensor_map_sg) for tensor_map_sg in self.tensor_map_all], dtype=np.int32)
+
+ offlineAlloc = [version, subgraph_idx, nbr_tensors_all]
if not any([name == b"OfflineMemoryAllocation" for name, _ in self.nng.metadata]):
- # An offset of -1 indicates that the tensor will be allocated online by Tensorflow Lite Micro
- offsets = [np.int32(-1)] * nbr_tensors
-
- # Ensure that the order of the offsets match the order of the tensors
- for tens, idx in self.tensor_map.items():
- # Set offsets for tensor allocated in Tensor Arena or in the scratch_fast area
- if tens.mem_type in (MemType.Scratch, MemType.Scratch_fast):
- offsets[idx] = np.int32(tens.address) if tens.address is not None else np.int32(0)
-
- self.nng.metadata.append(
- ("OfflineMemoryAllocation", np.array([version, subgraph_idx, nbr_tensors] + offsets))
- )
+ for tensor_map_sg in self.tensor_map_all:
+ nbr_tensors_sg = np.int32(len(tensor_map_sg))
+ # An offset of -1 indicates that the tensor will be allocated online by Tensorflow Lite Micro
+ offsets = [np.int32(-1)] * nbr_tensors_sg
+ # Ensure that the order of the offsets match the order of the tensors
+ for tens, idx in tensor_map_sg.items():
+ # Set offsets for tensor allocated in Tensor Arena or in the scratch_fast area
+ if tens.mem_type in (MemType.Scratch, MemType.Scratch_fast):
+ offsets[idx] = np.int32(tens.address) if tens.address is not None else np.int32(0)
+
+ offlineAlloc += offsets
+
+ self.nng.metadata.append(("OfflineMemoryAllocation", np.array(offlineAlloc)))
metadata_list = []
for name, buffer in self.nng.metadata:
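A sketch, using NumPy only, of the OfflineMemoryAllocation metadata layout
produced above for multiple subgraphs; the per-subgraph tensor counts and
addresses are invented for illustration. The layout is [version, subgraph
count, total tensor count, offsets for every tensor of subgraph 0, then
subgraph 1, ...], where -1 means the tensor is allocated online by
TensorFlow Lite Micro:

    import numpy as np

    version = np.int32(0)
    # one offset per tensor, per subgraph: arena/scratch_fast tensors get
    # their address, everything else gets -1 (made-up values)
    sg_offsets = [
        [np.int32(0), np.int32(1024), np.int32(-1)],  # main
        [np.int32(-1)],                               # while_cond
        [np.int32(-1), np.int32(2048)],               # while_body
    ]
    subgraph_count = np.int32(len(sg_offsets))
    nbr_tensors_all = np.int32(sum(len(o) for o in sg_offsets))

    offline_alloc = [version, subgraph_count, nbr_tensors_all]
    for offsets in sg_offsets:
        offline_alloc += offsets
    metadata = np.array(offline_alloc)
    print(metadata)  # [0 3 6 0 1024 -1 -1 -1 2048]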