From ad45f792e699fe6abdc381f62690801aa50bd412 Mon Sep 17 00:00:00 2001
From: "erik.andersson@arm.com" <erik.andersson@arm.com>
Date: Wed, 3 Feb 2021 10:20:16 +0100
Subject: MLBEDSW-3509: Updated the debug database to support multiple custom
 operators.

Previously the debug database lost some operators in the debug database outputs when multiple custom operators were generated by Vela.
Also, the file offsets for command streams were always 0, even for a single custom operator. This patch should rectify these problems.

Signed-off-by: erik.andersson@arm.com <erik.andersson@arm.com>
Change-Id: Ieb072440d4f1806d4833a676683b4f42f431f3df
---
 DEBUG_DB.md                                 | 72 +++++++++++++++++++++++++++++
 ethosu/vela/high_level_command_to_npu_op.py | 19 ++++----
 ethosu/vela/nn_graph.py                     |  3 +-
 ethosu/vela/vela.py                         | 37 ++++++++++++++-
 4 files changed, 120 insertions(+), 11 deletions(-)
 create mode 100644 DEBUG_DB.md
diff --git a/DEBUG_DB.md b/DEBUG_DB.md
new file mode 100644
index 00000000..2b530c5e
--- /dev/null
+++ b/DEBUG_DB.md
@@ -0,0 +1,72 @@
+# Debug database
+
+The purpose of the debug database is to track operator transformations during
+the optimisation process of Vela. This is later correlated with the trace
+output of the model, externally, to determine the runtime of the original layer
+operators. Standalone, the debug database can be used in order to give a brief
+overview of how the operators in the network change throughout the optimisation
+process. This document gives an overview of the structure of the database and
+its outputs, to help parsing of the generated data in a debug procedure.
+
+# Contents
+
+While processing, Vela maintains information about operator substitutions and
+command generation in its internal Debug Database. The database tracks the data
+transformations through the following states:
+
+- Creation of Source operators - these operators are created from the source
+representation, in this case the original TFLite file.
+- Creation of Optimised operators - these are the operators that result from
+optimising the source operators. They may be the source operators repeated,
+or substitute operators inserted by the optimiser.
+- Creation of Queue commands - these are the register command sequences
+generated by the code generator from the optimised operators.
+
+Vela's processing steps add data to internal debug tables; one table for each
+of the above states. When Vela has completed processing, it can write out the
+internal debug tables through the command line option "--enable-debug-db".
+
+# File Format
+
+The internal debug tables are formatted as columnar CSV. Each row represents an
+operator or stream command, keyed on a numeric value that uniquely identifies
+that operator or command. These tables are further packaged into an XML
+container file, along with metadata, for easier transport and handling.
+
+**Debug node**
+
+The top-level debug node wraps the entire file and contains information about
+the source and optimised file paths.
+
+`<debug optimised="output_from_vela.tflite" source="input.tflite">`
+
+**Table nodes**
+
+The top-level debug node contains one or more table nodes. Each table node is
+named, and the table data is represented as CSV formatted text stored in a
+CDATA payload tag. The first row of the table contains column headers.
+
+`<table name="source"><![CDATA[ "column0", "column1", "column2", ...`
+
+There are currently four named tables.
+
+- "source" - Table of TFLite operators from the source file.
+- "optimised" - Table of optimised operators generated by Vela.
+- "queue" - Table of command queue offsets.
+- "cmdstream" - Table describing properties of one or more command streams.
+
+
+The tables reference each other through the following connections: the Queue
+table is linked to the Optimised table through the *optimised_id* and to the
+command stream table through the *cmdstream_id*. The Optimised table is in turn
+linked to the Source table through the *source_id*.
+
+# Ordering
+
+Note that the source, cmdstream and optimised tables are not ordered in a
+meaningful way. The insertion order of entries in the source and optimised
+tables is arbitrary - a side effect of traversal and optimisation. No attempt
+should be made to interpret the entries as a graph. The only ordered table is
+the queue table, which is ordered by its queue offset. This table describes the
+execution order of the hardware commands, and they can be mapped back to the
+optimised and source operators in order to determine their execution order.
\ No newline at end of file
diff --git a/ethosu/vela/high_level_command_to_npu_op.py b/ethosu/vela/high_level_command_to_npu_op.py
index 8d6fc871..b5e7b4b9 100644
--- a/ethosu/vela/high_level_command_to_npu_op.py
+++ b/ethosu/vela/high_level_command_to_npu_op.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2020 Arm Limited or its affiliates. All rights reserved.
+# Copyright (C) 2020-2021 Arm Limited or its affiliates. All rights reserved.
 #
 # SPDX-License-Identifier: Apache-2.0
 #
@@ -501,13 +501,14 @@ def generate_register_command_stream_for_sg(nng, sg, arch, verbose=False):
             npu_op_list.append(npu_op)
             npu_op_to_cmd[npu_op] = cmd
     # Generate register commands
-    stream_id = DebugDatabase.add_stream(sg)
-    DebugDatabase.set_stream_offset(sg, 0)  # Default to zero, can only set during file writing
+    if len(sg.high_level_command_stream) > 0:
+        stream_id = DebugDatabase.add_stream(sg)
+        sg.generated_stream_id = stream_id
 
-    def add_to_debug_db(npu_op: NpuOperation, offset: int):
-        """Adds info to the debug database"""
-        if not isinstance(npu_op, NpuDmaOperation):
-            cmd = npu_op_to_cmd[npu_op]
-            DebugDatabase.add_command(stream_id, offset, cmd.ps.primary_op)
+        def add_to_debug_db(npu_op: NpuOperation, offset: int):
+            """Adds info to the debug database"""
+            if not isinstance(npu_op, NpuDmaOperation):
+                cmd = npu_op_to_cmd[npu_op]
+                DebugDatabase.add_command(stream_id, offset, cmd.ps.primary_op)
 
-    sg.register_command_stream = generate_command_stream(npu_op_list, arch, verbose, add_to_debug_db, npu_op_to_cmd)
+        sg.register_command_stream = generate_command_stream(npu_op_list, arch, verbose, add_to_debug_db, npu_op_to_cmd)
diff --git a/ethosu/vela/nn_graph.py b/ethosu/vela/nn_graph.py
index 71d4e614..db878bc3 100644
--- a/ethosu/vela/nn_graph.py
+++ b/ethosu/vela/nn_graph.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2020 Arm Limited or its affiliates. All rights reserved.
+# Copyright (C) 2020-2021 Arm Limited or its affiliates. All rights reserved.
 #
 # SPDX-License-Identifier: Apache-2.0
 #
@@ -150,6 +150,7 @@ class Subgraph:
         self.flash_tensor = None
         # Scratch information locally used in the scheduler
         self.scheduling_info = {}
+        self.generated_stream_id = None
 
         self.memory_used = {}
         self.memory_used_per_type = {}
diff --git a/ethosu/vela/vela.py b/ethosu/vela/vela.py
index bfc76ec9..c4510b18 100644
--- a/ethosu/vela/vela.py
+++ b/ethosu/vela/vela.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2020 Arm Limited or its affiliates. All rights reserved.
+# Copyright (C) 2020-2021 Arm Limited or its affiliates. All rights reserved.
 #
 # SPDX-License-Identifier: Apache-2.0
 #
@@ -23,6 +23,8 @@ import os
 import sys
 import time
 
+import flatbuffers
+
 from . import architecture_features
 from . import compiler_driver
 from . import model_reader
@@ -39,6 +41,7 @@ from .scheduler import ParetoMetric
 from .supported_operators import SupportedOperators
 from .tensor import MemArea
 from .tensor import Tensor
+from .tflite.Model import Model
 from .tflite_mapping import builtin_operator_map
 from .tflite_mapping import builtin_type_name
 from ethosu.vela.architecture_features import ArchitectureFeatures
@@ -80,6 +83,11 @@ def process(input_name, enable_debug_db, arch, model_reader_options, compiler_op
         tflite_writer.write_tflite(nng, output_filename)
 
     if enable_debug_db:
+        file_offsets = calculate_operator_file_offsets(output_filename)
+        for idx, offset in enumerate(sorted(file_offsets)):
+            sg = find_subgraph_with_command_stream_order(nng, idx)
+            if sg is not None:
+                DebugDatabase.set_stream_offset(sg, offset)
         debug_filename = output_basename + "_debug.xml"
         DebugDatabase.write(debug_filename, input_name, output_filename)
 
@@ -90,6 +98,33 @@ def process(input_name, enable_debug_db, arch, model_reader_options, compiler_op
     return nng
 
 
+def find_subgraph_with_command_stream_order(nng, idx):  # subgraph whose generated_stream_id == idx
+    for sg in nng.subgraphs:  # linear scan; the first match wins
+        if sg.generated_stream_id == idx:  # compared against the id stored on the subgraph
+            return sg
+    return None  # no subgraph in nng carries this stream id
+
+
+def calculate_operator_file_offsets(name: str):
+    # Load the whole Vela-optimised tflite file (path given by name) into memory
+    with open(name, "rb") as f:
+        buf = bytearray(f.read())
+    # Collect one offset per custom operator, in subgraph/operator iteration order
+    file_offsets = []
+    model = Model.GetRootAsModel(buf, 0)
+    for idx in range(model.SubgraphsLength()):  # However only one subgraph is supported as of now
+        sg = model.Subgraphs(idx)
+        for idx in range(sg.OperatorsLength()):  # NOTE(review): reuses the outer loop's name idx
+            operator = sg.Operators(idx)
+            if model.OperatorCodes(operator.OpcodeIndex()).CustomCode() is not None:  # custom operators only
+                tensor_idx = operator.Inputs(0)  # presumably the command stream tensor - TODO confirm
+                tensor = sg.Tensors(tensor_idx)
+                buffer = model.Buffers(tensor.Buffer())
+                offset = flatbuffers.number_types.UOffsetTFlags.py_type(buffer._tab.Offset(4))
+                file_offsets.append(buffer._tab.Vector(offset))  # presumably where the data starts in buf
+    return file_offsets
+
+
 def print_subgraph_io_summary(nng):
     """Print a summary of all the input and output tensor sizes for all subgraphs.
     Also displays the total tensor size and the memory used area for sram.
-- 
cgit v1.2.1