-rw-r--r--  DEBUG_DB.md                                    72
-rw-r--r--  ethosu/vela/high_level_command_to_npu_op.py    19
-rw-r--r--  ethosu/vela/nn_graph.py                         3
-rw-r--r--  ethosu/vela/vela.py                            37
4 files changed, 120 insertions(+), 11 deletions(-)
diff --git a/DEBUG_DB.md b/DEBUG_DB.md
new file mode 100644
index 00000000..2b530c5e
--- /dev/null
+++ b/DEBUG_DB.md
@@ -0,0 +1,72 @@
+# Debug database
+
+The purpose of the debug database is to track operator transformations during
+Vela's optimisation process. The database can later be correlated, externally,
+with the trace output of the model to determine the runtime of the original
+layer operators. On its own, the debug database gives a brief overview of how
+the operators in the network change throughout the optimisation process. This
+document describes the structure of the database and its outputs, to help with
+parsing the generated data in a debug procedure.
+
+# Contents
+
+While processing, Vela maintains information about operator substitutions and
+command generation in its internal Debug Database. The database tracks the data
+transformations through the following states:
+
+- Creation of Source operators - these operators are created from the source
+  representation, in this case the original TFLite file.
+- Creation of Optimised operators - these are the operators that result from
+  optimising the source operators. They may be source operators passed through
+  unchanged, or substitute operators inserted by the optimiser.
+- Creation of Queue commands - these are the register command sequences
+  generated by the code generator from the optimised operators.
+
+Vela's processing steps add data to internal debug tables, one table for each
+of the above states. When Vela has completed processing, it can write out the
+internal debug tables via the command line option "--enable-debug-db".
+
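+For example, assuming the vela entry point is installed and on the path:
+
+    vela network.tflite --enable-debug-db
+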
+# File Format
+
+The internal debug tables are formatted as columnar CSV. Each row represents an
+operator or stream command, keyed on a numeric value that uniquely identifies
+that operator or command. These tables are further packaged into an XML
+container file, along with metadata, for easier transport and handling.
+
+**Debug node**
+
+The top-level debug node wraps the entire file and contains information about
+the source and optimised file paths.
+
+<debug optimised="output_from_vela.tflite" source="input.tflite">
+
+**Table nodes**
+
+The top-level debug node contains one or more table nodes. Each table node is
+named, and the table data is stored as CSV-formatted text in a CDATA section.
+The first row of the table contains the column headers.
+
+<table name="source"><![CDATA[ "column0", "column1", "column2", ...
+
+There are currently four named tables:
+
+- "source" - Table of TFLite operators from the source file.
+- "optimised" - Table of optimised operators generated by Vela.
+- "queue" - Table of command queue offsets.
+- "cmdstream" - Table describing the properties of one or more command streams.
+
+The tables reference each other through the following connections: the Queue
+table is linked to the Optimised table through the *optimised_id*, and to the
+command stream table through the *cmdstream_id*. The Optimised table is in turn
+linked to the Source table through the *source_id*.
+
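+A minimal sketch of how the container could be unpacked, using only the Python
+standard library (the file name below is a placeholder for the generated
+"_debug.xml" file that Vela writes next to the optimised network):
+
+    import csv
+    import io
+    import xml.etree.ElementTree as ET
+
+    root = ET.parse("network_debug.xml").getroot()
+    print(root.get("source"), "->", root.get("optimised"))
+
+    # Read each CDATA payload as CSV; the first row holds the column headers.
+    tables = {}
+    for node in root.findall("table"):
+        text = io.StringIO(node.text.strip())
+        rows = list(csv.reader(text, skipinitialspace=True))
+        header, data = rows[0], rows[1:]
+        tables[node.get("name")] = [dict(zip(header, row)) for row in data]
+
+    print(sorted(tables))  # expected: cmdstream, optimised, queue, source
+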
+# Ordering
+
+Note that the source, cmdstream and optimised tables are not ordered in a
+meaningful way. The insertion order of entries in the source and optimised
+tables is arbitrary - a side effect of traversal and optimisation. No attempt
+should be made to interpret the entries as a graph. The only ordered table is
+the queue table, which is ordered by its queue offset. This table describes the
+execution order of the hardware commands, and these commands can be mapped back
+to the optimised and source operators to determine the execution order of the
+original operators.
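+
+Continuing the sketch above, the queue table can be joined back to the
+optimised and source tables to recover that execution order. The exact column
+names below ("id", "offset", "optimised_id", "source_id") are assumptions based
+on the links described in this document and may differ in the generated file:
+
+    optimised = {row["id"]: row for row in tables["optimised"]}
+    source = {row["id"]: row for row in tables["source"]}
+
+    # Queue entries sorted by offset give the hardware execution order; each
+    # entry maps to an optimised operator and, in turn, a source operator.
+    for cmd in sorted(tables["queue"], key=lambda row: int(row["offset"])):
+        opt = optimised[cmd["optimised_id"]]
+        src = source[opt["source_id"]]
+        print(cmd["offset"], opt["id"], src["id"])
\ No newline at end of file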
diff --git a/ethosu/vela/high_level_command_to_npu_op.py b/ethosu/vela/high_level_command_to_npu_op.py
index 8d6fc871..b5e7b4b9 100644
--- a/ethosu/vela/high_level_command_to_npu_op.py
+++ b/ethosu/vela/high_level_command_to_npu_op.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2020 Arm Limited or its affiliates. All rights reserved.
+# Copyright (C) 2020-2021 Arm Limited or its affiliates. All rights reserved.
#
# SPDX-License-Identifier: Apache-2.0
#
@@ -501,13 +501,14 @@ def generate_register_command_stream_for_sg(nng, sg, arch, verbose=False):
             npu_op_list.append(npu_op)
             npu_op_to_cmd[npu_op] = cmd
     # Generate register commands
-    stream_id = DebugDatabase.add_stream(sg)
-    DebugDatabase.set_stream_offset(sg, 0)  # Default to zero, can only set during file writing
+    if len(sg.high_level_command_stream) > 0:
+        stream_id = DebugDatabase.add_stream(sg)
+        sg.generated_stream_id = stream_id
 
-    def add_to_debug_db(npu_op: NpuOperation, offset: int):
-        """Adds info to the debug database"""
-        if not isinstance(npu_op, NpuDmaOperation):
-            cmd = npu_op_to_cmd[npu_op]
-            DebugDatabase.add_command(stream_id, offset, cmd.ps.primary_op)
+        def add_to_debug_db(npu_op: NpuOperation, offset: int):
+            """Adds info to the debug database"""
+            if not isinstance(npu_op, NpuDmaOperation):
+                cmd = npu_op_to_cmd[npu_op]
+                DebugDatabase.add_command(stream_id, offset, cmd.ps.primary_op)
 
-    sg.register_command_stream = generate_command_stream(npu_op_list, arch, verbose, add_to_debug_db, npu_op_to_cmd)
+        sg.register_command_stream = generate_command_stream(npu_op_list, arch, verbose, add_to_debug_db, npu_op_to_cmd)
diff --git a/ethosu/vela/nn_graph.py b/ethosu/vela/nn_graph.py
index 71d4e614..db878bc3 100644
--- a/ethosu/vela/nn_graph.py
+++ b/ethosu/vela/nn_graph.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2020 Arm Limited or its affiliates. All rights reserved.
+# Copyright (C) 2020-2021 Arm Limited or its affiliates. All rights reserved.
#
# SPDX-License-Identifier: Apache-2.0
#
@@ -150,6 +150,7 @@ class Subgraph:
         self.flash_tensor = None
         # Scratch information locally used in the scheduler
         self.scheduling_info = {}
+        self.generated_stream_id = None
 
         self.memory_used = {}
         self.memory_used_per_type = {}
diff --git a/ethosu/vela/vela.py b/ethosu/vela/vela.py
index bfc76ec9..c4510b18 100644
--- a/ethosu/vela/vela.py
+++ b/ethosu/vela/vela.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2020 Arm Limited or its affiliates. All rights reserved.
+# Copyright (C) 2020-2021 Arm Limited or its affiliates. All rights reserved.
#
# SPDX-License-Identifier: Apache-2.0
#
@@ -23,6 +23,8 @@ import os
 import sys
 import time
 
+import flatbuffers
+
 from . import architecture_features
 from . import compiler_driver
 from . import model_reader
@@ -39,6 +41,7 @@ from .scheduler import ParetoMetric
 from .supported_operators import SupportedOperators
 from .tensor import MemArea
 from .tensor import Tensor
+from .tflite.Model import Model
 from .tflite_mapping import builtin_operator_map
 from .tflite_mapping import builtin_type_name
 from ethosu.vela.architecture_features import ArchitectureFeatures
@@ -80,6 +83,11 @@ def process(input_name, enable_debug_db, arch, model_reader_options, compiler_op
         tflite_writer.write_tflite(nng, output_filename)
 
     if enable_debug_db:
+        file_offsets = calculate_operator_file_offsets(output_filename)
+        for idx, offset in enumerate(sorted(file_offsets)):
+            sg = find_subgraph_with_command_stream_order(nng, idx)
+            if sg is not None:
+                DebugDatabase.set_stream_offset(sg, offset)
         debug_filename = output_basename + "_debug.xml"
         DebugDatabase.write(debug_filename, input_name, output_filename)
 
@@ -90,6 +98,33 @@ def process(input_name, enable_debug_db, arch, model_reader_options, compiler_op
     return nng
 
 
+def find_subgraph_with_command_stream_order(nng, idx):
+    for sg in nng.subgraphs:
+        if sg.generated_stream_id == idx:
+            return sg
+    return None
+
+
+def calculate_operator_file_offsets(name: str):
+    # Read the vela optimized tflite file
+    with open(name, "rb") as f:
+        buf = bytearray(f.read())
+    # Calculate the file offsets for each custom operator
+    file_offsets = []
+    model = Model.GetRootAsModel(buf, 0)
+    for idx in range(model.SubgraphsLength()):  # However only one subgraph is supported as of now
+        sg = model.Subgraphs(idx)
+        for idx in range(sg.OperatorsLength()):
+            operator = sg.Operators(idx)
+            if model.OperatorCodes(operator.OpcodeIndex()).CustomCode() is not None:
+                tensor_idx = operator.Inputs(0)
+                tensor = sg.Tensors(tensor_idx)
+                buffer = model.Buffers(tensor.Buffer())
+                offset = flatbuffers.number_types.UOffsetTFlags.py_type(buffer._tab.Offset(4))
+                file_offsets.append(buffer._tab.Vector(offset))
+    return file_offsets
+
+
 def print_subgraph_io_summary(nng):
     """Print a summary of all the input and output tensor sizes for all subgraphs.
     Also displays the total tensor size and the memory used area for sram.