From cd03504cfc29767d33d37b5c587116ab90752d74 Mon Sep 17 00:00:00 2001
From: Tim Hall <tim.hall@arm.com>
Date: Tue, 8 Aug 2023 14:10:17 +0100
Subject: MLBEDSW-7689: Document verbose command stream options

 - Documented High-Level and Register-Level command stream options
 - Changed High-Level command stream display to show the name of the
   command
 - Fixed an issue with some operators not being displayed by the CLI
   option --verbose-operators
 - Changed an unneeded print in pass packing to a more useful assertion

Change-Id: I9d53f19f4e32d0478209bc964724c27c935f66d6
Signed-off-by: Tim Hall <tim.hall@arm.com>
---
 OPTIONS.md                                       | 47 ++++++++++++++++++---
 ethosu/vela/high_level_command_stream.py         |  6 +--
 ethosu/vela/nn_graph.py                          | 54 ++++++++----------------
 ethosu/vela/pass_packing.py                      |  2 +-
 ethosu/vela/register_command_stream_generator.py |  2 +
 5 files changed, 63 insertions(+), 48 deletions(-)

diff --git a/OPTIONS.md b/OPTIONS.md
index 38f65a7e..646444e1 100644
--- a/OPTIONS.md
+++ b/OPTIONS.md
@@ -398,8 +398,8 @@ vela network.tflite --verbose-performance
 Displays a list of all operators and the tensors that are connected to them.
 Additional information is shown about the tensors. The format is:
-`<num> <op_type> <op_name>`, where;
-`<tensor details>`, where;
+`<num>: <op_type> - <attributes> - <op_name>
+<tensor details>`, where;
 num = an increasing operator count
 op_type = the Graph IR Operator Type
 op_name = the Graph IR Operator Name (this may have been derived from the
 corresponding TFLite operator name)
@@ -466,9 +466,32 @@ vela network.tflite --verbose-allocation
 
 ### Verbose High Level Command Stream
 
-Display a high level command stream with one command per DMA or NPU stripe. The
-commands contain information about block configuration as well as IFM-, OFM-
-and weight boxes.
+Display an enumerated list of High-Level (HL) commands in execution
+order. There are three types of command and each type displays its own
+information:
+
+* NPU Stripe = `<name> <ifm_box> <ifm2_box> <ofm_box> <weight_box>
+<block_config>`, represents a data processing operation that maps directly to
+a single Ethos-U operation where;
+name = name of the pass that corresponds to this HL command (not unique)
+ifm_box = part of the IFM in NHWC format
+ifm2_box = part of the IFM2 in NHWC format (is empty [] when not present)
+ofm_box = part of the OFM in NHWC format
+weight_box = part of the filter kernel in NHWC format
+block_config = block processing size in HWIO format
+
+* DMA = `<name> <in> <out> <box>`, represents a memory copy operation from
+source to destination where;
+name = name of the pass that corresponds to this HL command (not unique)
+in = name of the source tensor
+out = name of the destination tensor
+box = part of the source tensor in NHWC format
+
+* NOP = `<name> <in> <out>`, represents a memory copy operation that has source
+equal to destination and therefore does nothing, where;
+name = name of the pass that corresponds to this HL command (not unique)
+in = name of the input tensor
+out = name of the output tensor
 
 ```bash
 vela network.tflite --verbose-high-level-command-stream
@@ -476,8 +499,18 @@ vela network.tflite --verbose-high-level-command-stream
 
 ### Verbose Register Command Stream
 
-Display all NPU operations and a register level (low level) command stream with
-all register settings for the network execution on the NPU.
+Display two groups of information. The first group is the input to the register
+command stream generator. The second group is the output of the register
+command stream generator:
+
+* Input = an enumerated list of the High-Level commands that are the input to
+the generator. Each command details all of its attributes.
+
+* Output = a disassembly of the Ethos-U command stream (referred to as the
+register command stream). More information about the commands listed in the
+register command stream can be found in the Arm Ethos-U NPU Technical Reference
+Manuals that are available from the Arm Developer website (see
+[README - Resources](README.md#resources)).
 
 ```bash
 vela network.tflite --verbose-register-command-stream
diff --git a/ethosu/vela/high_level_command_stream.py b/ethosu/vela/high_level_command_stream.py
index 09c1805d..8403b94f 100644
--- a/ethosu/vela/high_level_command_stream.py
+++ b/ethosu/vela/high_level_command_stream.py
@@ -236,7 +236,7 @@ class NpuStripe(Command):
         return True
 
     def __str__(self):
-        return "<NPUStripe: %s, ifm_box=%s, ifm2_box=%s, ofm_box=%s, weight_box=%s, block_config=%s>" % (
+        return "<NpuStripe: name=%s, ifm_box=%s, ifm2_box=%s, ofm_box=%s, weight_box=%s, block_config=%s>" % (
             self.ps.name,
             self.ifm_box,
             self.ifm2_box,
@@ -286,7 +286,7 @@ class DMA(Command):
         self.box = box
 
     def __str__(self):
-        return "<DMA: in=%s, out=%s, box=%s>" % (self.in_tensor.name, self.out_tensor.name, self.box)
+        return f"<DMA: name={self.ps.name}, in={self.in_tensor.name}, out={self.out_tensor.name}, box={self.box}>"
 
     __repr__ = __str__
 
@@ -302,7 +302,7 @@ class NOP(Command):
         self.out_tensor = out_tensor
 
     def __str__(self):
-        return f"<NOP: in={self.in_tensor}, out={self.out_tensor}>"
+        return f"<NOP: name={self.ps.name}, in={self.in_tensor.name}, out={self.out_tensor.name}>"
 
     __repr__ = __str__
 
diff --git a/ethosu/vela/nn_graph.py b/ethosu/vela/nn_graph.py
index a43aac2a..6dc6b583 100644
--- a/ethosu/vela/nn_graph.py
+++ b/ethosu/vela/nn_graph.py
@@ -338,41 +338,21 @@ class Subgraph:
 
         return all_ops
 
-    def print_operators(self):
-        print("print_operators()", self.name)
-        all_ops = self.get_all_ops()
-        unique_ops = []
-        for op in all_ops:
-            if op.type in (Op.Const, Op.Identity, Op.Placeholder):
-                continue
-
-            attrs = op.attrs.copy()
-            if op.type in (Op.Conv2D, Op.Conv2DBias, Op.DepthwiseConv2DBias):
-                kshape = op.inputs[1].shape
-                attrs["kshape"] = [kshape[0], kshape[1]]
-            attrs["type"] = op.type.name
-            attrs.pop("use_cudnn_on_gpu", None)
-            custom_options = attrs.pop("custom_options", None)
-            if attrs not in unique_ops:
-                unique_ops.append(attrs)
-                # print attributes in human readable format
-                a = attrs.copy()
-                if custom_options is not None:
-                    a["custom_options"] = custom_options
-                s = a.pop("type")
-                data_format = a.pop("data_format", None)
-                if data_format and data_format != b"NHWC":
-                    s += " " + str(data_format)
-                t = a.pop("T", None)
-                if t:
-                    s += " " + str(t)[9:-2]
-                srct = a.pop("SrcT", None)
-                if srct:
-                    s += " " + str(srct)[9:-2]
-                dstt = a.pop("DstT", None)
-                if dstt:
-                    s += "->" + str(dstt)[9:-2]
-                print(s + " " + str(a))
+    def print_operators(self, ignore_placeholder_const=True, show_attributes=True):
+        print(f"Operators of Subgraph {self.name}")
+
+        ignore_ops = (Op.Const, Op.Identity, Op.Placeholder) if ignore_placeholder_const else ()
+        all_ops = [op for op in self.get_all_ops() if op.type not in ignore_ops]
+
+        if len(all_ops) > 0:
+            max_op_type_len = max([len(op.type.name) for op in all_ops])
+
+            for idx, op in enumerate(all_ops):
+                attrs_str = f" - {op.attrs}" if show_attributes else ""
+                print(f"{idx:3}: {op.type:{max_op_type_len}}{attrs_str} - {op.name}")
+
+        else:
+            print("No Operators")
 
     def print_graph(self, label=None):
         if label:
@@ -562,9 +542,9 @@ class Graph:
         for sg in self.subgraphs:
             sg.refresh_after_modification()
 
-    def print_operators(self):
+    def print_operators(self, ignore_placeholder_const=True, show_attributes=True):
         for sg in self.subgraphs:
-            sg.print_operators()
+            sg.print_operators(ignore_placeholder_const, show_attributes)
 
     def print_graph(self, label=None):
         for sg in self.subgraphs:
diff --git a/ethosu/vela/pass_packing.py b/ethosu/vela/pass_packing.py
index 932f701b..4c733cce 100644
--- a/ethosu/vela/pass_packing.py
+++ b/ethosu/vela/pass_packing.py
@@ -271,7 +271,7 @@ def pack_into_passes(nng, arch, verbose_packing=False):
                         assert ifm_tensor.purpose == TensorPurpose.FeatureMap
 
                 if operation_set is None:
-                    print("Warning:", curr_op.type, "operation is unknown or unsupported, placing on CPU")
+                    assert not curr_op.run_on_npu  # operator should have been placed on the CPU
 
             for inp in reversed(curr_op.inputs):
                 if inp is None:
diff --git a/ethosu/vela/register_command_stream_generator.py b/ethosu/vela/register_command_stream_generator.py
index 71fec3be..56aae73d 100644
--- a/ethosu/vela/register_command_stream_generator.py
+++ b/ethosu/vela/register_command_stream_generator.py
@@ -1052,6 +1052,7 @@ def generate_command_stream(
     """
     emit = CommandStreamEmitter()
     if verbose:
+        print("Register-Level Command Stream: Input")
         print_operations(npu_op_list, npu_op_to_cmd)
     # Calculate memory accesses for every operation
     memory_accesses: Dict[NpuOperation, MemoryAccessSet] = {}
@@ -1105,6 +1106,7 @@ def generate_command_stream(
     )
 
     if verbose:
+        print("Register-Level Command Stream: Output")
         emit.print_cmds()
         print(f"Number of commands = {len(emit.cmd_stream)}")
         print(f"Command stream length = {emit.size_in_bytes()} bytes")
-- 
cgit v1.2.1
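
Editor's note: for reviewers who want to preview the aligned operator listing
introduced by the new Subgraph.print_operators() without running Vela, the
following is a minimal standalone sketch of the same column-alignment approach.
The operator types, names and attribute dictionaries are hypothetical stand-ins
rather than Vela Op objects, and the script has no dependency on the Vela
codebase.

```python
# Standalone sketch of the aligned operator listing format used by the new
# Subgraph.print_operators(). The (op_type, op_name, attrs) tuples below are
# hypothetical stand-ins for Vela's Op objects.
ops = [
    ("Conv2DBias", "model/conv1/Conv2D", {"stride_h": 1, "stride_w": 1}),
    ("MaxPool", "model/pool1/MaxPool", {"filter_height": 2, "filter_width": 2}),
    ("FullyConnected", "model/fc1/MatMul", {}),
]
show_attributes = True

if ops:
    # Pad every operator type to the width of the longest one so the attribute
    # and name columns line up; this mirrors max_op_type_len in the patch.
    max_op_type_len = max(len(op_type) for op_type, _, _ in ops)
    for idx, (op_type, op_name, attrs) in enumerate(ops):
        attrs_str = f" - {attrs}" if show_attributes else ""
        print(f"{idx:3}: {op_type:{max_op_type_len}}{attrs_str} - {op_name}")
else:
    print("No Operators")
```

Deriving the pad width from the longest operator type present keeps the
attribute and name columns aligned for any mix of operators, which is the
design choice the patch makes for the --verbose-operators output.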