From db5124c2b5e10b34c61b3e016bb597ba1c1574df Mon Sep 17 00:00:00 2001
From: Diqing Zhong
Date: Mon, 11 Jan 2021 12:52:48 +0100
Subject: MLBEDSW-3144: Add weights compression ratio

- Also removed the original bit_per_element

Change-Id: I51bfbd28e14f316aae2d542bb610a3ed57b8b53b
Signed-off-by: Diqing Zhong
---
 ethosu/vela/nn_graph.py          |  6 +++---
 ethosu/vela/npu_serialisation.py | 10 ----------
 ethosu/vela/stats_writer.py      | 23 ++++++++++-------------
 ethosu/vela/tensor_allocation.py | 14 +++++++++-----
 4 files changed, 22 insertions(+), 31 deletions(-)

diff --git a/ethosu/vela/nn_graph.py b/ethosu/vela/nn_graph.py
index d2c848ad..71d4e614 100644
--- a/ethosu/vela/nn_graph.py
+++ b/ethosu/vela/nn_graph.py
@@ -512,9 +512,9 @@ class Graph:
         self.subgraphs = []
         self.metadata = []
         self.memory_used = {}
-        self.bits_per_element = {}
-        self.total_size = {}
-        self.total_elements = {}
+        self.weights_compression_ratio = 0
+        self.total_original_weights = 0
+        self.total_compressed_weights = 0
         self.weight_cache = None  # See CompressedWeightCache

     def get_root_subgraph(self):
diff --git a/ethosu/vela/npu_serialisation.py b/ethosu/vela/npu_serialisation.py
index a11907b2..fc6b96b7 100644
--- a/ethosu/vela/npu_serialisation.py
+++ b/ethosu/vela/npu_serialisation.py
@@ -72,16 +72,6 @@ def serialise_npu_subgraph_into_tensors(nng, sg, arch, scratch_tens, scratch_fas

     command_stream_size_bytes = len(payload_bytes)

-    # Adjust the bits per element calculation to exclude metadata generated by Vela
-    nng.total_size[flash_area] = nng.total_size.get(flash_area, 0) - flash_size - command_stream_size_bytes
-    nng.total_elements[flash_area] = nng.total_elements.get(flash_area, 0) - flash_size - command_stream_size_bytes
-    nng.total_size[scratch_area] = nng.total_size.get(scratch_area, 0) - scratch_size
-    nng.total_elements[scratch_area] = nng.total_elements.get(scratch_area, 0) - scratch_size
-
-    if scratch_area != scratch_fast_area:
-        nng.total_size[scratch_fast_area] = nng.total_size.get(scratch_fast_area, 0)
-        nng.total_elements[scratch_fast_area] = nng.total_elements.get(scratch_fast_area, 0)
-
     if flash_tens == scratch_tens is None:
         # First Npu subgraph, create scratch and flash tensors
         sg.scratch_tensor = make_memory_tensor(
diff --git a/ethosu/vela/stats_writer.py b/ethosu/vela/stats_writer.py
index 70b3ffb7..1fb6702b 100644
--- a/ethosu/vela/stats_writer.py
+++ b/ethosu/vela/stats_writer.py
@@ -58,7 +58,7 @@ def write_summary_metrics_csv(nng, summary_filename, arch):
             "passes_after_fusing",
         ]
         labels += [area.identifier_name() + "_memory_used" for area in mem_areas]
-        labels += ["on_chip_flash_bits_per_element", "off_chip_flash_bits_per_element"]
+        labels += ["weights_compression_ratio"]

         for mem_area in mem_areas:
             labels += [
@@ -107,11 +107,7 @@ def write_summary_metrics_csv(nng, summary_filename, arch):

         data_items += [midpoint_fps, nng.batch_size, midpoint_inference_time, n_passes, n_cascaded_passes]
         data_items += [nng.memory_used.get(mem_area, 0) / 1024.0 for mem_area in mem_areas]
-
-        data_items += [
-            nng.bits_per_element.get(MemArea.OnChipFlash, 0.0),
-            nng.bits_per_element.get(MemArea.OffChipFlash, 0.0),
-        ]
+        data_items += [nng.weights_compression_ratio]

         for mem_area in mem_areas:
             bws = nng.bandwidths[mem_area]
@@ -231,7 +227,7 @@ def print_performance_metrics_for_strat(
     num_cascaded_passes,
     n_operations=0,
     cpu_operations=None,
-    bits_per_element=None,
+    weights_compression_ratio=None,
     show_cpu_operations=False,
     f=sys.stdout,
 ):
@@ -268,11 +264,7 @@ def print_performance_metrics_for_strat(

         aug_label = label + " used"

-        extra = ""
-        if (mem_area == MemArea.OnChipFlash or mem_area == MemArea.OffChipFlash) and bits_per_element is not None:
-            extra = f" ({bits_per_element[mem_area]:.2f} bits per element)"
-
-        print(f"Total {aug_label:25} {memory_used[mem_area] / 1024.0:12.2f} KiB{extra}", file=f)
+        print(f"Total {aug_label:25} {memory_used[mem_area] / 1024.0:12.2f} KiB", file=f)

     print(file=f)
     print(f"{num_passes:d} passes fused into {num_cascaded_passes:d}", file=f)
@@ -329,6 +321,11 @@ def print_performance_metrics_for_strat(
     )
     print(file=f)

+    if weights_compression_ratio != 0:
+        print(
+            f"Weights Compression Ratio {weights_compression_ratio:12.2f}", file=f,
+        )
+
     print(
         f"Neural network macs {int(macs):12d} MACs/batch", file=f,
     )
@@ -368,7 +365,7 @@ def print_performance_metrics(nng, arch, show_cpu_operations=False, f=sys.stdout
         n_cascaded_passes,
         n_operations,
         cpu_operations,
-        nng.bits_per_element,
+        nng.weights_compression_ratio,
         show_cpu_operations,
         f,
     )
diff --git a/ethosu/vela/tensor_allocation.py b/ethosu/vela/tensor_allocation.py
index 7f66579e..1e5eb852 100644
--- a/ethosu/vela/tensor_allocation.py
+++ b/ethosu/vela/tensor_allocation.py
@@ -202,8 +202,11 @@ def allocate_tensors(
             else:
                 sg.memory_used_per_type[mem_type] += total_sz

-        nng.total_size[mem_area] = nng.total_size.get(mem_area, 0) + sum(tens.storage_size() for tens in lrs.ranges)
-        nng.total_elements[mem_area] = nng.total_elements.get(mem_area, 0) + sum(tens.elements() for tens in lrs.ranges)
+        if mem_area == arch.fast_storage_mem_area:
+            for tens in lrs.ranges:
+                if tens.purpose == TensorPurpose.Weights:
+                    nng.total_compressed_weights += tens.storage_size()
+                    nng.total_original_weights += tens.elements() * tens.element_size()

         print_allocation(lrs, mem_area, mem_type_set, sg, verbose_allocation)
@@ -214,9 +217,10 @@ def allocate_tensors(

     if sg == nng.get_root_subgraph():
         nng.memory_used = sg.memory_used
-        for mem_area in nng.total_elements.keys():
+        if mem_area == arch.fast_storage_mem_area:
             try:
-                nng.bits_per_element[mem_area] = nng.total_size[mem_area] * 8 / nng.total_elements[mem_area]
+                nng.weights_compression_ratio = nng.total_compressed_weights / nng.total_original_weights
             except ZeroDivisionError:
-                nng.weights_compression_ratio = 0.0
+                nng.weights_compression_ratio = 0.0
+
     return True
-- 
cgit v1.2.1