From b9b515ca996e1ec5835a2c35033cc6f818f71f12 Mon Sep 17 00:00:00 2001
From: Tim Hall
Date: Sun, 1 Nov 2020 21:27:19 +0000
Subject: vela: Remove and change CLI options

 - Removed unused --show-minimum-possible-allocation
 - Changed --allocation-alignment to --cpu-tensor-alignment

Signed-off-by: Tim Hall
Change-Id: I00e367c3190aeea08a3f136332711e9accc85ba3
---
 OPTIONS.md                       | 21 ++++++++-------------
 ethosu/vela/compiler_driver.py   | 16 +++++-----------
 ethosu/vela/live_range.py        |  8 ++++----
 ethosu/vela/tensor_allocation.py | 20 ++++++--------------
 ethosu/vela/vela.py              | 28 +++++++++++++---------------
 5 files changed, 36 insertions(+), 57 deletions(-)

diff --git a/OPTIONS.md b/OPTIONS.md
index baf6c5a3..10bfea55 100644
--- a/OPTIONS.md
+++ b/OPTIONS.md
@@ -280,11 +280,14 @@ SRAM, albeit at the cost of performance (inference speed).
 vela network.tflite --weight-estimation-scaling=1.2
 ```

-### Allocation alignment
-
-Controls the allocation byte alignment. Only affects CPU tensors, NPU tensors
-will remain 16-byte aligned independent of this option. Alignment has to be a
-power of two and greater or equal to 16.
+### CPU Tensor Alignment
+
+Controls the allocation byte alignment. This affects all CPU tensors, including
+Ethos-U Custom operator inputs and outputs. In this instance, a CPU tensor is
+defined as any tensor that is explicitly listed in the resulting `.tflite` file.
+The Ethos-U NPU internal tensors will remain 16-byte aligned independent of this
+option; these tensors are contained within the command stream. Alignment has to
+be a power of two and greater than or equal to 16.

 **Type: Integer**
 **Default: 16**
@@ -305,14 +308,6 @@ Prints a summary of all the subgraphs and their inputs and outputs.
 vela network.tflite --show-subgraph-io-summary
 ```

-### Show Minimum Possible Allocation
-
-Prints the minimum possible allocation.
-
-```bash
-vela network.tflite --show-minimum-possible-allocation
-```
-
 ### Show Cpu Operations

 Show the operations that fall back to the CPU.
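The new OPTIONS.md text above constrains `--cpu-tensor-alignment` to powers of two no smaller than 16. A minimal sketch of that rule, for illustration only (the helper name is made up here and is not part of vela):

```python
# Sketch of the constraint described in the CPU Tensor Alignment section:
# a valid value is a power of two that is greater than or equal to 16.
def is_valid_cpu_tensor_alignment(alignment: int) -> bool:
    # A power of two has exactly one bit set, so n & (n - 1) == 0 for n > 0.
    return alignment >= 16 and (alignment & (alignment - 1)) == 0


assert is_valid_cpu_tensor_alignment(16)      # the default
assert is_valid_cpu_tensor_alignment(64)      # a larger power of two
assert not is_valid_cpu_tensor_alignment(24)  # not a power of two
assert not is_valid_cpu_tensor_alignment(8)   # below the 16-byte minimum
```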
diff --git a/ethosu/vela/compiler_driver.py b/ethosu/vela/compiler_driver.py
index 0739133b..a2b20e47 100644
--- a/ethosu/vela/compiler_driver.py
+++ b/ethosu/vela/compiler_driver.py
@@ -61,12 +61,11 @@ Note the difference between ArchitectureFeatures and CompilerOptions
         verbose_high_level_command_stream=False,
         verbose_register_command_stream=False,
         verbose_operators=False,
-        show_minimum_possible_allocation=False,
         show_cpu_operations=False,
         tensor_allocator=TensorAllocator.Greedy,
         timing=False,
         output_dir="outputs",
-        allocation_alignment=Tensor.AllocationQuantum,
+        cpu_tensor_alignment=Tensor.AllocationQuantum,
     ):

         self.verbose_graph = verbose_graph
@@ -78,12 +77,11 @@ Note the difference between ArchitectureFeatures and CompilerOptions
         self.verbose_high_level_command_stream = verbose_high_level_command_stream
         self.verbose_register_command_stream = verbose_register_command_stream
         self.verbose_operators = verbose_operators
-        self.show_minimum_possible_allocation = show_minimum_possible_allocation
         self.show_cpu_operations = show_cpu_operations
         self.tensor_allocator = tensor_allocator
         self.timing = timing
         self.output_dir = output_dir
-        self.allocation_alignment = allocation_alignment
+        self.cpu_tensor_alignment = cpu_tensor_alignment

     def __str__(self):
         return type(self).__name__ + ": " + str(self.__dict__)
@@ -209,7 +207,6 @@ def compiler_driver(nng, arch, options, scheduler_options):
         set((MemType.Permanent_NPU,)),
         tensor_allocator=TensorAllocator.LinearAlloc,
         verbose_allocation=options.verbose_allocation,
-        show_minimum_possible_allocation=options.show_minimum_possible_allocation,
         lr_graph=lr_graph_flash,
     )

@@ -259,8 +256,7 @@ def compiler_driver(nng, arch, options, scheduler_options):
             dry_test=dry_test,
             tensor_allocator=options.tensor_allocator,
             verbose_allocation=options.verbose_allocation,
-            show_minimum_possible_allocation=options.show_minimum_possible_allocation,
-            allocation_alignment=options.allocation_alignment,
+            cpu_tensor_alignment=options.cpu_tensor_alignment,
         )
         if dry_test or not alloc_success:
             for sg in nng.subgraphs:
@@ -281,8 +277,7 @@ def compiler_driver(nng, arch, options, scheduler_options):
             mem_type_set,
             tensor_allocator=options.tensor_allocator,
             verbose_allocation=options.verbose_allocation,
-            show_minimum_possible_allocation=options.show_minimum_possible_allocation,
-            allocation_alignment=options.allocation_alignment,
+            cpu_tensor_alignment=options.cpu_tensor_alignment,
         )

     # Generate command streams and serialise Npu-ops into tensors
@@ -316,8 +311,7 @@ def compiler_driver(nng, arch, options, scheduler_options):
         set((MemType.Permanent_CPU,)),
         tensor_allocator=TensorAllocator.LinearAlloc,
         verbose_allocation=options.verbose_allocation,
-        show_minimum_possible_allocation=options.show_minimum_possible_allocation,
-        allocation_alignment=options.allocation_alignment,
+        cpu_tensor_alignment=options.cpu_tensor_alignment,
     )

     npu_performance.calc_performance_for_network(nng, arch)
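The hunks above rename the `CompilerOptions` keyword `allocation_alignment` to `cpu_tensor_alignment`. A minimal sketch of constructing the options object after this change, assuming the module path `ethosu.vela.compiler_driver` and that the remaining constructor arguments keep the defaults visible in the diff:

```python
# Only the renamed keyword is overridden; every other CompilerOptions argument
# falls back to its default (see the constructor signature in the hunk above).
from ethosu.vela.compiler_driver import CompilerOptions

opts = CompilerOptions(cpu_tensor_alignment=32)  # previously allocation_alignment=32
print(opts)  # __str__ (unchanged above) dumps the full option dictionary
```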
diff --git a/ethosu/vela/live_range.py b/ethosu/vela/live_range.py
index a29cafe0..dbc0ce40 100644
--- a/ethosu/vela/live_range.py
+++ b/ethosu/vela/live_range.py
@@ -236,7 +236,7 @@ def extract_live_ranges_from_cascaded_passes(
     target_mem_type_set,
     ignore_subgraph_input_output_tensors=False,
     lr_graph=None,
-    allocation_alignment=Tensor.AllocationQuantum,
+    cpu_tensor_alignment=Tensor.AllocationQuantum,
 ):
     if lr_graph is None:
         lr_graph = LiveRangeGraph()
@@ -261,7 +261,7 @@ def extract_live_ranges_from_cascaded_passes(
         for tens in cps.inputs:
             if tensor_should_be_ignored(lr_graph, tens, target_mem_area, target_mem_type_set):
                 continue
-            rng = lr_graph.get_or_create_range(tens, allocation_alignment)
+            rng = lr_graph.get_or_create_range(tens, cpu_tensor_alignment)
             rng.mark_usage(time_for_pass)

         cps_primary_op = cps.passes[0].primary_op
@@ -285,7 +285,7 @@ def extract_live_ranges_from_cascaded_passes(
         for tens in cps.intermediates + cps.outputs:
             if tensor_should_be_ignored(lr_graph, tens, target_mem_area, target_mem_type_set):
                 continue
-            rng = lr_graph.get_or_create_range(tens, allocation_alignment)
+            rng = lr_graph.get_or_create_range(tens, cpu_tensor_alignment)
             rng.mark_usage(time_for_pass)

         lr_graph.current_time += 2
@@ -298,7 +298,7 @@ def extract_live_ranges_from_cascaded_passes(
     for tens in sg.output_tensors:
         if tensor_should_be_ignored(lr_graph, tens, target_mem_area, target_mem_type_set):
             continue
-        rng = lr_graph.get_or_create_range(tens, allocation_alignment)
+        rng = lr_graph.get_or_create_range(tens, cpu_tensor_alignment)
         rng.mark_usage(end_time)

     # Add subgraph to set of processed subgraphs
diff --git a/ethosu/vela/tensor_allocation.py b/ethosu/vela/tensor_allocation.py
index 8329a617..d1a33728 100644
--- a/ethosu/vela/tensor_allocation.py
+++ b/ethosu/vela/tensor_allocation.py
@@ -85,7 +85,7 @@ def mark_sram_used_for_cascaded_passes(sg, lrs):
         ps.sram_used = sram_used


-def print_allocation(lrs, mem_area, mem_type_set, sg, verbose_allocation, show_minimum_possible_allocation):
+def print_allocation(lrs, mem_area, mem_type_set, sg, verbose_allocation):
     if verbose_allocation:
         if mem_type_set == set((MemType.Permanent_NPU,)) or mem_type_set == set((MemType.Permanent_CPU,)):
             print("allocation for", mem_area, "- constant tensors in", sg.placement.name, "subgraph(s)")
@@ -108,13 +108,6 @@ def print_allocation(lrs, mem_area, mem_type_set, sg, verbose_allocation, show_m
         print("Memory usage: {} ({:#x}) bytes / {:.1f} KB".format(mem_usage, mem_usage, mem_usage / 1024))
         print()

-    if show_minimum_possible_allocation and mem_area == MemArea.Sram:
-        min_possible_allocation = max(cps.sram_used for cps in sg.cascaded_passes)
-        print(
-            "Min possible allocation %d bytes / %.1f KB / %.1f MB"
-            % (min_possible_allocation, min_possible_allocation / 1024, min_possible_allocation / 1024 / 1024)
-        )
-

 def allocate_tensors(
     nng,
     sg,
     arch,
     mem_area,
     mem_type_set,
     tensor_allocator=TensorAllocator.Greedy,
     verbose_allocation=False,
-    show_minimum_possible_allocation=False,
     lr_graph=None,
-    allocation_alignment=Tensor.AllocationQuantum,
+    cpu_tensor_alignment=Tensor.AllocationQuantum,
     max_size=None,
     dry_test=False,
 ):
@@ -138,15 +130,15 @@ def allocate_tensors(
         mem_type_set,
         ignore_subgraph_input_output_tensors=ignore_subgraph_input_output_tensors,
         lr_graph=lr_graph,
-        allocation_alignment=allocation_alignment,
+        cpu_tensor_alignment=cpu_tensor_alignment,
     )

     if lrs.ranges:
         tens_alloc = tensor_allocator
         if tens_alloc == TensorAllocator.Greedy:
-            total_sz = greedy_allocate_live_ranges(sg, arch, lrs, mem_area, allocation_alignment, verbose_allocation)
+            total_sz = greedy_allocate_live_ranges(sg, arch, lrs, mem_area, cpu_tensor_alignment, verbose_allocation)
         elif tens_alloc == TensorAllocator.LinearAlloc:
-            total_sz = linear_allocate_live_ranges(lrs, allocation_alignment)
+            total_sz = linear_allocate_live_ranges(lrs, cpu_tensor_alignment)
         else:
             assert 0
         alloc_ok = max_size is None or total_sz <= max_size
@@ -171,7 +163,7 @@ def allocate_tensors(
         nng.total_size[mem_area] = nng.total_size.get(mem_area, 0) + sum(tens.storage_size() for tens in lrs.ranges)
         nng.total_elements[mem_area] = nng.total_elements.get(mem_area, 0) + sum(tens.elements() for tens in lrs.ranges)

-        print_allocation(lrs, mem_area, mem_type_set, sg, verbose_allocation, show_minimum_possible_allocation)
+        print_allocation(lrs, mem_area, mem_type_set, sg, verbose_allocation)

         if mem_area == MemArea.Sram:
             # Mark Sram usage for all subgraphs
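The branch removed from print_allocation computed the "minimum possible allocation" as the largest SRAM footprint of any single cascaded pass. If that figure is still wanted after this change, it can be recomputed from a scheduled subgraph; a hedged sketch, assuming (as in the code above) that each cascaded pass carries a sram_used attribute:

```python
# Sketch of the number the removed --show-minimum-possible-allocation branch
# used to print: the peak per-cascaded-pass SRAM usage, in bytes.
def minimum_possible_sram_allocation(sg):
    return max(cps.sram_used for cps in sg.cascaded_passes)


# Hypothetical usage with a scheduled vela subgraph `sg`:
# print("Min possible allocation %d bytes" % minimum_possible_sram_allocation(sg))
```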
diff --git a/ethosu/vela/vela.py b/ethosu/vela/vela.py
index 4f632d56..05bd9ec1 100644
--- a/ethosu/vela/vela.py
+++ b/ethosu/vela/vela.py
@@ -234,10 +234,6 @@ def main(args=None):
         "--verbose-register-command-stream", action="store_true", help="Verbose register command stream"
     )
     parser.add_argument("--verbose-operators", action="store_true", help="Verbose operator list")
-
-    parser.add_argument(
-        "--show-minimum-possible-allocation", action="store_true", help="Show the minimum possible allocation"
-    )
     parser.add_argument(
         "--show-cpu-operations", action="store_true", help="Show the operations that fall back to the CPU"
     )
@@ -316,8 +312,8 @@ def main(args=None):
         default=architecture_features.ArchitectureFeatures.MAX_BLOCKDEP,
         choices=range(0, architecture_features.ArchitectureFeatures.MAX_BLOCKDEP + 1),
         help=(
-            "Set the maximum value that can be used for the block dependency between npu kernel operations "
-            "(default: %(default)s)"
+            "Set the maximum value that can be used for the block dependency between npu kernel operations"
+            " (default: %(default)s)"
         ),
     )
     parser.add_argument(
@@ -334,10 +330,13 @@ def main(args=None):
         help=("Performs an additional scaling of weight compression scale estimate (default: %(default)s)"),
     )
     parser.add_argument(
-        "--allocation-alignment",
+        "--cpu-tensor-alignment",
         type=int,
         default=Tensor.AllocationQuantum,
-        help=("Controls the allocation byte alignment of cpu tensors (default: %(default)s)"),
+        help=(
+            "Controls the allocation byte alignment of cpu tensors including Ethos-U Custom operator inputs and outputs"
+            " (default: %(default)s)"
+        ),
     )
     args = parser.parse_args(args=args)

@@ -362,11 +361,11 @@ def main(args=None):
     else:
         force_block_config = None

-    alignment = args.allocation_alignment
-    if alignment < 16:
-        parser.error("the following argument needs to be greater or equal to 16: ALLOCATION_ALIGNMENT")
-    if alignment & (alignment - 1) != 0:
-        parser.error("the following argument needs to be a power of 2: ALLOCATION_ALIGNMENT")
+    if args.cpu_tensor_alignment < 16 or args.cpu_tensor_alignment & (args.cpu_tensor_alignment - 1) != 0:
+        parser.error(
+            "Invalid argument to --cpu-tensor-alignment = {} (must be greater than or equal to 16 and a power of 2)"
+            "".format(args.cpu_tensor_alignment)
+        )

     arch = architecture_features.ArchitectureFeatures(
         vela_config_files=args.config,
@@ -390,12 +389,11 @@ def main(args=None):
         verbose_high_level_command_stream=args.verbose_high_level_command_stream,
         verbose_register_command_stream=args.verbose_register_command_stream,
         verbose_operators=args.verbose_operators,
-        show_minimum_possible_allocation=args.show_minimum_possible_allocation,
         show_cpu_operations=args.show_cpu_operations,
         tensor_allocator=args.tensor_allocator,
         timing=args.timing,
         output_dir=args.output_dir,
-        allocation_alignment=alignment,
+        cpu_tensor_alignment=args.cpu_tensor_alignment,
     )

     scheduler_options = scheduler.SchedulerOptions(
--
cgit v1.2.1
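For reference, a stand-alone sketch (not vela's actual parser) of how the renamed option and the new validation from the final hunks fit together; the example invocation and the 32-byte value are illustrative only:

```python
import argparse

# Stand-alone sketch of --cpu-tensor-alignment plus the validation added in
# vela.py above; roughly `vela network.tflite --cpu-tensor-alignment=32`.
parser = argparse.ArgumentParser(prog="vela")
parser.add_argument(
    "--cpu-tensor-alignment",
    type=int,
    default=16,  # Tensor.AllocationQuantum in vela
    help="Controls the allocation byte alignment of cpu tensors (default: %(default)s)",
)
args = parser.parse_args(["--cpu-tensor-alignment", "32"])

# Same check as the new code: reject values below 16 or not a power of two.
if args.cpu_tensor_alignment < 16 or args.cpu_tensor_alignment & (args.cpu_tensor_alignment - 1) != 0:
    parser.error(
        "Invalid argument to --cpu-tensor-alignment = {} (must be greater than or equal to 16 and a power of 2)"
        "".format(args.cpu_tensor_alignment)
    )

print(args.cpu_tensor_alignment)  # 32
```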