From cda4fcb0fd3e9766a161cf3e5aa7c3283e7f7c9e Mon Sep 17 00:00:00 2001
From: Tim Hall
Date: Thu, 19 May 2022 12:36:58 +0100
Subject: MLBEDSW-6563: networks failing with memory area exceeded in vela

 - For allocations that have a hard memory limit, the Hill Climb allocator
   should be given more attempts to find a solution that fits
 - The fix is to use a memory limit when there is a hard constraint, and a
   minimum iteration count, reset on every improvement, when there is a
   soft constraint
 - Added maximum number of iterations CLI option

Signed-off-by: Tim Hall
Change-Id: I19ff53a0b68412de280263626778a3102cbe52fa
---
 OPTIONS.md                                        | 14 ++++++++++
 ethosu/vela/compiler_driver.py                    |  2 ++
 ethosu/vela/greedy_allocation.py                  | 10 +++----
 ethosu/vela/hillclimb_allocation.py               | 33 +++++++++++++++++++-----
 ethosu/vela/register_command_stream_generator.py  |  8 ++++--
 ethosu/vela/scheduler.py                          |  1 +
 ethosu/vela/tensor_allocation.py                  | 22 +++++++++++-----
 ethosu/vela/test/test_hillclimb_allocation.py     |  4 +--
 ethosu/vela/vela.py                               | 10 +++++++
 9 files changed, 80 insertions(+), 24 deletions(-)

diff --git a/OPTIONS.md b/OPTIONS.md
index b4132d3..ddda697 100644
--- a/OPTIONS.md
+++ b/OPTIONS.md
@@ -255,6 +255,20 @@ systems, as there is a hard limit on thread stack size.
 vela network.tflite --recursion-limit 2000
 ```
 
+### HillClimb Max Iterations
+
+Sets the maximum number of iterations the Hill Climb tensor allocator will run.
+This is a hard limit on the total number of iterations of the algorithm.
+Reducing this value is unlikely to reduce the compilation time of a working
+solution, and it may cause the algorithm to terminate before finding a workable
+solution.
+**Type: Integer**
+**Default: 99999**
+
+```bash
+vela network.tflite --hillclimb-max-iterations 1000
+```
+
 ## Verbose Print Options
 
 All of the options below are disabled by default and enabling them will add
diff --git a/ethosu/vela/compiler_driver.py b/ethosu/vela/compiler_driver.py
index cec37ef..1d8756b 100644
--- a/ethosu/vela/compiler_driver.py
+++ b/ethosu/vela/compiler_driver.py
@@ -66,6 +66,7 @@ class CompilerOptions:
         timing=False,
         output_dir="outputs",
         cpu_tensor_alignment=Tensor.AllocationQuantum,
+        hillclimb_max_iterations=None,
     ):
 
         self.verbose_graph = verbose_graph
@@ -84,6 +85,7 @@ class CompilerOptions:
         self.timing = timing
         self.output_dir = output_dir
         self.cpu_tensor_alignment = cpu_tensor_alignment
+        self.hillclimb_max_iterations = hillclimb_max_iterations
 
     def __str__(self):
         return type(self).__name__ + ": " + str(self.__dict__)
diff --git a/ethosu/vela/greedy_allocation.py b/ethosu/vela/greedy_allocation.py
index 6f4f801..f6d7a3a 100644
--- a/ethosu/vela/greedy_allocation.py
+++ b/ethosu/vela/greedy_allocation.py
@@ -19,11 +19,7 @@
 from . import numeric_util
 
 
 class GreedyAllocator:
-    def __init__(self, nng, arch, live_ranges, mem_area):
-        self.nng = nng
-        self.arch = arch
-        self.mem_area = mem_area
-
+    def __init__(self, live_ranges):
         self.live_ranges = live_ranges
         self.memory_required = 0
@@ -75,6 +71,6 @@ class GreedyAllocator:
         return self.memory_required
 
 
-def allocate_live_ranges(nng, arch, live_ranges, mem_area, alignment):
-    g = GreedyAllocator(nng, arch, live_ranges, mem_area)
+def allocate_live_ranges(live_ranges, alignment):
+    g = GreedyAllocator(live_ranges)
     return g.allocate_live_ranges(alignment)
diff --git a/ethosu/vela/hillclimb_allocation.py b/ethosu/vela/hillclimb_allocation.py
index 6d3003c..9c4dfb8 100644
--- a/ethosu/vela/hillclimb_allocation.py
+++ b/ethosu/vela/hillclimb_allocation.py
@@ -90,14 +90,17 @@ class HillClimbAllocator:
         if the new allocation is better: keep it, else set allocation to previous allocation
     """
 
-    MAX_ITERATIONS = 500
+    # Maximum number of iterations of the algorithm (can be overridden from the command line)
+    MAX_ITERATIONS = 99999
     NOT_ALLOCATED = -1
     # Used for live ranges allocated at address 0
     NO_PREDECESSOR = -1
     # Special handling if best solution has not improved during this many iterations
    MAX_ITERATIONS_STUCK = 50
+    # Minimum number of iterations since the last improvement (unless an optimal solution is found)
+    MIN_ITERATIONS_IMPROVE = 500
 
-    def __init__(self, live_ranges: List[LiveRange]):
+    def __init__(self, live_ranges: List[LiveRange], max_iterations: int, memory_limit: int):
         # Contains the live ranges
         self.lrs: List[LiveRangeInfo] = [
             LiveRangeInfo(id, lr.start_time, lr.end_time, lr.size, lr.get_alignment())
@@ -122,6 +125,18 @@ class HillClimbAllocator:
         size_at_time = [sum(lr.size for lr in self.lrs_at_time[t]) for t in range(nr_time_slots)]
         # The minimum possible size, assuming all live ranges can be perfectly allocated
         self.min_required_size: int = max(size_at_time)
+        # Set the maximum number of iterations the search will run to find a solution
+        if max_iterations is None:
+            self.max_iterations = self.MAX_ITERATIONS
+        else:
+            self.max_iterations = max_iterations
+        # Defines a memory limit that the allocation must meet
+        self.memory_limit = memory_limit
+        if self.memory_limit < self.min_required_size:
+            print(
+                f"Warning: Memory limit = {self.memory_limit} is less than the minimum possible allocation size ="
+                f" {self.min_required_size}"
+            )
         # Calculate all neighbours + the urgency of each live range
         for lr in self.lrs:
             lr.urgency = 0
@@ -272,13 +287,16 @@ class HillClimbAllocator:
         ix2 = turn_list[random.randint(0, len(turn_list) - 1)]
         indices[ix1], indices[ix2] = indices[ix2], indices[ix1]
 
-    def search(self, indices: List[int], iterations: int):
+    def search(self, indices: List[int]):
         """
         Search for a solution, using the given indices as initial solution.
""" best_indices = indices[:] last_improvement_iteration = 0 - for i in range(iterations): + i = 0 + while (self.best_size > self.memory_limit and i < self.max_iterations) or ( + i - last_improvement_iteration < self.MIN_ITERATIONS_IMPROVE + ): # Reorder the indices self.attempt_bottleneck_fix(indices, i - last_improvement_iteration) # Allocate the reordered indices and check if it gave an improvement @@ -296,6 +314,7 @@ class HillClimbAllocator: else: # The new allocation produced worse result undo the change indices = best_indices[:] + i += 1 def allocate(self) -> List[int]: """ @@ -316,17 +335,17 @@ class HillClimbAllocator: self.allocated_addresses = [lr.address for lr in self.lrs] if self.best_size > self.min_required_size: # Try to improve the heuristic allocation - self.search(indices, HillClimbAllocator.MAX_ITERATIONS) + self.search(indices) # else the heuristic allocation returned an optimal solution; no search needed return self.allocated_addresses -def allocate_live_ranges(lrs: List[LiveRange]) -> List[int]: +def allocate_live_ranges(lrs: List[LiveRange], max_iterations: int, memory_limit: int) -> List[int]: """ Allocates live ranges using a search based allocator. Returns the list of allocated addresses (one for each live range) """ if not lrs: return [] - allocator = HillClimbAllocator(lrs) + allocator = HillClimbAllocator(lrs, max_iterations, memory_limit) return allocator.allocate() diff --git a/ethosu/vela/register_command_stream_generator.py b/ethosu/vela/register_command_stream_generator.py index 0e68b14..2bdfda2 100644 --- a/ethosu/vela/register_command_stream_generator.py +++ b/ethosu/vela/register_command_stream_generator.py @@ -151,7 +151,7 @@ class CommandStreamEmitter: payload_mode = CmdMode(code & CmdMode.Mask) - s = f"0x{offset:06x}:" + s = f"{offset:#08x}:" if payload_mode == CmdMode.NoPayload: s += f" {'':8}" @@ -291,7 +291,11 @@ def check_mem_limits(memory_accesses: MemoryAccessSet, mem_limits: Dict[int, int if offset < 0: raise VelaError(f"Negative address offset: {offset}, region: {region}") if offset > max: - raise VelaError(f"Address offset out of range: {offset}, region: {region}, max: {max}") + raise VelaError( + f"Address offset out of range: {offset}, region: {region}, max: {max}. 
+                    f"Address offset out of range: {offset}, region: {region}, max: {max}. Perhaps try running"
+                    f" with the HillClimb tensor allocator and/or increasing the maximum iterations of that"
+                    f" allocator"
+                )
 
 
 def quantise(value: float, quant: Optional[NpuQuantization]) -> int:
diff --git a/ethosu/vela/scheduler.py b/ethosu/vela/scheduler.py
index 0b79402..1e31175 100644
--- a/ethosu/vela/scheduler.py
+++ b/ethosu/vela/scheduler.py
@@ -1312,6 +1312,7 @@ def _update_tensor_allocation(nng: Graph, arch: ArchitectureFeatures, options):
         tensor_allocator=options.tensor_allocator,
         verbose_allocation=options.verbose_allocation,
         cpu_tensor_alignment=options.cpu_tensor_alignment,
+        hillclimb_max_iterations=options.hillclimb_max_iterations,
     )
 
 
diff --git a/ethosu/vela/tensor_allocation.py b/ethosu/vela/tensor_allocation.py
index ab65740..1ffae4c 100644
--- a/ethosu/vela/tensor_allocation.py
+++ b/ethosu/vela/tensor_allocation.py
@@ -66,9 +66,11 @@ def linear_allocate_live_ranges(live_ranges, alloc_granularity=Tensor.Allocation
     return total_sz
 
 
-def hillclimb_allocate_live_ranges(live_ranges: LiveRangeGraph, alloc_granularity: int) -> int:
+def hillclimb_allocate_live_ranges(
+    live_ranges: LiveRangeGraph, alloc_granularity: int, max_iterations: int, mem_limit: int
+) -> int:
     # Allocates using the hill climb allocator
-    addresses = hillclimb_allocation.allocate_live_ranges(live_ranges.lrs)
+    addresses = hillclimb_allocation.allocate_live_ranges(live_ranges.lrs, max_iterations, mem_limit)
     # The result is a list containing the allocated addresses
     total_sz = 0
     for lr, address in zip(live_ranges.lrs, addresses):
@@ -144,7 +146,10 @@ def print_allocation(lrs, mem_area, mem_type_set, tensor_allocator, sg, actual_m
 
     memory_hist = memory_usage_histogram(lrs.lrs)
     min_mem_usage_for_alloc = max(memory_hist)
-    print("Start Time - End Time: Start Addr - End Addr: Tensor Size: Memory Usage: Tensor Purpose: Tensor Name")
+    print(
+        f"{'Start Time':>10s} - {'End Time':>10s}: {'Start Addr':>10s} - {'End Addr':>10s}: {'Tensor Size':>11s}:"
+        f" {'Memory Usage':>12s}: {'Purpose':12s}: Name"
+    )
     for start_time, end_time, size, start_addr, end_addr, purpose, name in sorted(
         (
             lr.start_time,
@@ -159,7 +164,7 @@ def print_allocation(lrs, mem_area, mem_type_set, tensor_allocator, sg, actual_m
     ):
         print(
             f"{start_time:10d} - {end_time:10d}: {start_addr:#10x} - {end_addr:#10x}: {size:11d}:"
-            f" {memory_hist[start_time]:12d}: {purpose.display_name():15s}: {name:s}"
+            f" {memory_hist[start_time]:12d}: {purpose.display_name():12s}: {name:s}"
         )
 
     alloc_overhead_fraction = (actual_mem_usage_for_alloc - min_mem_usage_for_alloc) / min_mem_usage_for_alloc
@@ -194,6 +199,7 @@
     tensor_allocator=TensorAllocator.Greedy,
     lr_graph=None,
     cpu_tensor_alignment=Tensor.AllocationQuantum,
+    hillclimb_max_iterations=None,
 ):
     # Allocates addresses to tensors, returns False if tensors could not be fit within max_size
     lrs = live_range.extract_live_ranges_from_cascaded_passes(
@@ -207,12 +213,14 @@
     if lrs.ranges:
         tens_alloc = tensor_allocator
         if tens_alloc == TensorAllocator.Greedy:
-            total_sz = greedy_allocate_live_ranges(sg, arch, lrs, mem_area, cpu_tensor_alignment)
+            total_sz = greedy_allocate_live_ranges(lrs, cpu_tensor_alignment)
             verify_allocation(lrs, cpu_tensor_alignment)
         elif tens_alloc == TensorAllocator.LinearAlloc:
             total_sz = linear_allocate_live_ranges(lrs, cpu_tensor_alignment)
         elif tens_alloc == TensorAllocator.HillClimb:
-            total_sz = hillclimb_allocate_live_ranges(lrs, cpu_tensor_alignment)
+            mem_type = MemType.Scratch_fast if MemType.Scratch_fast in mem_type_set else list(mem_type_set)[0]
+            mem_size = arch.mem_type_size(mem_type)
+            total_sz = hillclimb_allocate_live_ranges(lrs, cpu_tensor_alignment, hillclimb_max_iterations, mem_size)
         else:
             assert 0
     return lrs, total_sz
@@ -228,6 +236,7 @@ def allocate_tensors(
     verbose_allocation=False,
     lr_graph=None,
     cpu_tensor_alignment=Tensor.AllocationQuantum,
+    hillclimb_max_iterations=None,
     max_size=None,
     dry_test=False,
 ):
@@ -240,6 +249,7 @@
         tensor_allocator=tensor_allocator,
         lr_graph=lr_graph,
         cpu_tensor_alignment=cpu_tensor_alignment,
+        hillclimb_max_iterations=hillclimb_max_iterations,
     )
 
     if lrs.ranges:
diff --git a/ethosu/vela/test/test_hillclimb_allocation.py b/ethosu/vela/test/test_hillclimb_allocation.py
index 8a56c3f..03aec2b 100644
--- a/ethosu/vela/test/test_hillclimb_allocation.py
+++ b/ethosu/vela/test/test_hillclimb_allocation.py
@@ -54,10 +54,10 @@ def live_range(start_time, end_time, size):
 def test_allocate(lrs, expected_size):
     """Tests the search allocator"""
     lr_list = [live_range(start, end, size) for start, end, size in lrs]
-    res = allocate_live_ranges(lr_list)
+    res = allocate_live_ranges(lr_list, None, 1 << 32)
     assert len(res) == len(lrs)
     assert max(addr + lr[2] for addr, lr in zip(res, lrs)) == expected_size
 
 
 def test_allocate_empty_input():
-    assert [] == allocate_live_ranges([])
+    assert [] == allocate_live_ranges([], 0, 0)
diff --git a/ethosu/vela/vela.py b/ethosu/vela/vela.py
index b5f9f98..6207800 100644
--- a/ethosu/vela/vela.py
+++ b/ethosu/vela/vela.py
@@ -37,6 +37,7 @@ from .api import API_VERSION
 from .debug_database import DebugDatabase
 from .errors import InputFileError
 from .errors import VelaError
+from .hillclimb_allocation import HillClimbAllocator
 from .nn_graph import NetworkType
 from .nn_graph import TensorAllocator
 from .tensor import MemArea
@@ -439,6 +440,14 @@ def main(args=None):
         default=1000,
         help="Set the recursion depth limit, may result in RecursionError if too low (default: %(default)s)",
     )
+    parser.add_argument(
+        "--hillclimb-max-iterations",
+        type=int,
+        default=HillClimbAllocator.MAX_ITERATIONS,
+        help=(
+            "Set the maximum number of iterations the Hill Climb tensor allocator will run (default: %(default)s)"
+        ),
+    )
     args = parser.parse_args(args=args)
 
     # Generate the supported ops report and exit
@@ -523,6 +532,7 @@ def main(args=None):
         timing=args.timing,
         output_dir=args.output_dir,
         cpu_tensor_alignment=args.cpu_tensor_alignment,
+        hillclimb_max_iterations=args.hillclimb_max_iterations,
     )
 
     scheduler_options = scheduler.SchedulerOptions(
--
cgit v1.2.1
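
For readers who want the gist of the new stop condition without tracing the diff, the sketch below distils the search loop introduced in hillclimb_allocation.py. It is a simplified, standalone illustration and not the Vela implementation: the `propose` callback is a hypothetical stand-in for the reorder-and-reallocate step (`attempt_bottleneck_fix` plus the allocation call inside the loop), and only the names `best_size`, `memory_limit`, `max_iterations` and `MIN_ITERATIONS_IMPROVE` mirror the patch. With a hard constraint the loop keeps running until the best allocation fits within `memory_limit` or the iteration budget is exhausted; with a soft constraint it simply keeps running until `MIN_ITERATIONS_IMPROVE` iterations have passed without an improvement.

```python
# Simplified sketch of the Hill Climb stop condition added by this patch.
# Assumes a hypothetical propose() callback that perturbs the live-range
# order, reallocates, and returns the resulting peak memory usage.
from typing import Callable

MIN_ITERATIONS_IMPROVE = 500  # soft constraint: keep searching this long after the last improvement


def search(propose: Callable[[], int], initial_size: int, memory_limit: int, max_iterations: int) -> int:
    best_size = initial_size
    last_improvement_iteration = 0
    i = 0
    # Hard constraint: keep going while the allocation does not fit and the iteration budget remains.
    # Soft constraint: keep going while the last improvement is recent enough.
    while (best_size > memory_limit and i < max_iterations) or (
        i - last_improvement_iteration < MIN_ITERATIONS_IMPROVE
    ):
        size = propose()
        if size < best_size:
            # Keep the better allocation and reset the improvement window
            best_size = size
            last_improvement_iteration = i
        i += 1
    return best_size
```

For example, `search(lambda: 1024, 2048, memory_limit=1536, max_iterations=99999)` adopts the proposed 1024-byte allocation on the first iteration, then runs out the 500-iteration improvement window before returning 1024. The design choice this illustrates is the one described in the commit message: a hard limit only applies when the target memory area genuinely cannot be exceeded, while otherwise the search is bounded by how long it has gone without improving.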