aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorTim Hall <tim.hall@arm.com>2022-05-19 12:36:58 +0100
committertim.hall <tim.hall@arm.com>2022-05-19 15:56:19 +0000
commitcda4fcb0fd3e9766a161cf3e5aa7c3283e7f7c9e (patch)
tree2ca560bcf290bf88ab7a0058098df794486ab528
parent8bc7a652607a771e234fda6b05275542ff0fc072 (diff)
downloadethos-u-vela-cda4fcb0fd3e9766a161cf3e5aa7c3283e7f7c9e.tar.gz
MLBEDSW-6563: networks failing with memory area exceeded in vela3.4.0.rc2
- For allocations that have a hard memory limit the Hill Climb allocator should be given more attempts to find a solution that would fit - The fix is to use a memory limit when there is a hard constraint, and a minimum iteration count, reset on every improvement, when there is a soft constraint - Added maximum number iterations CLI option Signed-off-by: Tim Hall <tim.hall@arm.com> Change-Id: I19ff53a0b68412de280263626778a3102cbe52fa
-rw-r--r--OPTIONS.md14
-rw-r--r--ethosu/vela/compiler_driver.py2
-rw-r--r--ethosu/vela/greedy_allocation.py10
-rw-r--r--ethosu/vela/hillclimb_allocation.py33
-rw-r--r--ethosu/vela/register_command_stream_generator.py8
-rw-r--r--ethosu/vela/scheduler.py1
-rw-r--r--ethosu/vela/tensor_allocation.py22
-rw-r--r--ethosu/vela/test/test_hillclimb_allocation.py4
-rw-r--r--ethosu/vela/vela.py10
9 files changed, 80 insertions, 24 deletions
diff --git a/OPTIONS.md b/OPTIONS.md
index b4132d3..ddda697 100644
--- a/OPTIONS.md
+++ b/OPTIONS.md
@@ -255,6 +255,20 @@ systems, as there is a hard limit on thread stack size.
vela network.tflite --recursion-limit 2000
```
+### HillClimb Max Iterations
+
+Sets the maximum number of iterations the Hill Climb tensor allocator will run.
+This is a hard limit on the total number of iterations of the algorithm.
+Reducing this value is unlikely to reduce the compilation time of a working
+solution, and it may cause the algorithm to terminate before finding a workable
+solution.
+**Type: Integer**
+**Default: 99999**
+
+```bash
+vela network.tflite --hillclimb-max-iterations 1000
+```
+
## Verbose Print Options
All of the options below are disabled by default and enabling them will add
diff --git a/ethosu/vela/compiler_driver.py b/ethosu/vela/compiler_driver.py
index cec37ef..1d8756b 100644
--- a/ethosu/vela/compiler_driver.py
+++ b/ethosu/vela/compiler_driver.py
@@ -66,6 +66,7 @@ class CompilerOptions:
timing=False,
output_dir="outputs",
cpu_tensor_alignment=Tensor.AllocationQuantum,
+ hillclimb_max_iterations=None,
):
self.verbose_graph = verbose_graph
@@ -84,6 +85,7 @@ class CompilerOptions:
self.timing = timing
self.output_dir = output_dir
self.cpu_tensor_alignment = cpu_tensor_alignment
+ self.hillclimb_max_iterations = hillclimb_max_iterations
def __str__(self):
return type(self).__name__ + ": " + str(self.__dict__)
diff --git a/ethosu/vela/greedy_allocation.py b/ethosu/vela/greedy_allocation.py
index 6f4f801..f6d7a3a 100644
--- a/ethosu/vela/greedy_allocation.py
+++ b/ethosu/vela/greedy_allocation.py
@@ -19,11 +19,7 @@ from . import numeric_util
class GreedyAllocator:
- def __init__(self, nng, arch, live_ranges, mem_area):
- self.nng = nng
- self.arch = arch
- self.mem_area = mem_area
-
+ def __init__(self, live_ranges):
self.live_ranges = live_ranges
self.memory_required = 0
@@ -75,6 +71,6 @@ class GreedyAllocator:
return self.memory_required
-def allocate_live_ranges(nng, arch, live_ranges, mem_area, alignment):
- g = GreedyAllocator(nng, arch, live_ranges, mem_area)
+def allocate_live_ranges(live_ranges, alignment):
+ g = GreedyAllocator(live_ranges)
return g.allocate_live_ranges(alignment)
diff --git a/ethosu/vela/hillclimb_allocation.py b/ethosu/vela/hillclimb_allocation.py
index 6d3003c..9c4dfb8 100644
--- a/ethosu/vela/hillclimb_allocation.py
+++ b/ethosu/vela/hillclimb_allocation.py
@@ -90,14 +90,17 @@ class HillClimbAllocator:
if the new allocation is better: keep it, else set allocation to previous allocation
"""
- MAX_ITERATIONS = 500
+ # Maximum number of iterations of the algorithm (can be overridden from the command line)
+ MAX_ITERATIONS = 99999
NOT_ALLOCATED = -1
# Used for live ranges allocated at address 0
NO_PREDECESSOR = -1
# Special handling if best solution has not improved during this many iterations
MAX_ITERATIONS_STUCK = 50
+ # Minimum number of iterations since the last improvement (unless an optimal solution is found)
+ MIN_ITERATIONS_IMPROVE = 500
- def __init__(self, live_ranges: List[LiveRange]):
+ def __init__(self, live_ranges: List[LiveRange], max_iterations: int, memory_limit: int):
# Contains the live ranges
self.lrs: List[LiveRangeInfo] = [
LiveRangeInfo(id, lr.start_time, lr.end_time, lr.size, lr.get_alignment())
@@ -122,6 +125,18 @@ class HillClimbAllocator:
size_at_time = [sum(lr.size for lr in self.lrs_at_time[t]) for t in range(nr_time_slots)]
# The minimum possible size, assuming all live ranges can be perfectly allocated
self.min_required_size: int = max(size_at_time)
+ # Set the maximum number of iterations the search will run to find a solution
+ if max_iterations is None:
+ self.max_iterations = self.MAX_ITERATIONS
+ else:
+ self.max_iterations = max_iterations
+ # Defines a memory limit that the allocation must meet
+ self.memory_limit = memory_limit
+ if self.memory_limit < self.min_required_size:
+ print(
+ f"Warning: Memory limit = {self.memory_limit} is less than the minimum possible allocation size ="
+ f" {self.min_required_size}"
+ )
# Calculate all neighbours + the urgency of each live range
for lr in self.lrs:
lr.urgency = 0
@@ -272,13 +287,16 @@ class HillClimbAllocator:
ix2 = turn_list[random.randint(0, len(turn_list) - 1)]
indices[ix1], indices[ix2] = indices[ix2], indices[ix1]
- def search(self, indices: List[int], iterations: int):
+ def search(self, indices: List[int]):
"""
Search for a solution, using the given indices as initial solution.
"""
best_indices = indices[:]
last_improvement_iteration = 0
- for i in range(iterations):
+ i = 0
+ while (self.best_size > self.memory_limit and i < self.max_iterations) or (
+ i - last_improvement_iteration < self.MIN_ITERATIONS_IMPROVE
+ ):
# Reorder the indices
self.attempt_bottleneck_fix(indices, i - last_improvement_iteration)
# Allocate the reordered indices and check if it gave an improvement
@@ -296,6 +314,7 @@ class HillClimbAllocator:
else:
# The new allocation produced worse result undo the change
indices = best_indices[:]
+ i += 1
def allocate(self) -> List[int]:
"""
@@ -316,17 +335,17 @@ class HillClimbAllocator:
self.allocated_addresses = [lr.address for lr in self.lrs]
if self.best_size > self.min_required_size:
# Try to improve the heuristic allocation
- self.search(indices, HillClimbAllocator.MAX_ITERATIONS)
+ self.search(indices)
# else the heuristic allocation returned an optimal solution; no search needed
return self.allocated_addresses
-def allocate_live_ranges(lrs: List[LiveRange]) -> List[int]:
+def allocate_live_ranges(lrs: List[LiveRange], max_iterations: int, memory_limit: int) -> List[int]:
"""
Allocates live ranges using a search based allocator.
Returns the list of allocated addresses (one for each live range)
"""
if not lrs:
return []
- allocator = HillClimbAllocator(lrs)
+ allocator = HillClimbAllocator(lrs, max_iterations, memory_limit)
return allocator.allocate()
diff --git a/ethosu/vela/register_command_stream_generator.py b/ethosu/vela/register_command_stream_generator.py
index 0e68b14..2bdfda2 100644
--- a/ethosu/vela/register_command_stream_generator.py
+++ b/ethosu/vela/register_command_stream_generator.py
@@ -151,7 +151,7 @@ class CommandStreamEmitter:
payload_mode = CmdMode(code & CmdMode.Mask)
- s = f"0x{offset:06x}:"
+ s = f"{offset:#08x}:"
if payload_mode == CmdMode.NoPayload:
s += f" {'':8}"
@@ -291,7 +291,11 @@ def check_mem_limits(memory_accesses: MemoryAccessSet, mem_limits: Dict[int, int
if offset < 0:
raise VelaError(f"Negative address offset: {offset}, region: {region}")
if offset > max:
- raise VelaError(f"Address offset out of range: {offset}, region: {region}, max: {max}")
+ raise VelaError(
+ f"Address offset out of range: {offset}, region: {region}, max: {max}. Perhaps try running"
+ f" with the HillClimb tensor allocator and/or increasing the maximum iteration of that"
+ f" allocator"
+ )
def quantise(value: float, quant: Optional[NpuQuantization]) -> int:
diff --git a/ethosu/vela/scheduler.py b/ethosu/vela/scheduler.py
index 0b79402..1e31175 100644
--- a/ethosu/vela/scheduler.py
+++ b/ethosu/vela/scheduler.py
@@ -1312,6 +1312,7 @@ def _update_tensor_allocation(nng: Graph, arch: ArchitectureFeatures, options):
tensor_allocator=options.tensor_allocator,
verbose_allocation=options.verbose_allocation,
cpu_tensor_alignment=options.cpu_tensor_alignment,
+ hillclimb_max_iterations=options.hillclimb_max_iterations,
)
diff --git a/ethosu/vela/tensor_allocation.py b/ethosu/vela/tensor_allocation.py
index ab65740..1ffae4c 100644
--- a/ethosu/vela/tensor_allocation.py
+++ b/ethosu/vela/tensor_allocation.py
@@ -66,9 +66,11 @@ def linear_allocate_live_ranges(live_ranges, alloc_granularity=Tensor.Allocation
return total_sz
-def hillclimb_allocate_live_ranges(live_ranges: LiveRangeGraph, alloc_granularity: int) -> int:
+def hillclimb_allocate_live_ranges(
+ live_ranges: LiveRangeGraph, alloc_granularity: int, max_iterations: int, mem_limit: int
+) -> int:
# Allocates using the hill climb allocator
- addresses = hillclimb_allocation.allocate_live_ranges(live_ranges.lrs)
+ addresses = hillclimb_allocation.allocate_live_ranges(live_ranges.lrs, max_iterations, mem_limit)
# The result is a list containing the allocated addresses
total_sz = 0
for lr, address in zip(live_ranges.lrs, addresses):
@@ -144,7 +146,10 @@ def print_allocation(lrs, mem_area, mem_type_set, tensor_allocator, sg, actual_m
memory_hist = memory_usage_histogram(lrs.lrs)
min_mem_usage_for_alloc = max(memory_hist)
- print("Start Time - End Time: Start Addr - End Addr: Tensor Size: Memory Usage: Tensor Purpose: Tensor Name")
+ print(
+ f"{'Start Time':>10s} - {'End Time':>10s}: {'Start Addr':>10s} - {'End Addr':>10s}: {'Tensor Size':>11s}:"
+ f" {'Memory Usage':>12s}: {'Purpose':12s}: Name"
+ )
for start_time, end_time, size, start_addr, end_addr, purpose, name in sorted(
(
lr.start_time,
@@ -159,7 +164,7 @@ def print_allocation(lrs, mem_area, mem_type_set, tensor_allocator, sg, actual_m
):
print(
f"{start_time:10d} - {end_time:10d}: {start_addr:#10x} - {end_addr:#10x}: {size:11d}:"
- f" {memory_hist[start_time]:12d}: {purpose.display_name():15s}: {name:s}"
+ f" {memory_hist[start_time]:12d}: {purpose.display_name():12s}: {name:s}"
)
alloc_overhead_fraction = (actual_mem_usage_for_alloc - min_mem_usage_for_alloc) / min_mem_usage_for_alloc
@@ -194,6 +199,7 @@ def allocate(
tensor_allocator=TensorAllocator.Greedy,
lr_graph=None,
cpu_tensor_alignment=Tensor.AllocationQuantum,
+ hillclimb_max_iterations=None,
):
# Allocates addresses to tensors, returns False if tensors could not be fit within max_size
lrs = live_range.extract_live_ranges_from_cascaded_passes(
@@ -207,12 +213,14 @@ def allocate(
if lrs.ranges:
tens_alloc = tensor_allocator
if tens_alloc == TensorAllocator.Greedy:
- total_sz = greedy_allocate_live_ranges(sg, arch, lrs, mem_area, cpu_tensor_alignment)
+ total_sz = greedy_allocate_live_ranges(lrs, cpu_tensor_alignment)
verify_allocation(lrs, cpu_tensor_alignment)
elif tens_alloc == TensorAllocator.LinearAlloc:
total_sz = linear_allocate_live_ranges(lrs, cpu_tensor_alignment)
elif tens_alloc == TensorAllocator.HillClimb:
- total_sz = hillclimb_allocate_live_ranges(lrs, cpu_tensor_alignment)
+ mem_type = MemType.Scratch_fast if MemType.Scratch_fast in mem_type_set else list(mem_type_set)[0]
+ mem_size = arch.mem_type_size(mem_type)
+ total_sz = hillclimb_allocate_live_ranges(lrs, cpu_tensor_alignment, hillclimb_max_iterations, mem_size)
else:
assert 0
return lrs, total_sz
@@ -228,6 +236,7 @@ def allocate_tensors(
verbose_allocation=False,
lr_graph=None,
cpu_tensor_alignment=Tensor.AllocationQuantum,
+ hillclimb_max_iterations=None,
max_size=None,
dry_test=False,
):
@@ -240,6 +249,7 @@ def allocate_tensors(
tensor_allocator=tensor_allocator,
lr_graph=lr_graph,
cpu_tensor_alignment=cpu_tensor_alignment,
+ hillclimb_max_iterations=hillclimb_max_iterations,
)
if lrs.ranges:
diff --git a/ethosu/vela/test/test_hillclimb_allocation.py b/ethosu/vela/test/test_hillclimb_allocation.py
index 8a56c3f..03aec2b 100644
--- a/ethosu/vela/test/test_hillclimb_allocation.py
+++ b/ethosu/vela/test/test_hillclimb_allocation.py
@@ -54,10 +54,10 @@ def live_range(start_time, end_time, size):
def test_allocate(lrs, expected_size):
"""Tests the search allocator"""
lr_list = [live_range(start, end, size) for start, end, size in lrs]
- res = allocate_live_ranges(lr_list)
+ res = allocate_live_ranges(lr_list, None, 1 << 32)
assert len(res) == len(lrs)
assert max(addr + lr[2] for addr, lr in zip(res, lrs)) == expected_size
def test_allocate_empty_input():
- assert [] == allocate_live_ranges([])
+ assert [] == allocate_live_ranges([], 0, 0)
diff --git a/ethosu/vela/vela.py b/ethosu/vela/vela.py
index b5f9f98..6207800 100644
--- a/ethosu/vela/vela.py
+++ b/ethosu/vela/vela.py
@@ -37,6 +37,7 @@ from .api import API_VERSION
from .debug_database import DebugDatabase
from .errors import InputFileError
from .errors import VelaError
+from .hillclimb_allocation import HillClimbAllocator
from .nn_graph import NetworkType
from .nn_graph import TensorAllocator
from .tensor import MemArea
@@ -439,6 +440,14 @@ def main(args=None):
default=1000,
help="Set the recursion depth limit, may result in RecursionError if too low (default: %(default)s)",
)
+ parser.add_argument(
+ "--hillclimb-max-iterations",
+ type=int,
+ default=HillClimbAllocator.MAX_ITERATIONS,
+ help=(
+ "Set the maximum number of iterations the Hill Climb tensor allocator will run (default: %(default)s)"
+ ),
+ )
args = parser.parse_args(args=args)
# Generate the supported ops report and exit
@@ -523,6 +532,7 @@ def main(args=None):
timing=args.timing,
output_dir=args.output_dir,
cpu_tensor_alignment=args.cpu_tensor_alignment,
+ hillclimb_max_iterations=args.hillclimb_max_iterations,
)
scheduler_options = scheduler.SchedulerOptions(