From cda4fcb0fd3e9766a161cf3e5aa7c3283e7f7c9e Mon Sep 17 00:00:00 2001
From: Tim Hall
Date: Thu, 19 May 2022 12:36:58 +0100
Subject: MLBEDSW-6563: networks failing with memory area exceeded in vela

 - For allocations that have a hard memory limit, the Hill Climb allocator
   should be given more attempts to find a solution that fits
 - The fix is to use a memory limit when there is a hard constraint, and a
   minimum iteration count, reset on every improvement, when there is a
   soft constraint
 - Added maximum number of iterations CLI option

Signed-off-by: Tim Hall
Change-Id: I19ff53a0b68412de280263626778a3102cbe52fa
---
 OPTIONS.md                                        | 14 ++++++++++
 ethosu/vela/compiler_driver.py                    |  2 ++
 ethosu/vela/greedy_allocation.py                  | 10 +++----
 ethosu/vela/hillclimb_allocation.py               | 33 +++++++++++++++++++-----
 ethosu/vela/register_command_stream_generator.py  |  8 ++++--
 ethosu/vela/scheduler.py                          |  1 +
 ethosu/vela/tensor_allocation.py                  | 22 +++++++++++-----
 ethosu/vela/test/test_hillclimb_allocation.py     |  4 +--
 ethosu/vela/vela.py                               | 10 +++++++
 9 files changed, 80 insertions(+), 24 deletions(-)

diff --git a/OPTIONS.md b/OPTIONS.md
index b4132d3..ddda697 100644
--- a/OPTIONS.md
+++ b/OPTIONS.md
@@ -255,6 +255,20 @@ systems, as there is a hard limit on thread stack size.
 vela network.tflite --recursion-limit 2000
 ```
 
+### HillClimb Max Iterations
+
+Sets the maximum number of iterations the Hill Climb tensor allocator will run.
+This is a hard limit on the total number of iterations of the algorithm.
+Reducing this value is unlikely to reduce the compilation time of a working
+solution, and it may cause the algorithm to terminate before finding a workable
+solution.
+**Type: Integer**
+**Default: 99999**
+
+```bash
+vela network.tflite --hillclimb-max-iterations 1000
+```
+
 ## Verbose Print Options
 
 All of the options below are disabled by default and enabling them will add
diff --git a/ethosu/vela/compiler_driver.py b/ethosu/vela/compiler_driver.py
index cec37ef..1d8756b 100644
--- a/ethosu/vela/compiler_driver.py
+++ b/ethosu/vela/compiler_driver.py
@@ -66,6 +66,7 @@ class CompilerOptions:
         timing=False,
         output_dir="outputs",
         cpu_tensor_alignment=Tensor.AllocationQuantum,
+        hillclimb_max_iterations=None,
     ):
 
         self.verbose_graph = verbose_graph
@@ -84,6 +85,7 @@ class CompilerOptions:
         self.timing = timing
         self.output_dir = output_dir
         self.cpu_tensor_alignment = cpu_tensor_alignment
+        self.hillclimb_max_iterations = hillclimb_max_iterations
 
     def __str__(self):
         return type(self).__name__ + ": " + str(self.__dict__)
diff --git a/ethosu/vela/greedy_allocation.py b/ethosu/vela/greedy_allocation.py
index 6f4f801..f6d7a3a 100644
--- a/ethosu/vela/greedy_allocation.py
+++ b/ethosu/vela/greedy_allocation.py
@@ -19,11 +19,7 @@
 from . import numeric_util
 
 
 class GreedyAllocator:
-    def __init__(self, nng, arch, live_ranges, mem_area):
-        self.nng = nng
-        self.arch = arch
-        self.mem_area = mem_area
-
+    def __init__(self, live_ranges):
         self.live_ranges = live_ranges
         self.memory_required = 0
@@ -75,6 +71,6 @@ class GreedyAllocator:
         return self.memory_required
 
 
-def allocate_live_ranges(nng, arch, live_ranges, mem_area, alignment):
-    g = GreedyAllocator(nng, arch, live_ranges, mem_area)
+def allocate_live_ranges(live_ranges, alignment):
+    g = GreedyAllocator(live_ranges)
     return g.allocate_live_ranges(alignment)
diff --git a/ethosu/vela/hillclimb_allocation.py b/ethosu/vela/hillclimb_allocation.py
index 6d3003c..9c4dfb8 100644
--- a/ethosu/vela/hillclimb_allocation.py
+++ b/ethosu/vela/hillclimb_allocation.py
@@ -90,14 +90,17 @@ class HillClimbAllocator:
         if the new allocation is better: keep it, else set allocation to previous allocation
     """
 
-    MAX_ITERATIONS = 500
+    # Maximum number of iterations of the algorithm (can be overridden from the command line)
+    MAX_ITERATIONS = 99999
     NOT_ALLOCATED = -1
     # Used for live ranges allocated at address 0
     NO_PREDECESSOR = -1
     # Special handling if best solution has not improved during this many iterations
    MAX_ITERATIONS_STUCK = 50
+    # Minimum number of iterations since the last improvement (unless an optimal solution is found)
+    MIN_ITERATIONS_IMPROVE = 500
 
-    def __init__(self, live_ranges: List[LiveRange]):
+    def __init__(self, live_ranges: List[LiveRange], max_iterations: int, memory_limit: int):
         # Contains the live ranges
         self.lrs: List[LiveRangeInfo] = [
             LiveRangeInfo(id, lr.start_time, lr.end_time, lr.size, lr.get_alignment())
@@ -122,6 +125,18 @@ class HillClimbAllocator:
         size_at_time = [sum(lr.size for lr in self.lrs_at_time[t]) for t in range(nr_time_slots)]
         # The minimum possible size, assuming all live ranges can be perfectly allocated
         self.min_required_size: int = max(size_at_time)
+        # Set the maximum number of iterations the search will run to find a solution
+        if max_iterations is None:
+            self.max_iterations = self.MAX_ITERATIONS
+        else:
+            self.max_iterations = max_iterations
+        # Defines a memory limit that the allocation must meet
+        self.memory_limit = memory_limit
+        if self.memory_limit < self.min_required_size:
+            print(
+                f"Warning: Memory limit = {self.memory_limit} is less than the minimum possible allocation size ="
+                f" {self.min_required_size}"
+            )
         # Calculate all neighbours + the urgency of each live range
         for lr in self.lrs:
             lr.urgency = 0
@@ -272,13 +287,16 @@ class HillClimbAllocator:
         ix2 = turn_list[random.randint(0, len(turn_list) - 1)]
         indices[ix1], indices[ix2] = indices[ix2], indices[ix1]
 
-    def search(self, indices: List[int], iterations: int):
+    def search(self, indices: List[int]):
         """
         Search for a solution, using the given indices as initial solution.
""" best_indices = indices[:] last_improvement_iteration = 0 - for i in range(iterations): + i = 0 + while (self.best_size > self.memory_limit and i < self.max_iterations) or ( + i - last_improvement_iteration < self.MIN_ITERATIONS_IMPROVE + ): # Reorder the indices self.attempt_bottleneck_fix(indices, i - last_improvement_iteration) # Allocate the reordered indices and check if it gave an improvement @@ -296,6 +314,7 @@ class HillClimbAllocator: else: # The new allocation produced worse result undo the change indices = best_indices[:] + i += 1 def allocate(self) -> List[int]: """ @@ -316,17 +335,17 @@ class HillClimbAllocator: self.allocated_addresses = [lr.address for lr in self.lrs] if self.best_size > self.min_required_size: # Try to improve the heuristic allocation - self.search(indices, HillClimbAllocator.MAX_ITERATIONS) + self.search(indices) # else the heuristic allocation returned an optimal solution; no search needed return self.allocated_addresses -def allocate_live_ranges(lrs: List[LiveRange]) -> List[int]: +def allocate_live_ranges(lrs: List[LiveRange], max_iterations: int, memory_limit: int) -> List[int]: """ Allocates live ranges using a search based allocator. Returns the list of allocated addresses (one for each live range) """ if not lrs: return [] - allocator = HillClimbAllocator(lrs) + allocator = HillClimbAllocator(lrs, max_iterations, memory_limit) return allocator.allocate() diff --git a/ethosu/vela/register_command_stream_generator.py b/ethosu/vela/register_command_stream_generator.py index 0e68b14..2bdfda2 100644 --- a/ethosu/vela/register_command_stream_generator.py +++ b/ethosu/vela/register_command_stream_generator.py @@ -151,7 +151,7 @@ class CommandStreamEmitter: payload_mode = CmdMode(code & CmdMode.Mask) - s = f"0x{offset:06x}:" + s = f"{offset:#08x}:" if payload_mode == CmdMode.NoPayload: s += f" {'':8}" @@ -291,7 +291,11 @@ def check_mem_limits(memory_accesses: MemoryAccessSet, mem_limits: Dict[int, int if offset < 0: raise VelaError(f"Negative address offset: {offset}, region: {region}") if offset > max: - raise VelaError(f"Address offset out of range: {offset}, region: {region}, max: {max}") + raise VelaError( + f"Address offset out of range: {offset}, region: {region}, max: {max}. 
+                    f"Address offset out of range: {offset}, region: {region}, max: {max}. Perhaps try running"
+                    f" with the HillClimb tensor allocator and/or increasing the maximum iterations of that"
+                    f" allocator"
+                )
 
 
 def quantise(value: float, quant: Optional[NpuQuantization]) -> int:
diff --git a/ethosu/vela/scheduler.py b/ethosu/vela/scheduler.py
index 0b79402..1e31175 100644
--- a/ethosu/vela/scheduler.py
+++ b/ethosu/vela/scheduler.py
@@ -1312,6 +1312,7 @@ def _update_tensor_allocation(nng: Graph, arch: ArchitectureFeatures, options):
         tensor_allocator=options.tensor_allocator,
         verbose_allocation=options.verbose_allocation,
         cpu_tensor_alignment=options.cpu_tensor_alignment,
+        hillclimb_max_iterations=options.hillclimb_max_iterations,
     )
 
 
diff --git a/ethosu/vela/tensor_allocation.py b/ethosu/vela/tensor_allocation.py
index ab65740..1ffae4c 100644
--- a/ethosu/vela/tensor_allocation.py
+++ b/ethosu/vela/tensor_allocation.py
@@ -66,9 +66,11 @@ def linear_allocate_live_ranges(live_ranges, alloc_granularity=Tensor.Allocation
     return total_sz
 
 
-def hillclimb_allocate_live_ranges(live_ranges: LiveRangeGraph, alloc_granularity: int) -> int:
+def hillclimb_allocate_live_ranges(
+    live_ranges: LiveRangeGraph, alloc_granularity: int, max_iterations: int, mem_limit: int
+) -> int:
     # Allocates using the hill climb allocator
-    addresses = hillclimb_allocation.allocate_live_ranges(live_ranges.lrs)
+    addresses = hillclimb_allocation.allocate_live_ranges(live_ranges.lrs, max_iterations, mem_limit)
     # The result is a list containing the allocated addresses
     total_sz = 0
     for lr, address in zip(live_ranges.lrs, addresses):
@@ -144,7 +146,10 @@ def print_allocation(lrs, mem_area, mem_type_set, tensor_allocator, sg, actual_m
 
     memory_hist = memory_usage_histogram(lrs.lrs)
     min_mem_usage_for_alloc = max(memory_hist)
-    print("Start Time - End Time: Start Addr - End Addr: Tensor Size: Memory Usage: Tensor Purpose: Tensor Name")
+    print(
+        f"{'Start Time':>10s} - {'End Time':>10s}: {'Start Addr':>10s} - {'End Addr':>10s}: {'Tensor Size':>11s}:"
+        f" {'Memory Usage':>12s}: {'Purpose':12s}: Name"
+    )
     for start_time, end_time, size, start_addr, end_addr, purpose, name in sorted(
         (
             lr.start_time,
@@ -159,7 +164,7 @@ def print_allocation(lrs, mem_area, mem_type_set, tensor_allocator, sg, actual_m
     ):
         print(
             f"{start_time:10d} - {end_time:10d}: {start_addr:#10x} - {end_addr:#10x}: {size:11d}:"
-            f" {memory_hist[start_time]:12d}: {purpose.display_name():15s}: {name:s}"
+            f" {memory_hist[start_time]:12d}: {purpose.display_name():12s}: {name:s}"
         )
 
     alloc_overhead_fraction = (actual_mem_usage_for_alloc - min_mem_usage_for_alloc) / min_mem_usage_for_alloc
@@ -194,6 +199,7 @@
     tensor_allocator=TensorAllocator.Greedy,
     lr_graph=None,
     cpu_tensor_alignment=Tensor.AllocationQuantum,
+    hillclimb_max_iterations=None,
 ):
     # Allocates addresses to tensors, returns False if tensors could not be fit within max_size
     lrs = live_range.extract_live_ranges_from_cascaded_passes(
@@ -207,12 +213,14 @@
     if lrs.ranges:
         tens_alloc = tensor_allocator
         if tens_alloc == TensorAllocator.Greedy:
-            total_sz = greedy_allocate_live_ranges(sg, arch, lrs, mem_area, cpu_tensor_alignment)
+            total_sz = greedy_allocate_live_ranges(lrs, cpu_tensor_alignment)
             verify_allocation(lrs, cpu_tensor_alignment)
         elif tens_alloc == TensorAllocator.LinearAlloc:
             total_sz = linear_allocate_live_ranges(lrs, cpu_tensor_alignment)
         elif tens_alloc == TensorAllocator.HillClimb:
-            total_sz = hillclimb_allocate_live_ranges(lrs, cpu_tensor_alignment)
+            mem_type = MemType.Scratch_fast if MemType.Scratch_fast in mem_type_set else list(mem_type_set)[0]
+            mem_size = arch.mem_type_size(mem_type)
+            total_sz = hillclimb_allocate_live_ranges(lrs, cpu_tensor_alignment, hillclimb_max_iterations, mem_size)
         else:
             assert 0
     return lrs, total_sz
@@ -228,6 +236,7 @@ def allocate_tensors(
     verbose_allocation=False,
     lr_graph=None,
     cpu_tensor_alignment=Tensor.AllocationQuantum,
+    hillclimb_max_iterations=None,
     max_size=None,
     dry_test=False,
 ):
@@ -240,6 +249,7 @@
         tensor_allocator=tensor_allocator,
         lr_graph=lr_graph,
         cpu_tensor_alignment=cpu_tensor_alignment,
+        hillclimb_max_iterations=hillclimb_max_iterations,
     )
 
     if lrs.ranges:
diff --git a/ethosu/vela/test/test_hillclimb_allocation.py b/ethosu/vela/test/test_hillclimb_allocation.py
index 8a56c3f..03aec2b 100644
--- a/ethosu/vela/test/test_hillclimb_allocation.py
+++ b/ethosu/vela/test/test_hillclimb_allocation.py
@@ -54,10 +54,10 @@ def live_range(start_time, end_time, size):
 def test_allocate(lrs, expected_size):
     """Tests the search allocator"""
     lr_list = [live_range(start, end, size) for start, end, size in lrs]
-    res = allocate_live_ranges(lr_list)
+    res = allocate_live_ranges(lr_list, None, 1 << 32)
     assert len(res) == len(lrs)
     assert max(addr + lr[2] for addr, lr in zip(res, lrs)) == expected_size
 
 
 def test_allocate_empty_input():
-    assert [] == allocate_live_ranges([])
+    assert [] == allocate_live_ranges([], 0, 0)
diff --git a/ethosu/vela/vela.py b/ethosu/vela/vela.py
index b5f9f98..6207800 100644
--- a/ethosu/vela/vela.py
+++ b/ethosu/vela/vela.py
@@ -37,6 +37,7 @@ from .api import API_VERSION
 from .debug_database import DebugDatabase
 from .errors import InputFileError
 from .errors import VelaError
+from .hillclimb_allocation import HillClimbAllocator
 from .nn_graph import NetworkType
 from .nn_graph import TensorAllocator
 from .tensor import MemArea
@@ -439,6 +440,14 @@ def main(args=None):
         default=1000,
         help="Set the recursion depth limit, may result in RecursionError if too low (default: %(default)s)",
     )
+    parser.add_argument(
+        "--hillclimb-max-iterations",
+        type=int,
+        default=HillClimbAllocator.MAX_ITERATIONS,
+        help=(
+            "Set the maximum number of iterations the Hill Climb tensor allocator will run (default: %(default)s)"
+        ),
+    )
     args = parser.parse_args(args=args)
 
     # Generate the supported ops report and exit
@@ -523,6 +532,7 @@ def main(args=None):
         timing=args.timing,
         output_dir=args.output_dir,
         cpu_tensor_alignment=args.cpu_tensor_alignment,
+        hillclimb_max_iterations=args.hillclimb_max_iterations,
     )
 
     scheduler_options = scheduler.SchedulerOptions(
--
cgit v1.2.1
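
For readers who want the gist of the new stop condition without tracing the diff, the sketch below distils the search loop introduced in hillclimb_allocation.py. It is a simplified, standalone illustration and not the Vela implementation: the `propose` callback is a hypothetical stand-in for the reorder-and-reallocate step (`attempt_bottleneck_fix` plus the allocation call inside the loop), and only the names `best_size`, `memory_limit`, `max_iterations` and `MIN_ITERATIONS_IMPROVE` mirror the patch. With a hard constraint the loop keeps running until the best allocation fits within `memory_limit` or the iteration budget is exhausted; with a soft constraint it simply keeps running until `MIN_ITERATIONS_IMPROVE` iterations have passed without an improvement.

```python
# Simplified sketch of the Hill Climb stop condition added by this patch.
# Assumes a hypothetical propose() callback that perturbs the live-range
# order, reallocates, and returns the resulting peak memory usage.
from typing import Callable

MIN_ITERATIONS_IMPROVE = 500  # soft constraint: keep searching this long after the last improvement


def search(propose: Callable[[], int], initial_size: int, memory_limit: int, max_iterations: int) -> int:
    best_size = initial_size
    last_improvement_iteration = 0
    i = 0
    # Hard constraint: keep going while the allocation does not fit and the iteration budget remains.
    # Soft constraint: keep going while the last improvement is recent enough.
    while (best_size > memory_limit and i < max_iterations) or (
        i - last_improvement_iteration < MIN_ITERATIONS_IMPROVE
    ):
        size = propose()
        if size < best_size:
            # Keep the better allocation and reset the improvement window
            best_size = size
            last_improvement_iteration = i
        i += 1
    return best_size
```

For example, `search(lambda: 1024, 2048, memory_limit=1536, max_iterations=99999)` adopts the proposed 1024-byte allocation on the first iteration, then runs out the 500-iteration improvement window before returning 1024. The design choice this illustrates is the one described in the commit message: a hard limit only applies when the target memory area genuinely cannot be exceeded, while otherwise the search is bounded by how long it has gone without improving.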