aboutsummaryrefslogtreecommitdiff
path: root/ethosu/vela/hillclimb_allocation.py
diff options
context:
space:
mode:
Diffstat (limited to 'ethosu/vela/hillclimb_allocation.py')
-rw-r--r--ethosu/vela/hillclimb_allocation.py310
1 files changed, 310 insertions, 0 deletions
diff --git a/ethosu/vela/hillclimb_allocation.py b/ethosu/vela/hillclimb_allocation.py
new file mode 100644
index 00000000..de53ab83
--- /dev/null
+++ b/ethosu/vela/hillclimb_allocation.py
@@ -0,0 +1,310 @@
+# Copyright (C) 2021 Arm Limited or its affiliates. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the License); you may
+# not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an AS IS BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# Description:
+# Tensor allocator based on a hill-climb search
+import random
+from typing import List
+from typing import Set
+
+from .live_range import LiveRange
+
+
+class LiveRangeInfo:
+ def __init__(self, id: int, start_time: int, end_time: int, size: int):
+ # Index of this live range
+ self.id = id
+ # Start time (input to the allocator algorithm)
+ self.start_time = start_time
+ # End time, inclusive (input to the allocator algorithm)
+ self.end_time = end_time
+ # Size in bytes (input to the allocator algorithm)
+ self.size = size
+ # Allocated address (the main output from the allocator algorithm)
+ self.address: int = 0
+ # End address, exclusive
+ self.end_address: int = 0
+ # id of predecessor live range (predecessor's end address == this lr's address)
+ self.predecessor: int = 0
+ # Turn at which the live range was allocated
+ self.turn: int = 0
+ # Max value of size_at_time (only used in the heuristic allocation)
+ self.urgency = 0
+ self.neighbours: List["LiveRangeInfo"] = []
+
+ def overlaps(self, addr2: int, size2: int) -> int:
+ return self.address < addr2 + size2 and addr2 < self.end_address
+
+ def is_neighbour(self, lr: "LiveRangeInfo") -> bool:
+ return self.start_time <= lr.end_time and lr.start_time <= self.end_time
+
+ def __str__(self):
+ return "<LiveRangeInfo: id={}, start_time={}, end_time={}, size={}, address={}>".format(
+ self.id, self.start_time, self.end_time, self.size, self.address
+ )
+
+ def __lt__(self, other) -> bool:
+ if self.urgency != other.urgency:
+ return self.urgency > other.urgency
+ duration1 = self.end_time - self.start_time
+ duration2 = other.end_time - other.start_time
+ if duration1 != duration2:
+ return duration1 > duration2
+
+ if self.start_time != other.start_time:
+ return self.start_time < other.start_time
+
+ if self.size != other.size:
+ return self.size > other.size
+
+ return self.id < other.id
+
+
+class HillClimbAllocator:
+ """
+ Implements tensor allocator using a hill climb search.
+
+ The basic algorithm is:
+
+ Use a heuristic allocator to find an initial allocation
+ while allocation is not optimal and iterations < MAX_ITERATIONS:
+ find the "bottleneck": the live range with highest end address
+ find all live ranges that affected the allocation of the bottleneck
+ swap the order of any two affecting live ranges
+ reallocate tensors using the reordered live ranges
+ if the new allocation is better: keep it, else set allocation to previous allocation
+ """
+
+ MAX_ITERATIONS = 500
+ NOT_ALLOCATED = -1
+ # Used for live ranges allocated at address 0
+ NO_PREDECESSOR = -1
+
+ def __init__(self, live_ranges: List[LiveRange]):
+ # Contains the live ranges
+ self.lrs: List[LiveRangeInfo] = [
+ LiveRangeInfo(id, lr.start_time, lr.end_time, lr.size) for id, lr in enumerate(live_ranges)
+ ]
+ self.lrs_at_time = []
+ # The available size (input to algorithm).
+ self.available_size: int = 0
+ # The algorithm stops once the target size has been achieved
+ self.target_size: int = 0
+ # The highest end address of the best found allocation
+ self.best_size: int = 1 << 63
+ # For each live range: max value of size_at_time (only used in the heuristic allocation)
+ self.lr_urgency = len(self.lrs) * [0]
+ nr_time_slots = 1 + max(lr.end_time for lr in self.lrs)
+ # Contains active live ranges at each timestamp
+ self.lrs_at_time = [[] for i in range(nr_time_slots)]
+ for lr in self.lrs:
+ for t in range(lr.start_time, lr.end_time + 1):
+ self.lrs_at_time[t].append(lr)
+ # At each timestamp: accumulated size of active live ranges
+ size_at_time = [sum(lr.size for lr in self.lrs_at_time[t]) for t in range(nr_time_slots)]
+ # The minimum possible size, assuming all live ranges can be perfectly allocated
+ self.min_required_size: int = max(size_at_time)
+ # Calculate all neighbours + the urgency of each live range
+ for lr in self.lrs:
+ lr.urgency = 0
+ lr.neighbours = []
+ neighbours = set()
+ for t in range(lr.start_time, lr.end_time + 1):
+ lr.urgency = max(size_at_time[t], lr.urgency)
+ for lr2 in self.lrs_at_time[t]:
+ if lr2 not in neighbours and lr != lr2:
+ neighbours.add(lr2)
+ lr.neighbours.append(lr2)
+
+ def allocate_lr(self, lr: LiveRangeInfo):
+ """
+ Allocates the given live range at the smallest possible address
+ """
+ address = 0
+ predecessor = HillClimbAllocator.NO_PREDECESSOR
+ fits = False
+ while not fits:
+ fits = True
+ # Find neighbours that overlap with address
+ for lr2 in lr.neighbours:
+ if lr2.address == HillClimbAllocator.NOT_ALLOCATED or lr2.end_address <= address:
+ continue
+ if lr2.overlaps(address, lr.size):
+ # Overlap found increase address
+ fits = False
+ address = lr2.end_address
+ predecessor = lr2.id
+ lr.address = address
+ lr.end_address = address + lr.size
+ lr.predecessor = predecessor
+
+ def allocate_indices(self, indices: List[int]):
+ """
+ Allocates the live ranges in the order indicated by the indices;
+ allocates each live range at the lowest possible address.
+ """
+ for lr in self.lrs:
+ lr.address = HillClimbAllocator.NOT_ALLOCATED
+ size = 0
+ for turn, index in enumerate(indices):
+ lr = self.lrs[index]
+ self.allocate_lr(lr)
+ lr.turn = turn
+ size = max(size, lr.end_address)
+ if size > self.best_size:
+ # This allocation is worse than the best known allocation;
+ # no need to continue
+ break
+ return size
+
+ def add_predecessor_turns(self, turn_set: Set[int], turn_list: List[int], lr: LiveRangeInfo):
+ """
+ Adds the given live range + predecessors to the turns vector.
+ Note: the turn_set is for quick detection of duplicates,
+ the turn_list is to get reproduceable results
+ """
+ if lr.turn not in turn_set:
+ turn_set.add(lr.turn)
+ turn_list.append(lr.turn)
+ id = lr.id
+ while self.lrs[id].predecessor != HillClimbAllocator.NO_PREDECESSOR:
+ id = self.lrs[id].predecessor
+ turn = self.lrs[id].turn
+ if turn not in turn_set:
+ turn_set.add(turn)
+ turn_list.append(turn)
+
+ def attempt_bottleneck_fix(self, indices: List[int]):
+ """
+ Finds the "bottleneck", the live range with highest end address, and reorders the indices
+ such that a next allocation might lower the memory usage.
+
+ ---------
+ | |
+ | D |
+ | |
+ ----------------------------------
+ | B |
+ -------------------------------
+ | |
+ |A| ---
+ | | |C|
+ | | | |
+ ---------------------------------------
+
+ In the above example, the allocation order was [A, B, C, D] and D is the resulting bottle-neck.
+ The live ranges that affected the allocation of D are the direct neighbours of D (i.e. B and C),
+ and all direct and indirect predecessors of D and its neighbours
+ (i.e. A, which is the predecessor of B, and indirect predecessor of D).
+
+ By permuting the order in which the affecting live ranges are allocated, the bottleneck might
+ be lowered. In the above example, almost any permutation would lower the bottleneck.
+ """
+ # Find the bottleneck
+ max_lr = self.lrs[0]
+ for lr in self.lrs[1:]:
+ if lr.end_address > max_lr.end_address:
+ max_lr = lr
+
+ # Find all live ranges that affected the placement of the bottleneck live range.
+ # This consists of two types of live ranges:
+ # - direct neighbours of the bottleneck live range
+ # - direct and indirect predecessors of these neighbours + bottleneck
+ # The turns at which these live ranges were allocated are put in the turns set.
+ turn_set = set()
+ turn_list = list()
+ self.add_predecessor_turns(turn_set, turn_list, max_lr)
+ for lr2 in max_lr.neighbours:
+ self.add_predecessor_turns(turn_set, turn_list, lr2)
+
+ # Non-direct neighbours that interfere with the allocation of the bottleneck are the
+ # immediate cause for gaps in the allocation, and are selected with higher probability.
+ non_nb_turn_list = []
+ for turn in turn_list:
+ lr = self.lrs[indices[turn]]
+ if not max_lr.is_neighbour(lr):
+ non_nb_turn_list.append(turn)
+ assert turn_list
+ # Pick from non-neighbour list with 30% probability
+ # (magic number based on tuning)
+ if random.randint(0, 100) < 30 and non_nb_turn_list:
+ # Pick a live range from the "non-neighbour list"
+ ix1 = non_nb_turn_list[random.randint(0, len(non_nb_turn_list) - 1)]
+ else:
+ # Pick any affecting live range.
+ ix1 = turn_list[random.randint(0, len(turn_list) - 1)]
+
+ ix2 = turn_list[random.randint(0, len(turn_list) - 2)]
+ if ix1 == ix2:
+ ix2 = turn_list[-1]
+ # Swap indices
+ indices[ix1], indices[ix2] = indices[ix2], indices[ix1]
+
+ def search(self, indices: List[int], iterations: int):
+ """
+ Search for a solution, using the given indices as initial solution.
+ """
+ best_indices = indices[:]
+ for _ in range(iterations):
+ # Reorder the indices
+ self.attempt_bottleneck_fix(indices)
+ # Allocate the reordered indices and check if it gave an improvement
+ new_size = self.allocate_indices(indices)
+ if new_size <= self.best_size:
+ # The new allocation produced a new best result remember it
+ self.best_size = new_size
+ best_indices = indices[:]
+ self.allocated_addresses = [lr.address for lr in self.lrs]
+ if self.best_size <= self.min_required_size:
+ # Target reached stop
+ return
+ else:
+ # The new allocation produced worse result undo the change
+ indices = best_indices[:]
+
+ def allocate(self) -> List[int]:
+ """
+ Runs the allocation algorithm. Finishes when an optimal solution has been
+ found or when maximum iterations have been run.
+ The allocated addresses are placed in the output vector, in the same
+ order as the input vector.
+
+ Implementation note: the algorithm produces reproduceable results by using
+ a well-defined random number generator with well-defined default seed,
+ and using a fixed number of iterations.
+ """
+ random.seed(1)
+ # Sort indices on priority. Note: self.lrs must be left unchanged
+ indices = [lr.id for lr in sorted(self.lrs)]
+ # Allocate the initial solution
+ self.best_size = self.allocate_indices(indices)
+ self.allocated_addresses = [lr.address for lr in self.lrs]
+ if self.best_size > self.min_required_size:
+ # Try to improve the heuristic allocation
+ self.search(indices, HillClimbAllocator.MAX_ITERATIONS)
+ # else the heuristic allocation returned an optimal solution; no search needed
+ return self.allocated_addresses
+
+
+def allocate_live_ranges(lrs: List[LiveRange]) -> List[int]:
+ """
+ Allocates live ranges using a search based allocator.
+ Returns the list of allocated addresses (one for each live range)
+ """
+ if not lrs:
+ return []
+ allocator = HillClimbAllocator(lrs)
+ return allocator.allocate()