aboutsummaryrefslogtreecommitdiff
path: root/ethosu/vela/tensor_allocation.py
diff options
context:
space:
mode:
authorLouis Verhaard <louis.verhaard@arm.com>2021-03-30 10:18:28 +0200
committerLouis Verhaard <louis.verhaard@arm.com>2021-03-30 16:15:11 +0200
commit226ecaf4561f421206d1593eac0fa57dd56db82e (patch)
tree539c69b7f61d0d9452ec1554c861677dfb6a480e /ethosu/vela/tensor_allocation.py
parent3438c929528583bc019055ad7057c08271b0cee7 (diff)
downloadethos-u-vela-226ecaf4561f421206d1593eac0fa57dd56db82e.tar.gz
Performance improvement in tensor allocation
- Tensor allocation verification was O(N^2), is now closer to O(N)
- Removed a sort in HillClimb allocator

Change-Id: I286a269881490c485cc2b0eeab3b1ecffa8f3df0
Signed-off-by: Louis Verhaard <louis.verhaard@arm.com>
Diffstat (limited to 'ethosu/vela/tensor_allocation.py')
-rw-r--r-- ethosu/vela/tensor_allocation.py 38
1 file changed, 19 insertions, 19 deletions
diff --git a/ethosu/vela/tensor_allocation.py b/ethosu/vela/tensor_allocation.py
index b2ea7de6..0ad30e5f 100644
--- a/ethosu/vela/tensor_allocation.py
+++ b/ethosu/vela/tensor_allocation.py
@@ -17,6 +17,7 @@
# Wrapping function to do tensor address allocation. That is, assigning addresses to tensors based on what has been
# worked out from the allowable overlaps that are calculated by the live range analysis.
import math
+from typing import List
import numpy as np
@@ -25,6 +26,7 @@ from . import live_range
from . import numeric_util
from .errors import AllocationError
from .greedy_allocation import allocate_live_ranges as greedy_allocate_live_ranges
+from .live_range import LiveRange
from .live_range import LiveRangeGraph
from .nn_graph import TensorAllocator
from .tensor import MemArea
@@ -65,14 +67,10 @@ def linear_allocate_live_ranges(live_ranges, alloc_granularity=Tensor.Allocation
def hillclimb_allocate_live_ranges(live_ranges: LiveRangeGraph, alloc_granularity: int) -> int:
# Allocates using the hill climb allocator
- lr_set = {(lr.start_time, lr.end_time, lr) for lr in live_ranges.ranges.values()}
- lr_list = [lr for _, _, lr in lr_set]
- lr_list.sort()
-
- addresses = hillclimb_allocation.allocate_live_ranges(lr_list)
+ addresses = hillclimb_allocation.allocate_live_ranges(live_ranges.lrs)
# The result is a list containing the allocated addresses
total_sz = 0
- for lr, address in zip(lr_list, addresses):
+ for lr, address in zip(live_ranges.lrs, addresses):
total_sz = max(total_sz, address + lr.size)
lr.set_address(address)
verify_allocation(live_ranges, alloc_granularity)
@@ -80,7 +78,7 @@ def hillclimb_allocate_live_ranges(live_ranges: LiveRangeGraph, alloc_granularit
def verify_alignment(live_ranges: LiveRangeGraph, alignment: int):
- for lr in live_ranges.ranges.values():
+ for lr in live_ranges.lrs:
for tens in lr.tensors:
if not all(op and op.run_on_npu for op in tens.ops + tens.consumer_list):
# This is a CPU tensor, verify alignment
@@ -89,12 +87,16 @@ def verify_alignment(live_ranges: LiveRangeGraph, alignment: int):
def verify_allocation(live_ranges: LiveRangeGraph, alignment: int):
- lrs = list(live_ranges.ranges.values())
- for n in lrs:
- verify_alignment(live_ranges, alignment)
-
- for m in lrs:
- if n != m and n.overlaps_ranges(m):
+ verify_alignment(live_ranges, alignment)
+ nr_time_slots = 1 + max(lr.end_time for lr in live_ranges.lrs)
+ # Contains active live ranges at each timestamp
+ lrs_at_time = [[] for i in range(nr_time_slots)]
+ for lr in live_ranges.lrs:
+ for t in range(lr.start_time, lr.end_time + 1):
+ lrs_at_time[t].append(lr)
+ for t in range(nr_time_slots):
+ for ix, n in enumerate(lrs_at_time[t]):
+ for m in lrs_at_time[t][ix + 1 :]:
overlap, tens_n, tens_m = n.overlaps_address(m)
if overlap and not (tens_n.equivalent(tens_m) and tens_n.address == tens_m.address):
raise AllocationError(
@@ -142,11 +144,9 @@ def print_allocation(lrs, mem_area, mem_type_set, sg, verbose_allocation):
print()
-def calculate_allocation_efficiency(lrs):
- lr_set = set(lrs.ranges.values())
-
- size_at_time = [0] * (1 + max(lr.end_time for lr in lr_set))
- for lr in lr_set:
+def calculate_allocation_efficiency(lrs: List[LiveRange]):
+ size_at_time = [0] * (1 + max(lr.end_time for lr in lrs))
+ for lr in lrs:
for t in range(lr.start_time, lr.end_time + 1):
size_at_time[t] += lr.size
@@ -210,7 +210,7 @@ def allocate_tensors(
print_allocation(lrs, mem_area, mem_type_set, sg, verbose_allocation)
if mem_area == MemArea.Sram:
- sg.min_mem_usage = calculate_allocation_efficiency(lrs)
+ sg.min_mem_usage = calculate_allocation_efficiency(lrs.lrs)
# Mark Sram usage for all subgraphs
for sg_ in nng.subgraphs:
mark_sram_used_for_cascaded_passes(sg_, lrs)