aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorLouis Verhaard <louis.verhaard@arm.com>2021-03-30 10:18:28 +0200
committerLouis Verhaard <louis.verhaard@arm.com>2021-03-30 16:15:11 +0200
commit226ecaf4561f421206d1593eac0fa57dd56db82e (patch)
tree539c69b7f61d0d9452ec1554c861677dfb6a480e
parent3438c929528583bc019055ad7057c08271b0cee7 (diff)
downloadethos-u-vela-226ecaf4561f421206d1593eac0fa57dd56db82e.tar.gz
Performance improvement in tensor allocation
- Tensor allocation verification was O(N^2), is now closer to O(N) - Removed a sort in HillClimb allocator Change-Id: I286a269881490c485cc2b0eeab3b1ecffa8f3df0 Signed-off-by: Louis Verhaard <louis.verhaard@arm.com>
-rw-r--r--ethosu/vela/greedy_allocation.py2
-rw-r--r--ethosu/vela/live_range.py4
-rw-r--r--ethosu/vela/register_command_stream_generator.py1
-rw-r--r--ethosu/vela/tensor_allocation.py38
-rw-r--r--ethosu/vela/test/test_tensor_allocation.py46
5 files changed, 70 insertions, 21 deletions
diff --git a/ethosu/vela/greedy_allocation.py b/ethosu/vela/greedy_allocation.py
index 51b07805..c68a507d 100644
--- a/ethosu/vela/greedy_allocation.py
+++ b/ethosu/vela/greedy_allocation.py
@@ -60,7 +60,7 @@ class GreedyAllocator:
def allocate_live_ranges(self, verbose_allocation, alignment):
lrs = set()
- for lr in self.live_ranges.ranges.values():
+ for lr in self.live_ranges.lrs:
lrs.add((lr.start_time, -lr.end_time, lr))
lrs = sorted(lrs)
diff --git a/ethosu/vela/live_range.py b/ethosu/vela/live_range.py
index 0cc89e27..de001e56 100644
--- a/ethosu/vela/live_range.py
+++ b/ethosu/vela/live_range.py
@@ -16,6 +16,8 @@
# Description:
# Build a live range graph for tensors in one or more subgraphs. Used for tensor allocation as well as in the scheduler.
# Can work with either a pass packed subgraph or a scheduled subgraph.
+from typing import List
+
from .nn_graph import PassPlacement
from .operation import Op
from .tensor import MemType
@@ -99,6 +101,7 @@ class LiveRange:
class LiveRangeGraph:
def __init__(self):
+ self.lrs: List[LiveRange] = [] # List of all created ranges
self.ranges = {} # tens -> range
self.ignore_tensors = set()
self.processed_subgraphs = set()
@@ -114,6 +117,7 @@ class LiveRangeGraph:
# No live range found for the tensor, create a new one
rng = LiveRange(tens, alignment)
self.ranges[tens] = rng
+ self.lrs.append(rng)
return rng
def fuse_ranges(self, in_tens, out_tens):
diff --git a/ethosu/vela/register_command_stream_generator.py b/ethosu/vela/register_command_stream_generator.py
index 65801b82..733be590 100644
--- a/ethosu/vela/register_command_stream_generator.py
+++ b/ethosu/vela/register_command_stream_generator.py
@@ -86,7 +86,6 @@ from .register_command_stream_util import Watermark
from .shared_buffer_allocation import find_suitable_block_configs
from .shared_buffer_allocation import shared_buffer_allocation_for_npu_op
from .shared_buffer_allocation import SharedBufferAllocation
-from ethosu.vela.errors import VelaError
class RegisterMachine:
diff --git a/ethosu/vela/tensor_allocation.py b/ethosu/vela/tensor_allocation.py
index b2ea7de6..0ad30e5f 100644
--- a/ethosu/vela/tensor_allocation.py
+++ b/ethosu/vela/tensor_allocation.py
@@ -17,6 +17,7 @@
# Wrapping function to do tensor address allocation. That is, assigning addresses to tensors based on what has been
# worked out from the allowable overlaps that are calculated by the live range analysis.
import math
+from typing import List
import numpy as np
@@ -25,6 +26,7 @@ from . import live_range
from . import numeric_util
from .errors import AllocationError
from .greedy_allocation import allocate_live_ranges as greedy_allocate_live_ranges
+from .live_range import LiveRange
from .live_range import LiveRangeGraph
from .nn_graph import TensorAllocator
from .tensor import MemArea
@@ -65,14 +67,10 @@ def linear_allocate_live_ranges(live_ranges, alloc_granularity=Tensor.Allocation
def hillclimb_allocate_live_ranges(live_ranges: LiveRangeGraph, alloc_granularity: int) -> int:
# Allocates using the hill climb allocator
- lr_set = {(lr.start_time, lr.end_time, lr) for lr in live_ranges.ranges.values()}
- lr_list = [lr for _, _, lr in lr_set]
- lr_list.sort()
-
- addresses = hillclimb_allocation.allocate_live_ranges(lr_list)
+ addresses = hillclimb_allocation.allocate_live_ranges(live_ranges.lrs)
# The result is a list containing the allocated addresses
total_sz = 0
- for lr, address in zip(lr_list, addresses):
+ for lr, address in zip(live_ranges.lrs, addresses):
total_sz = max(total_sz, address + lr.size)
lr.set_address(address)
verify_allocation(live_ranges, alloc_granularity)
@@ -80,7 +78,7 @@ def hillclimb_allocate_live_ranges(live_ranges: LiveRangeGraph, alloc_granularit
def verify_alignment(live_ranges: LiveRangeGraph, alignment: int):
- for lr in live_ranges.ranges.values():
+ for lr in live_ranges.lrs:
for tens in lr.tensors:
if not all(op and op.run_on_npu for op in tens.ops + tens.consumer_list):
# This is a CPU tensor, verify alignment
@@ -89,12 +87,16 @@ def verify_alignment(live_ranges: LiveRangeGraph, alignment: int):
def verify_allocation(live_ranges: LiveRangeGraph, alignment: int):
- lrs = list(live_ranges.ranges.values())
- for n in lrs:
- verify_alignment(live_ranges, alignment)
-
- for m in lrs:
- if n != m and n.overlaps_ranges(m):
+ verify_alignment(live_ranges, alignment)
+ nr_time_slots = 1 + max(lr.end_time for lr in live_ranges.lrs)
+ # Contains active live ranges at each timestamp
+ lrs_at_time = [[] for i in range(nr_time_slots)]
+ for lr in live_ranges.lrs:
+ for t in range(lr.start_time, lr.end_time + 1):
+ lrs_at_time[t].append(lr)
+ for t in range(nr_time_slots):
+ for ix, n in enumerate(lrs_at_time[t]):
+ for m in lrs_at_time[t][ix + 1 :]:
overlap, tens_n, tens_m = n.overlaps_address(m)
if overlap and not (tens_n.equivalent(tens_m) and tens_n.address == tens_m.address):
raise AllocationError(
@@ -142,11 +144,9 @@ def print_allocation(lrs, mem_area, mem_type_set, sg, verbose_allocation):
print()
-def calculate_allocation_efficiency(lrs):
- lr_set = set(lrs.ranges.values())
-
- size_at_time = [0] * (1 + max(lr.end_time for lr in lr_set))
- for lr in lr_set:
+def calculate_allocation_efficiency(lrs: List[LiveRange]):
+ size_at_time = [0] * (1 + max(lr.end_time for lr in lrs))
+ for lr in lrs:
for t in range(lr.start_time, lr.end_time + 1):
size_at_time[t] += lr.size
@@ -210,7 +210,7 @@ def allocate_tensors(
print_allocation(lrs, mem_area, mem_type_set, sg, verbose_allocation)
if mem_area == MemArea.Sram:
- sg.min_mem_usage = calculate_allocation_efficiency(lrs)
+ sg.min_mem_usage = calculate_allocation_efficiency(lrs.lrs)
# Mark Sram usage for all subgraphs
for sg_ in nng.subgraphs:
mark_sram_used_for_cascaded_passes(sg_, lrs)
diff --git a/ethosu/vela/test/test_tensor_allocation.py b/ethosu/vela/test/test_tensor_allocation.py
new file mode 100644
index 00000000..f437bbb4
--- /dev/null
+++ b/ethosu/vela/test/test_tensor_allocation.py
@@ -0,0 +1,46 @@
+# Copyright (C) 2021 Arm Limited or its affiliates. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the License); you may
+# not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an AS IS BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# Description:
+# Contains unit tests for tensor allocation
+import pytest
+
+from ethosu.vela.data_type import DataType
+from ethosu.vela.errors import AllocationError
+from ethosu.vela.live_range import LiveRangeGraph
+from ethosu.vela.tensor import Tensor
+from ethosu.vela.tensor_allocation import verify_allocation
+
+
+def test_verify_allocation():
+ # Create live range graph with 2 live ranges with overlapping start/end time
+ lr_graph = LiveRangeGraph()
+ t1 = Tensor([1, 100, 10, 10], DataType.int8, "t1")
+ lr1 = lr_graph.get_or_create_range(t1)
+ lr1.mark_usage(4)
+ lr1.mark_usage(8)
+ t2 = Tensor([1, 10, 20, 10], DataType.int8, "t2")
+ lr2 = lr_graph.get_or_create_range(t2)
+ # Set overlapping addresses, should lead to verification failure
+ lr1.set_address(16)
+ lr2.set_address(32)
+ lr2.mark_usage(7)
+ lr2.mark_usage(12)
+ with pytest.raises(AllocationError):
+ verify_allocation(lr_graph, 16)
+ # Set non-overlapping addresses, verification should now succeed
+ lr2.set_address(None)
+ lr2.set_address(160000)
+ verify_allocation(lr_graph, 16)