From 226ecaf4561f421206d1593eac0fa57dd56db82e Mon Sep 17 00:00:00 2001 From: Louis Verhaard Date: Tue, 30 Mar 2021 10:18:28 +0200 Subject: Performance improvement in tensor allocation - Tensor allocation verification was O(N^2), is now closer to O(N) - Removed a sort in HillClimb allocator Change-Id: I286a269881490c485cc2b0eeab3b1ecffa8f3df0 Signed-off-by: Louis Verhaard --- ethosu/vela/greedy_allocation.py | 2 +- ethosu/vela/live_range.py | 4 +++ ethosu/vela/register_command_stream_generator.py | 1 - ethosu/vela/tensor_allocation.py | 38 ++++++++++---------- ethosu/vela/test/test_tensor_allocation.py | 46 ++++++++++++++++++++++++ 5 files changed, 70 insertions(+), 21 deletions(-) create mode 100644 ethosu/vela/test/test_tensor_allocation.py diff --git a/ethosu/vela/greedy_allocation.py b/ethosu/vela/greedy_allocation.py index 51b07805..c68a507d 100644 --- a/ethosu/vela/greedy_allocation.py +++ b/ethosu/vela/greedy_allocation.py @@ -60,7 +60,7 @@ class GreedyAllocator: def allocate_live_ranges(self, verbose_allocation, alignment): lrs = set() - for lr in self.live_ranges.ranges.values(): + for lr in self.live_ranges.lrs: lrs.add((lr.start_time, -lr.end_time, lr)) lrs = sorted(lrs) diff --git a/ethosu/vela/live_range.py b/ethosu/vela/live_range.py index 0cc89e27..de001e56 100644 --- a/ethosu/vela/live_range.py +++ b/ethosu/vela/live_range.py @@ -16,6 +16,8 @@ # Description: # Build a live range graph for tensors in one or more subgraphs. Used for tensor allocation as well as in the scheduler. # Can work with either a pass packed subgraph or a scheduled subgraph. +from typing import List + from .nn_graph import PassPlacement from .operation import Op from .tensor import MemType @@ -99,6 +101,7 @@ class LiveRange: class LiveRangeGraph: def __init__(self): + self.lrs: List[LiveRange] = [] # List of all created ranges self.ranges = {} # tens -> range self.ignore_tensors = set() self.processed_subgraphs = set() @@ -114,6 +117,7 @@ class LiveRangeGraph: # No live range found for the tensor, create a new one rng = LiveRange(tens, alignment) self.ranges[tens] = rng + self.lrs.append(rng) return rng def fuse_ranges(self, in_tens, out_tens): diff --git a/ethosu/vela/register_command_stream_generator.py b/ethosu/vela/register_command_stream_generator.py index 65801b82..733be590 100644 --- a/ethosu/vela/register_command_stream_generator.py +++ b/ethosu/vela/register_command_stream_generator.py @@ -86,7 +86,6 @@ from .register_command_stream_util import Watermark from .shared_buffer_allocation import find_suitable_block_configs from .shared_buffer_allocation import shared_buffer_allocation_for_npu_op from .shared_buffer_allocation import SharedBufferAllocation -from ethosu.vela.errors import VelaError class RegisterMachine: diff --git a/ethosu/vela/tensor_allocation.py b/ethosu/vela/tensor_allocation.py index b2ea7de6..0ad30e5f 100644 --- a/ethosu/vela/tensor_allocation.py +++ b/ethosu/vela/tensor_allocation.py @@ -17,6 +17,7 @@ # Wrapping function to do tensor address allocation. That is, assigning addresses to tensors based on what has been # worked out from the allowable overlaps that are calculated by the live range analysis. import math +from typing import List import numpy as np @@ -25,6 +26,7 @@ from . import live_range from . import numeric_util from .errors import AllocationError from .greedy_allocation import allocate_live_ranges as greedy_allocate_live_ranges +from .live_range import LiveRange from .live_range import LiveRangeGraph from .nn_graph import TensorAllocator from .tensor import MemArea @@ -65,14 +67,10 @@ def linear_allocate_live_ranges(live_ranges, alloc_granularity=Tensor.Allocation def hillclimb_allocate_live_ranges(live_ranges: LiveRangeGraph, alloc_granularity: int) -> int: # Allocates using the hill climb allocator - lr_set = {(lr.start_time, lr.end_time, lr) for lr in live_ranges.ranges.values()} - lr_list = [lr for _, _, lr in lr_set] - lr_list.sort() - - addresses = hillclimb_allocation.allocate_live_ranges(lr_list) + addresses = hillclimb_allocation.allocate_live_ranges(live_ranges.lrs) # The result is a list containing the allocated addresses total_sz = 0 - for lr, address in zip(lr_list, addresses): + for lr, address in zip(live_ranges.lrs, addresses): total_sz = max(total_sz, address + lr.size) lr.set_address(address) verify_allocation(live_ranges, alloc_granularity) @@ -80,7 +78,7 @@ def hillclimb_allocate_live_ranges(live_ranges: LiveRangeGraph, alloc_granularit def verify_alignment(live_ranges: LiveRangeGraph, alignment: int): - for lr in live_ranges.ranges.values(): + for lr in live_ranges.lrs: for tens in lr.tensors: if not all(op and op.run_on_npu for op in tens.ops + tens.consumer_list): # This is a CPU tensor, verify alignment @@ -89,12 +87,16 @@ def verify_alignment(live_ranges: LiveRangeGraph, alignment: int): def verify_allocation(live_ranges: LiveRangeGraph, alignment: int): - lrs = list(live_ranges.ranges.values()) - for n in lrs: - verify_alignment(live_ranges, alignment) - - for m in lrs: - if n != m and n.overlaps_ranges(m): + verify_alignment(live_ranges, alignment) + nr_time_slots = 1 + max(lr.end_time for lr in live_ranges.lrs) + # Contains active live ranges at each timestamp + lrs_at_time = [[] for i in range(nr_time_slots)] + for lr in live_ranges.lrs: + for t in range(lr.start_time, lr.end_time + 1): + lrs_at_time[t].append(lr) + for t in range(nr_time_slots): + for ix, n in enumerate(lrs_at_time[t]): + for m in lrs_at_time[t][ix + 1 :]: overlap, tens_n, tens_m = n.overlaps_address(m) if overlap and not (tens_n.equivalent(tens_m) and tens_n.address == tens_m.address): raise AllocationError( @@ -142,11 +144,9 @@ def print_allocation(lrs, mem_area, mem_type_set, sg, verbose_allocation): print() -def calculate_allocation_efficiency(lrs): - lr_set = set(lrs.ranges.values()) - - size_at_time = [0] * (1 + max(lr.end_time for lr in lr_set)) - for lr in lr_set: +def calculate_allocation_efficiency(lrs: List[LiveRange]): + size_at_time = [0] * (1 + max(lr.end_time for lr in lrs)) + for lr in lrs: for t in range(lr.start_time, lr.end_time + 1): size_at_time[t] += lr.size @@ -210,7 +210,7 @@ def allocate_tensors( print_allocation(lrs, mem_area, mem_type_set, sg, verbose_allocation) if mem_area == MemArea.Sram: - sg.min_mem_usage = calculate_allocation_efficiency(lrs) + sg.min_mem_usage = calculate_allocation_efficiency(lrs.lrs) # Mark Sram usage for all subgraphs for sg_ in nng.subgraphs: mark_sram_used_for_cascaded_passes(sg_, lrs) diff --git a/ethosu/vela/test/test_tensor_allocation.py b/ethosu/vela/test/test_tensor_allocation.py new file mode 100644 index 00000000..f437bbb4 --- /dev/null +++ b/ethosu/vela/test/test_tensor_allocation.py @@ -0,0 +1,46 @@ +# Copyright (C) 2021 Arm Limited or its affiliates. All rights reserved. +# +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# Description: +# Contains unit tests for tensor allocation +import pytest + +from ethosu.vela.data_type import DataType +from ethosu.vela.errors import AllocationError +from ethosu.vela.live_range import LiveRangeGraph +from ethosu.vela.tensor import Tensor +from ethosu.vela.tensor_allocation import verify_allocation + + +def test_verify_allocation(): + # Create live range graph with 2 live ranges with overlapping start/end time + lr_graph = LiveRangeGraph() + t1 = Tensor([1, 100, 10, 10], DataType.int8, "t1") + lr1 = lr_graph.get_or_create_range(t1) + lr1.mark_usage(4) + lr1.mark_usage(8) + t2 = Tensor([1, 10, 20, 10], DataType.int8, "t2") + lr2 = lr_graph.get_or_create_range(t2) + # Set overlapping addresses, should lead to verification failure + lr1.set_address(16) + lr2.set_address(32) + lr2.mark_usage(7) + lr2.mark_usage(12) + with pytest.raises(AllocationError): + verify_allocation(lr_graph, 16) + # Set non-overlapping addresses, verification should now succeed + lr2.set_address(None) + lr2.set_address(160000) + verify_allocation(lr_graph, 16) -- cgit v1.2.1