From 79d07d2cbf1c5013ab40bb46a6ccd4c569966536 Mon Sep 17 00:00:00 2001
From: Tim Hall
Date: Mon, 27 Apr 2020 18:20:16 +0100
Subject: Add Vela codebase

 - Added modules ethosu.vela and ethosu.mlw_codec.
 - Added README and various configuration files.

Change-Id: I3690f8c8f5966306ecddaeb2793c30ca9c6e2eee
---
 ethosu/vela/tensor_allocation.py | 139 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 139 insertions(+)
 create mode 100644 ethosu/vela/tensor_allocation.py

diff --git a/ethosu/vela/tensor_allocation.py b/ethosu/vela/tensor_allocation.py
new file mode 100644
index 00000000..94aa6088
--- /dev/null
+++ b/ethosu/vela/tensor_allocation.py
@@ -0,0 +1,139 @@
+# Copyright (C) 2020 Arm Limited or its affiliates. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the License); you may
+# not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an AS IS BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+# Description:
+# Wrapping function to do tensor address allocation. That is, assigning addresses to tensors based on what has been
+# worked out from the allowable overlaps that are calculated by the live range analysis.
+
+from . import live_range
+from .tensor import MemArea
+import math
+from . import numeric_util
+import numpy as np
+from .nn_graph import TensorAllocator, PassPlacement
+
+from .greedy_allocation import allocate_live_ranges as greedy_allocate_live_ranges
+
+
+def linear_allocate_live_ranges(live_ranges, alloc_granularity=256):
+    total_sz = 0
+    allocated_tensors = []
+
+    # just assign increasing addresses
+    for tens, lr in live_ranges.ranges.items():
+        if tens in allocated_tensors:
+            continue
+
+        lr.set_address(total_sz)
+        allocated_tensors += lr.tensors
+        total_sz += numeric_util.round_up(int(math.ceil(lr.size)), alloc_granularity)
+
+    return total_sz
+
+
+def mark_sram_used_for_cascaded_passes(sg, lrs):
+    end_pos = max(ps.time for ps in sg.cascaded_passes) + 2
+    mem_usage = np.zeros(end_pos, dtype=np.int64)
+
+    for tens, rng in lrs.ranges.items():
+        storage_size = tens.storage_size()
+        mem_usage[rng.start_time : rng.end_time] += storage_size
+
+    for cps in sg.cascaded_passes:
+        sram_used = max(mem_usage[cps.time], mem_usage[cps.time + 1])
+        cps.sram_used = sram_used
+        for ps in cps.passes:
+            ps.sram_used = sram_used
+
+
+def print_allocation(lrs, mem_area, sg, verbose_allocation, show_minimum_possible_allocation):
+    if verbose_allocation:
+        if mem_area == MemArea.Sram:
+            print("allocation for", mem_area, "- non-constant tensors in Cpu and Npu subgraphs")
+        else:
+            print("allocation for", mem_area, "- constant tensors in", sg.placement.name, "subgraph(s)")
+        for start_time, start, end, name, end_time in sorted(
+            (
+                lr.start_time,
+                tens.address,
+                tens.address + int(math.ceil(tens.storage_size())),
+                tens.name + " " + str(tens.purpose),
+                lr.end_time,
+            )
+            for tens, lr in lrs.ranges.items()
+        ):
+            name = name.replace("\x00", "")
+            print("%9d: %#12x - %#12x: %3d - %3d %s" % ((end - start), start, end, start_time, end_time, name))
+        print()
+
+    if show_minimum_possible_allocation and mem_area == MemArea.Sram:
+        min_possible_allocation = max(cps.sram_used for cps in sg.cascaded_passes)
+        print(
+            "Min possible allocation %d bytes / %.1f KB / %.1f MB"
+            % (min_possible_allocation, min_possible_allocation / 1024, min_possible_allocation / 1024 / 1024)
+        )
+
+
+def allocate_tensors(
+    nng,
+    sg,
+    arch,
+    mem_area,
+    use_ifm_ofm_overlap=True,
+    tensor_allocator=TensorAllocator.Greedy,
+    verbose_allocation=False,
+    show_minimum_possible_allocation=False,
+    lr_graph=None,
+):
+    ignore_subgraph_input_output_tensors = False
+    lrs = live_range.extract_live_ranges_from_cascaded_passes(
+        sg,
+        mem_area,
+        mark_output_tensors_overlapping_with_input_tensors=False,
+        use_ifm_ofm_overlap=use_ifm_ofm_overlap,
+        ignore_subgraph_input_output_tensors=ignore_subgraph_input_output_tensors,
+        lr_graph=lr_graph,
+    )
+
+    if lrs.ranges:
+        tens_alloc = tensor_allocator
+        if tens_alloc == TensorAllocator.Greedy:
+            total_sz = greedy_allocate_live_ranges(sg, arch, lrs, mem_area, verbose_allocation)
+        elif tens_alloc == TensorAllocator.LinearAlloc:
+            total_sz = linear_allocate_live_ranges(lrs)
+        else:
+            assert 0
+
+        sg.memory_used[mem_area] = total_sz
+
+        nng.total_size[mem_area] = nng.total_size.get(mem_area, 0) + sum(tens.storage_size() for tens in lrs.ranges)
+        nng.total_elements[mem_area] = nng.total_elements.get(mem_area, 0) + sum(tens.elements() for tens in lrs.ranges)
+
+        print_allocation(lrs, mem_area, sg, verbose_allocation, show_minimum_possible_allocation)
+
+        if mem_area == MemArea.Sram:
+            # Mark Sram usage for all subgraphs
+            for sg_ in nng.subgraphs:
+                mark_sram_used_for_cascaded_passes(sg_, lrs)
+
+    if sg == nng.get_root_subgraph():
+        nng.memory_used = sg.memory_used
+        for mem_area in nng.total_elements.keys():
+            try:
+                nng.bits_per_element[mem_area] = nng.total_size[mem_area] * 8 / nng.total_elements[mem_area]
+            except ZeroDivisionError:
+                nng.bits_per_element[mem_area] = 0.0
-- 
cgit v1.2.1
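
Editor's note, not part of the patch: the sketch below illustrates how the linear allocator in this file hands out addresses, assuming hypothetical stand-in names. FakeLiveRange, round_up and linear_allocate are simplified mock-ups of the real LiveRange class, numeric_util.round_up and linear_allocate_live_ranges, and only model the fields that function actually touches.

import math

ALLOC_GRANULARITY = 256  # same default granularity as in the patch


def round_up(value, alignment):
    # Round value up to the next multiple of alignment (mirrors numeric_util.round_up).
    return ((value + alignment - 1) // alignment) * alignment


class FakeLiveRange:
    # Hypothetical stand-in for the real LiveRange: just a tensor name list,
    # a storage size in bytes and an address setter.
    def __init__(self, name, size):
        self.tensors = [name]
        self.size = size
        self.address = None

    def set_address(self, address):
        self.address = address


def linear_allocate(ranges, alloc_granularity=ALLOC_GRANULARITY):
    # Mirrors the patch's linear allocator: assign strictly increasing addresses,
    # rounding each live range up to the allocation granularity.
    total_sz = 0
    allocated = []
    for lr in ranges:
        if lr.tensors[0] in allocated:
            continue
        lr.set_address(total_sz)
        allocated += lr.tensors
        total_sz += round_up(int(math.ceil(lr.size)), alloc_granularity)
    return total_sz


if __name__ == "__main__":
    lrs = [FakeLiveRange("ifm", 1000), FakeLiveRange("weights", 300), FakeLiveRange("ofm", 4096)]
    total = linear_allocate(lrs)
    for lr in lrs:
        print("%-8s @ %#06x (%d bytes)" % (lr.tensors[0], lr.address, lr.size))
    print("total footprint: %d bytes" % total)
    # Expected: ifm @ 0x0000, weights @ 0x0400 (1000 rounded up to 1024),
    # ofm @ 0x0600 (1024 + 512), total 1536 + 4096 = 5632 bytes.

Because the linear scheme never reuses an address range once it has been handed out, it is simple but typically needs more memory than the greedy allocator, which is why TensorAllocator.Greedy is the default in allocate_tensors above.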