# Copyright (C) 2020 Arm Limited or its affiliates. All rights reserved.
#
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the License); you may
# not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an AS IS BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Description:
# Wrapping function to do tensor address allocation. That is, assigning addresses to tensors based on what has been
# worked out from the allowable overlaps that are calculated by the live range analysis.
import math

import numpy as np

from . import live_range
from . import numeric_util
from .tensor import MemArea
from .nn_graph import TensorAllocator
from .greedy_allocation import allocate_live_ranges as greedy_allocate_live_ranges


def linear_allocate_live_ranges(live_ranges, alloc_granularity=256):
    total_sz = 0
    allocated_tensors = []

    # just assign increasing addresses
    for tens, lr in live_ranges.ranges.items():
        if tens in allocated_tensors:
            continue

        lr.set_address(total_sz)
        allocated_tensors += lr.tensors
        total_sz += numeric_util.round_up(int(math.ceil(lr.size)), alloc_granularity)

    return total_sz
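

# A worked example of the linear allocator's arithmetic (illustrative only; the
# sizes below are made up): with alloc_granularity=256, live ranges of 1000 and
# 300 bytes are placed at addresses 0 and 1024 respectively, and the returned
# total is 1024 + 512 = 1536 bytes, since each size is first rounded up to the
# next multiple of the granularity.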


def mark_sram_used_for_cascaded_passes(sg, lrs):
    # Build a per-timestep histogram of SRAM usage from the live ranges, then
    # record on each cascaded pass (and its member passes) the peak usage seen
    # while that pass is active.
    end_pos = max(ps.time for ps in sg.cascaded_passes) + 2
    mem_usage = np.zeros(end_pos, dtype=np.int64)

    for tens, rng in lrs.ranges.items():
        storage_size = tens.storage_size()
        mem_usage[rng.start_time : rng.end_time] += storage_size

    for cps in sg.cascaded_passes:
        sram_used = max(mem_usage[cps.time], mem_usage[cps.time + 1])
        cps.sram_used = sram_used
        for ps in cps.passes:
            ps.sram_used = sram_used


def print_allocation(lrs, mem_area, sg, verbose_allocation, show_minimum_possible_allocation):
    if verbose_allocation:
        if mem_area == MemArea.Sram:
            print("allocation for", mem_area, "- non-constant tensors in Cpu and Npu subgraphs")
        else:
            print("allocation for", mem_area, "- constant tensors in", sg.placement.name, "subgraph(s)")
        # Print each tensor's address range and live range, sorted by start time
        for start_time, start, end, name, end_time in sorted(
            (
                lr.start_time,
                tens.address,
                tens.address + int(math.ceil(tens.storage_size())),
                tens.name + " " + str(tens.purpose),
                lr.end_time,
            )
            for tens, lr in lrs.ranges.items()
        ):
            name = name.replace("\x00", "")
            print("%9d: %#12x - %#12x: %3d - %3d %s" % ((end - start), start, end, start_time, end_time, name))

        print()

    if show_minimum_possible_allocation and mem_area == MemArea.Sram:
        min_possible_allocation = max(cps.sram_used for cps in sg.cascaded_passes)
        print(
            "Min possible allocation %d bytes / %.1f KB / %.1f MB"
            % (min_possible_allocation, min_possible_allocation / 1024, min_possible_allocation / 1024 / 1024)
        )


def allocate_tensors(
    nng,
    sg,
    arch,
    mem_area,
    use_ifm_ofm_overlap=True,
    tensor_allocator=TensorAllocator.Greedy,
    verbose_allocation=False,
    show_minimum_possible_allocation=False,
    lr_graph=None,
):
    ignore_subgraph_input_output_tensors = False
    lrs = live_range.extract_live_ranges_from_cascaded_passes(
        sg,
        mem_area,
        mark_output_tensors_overlapping_with_input_tensors=False,
        use_ifm_ofm_overlap=use_ifm_ofm_overlap,
        ignore_subgraph_input_output_tensors=ignore_subgraph_input_output_tensors,
        lr_graph=lr_graph,
    )

    if lrs.ranges:
        tens_alloc = tensor_allocator
        if tens_alloc == TensorAllocator.Greedy:
            total_sz = greedy_allocate_live_ranges(sg, arch, lrs, mem_area, verbose_allocation)
        elif tens_alloc == TensorAllocator.LinearAlloc:
            total_sz = linear_allocate_live_ranges(lrs)
        else:
            assert 0

        sg.memory_used[mem_area] = total_sz

        nng.total_size[mem_area] = nng.total_size.get(mem_area, 0) + sum(tens.storage_size() for tens in lrs.ranges)
        nng.total_elements[mem_area] = nng.total_elements.get(mem_area, 0) + sum(tens.elements() for tens in lrs.ranges)

        print_allocation(lrs, mem_area, sg, verbose_allocation, show_minimum_possible_allocation)

        if mem_area == MemArea.Sram:
            # Mark Sram usage for all subgraphs
            for sg_ in nng.subgraphs:
                mark_sram_used_for_cascaded_passes(sg_, lrs)

    if sg == nng.get_root_subgraph():
        nng.memory_used = sg.memory_used
        for mem_area in nng.total_elements.keys():
            try:
                nng.bits_per_element[mem_area] = nng.total_size[mem_area] * 8 / nng.total_elements[mem_area]
            except ZeroDivisionError:
                nng.bits_per_element[mem_area] = 0.0
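

# Illustrative call sequence (a sketch, not part of this module): a driver at
# the scheduler level would be expected to invoke allocation once per memory
# area, along these lines, where nng, sg and arch are the network graph,
# subgraph and architecture objects built elsewhere in the compiler:
#
#     allocate_tensors(nng, sg, arch, MemArea.Sram, tensor_allocator=TensorAllocator.Greedy)
#
# and again with the constant-tensor memory area appropriate to the target.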