# Copyright (C) 2020-2021 Arm Limited or its affiliates. All rights reserved.
#
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the License); you may
# not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an AS IS BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Description:
# Wrapping function to do tensor address allocation. That is, assigning addresses to tensors based on what has been
# worked out from the allowable overlaps that are calculated by the live range analysis.
import math
from typing import List

import numpy as np

from . import hillclimb_allocation
from . import live_range
from . import numeric_util
from .errors import AllocationError
from .greedy_allocation import allocate_live_ranges as greedy_allocate_live_ranges
from .live_range import LiveRange
from .live_range import LiveRangeGraph
from .nn_graph import TensorAllocator
from .tensor import MemArea
from .tensor import MemType
from .tensor import Tensor
from .tensor import TensorPurpose


def linear_allocate_live_ranges(live_ranges, alloc_granularity=Tensor.AllocationQuantum):
    # Allocates using increasing addresses. Duplicate constant tensors will be allocated to the same address
    total_sz = 0
    allocated_tensors = []

    # just assign increasing addresses, except for duplicates
    for tens, lr in live_ranges.ranges.items():
        if tens in allocated_tensors:
            continue

        address = total_sz
        if tens.weight_compression_config is not None:
            for allocated_tens in allocated_tensors:
                if allocated_tens.weight_compression_config == tens.weight_compression_config:
                    assert allocated_tens.scale_compression_config == tens.scale_compression_config
                    address = allocated_tens.address
                    break
        if tens.purpose == TensorPurpose.LUT:
            for allocated_tens in allocated_tensors:
                if allocated_tens.equivalent(tens):
                    address = allocated_tens.address
                    break
        lr.set_address(address)
        allocated_tensors += lr.tensors
        if address == total_sz:
            total_sz += numeric_util.round_up(int(math.ceil(lr.size)), alloc_granularity)

    verify_alignment(live_ranges, alloc_granularity)
    return total_sz


def hillclimb_allocate_live_ranges(live_ranges: LiveRangeGraph, alloc_granularity: int) -> int:
    # Allocates using the hill climb allocator
    addresses = hillclimb_allocation.allocate_live_ranges(live_ranges.lrs)
    # The result is a list containing the allocated addresses
    total_sz = 0
    for lr, address in zip(live_ranges.lrs, addresses):
        total_sz = max(total_sz, address + lr.size)
        lr.set_address(address)
    verify_allocation(live_ranges, alloc_granularity)
    return total_sz

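# A minimal first-fit sketch (illustrative only, not called by the compiler):
# every allocator above solves the same problem - given live ranges with a
# start time, an end time and a size, assign addresses so that ranges which
# are live at the same time occupy disjoint address intervals. The helper
# below assumes plain (start, end, size) tuples rather than the LiveRange
# objects used in this module.
def _first_fit_allocate_sketch(ranges):
    placed = []  # (start_time, end_time, address, size) of already-placed ranges
    addresses = []
    for start, end, size in ranges:
        # Address intervals of placed ranges that overlap this one in time
        busy = sorted((addr, addr + sz) for s, e, addr, sz in placed if s <= end and start <= e)
        # Scan for the lowest gap large enough to hold this range
        address = 0
        for lo, hi in busy:
            if address + size <= lo:
                break
            address = max(address, hi)
        placed.append((start, end, address, size))
        addresses.append(address)
    return addresses


# Example: _first_fit_allocate_sketch([(0, 2, 64), (1, 3, 32), (3, 4, 64)])
# returns [0, 64, 0] - the third range reuses address 0 because the first
# range is dead by the time it becomes live.
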
def verify_alignment(live_ranges: LiveRangeGraph, alignment: int):
    for lr in live_ranges.lrs:
        for tens in lr.tensors:
            if not all(op and op.run_on_npu for op in tens.ops + tens.consumer_list):
                # This is a CPU tensor, verify alignment
                if tens.address % alignment != 0:
                    raise AllocationError(f"Tensor '{tens.name}' not aligned to {alignment} bytes")


def verify_allocation(live_ranges: LiveRangeGraph, alignment: int):
    verify_alignment(live_ranges, alignment)
    nr_time_slots = 1 + max(lr.end_time for lr in live_ranges.lrs)
    # Contains active live ranges at each timestamp
    lrs_at_time: List[List[LiveRange]] = [[] for _ in range(nr_time_slots)]
    for lr in live_ranges.lrs:
        for t in range(lr.start_time, lr.end_time + 1):
            lrs_at_time[t].append(lr)
    for t in range(nr_time_slots):
        lrs_new_items = [lr for lr in lrs_at_time[t] if t == 0 or lr not in lrs_at_time[t - 1]]
        for m in lrs_new_items:
            for n in lrs_at_time[t]:
                overlap, tens_n, tens_m = n.overlaps_address(m)
                if overlap and not (tens_n.equivalent(tens_m) and tens_n.address == tens_m.address):
                    raise AllocationError(
                        f"Overlapping buffers: {n.name}: {tens_n.address} -> {tens_n.address + n.size}"
                        f" and {m.name}: {tens_m.address} -> {tens_m.address + m.size}"
                    )


def mark_sram_used_for_cascaded_passes(sg, lrs):
    if len(sg.cascaded_passes) < 1:
        return
    end_pos = max(ps.time for ps in sg.cascaded_passes) + 2
    mem_usage = np.zeros(end_pos, dtype=np.int64)

    for tens, rng in lrs.ranges.items():
        storage_size = tens.storage_size()
        mem_usage[rng.start_time : rng.end_time] += storage_size

    for cps in sg.cascaded_passes:
        sram_used = max(mem_usage[cps.time], mem_usage[cps.time + 1])
        cps.sram_used = sram_used
        for ps in cps.passes:
            ps.sram_used = sram_used


def print_allocation(lrs, mem_area, mem_type_set, tensor_allocator, sg, actual_mem_usage_for_alloc):
    print("\n" + "#" * 80)
    sg_placement = (
        sg.placement.name
        if mem_type_set.intersection(
            (
                MemType.Permanent_NPU,
                MemType.Permanent_CPU,
            )
        )
        else "Cpu and Npu"
    )
    print(
        f"Tensor Allocation for mem_area {mem_area.name}, of mem_type_set ("
        f'{", ".join(f"{mem_type.name}" for mem_type in mem_type_set)}'
        f"), using allocator {tensor_allocator}, in {sg_placement} subgraph:"
    )
    memory_hist = memory_usage_histogram(lrs.lrs)
    min_mem_usage_for_alloc = max(memory_hist)
    print("Start Time - End Time: Start Addr - End Addr: Tensor Size: Memory Usage: Tensor Purpose: Tensor Name")
    for start_time, end_time, size, start_addr, end_addr, purpose, name in sorted(
        (
            lr.start_time,
            lr.end_time,
            lr.size,
            tens.address,
            tens.address + lr.size,
            tens.purpose,
            tens.name,
        )
        for tens, lr in lrs.ranges.items()
    ):
        print(
            f"{start_time:10d} - {end_time:10d}: {start_addr:#10x} - {end_addr:#10x}: {size:11d}:"
            f" {memory_hist[start_time]:12d}: {purpose.display_name():15s}: {name:s}"
        )

    alloc_overhead_fraction = (actual_mem_usage_for_alloc - min_mem_usage_for_alloc) / min_mem_usage_for_alloc
    print(
        f"Allocation Peak Tensor Size:  {min_mem_usage_for_alloc:9d} ({min_mem_usage_for_alloc:#10x})"
        f" Bytes {min_mem_usage_for_alloc/1024.0:8.2f} KiB"
    )
    print(
        f"Allocation Peak Memory Usage: {actual_mem_usage_for_alloc:9d} ({actual_mem_usage_for_alloc:#10x})"
        f" Bytes {actual_mem_usage_for_alloc/1024.0:8.2f} KiB"
    )
    print(
        f"Allocation Overhead:          {actual_mem_usage_for_alloc-min_mem_usage_for_alloc:9d}"
        f" Bytes ({100*alloc_overhead_fraction:.2f} %)"
    )


def memory_usage_histogram(lrs: List[LiveRange]):
    histogram = [0] * (1 + max(lr.end_time for lr in lrs))
    for lr in lrs:
        for t in range(lr.start_time, lr.end_time + 1):
            histogram[t] += lr.size
    return histogram

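# Worked example for the histogram above (hypothetical numbers): three live
# ranges (0, 2, 100), (1, 3, 50) and (3, 3, 20) give the histogram
# [100, 150, 150, 70], so 150 bytes of tensor data are simultaneously live
# at the busiest timestamp. That peak is a lower bound on the footprint of
# any legal allocation, which is why print_allocation() reports the achieved
# footprint minus this peak as "Allocation Overhead". A sketch of that
# calculation (illustrative only, not called by the compiler):
def _allocation_overhead_sketch(lrs: List[LiveRange], total_sz: int) -> int:
    # Best case any allocator could achieve for these live ranges
    peak = max(memory_usage_histogram(lrs))
    # Bytes lost to fragmentation and alignment padding
    return total_sz - peak
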
def allocate(
    sg,
    arch,
    mem_area,
    mem_type_set,
    tensor_allocator=TensorAllocator.Greedy,
    lr_graph=None,
    cpu_tensor_alignment=Tensor.AllocationQuantum,
):
    # Extracts live ranges for the given memory area/types and assigns addresses to their tensors
    # using the selected allocator; returns the live range graph and the total size needed
    lrs = live_range.extract_live_ranges_from_cascaded_passes(
        sg,
        mem_area,
        mem_type_set,
        lr_graph=lr_graph,
        cpu_tensor_alignment=cpu_tensor_alignment,
    )
    total_sz = 0
    if lrs.ranges:
        tens_alloc = tensor_allocator
        if tens_alloc == TensorAllocator.Greedy:
            total_sz = greedy_allocate_live_ranges(sg, arch, lrs, mem_area, cpu_tensor_alignment)
            verify_allocation(lrs, cpu_tensor_alignment)
        elif tens_alloc == TensorAllocator.LinearAlloc:
            total_sz = linear_allocate_live_ranges(lrs, cpu_tensor_alignment)
        elif tens_alloc == TensorAllocator.HillClimb:
            total_sz = hillclimb_allocate_live_ranges(lrs, cpu_tensor_alignment)
        else:
            assert 0
    return lrs, total_sz


def allocate_tensors(
    nng,
    sg,
    arch,
    mem_area,
    mem_type_set,
    tensor_allocator=TensorAllocator.Greedy,
    verbose_allocation=False,
    lr_graph=None,
    cpu_tensor_alignment=Tensor.AllocationQuantum,
    max_size=None,
    dry_test=False,
):
    # Allocates addresses to tensors, returns False if tensors could not be fit within max_size
    lrs, total_sz = allocate(
        sg,
        arch,
        mem_area,
        mem_type_set,
        tensor_allocator=tensor_allocator,
        lr_graph=lr_graph,
        cpu_tensor_alignment=cpu_tensor_alignment,
    )

    if lrs.ranges:
        alloc_ok = max_size is None or total_sz <= max_size
        if dry_test or not alloc_ok:
            # Dry test or allocation failed; undo allocation
            for lr in lrs.ranges.values():
                lr.set_address(None)
            return alloc_ok

        if sg.memory_used.get(mem_area, 0) == 0:
            sg.memory_used[mem_area] = total_sz
        else:
            sg.memory_used[mem_area] += total_sz

        # Keep track of how much should be used for scratch or permanent storage for NPU
        for mem_type in mem_type_set:
            if sg.memory_used_per_type.get(mem_type, 0) == 0:
                sg.memory_used_per_type[mem_type] = total_sz
            else:
                sg.memory_used_per_type[mem_type] += total_sz

        if verbose_allocation:
            print_allocation(lrs, mem_area, mem_type_set, tensor_allocator, sg, total_sz)

        if mem_area == MemArea.Sram:
            # Mark Sram usage for all subgraphs
            for sg_ in nng.subgraphs:
                mark_sram_used_for_cascaded_passes(sg_, lrs)

    if sg == nng.get_root_subgraph():
        nng.memory_used = sg.memory_used
    return True

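# Typical driver usage (a sketch - the real call sites live in the compiler
# driver, and `nng`, `sg` and `arch` stand for an already-built network
# graph, subgraph and architecture configuration):
#
#     allocate_tensors(
#         nng,
#         sg,
#         arch,
#         MemArea.Sram,
#         set((MemType.Scratch, MemType.Scratch_fast)),
#         tensor_allocator=TensorAllocator.HillClimb,
#         verbose_allocation=True,
#     )
#
# Passing dry_test=True performs the same allocation, checks it against
# max_size, then resets every address, so a caller can probe whether a given
# memory budget is feasible before committing to it.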