diff options
Diffstat (limited to 'ethosu/vela/shared_buffer_allocation.py')
-rw-r--r-- | ethosu/vela/shared_buffer_allocation.py | 199 |
1 file changed, 199 insertions, 0 deletions
# Copyright (C) 2020 Arm Limited or its affiliates. All rights reserved.
#
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the License); you may
# not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an AS IS BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


# Description:
# Shared buffer allocation works out how to allocate the Ethos-U55 shared buffer for a given pass.

import numpy as np

from .architecture_features import ArchitectureFeatures
from .architecture_features import Block
from .architecture_features import Kernel
from .architecture_features import SharedBufferArea
from .architecture_features import SHRAMElements
from .nn_graph import NpuBlockType
from .numeric_util import round_up
from .numeric_util import round_up_divide
from . import pass_packing


class SharedBufferAllocation:
    """Tracks how the Ethos-U55 shared RAM (SHRAM) banks are divided between
    weights, IFM, OFM and accumulators for one pass.

    Bank counts for the Weights and OFM areas are fixed by the architecture;
    the IFM and Accumulator areas vary with the block configuration and are
    filled in by try_block().
    """

    def __init__(self, arch, ps):
        """Derive kernel, IFM depth/bit-width and reserved bank counts from
        the pass `ps` under architecture `arch`.
        """
        self.arch = arch

        # Per-area bank start offsets and sizes, indexed by SharedBufferArea.
        self.bank_locations = np.zeros(SharedBufferArea.Size)
        self.banks_required = np.zeros(SharedBufferArea.Size)

        ifm_tensor, ifm2_tensor, weight_tensor, ofm_tensor = ps.get_primary_op_ifm_ifm2_weights_ofm()

        # Defaults for passes without a primary op: unit kernel, unit strides.
        strides = (1, 1, 1, 1)
        dilation = (1, 1, 1, 1)
        self.kernel = Kernel(1, 1)
        is_elementwise = ps.npu_block_type == NpuBlockType.ElementWise

        if ps.primary_op:
            strides = ps.primary_op.attrs.get("strides", strides)
            dilation = ps.primary_op.attrs.get("dilation", dilation)
            k_h = 1
            k_w = 1
            if weight_tensor:
                # Fully connected kernels are treated as 1x1; otherwise the
                # kernel size comes from the weight tensor's leading dims
                # (assumed HWIO layout — TODO confirm against weight packing).
                if ps.primary_op.type != "FullyConnectedAct":
                    k_h = weight_tensor.shape[0]
                    k_w = weight_tensor.shape[1]
            else:
                # No weights (e.g. pooling): kernel size is carried in attrs.
                k_h = ps.primary_op.attrs.get("filter_height", 1)
                k_w = ps.primary_op.attrs.get("filter_width", 1)

            # strides/dilation are NHWC tuples; index 1 = height, 2 = width.
            self.kernel = Kernel(k_w, k_h, strides[2], strides[1], dilation[2], dilation[1])

        # These block types require the IFM block depth to equal the OFM
        # block depth when searching block configurations.
        self.is_equal_depth_op = is_elementwise or ps.npu_block_type in (
            NpuBlockType.ConvolutionDepthWise,
            NpuBlockType.Pooling,
        )
        self.strides = strides

        # SHRAM element kinds; bumped below for 16-bit IFM.
        self.use_accumulator_element = SHRAMElements.Acc32
        if is_elementwise:
            self.use_ifm_element = SHRAMElements.IFM8_Elementwise
        else:
            self.use_ifm_element = SHRAMElements.IFM8

        self.ifm_bits = 0
        self.ifm_depth = 0
        if ifm_tensor:
            self.ifm_bits = ifm_tensor.dtype.size_in_bits()
            if ifm_tensor.shape == [] and is_elementwise:
                # Elementwise operator with scalar in ifm, use ifm2 depth
                self.ifm_depth = ifm2_tensor.shape[-1]
            else:
                self.ifm_depth = ifm_tensor.shape[-1]
            if self.ifm_bits == 16:
                # 16-bit IFM uses 40-bit accumulators; the *_Elementwise and
                # plain IFM16 element codes are one greater than their 8-bit
                # counterparts, hence the +1.
                self.use_accumulator_element = SHRAMElements.Acc40
                self.use_ifm_element = self.use_ifm_element + 1
                assert (self.use_ifm_element == SHRAMElements.IFM16) or (
                    self.use_ifm_element == SHRAMElements.IFM16_Elementwise
                )
            else:
                assert self.ifm_bits == 8, "Unexpected IFM bitdepth"

        self.ifm_block_depth = arch.calc_ifm_block_depth(self.ifm_depth, self.ifm_bits)
        self.ofm_tensor = ofm_tensor

        # Weights and OFM bank counts are fixed by the architecture.
        self.banks_required[SharedBufferArea.Weights] = arch.shram_reserved_weight_banks
        self.banks_required[SharedBufferArea.OFM] = arch.shram_reserved_output_banks

    def is_valid(self):
        """Lay out the bank areas and return True if they fit in SHRAM.

        Weights/IFM/OFM grow from bank 0; accumulators are anchored at the
        top of the bank space. The layout is valid when the IFM area does
        not run into the accumulator area.
        """
        # Assign zero-based bank starts (first element remains zero)
        self.bank_locations[1:] = np.cumsum(self.banks_required)[:-1]

        # Accumulator area is measured from the end of the buffer
        self.bank_locations[SharedBufferArea.Accumulators] = (
            self.arch.shram_total_banks - self.banks_required[SharedBufferArea.Accumulators]
        )
        ifm_end = self.bank_locations[SharedBufferArea.IFM] + self.banks_required[SharedBufferArea.IFM]
        return ifm_end <= self.bank_locations[SharedBufferArea.Accumulators]

    def try_block(self, ofm_block: Block):
        """Attempt a bank layout for the given OFM block.

        Returns (ofm_height, ofm_width, ifm_depth, ofm_depth) if the
        configuration fits in SHRAM, else None. Updates bank_locations and
        banks_required as a side effect.
        """
        # Get IFM block configuration
        ifm_block_depth = ofm_block.depth if self.is_equal_depth_op else self.ifm_block_depth
        ifm_block = self.arch.get_ifm_block_size(ifm_block_depth, ofm_block, self.kernel)
        ifm_config = self.arch.get_block_config(ifm_block.width, ifm_block.height, ifm_block.depth)
        if ifm_config is None:
            return None

        # Get OFM block configuration
        ofm_config = self.arch.get_block_config(ofm_block.width, ofm_block.height, ofm_block.depth)
        if ofm_config is None:
            return None

        # Update bank counts for IFM and Accumulator
        self.banks_required[SharedBufferArea.IFM] = ifm_config.banks[self.use_ifm_element]
        self.banks_required[SharedBufferArea.Accumulators] = ofm_config.banks[self.use_accumulator_element]

        # Validating calculates bank layout and returns validity
        if not self.is_valid():
            return None

        return (ofm_block.height, ofm_block.width, ifm_block.depth, ofm_block.depth)

    def generate_used_mask(self, active_set):
        """Return a 0/1 int64 vector over all SHRAM banks, with 1 marking
        banks occupied by any area in `active_set`.
        """
        res = np.zeros(self.arch.shram_total_banks, dtype=np.int64)
        for kind in active_set:
            start = int(self.bank_locations[kind])
            end = start + int(self.banks_required[kind])
            res[start:end] = 1
        return res

    def is_compatible(first, second):
        """See if the bank allocations of two convolutions are compatible,
        so that they can run back-to-back without a fence in between"""

        # First pass still owns its OFM/accumulator banks while the second
        # pass starts reading IFM/weights; they must not share banks.
        first_set = set((SharedBufferArea.OFM, SharedBufferArea.Accumulators))
        second_set = set((SharedBufferArea.IFM, SharedBufferArea.Weights))

        first_mask = first.generate_used_mask(first_set)
        second_mask = second.generate_used_mask(second_set)

        if np.sum(first_mask & second_mask):
            # overlap
            return False

        return True


def shared_buffer_allocation_for_pass_and_block_config(arch, ps, block_config):
    """Build a SharedBufferAllocation for `ps` and verify that the given
    (height, width, ifm_depth, ofm_depth) block_config fits; return the
    allocation, or None if it does not fit.
    """
    alloc = SharedBufferAllocation(arch, ps)
    assert (alloc.ifm_block_depth == block_config[2]) or alloc.is_equal_depth_op
    # block_config is (h, w, ifm_depth, ofm_depth); Block takes (w, h, depth).
    if alloc.try_block(Block(block_config[1], block_config[0], block_config[3])):
        return alloc

    return None


def find_block_configs_suitable_for_pass_and_shared_buffer(arch, ps):
    """Enumerate all OFM block shapes (in ofm_ublock steps) that fit the
    shared buffer for `ps`, returning a non-empty list of configurations.
    """
    alloc = SharedBufferAllocation(arch, ps)

    if arch.override_block_config:
        config = alloc.try_block(arch.override_block_config)
        assert config, "Block config override cannot be used"
        return [config]

    # Constrain the search space if the OFM is smaller than the max block size
    # - Add other block search constraints here if required
    if len(alloc.ofm_tensor.shape) == 2:
        max_block_height = max_block_width = alloc.ofm_tensor.shape[0]
    else:
        max_block_width = alloc.ofm_tensor.shape[-2]
        max_block_height = alloc.ofm_tensor.shape[-3]

    # Common block depth
    max_block_depth = alloc.ofm_tensor.shape[-1]

    # Constrain to valid ranges before search
    max_block_width = min(arch.ofm_block_max.width, max_block_width)
    max_block_height = min(arch.ofm_block_max.height, max_block_height)
    max_block_depth = min(arch.ofm_block_max.depth, max_block_depth)

    valid_block_configs = []
    # Try a range of block shapes against this pass
    for w in range(arch.ofm_ublock.width, max_block_width + arch.ofm_ublock.width, arch.ofm_ublock.width):
        for h in range(arch.ofm_ublock.height, max_block_height + arch.ofm_ublock.height, arch.ofm_ublock.height):
            # Try valid OFM block depths
            for c in range(arch.ofm_ublock.depth, max_block_depth + arch.ofm_ublock.depth, arch.ofm_ublock.depth):
                # OFM block depth has the constraint that if it causes the OFM to be
                # split, it must be a multiple of the OFM split size
                if (c >= max_block_depth) or (c < max_block_depth and (c % ArchitectureFeatures.OFMSplitDepth) == 0):
                    config = alloc.try_block(Block(w, h, c))
                    if config:
                        valid_block_configs.append(config)

    assert len(valid_block_configs) > 0
    return valid_block_configs