# Provenance (recovered from the patch header this file was extracted from):
#   Commit:  0b8268a0dac80aa22133ca83ed6912d3b565439a
#   From:    Louis Verhaard
#   Date:    Wed, 5 Aug 2020 16:11:29 +0200
#   Subject: MLBEDSW-2688: Improved LUT support
#
#   - Support for more than one 256-byte LUT in SHRAM
#   - No DMA is performed for a LUT that is already located in SHRAM
#   - Added MemArea.Shram, used for LUT, to avoid false address collision
#     asserts during SRAM tensor allocation
#   - Added read access to LUT in memory access calculation
#
#   Change-Id: If4d1eded5ed029d253f4f5efb2d80495fc3eac99
#   Signed-off-by: Louis Verhaard
#   File: ethosu/vela/lut.py (created by this commit)
#
# Copyright (C) 2020 Arm Limited or its affiliates. All rights reserved.
#
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the License); you may
# not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an AS IS BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Description:
# Functionality for lookup table support.
import uuid
from functools import lru_cache

from . import numeric_util
from .high_level_command_stream import CommandType
from .tensor import TensorPurpose


@lru_cache(maxsize=None)
def create_equivalence_id(key):
    """Generate an equivalence id based on key.

    The DMA optimization of LUT-s assumes that 2 LUT tensors are identical
    if they have the same equivalence_id.
    So for example all created 256-byte tanh LUT tensors should have
    the same equivalence id.

    The id itself is a random UUID; it is the unbounded lru_cache that makes
    repeated calls with the same key return the same id, so the cache must
    not be given a finite size. key has to be hashable for the cache to work.
    """
    return uuid.uuid4()


class LUTState:
    """Tracks which LUT-s are located in SHRAM."""

    def __init__(self):
        # LUT tensors currently considered present in SHRAM
        self.tensors = []

    def get_equivalent(self, lut_tens):
        """Return the tracked LUT equivalent to lut_tens, or None if not found.

        Equivalence is decided by the tracked tensor's equivalent() method.
        """
        for t in self.tensors:
            if t.equivalent(lut_tens):
                return t
        return None

    def put(self, lut_tens):
        """Return a new LUT state containing lut_tens.

        The new state holds the given tensor plus all tensors in this state
        that do not overlap with the address range of the given tensor.
        This state is left unmodified.
        """
        new_state = LUTState()
        new_state.tensors.append(lut_tens)
        start = lut_tens.address
        end = start + lut_tens.storage_size()
        for tens in self.tensors:
            start2 = tens.address
            end2 = start2 + tens.storage_size()
            if not numeric_util.overlaps(start, end, start2, end2):
                new_state.tensors.append(tens)
        return new_state

    def find_best_address(self, start, stop, step):
        """Find the address that overlaps with the fewest present LUT-s.

        Candidate addresses are range(start, stop, step); each candidate
        covers the region [addr, addr + step). The first candidate with the
        minimum number of overlaps wins. If there are no candidates, start
        is returned.

        An improvement would be to also take future LUT usage into account.
        """
        best_addr = start
        # Seed with a count that no candidate can reach. (Seeding with an
        # address, as was done previously, could hide the true minimum when
        # the addresses are numerically smaller than the overlap counts.)
        best_nr_overlaps = len(self.tensors) + 1
        for addr in range(start, stop, step):
            nr_overlaps = 0
            for tens in self.tensors:
                start2 = tens.address
                end2 = start2 + tens.storage_size()
                if numeric_util.overlaps(addr, addr + step, start2, end2):
                    nr_overlaps += 1
            if nr_overlaps < best_nr_overlaps:
                best_nr_overlaps = nr_overlaps
                best_addr = addr
        return best_addr


def get_lut_index(arch, lut_tensor):
    """Return the index in SHRAM where the given LUT is stored.

    The index is the offset of the tensor's address from
    arch.shram_lut_address, expressed in units of the tensor's own storage
    size. It is asserted to be in the range 0..7 (inclusive).
    """
    # NOTE(review): this divides by the LUT's own storage size, whereas
    # optimize_high_level_cmd_stream divides by a fixed 256-byte slot size
    # for freshly placed LUT-s; the two agree for 256-byte LUT-s. Confirm
    # the intended behaviour for larger LUT-s.
    slot = (lut_tensor.address - arch.shram_lut_address) // lut_tensor.storage_size()
    assert 0 <= slot < 8
    return slot


def optimize_high_level_cmd_stream(sg, arch):
    """Assign SHRAM locations to LUT tensors and drop redundant LUT DMAs.

    - Allocates SHRAM address/lut index to LUT tensors
    - Removes unnecessary DMA operations of LUT-s that are already present
      in SHRAM from sg's command stream

    sg.high_level_command_stream is replaced by the filtered stream. For each
    LUT DMA command, the LUT tensor's address and the "lut_index" attribute
    of the command's primary op are updated in place.
    """
    cmd_stream = []  # will contain existing command stream minus unneeded DMA operations
    lut_state = LUTState()
    slot_size = 256
    lut_start = arch.shram_lut_address
    lut_end = lut_start + arch.shram_lut_size
    for cmd in sg.high_level_command_stream:
        if (
            cmd.cmdtype == CommandType.NpuStripe
            and cmd.ps.lut_tensor is None
            and arch.shram_reserved_unused_banks == 0
        ):
            # The command overwrites the last 2 banks containing the LUT; next LUT operation will require DMA
            # TODO: check the command's SHRAM usage in more detail to determine if the LUT is overwritten or not
            lut_state = LUTState()
        if cmd.cmdtype != CommandType.DMA or cmd.out_tensor.purpose != TensorPurpose.LUT:
            # Non-LUT operation; leave untouched
            cmd_stream.append(cmd)
            continue
        # LUT DMA operation
        lut_tens = cmd.out_tensor
        existing_tens = lut_state.get_equivalent(lut_tens)
        if existing_tens is not None:
            # LUT is already in SHRAM, no need to perform DMA
            lut_tens.address = existing_tens.address
            cmd.ps.primary_op.attrs["lut_index"] = get_lut_index(arch, existing_tens)
            continue
        # Place the LUT in the last 2 blocks of SHRAM
        # Alignment is always on the size of the LUT, 256 for 256-byte LUT, 1K for 1K LUT, etc
        address = lut_state.find_best_address(lut_start, lut_end, lut_tens.storage_size())
        lut_tens.address = address
        cmd.ps.primary_op.attrs["lut_index"] = (address - lut_start) // slot_size
        lut_state = lut_state.put(lut_tens)
        cmd_stream.append(cmd)
    sg.high_level_command_stream = cmd_stream