diff options
Diffstat (limited to 'ethosu/vela/npu_serialisation.py')
-rw-r--r-- | ethosu/vela/npu_serialisation.py | 145 |
1 files changed, 145 insertions, 0 deletions
diff --git a/ethosu/vela/npu_serialisation.py b/ethosu/vela/npu_serialisation.py new file mode 100644 index 00000000..4542c25b --- /dev/null +++ b/ethosu/vela/npu_serialisation.py @@ -0,0 +1,145 @@ +# Copyright (C) 2020 Arm Limited or its affiliates. All rights reserved. +# +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +# Description: +# Serialises and packs an NPU subgraph into tensors. + +from .nn_graph import PassPlacement +from .tensor import MemArea, Tensor, TensorPurpose, TensorFormat +from .operation import Operation +from .data_type import DataType +import numpy as np +from . import driver_actions +import struct + + +def make_memory_tensor(name, mem_area, sz, want_values, arch): + tens = Tensor([sz], DataType.uint8, name) + tens.mem_area = mem_area + tens.purpose = TensorPurpose.FeatureMap + tens.set_format(TensorFormat.NHWC, arch) + if want_values: + tens.values = np.zeros(tens.shape, np.uint8) + return tens + + +def copy_compressed_values_to_memory_tensor(memory_tensor, src_tensor): + start_addr = src_tensor.address + for compressed_values in src_tensor.compressed_values: + end_addr = start_addr + len(compressed_values) + memory_tensor.values[start_addr:end_addr] = compressed_values + start_addr = end_addr + + +def serialise_npu_subgraph_into_tensors(nng, sg, arch, scratch_tens, flash_tens): + if sg.placement != PassPlacement.Npu: + return scratch_tens, flash_tens + + flash_area = arch.permanent_storage_mem_area + scratch_area = MemArea.Sram + + flash_size = sg.memory_used.get(flash_area, 0) + scratch_size = sg.memory_used.get(scratch_area, 0) + + # Prepare driver actions for this command tensor + da_list = [] + driver_actions.emit_fourcc(da_list, "COP1") + driver_actions.emit_config(da_list, 0, 1, arch) + driver_actions.emit_cmd_stream_header(da_list, len(sg.register_command_stream)) + + # Append command stream words + da_list.extend(sg.register_command_stream) + + # Convert to bytes + payload_bytes = struct.pack("<{0}I".format(len(da_list)), *da_list) + + command_stream_size_bytes = len(payload_bytes) + + # Adjust the bits per element calculation to exclude metadata generated by Vela + nng.total_size[flash_area] = nng.total_size.get(flash_area, 0) - flash_size - command_stream_size_bytes + nng.total_elements[flash_area] = nng.total_elements.get(flash_area, 0) - flash_size - command_stream_size_bytes + nng.total_size[scratch_area] = nng.total_size.get(scratch_area, 0) - scratch_size + nng.total_elements[scratch_area] = nng.total_elements.get(scratch_area, 0) - scratch_size + + if flash_tens == scratch_tens == None: + # First Npu subgraph, create scratch and flash tensors + sg.scratch_tensor = make_memory_tensor(sg.name + "_scratch", scratch_area, scratch_size, False, arch) + sg.scratch_tensor.purpose = TensorPurpose.Scratch + sg.flash_tensor = make_memory_tensor(sg.name + "_flash", flash_area, flash_size, True, arch) + else: + sg.scratch_tensor = scratch_tens + sg.scratch_tensor.shape[0] += scratch_size + sg.flash_tensor = flash_tens + sg.flash_tensor.shape[0] += flash_size + + for cps in sg.cascaded_passes: + for ps in cps.passes: + if ps.placement == PassPlacement.Npu and ps.weight_tensor != None: + # For DMA ops, ps.weight_tensor is referring to the SRAM weight tensor and therefore the address + # is pointing at the destination address of where the weights should be placed in SRAM. + # This ensures that the Flash weight tensor is used instead and thus gets the correct address. + if ps.weight_tensor.ops[0].type == "DMA": + copy_compressed_values_to_memory_tensor(sg.flash_tensor, ps.weight_tensor.ops[0].inputs[0]) + else: + copy_compressed_values_to_memory_tensor(sg.flash_tensor, ps.weight_tensor) + + copy_compressed_values_to_memory_tensor(sg.flash_tensor, ps.scale_tensor) + + sg.command_stream_tensor = make_memory_tensor( + sg.name + "_command_stream", flash_area, command_stream_size_bytes, True, arch + ) + sg.command_stream_tensor.values = np.frombuffer(payload_bytes, dtype=np.uint8) + + return sg.scratch_tensor, sg.flash_tensor + + +def add_const_tens_to_startup_cascaded_pass(startup_cps, tens): + op = Operation("Const", tens.name + "_const") + op.outputs = [tens] + tens.ops = [op] + startup_cps.passes[0].ops.insert(0, op) + startup_cps.passes[0].outputs.insert(0, tens) + startup_cps.outputs.insert(0, tens) + + +def rewrite_npu_call_ops(nng, sg, arch): + if sg.placement != PassPlacement.Cpu: + return + + startup_cps = sg.cascaded_passes[0] + + for idx, cps in enumerate(sg.cascaded_passes): + for ps in cps.passes: + for op in ps.ops: + if op.type == "NpuOp": + callee = op.attrs["subgraph"] + op.attrs["custom_options"] = {"type": op.type} + + sz = 0 + for tens in [callee.scratch_tensor, callee.flash_tensor, callee.command_stream_tensor]: + op.inputs.insert(0, tens) + ps.inputs.insert(0, tens) + cps.inputs.insert(0, tens) + if tens != callee.scratch_tensor: + add_const_tens_to_startup_cascaded_pass(startup_cps, tens) + sz += tens.storage_size() + + for prev_cps in sg.cascaded_passes[: idx + 1]: + prev_cps.sram_used += sz + + if callee.scratch_tensor is not None: + cps.sram_used += callee.scratch_tensor.storage_size() |