Diffstat (limited to 'ethosu/vela/insert_dma.py')
-rw-r--r-- | ethosu/vela/insert_dma.py | 110 |
1 files changed, 0 insertions, 110 deletions
diff --git a/ethosu/vela/insert_dma.py b/ethosu/vela/insert_dma.py
deleted file mode 100644
index bbe18f7b..00000000
--- a/ethosu/vela/insert_dma.py
+++ /dev/null
@@ -1,110 +0,0 @@
-# Copyright (C) 2020 Arm Limited or its affiliates. All rights reserved.
-#
-# SPDX-License-Identifier: Apache-2.0
-#
-# Licensed under the Apache License, Version 2.0 (the License); you may
-# not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an AS IS BASIS, WITHOUT
-# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# Description:
-# Insert DMA operations into the graph for transferring weights.
-from . import rewrite_graph
-from .operation import NpuBlockType
-from .operation import Op
-from .operation import Operation
-from .tensor import MemArea
-from .tensor import MemType
-from .tensor import TensorPurpose
-from .weight_compressor import compress_weights
-
-
-def weights_fit_sram(arch, op, tens, nng):
-    # Compresses the weights and checks whether they fit in SRAM
-    if tens.purpose != TensorPurpose.Weights:
-        return True
-
-    min_weight_size = 0
-    if len(tens.shape) == 4:
-        min_weight_size = tens.shape[0] * tens.shape[1] * tens.shape[2] * arch.OFMSplitDepth
-    elif len(tens.shape) == 2:
-        min_weight_size = tens.shape[0] * arch.OFMSplitDepth
-
-    compress_weights(arch, nng, tens, op.type.npu_block_type, 16, 16, op.get_dilation_h_w())
-
-    # Needs to fit into SRAM as a double buffer
-    worst_buffer_size = tens.compression_scale_for_worst_weight_stream * min_weight_size * 2
-    if worst_buffer_size > arch.sram_size:
-        print(
-            "Weights, {}, are too big to be DMAed to SRAM, estimated minimum size is {} bytes".format(
-                tens.name, worst_buffer_size
-            )
-        )
-        return False
-    return True
-
-
-def insert_dma_cmd(op, arch, nng):
-    if op.type == Op.DMA or not op.run_on_npu:
-        return op
-
-    is_lut_used = any(inp.purpose == TensorPurpose.LUT for inp in op.inputs)
-    max_ifm_shram_avail = (
-        (arch.available_shram_banks(is_lut_used) - arch.shram_reserved_output_banks) * arch.shram_bank_size // 2
-    )
-
-    for idx, tens in enumerate(op.inputs):
-
-        if tens.mem_type not in (MemType.Scratch, MemType.Scratch_fast):
-            # Tensor is in permanent storage
-            # Only when permanent storage differs from fast storage is there a point in moving the data
-            if (
-                tens.mem_area in (MemArea.Dram, MemArea.OffChipFlash)
-                and arch.permanent_storage_mem_area != arch.fast_storage_mem_area
-            ) or tens.purpose == TensorPurpose.LUT:
-                if tens.purpose in (TensorPurpose.Weights, TensorPurpose.LUT) or (
-                    tens.purpose == TensorPurpose.FeatureMap
-                    and op.type.is_binary_elementwise_op()
-                    and tens.shape != []
-                    and op.ifm_shapes[0] != op.ofm_shapes[0]
-                    and tens.storage_size() > max_ifm_shram_avail
-                ):
-                    only_vector_product_consumers = True
-                    for oper in tens.consumers():
-                        if oper is None or oper.type.npu_block_type != NpuBlockType.VectorProduct:
-                            only_vector_product_consumers = False
-                            break
-
-                    # Tensor products have no need for DMA; their tensors are only read once and can stay in flash.
-                    # Other operations re-read tensors, which is better done from SRAM.
-                    # LUTs must be placed in the last 2 blocks of SHRAM.
-                    if (
-                        not only_vector_product_consumers and weights_fit_sram(arch, op, tens, nng)
-                    ) or tens.purpose == TensorPurpose.LUT:
-                        # Insert a DMA command here, as well as a new tensor of the same size situated in SRAM.
-                        new_tens = tens.clone_into_fast_storage(arch)
-                        dma_cmd = Operation(Op.DMA, tens.ops[0].name + "_dma")
-                        dma_cmd.inputs = [tens]
-                        dma_cmd.set_output_tensor(new_tens)
-                        dma_cmd.attrs["source"] = tens.mem_area
-                        dma_cmd.attrs["destination"] = new_tens.mem_area
-                        dma_cmd.run_on_npu = True
-                        if tens.purpose == TensorPurpose.LUT:
-                            new_tens.mem_area = MemArea.Shram
-                        op.inputs[idx] = new_tens
-    return op
-
-
-def insert_dma_commands(nng, arch, verbose_graph=False):
-
-    for idx, sg in enumerate(nng.subgraphs):
-        nng.subgraphs[idx] = rewrite_graph.rewrite_graph_pre_order(nng, sg, arch, [], [insert_dma_cmd])
-        if verbose_graph:
-            nng.print_graph("After DMA insertion")
-    return nng
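
For context on the removed pass: the gating rule in weights_fit_sram was a worst-case, double-buffered size estimate for the compressed weight stream (one stream is consumed while the next is DMAed in, so the worst stream counts twice against the SRAM budget). Below is a minimal standalone Python sketch of that arithmetic only; the SimpleNamespace objects and the sample numbers are hypothetical stand-ins for Vela's arch and tensor objects, not part of the deleted code.

from types import SimpleNamespace

def worst_case_weight_buffer_fits(arch, tens, min_weight_size):
    # Double buffering: count the worst-case compressed weight stream twice,
    # since one copy is being consumed while the next is transferred in.
    worst_buffer_size = tens.compression_scale_for_worst_weight_stream * min_weight_size * 2
    return worst_buffer_size <= arch.sram_size

# Hypothetical values, for illustration only.
arch = SimpleNamespace(sram_size=384 * 1024)
tens = SimpleNamespace(compression_scale_for_worst_weight_stream=0.5)
print(worst_case_weight_buffer_fits(arch, tens, min_weight_size=64 * 1024))  # True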