From 3010d9b5c90628e07c7d0f0c33e7355b8bc3e19d Mon Sep 17 00:00:00 2001
From: Patrik Gustavsson
Date: Thu, 1 Oct 2020 08:22:10 +0200
Subject: MLBEDSW-3060 Adjust check if weights fit in sram

When deciding if weights fit in sram:
Compression of the weights has been added for the case where the
weight compression test limit makes it impossible to fit the weights
in a double buffer in sram.
The worst compression ratio from that compression is used to decide
if the weights can fit in sram.

Signed-off-by: Patrik Gustavsson
Change-Id: I9458769866b3f9fc15659185aae09658ed10fb38
---
 ethosu/vela/insert_dma.py | 44 ++++++++++++++++++++++++++++----------------
 1 file changed, 28 insertions(+), 16 deletions(-)

diff --git a/ethosu/vela/insert_dma.py b/ethosu/vela/insert_dma.py
index 9304526a..99b46c07 100644
--- a/ethosu/vela/insert_dma.py
+++ b/ethosu/vela/insert_dma.py
@@ -21,12 +21,13 @@ from .operation import Operation
 from .tensor import MemArea
 from .tensor import MemType
 from .tensor import TensorPurpose
+from .weight_compressor import compress_weights
 
 
 binary_elementwise_op = set(("AddAct", "MulAct", "SubAct", "Maximum", "Minimum"))
 
 
-def weights_fit_sram(arch, tens):
+def weights_fit_sram(arch, op, tens, nng):
     if tens.purpose != TensorPurpose.Weights:
         return True
 
@@ -36,25 +37,33 @@ def weights_fit_sram(arch, tens):
     elif len(tens.shape) == 2:
         min_weight_size = tens.shape[0] * arch.OFMSplitDepth
 
-    w_compression = 1  # TODO worst compression ratio currently assumed
-    # Need to be fit into Sram, as a double buffer
-    if (w_compression * min_weight_size * 2) > arch.sram_size:
-        print(
-            "Weights, {}, are too big to be DMAed to SRAM, estimated minimum size is {} bytes".format(
-                tens.name, (w_compression * min_weight_size * 2)
+    # Only evaluate when the compression test limit will make it impossible to fit
+    w_comp_test_limit = 2
+    if (w_comp_test_limit * min_weight_size * 2) > arch.sram_size:
+        # check worst compression ratio
+        npu_block_type = op.attrs.get("npu_block_type", NpuBlockType.Default)
+        compress_weights(arch, nng, tens, npu_block_type, 16, 16, op.get_dilation_h_w())
+
+        worst_buffer_size = tens.compression_scale_for_worst_weight_stream * min_weight_size * 2
+        if worst_buffer_size > arch.sram_size:
+            print(
+                "Weights, {}, are too big to be DMAed to SRAM, estimated minimum size is {} bytes".format(
+                    tens.name, worst_buffer_size
+                )
             )
-        )
-        return False
+            return False
 
     return True
 
 
-def insert_dma_cmd(op, arch):
+def insert_dma_cmd(op, arch, nng):
     if op.type == "DMA" or not op.run_on_npu:
         return op
 
-    is_lut_used = any(inp.purpose == TensorPurpose.LUT for inp in op.inputs)
-    max_ifm_shram_avail = (arch.available_shram_banks(is_lut_used) - arch.shram_reserved_output_banks) * arch.shram_bank_size // 2
+    is_lut_used = any(inp.purpose == TensorPurpose.LUT for inp in op.inputs)
+    max_ifm_shram_avail = (
+        (arch.available_shram_banks(is_lut_used) - arch.shram_reserved_output_banks) * arch.shram_bank_size // 2
+    )
 
     for idx, tens in enumerate(op.inputs):
 
@@ -66,8 +75,11 @@ def insert_dma_cmd(op, arch):
                 and arch.permanent_storage_mem_area != arch.fast_storage_mem_area
             ) or tens.purpose == TensorPurpose.LUT:
                 if tens.purpose in (TensorPurpose.Weights, TensorPurpose.LUT) or (
-                    tens.purpose == TensorPurpose.FeatureMap and op.type in binary_elementwise_op and
-                    tens.shape != [] and tens.shape != op.outputs[0].shape and tens.storage_size() > max_ifm_shram_avail
+                    tens.purpose == TensorPurpose.FeatureMap
+                    and op.type in binary_elementwise_op
+                    and tens.shape != []
+                    and tens.shape != op.outputs[0].shape
+                    and tens.storage_size() > max_ifm_shram_avail
                 ):
                     only_vector_product_consumers = True
                     for oper in tens.consumers():
@@ -79,7 +91,7 @@ def insert_dma_cmd(op, arch):
                     # Other operations re-reads tensors, this is better done from SRAM.
                     # LUTs must be placed in the last 2 blocks of SHRAM.
                     if (
-                        not only_vector_product_consumers and weights_fit_sram(arch, tens)
+                        not only_vector_product_consumers and weights_fit_sram(arch, op, tens, nng)
                     ) or tens.purpose == TensorPurpose.LUT:
                         # Insert a DMA command here, as well as a new tensor situated in SRAM of the same size.
                         new_tens = tens.clone_into_fast_storage(arch)
@@ -98,7 +110,7 @@ def insert_dma_commands(nng, arch, verbose_graph=False):
     for idx, sg in enumerate(nng.subgraphs):
-        nng.subgraphs[idx] = rewrite_graph.rewrite_graph_pre_order(sg, arch, [], [insert_dma_cmd])
+        nng.subgraphs[idx] = rewrite_graph.rewrite_graph_pre_order(nng, sg, arch, [], [insert_dma_cmd])
     if verbose_graph:
         nng.print_graph()
     return nng
-- 
cgit v1.2.1
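
Note on the check introduced above: a minimal, standalone sketch of the double-buffer fit test, under assumed names. weights_fit_double_buffer, worst_compression_scale and the sizes below are illustrative only, not the Vela API; in the patch the worst-case ratio is taken from tens.compression_scale_for_worst_weight_stream after compress_weights() has run.

    # Illustrative sketch only: weights are DMAed into SRAM as a double buffer,
    # so the worst-case compressed weight stream must fit in SRAM twice.
    def weights_fit_double_buffer(min_weight_size, worst_compression_scale, sram_size):
        # worst_compression_scale is the poorest ratio seen across all weight
        # streams, giving a conservative upper bound on the buffer size.
        worst_buffer_size = worst_compression_scale * min_weight_size * 2
        return worst_buffer_size <= sram_size

    # Example: 300 KB of raw weights, worst-case scale 0.7, 512 KB SRAM
    print(weights_fit_double_buffer(300 * 1024, 0.7, 512 * 1024))  # True: 420 KB <= 512 KB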