From e843d3311b8945baa32654af0dccb229b6861438 Mon Sep 17 00:00:00 2001
From: Jacob Bohlin
Date: Tue, 23 Jun 2020 12:12:56 +0200
Subject: MLBEDSW-2548: Fix for Double Buffer size estimate

This will give a worst-case estimate of the Double Buffer size in the
Scheduler, so it will no longer be able to choose strategies that end up
with a buffer that doesn't fit in SRAM.

Signed-off-by: Jacob Bohlin
Change-Id: I763731f63c7672679f3b8cd6db65dad03b946ae5
---
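Note for reviewers: a minimal, self-contained sketch of the worst-case sizing
this change is after, mirroring the new DoubleBuffer branch of
Tensor.storage_shape_for_sub_purpose(). The stand-alone function name below is
illustrative only, not part of the Vela API; param_a mirrors the patch and
stands for the per-slice output depth (the OFM depth step).

    def double_buffer_shape(weight_shape, param_a):
        # Start from the uncompressed weight shape (worst case) rather than
        # the compressed storage shape, and keep at most two depth slices,
        # one per half of the double buffer.
        shp = list(weight_shape)
        assert len(shp) >= 2
        shp[-1] = min(shp[-1], param_a * 2)
        return shp

    # Example: a 3x3x32x64 weight tensor streamed in slices of 16 output
    # channels needs room for two slices at a time:
    print(double_buffer_shape([3, 3, 32, 64], param_a=16))  # -> [3, 3, 32, 32]

Sizing from the uncompressed shape is intended as an upper bound, which is what
lets the Scheduler reject strategies whose double buffer would not fit in SRAM.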
 ethosu/vela/graph_optimiser.py   | 13 ++++++++++
 ethosu/vela/mark_tensors.py      |  2 +-
 ethosu/vela/tensor.py            | 37 ++++++++++++++-------------
 ethosu/vela/weight_compressor.py | 55 ++++++++++++++++++++++------------------
 4 files changed, 64 insertions(+), 43 deletions(-)

diff --git a/ethosu/vela/graph_optimiser.py b/ethosu/vela/graph_optimiser.py
index cd4ac63e..dbf2b7b9 100644
--- a/ethosu/vela/graph_optimiser.py
+++ b/ethosu/vela/graph_optimiser.py
@@ -435,6 +435,18 @@ def convert_depthwise_to_conv(op, arch):
     return op


+def reorder_depthwise_weights(op, arch):
+    if "DepthwiseConv2d" in op.type:
+        weight_tensor = op.inputs[1]
+        weight_tensor.quant_values = np.transpose(weight_tensor.quant_values, (0, 1, 3, 2))
+        weight_tensor.shape = weight_tensor.storage_shape = weight_tensor.bandwidth_shape = list(
+            weight_tensor.quant_values.shape
+        )
+        weight_tensor.weight_transpose_depthwise = True
+
+    return op
+
+
 # Reorder activation op if it's after the memory only operations
 def fixup_act_reorder(op, arch):
     if op.type in activation_ops:
@@ -589,6 +601,7 @@ def optimise_graph_a(nng, arch, verbose_graph=False):
         add_padding_fields,
         mark_npu_block_type,
         fixup_elementwise_with_scalars,
+        reorder_depthwise_weights,
         # convert_mul_max_to_abs_or_lrelu # TODO: enable optimisation once quantisation issues are resolved
     ]

diff --git a/ethosu/vela/mark_tensors.py b/ethosu/vela/mark_tensors.py
index 705f839b..5f3a13fa 100644
--- a/ethosu/vela/mark_tensors.py
+++ b/ethosu/vela/mark_tensors.py
@@ -368,7 +368,7 @@ def mark_tensor_format(nng, arch, verbose_tensor_format=False):
             if src_tens is not None:
                 op = tens.find_npu_op()
                 npu_block_type = op.attrs["npu_block_type"]
-                weight_compressor.compress_weights(arch, nng, tens, npu_block_type, 32, 32, op.get_dilation_h_w())
+                weight_compressor.compress_weights(arch, nng, tens, npu_block_type, 16, 16, op.get_dilation_h_w())
                 # Alias compressed weights back into source tensor
                 src_tens.copy_compressed_weight_info(tens)
diff --git a/ethosu/vela/tensor.py b/ethosu/vela/tensor.py
index bc0597f6..5e97cfe8 100644
--- a/ethosu/vela/tensor.py
+++ b/ethosu/vela/tensor.py
@@ -414,27 +414,30 @@ class Tensor:
         return rounded_size

     def storage_shape_for_sub_purpose(self, sub_purpose, param_a, param_b):
-        shp = list(self.storage_shape)
         if sub_purpose == TensorSubPurpose.DoubleBuffer:
+            shp = list(self.shape)
             assert len(shp) >= 2
             shp[-1] = min(shp[-1], param_a * 2)
-        elif sub_purpose == TensorSubPurpose.RollingBufferX:
-            assert len(shp) == 4
-            shp[0] = 1
-            shp[2] = min(shp[2], param_a)
-        elif sub_purpose == TensorSubPurpose.RollingBufferY:
-            assert len(shp) == 4
-            shp[0] = 1
-            shp[1] = min(shp[1], param_a)
-        elif sub_purpose == TensorSubPurpose.RollingBufferXY:
-            assert len(shp) == 4
-            shp[0] = 1
-            shp[2] = min(shp[2], param_a)
-            shp[1] = min(shp[1], param_b)
-        elif sub_purpose == TensorSubPurpose.Standard:
-            pass
         else:
-            assert 0, "did not expect new sub purpose %s" % (sub_purpose,)
+            shp = list(self.storage_shape)
+            if sub_purpose == TensorSubPurpose.RollingBufferX:
+                assert len(shp) == 4
+                shp[0] = 1
+                shp[2] = min(shp[2], param_a)
+            elif sub_purpose == TensorSubPurpose.RollingBufferY:
+                assert len(shp) == 4
+                shp[0] = 1
+                shp[1] = min(shp[1], param_a)
+            elif sub_purpose == TensorSubPurpose.RollingBufferXY:
+                assert len(shp) == 4
+                shp[0] = 1
+                shp[2] = min(shp[2], param_a)
+                shp[1] = min(shp[1], param_b)
+            elif sub_purpose == TensorSubPurpose.Standard:
+                pass
+            else:
+                assert 0, "did not expect new sub purpose %s" % (sub_purpose,)
+
         return shp

     def set_new_sub_purpose(self, sub_purpose, param_a=None, param_b=None):
diff --git a/ethosu/vela/weight_compressor.py b/ethosu/vela/weight_compressor.py
index d3562891..2554b7c8 100644
--- a/ethosu/vela/weight_compressor.py
+++ b/ethosu/vela/weight_compressor.py
@@ -19,7 +19,6 @@ import math
 from collections import namedtuple

 import numpy as np
-from ethosu import mlw_codec

 from .data_type import DataType
 from .errors import UnsupportedFeatureError
@@ -32,6 +31,7 @@ from .tensor import TensorBlockTraversal
 from .tensor import TensorFormat
 from .tensor import TensorPurpose
 from .tensor import TensorSubPurpose
+from ethosu import mlw_codec


 # Contains meta info for a weight compression. If two tensors have identical weight compression config,
@@ -177,10 +177,12 @@ def generate_brick(arch, brick_weights, ofm_block_depth, block_traversal, ifm_bi
                             stream.append(brick_weights[ofm_z][wy][wx][ifm_z])
     return stream

+
 def core_deinterleave(hwio, core, ncores):
     # Put weights back into OHWI
-    ohwi = np.transpose(hwio, (3,0,1,2))
-    return ohwi[core:ohwi.shape[0]:ncores]
+    ohwi = np.transpose(hwio, (3, 0, 1, 2))
+    return ohwi[core : ohwi.shape[0] : ncores]
+

 # Compress the weights
 def compress_weights(arch, nng, tens, npu_block_type, ofm_block_depth, ofm_depth_step, dilation):
@@ -244,6 +246,10 @@ def compress_weights(arch, nng, tens, npu_block_type, ofm_block_depth, ofm_depth
         # Transpose Convoluion, reverse weights in H and W axes
         weights = np.flip(weights, axis=(0, 1))

+    # Calculate brick size
+    brick_size = (weights_shape[0], weights_shape[1], weights_shape[2], min(tens.shape[-1], ofm_depth_step))
+    elements_in_brick = np.prod(brick_size)
+
     # Slice weight stream up depth-ways into bricks and compress
     full_ofm_depth = quant_buf.shape[-1]
     for idx in range(0, full_ofm_depth, ofm_depth_step):
@@ -262,17 +268,19 @@ def compress_weights(arch, nng, tens, npu_block_type, ofm_block_depth, ofm_depth
             block_depth = (ofm_block_depth + arch.ncores - 1 - core) // arch.ncores
             if block_depth != 0:
-                raw_stream = generate_brick(arch, core_weights, block_depth, tens.block_traversal, ifm_bitdepth, dilation)
+                raw_stream = generate_brick(
+                    arch, core_weights, block_depth, tens.block_traversal, ifm_bitdepth, dilation
+                )
             else:
                 raw_stream = []

-            raw_size += len( raw_stream )
-            encoded_substream = encode( raw_stream )
-            encoded_stream.extend( encoded_substream )
-            substream_offsets.append( len(encoded_stream) )
+            raw_size += len(raw_stream)
+            encoded_substream = encode(raw_stream)
+            encoded_stream.extend(encoded_substream)
+            substream_offsets.append(len(encoded_stream))

-        encoded_streams.append( encoded_stream )
-        encoded_streams_substream_offsets.append( substream_offsets )
+        encoded_streams.append(encoded_stream)
+        encoded_streams_substream_offsets.append(substream_offsets)

         # Remember maximum encoded length for DoubleBuffering
         max_single_buffer_len = max(max_single_buffer_len, len(encoded_stream))

@@ -283,7 +291,7 @@ def compress_weights(arch, nng, tens, npu_block_type, ofm_block_depth, ofm_depth
         assert offset % 16 == 0

         # Compression scale tracking
-        compression_scales.append(len(encoded_stream) / raw_size)
+        compression_scales.append(len(encoded_stream) / elements_in_brick)

     # Track total length as last element of the offsets array
     compressed_offsets.append(offset)
@@ -294,10 +302,11 @@ def compress_weights(arch, nng, tens, npu_block_type, ofm_block_depth, ofm_depth
     tens.storage_compression_scale = tens.bandwidth_compression_scale = np.average(compression_scales)
     tens.compressed_values = encoded_streams
     tens.compressed_values_substream_offsets = encoded_streams_substream_offsets
-    tens.brick_size = (weights_shape[0], weights_shape[1], weights_shape[2], min(tens.shape[-1], ofm_depth_step))
+    tens.brick_size = brick_size
     set_storage_shape(tens)
     nng.weight_cache.add(tens)

+
 def calc_scales_and_pack_biases(tens, arch, ofm_depth_step, rescale_for_faf=False):
     assert tens.purpose == TensorPurpose.FeatureMap
     assert tens.format == TensorFormat.NHWC
@@ -399,24 +408,25 @@ def calc_scales_and_pack_biases(tens, arch, ofm_depth_step, rescale_for_faf=Fals
         substream_offsets = [0]
         max_len = min(ofm_depth_step, total_elements - i)
         for core in range(0, min(arch.ncores, max_len)):
-            core_scales = quantised_scales[i+core:i+core+max_len:arch.ncores]
-            core_biases = biases[i+core:i+core+max_len:arch.ncores]
+            core_scales = quantised_scales[i + core : i + core + max_len : arch.ncores]
+            core_biases = biases[i + core : i + core + max_len : arch.ncores]
             for j, core_bias in enumerate(core_biases):
-                stream.extend( pack_bias_and_scale(core_bias, *core_scales[j]) )
+                stream.extend(pack_bias_and_scale(core_bias, *core_scales[j]))

             # Align to 16 for start for next substream
-            remainder = ( len(stream) ) % 16
+            remainder = (len(stream)) % 16
             if remainder > 0:
-                stream.extend( bytearray(16 - remainder) )
+                stream.extend(bytearray(16 - remainder))

-            substream_offsets.append( len(stream) )
+            substream_offsets.append(len(stream))

         # Add to compressed values with their substream offset lists to the tensor
-        tens.compressed_values.append( stream )
-        tens.compressed_values_substream_offsets.append( substream_offsets )
+        tens.compressed_values.append(stream)
+        tens.compressed_values_substream_offsets.append(substream_offsets)

     tens.storage_shape = [total_elements * tens.element_size_bytes]

+
 def update_pass_weight_and_scale_tensors(nng, arch):
     for sg in nng.subgraphs:
         for ps in sg.passes:
@@ -424,11 +434,6 @@ def update_pass_weight_and_scale_tensors(nng, arch):
             if tens is not None:
                 op = tens.find_npu_op()
                 npu_usage_of_tensor = op.attrs["npu_block_type"]
-                if npu_usage_of_tensor == NpuBlockType.ConvolutionDepthWise:
-                    tens.quant_values = np.transpose(tens.quant_values, (0, 1, 3, 2))
-                    tens.shape = tens.storage_shape = tens.bandwidth_shape = list(tens.quant_values.shape)
-                    tens.weight_transpose_depthwise = True
-
                 needs_dma = tens.needs_dma()
                 if ps.cascade.strategy == SchedulingStrategy.WeightStream and needs_dma:
                     ofm_depth_step = ps.block_config[-1]
--
cgit v1.2.1
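Reviewer note (outside the patch proper): the compression-scale bookkeeping now
divides the encoded stream length by elements_in_brick, the element count of
the uncompressed brick, instead of by the raw interleaved stream length. A
rough stand-alone illustration; the function below is not Vela code, it only
borrows the names used in the patch and assumes the encoded stream is a byte
sequence:

    import numpy as np

    def compression_scale(encoded_stream, brick_size):
        # Encoded length per uncompressed weight element in the brick.
        elements_in_brick = int(np.prod(brick_size))
        return len(encoded_stream) / elements_in_brick

    # Example: 9216 encoded bytes for a 3x3x32x16 brick of weights.
    print(compression_scale(bytearray(9216), (3, 3, 32, 16)))  # -> 2.0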