author    | Tim Hall <tim.hall@arm.com> | 2020-06-25 15:04:31 +0100
committer | Tim Hall <tim.hall@arm.com> | 2020-06-25 15:07:58 +0100
commit    | f7e810a695c1426799d945d61671126543efc123 (patch)
tree      | dad3bfd9120bdcd92d3fe216a01a4e798276a080 /ethosu/vela
parent    | eca2e95e1fea150d8a942f8b5f0a4d9d7aefebc1 (diff)
download  | ethos-u-vela-f7e810a695c1426799d945d61671126543efc123.tar.gz
vela: MLBEDSW-828 weight/scale stream interleaving
- Interleave the weight and scale streams across cores for multicore
  hardware architectures.
Change-Id: Ic82850463391c629d90d08c26cf0c48dd438286d
Signed-off-by: Tim Hall <tim.hall@arm.com>
Diffstat (limited to 'ethosu/vela')
-rw-r--r-- | ethosu/vela/driver_actions.py                    |  4
-rw-r--r-- | ethosu/vela/register_command_stream_generator.py | 42
-rw-r--r-- | ethosu/vela/tensor.py                            |  3
-rw-r--r-- | ethosu/vela/weight_compressor.py                 | 90
4 files changed, 98 insertions, 41 deletions
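Before diving into the diff, the shape of the new data is worth noting: each compressed weight or scale stream becomes a concatenation of per-core substreams, and the tensor keeps one offset list per stream whose last entry is the total stream length (the patch stores these in `Tensor.compressed_values_substream_offsets`). A minimal standalone sketch of that convention, with hypothetical helper names that are not part of the patch:

```python
# Illustrative sketch only: build and consume a per-core substream offset list.
# The helper names are hypothetical; vela stores the real lists in
# Tensor.compressed_values_substream_offsets.

def build_substream_offsets(substreams):
    """Concatenate per-core substreams and record where each one starts.

    The offset list gets one entry per substream plus a final entry holding
    the total length, i.e. it terminates with the full stream length.
    """
    offsets = [0]
    combined = bytearray()
    for substream in substreams:
        combined.extend(substream)
        offsets.append(len(combined))
    return bytes(combined), offsets


def slice_substream(combined, offsets, core):
    """Recover the slice belonging to one core from the combined stream."""
    return combined[offsets[core]:offsets[core + 1]]


if __name__ == "__main__":
    core0 = bytes(range(16))      # pretend encoded weights for core 0
    core1 = bytes(range(16, 48))  # pretend encoded weights for core 1
    stream, offsets = build_substream_offsets([core0, core1])
    assert offsets == [0, 16, 48]
    assert slice_substream(stream, offsets, 1) == core1
```

With two cores the list has three entries; on a single-core configuration it degenerates to `[0, total_length]`.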
diff --git a/ethosu/vela/driver_actions.py b/ethosu/vela/driver_actions.py
index 79ac11a1..29c2b181 100644
--- a/ethosu/vela/driver_actions.py
+++ b/ethosu/vela/driver_actions.py
@@ -65,9 +65,9 @@ def build_id_word():
 
 
 def build_config_word(arch):
-    macs_cc = arch.config.macs
+    macs_cc = arch.ncores * arch.config.macs
     log2_macs_cc = int(np.log2(macs_cc) + 0.5)
-    shram_size = int(arch.shram_size_bytes / 1024)
+    shram_size = arch.ncores * int(arch.shram_size_bytes / 1024)
     n = config_r()
     n.set_shram_size(shram_size)
     n.set_cmd_stream_version(0)  # may be incremented in the future
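For context, the change above simply makes the config word report MACs-per-clock-cycle and SHRAM size summed over all cores. A rough illustration with assumed values (a two-core configuration with 256 MACs/cc and 48 KB SHRAM per core is made up for the example, not a real `arch.config`):

```python
# Assumed values for illustration only; the real figures come from the
# architecture configuration (arch.config.macs, arch.shram_size_bytes).
import numpy as np

ncores = 2                        # hypothetical dual-core NPU
macs_per_core = 256               # hypothetical MACs per clock cycle per core
shram_bytes_per_core = 48 * 1024  # hypothetical SHRAM bytes per core

macs_cc = ncores * macs_per_core                        # 512 MACs/cc in total
log2_macs_cc = int(np.log2(macs_cc) + 0.5)              # 9, as encoded in the config word
shram_size = ncores * int(shram_bytes_per_core / 1024)  # 96 KB reported to the driver
print(macs_cc, log2_macs_cc, shram_size)
```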
diff --git a/ethosu/vela/register_command_stream_generator.py b/ethosu/vela/register_command_stream_generator.py
index 9dd290a9..e753885c 100644
--- a/ethosu/vela/register_command_stream_generator.py
+++ b/ethosu/vela/register_command_stream_generator.py
@@ -1,3 +1,4 @@
+
 # Copyright (C) 2020 Arm Limited or its affiliates. All rights reserved.
 #
 # SPDX-License-Identifier: Apache-2.0
@@ -390,6 +391,8 @@ def generate_register_command_stream(nng, sg, arch, verbose=False):
             param = 0
             emit.cmd_wait(cmd0.NPU_OP_DMA_WAIT, param, absolute_dep[CommandType.DMA][0])
 
+    emit.cmd0_with_param(cmd0.NPU_SET_PARALLEL_MODE, arch.ncores-1)
+
     for cmd in cmd_stream:
         if cmd.cmdtype == CommandType.DMA:
             start_coord = cmd.box.start_coord
@@ -689,26 +692,45 @@ def generate_register_command_stream(nng, sg, arch, verbose=False):
             # Emit Weight base address commands, only maps the area required for
             # this command's weights from the larger tensor.
             stream_index = cmd.weight_tensor.compressed_stream_index_from_coord(cmd.weight_box.start_coord)
+            weight_substream_offsets = cmd.weight_tensor.compressed_values_substream_offsets[stream_index]
+            substreams = len( weight_substream_offsets ) - 1  # Offset list must terminate with full stream length
+            assert substreams == arch.ncores
+
+            # Extract weight substream offsets and calculate their lengths
+            assert len(weight_substream_offsets) > 1 and (weight_substream_offsets[0] == 0)
             weight_addr = cmd.weight_tensor.address_for_coordinate(cmd.weight_box.start_coord)
-            weight_len = cmd.weight_tensor.size_of_compressed_stream(stream_index)
+
+            if substreams > 0:
+                emit.cmd1_with_offset(cmd1.NPU_SET_WEIGHT_BASE, weight_addr + weight_substream_offsets[0] )
+                emit.cmd1_with_offset(cmd1.NPU_SET_WEIGHT_LENGTH, weight_substream_offsets[1] - weight_substream_offsets[0])
+            if substreams > 1:
+                emit.cmd1_with_offset(cmd1.NPU_SET_WEIGHT1_BASE, weight_addr + weight_substream_offsets[1])
+                emit.cmd1_with_offset(cmd1.NPU_SET_WEIGHT1_LENGTH, weight_substream_offsets[2] - weight_substream_offsets[1])
+
             weight_region = base_ptr_idx_map[cmd.weight_tensor.mem_type]
             emit.cmd0_with_param(cmd0.NPU_SET_WEIGHT_REGION, weight_region)
-            emit.cmd1_with_offset(cmd1.NPU_SET_WEIGHT_BASE, weight_addr)
-            emit.cmd1_with_offset(cmd1.NPU_SET_WEIGHT_LENGTH, weight_len)
 
             # Emit Scale & Bias base address commands, with length matching the amount required by
             # the weight tensors.
             if cmd.scale_tensor is not None:
-                # Get address and size of the scale/bias data area
-                scale_addr = cmd.scale_tensor.address_for_coordinate(cmd.weight_box.start_coord[-1:])
-                scale_len = (
-                    cmd.scale_tensor.address_for_coordinate(cmd.weight_box.end_coord[-1:], True) - scale_addr
-                )
+                scale_substream_offsets = cmd.scale_tensor.compressed_values_substream_offsets[stream_index]
+                substreams = len( scale_substream_offsets ) - 1  # Offset list must terminate with full stream length
+                assert substreams == arch.ncores
+
+                # Extract scale substream offsets and calculate their lengths
+                assert len(scale_substream_offsets) > 1 and (scale_substream_offsets[0] == 0)
+                scale_addr = cmd.scale_tensor.address_for_coordinate( cmd.weight_box.start_coord[-1:] )
+
+                if substreams > 0:
+                    emit.cmd1_with_offset(cmd1.NPU_SET_SCALE_BASE, scale_addr + scale_substream_offsets[0])
+                    emit.cmd1_with_offset(cmd1.NPU_SET_SCALE_LENGTH, scale_substream_offsets[1] - scale_substream_offsets[0] )
+                if substreams > 1:
+                    emit.cmd1_with_offset(cmd1.NPU_SET_SCALE1_BASE, scale_addr + scale_substream_offsets[1])
+                    emit.cmd1_with_offset(cmd1.NPU_SET_SCALE1_LENGTH, scale_substream_offsets[2] - scale_substream_offsets[1] )
+
                 # Emit base address for NPU to access scale & bias data
                 scale_region = base_ptr_idx_map[cmd.scale_tensor.mem_type]
                 emit.cmd0_with_param(cmd0.NPU_SET_SCALE_REGION, scale_region)
-                emit.cmd1_with_offset(cmd1.NPU_SET_SCALE_BASE, scale_addr)
-                emit.cmd1_with_offset(cmd1.NPU_SET_SCALE_LENGTH, round_up(scale_len, 16))
 
             ofm_quant = cmd.ofm_tensor.quantization
             ofm_quant_qmin = cmd.ofm_tensor.quantization.quant_min
diff --git a/ethosu/vela/tensor.py b/ethosu/vela/tensor.py
index 3990164d..eda21c9c 100644
--- a/ethosu/vela/tensor.py
+++ b/ethosu/vela/tensor.py
@@ -229,6 +229,7 @@ class Tensor:
         "values",
         "quant_values",
         "compressed_values",
+        "compressed_values_substream_offsets",
         "mem_area",
         "mem_type",
         "format",
@@ -273,6 +274,7 @@ class Tensor:
         self.values = None
         self.quant_values = None
         self.compressed_values = None
+        self.compressed_values_substream_offsets = None
         self.mem_area = MemArea.Unknown
         self.mem_type = MemType.Unknown
         self.format = TensorFormat.Unknown
@@ -342,6 +344,7 @@ class Tensor:
     def copy_compressed_weight_info(self, src_tens):
         # Copies compressed values + all related weight compression info from the given tensor
         self.compressed_values = src_tens.compressed_values
+        self.compressed_values_substream_offsets = src_tens.compressed_values_substream_offsets
         self.storage_shape = src_tens.storage_shape
         self.brick_size = src_tens.brick_size
         self.weight_compression_scales = src_tens.weight_compression_scales
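To make the new register emission concrete, here is a small self-contained sketch of the base/length arithmetic for up to two cores. The `emit` callback, the plain-string register names and the example offsets are all hypothetical; the patch itself uses `emit.cmd1_with_offset` with the `cmd1.NPU_SET_WEIGHT*` / `cmd1.NPU_SET_SCALE*` enums:

```python
# Hypothetical stand-in for the vela emitter: derive each core's base/length
# register pair from the stream base address and the substream offset list.

def emit_weight_bases(emit, weight_addr, substream_offsets):
    # The offset list terminates with the full stream length, so the number
    # of substreams is one less than its length.
    substreams = len(substream_offsets) - 1
    registers = [("WEIGHT_BASE", "WEIGHT_LENGTH"), ("WEIGHT1_BASE", "WEIGHT1_LENGTH")]
    for core in range(min(substreams, len(registers))):
        base_reg, length_reg = registers[core]
        emit(base_reg, weight_addr + substream_offsets[core])
        emit(length_reg, substream_offsets[core + 1] - substream_offsets[core])


if __name__ == "__main__":
    emitted = []
    emit_weight_bases(lambda reg, value: emitted.append((reg, value)),
                      weight_addr=0x1000, substream_offsets=[0, 96, 208])
    # Core 0 gets 96 bytes at 0x1000; core 1 gets 112 bytes at 0x1060.
    assert emitted == [("WEIGHT_BASE", 0x1000), ("WEIGHT_LENGTH", 96),
                       ("WEIGHT1_BASE", 0x1060), ("WEIGHT1_LENGTH", 112)]
```

Each core's base is the stream address plus its starting offset, and each length is the difference between consecutive offsets; a single-core stream falls back to emitting only the first pair.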
diff --git a/ethosu/vela/weight_compressor.py b/ethosu/vela/weight_compressor.py
index 77220a93..fe8f04b9 100644
--- a/ethosu/vela/weight_compressor.py
+++ b/ethosu/vela/weight_compressor.py
@@ -97,11 +97,11 @@ def generate_brick(arch, brick_weights, ofm_block_depth, block_traversal, ifm_bi
     decomp_w = arch.subkernel_max.width // dilation[1]
     ofm_ublock = arch.ofm_ublock
     ifm_ublock = arch.ifm_ublock
-    # Expect weights formatted HWIO
-    ofm_depth = brick_weights.shape[-1]
-    ifm_depth = brick_weights.shape[-2]
-    kernel_width = brick_weights.shape[-3]
-    kernel_height = brick_weights.shape[-4]
+    # Expect weights formatted OHWI
+    ofm_depth = brick_weights.shape[-4]
+    ifm_depth = brick_weights.shape[-1]
+    kernel_width = brick_weights.shape[-2]
+    kernel_height = brick_weights.shape[-3]
     # IFM block depth
     if is_partkernel or (ifm_bitdepth == 16):
         # IFM block depth is always 16 for part-kernel-first
@@ -174,9 +174,13 @@ def generate_brick(arch, brick_weights, ofm_block_depth, block_traversal, ifm_bi
                                         if (ifm_z >= ifm_depth) or (ofm_z >= ofm_depth) or (ky >= sub_height):
                                             stream.append(0)
                                         else:
-                                            stream.append(brick_weights[wy][wx][ifm_z][ofm_z])
+                                            stream.append(brick_weights[ofm_z][wy][wx][ifm_z])
     return stream
 
+def core_deinterleave(hwio, core, ncores):
+    # Put weights back into OHWI
+    ohwi = np.transpose(hwio, (3,0,1,2))
+    return ohwi[core:ohwi.shape[0]:ncores]
 
 # Compress the weights
 def compress_weights(arch, nng, tens, npu_block_type, ofm_block_depth, ofm_depth_step, dilation):
@@ -215,7 +219,9 @@ def compress_weights(arch, nng, tens, npu_block_type, ofm_block_depth, ofm_depth
     compression_scales = []
     compressed_offsets = []
     encoded_streams = []
+    encoded_streams_substream_offsets = []
     offset = 0
+    max_single_buffer_len = 0
 
     ifm_bitdepth = tens.consumer_list[0].inputs[0].dtype.size_in_bits()
     ifm_depth = weights.shape[-2]
@@ -240,25 +246,41 @@ def compress_weights(arch, nng, tens, npu_block_type, ofm_block_depth, ofm_depth
 
     # Slice weight stream up depth-ways into bricks and compress
     full_ofm_depth = quant_buf.shape[-1]
+    ofm_block_depth = ofm_block_depth // arch.ncores
     for idx in range(0, full_ofm_depth, ofm_depth_step):
         # Get the weights necessary for this brick
         count = min(full_ofm_depth - idx, ofm_depth_step)
         brick_weights = weights[:, :, :, idx : idx + count]
 
-        # Encode all weights into one chunk
-        raw_stream = generate_brick(arch, brick_weights, ofm_block_depth, tens.block_traversal, ifm_bitdepth, dilation)
-        encoded = encode(raw_stream)
-        encoded_streams.append(encoded)
+        substream_offsets = [0]
+        encoded_stream = []
+        raw_size = 0
+
+        # For each core, deinterleave weights from the larger volume
+        # and generate separate compressed streams.
+        for core in range(0, min(arch.ncores, full_ofm_depth)):
+            core_weights = core_deinterleave(brick_weights, core, arch.ncores)
+            raw_stream = generate_brick(arch, core_weights, ofm_block_depth, tens.block_traversal, ifm_bitdepth, dilation)
+            raw_size += len( raw_stream )
+            encoded_substream = encode( raw_stream )
+            encoded_stream.extend( encoded_substream )
+            substream_offsets.append( len(encoded_stream) )
+
+        encoded_streams.append( encoded_stream )
+        encoded_streams_substream_offsets.append( substream_offsets )
+
+        # Remember maximum encoded length for DoubleBuffering
+        max_single_buffer_len = max(max_single_buffer_len, len(encoded_stream))
 
         # Remember where we put it for linear addressing
         compressed_offsets.append(offset)
-        offset += len(encoded)
+        offset += len(encoded_stream)
         assert offset % 16 == 0
 
         # Compression scale tracking
-        compression_scales.append(len(encoded) / len(raw_stream))
+        compression_scales.append(len(encoded_stream) / raw_size)
 
-    # Also track complete length in the offsets array
+    # Track total length as last element of the offsets array
     compressed_offsets.append(offset)
 
     tens.weight_compression_scales = compression_scales
@@ -266,12 +288,12 @@ def compress_weights(arch, nng, tens, npu_block_type, ofm_block_depth, ofm_depth
     tens.compression_scale_for_worst_weight_stream = np.amax(compression_scales)
     tens.storage_compression_scale = tens.bandwidth_compression_scale = np.average(compression_scales)
     tens.compressed_values = encoded_streams
+    tens.compressed_values_substream_offsets = encoded_streams_substream_offsets
     tens.brick_size = (weights_shape[0], weights_shape[1], weights_shape[2], min(tens.shape[-1], ofm_depth_step))
     set_storage_shape(tens)
 
     nng.weight_cache.add(tens)
-
-def calc_scales_and_pack_biases(tens, arch, oc_quantum, rescale_for_faf=False):
+def calc_scales_and_pack_biases(tens, arch, ofm_depth_step, rescale_for_faf=False):
     assert tens.purpose == TensorPurpose.FeatureMap
     assert tens.format == TensorFormat.NHWC
     # the connected operator should expect a bias input unless it is a FullyConnected
@@ -356,29 +378,39 @@ def calc_scales_and_pack_biases(tens, arch, oc_quantum, rescale_for_faf=False):
             assert shift >= 16
 
     # pack the biases and scales
-    tens.compressed_values = []
     if len(quantised_scales) == 1:
         # If only 1 quantised scale is used, repeat that value for the length of the biases
         quantised_scales = [quantised_scales[0]] * len(biases)
 
     assert len(quantised_scales) == len(biases)
-    for i, bias in enumerate(biases):
-        tens.compressed_values.append(pack_bias_and_scale(bias, *quantised_scales[i]))
-
     tens.element_size_bytes = 10
 
+    tens.compressed_values = []
+    tens.compressed_values_substream_offsets = []
+
+    total_elements = len(quantised_scales)
+    for i in range(0, total_elements, ofm_depth_step):
+        # Extract streams from brick to generate substreams for each core
+        stream = bytearray()
+        substream_offsets = [0]
+        max_len = min(ofm_depth_step, total_elements - i)
+        for core in range(0, min(arch.ncores, max_len)):
+            core_scales = quantised_scales[i+core:i+core+max_len:arch.ncores]
+            core_biases = biases[i+core:i+core+max_len:arch.ncores]
+            for j, core_bias in enumerate(core_biases):
+                stream.extend( pack_bias_and_scale(core_bias, *core_scales[j]) )
 
-    # Figure out if we need padded storage (extra whole elements)
-    padding = (len(tens.compressed_values) * tens.element_size_bytes) % 16
-    if padding != 0:
-        padding = 16 - padding
+            # Align to 16 for start for next substream
+            remainder = ( len(stream) ) % 16
+            if remainder > 0:
+                stream.extend( bytearray(16 - remainder) )
 
-    # This adds enough padding to allow over-reads
-    while padding > 0:
-        tens.compressed_values.append(pack_bias_and_scale(0, 0, 0))
-        padding = padding - tens.element_size_bytes
+            substream_offsets.append( len(stream) )
 
-    tens.storage_shape = [len(tens.compressed_values)]
+        # Add to compressed values with their substream offset lists to the tensor
+        tens.compressed_values.append( stream )
+        tens.compressed_values_substream_offsets.append( substream_offsets )
+
+    tens.storage_shape = [total_elements * tens.element_size_bytes]
 
 
 def update_pass_weight_and_scale_tensors(nng, arch):
     for sg in nng.subgraphs:
@@ -413,4 +445,4 @@ def update_pass_weight_and_scale_tensors(nng, arch):
             activation_ops = set(("Sigmoid", "Tanh"))
             if (ps.ops[-1].type in activation_ops) and (ps.npu_block_type != NpuBlockType.ElementWise):
                 rescale_for_faf = True
-            calc_scales_and_pack_biases(ps.scale_tensor, arch, ps.block_config[3], rescale_for_faf)
+            calc_scales_and_pack_biases(ps.scale_tensor, arch, ofm_depth_step, rescale_for_faf)
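The deinterleave at the heart of the compress_weights() change can be exercised on its own. The following standalone sketch mirrors `core_deinterleave()` from the patch on dummy data (shapes and values are made up for the example) to show that, after the HWIO-to-OHWI transpose, each core receives every ncores-th output channel:

```python
# Mirrors core_deinterleave() from the patch, run on dummy HWIO weights.
import numpy as np

def core_deinterleave(hwio, core, ncores):
    # Put weights into OHWI, then stride the output channels across cores
    ohwi = np.transpose(hwio, (3, 0, 1, 2))
    return ohwi[core:ohwi.shape[0]:ncores]

hwio = np.arange(3 * 3 * 4 * 8).reshape(3, 3, 4, 8)  # H, W, I, O = 3, 3, 4, 8
core0 = core_deinterleave(hwio, 0, 2)
core1 = core_deinterleave(hwio, 1, 2)
assert core0.shape == (4, 3, 3, 4) and core1.shape == (4, 3, 3, 4)
# Core 0 sees output channels 0, 2, 4, 6; core 1 sees 1, 3, 5, 7.
assert (core0[:, 0, 0, 0] % 8).tolist() == [0, 2, 4, 6]
assert (core1[:, 0, 0, 0] % 8).tolist() == [1, 3, 5, 7]
```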