From b2fb212216eaa29b96ddf270a0392172265ff02c Mon Sep 17 00:00:00 2001
From: Louis Verhaard
Date: Thu, 4 Jun 2020 15:51:24 +0200
Subject: MLBEDSW-2420: Improved support for dilated convolution

- Dilation added to SET_KERNEL_STRIDE instruction
- Kernel height/width adjusted for dilation
- Updated padding calculation
- Updated weight compression

Change-Id: I0c8190223e223b039a305aba0f37896ae1de2b80
Signed-off-by: Louis Verhaard
---
 ethosu/vela/graph_optimiser.py                   |  4 ++-
 ethosu/vela/mark_tensors.py                      | 14 +++-------
 ethosu/vela/operation.py                         |  4 +++
 ethosu/vela/register_command_stream_generator.py | 22 +++++++++-------
 ethosu/vela/tensor.py                            | 11 +++++++-
 ethosu/vela/weight_compressor.py                 | 32 +++++++++++++-----------
 6 files changed, 50 insertions(+), 37 deletions(-)

diff --git a/ethosu/vela/graph_optimiser.py b/ethosu/vela/graph_optimiser.py
index 758b51a2..b004f4cc 100644
--- a/ethosu/vela/graph_optimiser.py
+++ b/ethosu/vela/graph_optimiser.py
@@ -292,7 +292,9 @@ def add_padding_fields(op, arch):
         else:
             raise UnsupportedFeatureError("Unknown operation that uses padding: {}".format(op.type))
 
-        padding, skirt = calc_padding_and_skirt(op.attrs["padding"], kernel_size, op.attrs["strides"], input_shape)
+        dilation_h, dilation_w = op.get_dilation_h_w()
+        dilated_kernel_size = [dilation_h * (kernel_size[0] - 1) + 1, dilation_w * (kernel_size[1] - 1) + 1]
+        padding, skirt = calc_padding_and_skirt(op.attrs["padding"], dilated_kernel_size, op.attrs["strides"], input_shape)
         op.attrs["explicit_padding"] = padding
         op.attrs["skirt"] = skirt
     return op
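The graph_optimiser change encodes the standard identity for dilated convolutions: a kernel of size k with dilation d covers the same input window as an undilated kernel of size d * (k - 1) + 1, so SAME padding must be computed from that effective size. A minimal standalone sketch of the arithmetic (dilated_kernel_size and same_padding_total are illustrative helpers, not vela's calc_padding_and_skirt):

def dilated_kernel_size(kernel_size, dilation):
    # Effective extent of a dilated kernel along each axis: d * (k - 1) + 1
    return [d * (k - 1) + 1 for d, k in zip(dilation, kernel_size)]

def same_padding_total(input_size, stride, effective_kernel):
    # Total SAME padding per axis, so that output = ceil(input / stride)
    output_size = -(-input_size // stride)  # ceiling division
    return max((output_size - 1) * stride + effective_kernel - input_size, 0)

# A 3x3 kernel with dilation 2 pads like a 5x5 kernel.
assert dilated_kernel_size([3, 3], [2, 2]) == [5, 5]
assert same_padding_total(32, 1, 5) == 4  # e.g. 2 top/left, 2 bottom/right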
diff --git a/ethosu/vela/mark_tensors.py b/ethosu/vela/mark_tensors.py
index cd70446b..bf7bc45f 100644
--- a/ethosu/vela/mark_tensors.py
+++ b/ethosu/vela/mark_tensors.py
@@ -17,7 +17,6 @@
 # Mark purpose and select formats for Tensors. Also compresses the weights.
 from . import rewrite_graph
 from . import weight_compressor
-from .operation import NpuBlockType
 from .tensor import TensorFormat
 from .tensor import TensorPurpose
 
@@ -319,14 +318,6 @@ def mark_tensor_format(nng, arch, verbose_tensor_format=False):
             assert 0, "unknown tensor purpose %s" % (tens.purpose,)
         return fmt
 
-    def find_npu_usage_of_tensor(tens):
-        for op in tens.consumers():
-            if op.type == "DMA":
-                return find_npu_usage_of_tensor(op.outputs[0])
-            if "npu_block_type" in op.attrs:
-                return op.attrs["npu_block_type"]
-        return NpuBlockType.Default
-
     def visit_tens(tens, ps):
         if tens not in formats_for_tensor:
             fmt = init_tens(tens)
@@ -349,8 +340,9 @@ def mark_tensor_format(nng, arch, verbose_tensor_format=False):
         if fmt == TensorFormat.WeightsCompressed and tens.values is not None:
             src_tens = tens.get_dma_src_tensor()
             if src_tens is not None:
-                npu_block_type = find_npu_usage_of_tensor(tens)
-                weight_compressor.compress_weights(arch, nng, tens, npu_block_type, 32, 32)
+                op = tens.find_npu_op()
+                npu_block_type = op.attrs["npu_block_type"]
+                weight_compressor.compress_weights(arch, nng, tens, npu_block_type, 32, 32, op.get_dilation_h_w())
                 # Alias compressed weights back into source tensor
                 src_tens.copy_compressed_weight_info(tens)
 
diff --git a/ethosu/vela/operation.py b/ethosu/vela/operation.py
index 338f962e..e8a03b7d 100644
--- a/ethosu/vela/operation.py
+++ b/ethosu/vela/operation.py
@@ -194,6 +194,10 @@ input and output tensors, as well as an attribute dictionary."""
 
         return inputs, axis
 
+    def get_dilation_h_w(self):
+        _, dilation_h, dilation_w, _ = self.attrs.get("dilation", (1, 1, 1, 1))
+        return dilation_h, dilation_w
+
     split_ops = set(("Split", "SplitV", "StridedSlice", "Slice", "UnpackReshaped"))
 
     def is_split_op(self):
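get_dilation_h_w() relies on the TensorFlow convention that the "dilation" attribute is an NHWC-ordered 4-tuple whose batch and channel entries are always 1; operators without the attribute default to no dilation. A toy illustration with a stand-in class (FakeOp is hypothetical, only Operation's attrs dictionary is assumed):

class FakeOp:
    # Stand-in for vela's Operation, for illustration only.
    def __init__(self, attrs):
        self.attrs = attrs

    def get_dilation_h_w(self):
        # NHWC ordering: only the two middle entries matter for a 2D kernel.
        _, dilation_h, dilation_w, _ = self.attrs.get("dilation", (1, 1, 1, 1))
        return dilation_h, dilation_w

assert FakeOp({}).get_dilation_h_w() == (1, 1)  # default: no dilation
assert FakeOp({"dilation": (1, 2, 4, 1)}).get_dilation_h_w() == (2, 4)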
diff --git a/ethosu/vela/register_command_stream_generator.py b/ethosu/vela/register_command_stream_generator.py
index da7458ed..3da8bbcf 100644
--- a/ethosu/vela/register_command_stream_generator.py
+++ b/ethosu/vela/register_command_stream_generator.py
@@ -42,11 +42,11 @@ from .ethos_u55_regs.ethos_u55_regs import rounding
 from .high_level_command_stream import CommandType
 from .numeric_util import clamp_sigmoid
 from .numeric_util import clamp_tanh
+from .numeric_util import full_shape
 from .numeric_util import quantise_float32
 from .numeric_util import round_away_zero
 from .numeric_util import round_up
 from .numeric_util import round_up_to_int
-from .numeric_util import full_shape
 from .operation import NpuBlockType
 from .shared_buffer_allocation import SharedBufferAllocation
 from .tensor import MemArea
@@ -274,7 +274,7 @@ def has_prev_op_dependency(prev_cmd, cmd):
         if prev_cmd.ofm_tensor.equivalence_id == cmd.ifm_tensor.equivalence_id:
             return True
         elif cmd.ifm2_tensor is not None:
-            return (prev_cmd.ofm_tensor.equivalence_id == cmd.ifm2_tensor.equivalence_id)
+            return prev_cmd.ofm_tensor.equivalence_id == cmd.ifm2_tensor.equivalence_id
     return False
 
 
@@ -414,7 +414,7 @@ def generate_register_command_stream(nng, sg, arch, verbose=False):
                 use_global_scale = False
                 # Specifies type of rounding to be used.
                 rounding_mode = rounding.TFL
-                if primary_op.type == 'ResizeBilinear':
+                if primary_op.type == "ResizeBilinear":
                     rounding_mode = rounding.TRUNCATE
                 fmf = primary_op.attrs.get("fused_memory_function", None)
                 faf = primary_op.attrs.get("fused_activation_function", None)
@@ -428,6 +428,7 @@
                 prev_ofm_rect = cur_ofm_rect
                 prev_ofm_block = cur_ofm_block
                 prev_kernel = cur_kernel
+                cur_kernel = get_op_kernel(ps)
 
                 block_config = ps.block_config
                 emit.cmd0_with_param(cmd0.NPU_SET_OFM_BLK_HEIGHT_M1, block_config[0] - 1)
@@ -552,7 +553,7 @@
 
                 emit.cmd0_with_param(cmd0.NPU_SET_ACC_FORMAT, acc_format_map[shared_buffer.use_accumulator_element])
 
-                if primary_op.type == 'ResizeBilinear':
+                if primary_op.type == "ResizeBilinear":
                     # perform nearest neighbor upscale
                     emit.cmd0_with_param(cmd0.NPU_SET_IFM_UPSCALE, 1)
                 else:
@@ -575,7 +576,6 @@
                             explicit_padding[1] = 0
                         if cmd.ifm_box.end_coord[-2] < cmd.ifm_tensor.shape[-2]:
                             explicit_padding[3] = 0
-
                     emit.cmd0_with_param(cmd0.NPU_SET_IFM_PAD_TOP, explicit_padding[0])
                     emit.cmd0_with_param(cmd0.NPU_SET_IFM_PAD_LEFT, explicit_padding[1])
                     emit.cmd0_with_param(cmd0.NPU_SET_IFM_PAD_BOTTOM, explicit_padding[2])
@@ -590,7 +590,6 @@
                     # set kernel y stride extension bits
                     stride |= (primary_op.attrs["strides"][1] - 1 >> 1) << 9
 
-
                     if npu_block_type == NpuBlockType.Pooling:
                         k_height, k_width = primary_op.attrs["ksize"][1:3]
                         emit.cmd0_with_param(cmd0.NPU_SET_KERNEL_HEIGHT_M1, k_height - 1)
@@ -641,8 +640,14 @@
                            # Reduced precision quantization and natural rounding used for int16
                            if cmd.ifm_tensor.dtype == DataType.int16:
                                rounding_mode = rounding.NATURAL
-                        emit.cmd0_with_param(cmd0.NPU_SET_KERNEL_HEIGHT_M1, cmd.weight_tensor.shape[0] - 1)
-                        emit.cmd0_with_param(cmd0.NPU_SET_KERNEL_WIDTH_M1, cmd.weight_tensor.shape[1] - 1)
+                        stride |= (cur_kernel.dilation.y - 1) << 4
+                        stride |= (cur_kernel.dilation.x - 1) << 3
+                        emit.cmd0_with_param(
+                            cmd0.NPU_SET_KERNEL_HEIGHT_M1, cur_kernel.dilation.y * (cmd.weight_tensor.shape[0] - 1)
+                        )
+                        emit.cmd0_with_param(
+                            cmd0.NPU_SET_KERNEL_WIDTH_M1, cur_kernel.dilation.x * (cmd.weight_tensor.shape[1] - 1)
+                        )
                         if cmd.weight_tensor.block_traversal == TensorBlockTraversal.PartKernelFirst:
                             # Part-kernel-first weight ordering
                             assert npu_block_type == NpuBlockType.ConvolutionMxN
@@ -934,7 +939,6 @@
             cur_ofm_block = Block(ps.block_config[1], ps.block_config[0], ps.block_config[3])
             cur_ofm_rect = get_op_ofm_rect(cmd)
             cur_ifm_rect = get_op_ifm_rect(cmd)
-            cur_kernel = get_op_kernel(cmd.ps)
             cur_padLT = get_op_padding_lt(cmd)
             if (prev_kernel is not None) and (cur_kernel is not None) and has_prev_op_dependency(prev_cmd, cmd):
                 if cmd.ifm_tensor.shape == prev_cmd.ofm_tensor.shape:
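For convolution blocks the command stream now folds dilation into two places: bits 3 and 4 of the kernel-stride word carry dilation minus one for x and y, and the KERNEL_HEIGHT_M1/KERNEL_WIDTH_M1 fields hold dilation * (kernel_dim - 1), which is exactly the dilated kernel extent minus one. A rough sketch of that arithmetic (field positions are taken from the diff above, not independently from the Ethos-U55 register specification):

def kernel_regs(stride_word, kernel_h, kernel_w, dilation_y, dilation_x):
    # Dilation minus one goes into bits 4 (y) and 3 (x) of the stride word.
    stride_word |= (dilation_y - 1) << 4
    stride_word |= (dilation_x - 1) << 3
    # The *_M1 fields describe the dilated kernel extent minus one:
    # dilation * (k - 1) == (dilation * (k - 1) + 1) - 1
    height_m1 = dilation_y * (kernel_h - 1)
    width_m1 = dilation_x * (kernel_w - 1)
    return stride_word, height_m1, width_m1

# 3x3 kernel with dilation 2: registers describe a 5x5 effective kernel (M1 == 4).
assert kernel_regs(0, 3, 3, 2, 2) == ((1 << 4) | (1 << 3), 4, 4)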
diff --git a/ethosu/vela/tensor.py b/ethosu/vela/tensor.py
index 2f91f61c..426a710b 100644
--- a/ethosu/vela/tensor.py
+++ b/ethosu/vela/tensor.py
@@ -521,7 +521,7 @@ class Tensor:
             strides[4] = stride
             strides[3] = 16 * stride  # STRIDE_X
             strides[1] = strides[3] * augmented_shape[2]  # STRIDE_C
-            strides[2] = augmented_shape[2] * augmented_shape[3] * stride # STRIDE_Y
+            strides[2] = augmented_shape[2] * augmented_shape[3] * stride  # STRIDE_Y
             strides[0] = strides[2] * augmented_shape[1]  # STRIDE_N
 
         return strides, augmented_coord
@@ -539,6 +539,15 @@ class Tensor:
         # Note: for DMA ops, Pass.weight_tensor is referring to the SRAM weight tensor
         return self.ops[0].inputs[0] if self.needs_dma() else None
 
+    def find_npu_op(self):
+        # Returns the NPU operator that uses this tensor, excluding DMA operators.
+        for op in self.consumers():
+            if op.type == "DMA":
+                return op.outputs[0].find_npu_op()
+            if "npu_block_type" in op.attrs:
+                return op
+        return None
+
     def compressed_stream_index_from_coord(self, coord):
         assert self.format == TensorFormat.WeightsCompressed
         assert len(self.compressed_values) > 0
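find_npu_op() replaces the local find_npu_usage_of_tensor() removed from mark_tensors.py and returns the consuming operator itself rather than just its block type, so callers can also query attributes such as dilation. It follows DMA operators through to their output tensor recursively. A toy illustration of the traversal (Op and Tens are hypothetical stand-ins for vela's Operation and Tensor):

class Op:
    def __init__(self, op_type, attrs=None, outputs=None):
        self.type = op_type
        self.attrs = attrs or {}
        self.outputs = outputs or []

class Tens:
    def __init__(self, consumer_ops):
        self._consumers = consumer_ops

    def consumers(self):
        return self._consumers

    def find_npu_op(self):
        # Hop across DMA ops until a consumer carrying npu_block_type is found.
        for op in self._consumers:
            if op.type == "DMA":
                return op.outputs[0].find_npu_op()
            if "npu_block_type" in op.attrs:
                return op
        return None

conv = Op("Conv2D", attrs={"npu_block_type": "ConvolutionMxN"})
sram_copy = Tens([conv])                       # tensor on the far side of the DMA
weights = Tens([Op("DMA", outputs=[sram_copy])])
assert weights.find_npu_op() is conv           # the DMA is skipped transparently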
diff --git a/ethosu/vela/weight_compressor.py b/ethosu/vela/weight_compressor.py
index 450e091e..9edde601 100644
--- a/ethosu/vela/weight_compressor.py
+++ b/ethosu/vela/weight_compressor.py
@@ -19,7 +19,6 @@ import math
 from collections import namedtuple
 
 import numpy as np
-from ethosu import mlw_codec
 
 from .data_type import DataType
 from .errors import UnsupportedFeatureError
@@ -32,20 +31,21 @@ from .tensor import TensorBlockTraversal
 from .tensor import TensorFormat
 from .tensor import TensorPurpose
 from .tensor import TensorSubPurpose
+from ethosu import mlw_codec
 
 
 # Contains meta info for a weight compression. If two tensors have identical weight compression config,
 # then they also will have identical compressed weights.
 WeightCompressionConfig = namedtuple(
-    "WeightCompressionConfig", ["npu_block_type", "ofm_block_depth", "ofm_depth_step", "equivalence_id"]
+    "WeightCompressionConfig", ["npu_block_type", "ofm_block_depth", "ofm_depth_step", "dilation", "equivalence_id"]
 )
 
 
-def create_weight_compression_config(tens, npu_block_type, ofm_block_depth, ofm_depth_step):
+def create_weight_compression_config(tens, npu_block_type, ofm_block_depth, ofm_depth_step, dilation):
     # Note: for an ofm block only its depth is used in weight compression.
     # And block depth > ofm depth gives same result as block depth == ofm depth
     block_depth = min(ofm_block_depth, tens.quant_values.shape[-1])
-    return WeightCompressionConfig(npu_block_type, block_depth, ofm_depth_step, tens.equivalence_id)
+    return WeightCompressionConfig(npu_block_type, block_depth, ofm_depth_step, dilation, tens.equivalence_id)
 
 
 def set_storage_shape(tens):
@@ -90,10 +90,11 @@ def encode(weight_stream):
     return compressed
 
 
-def generate_brick(arch, brick_weights, ofm_block_depth, block_traversal, ifm_bitdepth):
+def generate_brick(arch, brick_weights, ofm_block_depth, block_traversal, ifm_bitdepth, dilation):
     is_depthwise = block_traversal == TensorBlockTraversal.DepthWise
     is_partkernel = block_traversal == TensorBlockTraversal.PartKernelFirst
-    subkernel_max = arch.subkernel_max
+    decomp_h = arch.subkernel_max.height // dilation[0]
+    decomp_w = arch.subkernel_max.width // dilation[1]
     ofm_ublock = arch.ofm_ublock
     ifm_ublock = arch.ifm_ublock
     # Expect weights formatted HWIO
@@ -125,11 +126,11 @@ def generate_brick(arch, brick_weights, ofm_block_depth, block_traversal, ifm_bi
                     )
                     # Weight decomposition
                     # Subkernel Splitting (H)
-                    for subkernel_y in range(0, kernel_height, subkernel_max.height):
-                        sub_height = min(kernel_height - subkernel_y, subkernel_max.height)
+                    for subkernel_y in range(0, kernel_height, decomp_h):
+                        sub_height = min(kernel_height - subkernel_y, decomp_h)
                         # Subkernel splitting (W)
-                        for subkernel_x in range(0, kernel_width, subkernel_max.width):
-                            sub_width = min(kernel_width - subkernel_x, subkernel_max.width)
+                        for subkernel_x in range(0, kernel_width, decomp_w):
+                            sub_width = min(kernel_width - subkernel_x, decomp_w)
                             subkernel_elements = sub_width * sub_height
                             # Part kernel first works across the kernel H/W and needs padding
                             if is_partkernel:
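Because the hardware's maximum subkernel size is expressed in dilated coordinates, generate_brick() now divides it by the dilation before splitting the kernel, so each subkernel holds at most subkernel_max // dilation rows and columns of stored weights. A standalone sketch of the decomposition (the 8x8 subkernel_max default is an assumption for illustration; vela reads it from arch.subkernel_max):

def subkernels(kernel_h, kernel_w, dilation, subkernel_max=8):
    # Decomposition step in stored-weight coordinates: the dilated extent of
    # each subkernel must not exceed the hardware's subkernel maximum.
    decomp_h = subkernel_max // dilation[0]
    decomp_w = subkernel_max // dilation[1]
    for y in range(0, kernel_h, decomp_h):
        for x in range(0, kernel_w, decomp_w):
            yield y, x, min(kernel_h - y, decomp_h), min(kernel_w - x, decomp_w)

# A 9x9 kernel with dilation 2 splits on a 4-weight grid (8 // 2 == 4),
# giving 3x3 = 9 subkernel tiles.
tiles = list(subkernels(9, 9, (2, 2)))
assert len(tiles) == 9 and tiles[0] == (0, 0, 4, 4)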
@@ -178,14 +179,14 @@ def generate_brick(arch, brick_weights, ofm_block_depth, block_traversal, ifm_bi
 
 
 # Compress the weights
-def compress_weights(arch, nng, tens, npu_block_type, ofm_block_depth, ofm_depth_step):
+def compress_weights(arch, nng, tens, npu_block_type, ofm_block_depth, ofm_depth_step, dilation):
     assert tens.purpose == TensorPurpose.Weights
     assert tens.format == TensorFormat.WeightsCompressed
 
     # Check the weight cache
     if nng.weight_cache is None:
         nng.weight_cache = CompressedWeightCache()
-    wcc = create_weight_compression_config(tens, npu_block_type, ofm_block_depth, ofm_depth_step)
+    wcc = create_weight_compression_config(tens, npu_block_type, ofm_block_depth, ofm_depth_step, dilation)
     tens.weight_compression_config = wcc
     tens_cached = nng.weight_cache.get_tensor_with_same_compression(wcc)
     if tens_cached is not None:
@@ -241,7 +242,7 @@ def compress_weights(arch, nng, tens, npu_block_type, ofm_block_depth, ofm_depth
             brick_weights = weights[:, :, :, idx : idx + count]
 
             # Encode all weights into one chunk
-            raw_stream = generate_brick(arch, brick_weights, ofm_block_depth, tens.block_traversal, ifm_bitdepth)
+            raw_stream = generate_brick(arch, brick_weights, ofm_block_depth, tens.block_traversal, ifm_bitdepth, dilation)
             encoded = encode(raw_stream)
             encoded_streams.append(encoded)
 
@@ -387,7 +388,8 @@ def update_pass_weight_and_scale_tensors(nng, arch):
         for ps in sg.passes:
             tens = ps.weight_tensor
             if tens is not None:
-                npu_usage_of_tensor = find_npu_usage_of_tensor(tens)
+                op = tens.find_npu_op()
+                npu_usage_of_tensor = op.attrs["npu_block_type"]
                 if npu_usage_of_tensor == NpuBlockType.ConvolutionDepthWise:
                     tens.quant_values = np.transpose(tens.quant_values, (0, 1, 3, 2))
                     tens.shape = tens.storage_shape = tens.bandwidth_shape = list(tens.quant_values.shape)
@@ -399,7 +401,7 @@ def update_pass_weight_and_scale_tensors(nng, arch):
                 else:
                     ofm_depth_step = tens.shape[-1]
                 compress_weights(
-                    arch, nng, tens, npu_usage_of_tensor, ps.block_config[-1], ofm_depth_step,
+                    arch, nng, tens, npu_usage_of_tensor, ps.block_config[-1], ofm_depth_step, op.get_dilation_h_w()
                 )
                 # Update source tensor
                 if needs_dma:
-- 
cgit v1.2.1