ethosu/vela/graph_optimiser.py                      | 13 +++++++++++++
ethosu/vela/high_level_command_stream_generator.py  | 27 +++++++++++++++++++++------
ethosu/vela/insert_dma.py                           |  5 ++++-
ethosu/vela/npu_serialisation.py                    | 30 ++++++++++++++++++++----------
ethosu/vela/supported_operators.py                  |  7 -------
5 files changed, 58 insertions(+), 24 deletions(-)
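Taken together, the diffs below add support for binary elementwise operations (AddAct, MulAct, SubAct, Maximum, Minimum) whose second input is a scalar or a lower-rank constant: the graph optimiser aligns the ranks of the two inputs, the high-level command stream generator learns to DMA feature maps as well as weights, constant IFMs are serialised into the flash image, and the now-redundant scalar restriction is dropped from the supported-operators checks. Conceptually the enabled behaviour is ordinary broadcasting; the toy sketch below uses numpy purely as an illustration (numpy stands in for the NPU kernels here, which is an assumption for clarity, not how Vela executes anything):

    import numpy as np

    # A rank-4 IFM paired with a rank-1 per-channel constant: once the second
    # shape is padded to rank 4 (full_shape in the first diff), the elementwise
    # Add broadcasts cleanly.
    ifm = np.ones((1, 8, 8, 16), dtype=np.int32)
    ifm2 = np.arange(16, dtype=np.int32)            # shape (16,)
    ifm2_aligned = ifm2.reshape((1, 1, 1, 16))      # i.e. full_shape(4, [16], 1)
    ofm = ifm + ifm2_aligned
    assert ofm.shape == (1, 8, 8, 16)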
diff --git a/ethosu/vela/graph_optimiser.py b/ethosu/vela/graph_optimiser.py
index 913b9a6a..351716e0 100644
--- a/ethosu/vela/graph_optimiser.py
+++ b/ethosu/vela/graph_optimiser.py
@@ -25,6 +25,7 @@ from .data_type import DataType
 from .operation import NpuBlockType
 from .operation import Operation
 from .tensor import Tensor
+from .numeric_util import full_shape
 
 passthrough_nodes = set(("Identity",))
 
@@ -313,6 +314,7 @@ fc_op = set(
 depthwise_op = set(("DepthwiseConv2dNative", "DepthwiseConv2dBiasAct",))
 pool_op = set(("AvgPool", "MaxPool", "QuantizedAvgPool", "QuantizedMaxPool", "AvgPoolAct", "MaxPoolAct", "ResizeBilinear",))
 elementwise_op = set(("AddAct", "MulAct", "SubAct", "Maximum", "Minimum", "LeakyRelu", "Abs"))
+binary_elementwise_op = set(("AddAct", "MulAct", "SubAct", "Maximum", "Minimum"))
 activation_ops = set(("Relu", "Relu6", "ReluN1To1", "Sigmoid", "Tanh"))
 memory_only_ops = set(("Reshape",))
 
@@ -399,6 +401,16 @@ def fixup_act_reorder(op, arch):
             op.type = "Identity"
     return op
 
+def fixup_elementwise_with_scalars(op, arch):
+    if op.type in binary_elementwise_op:
+        ifm_tensor, ifm2_tensor, _, ofm_tensor = op.get_ifm_ifm2_weights_ofm()
+        if ifm2_tensor.shape != [] and ifm_tensor.shape != []:
+            diff = len(ifm_tensor.shape) - len(ifm2_tensor.shape)
+            if diff > 0:
+                ifm2_tensor.shape = full_shape(len(ifm_tensor.shape), ifm2_tensor.shape, 1)
+            elif diff < 0:
+                ifm_tensor.shape = full_shape(len(ifm2_tensor.shape), ifm_tensor.shape, 1)
+    return op
 
 # Set input/output tensor equivalence to the same id for memory operations
 def set_tensor_equivalence(op, arch):
@@ -492,6 +504,7 @@ def optimise_graph_a(nng, arch, verbose_graph=False):
         fixup_act_reorder,
         add_padding_fields,
         mark_npu_block_type,
+        fixup_elementwise_with_scalars,
        # convert_mul_max_to_abs_or_lrelu # TODO: enable optimisation once quantisation issues are resolved
     ]
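The new fixup leans on full_shape from numeric_util. For reference, a minimal sketch of that helper as I read it: left-pad a shape with a fill value up to the requested rank (the body below is a paraphrase for illustration, not a copy of Vela's source):

    def full_shape(dim, shape, fill):
        # Left-pad `shape` with `fill` until it has `dim` dimensions, e.g.
        # full_shape(4, [16], 1) -> [1, 1, 1, 16].
        return [fill] * (dim - len(shape)) + shape

    assert full_shape(4, [16], 1) == [1, 1, 1, 16]
    assert full_shape(4, [8, 16], 1) == [1, 1, 8, 16]

Note the guard in fixup_elementwise_with_scalars: true scalars (shape []) are left untouched, and nothing changes when both inputs already have equal rank (diff == 0).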
diff --git a/ethosu/vela/high_level_command_stream_generator.py b/ethosu/vela/high_level_command_stream_generator.py
index ef21e06c..0cc70a7f 100644
--- a/ethosu/vela/high_level_command_stream_generator.py
+++ b/ethosu/vela/high_level_command_stream_generator.py
@@ -24,17 +24,18 @@ from .high_level_command_stream import NpuStripe
 from .nn_graph import PassPlacement
 from .nn_graph import SchedulingStrategy
 from .operation import NpuBlockType
+from .tensor import TensorPurpose
 
 
 def need_dma(tens):
     return len(tens.ops) == 1 and tens.ops[0].type == "DMA"
 
 
-def dma_weights_if_necessary(ps, box, weight_tensor):
-    if need_dma(weight_tensor):
-        dma_op = weight_tensor.ops[0]
+def dma_if_necessary(ps, box, tensor):
+    if need_dma(tensor):
+        dma_op = tensor.ops[0]
         in_tensor = dma_op.inputs[0]
-        yield DMA(in_tensor, weight_tensor, box)
+        yield DMA(in_tensor, tensor, box)
 
 
 def generate_high_level_command_stream_for_pass(strat, passes, block_configs, idx):
@@ -115,6 +116,13 @@ def generate_high_level_command_stream_for_pass(strat, passes, block_configs, idx):
         else:
             ifm2_box = Box([], [])
 
+        for intermediate in ps.intermediates:
+            if intermediate != None and intermediate.shape != [] and intermediate.purpose == TensorPurpose.FeatureMap:
+                intermediate_box, _, _ = ofm_box.transform_with_strides_and_skirt(
+                    strides, skirt, intermediate.shape, npu_block_type, concat_axis, concat_offset, split_offsets[0]
+                )
+                yield from dma_if_necessary(ps, intermediate_box, intermediate)
+
         weight_box = None
         if weight_tensor is not None:
             weight_oc_start = start
@@ -130,7 +138,7 @@ def generate_high_level_command_stream_for_pass(strat, passes, block_configs, idx):
                 weight_box = Box.make_weight_box(
                     weight_tensor.shape,
                     npu_block_type,
                     weight_oc_start,
                     weight_oc_end,
                     weight_tensor.weight_transpose_depthwise,
                 )
-                yield from dma_weights_if_necessary(ps, weight_box, weight_tensor)
+                yield from dma_if_necessary(ps, weight_box, weight_tensor)
 
             yield NpuStripe(
                 ps,
@@ -201,6 +209,13 @@ def generate_high_level_command_stream_for_pass(strat, passes, block_configs, idx):
             strides, skirt, ifm_tensor.shape, npu_block_type, concat_axis, concat_offset, split_offsets[0], k_height
         )
 
+        for intermediate in ps.intermediates:
+            if intermediate != None and intermediate.shape != [] and intermediate.purpose == TensorPurpose.FeatureMap:
+                intermediate_box, _, _ = ofm_box.transform_with_strides_and_skirt(
+                    strides, skirt, intermediate.shape, npu_block_type, concat_axis, concat_offset, split_offsets[0]
+                )
+                yield from dma_if_necessary(ps, intermediate_box, intermediate)
+
         ifm_y_needed = 1
         if len(ifm_box.end_coord) >= 3:
             ifm_y_needed = ifm_box.end_coord[-3]
@@ -217,7 +232,7 @@ def generate_high_level_command_stream_for_pass(strat, passes, block_configs, idx):
             weight_box = Box.make_weight_box(
                 weight_tensor.shape, npu_block_type, weights_transposed=weight_tensor.weight_transpose_depthwise
             )
-            yield from dma_weights_if_necessary(ps, weight_box, weight_tensor)
+            yield from dma_if_necessary(ps, weight_box, weight_tensor)
 
         # Check if first/last stripe in pass
         is_first_h_stripe = start == y_start
diff --git a/ethosu/vela/insert_dma.py b/ethosu/vela/insert_dma.py
index 703ab9d7..b1b89856 100644
--- a/ethosu/vela/insert_dma.py
+++ b/ethosu/vela/insert_dma.py
@@ -21,6 +21,7 @@ from .operation import Operation
 from .tensor import MemArea
 from .tensor import TensorPurpose
 
+binary_elementwise_op = set(("AddAct", "MulAct", "SubAct", "Maximum", "Minimum"))
 
 def insert_dma_cmd(op, arch):
     if op.type == "DMA":
@@ -28,7 +29,9 @@ def insert_dma_cmd(op, arch):
 
     for idx, tens in enumerate(op.inputs):
         if tens.mem_area in (MemArea.Dram, MemArea.OffChipFlash) and tens.mem_area != arch.fast_storage_mem_area:
-            if tens.purpose == TensorPurpose.Weights:
+            if (tens.purpose == TensorPurpose.Weights or
+                    (tens.purpose == TensorPurpose.FeatureMap and
+                        op.type in binary_elementwise_op)):
                 only_vector_product_consumers = True
                 for oper in tens.consumers():
                     if oper is None or oper.attrs.get("npu_block_type") != NpuBlockType.VectorProduct:
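The two diffs above hinge on the same generalisation: a DMA is no longer a weights-only affair. dma_if_necessary now emits a transfer for any tensor whose sole producer is a DMA op, and insert_dma.py attaches such a producer to off-chip feature-map inputs of binary elementwise ops. A stripped-down sketch of the generator pattern, with stand-in classes for Vela's Op/Tensor/DMA types (the real ones carry far more state, and the real dma_if_necessary also takes the enclosing pass):

    class Op:
        def __init__(self, op_type, inputs):
            self.type, self.inputs = op_type, inputs

    class Tensor:
        def __init__(self, name, ops=()):
            self.name, self.ops = name, list(ops)   # ops = producing operations

    class DMA:
        def __init__(self, src, dst, box):
            self.src, self.dst, self.box = src, dst, box

    def need_dma(tens):
        # A tensor needs a transfer iff its single producer is a DMA op.
        return len(tens.ops) == 1 and tens.ops[0].type == "DMA"

    def dma_if_necessary(box, tensor):
        if need_dma(tensor):
            yield DMA(tensor.ops[0].inputs[0], tensor, box)

    flash_ifm2 = Tensor("ifm2_flash")
    sram_ifm2 = Tensor("ifm2_sram", ops=[Op("DMA", [flash_ifm2])])
    cmds = list(dma_if_necessary(box=None, tensor=sram_ifm2))
    assert len(cmds) == 1 and cmds[0].src is flash_ifm2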
diff --git a/ethosu/vela/npu_serialisation.py b/ethosu/vela/npu_serialisation.py
index b8ac20f3..0cb40ed0 100644
--- a/ethosu/vela/npu_serialisation.py
+++ b/ethosu/vela/npu_serialisation.py
@@ -46,6 +46,10 @@ def copy_compressed_values_to_memory_tensor(memory_tensor, src_tensor):
         memory_tensor.values[start_addr:end_addr] = compressed_values
         start_addr = end_addr
 
+def copy_ifm_values_to_memory_tensor(memory_tensor, src_tensor):
+    start_addr = src_tensor.address
+    end_addr = start_addr + src_tensor.quant_values.size
+    memory_tensor.values[start_addr:end_addr] = src_tensor.quant_values
 
 def serialise_npu_subgraph_into_tensors(nng, sg, arch, scratch_tens, flash_tens):
     if sg.placement != PassPlacement.Npu:
@@ -90,16 +94,22 @@ def serialise_npu_subgraph_into_tensors(nng, sg, arch, scratch_tens, flash_tens):
 
     for cps in sg.cascaded_passes:
         for ps in cps.passes:
-            if ps.placement == PassPlacement.Npu and ps.weight_tensor is not None:
-                # For DMA ops, ps.weight_tensor is referring to the SRAM weight tensor and therefore the address
-                # is pointing at the destination address of where the weights should be placed in SRAM.
-                # This ensures that the Flash weight tensor is used instead and thus gets the correct address.
-                if ps.weight_tensor.ops[0].type == "DMA":
-                    copy_compressed_values_to_memory_tensor(sg.flash_tensor, ps.weight_tensor.ops[0].inputs[0])
-                else:
-                    copy_compressed_values_to_memory_tensor(sg.flash_tensor, ps.weight_tensor)
-
-                copy_compressed_values_to_memory_tensor(sg.flash_tensor, ps.scale_tensor)
+            if ps.placement == PassPlacement.Npu:
+                if ps.weight_tensor != None:
+                    # For DMA ops, ps.weight_tensor is referring to the SRAM weight tensor and therefore the address
+                    # is pointing at the destination address of where the weights should be placed in SRAM.
+                    # This ensures that the Flash weight tensor is used instead and thus gets the correct address.
+                    if ps.weight_tensor.ops[0].type == "DMA":
+                        copy_compressed_values_to_memory_tensor(sg.flash_tensor, ps.weight_tensor.ops[0].inputs[0])
+                    else:
+                        copy_compressed_values_to_memory_tensor(sg.flash_tensor, ps.weight_tensor)
+
+                    copy_compressed_values_to_memory_tensor(sg.flash_tensor, ps.scale_tensor)
+
+                if ps.ifm_tensor != None and ps.ifm_tensor.mem_area != MemArea.Sram:
+                    copy_ifm_values_to_memory_tensor(sg.flash_tensor, ps.ifm_tensor)
+                if ps.ifm2_tensor != None and ps.ifm2_tensor.mem_area != MemArea.Sram:
+                    copy_ifm_values_to_memory_tensor(sg.flash_tensor, ps.ifm2_tensor)
 
     sg.command_stream_tensor = make_memory_tensor(
         sg.name + "_command_stream", flash_area, command_stream_size_bytes, True, arch
diff --git a/ethosu/vela/supported_operators.py b/ethosu/vela/supported_operators.py
index e5271450..574b3a49 100644
--- a/ethosu/vela/supported_operators.py
+++ b/ethosu/vela/supported_operators.py
@@ -229,13 +229,6 @@
         if op.type in self.binary_elem_wise_main_ops:  # if op type is unary, ifm2_tensor is None
             if len(ifm2_tensor.shape) > 2 and ifm2_tensor.shape[0] != 1:
                 return False
-
-        # check scalar size
-        if hasattr(ifm_tensor.values, "__len__") and len(ifm_tensor.values) > 1:
-            return False
-        if op.type in self.binary_elem_wise_main_ops:  # same as above
-            if hasattr(ifm2_tensor.values, "__len__") and len(ifm2_tensor.values) > 1:
-                return False
         return True
 
     def check_memory_only_restrictions(self, op):
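With the DMA in place, the constant IFM also has to exist somewhere for the DMA to read, which is what copy_ifm_values_to_memory_tensor above provides: the tensor's quantised values are written into the flash image at its allocated address, mirroring the existing weight-copy helper. A toy version over a flat numpy buffer (class name, addresses and sizes here are invented for illustration; the flatten() differs from the real helper, which assigns quant_values directly):

    import numpy as np

    flash_image = np.zeros(64, dtype=np.uint8)     # stands in for sg.flash_tensor.values

    class ConstTensor:
        def __init__(self, address, quant_values):
            self.address = address                 # allocated offset in the flash image
            self.quant_values = quant_values       # quantised constant data

    def copy_ifm_values_to_memory_tensor(memory_values, src_tensor):
        start = src_tensor.address
        end = start + src_tensor.quant_values.size
        memory_values[start:end] = src_tensor.quant_values.flatten()

    scalar_ifm2 = ConstTensor(address=16, quant_values=np.array([3], dtype=np.uint8))
    copy_ifm_values_to_memory_tensor(flash_image, scalar_ifm2)
    assert flash_image[16] == 3

Once constant inputs are materialised in flash and DMA'd in like weights, the old blanket rule that a constant elementwise input may hold at most one value is redundant, hence the pure deletion in supported_operators.py above.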