From be78a053a57da7bdae240690c933824c0861f55b Mon Sep 17 00:00:00 2001 From: Rickard Bolin Date: Wed, 31 Jan 2024 12:05:11 +0000 Subject: MLBEDSW-8561: Striding support in H/W for StridedSlice Change-Id: Ie6f39d9c4125f7c16d27621de47cd76143c2e636 Signed-off-by: Rickard Bolin --- ethosu/vela/high_level_command_to_npu_op.py | 17 +++++++--- ethosu/vela/operation.py | 12 ++++--- .../vela/test/test_tflite_supported_operators.py | 6 ++-- ethosu/vela/tflite_graph_optimiser.py | 38 +++++++++++++++------- ethosu/vela/tflite_supported_operators.py | 7 ++-- 5 files changed, 54 insertions(+), 26 deletions(-) (limited to 'ethosu') diff --git a/ethosu/vela/high_level_command_to_npu_op.py b/ethosu/vela/high_level_command_to_npu_op.py index 52d07187..71181d05 100644 --- a/ethosu/vela/high_level_command_to_npu_op.py +++ b/ethosu/vela/high_level_command_to_npu_op.py @@ -410,16 +410,20 @@ def create_feature_map( assert strides is not None + multiplied_strides = strides.copy() if stride_multiplier and stride_multiplier != [1, 1, 1]: assert ( tens.format == TensorFormat.NHWC ), "Only default stride multiplier ([1, 1, 1]) supported for NHCWB16 format" # Multiply strides for C/H/W (in that order) with corresponding stride factor for i, stride_factor in enumerate(stride_multiplier, start=1): - strides[i] *= stride_factor + multiplied_strides[i] *= stride_factor + + # Stride multiplier only affects tiles and addresses for OFM + _strides = multiplied_strides if is_ofm else strides height_0, height_1, width_0, addresses = tens.addresses_for_rolling_buffer( - box.start_coord, box.end_coord, strides, op_shape4D + box.start_coord, box.end_coord, _strides, op_shape4D ) for idx, offset in enumerate(tile_base_offsets): @@ -427,7 +431,9 @@ def create_feature_map( fm.tiles = NpuTileBox( height_0=height_0, height_1=height_1, width_0=width_0, addresses=[int(addr) for addr in addresses] ) - fm.strides = NpuShape3D(height=int(strides[2]), width=int(strides[3]), depth=int(strides[1])) + fm.strides = NpuShape3D( + height=int(multiplied_strides[2]), width=int(multiplied_strides[3]), depth=int(multiplied_strides[1]) + ) fm.name = tens.name return fm @@ -518,8 +524,9 @@ def set_common_op_fields(npu_op: NpuBlockOperation, cmd: NpuStripe, arch: Archit ifm_height = cmd.ifm_box.get_block().height ifm_width = cmd.ifm_box.get_block().width ifm_depth = get_ifm_depth(op.type.npu_block_type, cmd.ifm_box, cmd.ofm_box) - - npu_op.ifm = create_feature_map(cmd.ifm_tensor, cmd.ifm_box, arch, ps.ifm_shapes[0], op.tile_base_offsets_ifm[0]) + npu_op.ifm = create_feature_map( + cmd.ifm_tensor, cmd.ifm_box, arch, ps.ifm_shapes[0], op.tile_base_offsets_ifm[0], op.ifm_stride_multiplier[0] + ) npu_op.ifm.shape = NpuShape3D(height=ifm_height, width=ifm_width, depth=ifm_depth) npu_op.ifm.quantization = get_ifm_or_ifm2_quantization(ps, cmd.ifm_tensor) diff --git a/ethosu/vela/operation.py b/ethosu/vela/operation.py index a831537b..9a917f22 100644 --- a/ethosu/vela/operation.py +++ b/ethosu/vela/operation.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright 2020-2023 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates # # SPDX-License-Identifier: Apache-2.0 # @@ -511,6 +511,7 @@ class Operation: "tile_base_offsets_ifm", "tile_base_offsets_ofm", "ofm_stride_multiplier", + "ifm_stride_multiplier", ) def __init__(self, op_type: Op, name: str): @@ -554,8 +555,9 @@ class Operation: self.tile_base_offsets_ifm: List[List[int]] = [[0, 0, 0, 0], [0, 0, 0, 0]] # ofm (nhwc) self.tile_base_offsets_ofm: List[int] = [0, 0, 0, 0] - # For interleaved/sparse outputs - stride is multiplied with the stride factor of the corresponding axis - # Order is [C, H, W] - default is no multiplication + # Stride is multiplied with the ifm/ofm stride factor of the corresponding axis + # Order is [C, H, W] + self.ifm_stride_multiplier: List[List[int]] = [[1, 1, 1], [1, 1, 1]] self.ofm_stride_multiplier: List[int] = [1, 1, 1] def clone(self, suffix="_clone"): @@ -585,6 +587,7 @@ class Operation: res.ifm_resampling_mode = self.ifm_resampling_mode res.tile_base_offsets_ifm = [_ifm.copy() for _ifm in self.tile_base_offsets_ifm] res.tile_base_offsets_ofm = self.tile_base_offsets_ofm.copy() + res.ifm_stride_multiplier = [_ifm.copy() for _ifm in self.ifm_stride_multiplier] res.ofm_stride_multiplier = self.ofm_stride_multiplier.copy() return res @@ -763,6 +766,7 @@ class Operation: offset_start = None offset_end = None axis = None + strides_tens = None if self.type == Op.Split: num_splits = self.attrs.get("num_splits") axis_tens = self.inputs[0] @@ -831,7 +835,7 @@ class Operation: else: assert False - return input_tens, outputs, axis, offset_start, offset_end + return input_tens, outputs, axis, offset_start, offset_end, strides_tens def set_activation_lut(self, lut_tensor): self.activation = ActivationFunction(Op.LUT) diff --git a/ethosu/vela/test/test_tflite_supported_operators.py b/ethosu/vela/test/test_tflite_supported_operators.py index e65717a8..3b15b318 100644 --- a/ethosu/vela/test/test_tflite_supported_operators.py +++ b/ethosu/vela/test/test_tflite_supported_operators.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright 2020-2023 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates # # SPDX-License-Identifier: Apache-2.0 # @@ -542,7 +542,9 @@ def create_strided_slice(): def test_constraint_stridedslice_stride_values(): # Unsupported strides op = create_strided_slice() - op.inputs[3].values = [1, 1, 2, 1] + op.inputs[3].values = [1, 2, 2, 1] + assert support.is_operator_supported(op) + op.inputs[3].values = [2, 1, 1, 1] assert not support.is_operator_supported(op) diff --git a/ethosu/vela/tflite_graph_optimiser.py b/ethosu/vela/tflite_graph_optimiser.py index 3af8588c..ccbb1f28 100644 --- a/ethosu/vela/tflite_graph_optimiser.py +++ b/ethosu/vela/tflite_graph_optimiser.py @@ -141,7 +141,7 @@ def rewrite_split_ops(tens, arch, nng): if not split_op.run_on_npu: return tens - inp, outputs, axis, offset_start, offset_end = split_op.get_split_inputs_axis() + inp, outputs, axis, offset_start, offset_end, strides_tens = split_op.get_split_inputs_axis() tens.ops = [] new_op = Operation(Op.SplitSliceRead, split_op.name) @@ -150,8 +150,10 @@ def rewrite_split_ops(tens, arch, nng): if None in (offset_end, offset_start): read_shape = None else: - # the read shape is relative to each start offset - read_shape = Shape4D([oe - os for oe, os in zip(offset_end, offset_start)]) + # The read shape is relative to each start offset + # Limit read shape to the size of the IFM - offset is not necessarily limited + ifm_dims = split_op.ifm_shapes[0].as_list() + read_shape = Shape4D([min(oe, ifm_dim) - os for oe, os, ifm_dim in zip(offset_end, offset_start, ifm_dims)]) # For Split the offset cannot be extracted from the tensor so it has to # be calculated from the index of the output tensor @@ -182,6 +184,9 @@ def rewrite_split_ops(tens, arch, nng): new_op.set_output_tensor(tens) new_op.ifm_shapes.append(Shape4D(inp.shape)) new_op.ofm_shapes.append(split_op.ofm_shapes[ofm_shape_idx]) + # Set stride multiplier in H/W if a stride tensor is provided + s_h, s_w = (strides_tens.values[-3], strides_tens.values[-2]) if strides_tens else (1, 1) + new_op.ifm_stride_multiplier[0] = [1, s_h, s_w] # C/H/W DebugDatabase.add_optimised(split_op, new_op) return tens @@ -193,18 +198,24 @@ def remove_SplitSliceRead(op, arch): # Check if it is possible to put the SplitSliceRead on the tensor consumer(s), # or if an avgpool need to be inserted # Not possible to move: + # - if ifm stride multiplier is larger than one in any dimension # - if consumer is a Transpose op since ifm shape has been reshaped and can not be changed # - if consumer is elementwise and ifm needs to be broadcasted - if op.ofm_shapes[0] == Shape4D.from_list(op.ofm.shape) and all( - consumer is not None - and consumer.run_on_npu - and consumer.type not in memory_only_ops - and consumer.original_type != Op.Transpose - and check_splitsliceread_to_consumer_shape(op, consumer) - and not ( - consumer.type.is_binary_elementwise_op() and Shape4D.from_list(consumer.ofm.shape) != op.ofm_shapes[0] + if ( + op.ofm_shapes[0] == Shape4D.from_list(op.ofm.shape) + and all(s_mul == 1 for s_mul in op.ifm_stride_multiplier[0]) + and all( + consumer is not None + and consumer.run_on_npu + and consumer.type not in memory_only_ops + and consumer.original_type != Op.Transpose + and check_splitsliceread_to_consumer_shape(op, consumer) + and not ( + consumer.type.is_binary_elementwise_op() + and Shape4D.from_list(consumer.ofm.shape) != op.ofm_shapes[0] + ) + for consumer in op.ofm.consumer_list ) - for consumer in op.ofm.consumer_list ): # SplitSliceRead can be performed by tensor consumer(s) for cons_op in list(op.ofm.consumer_list): @@ -219,6 +230,9 @@ def remove_SplitSliceRead(op, arch): avgpool_op.ofm_shapes.append(op.ofm_shapes[0]) avgpool_op.read_offsets[0] = op.read_offsets[0] avgpool_op.read_shapes[0] = op.read_shapes[0] + if any(s_mul != 1 for s_mul in op.ifm_stride_multiplier[0]): + avgpool_op.ifm_stride_multiplier[0] = op.ifm_stride_multiplier[0].copy() + avgpool_op.ifm.force_linear_format = True op.ifm.consumer_list.remove(op) DebugDatabase.add_optimised(op, avgpool_op) diff --git a/ethosu/vela/tflite_supported_operators.py b/ethosu/vela/tflite_supported_operators.py index 91a3ee83..b293a2ef 100644 --- a/ethosu/vela/tflite_supported_operators.py +++ b/ethosu/vela/tflite_supported_operators.py @@ -841,10 +841,11 @@ class TFLiteSupportedOperators: @staticmethod def constraint_stridedslice_stride_values(op): - "All Strides values must be 1" + "Batch and channel stride values must be 1" strides = op.inputs[3] - valid = all(stride == 1 for stride in strides.values) - return valid, f"Op has strides values {strides.values}" + s_c = strides.values[-1] + s_n = strides.values[0] if len(strides.values) > 3 else 1 + return s_n == s_c == 1, f"Op has strides values {strides.values}" @staticmethod def constraint_stridedslice_offset_false(op): -- cgit v1.2.1