From a8fda88bced0d11441467b6798885101d41ac8e9 Mon Sep 17 00:00:00 2001 From: Johan Alfven Date: Sat, 28 Oct 2023 16:04:46 +0200 Subject: MLBEDSW-8290: MLCE: Add TRANSPOSE support - Added graph optimiser function to convert TRANSPOSE op into an AvgPool op with swapped stride for height and width - Added TRANSPOSE supported op check - Added unit tests for TRANSPOSE supported op check - Updated SUPPORTED_OPS.md - Fixed problem in pass packing when optimizing the pass list. Old problem, but now seen when moving TRANSPOSE from cpu. Change-Id: I0a0ef420b0fb8241090c2e2434622881105cde15 Signed-off-by: Johan Alfven --- SUPPORTED_OPS.md | 16 +++- ethosu/vela/high_level_command_to_npu_op.py | 20 ++++- ethosu/vela/pass_packing.py | 11 ++- .../vela/test/test_tflite_supported_operators.py | 83 +++++++++++++++++++ ethosu/vela/tflite_graph_optimiser.py | 93 +++++++++++++++++++++- ethosu/vela/tflite_model_semantic.py | 3 + ethosu/vela/tflite_supported_operators.py | 47 ++++++++++- 7 files changed, 264 insertions(+), 9 deletions(-) diff --git a/SUPPORTED_OPS.md b/SUPPORTED_OPS.md index 81704e5..ceb0205 100644 --- a/SUPPORTED_OPS.md +++ b/SUPPORTED_OPS.md @@ -19,7 +19,7 @@ limitations under the License. # Supported Ops This file was automatically generated by Vela using the `--supported-ops-report` parameter. 
-Vela version: `3.9.1.dev16+gd230ce9.d20231030` +Vela version: `3.9.1.dev21+gb724cdb.d20231107` This file complies with [**Gitiles Markdown syntax**](https://gerrit.googlesource.com/gitiles/+/HEAD/Documentation/markdown.md) @@ -75,6 +75,7 @@ Please check the supported operator list for your chosen runtime for further inf | STRIDED_SLICE | [Generic](#tflite-generic-constraints), [Specific](#tflite-strided_slice-constraints) | | SUB | [Generic](#tflite-generic-constraints), [Specific](#tflite-sub-constraints) | | TANH | [Generic](#tflite-generic-constraints) | +| TRANSPOSE | [Generic](#tflite-generic-constraints), [Specific](#tflite-transpose-constraints) | | TRANSPOSE_CONV | [Generic](#tflite-generic-constraints), [Specific](#tflite-transpose_conv-constraints) | | UNIDIRECTIONAL_SEQUENCE_LSTM | [Generic](#tflite-generic-constraints), [Specific](#tflite-unidirectional_sequence_lstm-constraints) | | UNPACK | [Generic](#tflite-generic-constraints) | @@ -90,12 +91,12 @@ This is a list of constraints most NPU operators must satisfy in order to be sch - Output tensors cannot be scalar - [QUANTIZE] - Scalar Input tensors are only valid for op type: ADD, ARG_MAX, EXPAND_DIMS, MAXIMUM, MEAN, MINIMUM, MUL, QUANTIZE, SPLIT, SPLIT_V, SUB - Input(s) and Output tensors must not be greater than 4D -- Input(s), Output and Weight tensors must have quantization parameters - [ARG_MAX, SHAPE] +- Input(s), Output and Weight tensors must have quantization parameters - [ARG_MAX, SHAPE, TRANSPOSE] - Input(s), Output and Weight tensors with quantization scales must be finite - Input and Output tensors must have quantization scales that fit within float32 precision - Constant tensors should not have NoneType-values - Tensors must be of type: int16, int32, int8, uint8 - [ARG_MAX] -- Tensors which are int32 are only valid when op type is: ADD, ARG_MAX, MUL, SHAPE, SUB +- Tensors which are int32 are only valid when op type is: ADD, ARG_MAX, MUL, SHAPE, SUB, TRANSPOSE - Tensor dimensions must be 
in the range [1, 65535] - Per-axis quantization is only supported for the following op types: CONV_2D, DEPTHWISE_CONV_2D, TRANSPOSE_CONV - IFM Tensor batch size must be 1 - [FULLY_CONNECTED, RESHAPE, SHAPE, SLICE, SOFTMAX, SPLIT, SPLIT_V, SQUEEZE, STRIDED_SLICE, UNPACK] @@ -405,6 +406,15 @@ This is a list of constraints that the SUB operator must satisfy in order to be - For IFM that are unsigned, OFM must either be the same type or int32 - Broadcasting is only allowed for rank indices with dimension 1, from either IFM1 or IFM2 +### TFLite TRANSPOSE Constraints + +This is a list of constraints that the TRANSPOSE operator must satisfy in order to be scheduled on the NPU. + +- The following shape/permutations are supported for transpose: + When ifm rank is 2: WxC -> CxW + When ifm rank is 3: HxWxC -> WxHxC, 1xWxC -> 1xCxW, Hx1xC -> Cx1xH + When ifm rank is 4: 1xHxWxC -> 1xWxHxC, 1x1xWxC -> 1x1xCxW, 1xHx1xC -> 1xCx1xH + ### TFLite TRANSPOSE_CONV Constraints This is a list of constraints that the TRANSPOSE_CONV operator must satisfy in order to be scheduled on the NPU. diff --git a/ethosu/vela/high_level_command_to_npu_op.py b/ethosu/vela/high_level_command_to_npu_op.py index 5e9dffa..53df096 100644 --- a/ethosu/vela/high_level_command_to_npu_op.py +++ b/ethosu/vela/high_level_command_to_npu_op.py @@ -383,6 +383,7 @@ def create_feature_map( op_shape4D: Shape4D, tile_base_offsets: List[int], stride_multiplier: Optional[List[int]] = None, + is_ofm: bool = False, ) -> NpuFeatureMap: """Creates feature map with common fields populated""" fm = NpuFeatureMap() @@ -395,7 +396,16 @@ def create_feature_map( else: assert 0, "Incorrect tensor format" - strides = tens.get_strides(op_shape4D) + if is_ofm and tens.ops[0] is not None and tens.ops[0].original_type == Op.Transpose: + # op_shape4D has ifm shape, see fixup_transpose. Stride calculations needs to be + # based on the correct ofm shape. 
+ op_shape4D_ofm_shape = Shape4D([op_shape4D.batch, op_shape4D.width, op_shape4D.height, op_shape4D.depth]) + strides = tens.get_strides(op_shape4D_ofm_shape) + # Swap h and w strides which will cause the transpose to happen + strides[-3], strides[-2] = strides[-2], strides[-3] + else: + strides = tens.get_strides(op_shape4D) + assert strides is not None if stride_multiplier and stride_multiplier != [1, 1, 1]: @@ -513,7 +523,13 @@ def set_common_op_fields(npu_op: NpuBlockOperation, cmd: NpuStripe, arch: Archit out_block = cmd.ofm_box.get_block() npu_op.ofm = create_feature_map( - cmd.ofm_tensor, cmd.ofm_box, arch, ps.ofm_shapes[0], op.tile_base_offsets_ofm, op.ofm_stride_multiplier + cmd.ofm_tensor, + cmd.ofm_box, + arch, + ps.ofm_shapes[0], + op.tile_base_offsets_ofm, + op.ofm_stride_multiplier, + is_ofm=True, ) npu_op.ofm.shape = NpuShape3D(height=out_block.height, width=out_block.width, depth=out_block.depth) npu_op.ofm.quantization = get_ofm_quantization(ps, cmd.ofm_tensor) diff --git a/ethosu/vela/pass_packing.py b/ethosu/vela/pass_packing.py index 4c733cc..0de0341 100644 --- a/ethosu/vela/pass_packing.py +++ b/ethosu/vela/pass_packing.py @@ -481,7 +481,7 @@ def pack_into_passes(nng, arch, verbose_packing=False): # Try to optmize this by moving/grouping CPU ops where that is possible. # Criteria for CPU pass to be moved: # - # 1) CPU passes that only depends on sg.input_tensor can be + # 1) CPU passes that only depends on sg.input_tensors can be # moved to the top of the list. # ResourceVariables ops like VarHandle, ReadVariable, CallOnce # can also be moved to the top of list. @@ -503,9 +503,16 @@ def pack_into_passes(nng, arch, verbose_packing=False): pass_list_top.insert(0, ps) continue + ifm2 = ps.ops[0].ifm2 + if ifm2 is None: + # Dynamic weights must be treated as ifm's. 
+ if ps.ops[0].type == Op.FullyConnected and ps.ops[0].weights.purpose == TensorPurpose.FeatureMap: + # Op has dynamic weights, include this in the check below + ifm2 = ps.ops[0].weights + if ps.placement == PassPlacement.Cpu and ( ps.ops[0].ifm in sg.input_tensors - and (ps.ops[0].ifm2 in sg.input_tensors or ps.ops[0].ifm2 is None) + and (ifm2 in sg.input_tensors or ifm2 is None) or (ps.ops[0].type in (Op.VarHandle, Op.ReadVariable, Op.CallOnce)) ): # This CPU pass only depends on sg.input_tensors or resource variable diff --git a/ethosu/vela/test/test_tflite_supported_operators.py b/ethosu/vela/test/test_tflite_supported_operators.py index a433fb8..e65717a 100644 --- a/ethosu/vela/test/test_tflite_supported_operators.py +++ b/ethosu/vela/test/test_tflite_supported_operators.py @@ -759,3 +759,86 @@ def test_constraint_slice_inputs_const(): op.set_input_tensor(begin, 1) op.set_input_tensor(begin, 2) assert support.is_operator_supported(op) + + +def test_constraint_transpose(): + # Test supported op IFM rank 2 + ifm = Tensor([2, 4], DataType.int8, "ifm") + perm = create_const_tensor("perm", [2], DataType.int32, [1, 0]) + ofm = Tensor([4, 2], DataType.int8, "ofm") + op = testutil.create_op(Op.Transpose, [ifm, perm], ofm) + assert support.is_operator_supported(op) + # Test supported op IFM rank 3 + ifm = Tensor([2, 4, 6], DataType.int8, "ifm") + perm = create_const_tensor("perm", [3], DataType.int32, [1, 0, 2]) + ofm = Tensor([4, 2, 6], DataType.int8, "ofm") + op = testutil.create_op(Op.Transpose, [ifm, perm], ofm) + assert support.is_operator_supported(op) + ifm = Tensor([1, 4, 6], DataType.int8, "ifm") + perm = create_const_tensor("perm", [3], DataType.int32, [0, 2, 1]) + ofm = Tensor([1, 6, 4], DataType.int8, "ofm") + op = testutil.create_op(Op.Transpose, [ifm, perm], ofm) + assert support.is_operator_supported(op) + ifm = Tensor([2, 1, 6], DataType.int8, "ifm") + perm = create_const_tensor("perm", [3], DataType.int32, [2, 1, 0]) + ofm = Tensor([6, 1, 2], 
DataType.int8, "ofm") + op = testutil.create_op(Op.Transpose, [ifm, perm], ofm) + assert support.is_operator_supported(op) + # Test supported op IFM rank 4 + ifm = Tensor([1, 2, 4, 6], DataType.int8, "ifm") + perm = create_const_tensor("perm", [4], DataType.int32, [0, 2, 1, 3]) + ofm = Tensor([1, 4, 2, 6], DataType.int8, "ofm") + op = testutil.create_op(Op.Transpose, [ifm, perm], ofm) + assert support.is_operator_supported(op) + ifm = Tensor([1, 1, 4, 6], DataType.int8, "ifm") + perm = create_const_tensor("perm", [4], DataType.int32, [0, 1, 3, 2]) + ofm = Tensor([1, 1, 6, 4], DataType.int8, "ofm") + op = testutil.create_op(Op.Transpose, [ifm, perm], ofm) + assert support.is_operator_supported(op) + ifm = Tensor([1, 2, 1, 6], DataType.int8, "ifm") + perm = create_const_tensor("perm", [4], DataType.int32, [0, 3, 2, 1]) + ofm = Tensor([1, 6, 1, 2], DataType.int8, "ofm") + op = testutil.create_op(Op.Transpose, [ifm, perm], ofm) + assert support.is_operator_supported(op) + # Test not supported op IFM rank 3 + ifm = Tensor([2, 4, 6], DataType.int8, "ifm") + perm = create_const_tensor("perm", [3], DataType.int32, [0, 2, 1]) + ofm = Tensor([2, 6, 4], DataType.int8, "ofm") + op = testutil.create_op(Op.Transpose, [ifm, perm], ofm) + assert not support.is_operator_supported(op) + ifm = Tensor([2, 4, 6], DataType.int8, "ifm") + perm = create_const_tensor("perm", [3], DataType.int32, [2, 1, 0]) + ofm = Tensor([6, 2, 2], DataType.int8, "ofm") + op = testutil.create_op(Op.Transpose, [ifm, perm], ofm) + assert not support.is_operator_supported(op) + # Test not supported op IFM rank 4 + ifm = Tensor([1, 2, 4, 6], DataType.int8, "ifm") + perm = create_const_tensor("perm", [4], DataType.int32, [0, 1, 3, 2]) + ofm = Tensor([1, 2, 6, 4], DataType.int8, "ofm") + op = testutil.create_op(Op.Transpose, [ifm, perm], ofm) + assert not support.is_operator_supported(op) + ifm = Tensor([1, 2, 4, 6], DataType.int8, "ifm") + perm = create_const_tensor("perm", [4], DataType.int32, [0, 3, 2, 1]) + 
ofm = Tensor([1, 6, 4, 2], DataType.int8, "ofm") + op = testutil.create_op(Op.Transpose, [ifm, perm], ofm) + assert not support.is_operator_supported(op) + ifm = Tensor([1, 2, 4, 6], DataType.int8, "ifm") + perm = create_const_tensor("perm", [4], DataType.int32, [1, 0, 2, 3]) + ofm = Tensor([2, 1, 4, 6], DataType.int8, "ofm") + op = testutil.create_op(Op.Transpose, [ifm, perm], ofm) + assert not support.is_operator_supported(op) + ifm = Tensor([1, 2, 4, 6], DataType.int8, "ifm") + perm = create_const_tensor("perm", [4], DataType.int32, [2, 1, 0, 3]) + ofm = Tensor([4, 2, 1, 6], DataType.int8, "ofm") + op = testutil.create_op(Op.Transpose, [ifm, perm], ofm) + assert not support.is_operator_supported(op) + ifm = Tensor([1, 2, 4, 6], DataType.int8, "ifm") + perm = create_const_tensor("perm", [4], DataType.int32, [3, 1, 2, 0]) + ofm = Tensor([6, 2, 4, 1], DataType.int8, "ofm") + op = testutil.create_op(Op.Transpose, [ifm, perm], ofm) + assert not support.is_operator_supported(op) + ifm = Tensor([1, 2, 4, 6], DataType.int8, "ifm") + perm = create_const_tensor("perm", [4], DataType.int32, [3, 2, 1, 0]) + ofm = Tensor([6, 4, 2, 1], DataType.int8, "ofm") + op = testutil.create_op(Op.Transpose, [ifm, perm], ofm) + assert not support.is_operator_supported(op) diff --git a/ethosu/vela/tflite_graph_optimiser.py b/ethosu/vela/tflite_graph_optimiser.py index 85fb8ba..cc947bc 100644 --- a/ethosu/vela/tflite_graph_optimiser.py +++ b/ethosu/vela/tflite_graph_optimiser.py @@ -191,8 +191,12 @@ def remove_SplitSliceRead(op, arch): if op.type == Op.SplitSliceRead: # Check if it is possible to put the SplitSliceRead on the tensor consumer(s), # or if an avgpool need to be inserted + # Not possible to do if consumer is a Transpose op since ifm shape has been reshaped and can not be changed if op.ofm_shapes[0] == Shape4D.from_list(op.ofm.shape) and all( - consumer is not None and consumer.run_on_npu and consumer.type not in memory_only_ops + consumer is not None + and consumer.run_on_npu 
+ and consumer.type not in memory_only_ops + and consumer.original_type != Op.Transpose for consumer in op.ofm.consumer_list ): # SplitSliceRead can be performed by tensor consumer(s) @@ -2535,6 +2539,92 @@ def fixup_dilation_gt2(op: Operation, arch, nng) -> Operation: return op +def fixup_transpose(op, arch, nng): + """ + Convert Transpose to AvgPool where the strides for height and width is swapped on the OFM + in order to achieve the transpose. It is only possible to swap height and width on the op. + + Shape (2,3) transposed to Shape (3,2) + |0|1|2| ifm_stride_w = 1 |0|3| ofm_stride_w = 1 + |4|5|6| ifm_stride_h = 3 |1|4| ofm_stride_h = 2 + |2|5| + + To achieve the above with the AvgPool, the ofm_shape must be set equal to the ifm_shape. + The reason is that AvgPool uses the ofm shape when looping over the memory. So if the + ofm shape is not equal to the ifm shape the full ifm will not be read. + When looping over the values the following formula is used: + + IFM [h_pos, w_pos] = h_pos * ifm_stride_h + w_pos * ifm_stride_w + OFM [h_pos, w_pos] = h_pos * ofm_stride_w + w_pos * ofm_stride_h (stride has been swapped) + + Below code changes op to an AvgPool and sets the correct shapes. The actual stride swap + is done when creating the ofm featuremap. As seen there are several corner cases + when it is possible to transpose the depth channel. 
+ """ + if op.type == Op.Transpose: + op.name = f"{op.name}_avgpool" + op.type = Op.AvgPool + op.attrs["padding"] = Padding.VALID + op.attrs["stride_w"] = 1 + op.attrs["stride_h"] = 1 + op.attrs["filter_width"] = 1 + op.attrs["filter_height"] = 1 + op.attrs["strides"] = [1, 1, 1, 1] + op.attrs["ksize"] = [1, 1, 1, 1] + # Swapping strides only works in linear format (ofm) + op.ofm.force_linear_format = True + + # Convert IFM to correct 4D shape + perm = op.inputs[1] + ifm_shape = op.ifm.shape + + # IFM rank 2 case + if len(ifm_shape) == 2: + # IFM shape: WxC -> 1xWxCx1 + op.ifm_shapes[0] = Shape4D([1, ifm_shape[0], ifm_shape[1], 1]) + + # IFM rank 3 cases + elif len(ifm_shape) == 3: + # Check if HxWxC -> WxHxC + if perm.values[0] == 1 and perm.values[1] == 0: + # IFM shape: HxWxC -> 1xHxWxC + op.ifm_shapes[0] = Shape4D([1, ifm_shape[0], ifm_shape[1], ifm_shape[2]]) + + # Check if 1xWxC -> 1xCxW + elif ifm_shape[0] == 1 and perm.values[1] == 2 and perm.values[2] == 1: + # IFM shape: 1xWxC -> 1xWxCx1 + op.ifm_shapes[0] = Shape4D([1, ifm_shape[1], ifm_shape[2], 1]) + + # Check if Hx1xC -> Cx1xH + elif ifm_shape[1] == 1 and perm.values[0] == 2 and perm.values[2] == 0: + # IFM shape: Hx1xC -> 1xHxCx1 + op.ifm_shapes[0] = Shape4D([1, ifm_shape[0], ifm_shape[2], 1]) + + # IFM rank 4 cases + elif len(ifm_shape) == 4: + # Check if 1xHxWxC -> 1xWxHxC + if perm.values[1] == 2 and perm.values[2] == 1: + # IFM shape is correct + pass + + # Check if 1x1xWxC -> 1x1xCxW + elif ifm_shape[1] == 1 and perm.values[2] == 3 and perm.values[3] == 2: + # IFM shape: 1x1xWxC -> 1xWxCx1 + op.ifm_shapes[0] = Shape4D([1, ifm_shape[2], ifm_shape[3], 1]) + + # Check if 1xHx1xC -> 1xCx1xH + elif ifm_shape[2] == 1 and perm.values[1] == 3 and perm.values[3] == 1: + # IFM shape: 1xHx1xC -> 1xHxCx1 + op.ifm_shapes[0] = Shape4D([1, ifm_shape[1], ifm_shape[3], 1]) + + # OFM shape must use IFM shape + op.ofm_shapes[0] = op.ifm_shapes[0] + + DebugDatabase.add_optimised(op, op) + + return op + + def 
fixup_reshape(op, arch, nng): def _get_explicit_shape(implicit_shape, total_size): # the explicit shape is a copy of the implicit shape but with the special -1 (remaining size) value converted to @@ -2824,6 +2914,7 @@ def tflite_optimise_graph(nng, arch, force_symmetric_int_weights): convert_quantize, replace_pad_by_hw_pad, fixup_dilation_gt2, + fixup_transpose, ] for idx, sg in enumerate(nng.subgraphs): diff --git a/ethosu/vela/tflite_model_semantic.py b/ethosu/vela/tflite_model_semantic.py index 258af93..d9ace1e 100644 --- a/ethosu/vela/tflite_model_semantic.py +++ b/ethosu/vela/tflite_model_semantic.py @@ -251,6 +251,9 @@ class TFLiteSemantic: Op.ArgMax: [ TFLiteSemantic.constraint_tens_quant_none_check, ], + Op.Transpose: [ + TFLiteSemantic.constraint_tens_quant_none_check, + ], } return generic_constraints_exclude_list diff --git a/ethosu/vela/tflite_supported_operators.py b/ethosu/vela/tflite_supported_operators.py index 14c2213..4500391 100644 --- a/ethosu/vela/tflite_supported_operators.py +++ b/ethosu/vela/tflite_supported_operators.py @@ -109,7 +109,9 @@ class TFLiteSupportedOperators: elem_wise_main_ops = binary_elem_wise_main_ops | unary_elem_wise_main_ops | set((Op.SquaredDifference,)) pad_ops = set((Op.Pad,)) supported_int32_tensor_ops = ( - set((Op.ReduceSum, Op.CLZ, Op.Shape, Op.ArgMax)) | binary_elem_wise_add_mul_sub | binary_elem_wise_shift_ops + set((Op.ReduceSum, Op.CLZ, Op.Shape, Op.ArgMax, Op.Transpose)) + | binary_elem_wise_add_mul_sub + | binary_elem_wise_shift_ops ) relu_ops = set( @@ -163,6 +165,7 @@ class TFLiteSupportedOperators: Op.QuantizedReshape, Op.Squeeze, Op.ExpandDims, + Op.Transpose, ) ) | concat_ops @@ -340,6 +343,9 @@ class TFLiteSupportedOperators: # Slice specific checks: self.specific_constraints[Op.Slice].append(TFLiteSupportedOperators.constraint_slice_inputs_const) + # Transpose specific checks: + self.specific_constraints[Op.Transpose].append(TFLiteSupportedOperators.constraint_transpose) + def 
is_operator_supported(self, op): ext_type = optype_to_builtintype(op.type) if op.type not in TFLiteSupportedOperators.supported_operators: @@ -1027,3 +1033,42 @@ class TFLiteSupportedOperators: extra.append(f"Size tensor '{sizes.name}'") extra = ", ".join(extra) return valid, f"Op has non-constant tensors: {extra}" + + @staticmethod + def constraint_transpose(op): + """The following shape/permutations are supported for transpose: + When ifm rank is 2: WxC -> CxW + When ifm rank is 3: HxWxC -> WxHxC, 1xWxC -> 1xCxW, Hx1xC -> Cx1xH + When ifm rank is 4: 1xHxWxC -> 1xWxHxC, 1x1xWxC -> 1x1xCxW, 1xHx1xC -> 1xCx1xH""" + + ifm_shape = op.inputs[0].shape + perm = op.inputs[1] + + # WxC -> CxW + valid = len(ifm_shape) == 2 + + # HxWxC -> WxHxC + if not valid and perm.shape == [3]: + valid = perm.values[0] == 1 and perm.values[1] == 0 + + # 1xWxC -> 1xCxW + if not valid and perm.shape == [3] and ifm_shape[0] == 1: + valid = perm.values[1] == 2 and perm.values[2] == 1 + + # Hx1xC -> Cx1xH + if not valid and perm.shape == [3] and ifm_shape[1] == 1: + valid = perm.values[0] == 2 and perm.values[2] == 0 + + # 1xHxWxC -> 1xWxHxC + if not valid and perm.shape == [4]: + valid = perm.values[0] == 0 and perm.values[1] == 2 and perm.values[2] == 1 + + # 1x1xWxC -> 1x1xCxW + if not valid and perm.shape == [4] and ifm_shape[1] == 1: + valid = perm.values[0] == 0 and perm.values[2] == 3 and perm.values[3] == 2 + + # 1xHx1xC -> 1xCx1xH + if not valid and perm.shape == [4] and ifm_shape[2] == 1: + valid = perm.values[0] == 0 and perm.values[1] == 3 and perm.values[3] == 1 + + return valid, f"Op has ifm_shape: {ifm_shape} and permutation is: {perm.values}" -- cgit v1.2.1