From a8fda88bced0d11441467b6798885101d41ac8e9 Mon Sep 17 00:00:00 2001 From: Johan Alfven Date: Sat, 28 Oct 2023 16:04:46 +0200 Subject: MLBEDSW-8290: MLCE: Add TRANSPOSE support - Added graph optimiser function to convert TRANSPOSE op into an AvgPool op with swapped stride for height and width - Added TRANSPOSE supported op check - Added unit tests for TRANSPOSE supported op check - Updated SUPPORTED_OPS.md - Fixed problem in pass packing when optimizing the pass list. Old problem, but now seen when moving TRANSPOSE from cpu. Change-Id: I0a0ef420b0fb8241090c2e2434622881105cde15 Signed-off-by: Johan Alfven --- SUPPORTED_OPS.md | 16 +++- ethosu/vela/high_level_command_to_npu_op.py | 20 ++++- ethosu/vela/pass_packing.py | 11 ++- .../vela/test/test_tflite_supported_operators.py | 83 +++++++++++++++++++ ethosu/vela/tflite_graph_optimiser.py | 93 +++++++++++++++++++++- ethosu/vela/tflite_model_semantic.py | 3 + ethosu/vela/tflite_supported_operators.py | 47 ++++++++++- 7 files changed, 264 insertions(+), 9 deletions(-) diff --git a/SUPPORTED_OPS.md b/SUPPORTED_OPS.md index 81704e5..ceb0205 100644 --- a/SUPPORTED_OPS.md +++ b/SUPPORTED_OPS.md @@ -19,7 +19,7 @@ limitations under the License. # Supported Ops This file was automatically generated by Vela using the `--supported-ops-report` parameter. 
-Vela version: `3.9.1.dev16+gd230ce9.d20231030` +Vela version: `3.9.1.dev21+gb724cdb.d20231107` This file complies with [**Gitiles Markdown syntax**](https://gerrit.googlesource.com/gitiles/+/HEAD/Documentation/markdown.md) @@ -75,6 +75,7 @@ Please check the supported operator list for your chosen runtime for further inf | STRIDED_SLICE | [Generic](#tflite-generic-constraints), [Specific](#tflite-strided_slice-constraints) | | SUB | [Generic](#tflite-generic-constraints), [Specific](#tflite-sub-constraints) | | TANH | [Generic](#tflite-generic-constraints) | +| TRANSPOSE | [Generic](#tflite-generic-constraints), [Specific](#tflite-transpose-constraints) | | TRANSPOSE_CONV | [Generic](#tflite-generic-constraints), [Specific](#tflite-transpose_conv-constraints) | | UNIDIRECTIONAL_SEQUENCE_LSTM | [Generic](#tflite-generic-constraints), [Specific](#tflite-unidirectional_sequence_lstm-constraints) | | UNPACK | [Generic](#tflite-generic-constraints) | @@ -90,12 +91,12 @@ This is a list of constraints most NPU operators must satisfy in order to be sch - Output tensors cannot be scalar - [QUANTIZE] - Scalar Input tensors are only valid for op type: ADD, ARG_MAX, EXPAND_DIMS, MAXIMUM, MEAN, MINIMUM, MUL, QUANTIZE, SPLIT, SPLIT_V, SUB - Input(s) and Output tensors must not be greater than 4D -- Input(s), Output and Weight tensors must have quantization parameters - [ARG_MAX, SHAPE] +- Input(s), Output and Weight tensors must have quantization parameters - [ARG_MAX, SHAPE, TRANSPOSE] - Input(s), Output and Weight tensors with quantization scales must be finite - Input and Output tensors must have quantization scales that fit within float32 precision - Constant tensors should not have NoneType-values - Tensors must be of type: int16, int32, int8, uint8 - [ARG_MAX] -- Tensors which are int32 are only valid when op type is: ADD, ARG_MAX, MUL, SHAPE, SUB +- Tensors which are int32 are only valid when op type is: ADD, ARG_MAX, MUL, SHAPE, SUB, TRANSPOSE - Tensor dimensions must be 
in the range [1, 65535] - Per-axis quantization is only supported for the following op types: CONV_2D, DEPTHWISE_CONV_2D, TRANSPOSE_CONV - IFM Tensor batch size must be 1 - [FULLY_CONNECTED, RESHAPE, SHAPE, SLICE, SOFTMAX, SPLIT, SPLIT_V, SQUEEZE, STRIDED_SLICE, UNPACK] @@ -405,6 +406,15 @@ This is a list of constraints that the SUB operator must satisfy in order to be - For IFM that are unsigned, OFM must either be the same type or int32 - Broadcasting is only allowed for rank indices with dimension 1, from either IFM1 or IFM2 +### TFLite TRANSPOSE Constraints + +This is a list of constraints that the TRANSPOSE operator must satisfy in order to be scheduled on the NPU. + +- The following shape/permutations are supported for transpose: + When ifm rank is 2: WxC -> CxW + When ifm rank is 3: HxWxC -> WxHxC, 1xWxC -> 1xCxW, Hx1xC -> Cx1xH + When ifm rank is 4: 1xHxWxC -> 1xWxHxC, 1x1xWxC -> 1x1xCxW, 1xHx1xC -> 1xCx1xH + ### TFLite TRANSPOSE_CONV Constraints This is a list of constraints that the TRANSPOSE_CONV operator must satisfy in order to be scheduled on the NPU. diff --git a/ethosu/vela/high_level_command_to_npu_op.py b/ethosu/vela/high_level_command_to_npu_op.py index 5e9dffa..53df096 100644 --- a/ethosu/vela/high_level_command_to_npu_op.py +++ b/ethosu/vela/high_level_command_to_npu_op.py @@ -383,6 +383,7 @@ def create_feature_map( op_shape4D: Shape4D, tile_base_offsets: List[int], stride_multiplier: Optional[List[int]] = None, + is_ofm: bool = False, ) -> NpuFeatureMap: """Creates feature map with common fields populated""" fm = NpuFeatureMap() @@ -395,7 +396,16 @@ def create_feature_map( else: assert 0, "Incorrect tensor format" - strides = tens.get_strides(op_shape4D) + if is_ofm and tens.ops[0] is not None and tens.ops[0].original_type == Op.Transpose: + # op_shape4D has ifm shape, see fixup_transpose. Stride calculations needs to be + # based on the correct ofm shape. 
+ op_shape4D_ofm_shape = Shape4D([op_shape4D.batch, op_shape4D.width, op_shape4D.height, op_shape4D.depth]) + strides = tens.get_strides(op_shape4D_ofm_shape) + # Swap h and w strides which will cause the transpose to happen + strides[-3], strides[-2] = strides[-2], strides[-3] + else: + strides = tens.get_strides(op_shape4D) + assert strides is not None if stride_multiplier and stride_multiplier != [1, 1, 1]: @@ -513,7 +523,13 @@ def set_common_op_fields(npu_op: NpuBlockOperation, cmd: NpuStripe, arch: Archit out_block = cmd.ofm_box.get_block() npu_op.ofm = create_feature_map( - cmd.ofm_tensor, cmd.ofm_box, arch, ps.ofm_shapes[0], op.tile_base_offsets_ofm, op.ofm_stride_multiplier + cmd.ofm_tensor, + cmd.ofm_box, + arch, + ps.ofm_shapes[0], + op.tile_base_offsets_ofm, + op.ofm_stride_multiplier, + is_ofm=True, ) npu_op.ofm.shape = NpuShape3D(height=out_block.height, width=out_block.width, depth=out_block.depth) npu_op.ofm.quantization = get_ofm_quantization(ps, cmd.ofm_tensor) diff --git a/ethosu/vela/pass_packing.py b/ethosu/vela/pass_packing.py index 4c733cc..0de0341 100644 --- a/ethosu/vela/pass_packing.py +++ b/ethosu/vela/pass_packing.py @@ -481,7 +481,7 @@ def pack_into_passes(nng, arch, verbose_packing=False): # Try to optmize this by moving/grouping CPU ops where that is possible. # Criteria for CPU pass to be moved: # - # 1) CPU passes that only depends on sg.input_tensor can be + # 1) CPU passes that only depends on sg.input_tensors can be # moved to the top of the list. # ResourceVariables ops like VarHandle, ReadVariable, CallOnce # can also be moved to the top of list. @@ -503,9 +503,16 @@ def pack_into_passes(nng, arch, verbose_packing=False): pass_list_top.insert(0, ps) continue + ifm2 = ps.ops[0].ifm2 + if ifm2 is None: + # Dynamic weights must be treated as ifm's. 
+ if ps.ops[0].type == Op.FullyConnected and ps.ops[0].weights.purpose == TensorPurpose.FeatureMap: + # Op has dynamic weights, include this in the check below + ifm2 = ps.ops[0].weights + if ps.placement == PassPlacement.Cpu and ( ps.ops[0].ifm in sg.input_tensors - and (ps.ops[0].ifm2 in sg.input_tensors or ps.ops[0].ifm2 is None) + and (ifm2 in sg.input_tensors or ifm2 is None) or (ps.ops[0].type in (Op.VarHandle, Op.ReadVariable, Op.CallOnce)) ): # This CPU pass only depends on sg.input_tensors or resource variable diff --git a/ethosu/vela/test/test_tflite_supported_operators.py b/ethosu/vela/test/test_tflite_supported_operators.py index a433fb8..e65717a 100644 --- a/ethosu/vela/test/test_tflite_supported_operators.py +++ b/ethosu/vela/test/test_tflite_supported_operators.py @@ -759,3 +759,86 @@ def test_constraint_slice_inputs_const(): op.set_input_tensor(begin, 1) op.set_input_tensor(begin, 2) assert support.is_operator_supported(op) + + +def test_constraint_transpose(): + # Test supported op IFM rank 2 + ifm = Tensor([2, 4], DataType.int8, "ifm") + perm = create_const_tensor("perm", [2], DataType.int32, [1, 0]) + ofm = Tensor([4, 2], DataType.int8, "ofm") + op = testutil.create_op(Op.Transpose, [ifm, perm], ofm) + assert support.is_operator_supported(op) + # Test supported op IFM rank 3 + ifm = Tensor([2, 4, 6], DataType.int8, "ifm") + perm = create_const_tensor("perm", [3], DataType.int32, [1, 0, 2]) + ofm = Tensor([4, 2, 6], DataType.int8, "ofm") + op = testutil.create_op(Op.Transpose, [ifm, perm], ofm) + assert support.is_operator_supported(op) + ifm = Tensor([1, 4, 6], DataType.int8, "ifm") + perm = create_const_tensor("perm", [3], DataType.int32, [0, 2, 1]) + ofm = Tensor([1, 6, 4], DataType.int8, "ofm") + op = testutil.create_op(Op.Transpose, [ifm, perm], ofm) + assert support.is_operator_supported(op) + ifm = Tensor([2, 1, 6], DataType.int8, "ifm") + perm = create_const_tensor("perm", [3], DataType.int32, [2, 1, 0]) + ofm = Tensor([6, 1, 2], 
DataType.int8, "ofm") + op = testutil.create_op(Op.Transpose, [ifm, perm], ofm) + assert support.is_operator_supported(op) + # Test supported op IFM rank 4 + ifm = Tensor([1, 2, 4, 6], DataType.int8, "ifm") + perm = create_const_tensor("perm", [4], DataType.int32, [0, 2, 1, 3]) + ofm = Tensor([1, 4, 2, 6], DataType.int8, "ofm") + op = testutil.create_op(Op.Transpose, [ifm, perm], ofm) + assert support.is_operator_supported(op) + ifm = Tensor([1, 1, 4, 6], DataType.int8, "ifm") + perm = create_const_tensor("perm", [4], DataType.int32, [0, 1, 3, 2]) + ofm = Tensor([1, 1, 6, 4], DataType.int8, "ofm") + op = testutil.create_op(Op.Transpose, [ifm, perm], ofm) + assert support.is_operator_supported(op) + ifm = Tensor([1, 2, 1, 6], DataType.int8, "ifm") + perm = create_const_tensor("perm", [4], DataType.int32, [0, 3, 2, 1]) + ofm = Tensor([1, 6, 1, 2], DataType.int8, "ofm") + op = testutil.create_op(Op.Transpose, [ifm, perm], ofm) + assert support.is_operator_supported(op) + # Test not supported op IFM rank 3 + ifm = Tensor([2, 4, 6], DataType.int8, "ifm") + perm = create_const_tensor("perm", [3], DataType.int32, [0, 2, 1]) + ofm = Tensor([2, 6, 4], DataType.int8, "ofm") + op = testutil.create_op(Op.Transpose, [ifm, perm], ofm) + assert not support.is_operator_supported(op) + ifm = Tensor([2, 4, 6], DataType.int8, "ifm") + perm = create_const_tensor("perm", [3], DataType.int32, [2, 1, 0]) + ofm = Tensor([6, 2, 2], DataType.int8, "ofm") + op = testutil.create_op(Op.Transpose, [ifm, perm], ofm) + assert not support.is_operator_supported(op) + # Test not supported op IFM rank 4 + ifm = Tensor([1, 2, 4, 6], DataType.int8, "ifm") + perm = create_const_tensor("perm", [4], DataType.int32, [0, 1, 3, 2]) + ofm = Tensor([1, 2, 6, 4], DataType.int8, "ofm") + op = testutil.create_op(Op.Transpose, [ifm, perm], ofm) + assert not support.is_operator_supported(op) + ifm = Tensor([1, 2, 4, 6], DataType.int8, "ifm") + perm = create_const_tensor("perm", [4], DataType.int32, [0, 3, 2, 1]) + 
ofm = Tensor([1, 6, 4, 2], DataType.int8, "ofm") + op = testutil.create_op(Op.Transpose, [ifm, perm], ofm) + assert not support.is_operator_supported(op) + ifm = Tensor([1, 2, 4, 6], DataType.int8, "ifm") + perm = create_const_tensor("perm", [4], DataType.int32, [1, 0, 2, 3]) + ofm = Tensor([2, 1, 4, 6], DataType.int8, "ofm") + op = testutil.create_op(Op.Transpose, [ifm, perm], ofm) + assert not support.is_operator_supported(op) + ifm = Tensor([1, 2, 4, 6], DataType.int8, "ifm") + perm = create_const_tensor("perm", [4], DataType.int32, [2, 1, 0, 3]) + ofm = Tensor([4, 2, 1, 6], DataType.int8, "ofm") + op = testutil.create_op(Op.Transpose, [ifm, perm], ofm) + assert not support.is_operator_supported(op) + ifm = Tensor([1, 2, 4, 6], DataType.int8, "ifm") + perm = create_const_tensor("perm", [4], DataType.int32, [3, 1, 2, 0]) + ofm = Tensor([6, 2, 4, 1], DataType.int8, "ofm") + op = testutil.create_op(Op.Transpose, [ifm, perm], ofm) + assert not support.is_operator_supported(op) + ifm = Tensor([1, 2, 4, 6], DataType.int8, "ifm") + perm = create_const_tensor("perm", [4], DataType.int32, [3, 2, 1, 0]) + ofm = Tensor([6, 4, 2, 1], DataType.int8, "ofm") + op = testutil.create_op(Op.Transpose, [ifm, perm], ofm) + assert not support.is_operator_supported(op) diff --git a/ethosu/vela/tflite_graph_optimiser.py b/ethosu/vela/tflite_graph_optimiser.py index 85fb8ba..cc947bc 100644 --- a/ethosu/vela/tflite_graph_optimiser.py +++ b/ethosu/vela/tflite_graph_optimiser.py @@ -191,8 +191,12 @@ def remove_SplitSliceRead(op, arch): if op.type == Op.SplitSliceRead: # Check if it is possible to put the SplitSliceRead on the tensor consumer(s), # or if an avgpool need to be inserted + # Not possible to do if consumer is a Transpose op since ifm shape has been reshaped and can not be changed if op.ofm_shapes[0] == Shape4D.from_list(op.ofm.shape) and all( - consumer is not None and consumer.run_on_npu and consumer.type not in memory_only_ops + consumer is not None + and consumer.run_on_npu 
+ and consumer.type not in memory_only_ops + and consumer.original_type != Op.Transpose for consumer in op.ofm.consumer_list ): # SplitSliceRead can be performed by tensor consumer(s) @@ -2535,6 +2539,92 @@ def fixup_dilation_gt2(op: Operation, arch, nng) -> Operation: return op +def fixup_transpose(op, arch, nng): + """ + Convert Transpose to AvgPool where the strides for height and width is swapped on the OFM + in order to achieve the transpose. It is only possible to swap height and width on the op. + + Shape (2,3) transposed to Shape (3,2) + |0|1|2| ifm_stride_w = 1 |0|3| ofm_stride_w = 1 + |4|5|6| ifm_stride_h = 3 |1|4| ofm_stride_h = 2 + |2|5| + + To achieve the above with the AvgPool, the ofm_shape must be set equal to the ifm_shape. + The reason is that AvgPool uses the ofm shape when looping over the memory. So if the + ofm shape is not equal to the ifm shape the full ifm will not be read. + When looping over the values the following formula is used: + + IFM [h_pos, w_pos] = h_pos * ifm_stride_h + w_pos * ifm_stride_w + OFM [h_pos, w_pos] = h_pos * ofm_stride_w + w_pos * ofm_stride_h (stride has been swapped) + + Below code changes op to an AvgPool and sets the correct shapes. The actual stride swap + is done when creating the ofm featuremap. As seen there are several corner cases + when it is possible to transpose the depth channel. 
+ """ + if op.type == Op.Transpose: + op.name = f"{op.name}_avgpool" + op.type = Op.AvgPool + op.attrs["padding"] = Padding.VALID + op.attrs["stride_w"] = 1 + op.attrs["stride_h"] = 1 + op.attrs["filter_width"] = 1 + op.attrs["filter_height"] = 1 + op.attrs["strides"] = [1, 1, 1, 1] + op.attrs["ksize"] = [1, 1, 1, 1] + # Swapping strides only works in linear format (ofm) + op.ofm.force_linear_format = True + + # Convert IFM to correct 4D shape + perm = op.inputs[1] + ifm_shape = op.ifm.shape + + # IFM rank 2 case + if len(ifm_shape) == 2: + # IFM shape: WxC -> 1xWxCx1 + op.ifm_shapes[0] = Shape4D([1, ifm_shape[0], ifm_shape[1], 1]) + + # IFM rank 3 cases + elif len(ifm_shape) == 3: + # Check if HxWxC -> WxHxC + if perm.values[0] == 1 and perm.values[1] == 0: + # IFM shape: HxWxC -> 1xHxWxC + op.ifm_shapes[0] = Shape4D([1, ifm_shape[0], ifm_shape[1], ifm_shape[2]]) + + # Check if 1xWxC -> 1xCxW + elif ifm_shape[0] == 1 and perm.values[1] == 2 and perm.values[2] == 1: + # IFM shape: 1xWxC -> 1xWxCx1 + op.ifm_shapes[0] = Shape4D([1, ifm_shape[1], ifm_shape[2], 1]) + + # Check if Hx1xC -> Cx1xH + elif ifm_shape[1] == 1 and perm.values[0] == 2 and perm.values[2] == 0: + # IFM shape: Hx1xC -> 1xHxCx1 + op.ifm_shapes[0] = Shape4D([1, ifm_shape[0], ifm_shape[2], 1]) + + # IFM rank 4 cases + elif len(ifm_shape) == 4: + # Check if 1xHxWxC -> 1xWxHxC + if perm.values[1] == 2 and perm.values[2] == 1: + # IFM shape is correct + pass + + # Check if 1x1xWxC -> 1x1xCxW + elif ifm_shape[1] == 1 and perm.values[2] == 3 and perm.values[3] == 2: + # IFM shape: 1x1xWxC -> 1xWxCx1 + op.ifm_shapes[0] = Shape4D([1, ifm_shape[2], ifm_shape[3], 1]) + + # Check if 1xHx1xC -> 1xCx1xH + elif ifm_shape[2] == 1 and perm.values[1] == 3 and perm.values[3] == 1: + # IFM shape: 1xHx1xC -> 1xHxCx1 + op.ifm_shapes[0] = Shape4D([1, ifm_shape[1], ifm_shape[3], 1]) + + # OFM shape must use IFM shape + op.ofm_shapes[0] = op.ifm_shapes[0] + + DebugDatabase.add_optimised(op, op) + + return op + + def 
fixup_reshape(op, arch, nng): def _get_explicit_shape(implicit_shape, total_size): # the explicit shape is a copy of the implicit shape but with the special -1 (remaining size) value converted to @@ -2824,6 +2914,7 @@ def tflite_optimise_graph(nng, arch, force_symmetric_int_weights): convert_quantize, replace_pad_by_hw_pad, fixup_dilation_gt2, + fixup_transpose, ] for idx, sg in enumerate(nng.subgraphs): diff --git a/ethosu/vela/tflite_model_semantic.py b/ethosu/vela/tflite_model_semantic.py index 258af93..d9ace1e 100644 --- a/ethosu/vela/tflite_model_semantic.py +++ b/ethosu/vela/tflite_model_semantic.py @@ -251,6 +251,9 @@ class TFLiteSemantic: Op.ArgMax: [ TFLiteSemantic.constraint_tens_quant_none_check, ], + Op.Transpose: [ + TFLiteSemantic.constraint_tens_quant_none_check, + ], } return generic_constraints_exclude_list diff --git a/ethosu/vela/tflite_supported_operators.py b/ethosu/vela/tflite_supported_operators.py index 14c2213..4500391 100644 --- a/ethosu/vela/tflite_supported_operators.py +++ b/ethosu/vela/tflite_supported_operators.py @@ -109,7 +109,9 @@ class TFLiteSupportedOperators: elem_wise_main_ops = binary_elem_wise_main_ops | unary_elem_wise_main_ops | set((Op.SquaredDifference,)) pad_ops = set((Op.Pad,)) supported_int32_tensor_ops = ( - set((Op.ReduceSum, Op.CLZ, Op.Shape, Op.ArgMax)) | binary_elem_wise_add_mul_sub | binary_elem_wise_shift_ops + set((Op.ReduceSum, Op.CLZ, Op.Shape, Op.ArgMax, Op.Transpose)) + | binary_elem_wise_add_mul_sub + | binary_elem_wise_shift_ops ) relu_ops = set( @@ -163,6 +165,7 @@ class TFLiteSupportedOperators: Op.QuantizedReshape, Op.Squeeze, Op.ExpandDims, + Op.Transpose, ) ) | concat_ops @@ -340,6 +343,9 @@ class TFLiteSupportedOperators: # Slice specific checks: self.specific_constraints[Op.Slice].append(TFLiteSupportedOperators.constraint_slice_inputs_const) + # Transpose specific checks: + self.specific_constraints[Op.Transpose].append(TFLiteSupportedOperators.constraint_transpose) + def 
is_operator_supported(self, op): ext_type = optype_to_builtintype(op.type) if op.type not in TFLiteSupportedOperators.supported_operators: @@ -1027,3 +1033,42 @@ class TFLiteSupportedOperators: extra.append(f"Size tensor '{sizes.name}'") extra = ", ".join(extra) return valid, f"Op has non-constant tensors: {extra}" + + @staticmethod + def constraint_transpose(op): + """The following shape/permutations are supported for transpose: + When ifm rank is 2: WxC -> CxW + When ifm rank is 3: HxWxC -> WxHxC, 1xWxC -> 1xCxW, Hx1xC -> Cx1xH + When ifm rank is 4: 1xHxWxC -> 1xWxHxC, 1x1xWxC -> 1x1xCxW, 1xHx1xC -> 1xCx1xH""" + + ifm_shape = op.inputs[0].shape + perm = op.inputs[1] + + # WxC -> CxW + valid = len(ifm_shape) == 2 + + # HxWxC -> WxHxC + if not valid and perm.shape == [3]: + valid = perm.values[0] == 1 and perm.values[1] == 0 + + # 1xWxC -> 1xCxW + if not valid and perm.shape == [3] and ifm_shape[0] == 1: + valid = perm.values[1] == 2 and perm.values[2] == 1 + + # Hx1xC -> Cx1xH + if not valid and perm.shape == [3] and ifm_shape[1] == 1: + valid = perm.values[0] == 2 and perm.values[2] == 0 + + # 1xHxWxC -> 1xWxHxC + if not valid and perm.shape == [4]: + valid = perm.values[0] == 0 and perm.values[1] == 2 and perm.values[2] == 1 + + # 1x1xWxC -> 1x1xCxW + if not valid and perm.shape == [4] and ifm_shape[1] == 1: + valid = perm.values[0] == 0 and perm.values[2] == 3 and perm.values[3] == 2 + + # 1xHx1xC -> 1xCx1xH + if not valid and perm.shape == [4] and ifm_shape[2] == 1: + valid = perm.values[0] == 0 and perm.values[1] == 3 and perm.values[3] == 1 + + return valid, f"Op has ifm_shape: {ifm_shape} and permutation is: {perm.values}" -- cgit v1.2.1