aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJohan Alfven <johan.alfven@arm.com>2023-10-28 16:04:46 +0200
committerJohan Alfven <johan.alfven@arm.com>2023-11-09 11:59:58 +0100
commita8fda88bced0d11441467b6798885101d41ac8e9 (patch)
tree807de7fa4eee48720255fbed4a605218f8612f6a
parent4bf0cdf58416edc030ae7507ace95224785e4aa8 (diff)
downloadethos-u-vela-a8fda88bced0d11441467b6798885101d41ac8e9.tar.gz
MLBEDSW-8290: MLCE: Add TRANSPOSE support3.10.0.rc1
- Added graph optimiser function to convert TRANSPOSE op into an AvgPool op with swapped stride for height and width - Added TRANSPOSE supported op check - Added unit tests for TRANSPOSE supported op check - Updated SUPPORTED_OPS.md - Fixed problem in pass packing when optimizing the pass list. Old problem, but now seen when moving TRANSPOSE from cpu. Change-Id: I0a0ef420b0fb8241090c2e2434622881105cde15 Signed-off-by: Johan Alfven <johan.alfven@arm.com>
-rw-r--r--SUPPORTED_OPS.md16
-rw-r--r--ethosu/vela/high_level_command_to_npu_op.py20
-rw-r--r--ethosu/vela/pass_packing.py11
-rw-r--r--ethosu/vela/test/test_tflite_supported_operators.py83
-rw-r--r--ethosu/vela/tflite_graph_optimiser.py93
-rw-r--r--ethosu/vela/tflite_model_semantic.py3
-rw-r--r--ethosu/vela/tflite_supported_operators.py47
7 files changed, 264 insertions, 9 deletions
diff --git a/SUPPORTED_OPS.md b/SUPPORTED_OPS.md
index 81704e5..ceb0205 100644
--- a/SUPPORTED_OPS.md
+++ b/SUPPORTED_OPS.md
@@ -19,7 +19,7 @@ limitations under the License.
# Supported Ops
This file was automatically generated by Vela using the `--supported-ops-report` parameter.
-Vela version: `3.9.1.dev16+gd230ce9.d20231030`
+Vela version: `3.9.1.dev21+gb724cdb.d20231107`
This file complies with
[**Gitiles Markdown syntax**](https://gerrit.googlesource.com/gitiles/+/HEAD/Documentation/markdown.md)
@@ -75,6 +75,7 @@ Please check the supported operator list for your chosen runtime for further inf
| STRIDED_SLICE | [Generic](#tflite-generic-constraints), [Specific](#tflite-strided_slice-constraints) |
| SUB | [Generic](#tflite-generic-constraints), [Specific](#tflite-sub-constraints) |
| TANH | [Generic](#tflite-generic-constraints) |
+| TRANSPOSE | [Generic](#tflite-generic-constraints), [Specific](#tflite-transpose-constraints) |
| TRANSPOSE_CONV | [Generic](#tflite-generic-constraints), [Specific](#tflite-transpose_conv-constraints) |
| UNIDIRECTIONAL_SEQUENCE_LSTM | [Generic](#tflite-generic-constraints), [Specific](#tflite-unidirectional_sequence_lstm-constraints) |
| UNPACK | [Generic](#tflite-generic-constraints) |
@@ -90,12 +91,12 @@ This is a list of constraints most NPU operators must satisfy in order to be sch
- Output tensors cannot be scalar - [QUANTIZE]
- Scalar Input tensors are only valid for op type: ADD, ARG_MAX, EXPAND_DIMS, MAXIMUM, MEAN, MINIMUM, MUL, QUANTIZE, SPLIT, SPLIT_V, SUB
- Input(s) and Output tensors must not be greater than 4D
-- Input(s), Output and Weight tensors must have quantization parameters - [ARG_MAX, SHAPE]
+- Input(s), Output and Weight tensors must have quantization parameters - [ARG_MAX, SHAPE, TRANSPOSE]
- Input(s), Output and Weight tensors with quantization scales must be finite
- Input and Output tensors must have quantization scales that fit within float32 precision
- Constant tensors should not have NoneType-values
- Tensors must be of type: int16, int32, int8, uint8 - [ARG_MAX]
-- Tensors which are int32 are only valid when op type is: ADD, ARG_MAX, MUL, SHAPE, SUB
+- Tensors which are int32 are only valid when op type is: ADD, ARG_MAX, MUL, SHAPE, SUB, TRANSPOSE
- Tensor dimensions must be in the range [1, 65535]
- Per-axis quantization is only supported for the following op types: CONV_2D, DEPTHWISE_CONV_2D, TRANSPOSE_CONV
- IFM Tensor batch size must be 1 - [FULLY_CONNECTED, RESHAPE, SHAPE, SLICE, SOFTMAX, SPLIT, SPLIT_V, SQUEEZE, STRIDED_SLICE, UNPACK]
@@ -405,6 +406,15 @@ This is a list of constraints that the SUB operator must satisfy in order to be
- For IFM that are unsigned, OFM must either be the same type or int32
- Broadcasting is only allowed for rank indices with dimension 1, from either IFM1 or IFM2
+### TFLite TRANSPOSE Constraints
+
+This is a list of constraints that the TRANSPOSE operator must satisfy in order to be scheduled on the NPU.
+
+- The following shape/permutations are supported for transpose:
+ When ifm rank is 2: WxC -> CxW
+ When ifm rank is 3: HxWxC -> WxHxC, 1xWxC -> 1xCxW, Hx1xC -> Cx1xH
+ When ifm rank is 4: 1xHxWxC -> 1xWxHxC, 1x1xWxC -> 1x1xCxW, 1xHx1xC -> 1xCx1xH
+
### TFLite TRANSPOSE_CONV Constraints
This is a list of constraints that the TRANSPOSE_CONV operator must satisfy in order to be scheduled on the NPU.
diff --git a/ethosu/vela/high_level_command_to_npu_op.py b/ethosu/vela/high_level_command_to_npu_op.py
index 5e9dffa..53df096 100644
--- a/ethosu/vela/high_level_command_to_npu_op.py
+++ b/ethosu/vela/high_level_command_to_npu_op.py
@@ -383,6 +383,7 @@ def create_feature_map(
op_shape4D: Shape4D,
tile_base_offsets: List[int],
stride_multiplier: Optional[List[int]] = None,
+ is_ofm: bool = False,
) -> NpuFeatureMap:
"""Creates feature map with common fields populated"""
fm = NpuFeatureMap()
@@ -395,7 +396,16 @@ def create_feature_map(
else:
assert 0, "Incorrect tensor format"
- strides = tens.get_strides(op_shape4D)
+ if is_ofm and tens.ops[0] is not None and tens.ops[0].original_type == Op.Transpose:
+ # op_shape4D has ifm shape, see fixup_transpose. Stride calculations needs to be
+ # based on the correct ofm shape.
+ op_shape4D_ofm_shape = Shape4D([op_shape4D.batch, op_shape4D.width, op_shape4D.height, op_shape4D.depth])
+ strides = tens.get_strides(op_shape4D_ofm_shape)
+ # Swap h and w strides which will cause the transpose to happen
+ strides[-3], strides[-2] = strides[-2], strides[-3]
+ else:
+ strides = tens.get_strides(op_shape4D)
+
assert strides is not None
if stride_multiplier and stride_multiplier != [1, 1, 1]:
@@ -513,7 +523,13 @@ def set_common_op_fields(npu_op: NpuBlockOperation, cmd: NpuStripe, arch: Archit
out_block = cmd.ofm_box.get_block()
npu_op.ofm = create_feature_map(
- cmd.ofm_tensor, cmd.ofm_box, arch, ps.ofm_shapes[0], op.tile_base_offsets_ofm, op.ofm_stride_multiplier
+ cmd.ofm_tensor,
+ cmd.ofm_box,
+ arch,
+ ps.ofm_shapes[0],
+ op.tile_base_offsets_ofm,
+ op.ofm_stride_multiplier,
+ is_ofm=True,
)
npu_op.ofm.shape = NpuShape3D(height=out_block.height, width=out_block.width, depth=out_block.depth)
npu_op.ofm.quantization = get_ofm_quantization(ps, cmd.ofm_tensor)
diff --git a/ethosu/vela/pass_packing.py b/ethosu/vela/pass_packing.py
index 4c733cc..0de0341 100644
--- a/ethosu/vela/pass_packing.py
+++ b/ethosu/vela/pass_packing.py
@@ -481,7 +481,7 @@ def pack_into_passes(nng, arch, verbose_packing=False):
# Try to optimize this by moving/grouping CPU ops where that is possible.
# Criteria for CPU pass to be moved:
#
- # 1) CPU passes that only depends on sg.input_tensor can be
+ # 1) CPU passes that only depends on sg.input_tensors can be
# moved to the top of the list.
# ResourceVariables ops like VarHandle, ReadVariable, CallOnce
# can also be moved to the top of list.
@@ -503,9 +503,16 @@ def pack_into_passes(nng, arch, verbose_packing=False):
pass_list_top.insert(0, ps)
continue
+ ifm2 = ps.ops[0].ifm2
+ if ifm2 is None:
+ # Dynamic weights must be treated as ifm's.
+ if ps.ops[0].type == Op.FullyConnected and ps.ops[0].weights.purpose == TensorPurpose.FeatureMap:
+ # Op has dynamic weights, include this in the check below
+ ifm2 = ps.ops[0].weights
+
if ps.placement == PassPlacement.Cpu and (
ps.ops[0].ifm in sg.input_tensors
- and (ps.ops[0].ifm2 in sg.input_tensors or ps.ops[0].ifm2 is None)
+ and (ifm2 in sg.input_tensors or ifm2 is None)
or (ps.ops[0].type in (Op.VarHandle, Op.ReadVariable, Op.CallOnce))
):
# This CPU pass only depends on sg.input_tensors or resource variable
diff --git a/ethosu/vela/test/test_tflite_supported_operators.py b/ethosu/vela/test/test_tflite_supported_operators.py
index a433fb8..e65717a 100644
--- a/ethosu/vela/test/test_tflite_supported_operators.py
+++ b/ethosu/vela/test/test_tflite_supported_operators.py
@@ -759,3 +759,86 @@ def test_constraint_slice_inputs_const():
op.set_input_tensor(begin, 1)
op.set_input_tensor(begin, 2)
assert support.is_operator_supported(op)
+
+
+def test_constraint_transpose():
+ # Test supported op IFM rank 2
+ ifm = Tensor([2, 4], DataType.int8, "ifm")
+ perm = create_const_tensor("perm", [2], DataType.int32, [1, 0])
+ ofm = Tensor([4, 2], DataType.int8, "ofm")
+ op = testutil.create_op(Op.Transpose, [ifm, perm], ofm)
+ assert support.is_operator_supported(op)
+ # Test supported op IFM rank 3
+ ifm = Tensor([2, 4, 6], DataType.int8, "ifm")
+ perm = create_const_tensor("perm", [3], DataType.int32, [1, 0, 2])
+ ofm = Tensor([4, 2, 6], DataType.int8, "ofm")
+ op = testutil.create_op(Op.Transpose, [ifm, perm], ofm)
+ assert support.is_operator_supported(op)
+ ifm = Tensor([1, 4, 6], DataType.int8, "ifm")
+ perm = create_const_tensor("perm", [3], DataType.int32, [0, 2, 1])
+ ofm = Tensor([1, 6, 4], DataType.int8, "ofm")
+ op = testutil.create_op(Op.Transpose, [ifm, perm], ofm)
+ assert support.is_operator_supported(op)
+ ifm = Tensor([2, 1, 6], DataType.int8, "ifm")
+ perm = create_const_tensor("perm", [3], DataType.int32, [2, 1, 0])
+ ofm = Tensor([6, 1, 2], DataType.int8, "ofm")
+ op = testutil.create_op(Op.Transpose, [ifm, perm], ofm)
+ assert support.is_operator_supported(op)
+ # Test supported op IFM rank 4
+ ifm = Tensor([1, 2, 4, 6], DataType.int8, "ifm")
+ perm = create_const_tensor("perm", [4], DataType.int32, [0, 2, 1, 3])
+ ofm = Tensor([1, 4, 2, 6], DataType.int8, "ofm")
+ op = testutil.create_op(Op.Transpose, [ifm, perm], ofm)
+ assert support.is_operator_supported(op)
+ ifm = Tensor([1, 1, 4, 6], DataType.int8, "ifm")
+ perm = create_const_tensor("perm", [4], DataType.int32, [0, 1, 3, 2])
+ ofm = Tensor([1, 1, 6, 4], DataType.int8, "ofm")
+ op = testutil.create_op(Op.Transpose, [ifm, perm], ofm)
+ assert support.is_operator_supported(op)
+ ifm = Tensor([1, 2, 1, 6], DataType.int8, "ifm")
+ perm = create_const_tensor("perm", [4], DataType.int32, [0, 3, 2, 1])
+ ofm = Tensor([1, 6, 1, 2], DataType.int8, "ofm")
+ op = testutil.create_op(Op.Transpose, [ifm, perm], ofm)
+ assert support.is_operator_supported(op)
+ # Test not supported op IFM rank 3
+ ifm = Tensor([2, 4, 6], DataType.int8, "ifm")
+ perm = create_const_tensor("perm", [3], DataType.int32, [0, 2, 1])
+ ofm = Tensor([2, 6, 4], DataType.int8, "ofm")
+ op = testutil.create_op(Op.Transpose, [ifm, perm], ofm)
+ assert not support.is_operator_supported(op)
+ ifm = Tensor([2, 4, 6], DataType.int8, "ifm")
+ perm = create_const_tensor("perm", [3], DataType.int32, [2, 1, 0])
+ ofm = Tensor([6, 4, 2], DataType.int8, "ofm")
+ op = testutil.create_op(Op.Transpose, [ifm, perm], ofm)
+ assert not support.is_operator_supported(op)
+ # Test not supported op IFM rank 4
+ ifm = Tensor([1, 2, 4, 6], DataType.int8, "ifm")
+ perm = create_const_tensor("perm", [4], DataType.int32, [0, 1, 3, 2])
+ ofm = Tensor([1, 2, 6, 4], DataType.int8, "ofm")
+ op = testutil.create_op(Op.Transpose, [ifm, perm], ofm)
+ assert not support.is_operator_supported(op)
+ ifm = Tensor([1, 2, 4, 6], DataType.int8, "ifm")
+ perm = create_const_tensor("perm", [4], DataType.int32, [0, 3, 2, 1])
+ ofm = Tensor([1, 6, 4, 2], DataType.int8, "ofm")
+ op = testutil.create_op(Op.Transpose, [ifm, perm], ofm)
+ assert not support.is_operator_supported(op)
+ ifm = Tensor([1, 2, 4, 6], DataType.int8, "ifm")
+ perm = create_const_tensor("perm", [4], DataType.int32, [1, 0, 2, 3])
+ ofm = Tensor([2, 1, 4, 6], DataType.int8, "ofm")
+ op = testutil.create_op(Op.Transpose, [ifm, perm], ofm)
+ assert not support.is_operator_supported(op)
+ ifm = Tensor([1, 2, 4, 6], DataType.int8, "ifm")
+ perm = create_const_tensor("perm", [4], DataType.int32, [2, 1, 0, 3])
+ ofm = Tensor([4, 2, 1, 6], DataType.int8, "ofm")
+ op = testutil.create_op(Op.Transpose, [ifm, perm], ofm)
+ assert not support.is_operator_supported(op)
+ ifm = Tensor([1, 2, 4, 6], DataType.int8, "ifm")
+ perm = create_const_tensor("perm", [4], DataType.int32, [3, 1, 2, 0])
+ ofm = Tensor([6, 2, 4, 1], DataType.int8, "ofm")
+ op = testutil.create_op(Op.Transpose, [ifm, perm], ofm)
+ assert not support.is_operator_supported(op)
+ ifm = Tensor([1, 2, 4, 6], DataType.int8, "ifm")
+ perm = create_const_tensor("perm", [4], DataType.int32, [3, 2, 1, 0])
+ ofm = Tensor([6, 4, 2, 1], DataType.int8, "ofm")
+ op = testutil.create_op(Op.Transpose, [ifm, perm], ofm)
+ assert not support.is_operator_supported(op)
diff --git a/ethosu/vela/tflite_graph_optimiser.py b/ethosu/vela/tflite_graph_optimiser.py
index 85fb8ba..cc947bc 100644
--- a/ethosu/vela/tflite_graph_optimiser.py
+++ b/ethosu/vela/tflite_graph_optimiser.py
@@ -191,8 +191,12 @@ def remove_SplitSliceRead(op, arch):
if op.type == Op.SplitSliceRead:
# Check if it is possible to put the SplitSliceRead on the tensor consumer(s),
# or if an avgpool need to be inserted
+ # Not possible to do if consumer is a Transpose op since ifm shape has been reshaped and can not be changed
if op.ofm_shapes[0] == Shape4D.from_list(op.ofm.shape) and all(
- consumer is not None and consumer.run_on_npu and consumer.type not in memory_only_ops
+ consumer is not None
+ and consumer.run_on_npu
+ and consumer.type not in memory_only_ops
+ and consumer.original_type != Op.Transpose
for consumer in op.ofm.consumer_list
):
# SplitSliceRead can be performed by tensor consumer(s)
@@ -2535,6 +2539,92 @@ def fixup_dilation_gt2(op: Operation, arch, nng) -> Operation:
return op
+def fixup_transpose(op, arch, nng):
+ """
+ Convert Transpose to AvgPool where the strides for height and width is swapped on the OFM
+ in order to achieve the transpose. It is only possible to swap height and width on the op.
+
+ Shape (2,3) transposed to Shape (3,2)
+ |0|1|2| ifm_stride_w = 1 |0|3| ofm_stride_w = 1
+ |4|5|6| ifm_stride_h = 3 |1|4| ofm_stride_h = 2
+ |2|5|
+
+ To achieve the above with the AvgPool, the ofm_shape must be set equal to the ifm_shape.
+ The reason is that AvgPool uses the ofm shape when looping over the memory. So if the
+ ofm shape is not equal to the ifm shape the full ifm will not be read.
+ When looping over the values the following formula is used:
+
+ IFM [h_pos, w_pos] = h_pos * ifm_stride_h + w_pos * ifm_stride_w
+ OFM [h_pos, w_pos] = h_pos * ofm_stride_w + w_pos * ofm_stride_h (stride has been swapped)
+
+ Below code changes op to an AvgPool and sets the correct shapes. The actual stride swap
+ is done when creating the ofm featuremap. As seen there are several corner cases
+ when it is possible to transpose the depth channel.
+ """
+ if op.type == Op.Transpose:
+ op.name = f"{op.name}_avgpool"
+ op.type = Op.AvgPool
+ op.attrs["padding"] = Padding.VALID
+ op.attrs["stride_w"] = 1
+ op.attrs["stride_h"] = 1
+ op.attrs["filter_width"] = 1
+ op.attrs["filter_height"] = 1
+ op.attrs["strides"] = [1, 1, 1, 1]
+ op.attrs["ksize"] = [1, 1, 1, 1]
+ # Swapping strides only works in linear format (ofm)
+ op.ofm.force_linear_format = True
+
+ # Convert IFM to correct 4D shape
+ perm = op.inputs[1]
+ ifm_shape = op.ifm.shape
+
+ # IFM rank 2 case
+ if len(ifm_shape) == 2:
+ # IFM shape: WxC -> 1xWxCx1
+ op.ifm_shapes[0] = Shape4D([1, ifm_shape[0], ifm_shape[1], 1])
+
+ # IFM rank 3 cases
+ elif len(ifm_shape) == 3:
+ # Check if HxWxC -> WxHxC
+ if perm.values[0] == 1 and perm.values[1] == 0:
+ # IFM shape: HxWxC -> 1xHxWxC
+ op.ifm_shapes[0] = Shape4D([1, ifm_shape[0], ifm_shape[1], ifm_shape[2]])
+
+ # Check if 1xWxC -> 1xCxW
+ elif ifm_shape[0] == 1 and perm.values[1] == 2 and perm.values[2] == 1:
+ # IFM shape: 1xWxC -> 1xWxCx1
+ op.ifm_shapes[0] = Shape4D([1, ifm_shape[1], ifm_shape[2], 1])
+
+ # Check if Hx1xC -> Cx1xH
+ elif ifm_shape[1] == 1 and perm.values[0] == 2 and perm.values[2] == 0:
+ # IFM shape: Hx1xC -> 1xHxCx1
+ op.ifm_shapes[0] = Shape4D([1, ifm_shape[0], ifm_shape[2], 1])
+
+ # IFM rank 4 cases
+ elif len(ifm_shape) == 4:
+ # Check if 1xHxWxC -> 1xWxHxC
+ if perm.values[1] == 2 and perm.values[2] == 1:
+ # IFM shape is correct
+ pass
+
+ # Check if 1x1xWxC -> 1x1xCxW
+ elif ifm_shape[1] == 1 and perm.values[2] == 3 and perm.values[3] == 2:
+ # IFM shape: 1x1xWxC -> 1xWxCx1
+ op.ifm_shapes[0] = Shape4D([1, ifm_shape[2], ifm_shape[3], 1])
+
+ # Check if 1xHx1xC -> 1xCx1xH
+ elif ifm_shape[2] == 1 and perm.values[1] == 3 and perm.values[3] == 1:
+ # IFM shape: 1xHx1xC -> 1xHxCx1
+ op.ifm_shapes[0] = Shape4D([1, ifm_shape[1], ifm_shape[3], 1])
+
+ # OFM shape must use IFM shape
+ op.ofm_shapes[0] = op.ifm_shapes[0]
+
+ DebugDatabase.add_optimised(op, op)
+
+ return op
+
+
def fixup_reshape(op, arch, nng):
def _get_explicit_shape(implicit_shape, total_size):
# the explicit shape is a copy of the implicit shape but with the special -1 (remaining size) value converted to
@@ -2824,6 +2914,7 @@ def tflite_optimise_graph(nng, arch, force_symmetric_int_weights):
convert_quantize,
replace_pad_by_hw_pad,
fixup_dilation_gt2,
+ fixup_transpose,
]
for idx, sg in enumerate(nng.subgraphs):
diff --git a/ethosu/vela/tflite_model_semantic.py b/ethosu/vela/tflite_model_semantic.py
index 258af93..d9ace1e 100644
--- a/ethosu/vela/tflite_model_semantic.py
+++ b/ethosu/vela/tflite_model_semantic.py
@@ -251,6 +251,9 @@ class TFLiteSemantic:
Op.ArgMax: [
TFLiteSemantic.constraint_tens_quant_none_check,
],
+ Op.Transpose: [
+ TFLiteSemantic.constraint_tens_quant_none_check,
+ ],
}
return generic_constraints_exclude_list
diff --git a/ethosu/vela/tflite_supported_operators.py b/ethosu/vela/tflite_supported_operators.py
index 14c2213..4500391 100644
--- a/ethosu/vela/tflite_supported_operators.py
+++ b/ethosu/vela/tflite_supported_operators.py
@@ -109,7 +109,9 @@ class TFLiteSupportedOperators:
elem_wise_main_ops = binary_elem_wise_main_ops | unary_elem_wise_main_ops | set((Op.SquaredDifference,))
pad_ops = set((Op.Pad,))
supported_int32_tensor_ops = (
- set((Op.ReduceSum, Op.CLZ, Op.Shape, Op.ArgMax)) | binary_elem_wise_add_mul_sub | binary_elem_wise_shift_ops
+ set((Op.ReduceSum, Op.CLZ, Op.Shape, Op.ArgMax, Op.Transpose))
+ | binary_elem_wise_add_mul_sub
+ | binary_elem_wise_shift_ops
)
relu_ops = set(
@@ -163,6 +165,7 @@ class TFLiteSupportedOperators:
Op.QuantizedReshape,
Op.Squeeze,
Op.ExpandDims,
+ Op.Transpose,
)
)
| concat_ops
@@ -340,6 +343,9 @@ class TFLiteSupportedOperators:
# Slice specific checks:
self.specific_constraints[Op.Slice].append(TFLiteSupportedOperators.constraint_slice_inputs_const)
+ # Transpose specific checks:
+ self.specific_constraints[Op.Transpose].append(TFLiteSupportedOperators.constraint_transpose)
+
def is_operator_supported(self, op):
ext_type = optype_to_builtintype(op.type)
if op.type not in TFLiteSupportedOperators.supported_operators:
@@ -1027,3 +1033,42 @@ class TFLiteSupportedOperators:
extra.append(f"Size tensor '{sizes.name}'")
extra = ", ".join(extra)
return valid, f"Op has non-constant tensors: {extra}"
+
+ @staticmethod
+ def constraint_transpose(op):
+ """The following shape/permutations are supported for transpose:
+ When ifm rank is 2: WxC -> CxW
+ When ifm rank is 3: HxWxC -> WxHxC, 1xWxC -> 1xCxW, Hx1xC -> Cx1xH
+ When ifm rank is 4: 1xHxWxC -> 1xWxHxC, 1x1xWxC -> 1x1xCxW, 1xHx1xC -> 1xCx1xH"""
+
+ ifm_shape = op.inputs[0].shape
+ perm = op.inputs[1]
+
+ # WxC -> CxW
+ valid = len(ifm_shape) == 2
+
+ # HxWxC -> WxHxC
+ if not valid and perm.shape == [3]:
+ valid = perm.values[0] == 1 and perm.values[1] == 0
+
+ # 1xWxC -> 1xCxW
+ if not valid and perm.shape == [3] and ifm_shape[0] == 1:
+ valid = perm.values[1] == 2 and perm.values[2] == 1
+
+ # Hx1xC -> Cx1xH
+ if not valid and perm.shape == [3] and ifm_shape[1] == 1:
+ valid = perm.values[0] == 2 and perm.values[2] == 0
+
+ # 1xHxWxC -> 1xWxHxC
+ if not valid and perm.shape == [4]:
+ valid = perm.values[0] == 0 and perm.values[1] == 2 and perm.values[2] == 1
+
+ # 1x1xWxC -> 1x1xCxW
+ if not valid and perm.shape == [4] and ifm_shape[1] == 1:
+ valid = perm.values[0] == 0 and perm.values[2] == 3 and perm.values[3] == 2
+
+ # 1xHx1xC -> 1xCx1xH
+ if not valid and perm.shape == [4] and ifm_shape[2] == 1:
+ valid = perm.values[0] == 0 and perm.values[1] == 3 and perm.values[3] == 1
+
+ return valid, f"Op has ifm_shape: {ifm_shape} and permutation is: {perm.values}"