10 files changed, 159 insertions, 42 deletions
diff --git a/ethosu/vela/graph_optimiser_util.py b/ethosu/vela/graph_optimiser_util.py
index 46762e4d..f1b9e1aa 100644
--- a/ethosu/vela/graph_optimiser_util.py
+++ b/ethosu/vela/graph_optimiser_util.py
@@ -39,6 +39,10 @@ memory_only_ops = (
     Op.Identity,
 )
 
+# This list contains ops that requires its ofm shape to be intact in order
+# to be able to decompose it correctly in the graph optimiser step
+ofm_not_replaceable_ops = (Op.Mean,)
+
 
 def _avoid_nhcwb16_for_concat(tens):
     # If axis corresponds to C-dimension, NHCWB16 can only be used in the output if all the concat_start's are a
@@ -300,8 +304,11 @@ def bypass_memory_only_ops(op, arch, nng):
 
     ifm_has_multiple_cons = len(op.ifm.consumer_list) > 1
     ifm_is_cpu_produced = any(ifm_prod is not None and not ifm_prod.run_on_npu for ifm_prod in op.ifm.ops)
+    producer_ofm_not_replaceable = any(
+        ifm_prod is not None and ifm_prod.type in ofm_not_replaceable_ops for ifm_prod in op.ifm.ops
+    )
 
-    if ifm_has_multiple_cons or ifm_is_cpu_produced:
+    if ifm_has_multiple_cons or ifm_is_cpu_produced or producer_ofm_not_replaceable:
         # Convert to a memcpy op
         op.type = Op.Memcpy
         DebugDatabase.add_optimised(op, op)
@@ -348,6 +355,8 @@ def create_avg_pool_for_concat(concat_op, name, ifm, ifm_shape: Shape4D, write_o
     """Creates an average pool for the given concat op/input feature map"""
     ofm = concat_op.ofm
     avgpool_op = create_avgpool_nop(name)
+    # Enforce original type since this is used in pass packing to group concat ops
+    avgpool_op._original_type = concat_op.type
     avgpool_op.inputs = [ifm]
     avgpool_op.outputs = [ofm]
 
diff --git a/ethosu/vela/high_level_command_to_npu_op.py b/ethosu/vela/high_level_command_to_npu_op.py
index 52d07187..71181d05 100644
--- a/ethosu/vela/high_level_command_to_npu_op.py
+++ b/ethosu/vela/high_level_command_to_npu_op.py
@@ -410,16 +410,20 @@ def create_feature_map(
 
     assert strides is not None
 
+    multiplied_strides = strides.copy()
     if stride_multiplier and stride_multiplier != [1, 1, 1]:
         assert (
             tens.format == TensorFormat.NHWC
         ), "Only default stride multiplier ([1, 1, 1]) supported for NHCWB16 format"
         # Multiply strides for C/H/W (in that order) with corresponding stride factor
         for i, stride_factor in enumerate(stride_multiplier, start=1):
-            strides[i] *= stride_factor
+            multiplied_strides[i] *= stride_factor
+
+    # Stride multiplier only affects tiles and addresses for OFM
+    _strides = multiplied_strides if is_ofm else strides
 
     height_0, height_1, width_0, addresses = tens.addresses_for_rolling_buffer(
-        box.start_coord, box.end_coord, strides, op_shape4D
+        box.start_coord, box.end_coord, _strides, op_shape4D
     )
 
     for idx, offset in enumerate(tile_base_offsets):
@@ -427,7 +431,9 @@ def create_feature_map(
     fm.tiles = NpuTileBox(
         height_0=height_0, height_1=height_1, width_0=width_0, addresses=[int(addr) for addr in addresses]
     )
-    fm.strides = NpuShape3D(height=int(strides[2]), width=int(strides[3]), depth=int(strides[1]))
+    fm.strides = NpuShape3D(
+        height=int(multiplied_strides[2]), width=int(multiplied_strides[3]), depth=int(multiplied_strides[1])
+    )
     fm.name = tens.name
     return fm
 
@@ -518,8 +524,9 @@ def set_common_op_fields(npu_op: NpuBlockOperation, cmd: NpuStripe, arch: Archit
     ifm_height = cmd.ifm_box.get_block().height
     ifm_width = cmd.ifm_box.get_block().width
     ifm_depth = get_ifm_depth(op.type.npu_block_type, cmd.ifm_box, cmd.ofm_box)
-
-    npu_op.ifm = create_feature_map(cmd.ifm_tensor, cmd.ifm_box, arch, ps.ifm_shapes[0], op.tile_base_offsets_ifm[0])
+    npu_op.ifm = create_feature_map(
+        cmd.ifm_tensor, cmd.ifm_box, arch, ps.ifm_shapes[0], op.tile_base_offsets_ifm[0], op.ifm_stride_multiplier[0]
+    )
     npu_op.ifm.shape = NpuShape3D(height=ifm_height, width=ifm_width, depth=ifm_depth)
     npu_op.ifm.quantization = get_ifm_or_ifm2_quantization(ps, cmd.ifm_tensor)
 
diff --git a/ethosu/vela/nn_graph.py b/ethosu/vela/nn_graph.py
index 3c87f9be..b9eee28b 100644
--- a/ethosu/vela/nn_graph.py
+++ b/ethosu/vela/nn_graph.py
@@ -253,7 +253,10 @@ class Subgraph:
             for tens in ps.inputs:
                 for op in tens.ops:
                     pred_pass = op.scheduled_pass
-                    assert pred_pass.time < ps.time
+                    # Pass with split concat ops may end up with a dependency to
+                    # itself since output from concat is produced by several avg pool ops.
+                    # Hence pred_pass can be equal to ps.
+                    assert pred_pass == ps or pred_pass.time < ps.time
                     if ps not in pred_pass.successors:
                         pred_pass.successors.append(ps)
 
diff --git a/ethosu/vela/operation.py b/ethosu/vela/operation.py
index a831537b..9a917f22 100644
--- a/ethosu/vela/operation.py
+++ b/ethosu/vela/operation.py
@@ -1,4 +1,4 @@
-# SPDX-FileCopyrightText: Copyright 2020-2023 Arm Limited and/or its affiliates <open-source-office@arm.com>
+# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates <open-source-office@arm.com>
 #
 # SPDX-License-Identifier: Apache-2.0
 #
@@ -511,6 +511,7 @@ class Operation:
         "tile_base_offsets_ifm",
         "tile_base_offsets_ofm",
         "ofm_stride_multiplier",
+        "ifm_stride_multiplier",
     )
 
     def __init__(self, op_type: Op, name: str):
@@ -554,8 +555,9 @@ class Operation:
         self.tile_base_offsets_ifm: List[List[int]] = [[0, 0, 0, 0], [0, 0, 0, 0]]
         # ofm (nhwc)
         self.tile_base_offsets_ofm: List[int] = [0, 0, 0, 0]
-        # For interleaved/sparse outputs - stride is multiplied with the stride factor of the corresponding axis
-        # Order is [C, H, W] - default is no multiplication
+        # Stride is multiplied with the ifm/ofm stride factor of the corresponding axis
+        # Order is [C, H, W]
+        self.ifm_stride_multiplier: List[List[int]] = [[1, 1, 1], [1, 1, 1]]
         self.ofm_stride_multiplier: List[int] = [1, 1, 1]
 
     def clone(self, suffix="_clone"):
@@ -585,6 +587,7 @@ class Operation:
         res.ifm_resampling_mode = self.ifm_resampling_mode
         res.tile_base_offsets_ifm = [_ifm.copy() for _ifm in self.tile_base_offsets_ifm]
         res.tile_base_offsets_ofm = self.tile_base_offsets_ofm.copy()
+        res.ifm_stride_multiplier = [_ifm.copy() for _ifm in self.ifm_stride_multiplier]
         res.ofm_stride_multiplier = self.ofm_stride_multiplier.copy()
 
         return res
@@ -763,6 +766,7 @@ class Operation:
         offset_start = None
         offset_end = None
         axis = None
+        strides_tens = None
         if self.type == Op.Split:
             num_splits = self.attrs.get("num_splits")
             axis_tens = self.inputs[0]
@@ -831,7 +835,7 @@ class Operation:
         else:
             assert False
 
-        return input_tens, outputs, axis, offset_start, offset_end
+        return input_tens, outputs, axis, offset_start, offset_end, strides_tens
 
     def set_activation_lut(self, lut_tensor):
         self.activation = ActivationFunction(Op.LUT)
diff --git a/ethosu/vela/pass_packing.py b/ethosu/vela/pass_packing.py
index 0de0341d..f157e67b 100644
--- a/ethosu/vela/pass_packing.py
+++ b/ethosu/vela/pass_packing.py
@@ -1,4 +1,4 @@
-# SPDX-FileCopyrightText: Copyright 2020-2023 Arm Limited and/or its affiliates <open-source-office@arm.com>
+# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates <open-source-office@arm.com>
 #
 # SPDX-License-Identifier: Apache-2.0
 #
@@ -524,6 +524,27 @@ def pack_into_passes(nng, arch, verbose_packing=False):
         # Sort ops by op_index (same call order as in the original graph)
         pass_list_top = sorted(pass_list_top, key=lambda ps: -1 if ps.ops[0].op_index is None else ps.ops[0].op_index)
 
+        # A concat is implemented by several AvgPool ops writing to the same ofm but with slice offset
+        # If there is a cpu op in between, group all AvgPool ops for a concat so that they run
+        # within the same cmd stream
+        last_idx = len(pass_list) - 1
+        for npu_ps in reversed(pass_list):
+            if npu_ps.placement == PassPlacement.Cpu or not npu_ps.ops[0].original_type.is_concat_op():
+                continue
+            # Concat pass found, search forward for the next avgpool op writing to the same ofm
+            idx = pass_list.index(npu_ps)
+            concat_is_split_between_npu_ops = False
+            for next_ps in pass_list[idx + 1 :]:
+                if next_ps.placement == PassPlacement.Cpu:
+                    concat_is_split_between_npu_ops = True
+                next_is_concat = next_ps.ops[0].original_type.is_concat_op()
+                if next_is_concat and next_ps.ops[0].ofm == npu_ps.ops[0].ofm and concat_is_split_between_npu_ops:
+                    # Avgpool writing to the same OFM and there is a cpu op between them, group them
+                    pass_list.remove(npu_ps)
+                    insert_index = pass_list.index(next_ps)
+                    pass_list.insert(insert_index, npu_ps)
+                    break
+
         # Sort the rest of the list based on critera 2.
         # Search from bottom of list and when a CPU pass is found
         # search forward in the list and see if it is possible to join another CPU pass.
diff --git a/ethosu/vela/test/test_tflite_supported_operators.py b/ethosu/vela/test/test_tflite_supported_operators.py
index e65717a8..3b15b318 100644
--- a/ethosu/vela/test/test_tflite_supported_operators.py
+++ b/ethosu/vela/test/test_tflite_supported_operators.py
@@ -1,4 +1,4 @@
-# SPDX-FileCopyrightText: Copyright 2020-2023 Arm Limited and/or its affiliates <open-source-office@arm.com>
+# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates <open-source-office@arm.com>
 #
 # SPDX-License-Identifier: Apache-2.0
 #
@@ -542,7 +542,9 @@ def create_strided_slice():
 def test_constraint_stridedslice_stride_values():
     # Unsupported strides
     op = create_strided_slice()
-    op.inputs[3].values = [1, 1, 2, 1]
+    op.inputs[3].values = [1, 2, 2, 1]
+    assert support.is_operator_supported(op)
+    op.inputs[3].values = [2, 1, 1, 1]
     assert not support.is_operator_supported(op)
 
 
diff --git a/ethosu/vela/tflite_graph_optimiser.py b/ethosu/vela/tflite_graph_optimiser.py
index 687e5d4f..ccbb1f28 100644
--- a/ethosu/vela/tflite_graph_optimiser.py
+++ b/ethosu/vela/tflite_graph_optimiser.py
@@ -141,7 +141,7 @@ def rewrite_split_ops(tens, arch, nng):
         if not split_op.run_on_npu:
             return tens
 
-        inp, outputs, axis, offset_start, offset_end = split_op.get_split_inputs_axis()
+        inp, outputs, axis, offset_start, offset_end, strides_tens = split_op.get_split_inputs_axis()
 
         tens.ops = []
         new_op = Operation(Op.SplitSliceRead, split_op.name)
@@ -150,8 +150,10 @@ def rewrite_split_ops(tens, arch, nng):
         if None in (offset_end, offset_start):
             read_shape = None
         else:
-            # the read shape is relative to each start offset
-            read_shape = Shape4D([oe - os for oe, os in zip(offset_end, offset_start)])
+            # The read shape is relative to each start offset
+            # Limit read shape to the size of the IFM - offset is not necessarily limited
+            ifm_dims = split_op.ifm_shapes[0].as_list()
+            read_shape = Shape4D([min(oe, ifm_dim) - os for oe, os, ifm_dim in zip(offset_end, offset_start, ifm_dims)])
 
         # For Split the offset cannot be extracted from the tensor so it has to
         # be calculated from the index of the output tensor
@@ -182,6 +184,9 @@ def rewrite_split_ops(tens, arch, nng):
         new_op.set_output_tensor(tens)
         new_op.ifm_shapes.append(Shape4D(inp.shape))
         new_op.ofm_shapes.append(split_op.ofm_shapes[ofm_shape_idx])
+        # Set stride multiplier in H/W if a stride tensor is provided
+        s_h, s_w = (strides_tens.values[-3], strides_tens.values[-2]) if strides_tens else (1, 1)
+        new_op.ifm_stride_multiplier[0] = [1, s_h, s_w]  # C/H/W
         DebugDatabase.add_optimised(split_op, new_op)
 
     return tens
@@ -193,18 +198,24 @@ def remove_SplitSliceRead(op, arch):
         # Check if it is possible to put the SplitSliceRead on the tensor consumer(s),
         # or if an avgpool need to be inserted
         # Not possible to move:
+        #   - if ifm stride multiplier is larger than one in any dimension
         #   - if consumer is a Transpose op since ifm shape has been reshaped and can not be changed
         #   - if consumer is elementwise and ifm needs to be broadcasted
-        if op.ofm_shapes[0] == Shape4D.from_list(op.ofm.shape) and all(
-            consumer is not None
-            and consumer.run_on_npu
-            and consumer.type not in memory_only_ops
-            and consumer.original_type != Op.Transpose
-            and check_splitsliceread_to_consumer_shape(op, consumer)
-            and not (
-                consumer.type.is_binary_elementwise_op() and Shape4D.from_list(consumer.ofm.shape) != op.ofm_shapes[0]
+        if (
+            op.ofm_shapes[0] == Shape4D.from_list(op.ofm.shape)
+            and all(s_mul == 1 for s_mul in op.ifm_stride_multiplier[0])
+            and all(
+                consumer is not None
+                and consumer.run_on_npu
+                and consumer.type not in memory_only_ops
+                and consumer.original_type != Op.Transpose
+                and check_splitsliceread_to_consumer_shape(op, consumer)
+                and not (
+                    consumer.type.is_binary_elementwise_op()
+                    and Shape4D.from_list(consumer.ofm.shape) != op.ofm_shapes[0]
+                )
+                for consumer in op.ofm.consumer_list
             )
-            for consumer in op.ofm.consumer_list
         ):
             # SplitSliceRead can be performed by tensor consumer(s)
             for cons_op in list(op.ofm.consumer_list):
@@ -219,6 +230,9 @@ def remove_SplitSliceRead(op, arch):
             avgpool_op.ofm_shapes.append(op.ofm_shapes[0])
             avgpool_op.read_offsets[0] = op.read_offsets[0]
             avgpool_op.read_shapes[0] = op.read_shapes[0]
+            if any(s_mul != 1 for s_mul in op.ifm_stride_multiplier[0]):
+                avgpool_op.ifm_stride_multiplier[0] = op.ifm_stride_multiplier[0].copy()
+                avgpool_op.ifm.force_linear_format = True
 
             op.ifm.consumer_list.remove(op)
             DebugDatabase.add_optimised(op, avgpool_op)
@@ -827,7 +841,7 @@ def convert_batched_fc_shape(op: Operation, arch, nng) -> Operation:
     if op.type == Op.FullyConnected:
         # Check if the first dimension indicates batching
         if op.ifm_shapes[0].batch > 1:
-            batching_split = {4: (2, 2), 8: (2, 4), 16: (4, 4)}
+            batching_split = {4: (2, 2), 6: (2, 3), 8: (2, 4), 9: (3, 3), 12: (3, 4), 16: (4, 4)}
             n = op.ifm_shapes[0].batch
             h, w = batching_split.get(n, (1, n))
             op.ifm_shapes[0] = Shape4D([1, h, w, op.ifm_shapes[0].depth])
@@ -840,6 +854,13 @@ def convert_batched_fc_shape(op: Operation, arch, nng) -> Operation:
             n = op.ofm_shapes[0].batch
             h, w = batching_split.get(n, (1, n))
             op.ofm_shapes[0] = Shape4D([1, h, w, op.ofm_shapes[0].depth])
+            if h == 1 and w > 4:
+                # If batch can not be found in the split set the weights are going to be
+                # read from memory several times. Convert op to conv2d since this
+                # enables weight buffering.
+                op.type = Op.Conv2DBias
+                op.attrs["padding"] = Padding.SAME
+                DebugDatabase.add_optimised(op, op)
     return op
 
 
diff --git a/ethosu/vela/tflite_supported_operators.py b/ethosu/vela/tflite_supported_operators.py
index 91a3ee83..b293a2ef 100644
--- a/ethosu/vela/tflite_supported_operators.py
+++ b/ethosu/vela/tflite_supported_operators.py
@@ -841,10 +841,11 @@ class TFLiteSupportedOperators:
 
     @staticmethod
     def constraint_stridedslice_stride_values(op):
-        "All Strides values must be 1"
+        "Batch and channel stride values must be 1"
         strides = op.inputs[3]
-        valid = all(stride == 1 for stride in strides.values)
-        return valid, f"Op has strides values {strides.values}"
+        s_c = strides.values[-1]
+        s_n = strides.values[0] if len(strides.values) > 3 else 1
+        return s_n == s_c == 1, f"Op has strides values {strides.values}"
 
     @staticmethod
     def constraint_stridedslice_offset_false(op):
diff --git a/ethosu/vela/tosa_graph_optimiser.py b/ethosu/vela/tosa_graph_optimiser.py
index 09b2c526..26d3dcad 100644
--- a/ethosu/vela/tosa_graph_optimiser.py
+++ b/ethosu/vela/tosa_graph_optimiser.py
@@ -247,7 +247,11 @@ def fix_sg_input_output_tosa(op, arch, nng):
         # consumed by CPU
 
         # Check if operator ifm/ofm are sg ifm/ofm
-        ifm_is_sg_ifm = op.ifm.ops[0].type in (Op.Placeholder, Op.SubgraphInput, Op.Const)
+        ifm_is_sg_ifm = op.ifm.ops[0].type in (
+            Op.Placeholder,
+            Op.SubgraphInput,
+            Op.Const,
+        )
         ifm_is_sg_ofm = any(ifm_cons is None for ifm_cons in op.ifm.consumer_list)
         ofm_is_sg_ofm = any(ofm_cons is None for ofm_cons in op.ofm.consumer_list)
         # Check if ifm/ofm is produced repectivly consumed by CPU
@@ -302,7 +306,13 @@ def remove_splitsliceread(op, arch):
         else:
             name = op.name + "_add"
             ofm = op.ofm
-            ifm2 = create_const_tensor(name + "_zero_scalar", [1], ofm.dtype, [0], quantization=ofm.quantization)
+            ifm2 = create_const_tensor(
+                name + "_zero_scalar",
+                [1],
+                ofm.dtype,
+                [0],
+                quantization=ofm.quantization,
+            )
             add_op = create_add_nop(name)
             add_op.inputs = [op.ifm, ifm2]
             add_op.outputs = [ofm]
@@ -330,7 +340,13 @@ def rewrite_concat(op):
         write_offset = [0, 0, 0, 0]
         write_offset[axis_4D] = offset
         concat_end = offset + op.ifm_shapes[idx][axis_4D]
-        create_add_for_concat(op, op.name + str(idx) + "_add", inp, op.ifm_shapes[idx], Shape4D.from_list(write_offset))
+        create_add_for_concat(
+            op,
+            op.name + str(idx) + "_add",
+            inp,
+            op.ifm_shapes[idx],
+            Shape4D.from_list(write_offset),
+        )
         offset = concat_end
     assert op.ofm_shapes[0][axis_4D] == offset
 
@@ -417,7 +433,10 @@ def rewrite_rescale(op, arch, nng):
                 DebugDatabase.add_optimised(op, prev_op)
                 return op
             else:
-                print("Warning, unsupported fusing of TOSA Rescale previous operator is of type:", prev_op.type)
+                print(
+                    "Warning, unsupported fusing of TOSA Rescale previous operator is of type:",
+                    prev_op.type,
+                )
                 assert False
         elif (
             (ifm.dtype == DataType.int8 and ofm.dtype == DataType.int8)
@@ -447,7 +466,7 @@ def rewrite_rescale(op, arch, nng):
                 for a in equal_attributes:
                     assert op.attrs[a] == rescale_1.attrs[a] == rescale_2.attrs[a], (
                         f"Only handling equal {a} for all operands "
-                        "({op.attrs[a]}, {rescale_1.attrs[a]}, {rescale_2.attrs[a]}) "
+                        f"({op.attrs[a]}, {rescale_1.attrs[a]}, {rescale_2.attrs[a]}) "
                         "for all the rescale operations to be fused with Add!"
                     )
 
@@ -486,7 +505,10 @@ def rewrite_rescale(op, arch, nng):
                 print("Warning, unsupported fusing of TOSA Rescale with Add.")
                 assert False
         else:
-            print("Warning, unsupported fusing of TOSA Rescale previous operator is of type:", prev_op.type)
+            print(
+                "Warning, unsupported fusing of TOSA Rescale previous operator is of type:",
+                prev_op.type,
+            )
             assert False
 
     return op
@@ -519,17 +541,31 @@ def convert_pad_in_width(op):
     if left > 0:
         shape = Shape4D(1, ifm_shape.height, left, ofm_shape.depth)
         zero_tens = create_const_tensor(
-            op.name + "_left", shape.as_list(), ofm.dtype, shape.elements() * [pad_value], quantization=quant
+            op.name + "_left",
+            shape.as_list(),
+            ofm.dtype,
+            shape.elements() * [pad_value],
+            quantization=quant,
         )
         zero_tens.equivalence_id = create_equivalence_id(tuple(zero_tens.values))
         create_add_for_concat(op, op.name + "_left", zero_tens, shape, shp0)
     if right > 0:
         shape = Shape4D(1, ifm_shape.height, right, ofm_shape.depth)
         zero_tens = create_const_tensor(
-            op.name + "_right", shape.as_list(), ofm.dtype, shape.elements() * [pad_value], quantization=quant
+            op.name + "_right",
+            shape.as_list(),
+            ofm.dtype,
+            shape.elements() * [pad_value],
+            quantization=quant,
         )
         zero_tens.equivalence_id = create_equivalence_id(tuple(zero_tens.values))
-        create_add_for_concat(op, op.name + "_right", zero_tens, shape, shp0.with_width(ofm_shape.width - right))
+        create_add_for_concat(
+            op,
+            op.name + "_right",
+            zero_tens,
+            shape,
+            shp0.with_width(ofm_shape.width - right),
+        )
 
     op.type = Op.ConcatTFLite
     return add_op
@@ -992,7 +1028,12 @@ def tosa_optimise_graph(nng, arch):
         )
 
     # Rewite Operators step
-    op_rewrite_list = [set_tensor_equivalence, rewrite_rescale, convert_depthwise_to_conv, convert_table_to_lut]
+    op_rewrite_list = [
+        set_tensor_equivalence,
+        rewrite_rescale,
+        convert_depthwise_to_conv,
+        convert_table_to_lut,
+    ]
 
     for idx, sg in enumerate(nng.subgraphs):
         nng.subgraphs[idx] = rewrite_graph.rewrite_graph_pre_order(
diff --git a/ethosu/vela/tosa_reader.py b/ethosu/vela/tosa_reader.py
index 6d80e10d..670b264a 100644
--- a/ethosu/vela/tosa_reader.py
+++ b/ethosu/vela/tosa_reader.py
@@ -189,7 +189,8 @@ class TosaSubgraph:
             elif op.type.is_conv2d_op():
                 inputs[1] = clone_and_reshape_tensor(inputs[1], (1, 2, 3, 0), False)
             elif op.type.is_depthwise_conv2d_op():
-                inputs[1] = clone_and_reshape_tensor(inputs[1], (1, 2, 0, 3), False)
+                HWCM_to_HWOI = (0, 1, 3, 2)
+                inputs[1] = clone_and_reshape_tensor(inputs[1], HWCM_to_HWOI, False)
             if op.type.needs_bias() and len(inputs) <= op_type.info.indices.biases[0]:
                 # No Bias tensor
                 inputs.append(None)
@@ -241,7 +242,14 @@ class TosaSubgraph:
                 if shift != 0:
                     op.explicit_scaling = ExplicitScaling(False, [shift], [1])
             if op.type.is_depthwise_conv2d_op():
-                op.attrs["depth_multiplier"] = op.weights.shape[3]
+                assert op.weights.shape[-1] % op.ifm.shape[-1] == 0
+                depth_multiplier = op.weights.shape[-1] / op.ifm.shape[-1]
+                if depth_multiplier > 1:
+                    assert op.ifm.shape[-1] == 1 and op.ofm.shape[-1] == depth_multiplier, (
+                        "For depth multipliers > 1, IFM channels must be 1 and "
+                        "OFM channels must be equal to the depth multiplier"
+                    )
+                op.attrs["depth_multiplier"] = depth_multiplier
             if op.type == Op.SplitSliceRead:
                 op.read_offsets[0] = Shape4D.from_list(list(op.attrs["start"]), 0)
                 op.read_shapes[0] = op.attrs["size"]