diff options
-rw-r--r-- | ethosu/vela/nn_graph.py | 5 |
-rw-r--r-- | ethosu/vela/tflite_graph_optimiser.py | 9 |
-rw-r--r-- | ethosu/vela/tosa_graph_optimiser.py | 61 |
-rw-r--r-- | ethosu/vela/tosa_reader.py | 12 |
4 files changed, 73 insertions(+), 14 deletions(-)
diff --git a/ethosu/vela/nn_graph.py b/ethosu/vela/nn_graph.py index 3c87f9b..b9eee28 100644 --- a/ethosu/vela/nn_graph.py +++ b/ethosu/vela/nn_graph.py @@ -253,7 +253,10 @@ class Subgraph: for tens in ps.inputs: for op in tens.ops: pred_pass = op.scheduled_pass - assert pred_pass.time < ps.time + # Pass with split concat ops may end up with a dependency to + # itself since output from concat is produced by several avg pool ops. + # Hence pred_pass can be equal to ps. + assert pred_pass == ps or pred_pass.time < ps.time if ps not in pred_pass.successors: pred_pass.successors.append(ps) diff --git a/ethosu/vela/tflite_graph_optimiser.py b/ethosu/vela/tflite_graph_optimiser.py index 687e5d4..3af8588 100644 --- a/ethosu/vela/tflite_graph_optimiser.py +++ b/ethosu/vela/tflite_graph_optimiser.py @@ -827,7 +827,7 @@ def convert_batched_fc_shape(op: Operation, arch, nng) -> Operation: if op.type == Op.FullyConnected: # Check if the first dimension indicates batching if op.ifm_shapes[0].batch > 1: - batching_split = {4: (2, 2), 8: (2, 4), 16: (4, 4)} + batching_split = {4: (2, 2), 6: (2, 3), 8: (2, 4), 9: (3, 3), 12: (3, 4), 16: (4, 4)} n = op.ifm_shapes[0].batch h, w = batching_split.get(n, (1, n)) op.ifm_shapes[0] = Shape4D([1, h, w, op.ifm_shapes[0].depth]) @@ -840,6 +840,13 @@ def convert_batched_fc_shape(op: Operation, arch, nng) -> Operation: n = op.ofm_shapes[0].batch h, w = batching_split.get(n, (1, n)) op.ofm_shapes[0] = Shape4D([1, h, w, op.ofm_shapes[0].depth]) + if h == 1 and w > 4: + # If batch can not be found in the split set the weights are going to be + # read from memory several times. Convert op to conv2d since this + # enables weight buffering. 
+ op.type = Op.Conv2DBias + op.attrs["padding"] = Padding.SAME + DebugDatabase.add_optimised(op, op) return op diff --git a/ethosu/vela/tosa_graph_optimiser.py b/ethosu/vela/tosa_graph_optimiser.py index 09b2c52..26d3dca 100644 --- a/ethosu/vela/tosa_graph_optimiser.py +++ b/ethosu/vela/tosa_graph_optimiser.py @@ -247,7 +247,11 @@ def fix_sg_input_output_tosa(op, arch, nng): # consumed by CPU # Check if operator ifm/ofm are sg ifm/ofm - ifm_is_sg_ifm = op.ifm.ops[0].type in (Op.Placeholder, Op.SubgraphInput, Op.Const) + ifm_is_sg_ifm = op.ifm.ops[0].type in ( + Op.Placeholder, + Op.SubgraphInput, + Op.Const, + ) ifm_is_sg_ofm = any(ifm_cons is None for ifm_cons in op.ifm.consumer_list) ofm_is_sg_ofm = any(ofm_cons is None for ofm_cons in op.ofm.consumer_list) # Check if ifm/ofm is produced repectivly consumed by CPU @@ -302,7 +306,13 @@ def remove_splitsliceread(op, arch): else: name = op.name + "_add" ofm = op.ofm - ifm2 = create_const_tensor(name + "_zero_scalar", [1], ofm.dtype, [0], quantization=ofm.quantization) + ifm2 = create_const_tensor( + name + "_zero_scalar", + [1], + ofm.dtype, + [0], + quantization=ofm.quantization, + ) add_op = create_add_nop(name) add_op.inputs = [op.ifm, ifm2] add_op.outputs = [ofm] @@ -330,7 +340,13 @@ def rewrite_concat(op): write_offset = [0, 0, 0, 0] write_offset[axis_4D] = offset concat_end = offset + op.ifm_shapes[idx][axis_4D] - create_add_for_concat(op, op.name + str(idx) + "_add", inp, op.ifm_shapes[idx], Shape4D.from_list(write_offset)) + create_add_for_concat( + op, + op.name + str(idx) + "_add", + inp, + op.ifm_shapes[idx], + Shape4D.from_list(write_offset), + ) offset = concat_end assert op.ofm_shapes[0][axis_4D] == offset @@ -417,7 +433,10 @@ def rewrite_rescale(op, arch, nng): DebugDatabase.add_optimised(op, prev_op) return op else: - print("Warning, unsupported fusing of TOSA Rescale previous operator is of type:", prev_op.type) + print( + "Warning, unsupported fusing of TOSA Rescale previous operator is of type:", 
+ prev_op.type, + ) assert False elif ( (ifm.dtype == DataType.int8 and ofm.dtype == DataType.int8) @@ -447,7 +466,7 @@ def rewrite_rescale(op, arch, nng): for a in equal_attributes: assert op.attrs[a] == rescale_1.attrs[a] == rescale_2.attrs[a], ( f"Only handling equal {a} for all operands " - "({op.attrs[a]}, {rescale_1.attrs[a]}, {rescale_2.attrs[a]}) " + f"({op.attrs[a]}, {rescale_1.attrs[a]}, {rescale_2.attrs[a]}) " "for all the rescale operations to be fused with Add!" ) @@ -486,7 +505,10 @@ def rewrite_rescale(op, arch, nng): print("Warning, unsupported fusing of TOSA Rescale with Add.") assert False else: - print("Warning, unsupported fusing of TOSA Rescale previous operator is of type:", prev_op.type) + print( + "Warning, unsupported fusing of TOSA Rescale previous operator is of type:", + prev_op.type, + ) assert False return op @@ -519,17 +541,31 @@ def convert_pad_in_width(op): if left > 0: shape = Shape4D(1, ifm_shape.height, left, ofm_shape.depth) zero_tens = create_const_tensor( - op.name + "_left", shape.as_list(), ofm.dtype, shape.elements() * [pad_value], quantization=quant + op.name + "_left", + shape.as_list(), + ofm.dtype, + shape.elements() * [pad_value], + quantization=quant, ) zero_tens.equivalence_id = create_equivalence_id(tuple(zero_tens.values)) create_add_for_concat(op, op.name + "_left", zero_tens, shape, shp0) if right > 0: shape = Shape4D(1, ifm_shape.height, right, ofm_shape.depth) zero_tens = create_const_tensor( - op.name + "_right", shape.as_list(), ofm.dtype, shape.elements() * [pad_value], quantization=quant + op.name + "_right", + shape.as_list(), + ofm.dtype, + shape.elements() * [pad_value], + quantization=quant, ) zero_tens.equivalence_id = create_equivalence_id(tuple(zero_tens.values)) - create_add_for_concat(op, op.name + "_right", zero_tens, shape, shp0.with_width(ofm_shape.width - right)) + create_add_for_concat( + op, + op.name + "_right", + zero_tens, + shape, + shp0.with_width(ofm_shape.width - right), + ) op.type = 
Op.ConcatTFLite return add_op @@ -992,7 +1028,12 @@ def tosa_optimise_graph(nng, arch): ) # Rewite Operators step - op_rewrite_list = [set_tensor_equivalence, rewrite_rescale, convert_depthwise_to_conv, convert_table_to_lut] + op_rewrite_list = [ + set_tensor_equivalence, + rewrite_rescale, + convert_depthwise_to_conv, + convert_table_to_lut, + ] for idx, sg in enumerate(nng.subgraphs): nng.subgraphs[idx] = rewrite_graph.rewrite_graph_pre_order( diff --git a/ethosu/vela/tosa_reader.py b/ethosu/vela/tosa_reader.py index 6d80e10..670b264 100644 --- a/ethosu/vela/tosa_reader.py +++ b/ethosu/vela/tosa_reader.py @@ -189,7 +189,8 @@ class TosaSubgraph: elif op.type.is_conv2d_op(): inputs[1] = clone_and_reshape_tensor(inputs[1], (1, 2, 3, 0), False) elif op.type.is_depthwise_conv2d_op(): - inputs[1] = clone_and_reshape_tensor(inputs[1], (1, 2, 0, 3), False) + HWCM_to_HWOI = (0, 1, 3, 2) + inputs[1] = clone_and_reshape_tensor(inputs[1], HWCM_to_HWOI, False) if op.type.needs_bias() and len(inputs) <= op_type.info.indices.biases[0]: # No Bias tensor inputs.append(None) @@ -241,7 +242,14 @@ class TosaSubgraph: if shift != 0: op.explicit_scaling = ExplicitScaling(False, [shift], [1]) if op.type.is_depthwise_conv2d_op(): - op.attrs["depth_multiplier"] = op.weights.shape[3] + assert op.weights.shape[-1] % op.ifm.shape[-1] == 0 + depth_multiplier = op.weights.shape[-1] / op.ifm.shape[-1] + if depth_multiplier > 1: + assert op.ifm.shape[-1] == 1 and op.ofm.shape[-1] == depth_multiplier, ( + "For depth multipliers > 1, IFM channels must be 1 and " + "OFM channels must be equal to the depth multiplier" + ) + op.attrs["depth_multiplier"] = depth_multiplier if op.type == Op.SplitSliceRead: op.read_offsets[0] = Shape4D.from_list(list(op.attrs["start"]), 0) op.read_shapes[0] = op.attrs["size"] |