Diffstat (limited to 'ethosu/vela/graph_optimiser.py')
-rw-r--r--	ethosu/vela/graph_optimiser.py	580
1 file changed, 279 insertions, 301 deletions
diff --git a/ethosu/vela/graph_optimiser.py b/ethosu/vela/graph_optimiser.py
index 5f111786..bb5a9e03 100644
--- a/ethosu/vela/graph_optimiser.py
+++ b/ethosu/vela/graph_optimiser.py
@@ -28,6 +28,7 @@ from . import scaling
 from .data_type import DataType
 from .debug_database import DebugDatabase
 from .errors import UnsupportedFeatureError
+from .errors import VelaError
 from .ethos_u55_regs.ethos_u55_regs import resampling_mode
 from .numeric_util import clamp_sigmoid
 from .numeric_util import full_shape
@@ -42,7 +43,6 @@ from .shape4d import Shape4D
 from .softmax import SoftMax
 from .tensor import check_quantized_tens_scaling_equal
 from .tensor import create_const_tensor
-from .tensor import create_reshape_tensor
 from .tensor import QuantizationParameters
 from .tensor import Tensor
 from .tflite_mapping import optype_to_builtintype
@@ -59,52 +59,68 @@ def remove_passthrough_tensor(tens, arch, nng):
     return tens
 
 
-def rewrite_concat(tens, arch, nng):
-    if len(tens.ops) == 1 and tens.ops[0].type.is_concat_op():
-        concat_op = tens.ops[0]
-        if tens != concat_op.outputs[0]:
-            return tens  # don't attempt to rewrite the min/max outputs of QuantizedConcat
+def rewrite_concat_ops(op, arch, nng):
+    if not op.run_on_npu or not op.type.is_concat_op():
+        return op
 
-        # Not supported so leave it and run on CPU
-        if not concat_op.run_on_npu:
-            return tens
+    axis_4D = 0
+    ofm = op.ofm
+    ofm.ops = []
+    offset = 0
 
-        inputs, axis = concat_op.get_concat_inputs_axis()
+    if op.type == Op.Pack:
+        # Pack is also referred to as Stack
+        axis = int(op.attrs["axis"])
+        desired_shape = op.inputs[0].shape[:axis] + [1] + op.inputs[0].shape[axis:]
 
-        tens.ops = []
-        offset = 0
-        for idx, inp in enumerate(inputs):
+        if axis >= 0:
+            axis_4D = axis + (4 - len(desired_shape))
+        else:
+            axis_4D = axis
+
+        for idx, inp in enumerate(op.inputs):
+            op.ifm_shapes[idx] = Shape4D(desired_shape)
+            if Shape4D(inp.shape) != op.ifm_shapes[idx]:
+                inp.avoid_NHCWB16 = True
+        op.type = Op.PackReshaped
+
+    inputs, axis = op.get_concat_inputs_axis()
+
+    for idx, inp in enumerate(inputs):
+        if op.type != Op.PackReshaped:
+            op.ifm_shapes[idx] = Shape4D(inp.shape)
             if axis >= 0:
                 axis_4D = axis + (4 - len(inp.shape))
             else:
                 axis_4D = axis
-            new_op = Operation(Op.ConcatSliceWrite, concat_op.name + str(idx))
-            new_op.inputs = [inp]
-            new_op.outputs = [tens]
-            new_op.attrs["concat_axis"] = axis_4D
-            new_op.attrs["concat_start"] = offset
-            offset += inp.shape[axis]
-            new_op.attrs["concat_end"] = offset
-            new_op.run_on_npu = True
-            tens.ops.append(new_op)
-            DebugDatabase.add_optimised(concat_op, new_op)
-            new_op.set_ifm_ofm_shapes()
-        assert tens.shape[axis] == offset
-
-        # If axis corresponds to C-dimension, NHCWB16 can only be used in the output if all the concat_start's are a
-        # multiple of 16. This as, it is only then the address offset for the ofm, for all operations, will be 16 byte
-        # aligned. For other values of axis the address offsets will be 16 byte aligned, as they are all based on c = 0
-        # and those addresses are always 16 byte aligned due to the NHCWB16 format.
-        if axis == -1 or axis == (len(tens.shape) - 1):
-            for op in tens.ops:
-                if op.attrs["concat_start"] % 16 != 0:
-                    tens.avoid_NHCWB16 = True
-                    break
+        new_op = Operation(Op.ConcatSliceWrite, op.name + str(idx))
+        new_op.inputs = [inp]
+        new_op.outputs = [ofm]
+        new_op.attrs["concat_axis"] = axis_4D
+        new_op.attrs["concat_start"] = offset
+        offset += op.ifm_shapes[idx].get_dim(axis_4D)
 
-    return tens
+        new_op.attrs["concat_end"] = offset
+        new_op.run_on_npu = True
+        ofm.ops.append(new_op)
+        DebugDatabase.add_optimised(op, new_op)
+        new_op.ifm_shapes.append(op.ifm_shapes[idx].clone())
+        new_op.ofm_shapes.append(op.ofm_shapes[0].clone())
+    assert ofm.shape[axis] == offset
+
+    # If axis corresponds to C-dimension, NHCWB16 can only be used in the output if all the concat_start's are a
+    # multiple of 16. This as, it is only then the address offset for the ofm, for all operations, will be 16 byte
+    # aligned. For other values of axis the address offsets will be 16 byte aligned, as they are all based on c = 0
+    # and those addresses are always 16 byte aligned due to the NHCWB16 format.
+    if axis == -1 or axis == (len(ofm.shape) - 1):
+        for op in ofm.ops:
+            if op.attrs["concat_start"] % 16 != 0:
+                ofm.avoid_NHCWB16 = True
+                break
+    return op
 
 
-def rewrite_split(tens, arch, nng):
+def rewrite_split_ops(tens, arch, nng):
 
     if len(tens.ops) == 1 and tens.ops[0].type.is_split_op() and tens.ops[0].type != Op.Unpack:
         split_op = tens.ops[0]
@@ -118,20 +134,27 @@ def rewrite_split(tens, arch, nng):
         tens.ops = []
         new_op = Operation(Op.SplitSliceRead, split_op.name)
         new_op.inputs = [inp]
+        ofm_shape_idx = 0
 
         # For Split the offset cannot be extracted from the tensor so it has to
         # be calculated from the index of the output tensor
         if axis is not None:
             # Get the start and end of the split
             offset_start = [0] * 4
+            axis_4D_list = split_op.attrs.get("split_axis_4D", None)  # Present for UnpackReshaped and some StridedSlice
             for idx, out in enumerate(outputs):
-                split_op.ofm_shapes[idx] = Shape4D(out.shape)
+                if axis_4D_list is not None:
+                    axis_4D = axis_4D_list[idx]
+                else:
+                    split_op.ofm_shapes[idx] = Shape4D(out.shape)
+                    if axis >= 0:
+                        axis_4D = axis + (4 - len(out.shape))
+                    else:
+                        axis_4D = axis
+
                 if out == tens:
+                    ofm_shape_idx = idx
                     break
-                if axis >= 0:
-                    axis_4D = axis + (4 - len(out.shape))
-                else:
-                    axis_4D = axis
 
                 offset_start[axis_4D] += split_op.ofm_shapes[idx].get_dim(axis_4D)
 
@@ -145,7 +168,7 @@ def rewrite_split(tens, arch, nng):
             new_op.run_on_npu = True
             new_op.set_output_tensor(tens)
             new_op.ifm_shapes.append(Shape4D(inp.shape))
-            new_op.ofm_shapes.append(Shape4D(full_shape(4, tens.shape, 1)))
+            new_op.ofm_shapes.append(split_op.ofm_shapes[ofm_shape_idx].clone())
             DebugDatabase.add_optimised(split_op, new_op)
 
     return tens
@@ -158,9 +181,9 @@ def needed_total_padding(input_size, stride, filter_size):
     return total_padding
 
 
-def calc_padding_and_skirt(padding_type, kernel_size, stride, input_dims, explicit_padding):
-    ypad = needed_total_padding(int(input_dims[1]), int(stride[1]), int(kernel_size[0]))
-    xpad = needed_total_padding(int(input_dims[2]), int(stride[2]), int(kernel_size[1]))
+def calc_padding_and_skirt(padding_type, kernel_size, stride, input_shape, explicit_padding):
+    ypad = needed_total_padding(int(input_shape.height), int(stride[1]), int(kernel_size[0]))
+    xpad = needed_total_padding(int(input_shape.width), int(stride[2]), int(kernel_size[1]))
     if padding_type == Padding.SAME:
         left_pad = (xpad + 0) // 2
         right_pad = (xpad + 1) // 2
@@ -184,11 +207,11 @@ def calc_upscaled_padding_and_skirt(padding_type, kernel_size, stride, input_dims, explic
     return padding, skirt
 
 
-def calc_upscaled_padding_and_skirt(padding_type, kernel_size, stride, input_dims, upscaling_factor):
+def calc_upscaled_padding_and_skirt(padding_type, kernel_size, stride, input_shape, upscaling_factor):
     kernel_height, kernel_width = kernel_size[0], kernel_size[1]
     if padding_type == Padding.SAME:
-        ypad = needed_total_padding(int(input_dims[1]) * upscaling_factor, int(stride[1]), int(kernel_height))
-        xpad = needed_total_padding(int(input_dims[2]) * upscaling_factor, int(stride[2]), int(kernel_width))
+        ypad = needed_total_padding(int(input_shape.height) * upscaling_factor, int(stride[1]), int(kernel_height))
+        xpad = needed_total_padding(int(input_shape.width) * upscaling_factor, int(stride[2]), int(kernel_width))
         right_pad = max(((xpad + 1) // upscaling_factor) - 1, 0)
         bottom_pad = max(((ypad + 1) // upscaling_factor) - 1, 0)
         left_pad = max(kernel_width - 1 - right_pad, 0)
@@ -225,7 +248,7 @@ def convert_resizebilinear_1x1_to_add(op):
     op.name = op.name + "_add"
     op.attrs["resizebilinear"] = True
     # Create an input tensor filled with zeros
-    shape = op.outputs[0].shape
+    shape = op.ofm_shapes[0].as_list()
     tens = Tensor(shape, op.inputs[0].dtype, op.inputs[1].name + "_add")
     tens.values = np.zeros(shape)
     tens.quant_values = np.zeros(shape, np.uint8)
@@ -258,8 +281,8 @@ def convert_resizebilinear_to_2x2_pool(op):
     op.attrs["padding"] = Padding.SAME
     op.inputs[0].resampling_mode = resampling_mode.NEAREST
 
-    upscaled_shape = np.array(op.inputs[0].shape[1:3])
-    out_shape = np.array(op.outputs[0].shape[1:3])
+    upscaled_shape = op.ifm_shapes[0].get_hw_as_list()
+    out_shape = op.ofm_shapes[0].get_hw_as_list()
 
     if (upscaled_shape == upscaled_shape * 2 - shape_modifier).all():
         return op
@@ -276,8 +299,8 @@ def convert_resizebilinear_to_2x2_pool(op):
             scaled_op.outputs = outputs
             scaled_op.outputs[0].ops = [scaled_op]
         else:
-            shape = outputs[0].shape.copy()
-            shape[1:3] = upscaled_shape[0:2]
+            shape = op.ofm_shapes[0].as_list()
+            shape[1:3] = upscaled_shape
             out_tens = Tensor(shape, DataType.int16, "{}_{}".format(op.outputs[0].name, count))
             out_tens.quantization = op.outputs[0].quantization.clone()
             out_tens.quantization.quant_min = np.iinfo(np.int16).min
@@ -300,11 +323,11 @@ def convert_resizebilinear_to_2x2_pool(op):
 
 def fixup_resizebilinear(op, arch, nng):
     if op.type == Op.ResizeBilinear and op.run_on_npu:
-        if op.inputs[0].shape == op.outputs[0].shape:
+        if op.ifm_shapes[0] == op.ofm_shapes[0]:
             # Bypass nop resizebilinear
             op.inputs = op.inputs[:1]
             op.type = Op.Identity
-        elif op.inputs[0].shape[1] == 1 and op.inputs[0].shape[2] == 1:
+        elif op.ifm_shapes[0].height == 1 and op.ifm_shapes[0].width == 1:
             convert_resizebilinear_1x1_to_add(op)
         else:
             convert_resizebilinear_to_2x2_pool(op)
@@ -321,109 +344,26 @@ def convert_nop_split_to_identity(op, arch, nng):
     return op
 
 
-def fixup_fully_connected_input(op, arch, nng):
-    if op.type == Op.FullyConnected:
-        inp = op.inputs[0]
-        weights = op.inputs[1]
-
-        n_in_elems = weights.shape[-2]
-        elms = inp.elements()
-        batch_size = elms // n_in_elems
-        assert batch_size * n_in_elems == elms
-
-        desired_shape = [batch_size, n_in_elems]
-        if inp.shape != desired_shape:
-            # mismatch, insert a reshape to fix this.
-            op.set_input_tensor(create_reshape_tensor(inp, desired_shape), 0)
-
-    return op
-
-
 def convert_batched_fc_shape(op, arch, nng):
     if op.type == Op.FullyConnected:
-        ifm = op.inputs[0]
-        ofm = op.outputs[0]
-        # Check if the FC is 2D and first dimension indicates batching
-        # TOD0 op.ifm_shape[0] > 1 is enough when refactory is complete
-        if len(ifm.shape) == len(ofm.shape) == 2 and ifm.shape[0] > 1 and op.ifm_shapes[0].batch > 1:
-            n = ifm.shape[0]
+        # Check if the first dimension indicates batching
+        if op.ifm_shapes[0].batch > 1:
             batching_split = {4: (2, 2), 8: (2, 4), 16: (4, 4)}
+            n = op.ifm_shapes[0].batch
             h, w = batching_split.get(n, (1, n))
+            op.ifm_shapes[0] = Shape4D([1, h, w, op.ifm_shapes[0].depth])
 
-            prev_op = ifm.ops[0]
-            desired_shape = [1, h, w, ifm.shape[-1]]
-            op.ifm_shapes[0] = Shape4D(desired_shape)
-
-            if len(ifm.consumer_list) == 1 and prev_op is not None and prev_op.type == Op.Reshape:
-                # There is a preceding Reshape
-                # Compare input of prev_op and input of op, to see if prev_op can be removed
-                ifm_prev_op = prev_op.inputs[0]
-                if ifm_prev_op.shape == ifm.shape and check_quantized_tens_scaling_equal(ifm_prev_op, ifm):
-                    # prev_op can be removed
-                    op.set_input_tensor(ifm_prev_op, 0)
-                else:
-                    op.inputs[0].set_all_shapes(desired_shape)
-                    prev_op.set_input_tensor(
-                        create_const_tensor(prev_op.inputs[1].name, [1], DataType.int32, desired_shape), 1
-                    )
-                    prev_op.attrs["new_shape"] = desired_shape
-            else:
-                # Add reshape op to the input if there is no preceding reshape
-                ifm.consumer_list.remove(op)
-                op.set_input_tensor(create_reshape_tensor(ifm, desired_shape), 0)
+            op.ifm.avoid_NHCWB16 = True
 
             # Reshape Weights to be 4D. IO becomes HWIO
             weight_tensor = op.inputs[1]
             weight_tensor.quant_values = np.expand_dims(np.expand_dims(weight_tensor.quant_values, axis=0), axis=0)
             weight_tensor.set_all_shapes(list(weight_tensor.quant_values.shape))
 
-            desired_shape = [1, h, w, ofm.shape[-1]]
-            op.ofm_shapes[0] = Shape4D(desired_shape)
-
-            if (
-                len(ofm.consumer_list) == 1
-                and ofm.consumer_list[0] is not None
-                and ofm.consumer_list[0].type == Op.Reshape
-            ):
-                # There is a subsequent Reshape
-                # Compare desired shape and output of consumer op, to see if consumer op can be removed
-                ofm_cons_op = ofm.consumer_list[0].outputs[0]
-                if desired_shape == ofm_cons_op.shape and check_quantized_tens_scaling_equal(ofm, ofm_cons_op):
-                    op.outputs[0] = ofm_cons_op
-                    op.outputs[0].ops = [op]
-                else:
-                    op.outputs[0].set_all_shapes(desired_shape)
-            else:
-                # Add reshape op to the output
-                op.set_output_tensor(create_reshape_tensor(ofm, desired_shape, False))
-    return op
-
-
-def fixup_pack_input(op, arch, nng):
-    if op.type == Op.Pack:
-        # Pack is also referred to as Stack
-        # Requires the rewrite_concat function to be called on the op afterwards
-        axis = int(op.attrs["axis"])
-        desired_shape = op.inputs[0].shape[:axis] + [1] + op.inputs[0].shape[axis:]
-
-        # Construct 1 shape tensor to be used by all inserted reshape ops
-        new_shape_tens = create_const_tensor(op.name + "_reshape_shape", [1], DataType.int32, desired_shape)
-
-        for idx, inp in enumerate(op.inputs):
-            reshape_out = inp.clone("_reshaped")
-            reshape_out.set_all_shapes(desired_shape)
-
-            reshape_op = Operation(Op.Reshape, "{}{}_reshape".format(op.name, idx))
-            reshape_op.attrs["new_shape"] = desired_shape
-            reshape_op.inputs = [inp, new_shape_tens]
-            reshape_op.set_output_tensor(reshape_out)
-            reshape_op.set_ifm_ofm_shapes()
-            DebugDatabase.add_optimised(op, reshape_op)
-
-            op.inputs[idx] = reshape_out
-
-        op.type = Op.PackReshaped
-
+            n = op.ofm_shapes[0].batch
+            h, w = batching_split.get(n, (1, n))
+            op.ofm_shapes[0] = Shape4D([1, h, w, op.ofm_shapes[0].depth])
+            op.ofm.avoid_NHCWB16 = True
     return op
 
 
@@ -441,12 +381,19 @@ def unfuse_activation_function(op, arch, nng):
     return op
 
 
-def fixup_stridedslice_output(tens, arch, nng):
-    op = tens.ops[0]
-    if op.run_on_npu and op.type == Op.StridedSlice:
-        reshape_input_shape = tens.shape
-        new_axis_mask = op.attrs["new_axis_mask"]
-        shrink_axis_mask = op.attrs["shrink_axis_mask"]
+def rewrite_stridedslice_output(op, arch, nng):
+    if not op.run_on_npu or op.type != Op.StridedSlice:
+        return op
+
+    new_axis_mask = op.attrs["new_axis_mask"]
+    shrink_axis_mask = op.attrs["shrink_axis_mask"]
+
+    if shrink_axis_mask == 0 and new_axis_mask == 0:
+        return op
+
+    axis_4D = [0] * len(op.outputs)
+    for idx, out_tens in enumerate(op.outputs):
+        output_shape = list(out_tens.shape)
 
         if shrink_axis_mask != 0:
             n = 0
@@ -456,10 +403,16 @@ def fixup_stridedslice_output(tens, arch, nng):
                 n += 1
                 shrink_axis_mask &= shrink_axis_mask - 1
                 axis = int(math.log2(prev_mask - shrink_axis_mask))
-                reshape_input_shape = reshape_input_shape[:axis] + [1] + reshape_input_shape[axis:]
+                output_shape = output_shape[:axis] + [1] + output_shape[axis:]
 
-            assert len(tens.shape) == (len(op.inputs[0].shape) - n)
+            assert len(out_tens.shape) == (len(op.inputs[0].shape) - n)
             op.attrs["shrink_axis_mask"] = 0
+            if axis >= 0:
+                axis_4D[idx] = axis + (4 - len(output_shape))
+            else:
+                axis_4D[idx] = axis
+            op.ofm_shapes[idx] = Shape4D(output_shape)
+
         elif new_axis_mask != 0:
             n = 0
             axis = 0
@@ -468,77 +421,62 @@ def fixup_stridedslice_output(tens, arch, nng):
                 n += 1
                 new_axis_mask &= new_axis_mask - 1
                 axis = int(math.log2(prev_mask - new_axis_mask))
-                reshape_input_shape = reshape_input_shape[:axis] + reshape_input_shape[(axis + 1) :]
+                output_shape = output_shape[:axis] + output_shape[(axis + 1) :]
                 new_axis_mask >>= 1
 
-            assert len(tens.shape) == (len(op.inputs[0].shape) + n)
+            assert len(out_tens.shape) == (len(op.inputs[0].shape) + n)
             op.attrs["new_axis_mask"] = 0
-    else:
-        # Equal Rank StridedSlice, no need to insert reshape
-        return tens
-
-    # Construct 1 shape tensor to be used by all inserted reshape ops
-    new_shape_tens = create_const_tensor(op.name + "_reshape_shape", [1], DataType.int32, tens.shape)
-
-    for idx, out_tens in enumerate(op.outputs):
-        op.ofm_shapes[idx] = Shape4D(new_shape_tens.shape)
-        reshape_in = out_tens.clone("_reshaped")
-        reshape_in.set_all_shapes(reshape_input_shape)
-        reshape_in.ops = [op]
-
-        reshape_op = Operation(Op.Reshape, "{}{}_reshape".format(op.name, idx))
-        reshape_op.attrs["new_shape"] = reshape_input_shape
-        reshape_op.inputs = [reshape_in, new_shape_tens]
-        reshape_op.set_output_tensor(out_tens)
-        reshape_op.set_ifm_ofm_shapes()
+            if axis >= 0:
+                axis_4D[idx] = axis + (4 - len(output_shape))
+            else:
+                axis_4D[idx] = axis
+            op.ofm_shapes[idx] = Shape4D(output_shape)
 
-        op.outputs[idx] = reshape_in
+        if op.ofm_shapes[idx] != Shape4D(out_tens.shape):
+            out_tens.avoid_NHCWB16 = True
 
-    return tens
+    op.attrs["split_axis_4D"] = axis_4D
+    return op
 
 
-def fixup_unpack_output(tens, arch, nng):
-    op = tens.ops[0]
+def rewrite_unpack_output(op, arch, nng):
+    tens = op.outputs[0]
     if op.run_on_npu and op.type == Op.Unpack:
         # Unpack is also referred to as Unstack
-        # Requires the rewrite_split function to be called on the op afterwards
         axis = int(op.attrs["axis"])
         op.type = Op.UnpackReshaped
-        reshape_input_shape = tens.shape[:axis] + [1] + tens.shape[axis:]
+        desired_output_shape = tens.shape[:axis] + [1] + tens.shape[axis:]
 
-        # Construct 1 shape tensor to be used by all inserted reshape ops
-        new_shape_tens = create_const_tensor(op.name + "_reshape_shape", [1], DataType.int32, tens.shape)
+        if axis >= 0:
+            axis_4D = axis + (4 - len(desired_output_shape))
+        else:
+            axis_4D = axis
 
+        axis_4D_list = [0] * len(op.outputs)
         for idx, out_tens in enumerate(op.outputs):
-            reshape_in = out_tens.clone("_reshaped")
-            reshape_in.set_all_shapes(reshape_input_shape)
-            reshape_in.ops = [op]
-
-            reshape_op = Operation(Op.Reshape, "{}{}_reshape".format(op.name, idx))
-            reshape_op.attrs["new_shape"] = reshape_input_shape
-            reshape_op.inputs = [reshape_in, new_shape_tens]
-            reshape_op.set_output_tensor(out_tens)
-            reshape_op.set_ifm_ofm_shapes()
-            DebugDatabase.add_optimised(op, reshape_op)
-
-            op.outputs[idx] = reshape_in
-    return tens
+            op.ofm_shapes[idx] = Shape4D(desired_output_shape)
+            axis_4D_list[idx] = axis_4D
+            if op.ofm_shapes[idx] != Shape4D(out_tens.shape):
+                out_tens.avoid_NHCWB16 = True
+
+        op.attrs["split_axis_4D"] = axis_4D_list
+    return op
 
 
 def add_padding_fields(op, arch, nng):
     if op.run_on_npu:
         if "padding" in op.attrs:
+            input_shape = op.ifm_shapes[0]
+            output_shape = op.ofm_shapes[0]
             if op.type.is_conv2d_op() or op.type.is_depthwise_conv2d_op():
                 kernel_size = op.inputs[1].shape[:2]
-                input_shape = op.inputs[0].shape
             elif op.type.is_pool_op() or op.type.npu_block_type == NpuBlockType.ReduceSum:
                 kernel_size = op.attrs["ksize"][1:3]
-                input_shape = op.inputs[0].shape
             else:
                 raise UnsupportedFeatureError(f"Unknown operation that uses padding: {optype_to_builtintype(op.type)}")
 
             if op.type == Op.Conv2DBackpropInputSwitchedBias:
-                upscaling_factor = op.outputs[0].shape[1] // input_shape[1]
+                upscaling_factor = output_shape.height // input_shape.height
                 padding, skirt = calc_upscaled_padding_and_skirt(
                     op.attrs["padding"], kernel_size, op.attrs["strides"], input_shape, upscaling_factor
                 )
@@ -582,10 +520,10 @@ def convert_depthwise_to_conv(op, arch, nng):
     # switch of the operator type (and weight order)
     if op.type == Op.DepthwiseConv2DBias and (op.attrs["depth_multiplier"] != 1):
-        ifm_tensor = op.inputs[0]
+        ifm_shape = op.ifm_shapes[0]
         weight_tensor = op.inputs[1]
-        ofm_tensor = op.outputs[0]
-        if (ifm_tensor.shape[3] == 1) and (ofm_tensor.shape[3] == op.attrs["depth_multiplier"]):
+        ofm_shape = op.ofm_shapes[0]
+        if (ifm_shape.depth == 1) and (ofm_shape.depth == op.attrs["depth_multiplier"]):
             # Change op type to Conv2d
             op.type = Op.Conv2DBias
             del op.attrs["channel_multiplier"]
@@ -596,7 +534,7 @@ def convert_depthwise_to_conv(op, arch, nng):
         else:
             raise UnsupportedFeatureError(
                 f"Unsupported 'DEPTHWISE_CONV_2D' with depth_multiplier = {op.attrs['depth_multiplier']},",
-                f" ifm channels = {ifm_tensor.shape[3]}, ofm channels = {ofm_tensor.shape[3]}",
+                f" ifm channels = {ifm_shape.depth}, ofm channels = {ofm_shape.depth}",
             )
     DebugDatabase.add_optimised(op, op)
     return op
@@ -620,17 +558,15 @@ def optimise_strided_conv(op, arch, nng):
         op.type == Op.Conv2DBias
         and op.op_index == 0
         and stride_x == 2
-        and len(ifm_tensor.shape) == 4
-        and ifm_tensor.shape[3] <= 4
-        and ifm_tensor.shape[2] % 2 == 0
+        and op.ifm_shapes[0].depth <= 4
+        and op.ifm_shapes[0].width % 2 == 0
         and weight_tensor is not None
         and weight_tensor.shape[1] >= 2
     ):
+        ifm_shape = op.ifm_shapes[0]
         # IFM
-        ifm_reshaped = create_reshape_tensor(
-            ifm_tensor, [ifm_tensor.shape[0], ifm_tensor.shape[1], ifm_tensor.shape[2] // 2, ifm_tensor.shape[3] * 2]
-        )
-        op.set_input_tensor(ifm_reshaped, 0)
+        op.ifm_shapes[0] = Shape4D([ifm_shape.batch, ifm_shape.height, ifm_shape.width // 2, ifm_shape.depth * 2])
+        op.ifm.avoid_NHCWB16 = True
 
         # Weights
         weight_shape = weight_tensor.shape
@@ -657,8 +593,6 @@ def optimise_strided_conv(op, arch, nng):
         stride_x = 1
         op.attrs.update({"stride_w": stride_x, "stride_h": stride_y, "strides": (1, stride_y, stride_x, 1)})
 
-        op.set_ifm_ofm_shapes()
-
     return op
 
 
@@ -683,27 +617,6 @@ def convert_conv_to_fc(op, arch, nng):
         weight_tensor.quant_values = weight_tensor.quant_values.squeeze(axis=(0, 1))
         weight_tensor.set_all_shapes(list(weight_tensor.quant_values.shape))
 
-        # The output from a fully connected is expected to be 2D so we need to add a reshape layer to convert it
-        # back to 4D afterwards as the next layer is expecting that shape
-        orig_ofm_tensor = op.outputs[0]
-        # Reshape this ops output to be 2D: {(N*H*W), C} (We know N H and W are all 1 so this becomes {1, C})
-        fc_ofm_tensor = orig_ofm_tensor.clone("_fc")
-        fc_ofm_tensor.set_all_shapes([1, fc_ofm_tensor.shape[-1]])
-        fc_ofm_tensor.ops = [op]
-        # Add a reshape after the new OFM to convert it back to the original 4D shape
-        reshape_name = op.name + "_reshape"
-        new_shape_tens = create_const_tensor(reshape_name + "_shape", [1], DataType.int32, orig_ofm_tensor.shape)
-        reshape_op = Operation(Op.Reshape, reshape_name)
-        reshape_op.attrs["new_shape"] = orig_ofm_tensor.shape
-        reshape_op.inputs = [fc_ofm_tensor, new_shape_tens]
-        reshape_op.set_output_tensor(orig_ofm_tensor)
-        reshape_op.set_ifm_ofm_shapes()
-
-        # Replace this ops OFM to point to the 2D tensor
-        op.outputs[0] = fc_ofm_tensor
-        op.set_ifm_ofm_shapes()
-        # Record optimisation in debug database
-        DebugDatabase.add_optimised(op, reshape_op)
         DebugDatabase.add_optimised(op, op)
     return op
 
@@ -722,14 +635,6 @@ def fixup_relus_with_differing_ifm_ofm_scaling(op, arch, nng):
         # Tidy up and assign the ifm and ofm to the new op
         ifm.consumer_list.remove(op)
 
-        # if not 4d, reshape ifm/ofm
-        if len(ifm.shape) < 4:
-            ifm_shaped = create_reshape_tensor(ifm, full_shape(4, ifm.shape, 1))
-            ifm = ifm_shaped
-        if len(ofm.shape) < 4:
-            ofm_shaped = create_reshape_tensor(ofm, full_shape(4, ofm.shape, 1), False)
-            ofm = ofm_shaped
-
         relu_fused_op.add_input_tensor(ifm)
         relu_fused_op.set_output_tensor(ofm)
         relu_fused_op.set_ifm_ofm_shapes()
@@ -737,6 +642,7 @@ def fixup_relus_with_differing_ifm_ofm_scaling(op, arch, nng):
     return op
 
 
+# TODO remove if mem only ops can all be removed
 # Reorder activation op if it's after the memory only operations
 def fixup_act_reorder(op, arch, nng):
     if op.type.is_relu_op() or op.type in (Op.Sigmoid, Op.Tanh):
@@ -752,8 +658,8 @@ def fixup_act_reorder(op, arch, nng):
             act_op_out = act_op.inputs[0].clone("_acted")
             act_op_out.quantization = op.outputs[0].quantization.clone()
             act_op.set_output_tensor(act_op_out)
-            act_op.ifm_shapes[0] = Shape4D(prep_op.inputs[0].shape)
-            act_op.ofm_shapes[0] = Shape4D(act_op_out.shape)
+            act_op.ofm_shapes[0] = act_op.ifm_shapes[0].clone()
+            act_op.ifm_shapes[0] = prep_op.ifm_shapes[0].clone()
 
             # Update the consumer list
             act_op_out.consumer_list = op.outputs[0].consumer_list.copy()
@@ -1078,39 +984,94 @@ def convert_tanh_sigmoid_to_lut(op, arch, nng):
     return op
 
 
-def remove_unwanted_reshapes(op, arch, nng):
-    # Try to remove reshapes enclosing ElementWise operator with only one non-constant input
-    if not op.run_on_npu or not op.type.is_elementwise_op():
-        return op
+def remove_reshapes(op, arch):
+    if op.run_on_npu and op.type == Op.Reshape:
+        ofm = op.ofm
+        ifm = op.ifm
 
-    # Check if the ElementWise operator only have one non-constant input
-    non_const_tens = [x for x in op.inputs if x.ops[0].type != Op.Const]
-    if len(non_const_tens) != 1:
-        return op
-    ifm = non_const_tens[0]
+        # Check if quantization is the same in the input and output for the reshape ops
+        if not check_quantized_tens_scaling_equal(ifm, ofm):
+            # TODO Both tensors are needed, since quantisation properties currently are linked to Tensors.
+            # In order to remove this reshape either quantization properties need to be moved to Operator,
+            # or the reshape need to be replace with a NOP.
+            return
+
+        # Check if ifm is a sg input
+        if ifm.ops[0].type in (Op.Placeholder, Op.SubgraphInput, Op.Const):
+            # put the reshape on CPU
+            op.run_on_npu = False
+            return
+
+        # Check if Reshape ifm/ofm are network ifm/ofm
+        ifm_is_sg_ofm = any(ifm_cons is None for ifm_cons in ifm.consumer_list)
+        ofm_is_sg_ofm = any(ofm_cons is None for ofm_cons in ofm.consumer_list)
+
+        if ifm_is_sg_ofm and ofm_is_sg_ofm:
+            # Both ifm and ofm are sg outputs, add reshape to the ifm and put it on CPU
+            ifm_cons_list_copy = ifm.consumer_list.copy()
+            ifm_ops_copy = ifm.ops.copy()
+            for ifm_cons in ifm_cons_list_copy:
+                if ifm_cons is None:
+                    # Create a reshape op with ifm as output
+                    name = ifm.name + "_cpu_reshape"
+                    reshape_ifm = ifm.clone()
+                    reshape_op = Operation(Op.Reshape, name)
+                    reshape_op.attrs["new_shape"] = ifm.shape
+                    reshape_op.add_input_tensor(reshape_ifm)
+                    reshape_op.add_input_tensor(create_const_tensor(name + "_shape", [1], DataType.int32, ifm.shape))
+                    reshape_op.set_output_tensor(ifm)
+                    reshape_op.set_ifm_ofm_shapes()
+                    reshape_op.run_on_npu = False
+                    reshape_op.ofm.ops = [reshape_op]
+                    reshape_op.ofm.consumer_list = [None]
+
+                    # Set reshape_ifm producers
+                    for prev_op in ifm_ops_copy:
+                        prev_op.outputs = [reshape_ifm]
+                        reshape_ifm.ops.append(prev_op)
+
+                    # Set reshape_ifm consumers
+                    for ifm_cons in ifm_cons_list_copy:
+                        if ifm_cons is not None:
+                            for ifm_idx, cons_ifm in enumerate(ifm_cons.inputs):
+                                if cons_ifm == ifm:
+                                    ifm_cons.set_input_tensor(reshape_ifm, ifm_idx)
+
+                    ifm = reshape_ifm
+                    break
+            ifm_is_sg_ofm = False
+
+        if ofm_is_sg_ofm:
+            # Bypassed by replacing ifm with ofm
+            ofm.ops = []
+            for prev_op in ifm.ops:
+                prev_op.outputs = [ofm]
+                ofm.ops.append(prev_op)
+
+            # All ifm consumers need to use ofm as input
+            for ifm_cons in ifm.consumer_list:
+                for ifm_idx, cons_ifm in enumerate(ifm_cons.inputs):
+                    if cons_ifm == ifm:
+                        ifm_cons.set_input_tensor(ofm, ifm_idx)
+            if op.ifm_shapes[0] != op.ofm_shapes[0]:
+                ofm.avoid_NHCWB16 = True
+        else:
+            # Bypassed Reshape by replacing ofm with ifm
+            for cons in ofm.consumer_list:
+                for ifm_idx, cons_ifm in enumerate(cons.inputs):
+                    if cons_ifm == ofm:
+                        cons.set_input_tensor(ifm, ifm_idx)
+            if op.ifm_shapes[0] != op.ofm_shapes[0]:
+                ifm.avoid_NHCWB16 = True
 
-    # Check if operation is enclosed by Reshapes that can be removed
-    ofm = op.outputs[0]
-    prev_op = ifm.ops[0]
-    if (
-        len(ifm.consumer_list) == 1
-        and prev_op.type == Op.Reshape
-        and len(ofm.consumer_list) == 1
-        and ofm.consumer_list[0].type == Op.Reshape
-    ):
-        # Operation is enclosed by reshapes, check if they can be removed
-        prev_op_ifm, prev_op_ofm = prev_op.get_ifm_ofm()
-        cons_op = ofm.consumer_list[0]
-        cons_op_ifm = ofm
-        cons_op_ofm = cons_op.outputs[0]
-        if len(prev_op_ifm.shape) == len(cons_op_ofm.shape):
-            # Check if quantization is the same in the input and output for the reshape ops
-            if check_quantized_tens_scaling_equal(prev_op_ifm, prev_op_ofm) and check_quantized_tens_scaling_equal(
-                cons_op_ifm, cons_op_ofm
-            ):
-                op.set_input_tensor(prev_op_ifm, 0)
-                op.set_output_tensor(cons_op_ofm)
-    return op
+
+def check_reshapes(op, arch):
+    if op.run_on_npu and op.type == Op.Reshape:
+        ofm = op.ofm
+
+        if check_quantized_tens_scaling_equal(op.ifm, ofm):
+            # Reshape should have been removed
+            raise VelaError(f"Reshape op {op} expected to have been removed, still remains")
 
 
 def fuse_activation_function_with_prev(op, arch, nng):
@@ -1174,13 +1135,19 @@ def optimise_pad(op, arch, nng):
 def add_attrs_to_resizebilinear(op, arch, nng):
     if op.type == Op.ResizeBilinear and op.run_on_npu:
         input_tensor = op.inputs[0]
-        upscaled_shape = [input_tensor.shape[1] * 2, input_tensor.shape[2] * 2]
-        out_shape = op.outputs[0].shape[1:3]
-        if not op.attrs["align_corners"] and out_shape == upscaled_shape:
+        input_shape = op.ifm_shapes[0]
+        upscaled_height = input_shape.height * 2
+        upscaled_width = input_shape.width * 2
+        out_shape = op.ofm_shapes[0]
+        if not op.attrs["align_corners"] and out_shape.height == upscaled_height and out_shape.width == upscaled_width:
             # this means the output is supposed to be a x2 upscale,
             # so we need to do SAME padding
             op.attrs["padding"] = Padding.SAME
-        elif op.attrs["align_corners"] and out_shape == [upscaled_shape[0] - 1, upscaled_shape[1] - 1]:
+        elif (
+            op.attrs["align_corners"]
+            and out_shape.height == (upscaled_height - 1)
+            and out_shape.width == (upscaled_width - 1)
+        ):
             # here we can just run the avg pool without padding and
             # produce a (M * 2 - 1, N * 2 - 1) sized output
             op.attrs["padding"] = Padding.VALID
@@ -1229,26 +1196,52 @@ def optimise_graph_a(nng, arch, verbose_graph=False):
             nng, sg, arch, [], pre_process_list, rewrite_unsupported=False,
         )
 
+    # Handle Concat Ops
+    for idx, sg in enumerate(nng.subgraphs):
+        # rewrite graph pass
+        nng.subgraphs[idx] = rewrite_graph.rewrite_graph_pre_order(
+            nng, sg, arch, [], [rewrite_concat_ops], rewrite_unsupported=False,
+        )
+
+    # Handle Split Ops
+    for idx, sg in enumerate(nng.subgraphs):
+        # rewrite graph pass
+        nng.subgraphs[idx] = rewrite_graph.rewrite_graph_pre_order(
+            nng,
+            sg,
+            arch,
+            [],
+            [rewrite_unpack_output, rewrite_stridedslice_output, convert_nop_split_to_identity],
+            rewrite_unsupported=False,
+        )
+
+    for idx, sg in enumerate(nng.subgraphs):
+        # rewrite graph pass
+        nng.subgraphs[idx] = rewrite_graph.rewrite_graph_pre_order(
+            nng, sg, arch, [rewrite_split_ops], [], rewrite_unsupported=False,
+        )
+
+    # Removal of reshapes
+    for sg in nng.subgraphs:
+        rewrite_graph.visit_graph_post_order(sg.output_tensors, arch, [], [remove_reshapes])
+        sg.refresh_after_modification()
+
     op_rewrite_list = [
         set_tensor_equivalence,
         convert_depthwise_to_conv,
         convert_conv_to_fc,
         convert_softmax,
         optimise_strided_conv,
-        fixup_fully_connected_input,
         convert_batched_fc_shape,
-        fixup_pack_input,
         unfuse_activation_function,
         fixup_conv2d_backprop,
         fixup_relus_with_differing_ifm_ofm_scaling,
         fixup_act_reorder,
-        fixup_elementwise_with_scalars,
+        fixup_elementwise_with_scalars,  # TODO Move to early stage?
         reorder_depthwise_weights,
         fixup_resizebilinear,
         fixup_bias_tensors,
-        convert_nop_split_to_identity,
         convert_mul_max_to_abs_or_lrelu,
-        remove_unwanted_reshapes,
         convert_lrelu,
         convert_tanh_sigmoid_to_lut,
     ]
@@ -1269,24 +1262,9 @@ def optimise_graph_a(nng, arch, verbose_graph=False):
             [fuse_activation_function_with_prev, optimise_pad, add_padding_fields],
         )
 
-    # Post-optimisation operator debug tracing
+    # Post-optimisation operator debug tracing, and checking that no undesired reshapes are left in the graph
     for sg in nng.subgraphs:
-        rewrite_graph.visit_graph_post_order(sg.output_tensors, arch, [], [_record_optimised])
-
-    if verbose_graph:
-        nng.print_graph()
-    return nng
-
-
-def optimise_graph_b(nng, arch, verbose_graph=False):
-    if verbose_graph:
-        nng.print_graph()
-
-    for idx, sg in enumerate(nng.subgraphs):
-        # combined rewrite graph pass
-        nng.subgraphs[idx] = rewrite_graph.rewrite_graph_pre_order(
-            nng, sg, arch, [fixup_unpack_output, fixup_stridedslice_output, rewrite_concat, rewrite_split], [],
-        )
+        rewrite_graph.visit_graph_post_order(sg.output_tensors, arch, [], [check_reshapes, _record_optimised])
 
     if verbose_graph:
         nng.print_graph()
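
A note on the axis arithmetic that recurs in rewrite_concat_ops, rewrite_split_ops, rewrite_stridedslice_output and rewrite_unpack_output above: Vela works internally on 4D (batch, height, width, depth) shapes, so an axis given against a lower-rank tensor has to be shifted right by the number of leading 1s that padding to rank 4 prepends, while a negative axis already counts from the end and is left unchanged. A minimal standalone sketch of that mapping (pad_to_4d and to_axis_4d are hypothetical names, not Vela APIs):

    # Sketch of the axis-to-4D mapping; mirrors "axis + (4 - len(shape))" in the patch.
    def pad_to_4d(shape):
        # Prepend 1s until the shape has rank 4, e.g. [8, 32] -> [1, 1, 8, 32]
        return [1] * (4 - len(shape)) + list(shape)

    def to_axis_4d(axis, shape):
        # Non-negative axes shift right by the number of prepended 1s;
        # negative axes count from the end, so they stay unchanged.
        return axis + (4 - len(shape)) if axis >= 0 else axis

    assert pad_to_4d([8, 32]) == [1, 1, 8, 32]
    assert to_axis_4d(1, [8, 32]) == 3    # becomes the depth axis of the 4D shape
    assert to_axis_4d(-1, [8, 32]) == -1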
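The NHCWB16 check at the end of rewrite_concat_ops encodes the constraint stated in its comment: when concatenating along the channel axis, the brick format can only be kept for the output if every written slice starts at a 16-aligned channel offset. A small sketch of that rule over the per-input channel depths (concat_can_use_nhcwb16 is an illustrative helper, not part of the patch):

    # Channel-axis concat keeps NHCWB16 only if every concat_start is a multiple of 16.
    def concat_can_use_nhcwb16(channel_depths):
        offset = 0
        for depth in channel_depths:
            if offset % 16 != 0:    # this slice would start at an unaligned address
                return False
            offset += depth
        return True

    assert concat_can_use_nhcwb16([16, 32, 16])       # slices start at 0, 16, 48
    assert not concat_can_use_nhcwb16([8, 8, 16])     # second slice starts at 8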
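Similarly, the slimmed-down convert_batched_fc_shape no longer inserts Reshape ops; it just rewrites the operation's ifm/ofm shapes so that a fully-connected batch of N is folded into the spatial dimensions. A sketch of the factorisation it applies (split_batch is a hypothetical name):

    # Batch N maps to (h, w): 4 -> 2x2, 8 -> 2x4, 16 -> 4x4, anything else -> 1xN.
    def split_batch(n, depth):
        batching_split = {4: (2, 2), 8: (2, 4), 16: (4, 4)}
        h, w = batching_split.get(n, (1, n))
        return [1, h, w, depth]

    assert split_batch(8, 64) == [1, 2, 4, 64]
    assert split_batch(5, 64) == [1, 1, 5, 64]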