From eeb85154b00a9864d0d63e382e9c80ca8e294d5d Mon Sep 17 00:00:00 2001 From: "patrik.gustavsson" Date: Mon, 21 Dec 2020 17:10:40 +0000 Subject: Revert "Revert "MLBEDSW-3645 4D class for op ifm/ofm shapes"" This reverts commit df0a5905177f3a1b836076bc3f9f39b2e86f1794. Reason for revert: Change-Id: I891c66fb29db9d25e942947e8d1c29a10610de51 --- ethosu/vela/debug_database.py | 15 ++- ethosu/vela/graph_optimiser.py | 42 ++++--- ethosu/vela/high_level_command_stream.py | 122 +++++---------------- ethosu/vela/high_level_command_stream_generator.py | 32 +++--- ethosu/vela/high_level_command_to_npu_op.py | 3 +- ethosu/vela/nn_graph.py | 6 +- ethosu/vela/npu_performance.py | 41 +++---- ethosu/vela/operation.py | 28 +++-- ethosu/vela/pass_packing.py | 11 +- ethosu/vela/shape4d.py | 77 +++++++++++++ ethosu/vela/shared_buffer_allocation.py | 20 ++-- ethosu/vela/softmax.py | 4 +- ethosu/vela/tensor.py | 17 +-- ethosu/vela/test/test_graph_optimiser.py | 5 +- ethosu/vela/test/test_supported_operators.py | 2 +- ethosu/vela/test/testutil.py | 6 +- 16 files changed, 230 insertions(+), 201 deletions(-) create mode 100644 ethosu/vela/shape4d.py (limited to 'ethosu/vela') diff --git a/ethosu/vela/debug_database.py b/ethosu/vela/debug_database.py index 203503f2..77e13eb0 100644 --- a/ethosu/vela/debug_database.py +++ b/ethosu/vela/debug_database.py @@ -23,7 +23,7 @@ import lxml.etree as xml from . import numeric_util from .operation import Operation - +from .shape4d import Shape4D UntypedDict = Dict[Any, Any] UntypedList = List[Any] @@ -79,9 +79,18 @@ class DebugDatabase: src_uid = cls._sourceUID[parent] uid = len(cls._optimisedUID) cls._optimisedUID[op] = (uid, src_uid) - ofm_shape = op.ofm_shapes[0] if op.ofm_shapes else numeric_util.full_shape(3, op.outputs[0].shape, 1) + ofm_shape = op.ofm_shapes[0] if op.ofm_shapes else Shape4D(op.outputs[0].shape) cls._optimisedTable.append( - [uid, src_uid, op.type, op.kernel.width, op.kernel.height, ofm_shape[-2], ofm_shape[-3], ofm_shape[-1]] + [ + uid, + src_uid, + op.type, + op.kernel.width, + op.kernel.height, + ofm_shape.width, + ofm_shape.height, + ofm_shape.depth, + ] ) @classmethod diff --git a/ethosu/vela/graph_optimiser.py b/ethosu/vela/graph_optimiser.py index fdb0fae0..1128a311 100644 --- a/ethosu/vela/graph_optimiser.py +++ b/ethosu/vela/graph_optimiser.py @@ -37,6 +37,7 @@ from .operation import Op from .operation import Operation from .operation import Padding from .operation_util import create_avgpool_nop +from .shape4d import Shape4D from .softmax import SoftMax from .tensor import check_quantized_tens_scaling_equal from .tensor import create_const_tensor @@ -82,6 +83,7 @@ def rewrite_concat(tens, arch, nng): new_op.run_on_npu = True tens.ops.append(new_op) DebugDatabase.add_optimised(concat_op, new_op) + new_op.set_ifm_ofm_shapes() assert tens.shape[axis] == offset # If axis corresponds to C-dimension, NHCWB16 can only be used in the output if all the concat_start's are a @@ -121,7 +123,8 @@ def rewrite_split(tens, arch, nng): if out == tens: break axis_4D = axis + (4 - len(out.shape)) - offset_start[axis_4D] += split_op.ofm_shapes[idx][axis_4D] + + offset_start[axis_4D] += split_op.ofm_shapes[idx].get_dim(axis_4D) # If start offset is not a multiple of 16 in the C-dimension, NHCWB16 need to be avoided in the input if (offset_start[-1] % 16) != 0: @@ -132,6 +135,7 @@ def rewrite_split(tens, arch, nng): new_op.attrs["split_start"] = offset_start new_op.run_on_npu = True new_op.set_output_tensor(tens) + new_op.set_ifm_ofm_shapes() 
DebugDatabase.add_optimised(split_op, new_op) return tens @@ -189,6 +193,7 @@ def fixup_conv2d_backprop(op, arch, nng): if op.type == Op.Conv2DBackpropInput: # flip the inputs op.inputs[0], op.inputs[2] = op.inputs[2], op.inputs[0] + op.set_ifm_ofm_shapes() op.type = Op.Conv2DBackpropInputSwitchedBias # Update strides @@ -216,8 +221,7 @@ def convert_resizebilinear_1x1_to_add(op): # Set the add inputs op.inputs[1] = op.inputs[0] op.inputs[0] = tens - op.ifm_shapes = [] - op.ofm_shapes = [] + op.set_ifm_ofm_shapes() return op @@ -323,14 +327,14 @@ def convert_batched_fc_shape(op, arch, nng): ofm = op.outputs[0] # Check if the FC is 2D and first dimension indicates batching # TOD0 op.ifm_shape[0] > 1 is enough when refactory is complete - if len(ifm.shape) == len(ofm.shape) == 2 and ifm.shape[0] > 1 and op.ifm_shapes[0][0] > 1: + if len(ifm.shape) == len(ofm.shape) == 2 and ifm.shape[0] > 1 and op.ifm_shapes[0].batch > 1: n = ifm.shape[0] batching_split = {4: (2, 2), 8: (2, 4), 16: (4, 4)} h, w = batching_split.get(n, (1, n)) prev_op = ifm.ops[0] desired_shape = [1, h, w, ifm.shape[-1]] - op.ifm_shapes[0] = desired_shape + op.ifm_shapes[0] = Shape4D(desired_shape) if len(ifm.consumer_list) == 1 and prev_op is not None and prev_op.type == Op.Reshape: # There is a preceding Reshape @@ -356,7 +360,7 @@ def convert_batched_fc_shape(op, arch, nng): weight_tensor.set_all_shapes(list(weight_tensor.quant_values.shape)) desired_shape = [1, h, w, ofm.shape[-1]] - op.ofm_shapes[0] = desired_shape + op.ofm_shapes[0] = Shape4D(desired_shape) if ( len(ofm.consumer_list) == 1 @@ -395,6 +399,7 @@ def fixup_pack_input(op, arch, nng): reshape_op.attrs["new_shape"] = desired_shape reshape_op.inputs = [inp, new_shape_tens] reshape_op.set_output_tensor(reshape_out) + reshape_op.set_ifm_ofm_shapes() DebugDatabase.add_optimised(op, reshape_op) op.inputs[idx] = reshape_out @@ -413,6 +418,7 @@ def unfuse_activation_function(op, arch, nng): act_op.set_output_tensor(out_tens) act_op.add_input_tensor(intermediate_tens) op.set_output_tensor(intermediate_tens) + act_op.set_ifm_ofm_shapes() return op @@ -457,7 +463,7 @@ def fixup_stridedslice_output(tens, arch, nng): new_shape_tens = create_const_tensor(op.name + "_reshape_shape", [1], DataType.int32, tens.shape) for idx, out_tens in enumerate(op.outputs): - op.ofm_shapes[idx] = new_shape_tens + op.ofm_shapes[idx] = Shape4D(new_shape_tens.shape) reshape_in = out_tens.clone("_reshaped") reshape_in.set_all_shapes(reshape_input_shape) reshape_in.ops = [op] @@ -466,6 +472,7 @@ def fixup_stridedslice_output(tens, arch, nng): reshape_op.attrs["new_shape"] = reshape_input_shape reshape_op.inputs = [reshape_in, new_shape_tens] reshape_op.set_output_tensor(out_tens) + reshape_op.set_ifm_ofm_shapes() op.outputs[idx] = reshape_in @@ -493,6 +500,7 @@ def fixup_unpack_output(tens, arch, nng): reshape_op.attrs["new_shape"] = reshape_input_shape reshape_op.inputs = [reshape_in, new_shape_tens] reshape_op.set_output_tensor(out_tens) + reshape_op.set_ifm_ofm_shapes() DebugDatabase.add_optimised(op, reshape_op) op.outputs[idx] = reshape_in @@ -588,7 +596,8 @@ def convert_conv_to_fc(op, arch, nng): # caching/double buffering for the weights. 
# (Weights dont need to be reloaded for convs when IFM H and W are 1) if op.type == Op.Conv2DBias: - _, h, w, _ = op.ifm_shapes[0] + h = op.ifm_shapes[0].height + w = op.ifm_shapes[0].width kh, kw, _, _ = op.inputs[1].shape if h == 1 and w == 1 and kh == 1 and kw == 1: # Overwrite this op as a Fully Connected Op @@ -616,9 +625,11 @@ def convert_conv_to_fc(op, arch, nng): reshape_op.attrs["new_shape"] = orig_ofm_tensor.shape reshape_op.inputs = [fc_ofm_tensor, new_shape_tens] reshape_op.set_output_tensor(orig_ofm_tensor) + reshape_op.set_ifm_ofm_shapes() # Replace this ops OFM to point to the 2D tensor op.outputs[0] = fc_ofm_tensor + op.set_ifm_ofm_shapes() # Record optimisation in debug database DebugDatabase.add_optimised(op, reshape_op) DebugDatabase.add_optimised(op, op) @@ -649,6 +660,7 @@ def fixup_relus_with_differing_ifm_ofm_scaling(op, arch, nng): relu_fused_op.add_input_tensor(ifm) relu_fused_op.set_output_tensor(ofm) + relu_fused_op.set_ifm_ofm_shapes() op = relu_fused_op return op @@ -668,8 +680,8 @@ def fixup_act_reorder(op, arch, nng): act_op_out = act_op.inputs[0].clone("_acted") act_op_out.quantization = op.outputs[0].quantization.clone() act_op.set_output_tensor(act_op_out) - act_op.ifm_shapes[0] = full_shape(4, prep_op.inputs[0].shape, 1) - act_op.ofm_shapes[0] = full_shape(4, act_op_out.shape, 1) + act_op.ifm_shapes[0] = Shape4D(prep_op.inputs[0].shape) + act_op.ofm_shapes[0] = Shape4D(act_op_out.shape) # Update the consumer list act_op_out.consumer_list = op.outputs[0].consumer_list.copy() @@ -839,6 +851,7 @@ def convert_lrelu_to_mul_max(op, arch): mul_alpha.add_input_tensor(alpha_tens) fm_alpha = ofm.clone(op.name + "_alpha") mul_alpha.set_output_tensor(fm_alpha) + mul_alpha.set_ifm_ofm_shapes() DebugDatabase.add_optimised(op, mul_alpha) if check_quantized_tens_scaling_equal(ifm, ofm): @@ -860,6 +873,7 @@ def convert_lrelu_to_mul_max(op, arch): mul_identity.add_input_tensor(identity_tens) fm_id = ofm.clone(op.name + "_id") mul_identity.set_output_tensor(fm_id) + mul_identity.set_ifm_ofm_shapes() DebugDatabase.add_optimised(op, mul_identity) # Convert LeakyRelu to Max, add the results of the multiplication(s) as inputs @@ -890,7 +904,7 @@ def convert_to_lut(op, lut_values, lut_name): quantization.zero_point = 0 tens = create_const_tensor(op.inputs[0].name + "_scalar0", [], ifm.dtype, [0], np.uint8, quantization=quantization) op.add_input_tensor(tens) - op.ifm_shapes.append(full_shape(4, tens.shape, 1)) + op.ifm_shapes.append(Shape4D(tens.shape)) # The LUT must be applied without any preceding rescaling (the LUT itself performs the rescale), # so even if the OFM has a different scale than the IFM, the generated OFM scale instructions @@ -1158,11 +1172,7 @@ def optimise_graph_b(nng, arch, verbose_graph=False): for idx, sg in enumerate(nng.subgraphs): # combined rewrite graph pass nng.subgraphs[idx] = rewrite_graph.rewrite_graph_pre_order( - nng, - sg, - arch, - [fixup_unpack_output, fixup_stridedslice_output, rewrite_concat, rewrite_split], - [set_ifm_ofm_op_shapes], + nng, sg, arch, [fixup_unpack_output, fixup_stridedslice_output, rewrite_concat, rewrite_split], [], ) if verbose_graph: diff --git a/ethosu/vela/high_level_command_stream.py b/ethosu/vela/high_level_command_stream.py index bb4f1424..9cbda452 100644 --- a/ethosu/vela/high_level_command_stream.py +++ b/ethosu/vela/high_level_command_stream.py @@ -15,11 +15,14 @@ # limitations under the License. 
# Description: # Contains classes that hold commands for the high-level command stream (one command per DMA or NPU stripe). +from typing import List + import numpy as np from .architecture_features import Block from .numeric_util import round_up_divide from .operation import NpuBlockType +from .shape4d import Shape4D class Box: @@ -32,15 +35,15 @@ class Box: def transform_with_strides_and_skirt( self, - strides, - skirt, - ifm_shape, - npu_block_type, - concat_axis=0, - concat_offset=0, - split_offset=None, - k_height=1, - upscaling_factor=1, + strides: List[int], + skirt: List[int], + ifm_shape: Shape4D, + npu_block_type: NpuBlockType, + concat_axis: int = 0, + concat_offset: int = 0, + split_offset: int = None, + k_height: int = 1, + upscaling_factor: int = 1, ): new_start_coord = list(self.start_coord) new_end_coord = list(self.end_coord) @@ -58,15 +61,15 @@ class Box: ): # these types of operations do a "dot product" or sum over the entire IFM new_start_coord[-1] = 0 - new_end_coord[-1] = ifm_shape[-1] + new_end_coord[-1] = ifm_shape.depth - if npu_block_type == NpuBlockType.ElementWise and min(len(new_end_coord), len(ifm_shape)) >= 1: - new_end_coord[-1] = min(new_end_coord[-1], ifm_shape[-1]) - if min(len(new_end_coord), len(ifm_shape)) >= 2: - new_end_coord[-2] = min(new_end_coord[-2], ifm_shape[-2] * upscaling_factor) - if min(len(new_end_coord), len(ifm_shape)) >= 3: + if npu_block_type == NpuBlockType.ElementWise and len(new_end_coord) >= 1: + new_end_coord[-1] = min(new_end_coord[-1], ifm_shape.depth) + if len(new_end_coord) >= 2: + new_end_coord[-2] = min(new_end_coord[-2], ifm_shape.width * upscaling_factor) + if len(new_end_coord) >= 3: original_end_coord = list(new_end_coord) - new_end_coord[-3] = min(new_end_coord[-3], ifm_shape[-3] * upscaling_factor) + new_end_coord[-3] = min(new_end_coord[-3], ifm_shape.height * upscaling_factor) pad_top = 0 pad_bottom = 0 @@ -74,7 +77,7 @@ class Box: if len(new_start_coord) >= 2: stride = strides[2] new_start_coord[-2] = max(new_start_coord[-2] * stride - skirt[1], 0) - new_end_coord[-2] = min(new_end_coord[-2] * stride + skirt[3], ifm_shape[-2]) + new_end_coord[-2] = min(new_end_coord[-2] * stride + skirt[3], ifm_shape.width) if len(new_start_coord) >= 3: stride = strides[1] @@ -86,23 +89,20 @@ class Box: pad_top = max(0, 0 - new_start_coord[-3]) + skirt_top_remainder new_start_coord[-3] = max(new_start_coord[-3], 0) - while len(ifm_shape) < 3: - ifm_shape = [1] + ifm_shape - - if (new_end_coord[-3] * stride + skirt[2]) > (ifm_shape[-3] * upscaling_factor): + if (new_end_coord[-3] * stride + skirt[2]) > (ifm_shape.height * upscaling_factor): # pad_bottom is calculated based the diff between the end position of the weight kernel, # after last stride and the ifm height. - if upscaling_factor != 1 and original_end_coord[-3] > ifm_shape[-3] * upscaling_factor: + if upscaling_factor != 1 and original_end_coord[-3] > ifm_shape.height * upscaling_factor: # Special case for Transpose Convolution with VALID padding. 
- pad_bottom = original_end_coord[-3] - (ifm_shape[-3] * upscaling_factor) + pad_bottom = original_end_coord[-3] - (ifm_shape.height * upscaling_factor) else: k_start = new_start_coord[-3] - pad_top - pad_bottom = max(0, k_start + total_stride + k_height - (ifm_shape[-3] * upscaling_factor)) + pad_bottom = max(0, k_start + total_stride + k_height - (ifm_shape.height * upscaling_factor)) # Adjust for upscaling new_start_coord[-3] = max(new_start_coord[-3] // upscaling_factor, 0) new_end_coord[-3] = new_end_coord[-3] * stride + skirt[2] + (skirt[2] % upscaling_factor) - new_end_coord[-3] = max(min(new_end_coord[-3] // upscaling_factor, ifm_shape[-3]), 1) + new_end_coord[-3] = max(min(new_end_coord[-3] // upscaling_factor, ifm_shape.height), 1) return Box(new_start_coord, new_end_coord), pad_top, pad_bottom @@ -197,7 +197,7 @@ class NpuStripe(Command): self.pad_top = pad_top self.pad_bottom = pad_bottom for i in range(len(self.ofm_box.end_coord)): - assert self.ofm_box.end_coord[i] <= ps.ofm_shapes[0][i] + assert self.ofm_box.end_coord[i] <= ps.ofm_shapes[0].get_dim(i) def is_npu_pass_command(self): return True @@ -251,76 +251,6 @@ class NpuStripe(Command): assert res >= 0 return res - def get_single_block_command(self, block_idx): - block_cfg = (self.block_config[0], self.block_config[1], self.block_config[3]) - dims = self.get_block_dimensions() - strides = dims[1] * dims[2], dims[2], 1 - coord = [] - idx_left = block_idx - for s in strides: - c = idx_left // s - idx_left -= c * s - coord.append(c) - - assert idx_left == 0 - - # put in dummy height/widths in case we're dealing with FC layers - ofm_start = list(self.ofm_box.start_coord) - ofm_end = list(self.ofm_box.end_coord) - - # cut out a nice block shape - for idx in (-1, -2, -3): - if len(ofm_start) >= -idx: - ofm_start[idx] += block_cfg[idx] * coord[idx] - ofm_end[idx] = min(ofm_end[idx], ofm_start[idx] + block_cfg[idx]) - - ps = self.ps - strides = None - skirt = None - if ps.primary_op is not None: - strides = ps.primary_op.attrs.get("strides", None) - skirt = ps.primary_op.attrs.get("skirt", None) - npu_block_type = ps.npu_block_type - - ofm_box = Box(ofm_start, ofm_end) - ifm_box, _, _ = ofm_box.transform_with_strides_and_skirt( - strides, skirt, self.ifm_tensor.shape, npu_block_type, self.concat_axis, self.concat_offset - ) - - weight_box = None - if self.weight_tensor is not None: - weight_oc_start = ofm_start[-1] - weight_oc_end = ofm_end[-1] - if self.concat_axis - len(self.weight_tensor.shape) == -1: - weight_oc_start -= self.concat_offset - weight_oc_end -= self.concat_offset - - weight_box = Box.make_weight_box( - self.weight_tensor.shape, - npu_block_type, - weight_oc_start, - weight_oc_end, - self.weight_tensor.weight_transpose_depthwise, - ) - - return NpuStripe( - self.ps, - self.block_config, - self.is_first, - self.is_last, - self.is_first_h_stripe, - self.is_last_h_stripe, - self.ifm_tensor, - ifm_box, - self.ofm_tensor, - ofm_box, - self.weight_tensor, - weight_box, - self.scale_tensor, - self.concat_axis, - self.concat_offset, - ) - class DMA(Command): def __init__(self, ps, in_tensor, out_tensor, box): diff --git a/ethosu/vela/high_level_command_stream_generator.py b/ethosu/vela/high_level_command_stream_generator.py index 18a419c0..60e62aa6 100644 --- a/ethosu/vela/high_level_command_stream_generator.py +++ b/ethosu/vela/high_level_command_stream_generator.py @@ -27,6 +27,7 @@ from .numeric_util import round_up_divide from .operation import create_activation_function from .operation import NpuBlockType from 
.operation import Op +from .shape4d import Shape4D from .tensor import TensorPurpose @@ -90,8 +91,8 @@ def generate_high_level_command_stream_for_pass(strat, passes, block_configs, id weight_tensor = ps.weight_tensor scale_tensor = ps.scale_tensor - ofm_start = [0] * len(ofm_shape) - ofm_end = list(ofm_shape) + ofm_start = [0, 0, 0, 0] + ofm_end = ofm_shape.as_list() strides = None skirt = None @@ -100,9 +101,9 @@ def generate_high_level_command_stream_for_pass(strat, passes, block_configs, id strides = ps.primary_op.attrs.get("strides", None) skirt = ps.primary_op.attrs.get("skirt", None) if ps.primary_op.type == Op.Conv2DBackpropInputSwitchedBias: - upscaling = ofm_shape[-3] // ifm_shape[-3] + upscaling = ofm_shape.height // ifm_shape.height elif ps.primary_op.type == Op.ResizeBilinear: - upscaling = round_up_divide(ofm_shape[-3], ifm_shape[-3]) + upscaling = round_up_divide(ofm_shape.height, ifm_shape.height) concat_axis = 0 concat_offset = 0 @@ -135,14 +136,7 @@ def generate_high_level_command_stream_for_pass(strat, passes, block_configs, id if ifm_shape is not None: ifm_box, _, _ = ofm_box.transform_with_strides_and_skirt( - strides, - skirt, - ifm_tensor.shape, - npu_block_type, - concat_axis, - concat_offset, - split_offsets[0], - upscaling, + strides, skirt, ifm_shape, npu_block_type, concat_axis, concat_offset, split_offsets[0], upscaling, ) else: ifm_box = Box([], []) @@ -163,7 +157,7 @@ def generate_high_level_command_stream_for_pass(strat, passes, block_configs, id intermediate_box, _, _ = ofm_box.transform_with_strides_and_skirt( strides, skirt, - intermediate.shape, + Shape4D(intermediate.shape), npu_block_type, concat_axis, concat_offset, @@ -212,6 +206,7 @@ def generate_high_level_command_stream_for_pass(strat, passes, block_configs, id ) elif strat == SchedulingStrategy.IfmStream: + assert ifm_shape is not None y_step = block_config[0] y_start = ofm_start[-3] y_dim = ofm_end[-3] @@ -222,8 +217,7 @@ def generate_high_level_command_stream_for_pass(strat, passes, block_configs, id prev_pass_gen = generate_high_level_command_stream_for_pass(strat, passes, block_configs, idx - 1) else: ifm_y_present = 1 - if len(ifm_shape) >= 3: - ifm_y_present = ifm_shape[-3] + ifm_y_present = ifm_shape.height prev_pass_gen = [] prev_pass = None @@ -276,7 +270,7 @@ def generate_high_level_command_stream_for_pass(strat, passes, block_configs, id intermediate_box, _, _ = ofm_box.transform_with_strides_and_skirt( strides, skirt, - intermediate.shape, + Shape4D(intermediate.shape), npu_block_type, concat_axis, concat_offset, @@ -380,13 +374,13 @@ def calc_allowed_ofm_ifm_overlap_for_pass_list(strat, passes, block_configs): if cmd.is_npu_pass_command(): if cmd.is_first: ifm_read = cmd.ifm_tensor.address_offset_for_coordinate( - cmd.ifm_box.start_coord, shape=cmd.ps.ifm_shapes[0], is_top_box=False + cmd.ifm_box.start_coord, cmd.ps.ifm_shapes[0].as_list(), is_top_box=False ) if ifm_read is None: return 0 if cmd.is_last: write_offset = cmd.ofm_tensor.address_offset_for_coordinate( - cmd.ofm_box.end_coord, shape=cmd.ps.ofm_shapes[0], is_top_box=True + cmd.ofm_box.end_coord, cmd.ps.ofm_shapes[0].as_list(), is_top_box=True ) if write_offset is None: return 0 @@ -399,7 +393,7 @@ def calc_allowed_ofm_ifm_overlap_for_pass_list(strat, passes, block_configs): if cmd.is_first: ifm_read = cmd.ifm_tensor.address_offset_for_coordinate( - cmd.ifm_box.end_coord, shape=cmd.ps.ifm_shapes[0], is_top_box=True + cmd.ifm_box.end_coord, cmd.ps.ifm_shapes[0].as_list(), is_top_box=True ) min_overlap = max(min_overlap, 0) 
diff --git a/ethosu/vela/high_level_command_to_npu_op.py b/ethosu/vela/high_level_command_to_npu_op.py index 9380374e..07117025 100644 --- a/ethosu/vela/high_level_command_to_npu_op.py +++ b/ethosu/vela/high_level_command_to_npu_op.py @@ -58,6 +58,7 @@ from .register_command_stream_generator import generate_command_stream from .register_command_stream_util import BASE_PTR_INDEX_MEM2MEM from .register_command_stream_util import to_npu_kernel from .register_command_stream_util import UNARY_ELEMWISE_OPS +from .shape4d import Shape4D from .tensor import MemType from .tensor import Tensor from .tensor import TensorBlockTraversal @@ -231,7 +232,7 @@ def get_ofm_quantization(ps, tens: Tensor) -> Optional[NpuQuantization]: return NpuQuantization(scale_f32=ofm_quant.scale_f32, zero_point=zero_point) -def create_feature_map(tens: Tensor, box: Box, arch: ArchitectureFeatures, fm_shape: List[int]) -> NpuFeatureMap: +def create_feature_map(tens: Tensor, box: Box, arch: ArchitectureFeatures, fm_shape: Shape4D) -> NpuFeatureMap: """Creates feature map with common fields populated""" fm = NpuFeatureMap() fm.region = get_region(tens, arch) diff --git a/ethosu/vela/nn_graph.py b/ethosu/vela/nn_graph.py index 67925176..d2c848ad 100644 --- a/ethosu/vela/nn_graph.py +++ b/ethosu/vela/nn_graph.py @@ -21,8 +21,10 @@ # Subgraph - Holds a neural network subgraph, pointing at Tensors, Operations, Passes, and CascadedPasses. # Graph - A full neural network graph with one or more Subgraphs. import enum +from typing import List from .operation import Op +from .shape4d import Shape4D class PassPlacement(enum.Enum): @@ -58,8 +60,8 @@ class Pass: self.name = name self.cascade = None self.placement = placement - self.ifm_shapes = [] - self.ofm_shapes = [] + self.ifm_shapes: List[Shape4D] = [] + self.ofm_shapes: List[Shape4D] = [] # TODO: rename is_element_wise because it is not the same as an ElementWise operator. It is used by the tensor # allocation and requires that the OFM and IFM has the exact same address. Essentially complete overlap. 
diff --git a/ethosu/vela/npu_performance.py b/ethosu/vela/npu_performance.py index c2ec4424..4ca46831 100644 --- a/ethosu/vela/npu_performance.py +++ b/ethosu/vela/npu_performance.py @@ -48,7 +48,7 @@ def rolling_buffer_dims_from_passes(arch, ps1, block_config_ps1, ps2, block_conf if ps2.npu_block_type in (NpuBlockType.ConvolutionMxN, NpuBlockType.VectorProduct): op = ps2.primary_op - ifm_block_depth = arch.calc_ifm_block_depth(op.ifm_shapes[0][-1], op.ifm.dtype.size_in_bits()) + ifm_block_depth = arch.calc_ifm_block_depth(op.ifm_shapes[0].depth, op.ifm.dtype.size_in_bits()) else: ifm_block_depth = block_config_ps2[-1] @@ -231,9 +231,9 @@ def estimate_conv_pooling_cycles( arch.config.ofm_ublock.height == 2 and npu_block_type in (NpuBlockType.ConvolutionMxN, NpuBlockType.ConvolutionDepthWise, NpuBlockType.VectorProduct) - and ofm_tens_shape[1] == 1 + and ofm_tens_shape.height == 1 # Optimisation only applies for even width tensors - and ofm_tens_shape[2] % 2 == 0 + and ofm_tens_shape.width % 2 == 0 and kernel_dims[0] == 1 ): ofm_ublock.width = 4 @@ -319,14 +319,14 @@ def estimate_conv_pooling_cycles( cycles_dpu_blk += delay_cycles if npu_block_type in (NpuBlockType.ConvolutionMxN, NpuBlockType.VectorProduct, NpuBlockType.ReduceSum): - cycles_dpu_blk *= numeric_util.round_up_divide(ifm_tens_shape[3], ifm_block.depth) + cycles_dpu_blk *= numeric_util.round_up_divide(ifm_tens_shape.depth, ifm_block.depth) cycles_dpu_blk /= arch.ncores num_ofm_blk = ( - numeric_util.round_up_divide(ofm_tens_shape[1], ofm_block.height) - * numeric_util.round_up_divide(ofm_tens_shape[2], ofm_block.width) - * numeric_util.round_up_divide(ofm_tens_shape[3], ofm_block.depth) + numeric_util.round_up_divide(ofm_tens_shape.height, ofm_block.height) + * numeric_util.round_up_divide(ofm_tens_shape.width, ofm_block.width) + * numeric_util.round_up_divide(ofm_tens_shape.depth, ofm_block.depth) ) cycles_output_blk = estimate_output_cycles( @@ -336,7 +336,7 @@ def estimate_conv_pooling_cycles( if scale_tensor: cycles_bias_blk = ( 10 - * min(ofm_block.depth, ofm_tens_shape[3]) + * min(ofm_block.depth, ofm_tens_shape.depth) * arch.memory_latency[scale_tensor.mem_area][BandwidthDirection.Read] / 256 ) @@ -420,8 +420,8 @@ def performance_metrics_for_pass(arch, ps, block_config=None, rewrite_list=None, npu_block_type = primary_op.type.npu_block_type ifm_tensor, _, weight_tensor, ofm_tensor = ps.get_primary_op_ifm_ifm2_weights_ofm() - ifm_tensor_shape = list(ps.primary_op.ifm_shapes[0]) - ofm_tensor_shape = list(ps.primary_op.ofm_shapes[0]) + ifm_tensor_shape = ps.primary_op.ifm_shapes[0].clone() + ofm_tensor_shape = ps.primary_op.ofm_shapes[0].clone() if npu_block_type == NpuBlockType.ReduceSum: block_traversal = TensorBlockTraversal.DepthFirst @@ -434,7 +434,7 @@ def performance_metrics_for_pass(arch, ps, block_config=None, rewrite_list=None, else: block_traversal = TensorBlockTraversal.Default ifm_block_depth = get_ifm_block_depth( - npu_block_type, ifm_tensor_shape[3], ifm_tensor.dtype.size_in_bits(), block_traversal, ofm_block.depth + npu_block_type, ifm_tensor_shape.depth, ifm_tensor.dtype.size_in_bits(), block_traversal, ofm_block.depth ) ifm_block = arch.get_ifm_block_size( ifm_block_depth, ofm_block, primary_op.kernel, ifm_resampling_mode=ifm_tensor.resampling_mode @@ -448,11 +448,12 @@ def performance_metrics_for_pass(arch, ps, block_config=None, rewrite_list=None, NpuBlockType.ReduceSum, ): # extent the ifm to full dimension - batch_size = ifm_tensor_shape[0] + + batch_size = ifm_tensor_shape.batch # add in padding - 
ifm_tensor_shape[1] += explicit_padding[0] + explicit_padding[2] # height += top and bottom - ifm_tensor_shape[2] += explicit_padding[1] + explicit_padding[3] # width += left and right + ifm_tensor_shape.height += explicit_padding[0] + explicit_padding[2] # height += top and bottom + ifm_tensor_shape.width += explicit_padding[1] + explicit_padding[3] # width += left and right if npu_block_type != NpuBlockType.Pooling: if npu_block_type == NpuBlockType.ReduceSum: @@ -468,9 +469,9 @@ def performance_metrics_for_pass(arch, ps, block_config=None, rewrite_list=None, weight_tensor_bandwidth_compression_scale = weight_tensor.bandwidth_compression_scale nn_ops = ( - int(ofm_tensor_shape[0]) - * int(ofm_tensor_shape[1]) - * int(ofm_tensor_shape[2]) + int(ofm_tensor_shape.batch) + * int(ofm_tensor_shape.height) + * int(ofm_tensor_shape.width) * int(weight_tensor_shape[0]) * int(weight_tensor_shape[1]) * int(weight_tensor_shape[2]) @@ -481,7 +482,7 @@ def performance_metrics_for_pass(arch, ps, block_config=None, rewrite_list=None, primary_op.attrs["ksize"][1], primary_op.attrs["ksize"][2], 1, - ifm_tensor_shape[3], + ifm_tensor_shape.depth, ] weight_tensor_bandwidth_shape = weight_tensor_shape weight_tensor_element_size = 0 @@ -504,8 +505,8 @@ def performance_metrics_for_pass(arch, ps, block_config=None, rewrite_list=None, replacement_read_bws[ifm_tensor] = ifm_tensor.bandwidth() * ifm_read_multiple weight_read_multiple = numeric_util.round_up_divide( - ofm_tensor_shape[1], ofm_block.height - ) * numeric_util.round_up_divide(ofm_tensor_shape[2], ofm_block.width) + ofm_tensor_shape.height, ofm_block.height + ) * numeric_util.round_up_divide(ofm_tensor_shape.width, ofm_block.width) replacement_read_bws[weight_tensor] = ( batch_size * shape_num_elements(weight_tensor_bandwidth_shape) diff --git a/ethosu/vela/operation.py b/ethosu/vela/operation.py index be26a26b..c80e18b5 100644 --- a/ethosu/vela/operation.py +++ b/ethosu/vela/operation.py @@ -26,6 +26,7 @@ from typing import TYPE_CHECKING from .errors import VelaError from .numeric_util import full_shape +from .shape4d import Shape4D if TYPE_CHECKING: @@ -372,7 +373,7 @@ def create_activation_function(op_type: Op) -> ActivationFunction: return act -def get_slice_offsets(input_shape, offset_tens, offset_mask, is_begin=True): +def get_slice_offsets(input_shape: List[int], offset_tens: int, offset_mask: int, is_begin: bool = True): # For strided slice operator: get start or end offsets offsets = len(input_shape) * [0] if is_begin else input_shape[:] for idx in range(len(input_shape)): @@ -427,8 +428,8 @@ class Operation: self.op_index = None # input network operator index self.activation_lut = None self._kernel = None - self.ifm_shapes = [] - self.ofm_shapes = [] + self.ifm_shapes: List[Shape4D] = [] + self.ofm_shapes: List[Shape4D] = [] def clone(self, suffix="_clone"): res = Operation(self.type, self.name + suffix) @@ -707,6 +708,9 @@ class Operation: raise VelaError("\n".join(lines)) def set_ifm_ofm_shapes(self): + self.ifm_shapes = [] + self.ofm_shapes = [] + ifm_tensor, ifm2_tensor, weight_tensor, ofm_tensor = self.get_ifm_ifm2_weights_ofm() # set all shapes to op, as 4D @@ -716,24 +720,24 @@ class Operation: batch_size = elms // n_in_elems assert batch_size * n_in_elems == elms - self.ifm_shapes.append([batch_size, 1, 1, n_in_elems]) - self.ofm_shapes.append(ofm_tensor.get_full_shape()) + self.ifm_shapes.append(Shape4D([batch_size, 1, 1, n_in_elems])) + self.ofm_shapes.append(Shape4D(ofm_tensor.get_full_shape())) elif self.type == Op.Softmax: - 
self.ifm_shapes.append(ifm_tensor.get_full_shape()) - self.ofm_shapes.append(ofm_tensor.get_full_shape()) + self.ifm_shapes.append(Shape4D(ifm_tensor.get_full_shape())) + self.ofm_shapes.append(Shape4D(ofm_tensor.get_full_shape())) elif self.type.is_split_op or self.type.is_concat_op(): for inp in self.inputs: if inp is not None: - self.ifm_shapes.append(full_shape(4, inp.shape, 1)) + self.ifm_shapes.append(Shape4D(full_shape(4, inp.shape, 1))) else: self.ifm_shapes.append(None) for out in self.outputs: if out is not None: - self.ofm_shapes.append(full_shape(4, out.shape, 1)) + self.ofm_shapes.append(Shape4D(full_shape(4, out.shape, 1))) else: self.ofm_shapes.append(None) else: - self.ifm_shapes.append(full_shape(4, ifm_tensor.shape, 1)) + self.ifm_shapes.append(Shape4D(full_shape(4, ifm_tensor.shape, 1))) if ifm2_tensor is not None: - self.ifm_shapes.append(full_shape(4, ifm2_tensor.shape, 1)) - self.ofm_shapes.append(full_shape(4, ofm_tensor.shape, 1)) + self.ifm_shapes.append(Shape4D(full_shape(4, ifm2_tensor.shape, 1))) + self.ofm_shapes.append(Shape4D(full_shape(4, ofm_tensor.shape, 1))) diff --git a/ethosu/vela/pass_packing.py b/ethosu/vela/pass_packing.py index 095a78d4..8f6660c2 100644 --- a/ethosu/vela/pass_packing.py +++ b/ethosu/vela/pass_packing.py @@ -231,9 +231,9 @@ def pack_into_passes(nng, arch, verbose_packing=False): ofm_tensor = op.ofm if ofm_tensor is None: ofm_tensor = op.outputs[0] - build_pass((op,), ofm_tensor) + build_pass((op,), ofm_tensor, op.ofm_shapes[0].clone()) - def build_pass(start_ops_to_process, ofm_tensor=None): + def build_pass(start_ops_to_process, ofm_tensor=None, ofm_shapes=None): reverse_ops_list = [] curr_flags = PassFlags.Empty npu_block_type = NpuBlockType.Default @@ -416,8 +416,7 @@ def pack_into_passes(nng, arch, verbose_packing=False): ps.ifm_shapes.append(ps.primary_op.ifm_shapes[0]) ps.ofm_tensor = ofm_tensor - if ps.primary_op is not None: - ps.ofm_shapes.append(ps.primary_op.ofm_shapes[0]) + ps.ofm_shapes.append(ofm_shapes) assert ps.placement != PassPlacement.Npu or ps.ofm_tensor is not None ps.weight_tensor = ps.get_primary_op_ifm_weights()[1] @@ -453,11 +452,11 @@ def pack_into_passes(nng, arch, verbose_packing=False): avgpool_out = inp.clone("_avgpooled") avgpool_out.consumer_list.append(op) avgpool_op.set_output_tensor(avgpool_out) - avgpool_op.ifm_shapes = op.ifm_shapes - avgpool_op.ofm_shapes = op.ofm_shapes + avgpool_op.set_ifm_ofm_shapes() op.inputs[0] = avgpool_out op_list.insert(0, avgpool_op) + op.set_ifm_ofm_shapes() DebugDatabase.add_optimised(op, avgpool_op) return avgpool_op diff --git a/ethosu/vela/shape4d.py b/ethosu/vela/shape4d.py new file mode 100644 index 00000000..a1b4feaa --- /dev/null +++ b/ethosu/vela/shape4d.py @@ -0,0 +1,77 @@ +# Copyright (C) 2020 Arm Limited or its affiliates. All rights reserved. +# +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# Description: +# Defines the class Shape4D. 
+from .numeric_util import full_shape + + +class Shape4D: + """ + 4D Shape (in NHWC format) + """ + + def __init__(self, shape, base=1): + assert shape is not None + assert len(shape) <= 4 + self._shape4D = tuple(full_shape(4, shape, base)) + + def __str__(self): + return f"" + + def __eq__(self, other): + return self._shape4D == other._shape4D + + def clone(self): + return Shape4D(self.as_list()) + + @property + def batch(self): + return self._shape4D[0] + + @property + def height(self): + return self._shape4D[1] + + @property + def width(self): + return self._shape4D[2] + + @property + def depth(self): + return self._shape4D[3] + + @batch.setter + def batch(self, new_batch): + self._shape4D = (new_batch, self._shape4D[1], self._shape4D[2], self._shape4D[3]) + + @height.setter + def height(self, new_height): + self._shape4D = (self._shape4D[0], new_height, self._shape4D[2], self._shape4D[3]) + + @width.setter + def width(self, new_width): + self._shape4D = (self._shape4D[0], self._shape4D[1], new_width, self._shape4D[3]) + + @depth.setter + def depth(self, new_depth): + self._shape4D = (self._shape4D[0], self._shape4D[1], self._shape4D[2], new_depth) + + def get_dim(self, dim): + assert -4 <= dim < 4 + return self._shape4D[dim] + + def as_list(self): + return list(self._shape4D) diff --git a/ethosu/vela/shared_buffer_allocation.py b/ethosu/vela/shared_buffer_allocation.py index 1f027d60..d8faf369 100644 --- a/ethosu/vela/shared_buffer_allocation.py +++ b/ethosu/vela/shared_buffer_allocation.py @@ -32,6 +32,7 @@ from .operation import Kernel from .operation import NpuBlockType from .range_set import MemoryRangeSet from .register_command_stream_util import to_kernel +from .shape4d import Shape4D from .tensor import MemArea @@ -195,14 +196,14 @@ def shared_buffer_allocation_for_pass(arch, ps) -> SharedBufferAllocation: ifm_bits = ifm_tensor.dtype.size_in_bits() ifm_shape = ps.primary_op.ifm_shapes[0] - if ifm_shape != []: - ifm_depth = ifm_shape[-1] + if ifm_tensor.shape != []: + ifm_depth = ifm_shape.depth if is_elementwise: ifm_count = 2 if ifm_tensor.shape == []: # Scalar in ifm1 assert ifm2_tensor - ifm_depth = ps.primary_op.ifm_shapes[1][-1] + ifm_depth = ps.primary_op.ifm_shapes[1].depth ifm_count = 1 elif not ifm2_tensor or ifm2_tensor.shape == []: # Scalar in ifm2 ifm_count = 1 @@ -251,7 +252,7 @@ def shared_buffer_allocation_for_npu_op( ifm_bits=ifm_bits, ifm_depth=ifm_depth, ifm_count=ifm_count, - ofm_shape=ofm_shape, + ofm_shape=Shape4D(ofm_shape), ) @@ -265,14 +266,9 @@ def find_suitable_block_configs(arch, alloc: SharedBufferAllocation) -> List[Tup # Constrain the search space if the OFM is smaller than the max block size # - Add other block search constraints here if required - if len(alloc.ofm_shape) <= 2: - max_block_height = max_block_width = alloc.ofm_shape[0] - else: - max_block_width = alloc.ofm_shape[-2] - max_block_height = alloc.ofm_shape[-3] - - # Common block depth - max_block_depth = alloc.ofm_shape[-1] + max_block_width = alloc.ofm_shape.width + max_block_height = alloc.ofm_shape.height + max_block_depth = alloc.ofm_shape.depth # Constrain to valid ranges before search max_block_width = min(arch.ofm_block_max.width, max_block_width) diff --git a/ethosu/vela/softmax.py b/ethosu/vela/softmax.py index 98496539..3b4bace9 100644 --- a/ethosu/vela/softmax.py +++ b/ethosu/vela/softmax.py @@ -213,7 +213,7 @@ class SoftMax: ofm = self.op.outputs[0] # Reshape ifm/ofm (if needed) - full_shape = self.op.ifm_shapes[0] + full_shape = self.op.ifm_shapes[0].as_list() if 
full_shape[0] > 1: full_shape[1] *= full_shape[0] full_shape[0] = 1 @@ -414,6 +414,7 @@ class SoftMax: shr30_op.add_input_tensor(scaled_exp) shr30_op.add_input_tensor(right_shift) shr30_op.set_output_tensor(ofm) + shr30_op.set_ifm_ofm_shapes() DebugDatabase.add_optimised(self.op, shr30_op) return shr30_op @@ -535,6 +536,7 @@ class SoftMax: shr13_op.add_input_tensor(mul_ofm) shr13_op.add_input_tensor(reciprocal_right_shift) shr13_op.set_output_tensor(ofm) + shr13_op.set_ifm_ofm_shapes() DebugDatabase.add_optimised(self.op, shr13_op) return shr13_op diff --git a/ethosu/vela/tensor.py b/ethosu/vela/tensor.py index df8f8868..093e8771 100644 --- a/ethosu/vela/tensor.py +++ b/ethosu/vela/tensor.py @@ -40,6 +40,7 @@ from .ethos_u55_regs.ethos_u55_regs import resampling_mode from .numeric_util import full_shape from .operation import Op from .operation import Operation +from .shape4d import Shape4D Shape = List @@ -304,6 +305,7 @@ def create_const_tensor( # Operator const_op = Operation(Op.Const, name) const_op.set_output_tensor(const_tensor) + const_op.set_ifm_ofm_shapes() return const_tensor @@ -323,8 +325,7 @@ def create_reshape_tensor(tens, shape, ifm_reshape=True): reshape_op.add_input_tensor(reshape_ifm) reshape_op.add_input_tensor(create_const_tensor(name + "_shape", [1], DataType.int32, shape)) reshape_op.set_output_tensor(reshape_ofm) - reshape_op.ifm_shapes.append(full_shape(4, reshape_ifm.shape, 1)) - reshape_op.ofm_shapes.append(full_shape(4, reshape_ofm.shape, 1)) + reshape_op.set_ifm_ofm_shapes() return reshape_ofm if ifm_reshape else reshape_ifm @@ -608,7 +609,7 @@ class Tensor: def consumers(self) -> List[Operation]: return self.consumer_list - def addresses_for_rolling_buffer(self, start_coord: Shape, end_coord: Shape, fm_shape: Shape) -> Tuple: + def addresses_for_rolling_buffer(self, start_coord: Shape, end_coord: Shape, fm_shape: Shape4D) -> Tuple: # returns ( box_height0, box_height1, box_width, [address_tl, address_tr, address_bl, address_br] ) if self.storage_shape == []: @@ -616,7 +617,7 @@ class Tensor: 1, 1, 1, - [self.address_for_coordinate(start_coord, shape=fm_shape), None, None, None], + [self.address_for_coordinate(start_coord, shape=fm_shape.as_list()), None, None, None], ) storage_shape_4D = full_shape(4, self.storage_shape, 1) @@ -630,20 +631,20 @@ class Tensor: box_width = crossing_x - start_coord[2] addresses: List = [None] * 4 - addresses[0] = self.address_for_coordinate(start_coord, shape=fm_shape) + addresses[0] = self.address_for_coordinate(start_coord, shape=fm_shape.as_list()) if end_coord[2] > crossing_x: addresses[1] = self.address_for_coordinate( - [start_coord[0], start_coord[1], crossing_x, start_coord[3]], shape=fm_shape + [start_coord[0], start_coord[1], crossing_x, start_coord[3]], shape=fm_shape.as_list() ) raise UnsupportedFeatureError("Striping in vertical direction is not supported") if end_coord[1] > crossing_y: addresses[2] = self.address_for_coordinate( - [start_coord[0], crossing_y, start_coord[2], start_coord[3]], shape=fm_shape + [start_coord[0], crossing_y, start_coord[2], start_coord[3]], shape=fm_shape.as_list() ) if end_coord[1] > crossing_y and end_coord[2] > crossing_x: addresses[3] = self.address_for_coordinate( - [start_coord[0], crossing_y, crossing_x, start_coord[3]], shape=fm_shape + [start_coord[0], crossing_y, crossing_x, start_coord[3]], shape=fm_shape.as_list() ) return box_height0, box_height0, box_width, addresses diff --git a/ethosu/vela/test/test_graph_optimiser.py b/ethosu/vela/test/test_graph_optimiser.py index 
45377417..7fdc4bd8 100644 --- a/ethosu/vela/test/test_graph_optimiser.py +++ b/ethosu/vela/test/test_graph_optimiser.py @@ -21,6 +21,7 @@ import numpy as np from ethosu.vela.graph_optimiser import convert_batched_fc_shape from ethosu.vela.operation import Op from ethosu.vela.tensor import create_const_tensor +from ethosu.vela.tensor import Shape4D from ethosu.vela.tensor import Tensor from ethosu.vela.test import testutil @@ -35,8 +36,8 @@ def test_convert_batched_fc(): ifm.consumer_list.append(op) - op.ifm_shapes.append([4, 1, 1, 8]) - op.ofm_shapes.append([4, 1, 1, 8]) + op.ifm_shapes.append(Shape4D([4, 1, 1, 8])) + op.ofm_shapes.append(Shape4D([4, 1, 1, 8])) prev_op = op.clone() prev_op.ifm_shapes = op.ifm_shapes diff --git a/ethosu/vela/test/test_supported_operators.py b/ethosu/vela/test/test_supported_operators.py index 583821a2..973b820d 100644 --- a/ethosu/vela/test/test_supported_operators.py +++ b/ethosu/vela/test/test_supported_operators.py @@ -62,7 +62,7 @@ def test_constraint_tens_input_scalar(): def test_constraint_tens_shape_size(): # Tensors cannot be > 4D - op = testutil.create_op_with_quant_tensors(Op.Relu, [1, 1, 8, 8, 8], [1, 1, 8, 8, 8]) + op = testutil.create_op_with_quant_tensors(Op.Relu, [1, 1, 8, 8, 8], [1, 1, 8, 8, 8], set_ifm_ofm_shapes=False) assert not support.is_operator_supported(op) diff --git a/ethosu/vela/test/testutil.py b/ethosu/vela/test/testutil.py index 63f841b4..c3459501 100644 --- a/ethosu/vela/test/testutil.py +++ b/ethosu/vela/test/testutil.py @@ -75,7 +75,7 @@ def create_elemwise_op( def create_op_with_quant_tensors( - op_type, ifm_shape, ofm_shape, weights_shape=None, bias_shape=None, datatype=DataType.uint8 + op_type, ifm_shape, ofm_shape, weights_shape=None, bias_shape=None, datatype=DataType.uint8, set_ifm_ofm_shapes=True ): ifm = Tensor(ifm_shape, datatype, "in") ifm.quantization = default_quant_params() @@ -107,7 +107,9 @@ def create_op_with_quant_tensors( bias = create_const_tensor("bias", bias_shape, DataType.int32, np.zeros(bias_shape), np.int32, quantization=qp) op.add_input_tensor(bias) - op.set_ifm_ofm_shapes() + if set_ifm_ofm_shapes: + op.set_ifm_ofm_shapes() + return op -- cgit v1.2.1
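
A minimal sketch of how the Shape4D helper introduced by this patch behaves, assuming the ethosu.vela package (with this change applied) is importable; the accessors, get_dim(), clone(), setters and as_list() below are taken directly from the new ethosu/vela/shape4d.py above, while the import path and the example values are illustrative only:

    # Sketch: exercising the Shape4D accessors added in ethosu/vela/shape4d.py.
    # Assumes the patched ethosu.vela package is on PYTHONPATH.
    from ethosu.vela.shape4d import Shape4D

    # Construct from an explicit NHWC list; shorter lists are padded to 4D
    # with the base value (default 1) via numeric_util.full_shape.
    ofm_shape = Shape4D([1, 8, 8, 16])
    assert ofm_shape.batch == 1
    assert ofm_shape.height == 8
    assert ofm_shape.width == 8
    assert ofm_shape.depth == 16

    # get_dim() accepts negative indices, mirroring Python list indexing.
    assert ofm_shape.get_dim(-1) == ofm_shape.depth

    # clone() returns an independent copy; the setters rebuild the internal
    # tuple, as used for padding adjustments in npu_performance.py above.
    padded = ofm_shape.clone()
    padded.height += 2
    assert padded.height == 10 and ofm_shape.height == 8

    # as_list() converts back to a plain list where legacy code expects one.
    assert padded.as_list() == [1, 10, 8, 16]
    assert Shape4D([8, 16]).as_list() == [1, 1, 8, 16]  # padded with base=1

This is the design the patch moves toward: passes and operations carry Shape4D objects with named batch/height/width/depth fields instead of raw lists indexed with [-1], [-2], [-3], with as_list() retained as the bridge for call sites (e.g. address_offset_for_coordinate) that still take plain shape lists.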