From fdbb072dacae339dd3f8efd3fb70fa84b9296033 Mon Sep 17 00:00:00 2001 From: Rickard Bolin Date: Tue, 5 Sep 2023 11:38:19 +0000 Subject: MLBEDSW-8491: Add support for Mirror pad Change-Id: I3c13118e14195a5fb8e522a38b205b75fb07b74b Signed-off-by: Rickard Bolin --- ethosu/vela/operation.py | 2 +- ethosu/vela/range_set.py | 1 - ethosu/vela/tflite_graph_optimiser.py | 125 ++++++++++++++++++++++++++++++ ethosu/vela/tflite_mapping.py | 6 +- ethosu/vela/tflite_model_semantic.py | 3 + ethosu/vela/tflite_supported_operators.py | 31 ++++++-- 6 files changed, 160 insertions(+), 8 deletions(-) diff --git a/ethosu/vela/operation.py b/ethosu/vela/operation.py index 9b4149f..a831537 100644 --- a/ethosu/vela/operation.py +++ b/ethosu/vela/operation.py @@ -225,7 +225,7 @@ class Op(Enum): Mean = OperatorInfo(indices=NNG_IFM_INDICES) Min = OperatorInfo() Minimum = OperatorInfo(block_type=NpuBlockType.ElementWise, indices=NNG_IFM_IFM2_INDICES) - MirrorPad = OperatorInfo() + MirrorPad = OperatorInfo(indices=NNG_IFM_IFM2_INDICES) Mul = OperatorInfo(block_type=NpuBlockType.ElementWise, indices=NNG_IFM_IFM2_INDICES) Neg = OperatorInfo() NonMaxSuppressionV4 = OperatorInfo() diff --git a/ethosu/vela/range_set.py b/ethosu/vela/range_set.py index 1a00373..10a5c83 100644 --- a/ethosu/vela/range_set.py +++ b/ethosu/vela/range_set.py @@ -31,7 +31,6 @@ class RangeSet: self.ranges = ranges # track a list of (start, end) tuples, always in ascending order sorted by start. 
if start is not None and start != end: - assert start < end self.ranges.append((start, end)) def __or__(self, other): diff --git a/ethosu/vela/tflite_graph_optimiser.py b/ethosu/vela/tflite_graph_optimiser.py index cc947bc..ad979bd 100644 --- a/ethosu/vela/tflite_graph_optimiser.py +++ b/ethosu/vela/tflite_graph_optimiser.py @@ -1881,6 +1881,130 @@ def replace_pad_by_hw_pad(op: Operation, arch, nng) -> Operation: return op +def convert_mirror_pad(op: Operation, arch, nng): + if op.type != Op.MirrorPad or not op.run_on_npu: + return op + + _, (top, bot), (left, right), _ = op.ifm2.values + mode = op.attrs["mode"] # 0 = reflect, 1 = symmetric + + ifm = op.ifm + ofm = op.ofm + ofm.ops = [] + elem_size = 2 if ofm.dtype == DataType.int16 else 1 + n, h, w, c = ifm.shape + _, oh, ow, _ = ofm.shape + # Force linear format on OFM to allow negative stride multipliers + ofm.force_linear_format = True + + # Intermediate ofm needed to store ifm padded with top and bot values as input to the left and right padding + intermediate_ofm_tens = Tensor([n, h + top + bot, w, c], ofm.dtype, "intermediate_ofm_tens") + intermediate_ofm_tens.quantization = op.outputs[0].quantization.clone() + intermediate_ofm_tens.force_linear_format = True + + # If there is no left or right padding, we can write directly to the ofm without an intermediate tensor + if not (left or right): + intermediate_ofm_tens = ofm + + # Initial op to copy the ifm into the middle of the intermediate ofm + avg_pool_init = create_avgpool_nop("init_pool") + avg_pool_init.write_shape = Shape4D(n, h, w, c) + avg_pool_init.write_offset = Shape4D(0, top, 0, 0) + avg_pool_init.read_shapes[0] = Shape4D(n, h, w, c) + avg_pool_init.read_offsets[0] = Shape4D(0, 0, 0, 0) + avg_pool_init.add_input_tensor(ifm) + avg_pool_init.set_output_tensor(intermediate_ofm_tens) + avg_pool_init.set_ifm_ofm_shapes() + DebugDatabase.add_optimised(op, avg_pool_init) + + # Create pools with negative stride to mirror edges and offset to write at 
padding positions + avg_pool_pad = create_avgpool_nop("pad_pool") + for i, pad_amount in enumerate([top, bot, left, right]): + # Clear input from previous cloned op + avg_pool_pad.inputs = [] + if not pad_amount: + continue + + if i == 0: # top + # Set read and write shape width to full ifm width and height to "top" pad size + avg_pool_pad.write_shape = Shape4D(n, top, w, c) + avg_pool_pad.read_shapes[0] = Shape4D(n, top, w, c) + # Leave read offset as default to read the top chunk of the ifm + # For reflect mode, shift height offset down one step to "skip" the edge + avg_pool_pad.read_offsets[0] = Shape4D(0, 0, 0, 0) if mode == 1 else Shape4D(0, 1, 0, 0) + # Offset the base address of tile 0 to start writing just above the ifm that was copied into the middle of + # the ofm and use negative height striding to mirror the above ifm chunk + avg_pool_pad.tile_base_offsets_ofm[0] = ((top - 1) * w) * c * elem_size + if i == 1: # bot + # Set read and write shape width to full ifm width and height to "bot" pad size + avg_pool_pad.write_shape = Shape4D(n, bot, w, c) + avg_pool_pad.read_shapes[0] = Shape4D(n, bot, w, c) + # Set read offset to read the bottom chunk of the ifm + # For reflect mode, shift height offset up one step to "skip" the edge + avg_pool_pad.read_offsets[0] = Shape4D(0, h - bot, 0, 0) if mode == 1 else Shape4D(0, h - bot - 1, 0, 0) + # Offset the base address of tile 0 to start writing at the very bottom of the ofm and use negative height + # striding to mirror the above ifm chunk + avg_pool_pad.tile_base_offsets_ofm[0] = (oh - 1) * w * c * elem_size + if i == 2: # left + # Set read and write shape height to full intermediate ofm height and width to "left" pad size + avg_pool_pad.write_shape = Shape4D(n, h + top + bot, left, c) + avg_pool_pad.read_shapes[0] = Shape4D(n, h + top + bot, left, c) + # Leave read offset as default to read the leftmost chunk of the intermediate ofm + # For reflect mode, shift width offset one step to the right to "skip" the 
edge + avg_pool_pad.read_offsets[0] = Shape4D(0, 0, 0, 0) if mode == 1 else Shape4D(0, 0, 1, 0) + # Offset the base address of tile 0 to start writing just left of the intermediate ofm and use negative + # width striding to mirror the above ifm chunk + avg_pool_pad.tile_base_offsets_ofm[0] = (left - 1) * c * elem_size + if i == 3: # right + # Set read and write shape height to full intermediate ofm height and width to "right" pad size + avg_pool_pad.write_shape = Shape4D(n, h + top + bot, right, c) + avg_pool_pad.read_shapes[0] = Shape4D(n, h + top + bot, right, c) + # Set read offset to read the rightmost chunk of the intermediate ofm + # For reflect mode, shift width offset one step to the left to "skip" the edge + avg_pool_pad.read_offsets[0] = Shape4D(0, 0, w - right, 0) if mode == 1 else Shape4D(0, 0, w - right - 1, 0) + # Offset the base address of tile 0 to start writing at the rightmost part of the ofm and use negative + # width striding to mirror the above ifm chunk + avg_pool_pad.tile_base_offsets_ofm[0] = (ow - 1) * c * elem_size + + # Write offset (0,0,0,0) for all convs + avg_pool_pad.write_offset = Shape4D(0, 0, 0, 0) + + if i in [0, 1]: # negative height stride for top and bot, negative width stride for left and right + avg_pool_pad.ofm_stride_multiplier = [1, -1, 1] # C/H/W + # top and bot reads from ifm and writes to intermediate ofm + avg_pool_pad.add_input_tensor(ifm) + intermediate_ofm_tens.ops.append(avg_pool_pad) + avg_pool_pad.outputs = [intermediate_ofm_tens] + else: + avg_pool_pad.ofm_stride_multiplier = [1, 1, -1] # C/H/W + # left and right reads from intermediate ofm and writes to ofm + avg_pool_pad.add_input_tensor(intermediate_ofm_tens) + ofm.ops.append(avg_pool_pad) + avg_pool_pad.outputs = [ofm] + + avg_pool_pad.set_ifm_ofm_shapes() + DebugDatabase.add_optimised(op, avg_pool_pad) + + # Clone operation for next padding direction + avg_pool_pad = avg_pool_pad.clone(f"_{i}") + + if left or right: + # Copy intermediate ofm into final ofm 
+ avg_pool_final_copy = create_avgpool_nop("avg_pool_final_copy") + avg_pool_final_copy.write_shape = Shape4D(n, h + top + bot, w, c) + avg_pool_final_copy.write_offset = Shape4D(0, 0, left, 0) + avg_pool_final_copy.read_shapes[0] = Shape4D(n, h + top + bot, w, c) + avg_pool_final_copy.read_offsets[0] = Shape4D(0, 0, 0, 0) + + avg_pool_final_copy.add_input_tensor(intermediate_ofm_tens) + ofm.ops.append(avg_pool_final_copy) + avg_pool_final_copy.outputs = [ofm] + avg_pool_final_copy.set_ifm_ofm_shapes() + DebugDatabase.add_optimised(op, avg_pool_final_copy) + + return op + + def convert_pad(op: Operation, arch, nng): """ Rewrites PAD operator to an average pool that copies the IFM to the OFM @@ -2899,6 +3023,7 @@ def tflite_optimise_graph(nng, arch, force_symmetric_int_weights): convert_mul_max_to_abs_or_lrelu, convert_lrelu, convert_avg_pool_to_conv2d, + convert_mirror_pad, fixup_strided_conv, convert_hardswish_to_lut, rewrite_fully_connected_input, diff --git a/ethosu/vela/tflite_mapping.py b/ethosu/vela/tflite_mapping.py index 0faa079..c599019 100644 --- a/ethosu/vela/tflite_mapping.py +++ b/ethosu/vela/tflite_mapping.py @@ -867,7 +867,11 @@ builtin_operator_map = { OptionsSerializer("SquaredDifferenceOptions"), TFLITE_IFM_IFM2_INDICES, ), - BuiltinOperator.MIRROR_PAD: (Op.MirrorPad, OptionsSerializer("MirrorPadOptions", ("mode",)), TFLITE_NO_INDICES), + BuiltinOperator.MIRROR_PAD: ( + Op.MirrorPad, + OptionsSerializer("MirrorPadOptions", ("mode",)), + TFLITE_IFM_IFM2_INDICES, + ), BuiltinOperator.ABS: (Op.Abs, OptionsSerializer("AbsOptions"), TFLITE_IFM_INDICES), BuiltinOperator.SPLIT_V: (Op.SplitV, OptionsSerializer("SplitVOptions", ("num_splits",)), TFLITE_IFM_INDICES), BuiltinOperator.UNIQUE: ( diff --git a/ethosu/vela/tflite_model_semantic.py b/ethosu/vela/tflite_model_semantic.py index eff40bc..1c258de 100644 --- a/ethosu/vela/tflite_model_semantic.py +++ b/ethosu/vela/tflite_model_semantic.py @@ -258,6 +258,9 @@ class TFLiteSemantic: Op.Transpose: [ 
TFLiteSemantic.constraint_tens_quant_none_check, ], + Op.MirrorPad: [ + TFLiteSemantic.constraint_tens_quant_none_check, + ], } return generic_constraints_exclude_list diff --git a/ethosu/vela/tflite_supported_operators.py b/ethosu/vela/tflite_supported_operators.py index 48813fe..ad61fca 100644 --- a/ethosu/vela/tflite_supported_operators.py +++ b/ethosu/vela/tflite_supported_operators.py @@ -106,10 +106,17 @@ class TFLiteSupportedOperators: ) ) binary_elem_wise_main_ops = binary_elem_wise_min_max_ops | binary_elem_wise_add_mul_sub | binary_elem_wise_shift_ops + elem_wise_main_ops = binary_elem_wise_main_ops | unary_elem_wise_main_ops | set((Op.SquaredDifference,)) - pad_ops = set((Op.Pad,)) + pad_ops = set( + ( + Op.Pad, + Op.MirrorPad, + ) + ) + supported_int32_tensor_ops = ( - set((Op.ReduceSum, Op.CLZ, Op.Shape, Op.ArgMax, Op.Transpose)) + set((Op.ReduceSum, Op.CLZ, Op.Shape, Op.ArgMax, Op.Transpose, Op.MirrorPad)) | binary_elem_wise_add_mul_sub | binary_elem_wise_shift_ops ) @@ -312,9 +319,13 @@ class TFLiteSupportedOperators: self.specific_constraints[Op.StridedSlice].append(TFLiteSupportedOperators.constraint_stridedslice_offset_false) # Pad specific checks: - self.specific_constraints[Op.Pad].append(TFLiteSupportedOperators.constraint_pad_shape) - self.specific_constraints[Op.Pad].append(TFLiteSupportedOperators.constraint_padding_dimensions) - self.specific_constraints[Op.Pad].append(TFLiteSupportedOperators.constraint_pad_type) + for op_type in TFLiteSupportedOperators.pad_ops: + self.specific_constraints[op_type].append(TFLiteSupportedOperators.constraint_pad_shape) + self.specific_constraints[op_type].append(TFLiteSupportedOperators.constraint_padding_dimensions) + self.specific_constraints[op_type].append(TFLiteSupportedOperators.constraint_pad_type) + + # Mirror pad specific checks: + self.specific_constraints[Op.MirrorPad].append(TFLiteSupportedOperators.constraint_mirror_pad_padding_values) # Mean specific checks: 
self.specific_constraints[Op.Mean].append(TFLiteSupportedOperators.constraint_mean_height_width_product) @@ -817,6 +828,17 @@ class TFLiteSupportedOperators: valid = sum(pad_tensor[0, :]) == 0 return valid, f"First dimension padding: {pad_tensor[0,:]}, last dimension padding: {pad_tensor[-1,:]}" + @staticmethod + def constraint_mirror_pad_padding_values(op): + "The number of pad values for each direction must not be larger than the ifm size in that dimension" + pad_tensor = op.inputs[1].values + ifm_shape = op.inputs[0].shape + valid = True + for dim_padding, ifm_dim_shape in zip(pad_tensor, ifm_shape): + if any(dim_padding > ifm_dim_shape): + valid = False + return valid, f"IFM shape: {ifm_shape}, number of padding values per dimension: {pad_tensor}" + @staticmethod def constraint_stridedslice_stride_values(op): "All Strides values must be 1" -- cgit v1.2.1