From e3b1b91c450129308a3a1d466a2f2876a5a872b1 Mon Sep 17 00:00:00 2001
From: Patrik Gustavsson
Date: Tue, 9 Feb 2021 15:38:46 +0100
Subject: MLBEDSW-3774 Remove SplitSliceRead

Removed SplitSliceRead from subgraph during graph optimisation.

Signed-off-by: Patrik Gustavsson
Change-Id: I9315d4c2a6767828dd2b4e66823d73b10ebee99c
---
 ethosu/vela/graph_optimiser.py                     | 51 ++++++++++++++++++++--
 ethosu/vela/high_level_command_stream.py           |  4 +-
 ethosu/vela/high_level_command_stream_generator.py | 32 ++------------
 ethosu/vela/operation.py                           |  3 ++
 ethosu/vela/pass_packing.py                        | 16 +------
 5 files changed, 57 insertions(+), 49 deletions(-)

diff --git a/ethosu/vela/graph_optimiser.py b/ethosu/vela/graph_optimiser.py
index eb93106e..50368b86 100644
--- a/ethosu/vela/graph_optimiser.py
+++ b/ethosu/vela/graph_optimiser.py
@@ -164,10 +164,8 @@ def rewrite_split_ops(tens, arch, nng):
             # If start offset is not a multiple of 16 in the C-dimension, NHCWB16 need to be avoided in the input
             if (offset_start[-1] % 16) != 0:
                 inp.avoid_NHCWB16 = True
-        else:
-            offset_start = full_shape(4, offset_start, 0)

-        new_op.attrs["split_start"] = offset_start
+        new_op.read_offsets[0] = Shape4D.from_list(offset_start, 0)
         new_op.run_on_npu = True
         new_op.set_output_tensor(tens)
         new_op.ifm_shapes.append(Shape4D(inp.shape))
@@ -177,6 +175,45 @@ def rewrite_split_ops(tens, arch, nng):
     return tens


+def remove_SplitSliceRead(op, arch):
+
+    if op.type == Op.SplitSliceRead:
+        # Check if it is possible to put the SplitSliceRead on the tensor consumer, or if an avgpool need to be inserted
+        if (
+            len(op.ofm.consumer_list) == 1
+            and op.ofm.consumer_list[0] is not None
+            and op.ofm.consumer_list[0].run_on_npu
+            and op.ofm.consumer_list[0].type != Op.Reshape
+            and op.ofm_shapes[0] == Shape4D.from_list(op.ofm.shape)
+        ):
+            # SplitSliceRead can be performed by tensor consumer
+            cons_op = op.ofm.consumer_list[0]
+            if cons_op.ifm == op.ofm:
+                cons_op.read_offsets[0] = op.read_offsets[0]
+                cons_op.set_input_tensor(op.ifm, cons_op.type.info.indices.ifms[0])
+                cons_op.ifm_shapes[0] = op.ifm_shapes[0]
+            elif cons_op.type.is_binary_elementwise_op() and cons_op.ifm2 == op.ofm:
+                cons_op.read_offsets[1] = op.read_offsets[0]
+                cons_op.set_input_tensor(op.ifm, cons_op.type.info.indices.ifms[1])
+                cons_op.ifm_shapes[1] = op.ifm_shapes[0]
+
+            op.ofm.consumer_list.remove(cons_op)
+            op.ofm.ops = []
+            op.ifm.consumer_list.remove(op)
+        else:
+            avgpool_op = create_avgpool_nop(op.name + "_avgpool")
+            avgpool_op.add_input_tensor(op.ifm)
+            avgpool_op.outputs = [op.ofm]
+            op.ofm.ops.remove(op)
+            op.ofm.ops.append(avgpool_op)
+            avgpool_op.ifm_shapes.append(op.ifm_shapes[0])
+            avgpool_op.ofm_shapes.append(op.ofm_shapes[0])
+            avgpool_op.read_offsets[0] = op.read_offsets[0]
+
+            op.ifm.consumer_list.remove(op)
+            DebugDatabase.add_optimised(op, avgpool_op)
+
+
 def insert_copy_op_after_tens(tens):

     tens_cons_list_copy = tens.consumer_list.copy()
@@ -202,7 +239,7 @@ def fix_sg_input_output(op, arch, nng):
     if not op.run_on_npu or op.type != Op.Reshape:
         return op

-    # For the memory operators we want to remove, tensors are removed.
+    # For the Reshape operators we want to remove, tensors are removed.
     # But in order to to do this, they cannot be outputs of the sg,
     # this need to be fixed prior to the removal.
     # Solution is to add a avgpool NOP, to maintain the original tensor.
@@ -1295,6 +1332,12 @@ def optimise_graph_a(nng, arch, verbose_graph=False):
            [fuse_activation_function_with_prev, optimise_pad, add_padding_fields],
        )

+    # Removal of SplitSliceRead, need to be done after optimisation has been performed,
+    # since ifm/ofm_shapes are of importance to this function
+    for sg in nng.subgraphs:
+        rewrite_graph.visit_graph_post_order(sg.output_tensors, arch, [], [remove_SplitSliceRead])
+        sg.refresh_after_modification()
+
     # Post-optimisation operator debug tracing, and checking that no undesired reshapes are left in the graph
     for sg in nng.subgraphs:
         rewrite_graph.visit_graph_post_order(sg.output_tensors, arch, [], [check_reshapes, _record_optimised])
diff --git a/ethosu/vela/high_level_command_stream.py b/ethosu/vela/high_level_command_stream.py
index c25c023e..0ce8fac2 100644
--- a/ethosu/vela/high_level_command_stream.py
+++ b/ethosu/vela/high_level_command_stream.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2020 Arm Limited or its affiliates. All rights reserved.
+# Copyright (C) 2020-2021 Arm Limited or its affiliates. All rights reserved.
 #
 # SPDX-License-Identifier: Apache-2.0
 #
@@ -41,7 +41,7 @@ class Box:
         npu_block_type: NpuBlockType,
         concat_axis: int = 0,
         concat_offset: int = 0,
-        split_offset: int = None,
+        split_offset: Shape4D = None,
         k_height: int = 1,
         upscaling_factor: int = 1,
     ):
diff --git a/ethosu/vela/high_level_command_stream_generator.py b/ethosu/vela/high_level_command_stream_generator.py
index 97b42aeb..c2027e0f 100644
--- a/ethosu/vela/high_level_command_stream_generator.py
+++ b/ethosu/vela/high_level_command_stream_generator.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2020 Arm Limited or its affiliates. All rights reserved.
+# Copyright (C) 2020-2021 Arm Limited or its affiliates. All rights reserved.
 #
 # SPDX-License-Identifier: Apache-2.0
 #
@@ -38,46 +38,20 @@ def dma_if_necessary(ps, box, tensor):
         yield DMA(ps, in_tensor, tensor, box)


-def match_tensor(source, derived):
-    if source == derived:
-        return True
-    ops = derived.ops
-    return ops != [] and len(ops) == 1 and ops[0].type == Op.SplitSliceRead and source == ops[0].inputs[0]
-
-
 def generate_high_level_command_stream_for_pass(strat, passes, block_configs, idx):
     is_first = idx == 0
     is_last = idx == len(passes) - 1
     ps = passes[idx]
     block_config = block_configs[idx]
     npu_block_type = ps.npu_block_type
-    split_offsets = [None, None]  # offset for [ifm, ifm2]
+    split_offsets = list(ps.primary_op.read_offsets)  # offset for [ifm, ifm2]

     if ps.ifm_tensor is not None and ps.ifm2_tensor is not None and npu_block_type == NpuBlockType.ElementWise:
         # Ensure correct ifm and ifm2 order
-        if match_tensor(ps.inputs[0], ps.primary_op.inputs[1]) and match_tensor(ps.inputs[1], ps.primary_op.inputs[0]):
+        if ps.inputs[0] == ps.primary_op.inputs[1] and ps.inputs[1] == ps.primary_op.inputs[0]:
             ps.ifm_tensor, ps.ifm2_tensor = ps.ifm2_tensor, ps.ifm_tensor
             ps.ifm_shapes[0], ps.ifm_shapes[1] = ps.ifm_shapes[1], ps.ifm_shapes[0]

-        for op in ps.ops:
-            if op.type == Op.SplitSliceRead:
-                ps.primary_op.memory_function = op.type
-                assert len(op.inputs) == 1
-                if match_tensor(ps.ifm_tensor, op.inputs[0]):
-                    split_offsets[0] = op.attrs["split_start"]
-                elif match_tensor(ps.ifm2_tensor, op.inputs[0]):
-                    split_offsets[1] = op.attrs["split_start"]
-                else:
-                    assert False
-    else:
-        ifm_idx = 0
-        for op in ps.ops:
-            if op.type == Op.SplitSliceRead:
-                assert ifm_idx < 2
-                split_offsets[ifm_idx] = op.attrs["split_start"]
-                ps.primary_op.memory_function = op.type
-                ifm_idx += 1
-
     ifm_tensor = ps.ifm_tensor
     ifm_shape = None
     if ifm_tensor.shape != []:
diff --git a/ethosu/vela/operation.py b/ethosu/vela/operation.py
index b297bed0..16431be7 100644
--- a/ethosu/vela/operation.py
+++ b/ethosu/vela/operation.py
@@ -418,6 +418,7 @@ class Operation:
         "ifm_shapes",
         "ofm_shapes",
         "rescale",
+        "read_offsets",
     )

     def __init__(self, op_type: Op, name: str):
@@ -444,6 +445,7 @@ class Operation:
         # If not none: contains rescale to be used as output scaling
         # (which overrides the ofm tensor's scale)
         self.rescale = None
+        self.read_offsets: List[Shape4D] = [None, None]  # offset for [ifm, ifm2]

     def clone(self, suffix="_clone"):
         res = Operation(self.type, self.name + suffix)
@@ -458,6 +460,7 @@ class Operation:
         res.forced_output_quantization = self.forced_output_quantization
         res.scheduled_pass = self.scheduled_pass
         res.op_index = None  # not relevant as not part of input network
+        res.read_offsets = list(self.read_offsets)

         return res

diff --git a/ethosu/vela/pass_packing.py b/ethosu/vela/pass_packing.py
index b52b159e..281c0932 100644
--- a/ethosu/vela/pass_packing.py
+++ b/ethosu/vela/pass_packing.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2020 Arm Limited or its affiliates. All rights reserved.
+# Copyright (C) 2020-2021 Arm Limited or its affiliates. All rights reserved.
 #
 # SPDX-License-Identifier: Apache-2.0
 #
@@ -42,8 +42,6 @@ class PassFlags(enum.Flag):
     PostFusingLimited = 8192


-npu_pre_ops = set((Op.SplitSliceRead,))
-
 mac_main_ops = set(
     (
         # convolutions
@@ -146,16 +144,6 @@ test_sequence = [
         # flags_to_clear
         PassFlags.Empty,
     ),
-    (
-        # ops_set
-        npu_pre_ops,
-        # incompatible_pack_flags
-        PassFlags.Cpu | PassFlags.MemoryOnly,
-        # flags_to_set
-        PassFlags.Npu | PassFlags.Mac | PassFlags.Pre | PassFlags.ElementWise,
-        # flags_to_clear
-        PassFlags.Empty,
-    ),
     (
         # ops_set
         npu_dma_ops,
@@ -437,7 +425,7 @@ def pack_into_passes(nng, arch, verbose_packing=False):
         visit_op(op, tens)

     def create_primary_op(op_list):
-        if any(op.type in (npu_pre_ops | npu_post_ops | npu_post_fuse_limited_ops) and op.run_on_npu for op in op_list):
+        if any(op.type in (npu_post_ops | npu_post_fuse_limited_ops) and op.run_on_npu for op in op_list):
             # Configure a 1x1 AvgPool and attach the op onto it
             op = op_list[0]
             inp = op.inputs[0]
--
cgit v1.2.1
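
For readers outside the Vela codebase, the standalone sketch below illustrates the idea behind this patch. It uses simplified, hypothetical stand-in classes (SimpleTensor and SimpleOp are not the real ethosu.vela Tensor/Operation API) to show how a SplitSliceRead's start offset can be recorded in a per-input read_offsets list and folded into its single consumer, with a 1x1 AvgPool NOP inserted as a fallback when the consumer cannot absorb the read. It is a minimal sketch under those assumptions, not the actual pass.

# Simplified stand-ins for illustration only; not the real ethosu.vela API.
from dataclasses import dataclass, field
from typing import List, Optional, Tuple


@dataclass
class SimpleTensor:
    name: str
    shape: List[int]
    ops: list = field(default_factory=list)             # producing ops
    consumer_list: list = field(default_factory=list)   # consuming ops


@dataclass
class SimpleOp:
    op_type: str
    name: str
    inputs: List[SimpleTensor] = field(default_factory=list)
    outputs: List[SimpleTensor] = field(default_factory=list)
    # Per-input read offset, indexed [ifm, ifm2], mirroring Operation.read_offsets in the patch.
    read_offsets: List[Optional[Tuple[int, int, int, int]]] = field(default_factory=lambda: [None, None])
    run_on_npu: bool = True

    @property
    def ifm(self):
        return self.inputs[0] if self.inputs else None

    @property
    def ofm(self):
        return self.outputs[0] if self.outputs else None


def remove_split_slice_read(op: SimpleOp) -> None:
    """Fold a SplitSliceRead into its single consumer, or replace it with an
    AvgPool NOP that carries the read offset (simplified version of the pass)."""
    if op.op_type != "SplitSliceRead":
        return

    consumers = op.ofm.consumer_list
    if len(consumers) == 1 and consumers[0].run_on_npu:
        # The consumer reads the sliced region of the original input directly.
        cons = consumers[0]
        cons.read_offsets[0] = op.read_offsets[0]
        cons.inputs[0] = op.ifm
        op.ifm.consumer_list.remove(op)
        op.ifm.consumer_list.append(cons)
        op.ofm.consumer_list.remove(cons)
        op.ofm.ops = []
    else:
        # Fallback: insert a 1x1 AvgPool NOP that performs the offset read and
        # keeps producing the tensor the remaining consumers expect.
        nop = SimpleOp("AvgPool", op.name + "_avgpool", inputs=[op.ifm], outputs=[op.ofm])
        nop.read_offsets[0] = op.read_offsets[0]
        op.ofm.ops = [nop]
        op.ifm.consumer_list.remove(op)
        op.ifm.consumer_list.append(nop)


# Tiny demo: a slice read at offset (0, 2, 0, 0) folded into a Conv2D consumer.
ifm = SimpleTensor("ifm", [1, 8, 8, 16])
sliced = SimpleTensor("sliced", [1, 4, 8, 16])
ofm = SimpleTensor("ofm", [1, 4, 8, 16])

read = SimpleOp("SplitSliceRead", "slice_read", inputs=[ifm], outputs=[sliced])
read.read_offsets[0] = (0, 2, 0, 0)
conv = SimpleOp("Conv2D", "conv", inputs=[sliced], outputs=[ofm])

ifm.consumer_list.append(read)
sliced.ops.append(read)
sliced.consumer_list.append(conv)

remove_split_slice_read(read)
print(conv.inputs[0].name, conv.read_offsets[0])  # -> ifm (0, 2, 0, 0)

In the actual pass in the patch, the consumer must additionally be an NPU operator other than Reshape with a matching ofm shape, binary elementwise consumers can take the offset on ifm2, and DebugDatabase.add_optimised records the substitution.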