From 17afa2837ad366f2da32e2bc0e2659ebb35bd1d5 Mon Sep 17 00:00:00 2001
From: Louis Verhaard <louis.verhaard@arm.com>
Date: Wed, 14 Oct 2020 08:32:41 +0200
Subject: MLBEDSW-3268: Refactor mark_tensors

- Refactored mark_tensor_purpose
- Initial weight compression is now always done in insert_dma
- Removed mark_tensor_format

Change-Id: Ic719b9bcd1d27e1390d7b9ce8cd21795139ec814
Signed-off-by: Louis Verhaard <louis.verhaard@arm.com>
---
 ethosu/vela/compiler_driver.py |   1 -
 ethosu/vela/insert_dma.py      |  24 ++-
 ethosu/vela/mark_tensors.py    | 329 ++++++++---------------------------------
 ethosu/vela/rewrite_graph.py   |  22 +--
 4 files changed, 76 insertions(+), 300 deletions(-)

(limited to 'ethosu')

diff --git a/ethosu/vela/compiler_driver.py b/ethosu/vela/compiler_driver.py
index 6c1142d1..05bf65a4 100644
--- a/ethosu/vela/compiler_driver.py
+++ b/ethosu/vela/compiler_driver.py
@@ -147,7 +147,6 @@ def compiler_driver(nng, arch, options, scheduler_options):
 
     extract_npu_subgraphs.extract_npu_subgraphs(nng, arch)
 
-    mark_tensors.mark_tensor_format(nng, arch, options.verbose_tensor_format)
     assert verify_graph_health(nng)
     if options.timing:
         start = time.time()
diff --git a/ethosu/vela/insert_dma.py b/ethosu/vela/insert_dma.py
index 56d68d13..fc1e7986 100644
--- a/ethosu/vela/insert_dma.py
+++ b/ethosu/vela/insert_dma.py
@@ -26,6 +26,7 @@ from .weight_compressor import compress_weights
 
 
 def weights_fit_sram(arch, op, tens, nng):
+    # Compresses weights and checks if they fit in SRAM
     if tens.purpose != TensorPurpose.Weights:
         return True
 
@@ -35,22 +36,17 @@ def weights_fit_sram(arch, op, tens, nng):
     elif len(tens.shape) == 2:
         min_weight_size = tens.shape[0] * arch.OFMSplitDepth
 
-    # Need to be fit into Sram, as a double buffer
-    # Only evaluate when the compression test limit will make it impossible to fit
-    w_comp_test_limit = 2
-    if (w_comp_test_limit * min_weight_size * 2) > arch.sram_size:
-        # check worst compression ratio
-        npu_block_type = op.attrs.get("npu_block_type", NpuBlockType.Default)
-        compress_weights(arch, nng, tens, npu_block_type, 16, 16, op.get_dilation_h_w())
+    compress_weights(arch, nng, tens, op.type.npu_block_type, 16, 16, op.get_dilation_h_w())
 
-        worst_buffer_size = tens.compression_scale_for_worst_weight_stream * min_weight_size * 2
-        if worst_buffer_size > arch.sram_size:
-            print(
-                "Weights, {}, are too big to be DMAed to SRAM, estimated minimum size is {} bytes".format(
-                    tens.name, worst_buffer_size
-                )
+    # Need to be fit into Sram, as a double buffer
+    worst_buffer_size = tens.compression_scale_for_worst_weight_stream * min_weight_size * 2
+    if worst_buffer_size > arch.sram_size:
+        print(
+            "Weights, {}, are too big to be DMAed to SRAM, estimated minimum size is {} bytes".format(
+                tens.name, worst_buffer_size
             )
-            return False
+        )
+        return False
     return True
 
 
diff --git a/ethosu/vela/mark_tensors.py b/ethosu/vela/mark_tensors.py
index 206d8365..1379628a 100644
--- a/ethosu/vela/mark_tensors.py
+++ b/ethosu/vela/mark_tensors.py
@@ -14,295 +14,82 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # Description:
-# Mark purpose and select formats for Tensors. Also compresses the weights.
-from . import rewrite_graph
-from . import weight_compressor
+# Mark purpose and select formats for Tensors.
 from .errors import OperatorError
 from .operation import CustomType
 from .operation import Op
+from .rewrite_graph import visit_graph_post_order
 from .tensor import MemType
 from .tensor import TensorFormat
 from .tensor import TensorPurpose
 
 
-def purpose_from_list(lst):
-    def purpose(op, idx):
-        return lst[idx]
-
-    return purpose
-
-
-def all_fm(op, idx):
-    return TensorPurpose.FeatureMap
-
-
-def all_parameter(op, idx):
-    return TensorPurpose.FeatureMap
-
-
-def input0_from_output_rest_parameter(op, idx):
-    if idx == 0:
-        res = op.outputs[0].purpose
-        if res == TensorPurpose.Unknown:
-            print("Warning: Propagating unknown tensor purpose", op)
-        return res
-    return TensorPurpose.FeatureMap
-
-
-def inputs_from_output(op, idx):
-    res = op.outputs[0].purpose
-    if res == TensorPurpose.Unknown:
-        print("Warning: Propagating unknown tensor purpose", op)
-    return res
-
+def get_format(purpose, arch):
+    if purpose in (TensorPurpose.FeatureMap, TensorPurpose.LUT, TensorPurpose.Scratch):
+        fmt = arch.default_feature_map_format
+    elif purpose == TensorPurpose.Weights:
+        fmt = arch.default_weight_format
+    elif purpose == TensorPurpose.Unknown:
+        fmt = TensorFormat.Unknown
+    else:
+        assert 0, "unknown tensor purpose {}".format(purpose)
+    return fmt
+
+
+def mark_purpose(tens, arch, purpose):
+    # Sets tensor's purpose, format, mem_area and mem_type
+    if tens.purpose == TensorPurpose.Unknown:
+        tens.purpose = purpose
+    elif tens.purpose not in (purpose, TensorPurpose.LUT):
+        assert 0, "Cannot resolve tensor purpose {} and {} for tensor {}".format(tens.purpose, purpose, tens)
+    fmt = get_format(purpose, arch)
+    tens.set_format(fmt, arch)
+    tens.mem_area = arch.tensor_storage_mem_area[tens.purpose]
+    tens.mem_type = arch.tensor_storage_mem_type[tens.purpose]
+
+    if len(tens.ops) == 1 and tens.ops[0].type == Op.Const:
+        tens.mem_area = arch.permanent_storage_mem_area  # special case constants, as they must be in permanent storage
+        tens.mem_type = MemType.Permanent_NPU
+
+
+def rewrite_mark_tensor_purpose(op, arch):
+    # find disconnected outputs and mark as feature maps
+    for tens in op.outputs:
+        if not tens.consumers():
+            mark_purpose(tens, arch, TensorPurpose.FeatureMap)
+    weight_tensors = op.get_weight_tensors()
+    for tens in op.inputs:
+        if tens.purpose != TensorPurpose.Unknown:
+            purpose = tens.purpose
+        elif tens in weight_tensors:
+            purpose = TensorPurpose.Weights
+        else:
+            purpose = TensorPurpose.FeatureMap
+        mark_purpose(tens, arch, purpose)
+    if op.type == Op.Reshape:
+        # Reshape's input and output point to same data
+        op.ofm.mem_area = op.ifm.mem_area
 
-tensor_purposes = [  # ops, input_purpose
-    (
-        set(
-            (
-                Op.Relu,
-                Op.Relu6,
-                Op.Rsqrt,
-                Op.Abs,
-                Op.Cast,
-                Op.Exp,
-                Op.Floor,
-                Op.FloorDiv,
-                Op.FloorMod,
-                Op.SquaredDifference,
-                Op.AddN,
-                Op.Maximum,
-                Op.Minimum,
-                Op.Sigmoid,
-                Op.Tanh,
-                Op.AvgPool,
-                Op.MaxPool,
-                Op.Squeeze,
-                Op.Softmax,
-                Op.LRN,
-                Op.BatchMatMul,
-                Op.ZerosLike,
-                Op.Mul,
-                Op.Add,
-                Op.Sub,
-                Op.Div,
-                Op.LeakyRelu,
-                Op.CLZ,
-                Op.SHL,
-                Op.SHR,
-                Op.ReduceSum,
-            )
-        ),
-        all_fm,
-    ),
-    (
-        set((Op.Conv2D, Op.MatMul, Op.Conv2DBias, Op.DepthwiseConv2DBias, Op.FullyConnected,)),
-        purpose_from_list([TensorPurpose.FeatureMap, TensorPurpose.Weights, TensorPurpose.FeatureMap]),
-    ),
-    (
-        set((Op.Conv2DBackpropInputSwitchedBias,)),
-        purpose_from_list(
-            [TensorPurpose.FeatureMap, TensorPurpose.Weights, TensorPurpose.FeatureMap, TensorPurpose.FeatureMap]
-        ),
-    ),
-    (
-        set((Op.QuantizedConv2D, Op.QuantizedMatMul)),
-        purpose_from_list(
-            [
-                TensorPurpose.FeatureMap,
-                TensorPurpose.Weights,
-                TensorPurpose.FeatureMap,
-                TensorPurpose.FeatureMap,
-                TensorPurpose.FeatureMap,
-                TensorPurpose.FeatureMap,
-            ]
-        ),
-    ),
-    (
-        set(
-            (
-                Op.Reshape,
-                Op.Min,
-                Op.Max,
-                Op.Mean,
-                Op.Pad,
-                Op.MirrorPad,
-                Op.ArgMax,
-                Op.ArgMin,
-                Op.ExpandDims,
-                Op.ResizeNearestNeighbor,
-                Op.ResizeBilinear,
-                Op.Tile,
-                Op.Transpose,
-            )
-        ),
-        purpose_from_list([TensorPurpose.FeatureMap, TensorPurpose.FeatureMap]),
-    ),
-    (
-        set((Op.QuantizedReshape,)),
-        purpose_from_list(
-            [TensorPurpose.FeatureMap, TensorPurpose.FeatureMap, TensorPurpose.FeatureMap, TensorPurpose.FeatureMap]
-        ),
-    ),
-    (
-        set((Op.Dequantize, Op.Quantize, Op.QuantizedAvgPool, Op.QuantizedMaxPool, Op.Slice, Op.SplitV,)),
-        purpose_from_list([TensorPurpose.FeatureMap, TensorPurpose.FeatureMap, TensorPurpose.FeatureMap]),
-    ),
-    (
-        set((Op.BatchToSpaceND, Op.SpaceToBatchND, Op.DepthToSpace, Op.SpaceToDepth)),
-        purpose_from_list([TensorPurpose.FeatureMap, TensorPurpose.FeatureMap, TensorPurpose.FeatureMap]),
-    ),
-    (
-        set((Op.BlockLSTM,)),
-        purpose_from_list(
-            [
-                TensorPurpose.FeatureMap,
-                TensorPurpose.FeatureMap,
-                TensorPurpose.FeatureMap,
-                TensorPurpose.FeatureMap,
-                TensorPurpose.Weights,
-                TensorPurpose.FeatureMap,
-                TensorPurpose.FeatureMap,
-                TensorPurpose.FeatureMap,
-                TensorPurpose.FeatureMap,
-            ]
-        ),
-    ),
-    (set((Op.SplitSliceRead,)), purpose_from_list([TensorPurpose.FeatureMap, TensorPurpose.FeatureMap])),
-    (set((Op.Shape, Op.ConcatSliceWrite)), purpose_from_list([TensorPurpose.FeatureMap])),
-    (
-        set((Op.StridedSlice,)),
-        purpose_from_list(
-            [TensorPurpose.FeatureMap, TensorPurpose.FeatureMap, TensorPurpose.FeatureMap, TensorPurpose.FeatureMap]
-        ),
-    ),
-    (set((Op.Fill, Op.Pack, Op.Range)), all_parameter),
-    (set((Op.Placeholder, Op.SubgraphInput, Op.Const,)), purpose_from_list([])),
-    (set((Op.FakeQuantWithMinMaxArgs,)), input0_from_output_rest_parameter),
-    (set((Op.Square, Op.Sqrt, Op.Log, Op.Less, Op.Identity,)), inputs_from_output,),
-    (None, all_fm),
-]
+    if op.type == Op.Custom and op.attrs.get("custom_type") == CustomType.ExistingNpuOp:
+        scratch_tensor = None
 
+        if len(op.inputs) >= 3:
+            scratch_tensor = op.inputs[2]  # should be existing scratch tensor
+            if scratch_tensor.name.endswith("_scratch"):
+                scratch_tensor.purpose = TensorPurpose.Scratch
 
-for ops, input_purpose in tensor_purposes:
-    if ops is None:
-        continue
+        if scratch_tensor is None:
+            OperatorError(op, "Scratch tensor not found.")
 
 
 def mark_tensor_purpose(nng, arch, verbose_tensor_purpose=False):
-    def mark_tensor_helper(tens, purpose):
-        if tens.purpose == TensorPurpose.Unknown or tens.purpose == purpose:
-            tens.purpose = purpose
-        elif tens.purpose != TensorPurpose.LUT:
-            assert 0, "Cannot resolve tensor purpose %s and %s for tensor %s" % (tens.purpose, purpose, tens)
-        tens.mem_area = arch.tensor_storage_mem_area[tens.purpose]
-        tens.mem_type = arch.tensor_storage_mem_type[tens.purpose]
-
-        if len(tens.ops) == 1 and tens.ops[0].type == Op.Const:
-            tens.mem_area = (
-                arch.permanent_storage_mem_area
-            )  # special case constants, as they must be in permanent storage
-            tens.mem_type = MemType.Permanent_NPU
-
-    def rewrite_mark_tensor_purpose(op, arch, nng):
-        # find disconnected outputs and mark as parameters
-        for tens in op.outputs:
-            if not tens.consumers():
-                mark_tensor_helper(tens, TensorPurpose.FeatureMap)
-
-        for ops, input_purpose in tensor_purposes:
-            if ops is None or op.type in ops:
-                if ops is None:
-                    print(
-                        "Warning: Don't know how to mark up purpose for",
-                        op.type,
-                        op.inputs,
-                        "triggering all feature map fallback",
-                    )
-
-                for idx, tens in enumerate(op.inputs):
-                    if tens is None:
-                        continue
-                    purpose = input_purpose(op, idx) if tens.purpose == TensorPurpose.Unknown else tens.purpose
-                    mark_tensor_helper(tens, purpose)
-
-                if op.type == Op.Reshape:
-                    # Reshape's input and output point to same data
-                    op.outputs[0].mem_area = op.inputs[0].mem_area
-
-                if op.type == Op.Custom and op.attrs.get("custom_type") == CustomType.ExistingNpuOp:
-                    scratch_tensor = None
-
-                    if len(op.inputs) >= 3:
-                        scratch_tensor = op.inputs[2]  # should be existing scratch tensor
-                        if scratch_tensor.name.endswith("_scratch"):
-                            scratch_tensor.purpose = TensorPurpose.Scratch
-
-                    if scratch_tensor is None:
-                        OperatorError(op, "Scratch tensor not found.")
-
-                break
-
-        return op
-
+    # Sets purpose, format, mem_area and mem_type for all tensors in the graph
     for sg in nng.subgraphs:
-        sg = rewrite_graph.rewrite_graph_pre_order(nng, sg, arch, [], [rewrite_mark_tensor_purpose])
+        visit_graph_post_order(sg.output_tensors, arch, [], [rewrite_mark_tensor_purpose])
         for tens in sg.output_tensors:
-            mark_tensor_helper(tens, TensorPurpose.FeatureMap)
+            mark_purpose(tens, arch, TensorPurpose.FeatureMap)
 
     if verbose_tensor_purpose:
         nng.print_graph_with_tensors()
 
     return nng
-
-
-def mark_tensor_format(nng, arch, verbose_tensor_format=False):
-    formats_for_tensor = {}
-
-    def init_tens(tens):
-        if tens.purpose in (TensorPurpose.FeatureMap, TensorPurpose.LUT):
-            fmt = arch.default_feature_map_format
-        elif tens.purpose == TensorPurpose.Weights:
-            fmt = arch.default_weight_format
-        elif tens.purpose == TensorPurpose.Scratch:
-            fmt = arch.default_feature_map_format
-        elif tens.purpose == TensorPurpose.Unknown:
-            fmt = TensorFormat.Unknown
-        else:
-            assert 0, "unknown tensor purpose %s" % (tens.purpose,)
-        return fmt
-
-    def visit_tens(tens, ps):
-        if tens not in formats_for_tensor:
-            fmt = init_tens(tens)
-        else:
-            fmt = formats_for_tensor[tens]
-
-        formats_for_tensor[tens] = fmt
-
-    for sg in nng.subgraphs:
-        for ps in sg.passes:
-            for tens in ps.outputs:
-                visit_tens(tens, ps)
-            for tens in ps.intermediates:
-                visit_tens(tens, ps)
-            for tens in ps.inputs:
-                visit_tens(tens, ps)
-
-    for tens, fmt in formats_for_tensor.items():
-        if len(tens.shape) > 4:
-            continue
-        tens.set_format(fmt, arch)
-        if fmt == TensorFormat.WeightsCompressed and tens.values is not None:
-            src_tens = tens.get_dma_src_tensor()
-            if src_tens is not None:
-                op = tens.find_npu_op()
-                if op is not None:
-                    weight_compressor.compress_weights(
-                        arch, nng, tens, op.type.npu_block_type, 16, 16, op.get_dilation_h_w()
-                    )
-                    # Alias compressed weights back into source tensor
-                    src_tens.copy_compressed_weight_info(tens)
-
-    if verbose_tensor_format:
-        nng.print_passes_with_tensors()
diff --git a/ethosu/vela/rewrite_graph.py b/ethosu/vela/rewrite_graph.py
index e71b228a..42acaf9b 100644
--- a/ethosu/vela/rewrite_graph.py
+++ b/ethosu/vela/rewrite_graph.py
@@ -82,14 +82,16 @@ def rewrite_graph_pre_order(nng, sg, arch, tensor_rewrite_list, op_rewrite_list,
     return sg
 
 
-def visit_graph_post_order(sg, arch, tensor_visit_list, op_visit_list):
-
+def visit_graph_post_order(start_tensors, arch, tensor_visit_list, op_visit_list):
+    # Depth-first graph traversal, starting from the given list of tensors
+    # (typically a subgraph's output_tensors).
+    # Visits ops and tensors in input to output order.
     op_visit_dict = dict()
     tens_visit_dict = dict()
 
     def visit_op(op):
         if op in op_visit_dict:
-            return op_visit_dict[op]
+            return
         op_visit_dict[op] = op
 
         for tens in op.inputs:
@@ -101,11 +103,9 @@ def visit_graph_post_order(sg, arch, tensor_visit_list, op_visit_list):
         for tens in op.outputs:
             visit_tens(tens)
 
-        return op
-
     def visit_tens(tens):
-        if tens in tens_visit_dict:
-            return tens_visit_dict[tens]
+        if tens is None or tens in tens_visit_dict:
+            return
 
         tens_visit_dict[tens] = tens
 
@@ -115,15 +115,9 @@ def visit_graph_post_order(sg, arch, tensor_visit_list, op_visit_list):
         for visit in tensor_visit_list:
             visit(tens, arch)
 
-        return tens
-
-    for tens in sg.output_tensors:
+    for tens in start_tensors:
         visit_tens(tens)
 
-    sg.refresh_after_modification()
-
-    return sg
-
 
 def verify_graph_health(nng):
 
-- 
cgit v1.2.1