From 0f98b361288c71fca327969346db32de098c797b Mon Sep 17 00:00:00 2001
From: Fredrik Svedberg
Date: Tue, 29 Sep 2020 10:00:39 +0200
Subject: [MLBEDSW-2802] Fix 5D tensor crash

Fixed crash in networks with 5D tensors.
Fixed crash for (int32) tensors without quantization.
Added validity checks for concatenation.
Moved unfusing of activation function from tflite_reader to
graph_optimiser.

Signed-off-by: Fredrik Svedberg
Change-Id: Ib9ba8891dc95ef5491e15d0feedef44331a26393
---
 ethosu/vela/graph_optimiser.py                   | 15 ++++++++++++++
 ethosu/vela/mark_tensors.py                      |  2 ++
 ethosu/vela/npu_serialisation.py                 |  2 +-
 ethosu/vela/register_command_stream_generator.py | 14 ++++++-------
 ethosu/vela/shared_buffer_allocation.py          |  5 +++--
 ethosu/vela/supported_operators.py               | 25 ++++++++++++++++++++++++
 ethosu/vela/tflite_reader.py                     | 17 ----------------
 7 files changed, 53 insertions(+), 27 deletions(-)

diff --git a/ethosu/vela/graph_optimiser.py b/ethosu/vela/graph_optimiser.py
index 2bd57ddd..81d5a188 100644
--- a/ethosu/vela/graph_optimiser.py
+++ b/ethosu/vela/graph_optimiser.py
@@ -433,6 +433,20 @@ def fixup_pack_input(op, arch):
     return op
 
 
+def unfuse_activation_function(op, arch):
+    unfuse_ops = ("ConcatTFLite",)
+    if op.type in unfuse_ops and op.run_on_npu and op.attrs.get("fused_activation_function", None) is not None:
+        act = op.attrs["fused_activation_function"]
+        del op.attrs["fused_activation_function"]
+        act_op = Operation(act, op.name + act)
+        out_tens = op.outputs[0]
+        intermediate_tens = out_tens.clone("_act_intermediate")
+        act_op.set_output_tensor(out_tens)
+        act_op.add_input_tensor(intermediate_tens)
+        op.set_output_tensor(intermediate_tens)
+
+    return op
+
 def fixup_unpack_output(tens, arch):
     op = tens.ops[0]
     if op.type in set(("Unpack", "StridedSlice")):
@@ -1087,6 +1101,7 @@ def optimise_graph_a(nng, arch, verbose_graph=False):
         fixup_fully_connected_input,
         convert_batched_fc_to_conv,
         fixup_pack_input,
+        unfuse_activation_function,
         fixup_conv2d_backprop,
         fixup_relus_with_differing_ifm_ofm_scaling,
         fixup_act_reorder,
diff --git a/ethosu/vela/mark_tensors.py b/ethosu/vela/mark_tensors.py
index 208b5b8c..a971ef23 100644
--- a/ethosu/vela/mark_tensors.py
+++ b/ethosu/vela/mark_tensors.py
@@ -367,6 +367,8 @@ def mark_tensor_format(nng, arch, verbose_tensor_format=False):
                 visit_tens(tens, ps)
 
     for tens, fmt in formats_for_tensor.items():
+        if len(tens.shape) > 4:
+            continue
         tens.set_format(fmt, arch)
         if fmt == TensorFormat.WeightsCompressed and tens.values is not None:
             src_tens = tens.get_dma_src_tensor()
diff --git a/ethosu/vela/npu_serialisation.py b/ethosu/vela/npu_serialisation.py
index 6277a6dc..430db585 100644
--- a/ethosu/vela/npu_serialisation.py
+++ b/ethosu/vela/npu_serialisation.py
@@ -51,7 +51,7 @@ def copy_compressed_values_to_memory_tensor(memory_tensor, src_tensor):
 
 def copy_ifm_values_to_memory_tensor(memory_tensor, src_tensor):
     start_addr = src_tensor.address
-    values = src_tensor.quant_values.flatten()
+    values = src_tensor.quant_values.flatten() if src_tensor.quant_values is not None else src_tensor.values.flatten()
     if src_tensor.dtype.size_in_bytes() > 1:
         values = np.frombuffer(values.tobytes(), dtype=np.uint8)
     end_addr = start_addr + values.size
diff --git a/ethosu/vela/register_command_stream_generator.py b/ethosu/vela/register_command_stream_generator.py
index acfd25a2..da9be668 100644
--- a/ethosu/vela/register_command_stream_generator.py
+++ b/ethosu/vela/register_command_stream_generator.py
@@ -483,9 +483,9 @@ def generate_register_command_stream(nng, sg, arch, verbose=False):
                 # Calculate scales needed for arithmetic elementwise operators
                 if primary_op.type in set(("AddAct", "MulAct", "SubAct",)):
-                    input_scale = cmd.ifm_tensor.quantization.scale_f32
-                    input2_scale = cmd.ifm2_tensor.quantization.scale_f32
-                    output_scale = ofm_quant.scale_f32
+                    input_scale = cmd.ifm_tensor.quantization.scale_f32 if cmd.ifm_tensor.quantization else None
+                    input2_scale = cmd.ifm2_tensor.quantization.scale_f32 if cmd.ifm2_tensor.quantization else None
+                    output_scale = ofm_quant.scale_f32 if ofm_quant else None
                     use_global_scale = True
 
                     if output_scale is not None and faf in ("Sigmoid", "Tanh"):
@@ -803,10 +803,10 @@ def generate_register_command_stream(nng, sg, arch, verbose=False):
                     scale_region = base_ptr_idx_map[cmd.scale_tensor.mem_type]
                     emit.cmd0_with_param(cmd0.NPU_SET_SCALE_REGION, scale_region)
 
-            ofm_quant_qmin = ofm_quant.quant_min
-            ofm_quant_qmax = ofm_quant.quant_max
-            ifm_min = cmd.ifm_tensor.quantization.min
-            ifm_max = cmd.ifm_tensor.quantization.max
+            ofm_quant_qmin = ofm_quant.quant_min if ofm_quant else np.iinfo(np.int16).min
+            ofm_quant_qmax = ofm_quant.quant_max if ofm_quant else np.iinfo(np.int16).max
+            ifm_min = cmd.ifm_tensor.quantization.min if cmd.ifm_tensor.quantization else np.iinfo(np.int16).min
+            ifm_max = cmd.ifm_tensor.quantization.max if cmd.ifm_tensor.quantization else np.iinfo(np.int16).max
 
             # Emit commands for any fused activation function
             if faf is None:
diff --git a/ethosu/vela/shared_buffer_allocation.py b/ethosu/vela/shared_buffer_allocation.py
index 63e2268d..7657dffa 100644
--- a/ethosu/vela/shared_buffer_allocation.py
+++ b/ethosu/vela/shared_buffer_allocation.py
@@ -38,7 +38,8 @@ class SharedBufferAllocation:
         ifm_tensor, ifm2_tensor, weight_tensor, ofm_tensor = ps.get_primary_op_ifm_ifm2_weights_ofm()
 
         tensors = [t for t in (ifm_tensor, ifm2_tensor, ofm_tensor) if t is not None]
-        has_scale = None not in (t.quantization.scale_f32 for t in tensors)
+        scales = [t.quantization.scale_f32 for t in tensors if t.quantization is not None]
+        has_scale = len(tensors) == len(scales) and None not in scales
 
         strides = (1, 1, 1, 1)
         dilation = (1, 1, 1, 1)
@@ -192,7 +193,7 @@ def find_block_configs_suitable_for_pass_and_shared_buffer(arch, ps):
 
     # Constrain the search space if the OFM is smaller than the max block size
     #   - Add other block search constraints here if required
-    if len(alloc.ofm_tensor.shape) == 2:
+    if len(alloc.ofm_tensor.shape) <= 2:
         max_block_height = max_block_width = alloc.ofm_tensor.shape[0]
     else:
         max_block_width = alloc.ofm_tensor.shape[-2]
diff --git a/ethosu/vela/supported_operators.py b/ethosu/vela/supported_operators.py
index 0a1af829..eec1b900 100644
--- a/ethosu/vela/supported_operators.py
+++ b/ethosu/vela/supported_operators.py
@@ -152,6 +152,9 @@ class SupportedOperators:
                     "placing on CPU",
                 )
                 return False
+            if len(t.shape) > 4:
+                print("Warning:", op.type, "has input(s) of unsupported shape", t.shape, "placing on CPU")
+                return False
         for t in op.outputs:
             if not t.has_fully_defined_shape():
                 print("Warning:", op.type, "has output(s) of undefined shape, placing on CPU")
@@ -165,6 +168,9 @@ class SupportedOperators:
                     "placing on CPU",
                 )
                 return False
+            if len(t.shape) > 4:
+                print("Warning:", op.type, "has output(s) of unsupported shape", t.shape, "placing on CPU")
+                return False
 
         # check data type
         tensors = [t for t in op.get_ifm_ifm2_weights_ofm() if t is not None]
@@ -447,6 +453,25 @@ class SupportedOperators:
         if num_to_be_inferred > 1:
             print("Warning:", op.type, "has more than one size to be inferred, which is illegal, placing on CPU")
             return False
+        if op.type.find("Concat") != -1:
+            axis = op.attrs.get("axis", None)
+            if axis is None:
+                print("Warning:", op.type, "invalid or missing axis, placing on CPU")
+                return False
+            if axis < 0:
+                axis += len(op.inputs[0].shape)
+            if not 0 < axis < len(op.inputs[0].shape):
+                print("Warning:", op.type, "invalid axis", axis, ", placing on CPU")
+                return False
+            ofm = op.outputs[0]
+            ofm_dims = len(ofm.shape)
+            for ifm in op.inputs:
+                if len(ifm.shape) != ofm_dims:
+                    return False
+                for i in range(ofm_dims):
+                    if i != axis and ifm.shape[i] != ofm.shape[i]:
+                        print("Warning:", op.type, "invalid ifm:", ifm.name, ifm.shape, "mismatch in dimension", i, ", placing on CPU")
+                        return False
 
         return True
 
diff --git a/ethosu/vela/tflite_reader.py b/ethosu/vela/tflite_reader.py
index 7458b907..77cc7963 100644
--- a/ethosu/vela/tflite_reader.py
+++ b/ethosu/vela/tflite_reader.py
@@ -149,8 +149,6 @@ class TFLiteSubgraph:
         for out in op.outputs:
             out.ops = [op]
 
-        activation_function_to_split_out = None
-
         if op_type.startswith("DepthwiseConv2d") or op_type.startswith("Conv2D"):
             if inputs[1].values is not None:
                 inputs[1] = clone_and_reshape_tensor(inputs[1], (1, 2, 3, 0))
@@ -192,21 +190,6 @@ class TFLiteSubgraph:
         if "depth_multiplier" in op.attrs:
             op.attrs["channel_multiplier"] = op.attrs["depth_multiplier"]
 
-        if "fused_activation_function" in op.attrs:
-            if op_type in set(("ConcatTFLite",)):
-                act = op.attrs["fused_activation_function"]
-                del op.attrs["fused_activation_function"]
-                if act is not None:
-                    activation_function_to_split_out = act
-
-        if activation_function_to_split_out is not None:
-            act_op = Operation(activation_function_to_split_out, name + activation_function_to_split_out)
-            out_tens = op.outputs[0]
-            intermediate_tens = out_tens.clone("_act_intermediate")
-            act_op.set_output_tensor(out_tens)
-            intermediate_tens.ops = [op]
-            op.outputs[0] = intermediate_tens
-            act_op.inputs = [intermediate_tens]
 
     @staticmethod
     def len1_array_to_scalar(arr):
-- 
cgit v1.2.1
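
Appendix (not part of the patch): the concatenation validity check added to supported_operators.py above, read as a standalone sketch. SimpleNamespace objects are hypothetical stand-ins for vela's Operation and Tensor classes; only the attributes the check actually touches (type, attrs, inputs, outputs, name, shape) are assumed, and the CPU-fallback warnings are elided into comments.

from types import SimpleNamespace


def concat_is_supported(op):
    if op.type.find("Concat") == -1:
        return True  # not a concatenation; nothing to check
    axis = op.attrs.get("axis", None)
    if axis is None:
        return False  # the axis attribute is mandatory
    if axis < 0:
        axis += len(op.inputs[0].shape)  # normalise a negative axis against the first input's rank
    # the axis must index a non-batch dimension of the first input
    if not 0 < axis < len(op.inputs[0].shape):
        return False
    ofm = op.outputs[0]
    ofm_dims = len(ofm.shape)
    for ifm in op.inputs:
        if len(ifm.shape) != ofm_dims:
            return False  # input rank must match output rank
        for i in range(ofm_dims):
            # every dimension except the concat axis must match the output
            if i != axis and ifm.shape[i] != ofm.shape[i]:
                return False
    return True


tens = lambda name, shape: SimpleNamespace(name=name, shape=shape)
good = SimpleNamespace(
    type="ConcatTFLite",
    attrs={"axis": 3},
    inputs=[tens("a", [1, 8, 8, 3]), tens("b", [1, 8, 8, 5])],
    outputs=[tens("out", [1, 8, 8, 8])],
)
assert concat_is_supported(good)
bad = SimpleNamespace(
    type="ConcatTFLite",
    attrs={"axis": 0},  # concatenation along the batch dimension is rejected
    inputs=[tens("a", [1, 8, 8, 3])],
    outputs=[tens("out", [1, 8, 8, 3])],
)
assert not concat_is_supported(bad)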
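
The int32 fix applies one guard pattern throughout the patch: a tensor without quantization carries None in its quantization field, so every scale_f32/min/max read is made conditional, falling back to None for scales and to int16 limits for clamping bounds. A minimal sketch of that pattern follows, with SimpleNamespace again standing in for vela's Tensor and QuantizationParameters classes.

from types import SimpleNamespace

import numpy as np


def scale_or_none(tensor):
    # read the scale only when a quantization record exists
    return tensor.quantization.scale_f32 if tensor.quantization else None


def clamp_bounds(quant):
    # fall back to the int16 range when there is no quantization,
    # mirroring the register command stream generator above
    qmin = quant.quant_min if quant else np.iinfo(np.int16).min
    qmax = quant.quant_max if quant else np.iinfo(np.int16).max
    return qmin, qmax


quantised = SimpleNamespace(quantization=SimpleNamespace(scale_f32=0.05, quant_min=-128, quant_max=127))
unquantised = SimpleNamespace(quantization=None)  # e.g. an int32 tensor
assert scale_or_none(quantised) == 0.05
assert scale_or_none(unquantised) is None
assert clamp_bounds(unquantised.quantization) == (-32768, 32767)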