From 17009399160defd4ab21d85249ff31804d732f4b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johan=20Alfv=C3=A9n?= Date: Tue, 30 Aug 2022 09:14:56 +0200 Subject: MLBEDSW-5029: Output diff for Mean op Fixed three test cases causing output diff compared to the reference kernel for the Mean operator. - If there is a possibility that the accumulator could saturate the Mean op must run CPU - Use correct rounding for the bias term - If a Reshape op is followed by a Mean op, push the Reshape op to the CPU since this cannot be handled by the NPU Signed-off-by: Johan Alfven Change-Id: I734465730372105821a5e2f73a6a125b9eb7d7f4 --- .../vela/test/test_tflite_supported_operators.py | 16 +++++++ ethosu/vela/tflite_graph_optimiser.py | 3 +- ethosu/vela/tflite_supported_operators.py | 55 +++++++++++++++++++--- 3 files changed, 67 insertions(+), 7 deletions(-) (limited to 'ethosu/vela') diff --git a/ethosu/vela/test/test_tflite_supported_operators.py b/ethosu/vela/test/test_tflite_supported_operators.py index 89c27997..cc8b3d2c 100644 --- a/ethosu/vela/test/test_tflite_supported_operators.py +++ b/ethosu/vela/test/test_tflite_supported_operators.py @@ -623,6 +623,22 @@ def test_mean_hw_product_int8(): op = create_mean([1, 16, 17, 16], [1, 1, 1, 16], [1, 2], DataType.int8, {"keep_dims": True}) assert not support.is_operator_supported(op) + # Create OP that will not saturate the accumulator + op = create_mean([1, 5, 14, 16], [1, 1, 1, 16], [1, 2], DataType.int8, {"keep_dims": True}) + op.ifm.quantization.scale_f32 = 2.0 + op.ifm.quantization.zero_point = 95 + op.ofm.quantization.scale_f32 = 1.0 + op.ofm.quantization.zero_point = 95 + assert support.is_operator_supported(op) + + # Create OP that can saturate the accumulator + op = create_mean([1, 6, 14, 16], [1, 1, 1, 16], [1, 2], DataType.int8, {"keep_dims": True}) + op.ifm.quantization.scale_f32 = 2.0 + op.ifm.quantization.zero_point = 95 + op.ofm.quantization.scale_f32 = 1.0 + op.ofm.quantization.zero_point = 95 + assert not 
support.is_operator_supported(op) + def test_mean_hw_product_avgpool(): op = create_mean([1, 200, 200, 16], [1, 16], [1, 2], DataType.uint8, {"keep_dims": False}) diff --git a/ethosu/vela/tflite_graph_optimiser.py b/ethosu/vela/tflite_graph_optimiser.py index 3646b01e..38e3f603 100644 --- a/ethosu/vela/tflite_graph_optimiser.py +++ b/ethosu/vela/tflite_graph_optimiser.py @@ -42,6 +42,7 @@ from .graph_optimiser_util import set_ifm_ofm_op_shapes from .graph_optimiser_util import set_tensor_equivalence from .numeric_util import clamp_sigmoid from .numeric_util import round_away_zero +from .numeric_util import round_up_to_int from .operation import create_activation_function from .operation import ExplicitScaling from .operation import NpuBlockType @@ -1365,7 +1366,7 @@ def convert_mean_to_depthwise_conv_or_avgpool(op, arch, nng): fiq = ifmq.clone() fiq.zero_point = 0 op.forced_input_quantization = fiq - bias_term = ofmq.zero_point - int(ifmq.zero_point * ifmq.scale_f32 / ofmq.scale_f32) + bias_term = ofmq.zero_point - round_up_to_int(ifmq.zero_point * ifmq.scale_f32 / ofmq.scale_f32) # If the bias term is outside uint8 range, we need an Add op to apply it. 
if bias_term < 0 or bias_term > 255: intermediate = op.ofm.clone(suffix="_intermediate", set_unique=True) diff --git a/ethosu/vela/tflite_supported_operators.py b/ethosu/vela/tflite_supported_operators.py index 1915d43b..f01a6690 100644 --- a/ethosu/vela/tflite_supported_operators.py +++ b/ethosu/vela/tflite_supported_operators.py @@ -304,6 +304,7 @@ class TFLiteSupportedOperators: # Reshape specific checks: self.specific_constraints[Op.Reshape].append(TFLiteSupportedOperators.constraint_reshape_shape_constant) + self.specific_constraints[Op.Reshape].append(TFLiteSupportedOperators.constraint_reshape_before_mean) # Concat specific checks: for op_type in (Op.Concat, Op.ConcatTFLite): @@ -795,10 +796,9 @@ class TFLiteSupportedOperators: max_prod = cls.mean_kernel_product return h * w <= max_prod, f"Product of height and width is {h * w}" - @classmethod - @docstring_format_args([mean_kernel_product_int8]) - def constraint_mean_height_width_product_int8(cls, op): - """Product of IFM height and width must be no greater than {} when: + @staticmethod + def constraint_mean_height_width_product_int8(op): + """Number of IFM height and width elements might cause accumulator saturation when: The IFM shape has 4 dimensions; and The axis indices specify reduction across 2 dimensions; and The axis indices correspond to the width and height dimensions of the IFM; and @@ -817,8 +817,43 @@ class TFLiteSupportedOperators: return True, "" h = shape[-3] w = shape[-2] - max_prod = cls.mean_kernel_product_int8 - return h * w <= max_prod, f"Product of height and width is {h * w}" + + ifmq, ofmq = op.ifm.quantization, op.ofm.quantization + + # Scale factor + real_scale = ifmq.scale_f32 / ofmq.scale_f32 + + # Min and max value + ifm_min_val = np.iinfo(np.int8).min - ifmq.zero_point + ifm_max_val = np.iinfo(np.int8).max - ifmq.zero_point + + # Accumulator limits + min_acc_limit = np.iinfo(np.int16).min + max_acc_limit = np.iinfo(np.int16).max + + # Theoretical max/min value that accumulator 
needs to store + min_acc_sum = h * w * ifm_min_val * real_scale + ofmq.zero_point + max_acc_sum = h * w * ifm_max_val * real_scale + ofmq.zero_point + + # Max product of height and width that will not saturate the accumulator + ifm_min_val = 1 if ifm_min_val == 0 else ifm_min_val + ifm_max_val = 1 if ifm_max_val == 0 else ifm_max_val + if max_acc_sum > abs(min_acc_sum): + max_hw = int((max_acc_limit - ofmq.zero_point) / real_scale / ifm_max_val) + else: + max_hw = int((min_acc_limit - ofmq.zero_point) / real_scale / ifm_min_val) + + extra = [] + + extra.append(f" Possible accumulator range is ({min_acc_sum} - {max_acc_sum})\n") + extra.append(f" Maximum accumulator range is ({min_acc_limit} - {max_acc_limit})\n") + extra.append( + f" Based on the IFM and OFM quantization the IFM height and width must be no greater than {max_hw}" + ) + + extra = "".join(extra) + + return (min_acc_sum >= min_acc_limit and max_acc_sum <= max_acc_limit, f"\n{extra}") @classmethod @docstring_format_args([filter_height_range[1], dilated_height_range[1]]) @@ -866,6 +901,14 @@ class TFLiteSupportedOperators: return valid, f"Op has non-const input(s): {extra}" + @staticmethod + def constraint_reshape_before_mean(op): + "Reshape on NPU not supported before MEAN operator" + for next_op in op.outputs[0].consumers(): + if next_op is not None and next_op.type == Op.Mean: + return False, "" + return True, "" + @staticmethod def constraint_concat_valid_dimensions_non_axis(op): """All Input dimensions must match OFM dimension in all axes except the one defined by the axis attribute""" -- cgit v1.2.1