From f03bad31c194d1a24ce808223f1b00310a7325e2 Mon Sep 17 00:00:00 2001
From: Louis Verhaard
Date: Fri, 25 Sep 2020 08:30:44 +0200
Subject: MLBEDSW-2031: LUT support tanh/sigmoid

Uses LUT for int8/uint8 based tanh/sigmoid.

Change-Id: Ib6ac5a5c958ab9a17e47f620b22c3e22d8d60321
Signed-off-by: Louis Verhaard
---
 ethosu/vela/fp_math.py         |  1 +
 ethosu/vela/graph_optimiser.py | 62 +++++++++++++++++++++++++++++++++++-------
 ethosu/vela/numeric_util.py    |  6 +++-
 ethosu/vela/operation.py       |  2 +-
 4 files changed, 59 insertions(+), 12 deletions(-)

(limited to 'ethosu/vela')

diff --git a/ethosu/vela/fp_math.py b/ethosu/vela/fp_math.py
index 2515b771..66375611 100644
--- a/ethosu/vela/fp_math.py
+++ b/ethosu/vela/fp_math.py
@@ -21,6 +21,7 @@
 # point implementation.
 import numpy as np

+
 # Convert floating point to fixed point, default Q5.26
 def from_float(x, integer_bits=5):
     i32info = np.iinfo(np.int32)
diff --git a/ethosu/vela/graph_optimiser.py b/ethosu/vela/graph_optimiser.py
index 68473307..a8f68ae1 100644
--- a/ethosu/vela/graph_optimiser.py
+++ b/ethosu/vela/graph_optimiser.py
@@ -28,6 +28,8 @@ from .data_type import DataType
 from .errors import UnsupportedFeatureError
 from .ethos_u55_regs.ethos_u55_regs import resampling_mode
 from .numeric_util import full_shape
+from .numeric_util import round_away_zero
+from .numeric_util import sigmoid
 from .operation import create_avgpool_nop
 from .operation import NpuBlockType
 from .operation import Operation
@@ -863,9 +865,9 @@ def convert_lrelu_to_mul_max(op, arch):
     return op


-def convert_lrelu_to_lut(op, arch):
-    # Rewrite LeakyRelu by Add with scalar 0 + LUT activation
-    ifm, _, _, ofm = op.get_ifm_weights_biases_ofm()
+def convert_to_lut(op, lut_values):
+    # Rewrite the operation by Add with scalar 0 + LUT activation
+    ifm = op.inputs[0]
     assert ifm.dtype.size_in_bytes() == 1
     op.type = "AddAct"
     op.name = op.name + "_add"
@@ -878,6 +880,41 @@ def convert_lrelu_to_lut(op, arch):
     quantization.zero_point = 0
     tens = create_const_tensor(op.inputs[0].name + "_add", [], ifm.dtype, [0], np.uint8, quantization=quantization)
     op.add_input_tensor(tens)
+    # The LUT must be applied without any preceding rescaling (the LUT itself performs the rescale),
+    # so even if the OFM has a different scale than the IFM, the generated OFM scale instructions
+    # should be the same as the IFM
+    op.attrs["forced_output_quantization"] = ifm.quantization
+    lut_tensor = lut.create_lut_tensor(op.name + "_lut", lut_values, DataType.int8)
+    op.set_activation_lut(lut_tensor)
+    return op
+
+
+def convert_to_lut8(op, fn):
+    # Converts op to a no-op + int8/uint8 LUT which is generated with the given function.
+    # fn is a function(real) -> real
+    ifm, _, _, ofm = op.get_ifm_weights_biases_ofm()
+    if ifm.dtype not in (DataType.uint8, DataType.int8) or ifm.dtype != ofm.dtype:
+        return op
+    # Generate the LUT
+    ifm_scale = np.double(ifm.quantization.scale_f32)
+    ofm_scale = np.double(ofm.quantization.scale_f32)
+    zp_in = ifm.quantization.zero_point
+    zp_out = ofm.quantization.zero_point
+    values = []
+    ix = range(256) if ifm.dtype == DataType.uint8 else range(-128, 128)
+    quantized_min = min(ix)
+    quantized_max = max(ix)
+    for x in ix:
+        x_real = ifm_scale * (x - zp_in)
+        y_real = fn(x_real)
+        lut_result = round_away_zero(zp_out + y_real / ofm_scale)
+        lut_result = min(quantized_max, max(quantized_min, lut_result))
+        values.append(lut_result)
+    return convert_to_lut(op, values)
+
+
+def convert_lrelu_to_lut(op, arch):
+    ifm, _, _, ofm = op.get_ifm_weights_biases_ofm()
     # Generate the LUT
     alpha = op.attrs["alpha"]
     ifm_scale = np.double(ifm.quantization.scale_f32)
@@ -903,13 +940,7 @@ def convert_lrelu_to_lut(op, arch):
         lut_result = zp_out + fp_math.multiply_by_quantized_multiplier(x - zp_in, identity_scale, identity_shift)
         lut_result = min(quantized_max, max(quantized_min, lut_result))
         values.append(lut_result)
-    # The LUT must be applied without any preceding rescaling (the LUT itself performs the rescale),
-    # so even if the OFM has a different scale than the IFM, the generated OFM scale instructions
-    # should be the same as the IFM
-    op.attrs["forced_output_quantization"] = ifm.quantization
-    lut_tensor = lut.create_lut_tensor(op.name + "_lut", values, DataType.int8)
-    op.set_activation_lut(lut_tensor)
-    return op
+    return convert_to_lut(op, values)


 def convert_lrelu(op, arch):
@@ -926,6 +957,15 @@ def convert_lrelu(op, arch):
     return convert_lrelu_to_mul_max(op, arch)


+def convert_tanh_sigmoid_to_lut(op, arch):
+    # Converts int8/uint8 Sigmoid and Tanh to a LUT based solution
+    if op.type == "Sigmoid":
+        return convert_to_lut8(op, sigmoid)
+    elif op.type == "Tanh":
+        return convert_to_lut8(op, math.tanh)
+    return op
+
+
 def remove_unwanted_reshapes(op, arch):
     # Try to remove reshapes enclosing ElementWise operator with only one non-constant input
     if not op.run_on_npu or op.attrs["npu_block_type"] != NpuBlockType.ElementWise:
@@ -971,6 +1011,7 @@ def fuse_activation_function_with_prev(op, arch):
     # Note: the below checks on prev_op require that a first optimize pass on the full graph has been performed
     fuse = (
         prev_op.run_on_npu
+        and "npu_block_type" in prev_op.attrs
         and prev_op.attrs["npu_block_type"] != NpuBlockType.Default
         and len(ifm.ops) == 1
         and len(prev_op.outputs[0].consumers()) == 1
@@ -1058,6 +1099,7 @@ def optimise_graph_a(nng, arch, verbose_graph=False):
         convert_mul_max_to_abs_or_lrelu,
         remove_unwanted_reshapes,
         convert_lrelu,
+        convert_tanh_sigmoid_to_lut,
     ]

     for idx, sg in enumerate(nng.subgraphs):
diff --git a/ethosu/vela/numeric_util.py b/ethosu/vela/numeric_util.py
index 4ebef8e5..3d26444a 100644
--- a/ethosu/vela/numeric_util.py
+++ b/ethosu/vela/numeric_util.py
@@ -77,13 +77,17 @@ def clamp_tanh(x):
     return y


+def sigmoid(x):
+    return 1 / (1 + math.exp(-x))
+
+
 def clamp_sigmoid(x):
     if x <= -8:
         y = 0.0
     elif x >= 8:
         y = 1.0
     else:
-        y = 1 / (1 + math.exp(-x))
+        y = sigmoid(x)
     return y


diff --git a/ethosu/vela/operation.py b/ethosu/vela/operation.py
index 252f03b7..14818870 100644
--- a/ethosu/vela/operation.py
+++ b/ethosu/vela/operation.py
@@ -152,7 +152,7 @@ input and output tensors, as well as an attribute dictionary."""
             weight_idx = 1
             ofm_idx = 0

-        elif self.type in ("Squeeze", "Reshape", "QuantizedReshape", "ExpandDims"):
+        elif self.type in ("Squeeze", "Reshape", "QuantizedReshape", "ExpandDims", "Sigmoid", "Tanh"):
             ifm_idx = 0
             ofm_idx = 0

-- 
cgit v1.2.1