From 58520b981013214e458b5a7ff1983d919d6d2363 Mon Sep 17 00:00:00 2001
From: Louis Verhaard
Date: Mon, 24 Aug 2020 16:45:38 +0200
Subject: MLBEDSW-2688: use LeakyRelu for int16

For int16, using LeakyRelu (with bug fix) gives exactly the same results
as Mul+Max if input/output scales are the same.

Signed-off-by: Louis Verhaard
Change-Id: I4f4db464d77b0aaf0d25ddfca534f91d08db548d
---
 ethosu/vela/graph_optimiser.py                    | 36 ++++++++----------------
 ethosu/vela/register_command_stream_generator.py  |  2 +-
 2 files changed, 13 insertions(+), 25 deletions(-)

diff --git a/ethosu/vela/graph_optimiser.py b/ethosu/vela/graph_optimiser.py
index b9aafcac..46d26c80 100644
--- a/ethosu/vela/graph_optimiser.py
+++ b/ethosu/vela/graph_optimiser.py
@@ -734,8 +734,9 @@ def convert_lrelu_to_mul_max(op, arch):
 
 
 def convert_lrelu_to_lut(op, arch):
-    ifm, _, _, ofm = op.get_ifm_weights_biases_ofm()
     # Rewrite LeakyRelu by Add with scalar 0 + LUT activation
+    ifm, _, _, ofm = op.get_ifm_weights_biases_ofm()
+    assert ifm.dtype.size_in_bytes() == 1
     op.type = "AddAct"
     op.name = op.name + "_add"
     op.attrs.update({"npu_block_type": NpuBlockType.ElementWise})
@@ -750,26 +751,9 @@ def convert_lrelu_to_lut(op, arch):
     alpha = op.attrs["alpha"]
     zp = ofm.quantization.zero_point
     # Generate the LUT
-    if ifm.dtype.size_in_bytes() == 1:
-        dtype = DataType.int8
-        ix = range(256) if ifm.dtype == DataType.uint8 else range(-128, 128)
-        values = [int(x) if x >= zp else int(round(zp - alpha * (zp - x))) for x in ix]
-    else:
-        # int16
-        dtype = DataType.int32
-        values = []
-        for ix in range(512):
-            x = (ix - 256) * 128
-            if x >= zp:
-                base = x
-                slope = 128
-            else:
-                base = int(round(zp - alpha * (zp - x)))
-                next_base = int(round(zp - alpha * (zp - (x + 127))))
-                slope = int(round(128 * (next_base - base) / 127))
-            value = ((slope << 16) & 0xFFFF0000) + (base & 0xFFFF)
-            values.append(value)
-    lut_tensor = lut.create_lut_tensor(op.name + "_lut", values, dtype)
+    ix = range(256) if ifm.dtype == DataType.uint8 else range(-128, 128)
+    values = [int(x) if x >= zp else int(round(zp - alpha * (zp - x))) for x in ix]
+    lut_tensor = lut.create_lut_tensor(op.name + "_lut", values, DataType.int8)
     op.set_activation_lut(lut_tensor)
     return op
 
@@ -779,9 +763,13 @@ def convert_lrelu(op, arch):
     if op.type != "LeakyRelu":
         return op
     ifm, _, _, ofm = op.get_ifm_weights_biases_ofm()
-    use_lut = (ifm.is_scaling_equal(ofm)) and (ifm.dtype == ofm.dtype) and ifm.dtype in (DataType.uint8, DataType.int8)
-    if use_lut:
-        return convert_lrelu_to_lut(op, arch)
+    if ifm.is_scaling_equal(ofm) and ifm.dtype == ofm.dtype:
+        if ifm.dtype in (DataType.uint8, DataType.int8):
+            # use LUT
+            return convert_lrelu_to_lut(op, arch)
+        elif ifm.dtype == DataType.int16:
+            # use LeakyRelu unmodified
+            return op
     return convert_lrelu_to_mul_max(op, arch)
 
 
diff --git a/ethosu/vela/register_command_stream_generator.py b/ethosu/vela/register_command_stream_generator.py
index 5a14cb4f..8d9f9185 100644
--- a/ethosu/vela/register_command_stream_generator.py
+++ b/ethosu/vela/register_command_stream_generator.py
@@ -533,7 +533,7 @@ def generate_register_command_stream(nng, sg, arch, verbose=False):
                     use_global_scale = True
 
                     if primary_op.type == "LeakyRelu":
-                        output_scale *= primary_op.attrs["alpha"]
+                        output_scale = primary_op.attrs["alpha"]
                     ofm_scale, shift = scaling.quantise_scale(output_scale)
                     emit.cmd1_with_offset(cmd1.NPU_SET_OFM_SCALE, ofm_scale, shift)
 
-- 
cgit v1.2.1
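
As a sanity check on the commit message's claim that LeakyRelu and Mul+Max give the same
results when input and output share the same quantisation, here is a minimal standalone
Python sketch (not part of the patch). It reuses the per-entry formula from
convert_lrelu_to_lut and compares it against an idealised Mul+Max computation; the zero
point and alpha values are made up, and the Mul path uses the same Python rounding as the
LUT formula, so this illustrates the underlying algebra rather than the exact hardware
requantisation.

# Minimal sketch, not part of the patch: checks that the LUT formula from
# convert_lrelu_to_lut and an idealised Mul+Max lowering produce identical
# quantised outputs when input and output share scale and zero point.
# The zero point and alpha below are hypothetical example values.


def lrelu_lut_value(x, zp, alpha):
    # Per-entry formula used for the int8/uint8 LUT in the patch.
    return int(x) if x >= zp else int(round(zp - alpha * (zp - x)))


def lrelu_mul_max_value(x, zp, alpha):
    # Idealised Mul+Max: scale x by alpha around the zero point (the Mul),
    # then take the elementwise maximum with the unscaled value (the Max).
    scaled = int(round(zp - alpha * (zp - x)))
    return max(int(x), scaled)


if __name__ == "__main__":
    alpha, zp = 0.1, 3  # hypothetical quantisation parameters
    for x in range(-128, 128):  # full int8 input range
        assert lrelu_lut_value(x, zp, alpha) == lrelu_mul_max_value(x, zp, alpha)
    print("LUT and Mul+Max agree for all int8 values")

For 0 <= alpha <= 1 the scaled value never exceeds x above the zero point, so the Max
leaves positive values untouched and the two lowerings coincide. For int16 the patch
instead keeps the native LeakyRelu and relies on the register_command_stream_generator.py
fix, which sets output_scale to alpha rather than multiplying alpha into it.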