From b81e1bb92be8ea5b29625cf2f361e9160286b16a Mon Sep 17 00:00:00 2001
From: Fredrik Svedberg
Date: Tue, 11 Oct 2022 21:50:51 +0200
Subject: MLBEDSW-6971 Fix output diff when cascading elementwise operators

Fixed output diff when cascading elementwise operators with reversed
operand order.

Signed-off-by: Fredrik Svedberg
Change-Id: Iac2e28cfb53037b929459af213f4fa7715b3e6de
---
 ethosu/vela/cascade_builder.py                     | 5 +++--
 ethosu/vela/high_level_command_stream.py           | 2 ++
 ethosu/vela/high_level_command_stream_generator.py | 3 +++
 ethosu/vela/high_level_command_to_npu_op.py        | 5 ++++-
 ethosu/vela/scheduler.py                           | 2 ++
 ethosu/vela/softmax.py                             | 2 +-
 6 files changed, 15 insertions(+), 4 deletions(-)
(limited to 'ethosu')

diff --git a/ethosu/vela/cascade_builder.py b/ethosu/vela/cascade_builder.py
index b4a4f876..ebe2f133 100644
--- a/ethosu/vela/cascade_builder.py
+++ b/ethosu/vela/cascade_builder.py
@@ -175,11 +175,12 @@ class CascadeBuilder:
         ifm = sched_op.parent_op.ifm
         ifm2 = sched_op.parent_op.ifm2
 
-        # Cascading elementwise operations with reverse operand order is not handled
         if sched_op.parent_op.type.is_binary_elementwise_op() and ifm and ifm2:
             # We cannot rule out cascadability if at least one IFM is constant
+            ifm_const = ifm.ops != [] and ifm.ops[0].type == Op.Const
             ifm2_const = ifm2.ops != [] and ifm2.ops[0].type == Op.Const
-            return ifm_ifm2_correct_order(ifm.shape, ifm2.shape) and ifm2_const
+            correct_order = ifm_ifm2_correct_order(ifm.shape, ifm2.shape)
+            return (ifm_const and (ifm.shape == ifm2.shape or not correct_order)) or (ifm2_const and correct_order)
         else:
             # Either one IFM is not variable or it is not a binary elementwise op - we cannot rule out cascadability
             return True

diff --git a/ethosu/vela/high_level_command_stream.py b/ethosu/vela/high_level_command_stream.py
index 4a41edd0..bfe5bce1 100644
--- a/ethosu/vela/high_level_command_stream.py
+++ b/ethosu/vela/high_level_command_stream.py
@@ -210,6 +210,7 @@ class NpuStripe(Command):
         ifm2_box=None,
         pad_top=0,
         pad_bottom=0,
+        reversed_operands=False,
     ):
         self.ps = ps
         self.block_config = block_config
@@ -226,6 +227,7 @@ class NpuStripe(Command):
         self.weight_box = weight_box
         self.pad_top = pad_top
         self.pad_bottom = pad_bottom
+        self.reversed_operands = reversed_operands
         for i in range(len(self.ofm_box.end_coord)):
             assert self.ofm_box.end_coord[i] <= ps.ofm_shapes[0][i]

diff --git a/ethosu/vela/high_level_command_stream_generator.py b/ethosu/vela/high_level_command_stream_generator.py
index 7e13b62f..e71fb6ea 100644
--- a/ethosu/vela/high_level_command_stream_generator.py
+++ b/ethosu/vela/high_level_command_stream_generator.py
@@ -74,6 +74,8 @@ def generate_high_level_commands_for_sched_op(sched_op, schedule):
         _,
         _,
     ) = parent_op.get_ifm_ifm2_weights_biases_ofm()
+    if sched_op.reversed_operands:
+        ifm2_tensor, ifm_tensor = ifm_tensor, ifm2_tensor
     ifm = sched_op.ifm
     ifm2 = sched_op.ifm2
     ofm_shape = sched_op.ofm.shape
@@ -236,4 +238,5 @@ def generate_high_level_commands_for_sched_op(sched_op, schedule):
                 ifm2_box=ifm2_box,
                 pad_top=pad_top,
                 pad_bottom=pad_bottom,
+                reversed_operands=sched_op.reversed_operands,
             )

diff --git a/ethosu/vela/high_level_command_to_npu_op.py b/ethosu/vela/high_level_command_to_npu_op.py
index 974d980c..202917bd 100644
--- a/ethosu/vela/high_level_command_to_npu_op.py
+++ b/ethosu/vela/high_level_command_to_npu_op.py
@@ -555,7 +555,10 @@ def create_npu_elementwise_op(cmd: NpuStripe, arch: ArchitectureFeatures) -> Npu
     if elemwise_op not in UNARY_ELEMWISE_OPS:
         ifm_shape = [] if cmd.ifm_tensor.shape == [] else ps.ifm_shapes[0].as_list()
         ifm2_shape = [] if cmd.ifm2_tensor.shape == [] else ps.ifm_shapes[1].as_list()
-        if not ifm_ifm2_correct_order(ifm_shape, ifm2_shape):
+        if cmd.reversed_operands:
+            assert ifm_ifm2_correct_order(ifm_shape, ifm2_shape)
+            npu_op.reversed_operands = True
+        elif not ifm_ifm2_correct_order(ifm_shape, ifm2_shape):
             # The scalar/broadcasted feature map has to be the ifm2 tensor so switch the ifms
             cmd.ifm_tensor, cmd.ifm2_tensor = cmd.ifm2_tensor, cmd.ifm_tensor
             cmd.ifm_box, cmd.ifm2_box = cmd.ifm2_box, cmd.ifm_box

diff --git a/ethosu/vela/scheduler.py b/ethosu/vela/scheduler.py
index 9dca63a8..208b121e 100644
--- a/ethosu/vela/scheduler.py
+++ b/ethosu/vela/scheduler.py
@@ -182,6 +182,7 @@ class SchedulerOperation:
         self.activation = ps.primary_op.activation
         self.kernel = ps.primary_op.kernel
         self.resampling_mode = ps.primary_op.ifm_resampling_mode
+        self.reversed_operands = False
         self.uses_scalar = ps.primary_op.ifm2 is not None and (
             ps.primary_op.ifm.shape == [] or ps.primary_op.ifm2.shape == []
         )
@@ -239,6 +240,7 @@ class SchedulerOperation:
             # The non-broadcasted IFM should be the primary input
             or (ifm1.shape != ofm.shape and ifm2.shape == ofm.shape)
         ):
+            self.reversed_operands = True
             self.ifm, self.ifm2 = self.ifm2, self.ifm
             self.parent_ps.ifm_shapes = self.parent_ps.ifm_shapes[::-1]

diff --git a/ethosu/vela/softmax.py b/ethosu/vela/softmax.py
index 1655427e..a0fd19ca 100644
--- a/ethosu/vela/softmax.py
+++ b/ethosu/vela/softmax.py
@@ -353,8 +353,8 @@ class SoftMax:
         )
         add_op = create_add(
             f"{self.op.name}_add{pass_number}",
-            f0_one_const,
             shifted_sum_minus_one,
+            f0_one_const,
             one_scale_quant,
             activation,
         )
--
cgit v1.2.1
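
For readers following the change, the sketch below restates the new cascadability rule from cascade_builder.py as a standalone Python function. It is illustrative only and not part of the patch: ifm_ifm2_correct_order() is stubbed with an assumed element-count comparison, and the shapes and constant flags in the demo are made up.

# Illustrative sketch only (not part of the patch above): the cascadability rule
# encoded by the new return statement in CascadeBuilder, with an assumed
# stand-in for ifm_ifm2_correct_order() based on element counts.
from math import prod
from typing import List


def ifm_ifm2_correct_order(ifm_shape: List[int], ifm2_shape: List[int]) -> bool:
    # Assumption: operand order is "correct" when the first input is at least as
    # large as the second, i.e. any scalar/broadcast input sits in ifm2.
    return prod(ifm_shape) >= prod(ifm2_shape)


def elementwise_cascadable(
    ifm_shape: List[int], ifm2_shape: List[int], ifm_is_const: bool, ifm2_is_const: bool
) -> bool:
    # Mirrors the patched return expression in cascade_builder.py: cascading is
    # not ruled out when either the second input is constant and the operand
    # order is already correct, or the first input is constant and the operands
    # have equal shapes or will be reversed.
    correct_order = ifm_ifm2_correct_order(ifm_shape, ifm2_shape)
    return (ifm_is_const and (ifm_shape == ifm2_shape or not correct_order)) or (
        ifm2_is_const and correct_order
    )


if __name__ == "__main__":
    fm = [1, 8, 8, 16]  # made-up feature map shape
    print(elementwise_cascadable(fm, [], False, True))    # True: constant scalar already in ifm2
    print(elementwise_cascadable([], fm, True, False))    # True: reversed operand order, now handled
    print(elementwise_cascadable(fm, fm, False, False))   # False: no constant input

The rest of the patch threads the decision through: SchedulerOperation records reversed_operands when it swaps its IFMs, the high-level command stream carries the flag on each NpuStripe, and create_npu_elementwise_op sets it on the NPU operation instead of swapping the tensors again.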