From 4a434cba156cdfb2613b7ebe4d4a4ec9f85ba616 Mon Sep 17 00:00:00 2001
From: Fredrik Svedberg
Date: Tue, 27 Sep 2022 14:13:01 +0200
Subject: MLBEDSW-6969 Remove RescaleAdd and RescaleMul operators

Removed RescaleAdd and RescaleMul operators in favour of
Operation.explicit_scaling and removed Operation.rescale.

Signed-off-by: Fredrik Svedberg
Change-Id: Idccd8851731d4bb8d4e84970e0fd6b409d7d4e45
---
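Notes (kept outside the commit message; git am ignores text between the "---"
marker and the diffstat):

The patch replaces the dedicated RescaleAdd/RescaleMul operator types and the
Operation.rescale attribute with the existing Operation.explicit_scaling
attribute. A minimal sketch of the new pattern, based on how convert_prelu()
and the softmax graph are rewritten below; the operator name and the external
import path are illustrative, everything else mirrors the patch:

    from ethosu.vela.operation import ExplicitScaling, Op, Operation

    # Before: a dedicated operator type carried the output rescale
    #   add_op = Operation(Op.RescaleAdd, "example_add")
    #   add_op.rescale = (1, 0)  # (scale, shift)
    # After: a plain Add with the rescale expressed as explicit scaling
    add_op = Operation(Op.Add, "example_add")
    add_op.explicit_scaling = ExplicitScaling(False, shift=[0], multiplier=[1])  # no scaling

A second sketch after the patch illustrates how the multiplier/shift pair is
applied when the command stream is generated.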
 ethosu/vela/cascade_builder.py                    |  8 +++-----
 ethosu/vela/high_level_command_to_npu_op.py       | 21 +++++++--------------
 ethosu/vela/operation.py                          |  7 -------
 ethosu/vela/operation_util.py                     | 19 -------------------
 ethosu/vela/register_command_stream_generator.py  | 24 +++++++++++++++---------
 ethosu/vela/softmax.py                            | 19 +++++++++----------
 ethosu/vela/tflite_graph_optimiser.py             | 13 ++++++-------
 ethosu/vela/tosa_reader.py                        |  4 ++--
 ethosu/vela/tosa_supported_operators.py           |  1 -
 9 files changed, 42 insertions(+), 74 deletions(-)

diff --git a/ethosu/vela/cascade_builder.py b/ethosu/vela/cascade_builder.py
index 3c105374..1f5dc504 100644
--- a/ethosu/vela/cascade_builder.py
+++ b/ethosu/vela/cascade_builder.py
@@ -18,6 +18,7 @@
 # Groups Operators in a schedule together to form Cascades.
 from collections import namedtuple
 
+from .high_level_command_to_npu_op import ifm_ifm2_correct_order
 from .numeric_util import round_up
 from .operation import NpuBlockType
 from .operation import Op
@@ -169,17 +170,14 @@ class CascadeBuilder:
     @staticmethod
     def element_wise_cascading_conformity(sched_op):
         """Check the inputs of the op to see if it's a candidate for cascading."""
-        # Cascading sub-operators of Softmax results in a crash when handling Sub and RescaleAdd ops
 
         ifm = sched_op.parent_op.ifm
         ifm2 = sched_op.parent_op.ifm2
 
-        if sched_op.op_type in [Op.RescaleAdd]:
-            return False
-
+        # Cascading elementwise operations with reverse operand order is not handled
         if sched_op.parent_op.type.is_binary_elementwise_op() and ifm and ifm2:
             # We cannot rule out cascadability if at least one IFM is constant
-            return Op.Const in (ifm.ops[0], ifm2.ops[0])
+            return Op.Const in (ifm.ops[0].type, ifm2.ops[0].type) and ifm_ifm2_correct_order(ifm.shape, ifm2.shape)
         else:
             # Either one IFM is not variable or it is not a binary elementwise op - we cannot rule out cascadability
             return True
diff --git a/ethosu/vela/high_level_command_to_npu_op.py b/ethosu/vela/high_level_command_to_npu_op.py
index 7923e371..974d980c 100644
--- a/ethosu/vela/high_level_command_to_npu_op.py
+++ b/ethosu/vela/high_level_command_to_npu_op.py
@@ -92,9 +92,7 @@ dtype_map = {
 # Maps an elementwise op type to an elementwise_mode enum value used by NPU_OP_ELEMENTWISE
 elementwise_op_map = {
     Op.Mul: NpuElementWiseOp.MUL,
-    Op.RescaleMul: NpuElementWiseOp.MUL,
     Op.Add: NpuElementWiseOp.ADD,
-    Op.RescaleAdd: NpuElementWiseOp.ADD,
     Op.Sub: NpuElementWiseOp.SUB,
     Op.Minimum: NpuElementWiseOp.MIN,
     Op.Maximum: NpuElementWiseOp.MAX,
@@ -312,11 +310,7 @@ def use_zero_point_0(ps, tens: Tensor, is_ifm_tensor: bool) -> bool:
         (
             ps.primary_op.activation is None
             or forced_ofm_quantization is not None
-            or (
-                ps.primary_op.type.is_avgpool_op()
-                and ps.primary_op.activation.op_type.is_relu_op()
-                and not ps.primary_op.rescale
-            )
+            or (ps.primary_op.type.is_avgpool_op() and ps.primary_op.activation.op_type.is_relu_op())
         )
         and (ps.primary_op.memory_function != Op.ConcatSliceWrite)
         and not fused_quantize
@@ -461,7 +455,7 @@ def create_npu_activation(op: Operation) -> NpuActivation:
     act = NpuActivation(act_op)
     act.min = op.activation.min
     act.max = op.activation.max
-    if act_op is NpuActivationOp.NONE_OR_RELU and op.type.is_avgpool_op() and not op.rescale:
+    if act_op is NpuActivationOp.NONE_OR_RELU and op.type.is_avgpool_op() and not op.explicit_scaling:
         quant = op.ofm.quantization
         if quant and quant.zero_point:  # Zero point is not 0
             scale_f32 = 1 if quant.scale_f32 is None else quant.scale_f32
@@ -544,10 +538,8 @@ def create_npu_pool_op(cmd: NpuStripe, arch: ArchitectureFeatures) -> NpuPooling
     npu_op = NpuPoolingOperation(pool_op)
     set_common_op_fields(npu_op, cmd, arch)
     # Pooling specific info
-    npu_op.rescale = op.rescale
     if op.explicit_scaling:
         # Note: reuse of rescale for explicit scaling to not expose this in the external API
-        assert npu_op.rescale is None
         npu_op.rescale = op.explicit_scaling
 
     return npu_op
@@ -588,7 +580,11 @@ def create_npu_elementwise_op(cmd: NpuStripe, arch: ArchitectureFeatures) -> Npu
     set_common_op_fields(npu_op, cmd, arch)
     # Check if output scale needs to be overridden
     output_scale = None
-    if op.type == Op.Add and op.original_type.is_resize_op():
+    if op.explicit_scaling is not None:
+        assert not op.explicit_scaling.per_channel
+        assert op.type in (Op.Add, Op.Mul, Op.Sub)
+        npu_op.rescale = (op.explicit_scaling.multiplier[0], op.explicit_scaling.shift[0])
+    elif op.type == Op.Add and op.original_type.is_resize_op():
         # Force output scale same as the input scale for
         # resizebilinear/nearestneighbor 1x1 that is converted to add
         output_scale = npu_op.ifm2.quantization.scale_f32
@@ -596,9 +592,6 @@ def create_npu_elementwise_op(cmd: NpuStripe, arch: ArchitectureFeatures) -> Npu
         output_scale = npu_op.ifm.quantization.scale_f32 / npu_op.ofm.quantization.scale_f32
     elif op.type == Op.LeakyRelu:
         output_scale = op.attrs["alpha"]
-    elif op.type in (Op.RescaleAdd, Op.RescaleMul):
-        assert op.rescale is not None, f"{op.type} must have rescale"
-        npu_op.rescale = op.rescale
     elif op.type in (Op.Add, Op.Mul, Op.Sub):
         if op.activation is not None and op.activation.op_type in (Op.Sigmoid, Op.Tanh):
             output_scale = 1 / 0x3000
diff --git a/ethosu/vela/operation.py b/ethosu/vela/operation.py
index 8b3c88d9..8189793e 100644
--- a/ethosu/vela/operation.py
+++ b/ethosu/vela/operation.py
@@ -27,7 +27,6 @@ from typing import List
 from typing import Optional
 from typing import Tuple
 from typing import TYPE_CHECKING
-from typing import Union
 
 from .api import NpuRoundingMode
 from .errors import VelaError
@@ -247,8 +246,6 @@ class Op(Enum):
     ReluN1To1 = OperatorInfo(indices=NNG_IFM_INDICES)
     ReluN = OperatorInfo(indices=NNG_IFM_INDICES)  # TOSA specific
     Rescale = OperatorInfo(indices=NNG_IFM_INDICES)  # TOSA specific
-    RescaleAdd = OperatorInfo(block_type=NpuBlockType.ElementWise, indices=NNG_IFM_IFM2_INDICES)
-    RescaleMul = OperatorInfo(block_type=NpuBlockType.ElementWise, indices=NNG_IFM_IFM2_INDICES)
     Reshape = OperatorInfo(indices=NNG_IFM_INDICES)
     # resize ops map to pooling operations unless explicitly converted to other operations in the graph optimiser
     ResizeBilinear = OperatorInfo(block_type=NpuBlockType.Pooling, indices=NNG_IFM_INDICES)
@@ -535,9 +532,6 @@ class Operation:
         self._kernel = None
         self.ifm_shapes: List[Shape4D] = []
         self.ofm_shapes: List[Shape4D] = []
-        # If not none: contains rescale to be used as output scaling
-        # (which overrides the ofm tensor's scale)
-        self.rescale: Optional[Union[Tuple[int, int], ExplicitScaling]] = None
         self.read_offsets: List[Optional[Shape4D]] = [None, None]  # offset for [ifm, ifm2]
         self.read_shapes: List[Optional[Shape4D]] = [None, None]  # read shape for [ifm, ifm2]
         self.rounding_mode: Optional[NpuRoundingMode] = None
@@ -586,7 +580,6 @@ class Operation:
         res.rounding_mode = self.rounding_mode
         res.explicit_scaling = self.explicit_scaling
         res.low_precision_scaling = self.low_precision_scaling
-        res.rescale = self.rescale
         res.ifm_resampling_mode = self.ifm_resampling_mode
         res.tile_base_offsets_ifm = [_ifm.copy() for _ifm in self.tile_base_offsets_ifm]
         res.tile_base_offsets_ofm = self.tile_base_offsets_ofm.copy()
diff --git a/ethosu/vela/operation_util.py b/ethosu/vela/operation_util.py
index 36a8e592..aaabddbf 100644
--- a/ethosu/vela/operation_util.py
+++ b/ethosu/vela/operation_util.py
@@ -122,25 +122,6 @@ def create_add(
     )
 
 
-def create_rescale_add(
-    name: str,
-    ifm: Tensor,
-    ifm2: Tensor,
-    rescale: Tuple[int, int],
-    quantization: QuantizationParameters,
-    activation: Optional[ActivationFunction] = None,
-    dtype: Optional[DataType] = None,
-    attrs: Optional[dict] = None,
-    ifm_shape: Optional[Shape4D] = None,
-    ifm2_shape: Optional[Shape4D] = None,
-) -> Operation:
-    op = create_binary_elementwise(
-        Op.RescaleAdd, name, ifm, ifm2, quantization, activation, dtype, attrs, ifm_shape, ifm2_shape
-    )
-    op.rescale = rescale
-    return op
-
-
 def create_clz(
     name: str,
     ifm: Tensor,
diff --git a/ethosu/vela/register_command_stream_generator.py b/ethosu/vela/register_command_stream_generator.py
index 5680c96f..99ac32d5 100644
--- a/ethosu/vela/register_command_stream_generator.py
+++ b/ethosu/vela/register_command_stream_generator.py
@@ -707,6 +707,7 @@ def generate_ofm_scaling_for_pooling(emit: CommandStreamEmitter, pool_op: NpuPoo
             shift = explicit_scaling.shift[0]
         else:
             # for ResizeBilinear/NearestNeighbor operations with rescale
+            # Note: this is not used, but part of the public API
             rescale = pool_op.rescale
             rescale_bits = len(bin(round_up_to_int(rescale))) - 2 + 1
             scale, shift = scaling.quantise_pooling_scale(kernel.height * kernel.width, rescale_bits)
@@ -759,25 +760,30 @@ def generate_scaling_for_elementwise(emit: CommandStreamEmitter, npu_op: NpuElem
         else:
             ofm_scale, shift = scaling.elementwise_mul_scale(input_scale, input2_scale, output_scale)
     else:  # Add/Sub
-        opa_scale: float
-        opb_scale: float
+        # Default operand scaling is no scaling
+        opa_scale = opb_scale = 1
+        opa_shift = 0
         bitdepth = npu_op.ifm.data_type.size_in_bits()
         use_advanced_scaling = False
-        if None in (input_scale, input2_scale, output_scale):
-            opa_scale = opb_scale = ofm_scale = 1
-            opa_shift = shift = 0
-            if npu_op.rescale is not None:
-                ofm_scale, shift = npu_op.rescale
+        if npu_op.rescale is not None:
+            # Explicit ofm scaling
+            ofm_scale, shift = npu_op.rescale
+        elif None in (input_scale, input2_scale, output_scale):
+            # No ofm scaling
+            ofm_scale = 1
+            shift = 0
         elif input_scale == input2_scale and bitdepth == 16:
+            # int16 same scaling
             opa_scale, opb_scale, ofm_scale, shift = scaling.simplified_elementwise_add_sub_scale(
                 input_scale, input2_scale, output_scale
             )
             # align the double rounding with that of advanced scaling
-            opa_scale /= 2
-            opb_scale /= 2
+            opa_scale //= 2
+            opb_scale //= 2
             shift -= 1
             opa_shift = 0  # Unused for this case
         elif input_scale == input2_scale:
+            # Same scaling
             opa_scale, opb_scale, ofm_scale, shift = scaling.simplified_elementwise_add_sub_scale(
                 input_scale, input2_scale, output_scale
             )
diff --git a/ethosu/vela/softmax.py b/ethosu/vela/softmax.py
index 9565bc5c..1655427e 100644
--- a/ethosu/vela/softmax.py
+++ b/ethosu/vela/softmax.py
@@ -28,6 +28,7 @@ from .api import NpuRoundingMode
 from .data_type import DataType
 from .debug_database import DebugDatabase
 from .operation import ActivationFunction
+from .operation import ExplicitScaling
 from .operation import Op
 from .operation import Operation
 from .operation_util import create_add
@@ -35,7 +36,6 @@ from .operation_util import create_clz
 from .operation_util import create_depthwise_maxpool
 from .operation_util import create_mul
 from .operation_util import create_reduce_sum
-from .operation_util import create_rescale_add
 from .operation_util import create_shl
 from .operation_util import create_shr
 from .operation_util import create_sub
@@ -351,16 +351,15 @@ class SoftMax:
         f0_one_const = create_const_tensor(
             "F0_one_const", [1, 1, 1, 1], DataType.int32, [(1 << 31) - 1], np.int32, quantization=no_scale_quant
         )
-        half_denominator = add_op_get_ofm(
-            create_rescale_add(
-                f"{self.op.name}_add{pass_number}",
-                f0_one_const,
-                shifted_sum_minus_one,
-                (1, 1),  # Custom rescale
-                one_scale_quant,
-                activation,
-            )
+        add_op = create_add(
+            f"{self.op.name}_add{pass_number}",
+            f0_one_const,
+            shifted_sum_minus_one,
+            one_scale_quant,
+            activation,
         )
+        add_op.explicit_scaling = ExplicitScaling(False, shift=[1], multiplier=[1])  # Custom rescale
+        half_denominator = add_op_get_ofm(add_op)
 
         # PASS 11 - Multiply
         neg_32_over_17 = create_const_tensor(
diff --git a/ethosu/vela/tflite_graph_optimiser.py b/ethosu/vela/tflite_graph_optimiser.py
index 5d6d7071..f3ca1b63 100644
--- a/ethosu/vela/tflite_graph_optimiser.py
+++ b/ethosu/vela/tflite_graph_optimiser.py
@@ -871,7 +871,7 @@ def fixup_relus_with_differing_ifm_ofm_scaling(op, arch, nng):
         # Add explicit rescaling
         rescale = ifm.quantization.scale_f32 / ofm.quantization.scale_f32
         multiplier, shift = scaling.quantise_scale(rescale)
-        relu_fused_op.rescale = ExplicitScaling(False, [shift], [multiplier])
+        relu_fused_op.explicit_scaling = ExplicitScaling(False, [shift], [multiplier])
 
         # Tidy up and assign the ifm and ofm to the new op
         ifm.consumer_list.remove(op)
@@ -991,8 +991,8 @@ def convert_prelu(op, arch, nng):
         DebugDatabase.add_optimised(op, relu_op)
 
         # Add scaled and alpha multiplied values (without scaling)
-        add_op = Operation(Op.RescaleAdd, op.name + "_add")
-        add_op.rescale = (1, 0)  # No scale or shift
+        add_op = Operation(Op.Add, op.name + "_add")
+        add_op.explicit_scaling = ExplicitScaling(False, shift=[0], multiplier=[1])  # No scaling
         add_op.add_input_tensor(fm_alpha)
         add_op.add_input_tensor(fm_scaled)
         add_op.set_output_tensor(ofm)
@@ -1180,8 +1180,8 @@ def convert_lrelu_to_mul_max(op, arch):
         mul_ifm.dtype = DataType.int32
         min_op.set_output_tensor(mul_ifm)
         min_op.set_ifm_ofm_shapes()
-        new_op = Op.RescaleAdd
-        op.rescale = (1, 0)  # No scale or shift
+        new_op = Op.Add
+        op.explicit_scaling = ExplicitScaling(False, shift=[0], multiplier=[1])  # No scaling
         DebugDatabase.add_optimised(op, min_op)
 
     # Add multiplication with alpha
@@ -1196,8 +1196,7 @@ def convert_lrelu_to_mul_max(op, arch):
     if is_converted_prelu:
         # The LeakyRelu was the result from convert_prelu and the scaling is provided
         scalar, alpha_scale, alpha_shift = op.attrs["alpha_scaling"]
-        mul_alpha.type = Op.RescaleMul
-        mul_alpha.rescale = [alpha_scale, alpha_shift]
+        mul_alpha.explicit_scaling = ExplicitScaling(False, [alpha_shift], [alpha_scale])
     elif alpha == 0 or np.isinf(1 / alpha):
         # Handling of alpha near or at zero
         quantization.scale_f32 = np.float32(1)
diff --git a/ethosu/vela/tosa_reader.py b/ethosu/vela/tosa_reader.py
index 2bec9cf1..cd18adb2 100644
--- a/ethosu/vela/tosa_reader.py
+++ b/ethosu/vela/tosa_reader.py
@@ -23,6 +23,7 @@ import numpy as np
 
 from .nn_graph import Graph
 from .nn_graph import Subgraph
+from .operation import ExplicitScaling
 from .operation import Op
 from .operation import Operation
 from .reader_util import align_tensor_indices_to_nng
@@ -183,8 +184,7 @@ class TosaSubgraph:
             if "shift" in op.attrs and op.type == Op.Mul:
                 shift = op.attrs["shift"]
                 if shift != 0:
-                    op.type = Op.RescaleMul
-                    op.rescale = [1, shift]
+                    op.explicit_scaling = ExplicitScaling(False, [shift], [1])
             if op.type.is_depthwise_conv2d_op():
                 op.attrs["depth_multiplier"] = op.weights.shape[3]
             if op.type == Op.SplitSliceRead:
diff --git a/ethosu/vela/tosa_supported_operators.py b/ethosu/vela/tosa_supported_operators.py
index 192862ef..3f3a0025 100644
--- a/ethosu/vela/tosa_supported_operators.py
+++ b/ethosu/vela/tosa_supported_operators.py
@@ -50,7 +50,6 @@ class TosaSupportedOperators:
         (
             Op.Add,
             Op.Mul,
-            Op.RescaleMul,
             Op.Sub,
         )
     )

-- 
cgit v1.2.1
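Reviewer note (not part of the patch): a rough sketch of what the explicit
multiplier/shift pair means once create_npu_elementwise_op() has copied it
into npu_op.rescale. The helper below is an illustrative assumption, not code
from the repository, and it ignores the rounding mode, which is configured
separately through Operation.rounding_mode:

    def apply_explicit_scaling(acc: int, multiplier: int, shift: int) -> int:
        # Scale the accumulator, then shift right; multiplier=1, shift=0
        # therefore means "no scaling".
        return (acc * multiplier) >> shift

    # ExplicitScaling(False, shift=[0], multiplier=[1]) leaves values unchanged,
    # matching the "No scaling" Add created by convert_prelu().
    assert apply_explicit_scaling(100, multiplier=1, shift=0) == 100

    # ExplicitScaling(False, shift=[1], multiplier=[1]) halves the result,
    # matching the softmax pass that computes half_denominator.
    assert apply_explicit_scaling(100, multiplier=1, shift=1) == 50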