author     Fredrik Svedberg <fredrik.svedberg@arm.com>    2020-12-01 16:33:45 +0100
committer  patrik.gustavsson <patrik.gustavsson@arm.com>  2020-12-08 14:00:17 +0000
commit     d9c2c4258c50f04ef3a6c3849508d317249e8ebf
tree       6604f3fef997fb946884bf5bd2f54236b523393f
parent     d5cf7650b6bcfa5d81321661fa300763660d31ab
[MLBEDSW-3690] Refactor Softmax
Move operator generation code to common functions.

Signed-off-by: Fredrik Svedberg <fredrik.svedberg@arm.com>
Change-Id: I02e185fd793a96ae435fa7d235c9d1e97f388a03
 ethosu/vela/graph_optimiser.py |   2
 ethosu/vela/operation.py       |  14
 ethosu/vela/operation_util.py  | 192
 ethosu/vela/pass_packing.py    |   2
 ethosu/vela/softmax.py         | 554
 5 files changed, 367 insertions, 397 deletions
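
The refactor replaces hand-rolled Operation setup at each call site with factory functions in the new operation_util.py; each factory returns a fully attributed op (and, for the elementwise helpers, a ready-made output tensor). A minimal sketch of the new calling convention, assuming a checkout of ethos-u-vela at this commit is importable; the op name is made up:

    from ethosu.vela.operation_util import create_avgpool_nop

    # The factory pre-populates the 1x1, VALID-padding, stride-1 attributes
    # that call sites previously set by hand in operation.py.
    nop = create_avgpool_nop("example_avgpool_nop")  # hypothetical op name
    assert nop.attrs["padding"] == b"VALID"
    assert nop.attrs["ksize"] == [1, 1, 1, 1]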
diff --git a/ethosu/vela/graph_optimiser.py b/ethosu/vela/graph_optimiser.py
index 85c2b602..4a857507 100644
--- a/ethosu/vela/graph_optimiser.py
+++ b/ethosu/vela/graph_optimiser.py
@@ -32,10 +32,10 @@ from .numeric_util import clamp_sigmoid
from .numeric_util import full_shape
from .numeric_util import round_away_zero
from .operation import create_activation_function
-from .operation import create_avgpool_nop
from .operation import NpuBlockType
from .operation import Op
from .operation import Operation
+from .operation_util import create_avgpool_nop
from .softmax import SoftMax
from .tensor import check_quantized_tens_scaling_equal
from .tensor import create_const_tensor
diff --git a/ethosu/vela/operation.py b/ethosu/vela/operation.py
index 5cb4b6ae..45fae217 100644
--- a/ethosu/vela/operation.py
+++ b/ethosu/vela/operation.py
@@ -355,20 +355,6 @@ def create_activation_function(op_type: Op) -> ActivationFunction:
return act
-def create_avgpool_nop(name):
- op = Operation(Op.AvgPool, name)
- op.attrs["padding"] = b"VALID"
- op.attrs["stride_w"] = 1
- op.attrs["stride_h"] = 1
- op.attrs["filter_width"] = 1
- op.attrs["filter_height"] = 1
- op.attrs["strides"] = [1, 1, 1, 1]
- op.attrs["ksize"] = [1, 1, 1, 1]
- op.attrs["skirt"] = [0, 0, 0, 0]
- op.attrs["explicit_padding"] = [0, 0, 0, 0]
- return op
-
-
def get_slice_offsets(input_shape, offset_tens, offset_mask, is_begin=True):
# For strided slice operator: get start or end offsets
offsets = len(input_shape) * [0] if is_begin else input_shape[:]
diff --git a/ethosu/vela/operation_util.py b/ethosu/vela/operation_util.py
new file mode 100644
index 00000000..2fc7622c
--- /dev/null
+++ b/ethosu/vela/operation_util.py
@@ -0,0 +1,192 @@
+# Copyright (C) 2020 Arm Limited or its affiliates. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the License); you may
+# not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an AS IS BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# Description:
+# Utility functions for creating Network Operations.
+from typing import Optional
+
+from .data_type import DataType
+from .high_level_command_to_npu_op import ifm_ifm2_correct_order
+from .operation import ActivationFunction
+from .operation import Op
+from .operation import Operation
+from .tensor import create_reshape_tensor
+from .tensor import QuantizationParameters
+from .tensor import Tensor
+
+
+def create_avgpool_nop(name: str) -> Operation:
+ op = Operation(Op.AvgPool, name)
+ op.attrs["padding"] = b"VALID"
+ op.attrs["stride_w"] = 1
+ op.attrs["stride_h"] = 1
+ op.attrs["filter_width"] = 1
+ op.attrs["filter_height"] = 1
+ op.attrs["strides"] = [1, 1, 1, 1]
+ op.attrs["ksize"] = [1, 1, 1, 1]
+ op.attrs["skirt"] = [0, 0, 0, 0]
+ op.attrs["explicit_padding"] = [0, 0, 0, 0]
+ return op
+
+
+def create_depthwise_maxpool(
+ name: str, ifm: Tensor, quantization: QuantizationParameters, activation: Optional[ActivationFunction] = None
+) -> Operation:
+ op = Operation(Op.MaxPool, name)
+ height = ifm.shape[1] * ifm.shape[2]
+ width = ifm.shape[3]
+ ifm_shape = [1, height, width, 1]
+ op.attrs["padding"] = b"VALID"
+ op.attrs["stride_w"] = 1
+ op.attrs["stride_h"] = 1
+ op.attrs["filter_width"] = width
+ op.attrs["filter_height"] = 1
+ op.attrs["strides"] = [1, op.attrs["stride_h"], op.attrs["stride_w"], 1]
+ op.attrs["ksize"] = [1, op.attrs["filter_height"], op.attrs["filter_width"], 1]
+ op.activation = activation
+ op.inputs = [create_reshape_tensor(ifm, ifm_shape)]
+ ofm = Tensor([1, height, 1, 1], ifm.dtype, op.name + "_tens0")
+ ofm.quantization = quantization
+ op.set_output_tensor(ofm)
+ return op
+
+
+def create_reduce_sum(
+ name: str, ifm: Tensor, quantization: QuantizationParameters, activation: Optional[ActivationFunction] = None
+) -> Operation:
+ op = Operation(Op.ReduceSum, name)
+ op.attrs["padding"] = b"VALID"
+ op.attrs["stride_w"] = 1
+ op.attrs["stride_h"] = 1
+ op.attrs["filter_width"] = 1
+ op.attrs["filter_height"] = 1
+ op.attrs["strides"] = [1, op.attrs["stride_h"], op.attrs["stride_w"], 1]
+ op.attrs["ksize"] = [1, op.attrs["filter_height"], op.attrs["filter_width"], 1]
+ op.add_input_tensor(ifm)
+ op.activation = activation
+ ofm_shape = [1, ifm.shape[1], ifm.shape[2], 1]
+ sum_of_exp = Tensor(ofm_shape, DataType.int32, op.name + "_tens0")
+ sum_of_exp.quantization = quantization
+ op.set_output_tensor(sum_of_exp)
+ return op
+
+
+def create_add(
+ name: str,
+ ifm: Tensor,
+ ifm2: Tensor,
+ quantization: QuantizationParameters,
+ activation: Optional[ActivationFunction] = None,
+ dtype: Optional[DataType] = None,
+ attrs: Optional[dict] = None,
+) -> Operation:
+ return create_binary_elementwise(Op.Add, name, ifm, ifm2, quantization, activation, dtype, attrs)
+
+
+def create_clz(
+ name: str,
+ ifm: Tensor,
+ quantization: QuantizationParameters,
+ activation: Optional[ActivationFunction] = None,
+ dtype: Optional[DataType] = None,
+ attrs: Optional[dict] = None,
+) -> Operation:
+ return create_unary_elementwise(Op.CLZ, name, ifm, quantization, activation, dtype, attrs)
+
+
+def create_mul(
+ name: str,
+ ifm: Tensor,
+ ifm2: Tensor,
+ quantization: QuantizationParameters,
+ activation: Optional[ActivationFunction] = None,
+ dtype: Optional[DataType] = None,
+ attrs: Optional[dict] = None,
+) -> Operation:
+ return create_binary_elementwise(Op.Mul, name, ifm, ifm2, quantization, activation, dtype, attrs)
+
+
+def create_shl(
+ name: str,
+ ifm: Tensor,
+ ifm2: Tensor,
+ quantization: QuantizationParameters,
+ activation: Optional[ActivationFunction] = None,
+ dtype: Optional[DataType] = None,
+ attrs: Optional[dict] = None,
+) -> Operation:
+ return create_binary_elementwise(Op.SHL, name, ifm, ifm2, quantization, activation, dtype, attrs)
+
+
+def create_shr(
+ name: str,
+ ifm: Tensor,
+ ifm2: Tensor,
+ quantization: QuantizationParameters,
+ activation: Optional[ActivationFunction] = None,
+ dtype: Optional[DataType] = None,
+ attrs: Optional[dict] = None,
+) -> Operation:
+ return create_binary_elementwise(Op.SHR, name, ifm, ifm2, quantization, activation, dtype, attrs)
+
+
+def create_sub(
+ name: str,
+ ifm: Tensor,
+ ifm2: Tensor,
+ quantization: QuantizationParameters,
+ activation: Optional[ActivationFunction] = None,
+ dtype: Optional[DataType] = None,
+ attrs: Optional[dict] = None,
+) -> Operation:
+ return create_binary_elementwise(Op.Sub, name, ifm, ifm2, quantization, activation, dtype, attrs)
+
+
+def create_unary_elementwise(
+ op_type: Op,
+ name: str,
+ ifm: Tensor,
+ quantization: QuantizationParameters,
+ activation: Optional[ActivationFunction] = None,
+ dtype: Optional[DataType] = None,
+ attrs: Optional[dict] = None,
+) -> Operation:
+ return create_binary_elementwise(op_type, name, ifm, None, quantization, activation, dtype, attrs)
+
+
+def create_binary_elementwise(
+ op_type: Op,
+ name: str,
+ ifm: Tensor,
+ ifm2: Tensor,
+ quantization: QuantizationParameters,
+ activation: Optional[ActivationFunction] = None,
+ dtype: Optional[DataType] = None,
+ attrs: Optional[dict] = None,
+) -> Operation:
+ op = Operation(op_type, name)
+ op.add_input_tensor(ifm)
+ if ifm2:
+ op.add_input_tensor(ifm2)
+ op.activation = activation
+ if not dtype:
+ dtype = ifm.dtype
+ if attrs:
+ op.attrs.update(attrs)
+ ofm_shape = ifm.shape if ifm2 is None or ifm_ifm2_correct_order(ifm.shape, ifm2.shape) else ifm2.shape
+ ofm = Tensor(ofm_shape, dtype, f"{op.name}_tens0")
+ ofm.quantization = quantization
+ op.set_output_tensor(ofm)
+ return op
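
The binary factories above pick the OFM shape via ifm_ifm2_correct_order and default the OFM dtype to the IFM's. A short usage sketch under the same importability assumption; shapes, names and quantization values are illustrative only:

    from ethosu.vela.data_type import DataType
    from ethosu.vela.operation_util import create_sub
    from ethosu.vela.tensor import QuantizationParameters, Tensor

    quant = QuantizationParameters()
    quant.scale_f32 = 1.0
    quant.zero_point = 0

    ifm = Tensor([1, 2, 2, 4], DataType.int8, "ifm")  # toy feature maps
    ifm.quantization = quant
    ifm2 = Tensor([1, 2, 2, 4], DataType.int8, "ifm2")
    ifm2.quantization = quant.clone()

    # One call wires both inputs and attaches an int32 OFM ("sub_tens0"),
    # mirroring how softmax.py below chains these helpers pass by pass.
    sub_op = create_sub("sub", ifm, ifm2, quant.clone(), dtype=DataType.int32)
    assert sub_op.ofm.shape == [1, 2, 2, 4]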
diff --git a/ethosu/vela/pass_packing.py b/ethosu/vela/pass_packing.py
index 59376a85..ea2eaa4f 100644
--- a/ethosu/vela/pass_packing.py
+++ b/ethosu/vela/pass_packing.py
@@ -21,9 +21,9 @@ import enum
from .debug_database import DebugDatabase
from .nn_graph import Pass
from .nn_graph import PassPlacement
-from .operation import create_avgpool_nop
from .operation import NpuBlockType
from .operation import Op
+from .operation_util import create_avgpool_nop
from .tensor import TensorPurpose
diff --git a/ethosu/vela/softmax.py b/ethosu/vela/softmax.py
index 8c980ad4..1bdab740 100644
--- a/ethosu/vela/softmax.py
+++ b/ethosu/vela/softmax.py
@@ -30,9 +30,16 @@ from .debug_database import DebugDatabase
from .operation import ActivationFunction
from .operation import Op
from .operation import Operation
+from .operation_util import create_add
+from .operation_util import create_clz
+from .operation_util import create_depthwise_maxpool
+from .operation_util import create_mul
+from .operation_util import create_reduce_sum
+from .operation_util import create_shl
+from .operation_util import create_shr
+from .operation_util import create_sub
from .tensor import create_const_tensor
from .tensor import create_reshape_tensor
-from .tensor import Tensor
from .tensor import TensorPurpose
@@ -238,215 +245,124 @@ class SoftMax:
one_scale_quant = ifm.quantization.clone()
one_scale_quant.scale_f32 = 1.0
one_scale_quant.zero_point = 0
+ two_scale_quant = one_scale_quant.clone()
+ two_scale_quant.scale_f32 = 2.0
ifm.quantization.zero_point = 0
pass_number = 0
+ def add_op_get_ofm(op):
+ DebugDatabase.add_optimised(self.op, op)
+ nonlocal pass_number
+ pass_number += 1
+ return op.ofm
+
# PASS 0 - Depthwise Maxpool
- maxpool_op = self.op.clone(f"_maxpool{pass_number}")
- maxpool_op.type = Op.MaxPool
- maxpool_h = ifm.shape[1] * ifm.shape[2]
- maxpool_w = ifm.shape[3]
- maxpool_ifm_shape = [1, maxpool_h, maxpool_w, 1]
- maxpool_op.attrs["padding"] = b"VALID"
- maxpool_op.attrs["stride_w"] = 1
- maxpool_op.attrs["stride_h"] = 1
- maxpool_op.attrs["filter_width"] = maxpool_w
- maxpool_op.attrs["filter_height"] = 1
- maxpool_op.attrs["strides"] = [1, maxpool_op.attrs["stride_h"], maxpool_op.attrs["stride_w"], 1]
- maxpool_op.attrs["ksize"] = [1, maxpool_op.attrs["filter_height"], maxpool_op.attrs["filter_width"], 1]
- maxpool_op.inputs = [create_reshape_tensor(ifm, maxpool_ifm_shape)]
- ifm_max = Tensor([1, maxpool_h, 1, 1], ifm.dtype, f"{maxpool_op.name}_0")
- ifm_max.quantization = no_scale_quant
- maxpool_op.set_output_tensor(ifm_max)
- DebugDatabase.add_optimised(self.op, maxpool_op)
- pass_number += 1
+ ifm_max = add_op_get_ofm(create_depthwise_maxpool(f"{self.op.name}_maxpool{pass_number}", ifm, no_scale_quant))
# PASS 1 - Sub+LUT(exp)
- sub_op = Operation(Op.Sub, f"{self.op.name}_sub{pass_number}")
- sub_op.add_input_tensor(ifm)
- sub_op.add_input_tensor(create_reshape_tensor(ifm_max, [1, ifm.shape[1], ifm.shape[2], 1]))
+ sub_op_quantization = one_scale_quant.clone()
+ sub_op_quantization.zero_point = 127
+ ifm_max = create_reshape_tensor(ifm_max, [1, ifm.shape[1], ifm.shape[2], 1])
+ sub_op = create_sub(f"{self.op.name}_sub{pass_number}", ifm, ifm_max, sub_op_quantization, dtype=DataType.int32)
sub_op.set_activation_lut(
create_const_tensor(
- f"{sub_op.name}_lut", [1, 1, 1, 256], DataType.int32, exp_lut, np.int32, TensorPurpose.LUT
+ f"{sub_op.name}_exp_lut", [1, 1, 1, 256], DataType.int32, exp_lut, np.int32, TensorPurpose.LUT
)
)
- ifm_exp = Tensor(ifm.shape, DataType.int32, f"{sub_op.name}_0")
- ifm_exp.quantization = one_scale_quant.clone()
- ifm_exp.quantization.zero_point = 127
- sub_op.activation = ActivationFunction(Op.LUT)
+ ifm_exp = add_op_get_ofm(sub_op)
# Note: activation.min/max are non-quantized values
sub_op.activation.min = -128 - ifm_exp.quantization.zero_point
sub_op.activation.max = 127 - ifm_exp.quantization.zero_point
- sub_op.set_output_tensor(ifm_exp)
- DebugDatabase.add_optimised(self.op, sub_op)
- pass_number += 1
# PASS 2 - SHR
- shr2_op = Operation(Op.SHR, f"{self.op.name}_shr{pass_number}")
- shr2_op.attrs["rounding_mode"] = NpuRoundingMode.NATURAL
- shr2_op.add_input_tensor(ifm_exp)
- shr2_op.add_input_tensor(
- create_const_tensor(
- f"{shr2_op.name}_const", [1, 1, 1, 1], DataType.int32, [12], np.int32, quantization=no_scale_quant
- ),
+ name = f"{self.op.name}_shr{pass_number}"
+ shift = create_const_tensor(
+ f"{name}_const", [1, 1, 1, 1], DataType.int32, [12], np.int32, quantization=no_scale_quant
+ )
+ rescaled_exp = add_op_get_ofm(
+ create_shr(
+ name, ifm_exp, shift, no_scale_quant, activation, attrs={"rounding_mode": NpuRoundingMode.NATURAL},
+ )
)
- shr2_op.activation = activation.clone()
- rescaled_exp = Tensor(ifm.shape, ifm_exp.dtype, f"{shr2_op.name}_0")
- rescaled_exp.quantization = no_scale_quant
- shr2_op.set_output_tensor(rescaled_exp)
- DebugDatabase.add_optimised(self.op, shr2_op)
- pass_number += 1
# PASS 3 - Reduce sum
- reduce_sum_op = Operation(Op.ReduceSum, f"{self.op.name}_reduce_sum3")
- reduce_sum_op.attrs["padding"] = b"VALID"
- reduce_sum_op.attrs["stride_w"] = 1
- reduce_sum_op.attrs["stride_h"] = 1
- reduce_sum_op.attrs["filter_width"] = 1
- reduce_sum_op.attrs["filter_height"] = 1
- reduce_sum_op.attrs["strides"] = [1, reduce_sum_op.attrs["stride_h"], reduce_sum_op.attrs["stride_w"], 1]
- reduce_sum_op.attrs["ksize"] = [1, reduce_sum_op.attrs["filter_height"], reduce_sum_op.attrs["filter_width"], 1]
- reduce_sum_op.add_input_tensor(rescaled_exp)
- reduce_sum_op.activation = activation.clone()
-
- reduce_sum_shape = [1, rescaled_exp.shape[1], rescaled_exp.shape[2], 1]
- sum_of_exp = Tensor(reduce_sum_shape, DataType.int32, f"{reduce_sum_op.name}_0")
- sum_of_exp.quantization = no_scale_quant
- reduce_sum_op.set_output_tensor(sum_of_exp)
- DebugDatabase.add_optimised(self.op, reduce_sum_op)
- pass_number += 1
+ sum_of_exp = add_op_get_ofm(
+ create_reduce_sum(f"{self.op.name}_reduce_sum{pass_number}", rescaled_exp, no_scale_quant, activation)
+ )
# PASS 4 - CLZ
- clz_op = Operation(Op.CLZ, f"{self.op.name}_clz{pass_number}")
- clz_op.add_input_tensor(sum_of_exp)
- clz_op.activation = activation.clone()
- headroom_plus_one = Tensor(sum_of_exp.shape, DataType.int32, f"{clz_op.name}_0")
- headroom_plus_one.quantization = no_scale_quant
- clz_op.set_output_tensor(headroom_plus_one)
- DebugDatabase.add_optimised(self.op, clz_op)
- pass_number += 1
+ headroom_plus_one = add_op_get_ofm(
+ create_clz(f"{self.op.name}_clz{pass_number}", sum_of_exp, no_scale_quant, activation)
+ )
# PASS 5 - Sub
- sub5_op = Operation(Op.Sub, f"{self.op.name}_sub{pass_number}")
- sub5_op.add_input_tensor(
- create_const_tensor(
- "headroom_offset_const",
- [1, 1, 1, 1],
- DataType.int32,
- [12 + 31 - 8],
- np.int32,
- quantization=no_scale_quant,
- ),
+ headroom_offset = create_const_tensor(
+ "headroom_offset_const", [1, 1, 1, 1], DataType.int32, [12 + 31 - 8], np.int32, quantization=no_scale_quant,
+ )
+ right_shift = add_op_get_ofm(
+ create_sub(
+ f"{self.op.name}_sub{pass_number}", headroom_offset, headroom_plus_one, no_scale_quant, activation,
+ )
)
- sub5_op.add_input_tensor(headroom_plus_one)
- sub5_op.activation = activation.clone()
- right_shift = Tensor(sum_of_exp.shape, DataType.int32, f"{sub5_op.name}_0")
- right_shift.quantization = no_scale_quant
- sub5_op.set_output_tensor(right_shift)
- DebugDatabase.add_optimised(self.op, sub5_op)
- pass_number += 1
# PASS 6 - Sub
one = create_const_tensor("one_const", [1, 1, 1, 1], DataType.int32, [1], np.int32, quantization=no_scale_quant)
- sub6_op = Operation(Op.Sub, f"{self.op.name}_sub{pass_number}")
- sub6_op.add_input_tensor(headroom_plus_one)
- sub6_op.add_input_tensor(one)
- sub6_op.activation = activation.clone()
- headroom = Tensor(sum_of_exp.shape, DataType.int32, f"{sub6_op.name}_0")
- headroom.quantization = no_scale_quant
- sub6_op.set_output_tensor(headroom)
- DebugDatabase.add_optimised(self.op, sub6_op)
- pass_number += 1
+ headroom = add_op_get_ofm(
+ create_sub(f"{self.op.name}_sub{pass_number}", headroom_plus_one, one, no_scale_quant, activation)
+ )
# PASS 7 - SHL
- shl7_op = Operation(Op.SHL, f"{self.op.name}_shl{pass_number}")
- shl7_op.add_input_tensor(sum_of_exp)
- shl7_op.add_input_tensor(headroom)
- shl7_op.activation = activation.clone()
- shifted_sum = Tensor(sum_of_exp.shape, DataType.int32, f"{shl7_op.name}_0")
- shifted_sum.quantization = no_scale_quant
- shl7_op.set_output_tensor(shifted_sum)
- DebugDatabase.add_optimised(self.op, shl7_op)
- pass_number += 1
+ shifted_sum = add_op_get_ofm(
+ create_shl(f"{self.op.name}_shl{pass_number}", sum_of_exp, headroom, no_scale_quant, activation)
+ )
# PASS 8 - Sub
- sub8_op = Operation(Op.Sub, f"{self.op.name}_sub{pass_number}")
- sub8_op.add_input_tensor(shifted_sum)
- sub8_op.add_input_tensor(
- create_const_tensor(
- "shifted_one_const", [1, 1, 1, 1], DataType.int32, [1 << 30], np.int32, quantization=no_scale_quant
- ),
+ shifted_one = create_const_tensor(
+ "shifted_one_const", [1, 1, 1, 1], DataType.int32, [1 << 30], np.int32, quantization=no_scale_quant
+ )
+ shifted_sum_minus_one = add_op_get_ofm(
+ create_sub(f"{self.op.name}_sub{pass_number}", shifted_sum, shifted_one, no_scale_quant, activation)
)
- sub8_op.activation = activation.clone()
- shifted_sum_minus_one = Tensor(sum_of_exp.shape, DataType.int32, f"{sub8_op.name}_0")
- shifted_sum_minus_one.quantization = no_scale_quant
- sub8_op.set_output_tensor(shifted_sum_minus_one)
- DebugDatabase.add_optimised(self.op, sub8_op)
- pass_number += 1
# PASS 9 - SHL
- shl9_op = Operation(Op.SHL, f"{self.op.name}_shl{pass_number}")
- shl9_op.add_input_tensor(shifted_sum_minus_one)
- shl9_op.add_input_tensor(one)
- shl9_op.activation = activation.clone()
- shifted_sum_minus_one = Tensor(sum_of_exp.shape, DataType.int32, f"{shl9_op.name}_0")
- shifted_sum_minus_one.quantization = no_scale_quant
- shl9_op.set_output_tensor(shifted_sum_minus_one)
- DebugDatabase.add_optimised(self.op, shl9_op)
- pass_number += 1
+ shifted_sum_minus_one = add_op_get_ofm(
+ create_shl(f"{self.op.name}_shl{pass_number}", shifted_sum_minus_one, one, no_scale_quant, activation,)
+ )
# PASS 10 - Add
- add10_op = Operation(Op.Add, f"{self.op.name}_add{pass_number}")
- add10_op.add_input_tensor(
- create_const_tensor(
- "F0_one_const", [1, 1, 1, 1], DataType.int32, [(1 << 31) - 1], np.int32, quantization=no_scale_quant
- ),
- )
- add10_op.add_input_tensor(shifted_sum_minus_one)
- add10_op.activation = activation.clone()
- add10_op.attrs["rescale"] = (1, 1)
- half_denominator = Tensor(sum_of_exp.shape, DataType.int32, f"{add10_op.name}_0")
- half_denominator.quantization = one_scale_quant
- add10_op.set_output_tensor(half_denominator)
- DebugDatabase.add_optimised(self.op, add10_op)
- pass_number += 1
+ f0_one_const = create_const_tensor(
+ "F0_one_const", [1, 1, 1, 1], DataType.int32, [(1 << 31) - 1], np.int32, quantization=no_scale_quant
+ )
+ half_denominator = add_op_get_ofm(
+ create_add(
+ f"{self.op.name}_add{pass_number}",
+ f0_one_const,
+ shifted_sum_minus_one,
+ one_scale_quant,
+ activation,
+ attrs={"rescale": (1, 1)},
+ )
+ )
# PASS 11 - Multiply
- mul11_op = Operation(Op.Mul, f"{self.op.name}_mul{pass_number}")
- mul11_op.add_input_tensor(half_denominator)
- mul11_op.add_input_tensor(
- create_const_tensor(
- "neg_32_over_17_const",
- [1, 1, 1, 1],
- DataType.int32,
- [-1010580540],
- np.int32,
- quantization=one_scale_quant,
- ),
+ neg_32_over_17 = create_const_tensor(
+ "neg_32_over_17_const", [1, 1, 1, 1], DataType.int32, [-1010580540], np.int32, quantization=one_scale_quant
+ )
+ rescaled = add_op_get_ofm(
+ create_mul(
+ f"{self.op.name}_mul{pass_number}", half_denominator, neg_32_over_17, two_scale_quant, activation2,
+ )
)
- rescaled = Tensor(sum_of_exp.shape, DataType.int32, f"{mul11_op.name}_0")
- rescaled.quantization = one_scale_quant.clone()
- rescaled.quantization.scale_f32 = 2.0
- mul11_op.activation = activation2.clone()
- mul11_op.set_output_tensor(rescaled)
- DebugDatabase.add_optimised(self.op, mul11_op)
- pass_number += 1
# PASS 12 - Add
- add12_op = Operation(Op.Add, f"{self.op.name}_add{pass_number}")
- add12_op.add_input_tensor(rescaled)
- add12_op.add_input_tensor(
- create_const_tensor(
- "48_over_17_const", [1, 1, 1, 1], DataType.int32, [1515870810], np.int32, quantization=no_scale_quant
- ),
+ const_48_over_17 = create_const_tensor(
+ "48_over_17_const", [1, 1, 1, 1], DataType.int32, [1515870810], np.int32, quantization=no_scale_quant
+ )
+ rescale_w_offset = add_op_get_ofm(
+ create_add(f"{self.op.name}_add{pass_number}", rescaled, const_48_over_17, one_scale_quant, activation,)
)
- add12_op.activation = activation.clone()
- rescale_w_offset = Tensor(sum_of_exp.shape, DataType.int32, f"{add12_op.name}_0")
- rescale_w_offset.quantization = one_scale_quant
- add12_op.set_output_tensor(rescale_w_offset)
- DebugDatabase.add_optimised(self.op, add12_op)
- pass_number += 1
+ # PASS 13 - 27
nr_x = rescale_w_offset
F2_one = create_const_tensor(
"F2_one_const", [1, 1, 1, 1], DataType.int32, [(1 << 29)], np.int32, quantization=no_scale_quant
@@ -456,80 +372,44 @@ class SoftMax:
)
for _ in range(3):
# PASS 13, 18, 23 - MUL
- mul_op = Operation(Op.Mul, f"{self.op.name}_mul{pass_number}")
- mul_op.add_input_tensor(nr_x)
- mul_op.add_input_tensor(half_denominator)
- mul_op.activation = activation2.clone()
- half_denominator_times_x = Tensor(sum_of_exp.shape, DataType.int32, f"{mul_op.name}_0")
- half_denominator_times_x.quantization = one_scale_quant.clone()
- half_denominator_times_x.quantization.scale_f32 = 2.0
- mul_op.set_output_tensor(half_denominator_times_x)
- pass_number += 1
+ half_denominator_times_x = add_op_get_ofm(
+ create_mul(f"{self.op.name}_mul{pass_number}", nr_x, half_denominator, two_scale_quant, activation2,)
+ )
# PASS 14, 19, 24 - SUB
- sub_op = Operation(Op.Sub, f"{self.op.name}_sub{pass_number}")
- sub_op.add_input_tensor(F2_one)
- sub_op.add_input_tensor(half_denominator_times_x)
- sub_op.activation = activation.clone()
- one_minus_half_denominator_times_x = Tensor(sum_of_exp.shape, DataType.int32, f"{sub_op.name}_0")
- one_minus_half_denominator_times_x.quantization = one_scale_quant
- sub_op.set_output_tensor(one_minus_half_denominator_times_x)
- DebugDatabase.add_optimised(self.op, sub_op)
- pass_number += 1
+ one_minus_half_denominator_times_x = add_op_get_ofm(
+ create_sub(
+ f"{self.op.name}_sub{pass_number}", F2_one, half_denominator_times_x, one_scale_quant, activation,
+ )
+ )
# PASS 15, 20, 25 - MUL
- mul_op = Operation(Op.Mul, f"{self.op.name}_mul{pass_number}")
- mul_op.add_input_tensor(nr_x)
- mul_op.add_input_tensor(one_minus_half_denominator_times_x)
- mul_op.activation = activation2.clone()
- to_rescale = Tensor(sum_of_exp.shape, DataType.int32, f"{mul_op.name}_0")
- to_rescale.quantization = one_scale_quant.clone()
- to_rescale.quantization.scale_f32 = 2.0
- mul_op.set_output_tensor(to_rescale)
- pass_number += 1
+ to_rescale = add_op_get_ofm(
+ create_mul(
+ f"{self.op.name}_mul{pass_number}",
+ nr_x,
+ one_minus_half_denominator_times_x,
+ two_scale_quant,
+ activation2,
+ )
+ )
# PASS 16, 21, 26 - MUL
- shl_op = Operation(Op.Mul, f"{self.op.name}_mul{pass_number}")
- shl_op.add_input_tensor(to_rescale)
- shl_op.add_input_tensor(four)
- shl_op.activation = activation.clone()
- to_add = Tensor(sum_of_exp.shape, DataType.int32, f"{shl_op.name}_0")
- to_add.quantization = no_scale_quant
- shl_op.set_output_tensor(to_add)
- DebugDatabase.add_optimised(self.op, shl_op)
- pass_number += 1
+ to_add = add_op_get_ofm(
+ create_mul(f"{self.op.name}_mul{pass_number}", to_rescale, four, no_scale_quant, activation)
+ )
# PASS 17, 22, 27 - ADD
- add_op = Operation(Op.Add, f"{self.op.name}_add{pass_number}")
- add_op.add_input_tensor(nr_x)
- add_op.add_input_tensor(to_add)
- add_op.activation = activation.clone()
- nr_x = Tensor(sum_of_exp.shape, DataType.int32, f"{add_op.name}_0")
- nr_x.quantization = one_scale_quant
- add_op.set_output_tensor(nr_x)
- DebugDatabase.add_optimised(self.op, add_op)
- pass_number += 1
+ nr_x = add_op_get_ofm(
+ create_add(f"{self.op.name}_add{pass_number}", nr_x, to_add, one_scale_quant, activation)
+ )
# PASS 28 - Multiply
- mul28_op = Operation(Op.Mul, f"{self.op.name}_mul{pass_number}")
- mul28_op.add_input_tensor(nr_x)
- mul28_op.add_input_tensor(
- create_const_tensor("two_const", [1, 1, 1, 1], DataType.int32, [2], np.int32, quantization=no_scale_quant)
- )
- mul28_op.activation = activation.clone()
- scale_factor = Tensor(sum_of_exp.shape, DataType.int32, f"{mul28_op.name}_0")
- scale_factor.quantization = one_scale_quant
- mul28_op.set_output_tensor(scale_factor)
- DebugDatabase.add_optimised(self.op, mul28_op)
- pass_number += 1
+ two = create_const_tensor("two_const", [1, 1, 1, 1], DataType.int32, [2], np.int32, quantization=no_scale_quant)
+ scale_factor = add_op_get_ofm(
+ create_mul(f"{self.op.name}_mul{pass_number}", nr_x, two, one_scale_quant, activation)
+ )
# PASS 29 - Multiply
- mul_op = Operation(Op.Mul, f"{self.op.name}_mul{pass_number}")
- mul_op.add_input_tensor(ifm_exp)
- mul_op.add_input_tensor(scale_factor)
- mul_op.activation = activation2.clone()
- scaled_exp = Tensor(ifm_exp.shape, DataType.int32, f"{mul_op.name}_0")
- scaled_exp.quantization = one_scale_quant.clone()
- scaled_exp.quantization.scale_f32 = 2.0
- mul_op.set_output_tensor(scaled_exp)
- DebugDatabase.add_optimised(self.op, mul_op)
- pass_number += 1
+ scaled_exp = add_op_get_ofm(
+ create_mul(f"{self.op.name}_mul{pass_number}", ifm_exp, scale_factor, two_scale_quant, activation2)
+ )
# PASS 30 - SHR
shr30_op = Operation(Op.SHR, f"{self.op.name}_shr{pass_number}")
@@ -538,7 +418,6 @@ class SoftMax:
shr30_op.add_input_tensor(right_shift)
shr30_op.set_output_tensor(ofm)
DebugDatabase.add_optimised(self.op, shr30_op)
- pass_number += 1
return shr30_op
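
For reference, "48_over_17_const" and "neg_32_over_17_const" above (Q2.29 values 1515870810 and -1010580540, with F2_one = 1 << 29 representing 1.0) are the classic Newton-Raphson seed for a reciprocal, and the PASS 13-27 loop runs three refinement rounds on the half-denominator d in [0.5, 1). A float model of what the fixed-point loop computes, ignoring the Q-format rescales; not vela code:

    # x approximates 1/d for d in [0.5, 1)
    d = 0.73                      # any half-denominator in [0.5, 1)
    x = 48 / 17 - (32 / 17) * d   # seed built by PASS 11 (mul) and PASS 12 (add)
    for _ in range(3):            # one round per PASS 13-17, 18-22, 23-27 group
        x = x * (2 - d * x)       # Newton-Raphson step for the reciprocal
    assert abs(x - 1 / d) < 1e-9  # roughly 30 bits after three rounds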
@@ -547,176 +426,97 @@ class SoftMax:
no_scale_quant.scale_f32 = None
pass_number = 0
+ def add_op_get_ofm(op):
+ DebugDatabase.add_optimised(self.op, op)
+ nonlocal pass_number
+ pass_number += 1
+ return op.ofm
+
# PASS 0 - Depthwise Maxpool
- maxpool_op = self.op.clone(f"_maxpool{pass_number}")
- maxpool_op.type = Op.MaxPool
- DebugDatabase.add_optimised(self.op, maxpool_op)
- maxpool_h = ifm.shape[1] * ifm.shape[2]
- maxpool_w = ifm.shape[3]
- maxpool_ifm_shape = [1, maxpool_h, maxpool_w, 1]
- maxpool_op.attrs["padding"] = b"VALID"
- maxpool_op.attrs["stride_w"] = 1
- maxpool_op.attrs["stride_h"] = 1
- maxpool_op.attrs["filter_width"] = maxpool_w
- maxpool_op.attrs["filter_height"] = 1
- maxpool_op.attrs["strides"] = [1, maxpool_op.attrs["stride_h"], maxpool_op.attrs["stride_w"], 1]
- maxpool_op.attrs["ksize"] = [1, maxpool_op.attrs["filter_height"], maxpool_op.attrs["filter_width"], 1]
- maxpool_op.inputs = [create_reshape_tensor(ifm, maxpool_ifm_shape)]
- ifm_max = Tensor([1, maxpool_h, 1, 1], ifm.dtype, f"{maxpool_op.name}_0")
- ifm_max.quantization = no_scale_quant
- maxpool_op.set_output_tensor(ifm_max)
- DebugDatabase.add_optimised(self.op, maxpool_op)
- pass_number += 1
+ ifm_max = add_op_get_ofm(create_depthwise_maxpool(f"{self.op.name}_maxpool{pass_number}", ifm, no_scale_quant))
# PASS 1 - Sub
- sub1_op = Operation(Op.Sub, f"{self.op.name}_sub{pass_number}")
- sub1_op.add_input_tensor(ifm)
- sub1_op.add_input_tensor(create_reshape_tensor(ifm_max, [1, ifm.shape[1], ifm.shape[2], 1]))
- sub1_ofm = Tensor(ifm.shape, DataType.int32, f"{sub1_op.name}_0")
- sub1_ofm.quantization = ifm.quantization.clone()
- sub1_op.set_output_tensor(sub1_ofm)
- DebugDatabase.add_optimised(self.op, sub1_op)
- pass_number += 1
+ ifm_max = create_reshape_tensor(ifm_max, [1, ifm.shape[1], ifm.shape[2], 1])
+ sub1_ofm = add_op_get_ofm(
+ create_sub(f"{self.op.name}_sub{pass_number}", ifm, ifm_max, ifm.quantization.clone(), dtype=DataType.int32)
+ )
# PASS 2 - Mul
+ name = f"{self.op.name}_mul{pass_number}"
beta = self.op.attrs.get("beta", 1.0)
mul2_out_range = 10.0 / 65535.0
mul2_scale, _ = scaling.elementwise_mul_scale(sub1_ofm.quantization.scale_f32, beta, mul2_out_range)
- mul2_quant = ifm.quantization.clone()
- mul2_quant.scale_f32 = beta
- mul2_op = Operation(Op.Mul, f"{self.op.name}_mul{pass_number}")
- mul2_op.add_input_tensor(sub1_ofm)
- mul2_op.add_input_tensor(
- create_const_tensor(
- f"{mul2_op.name}_const", [1, 1, 1, 1], DataType.int32, [mul2_scale], np.int32, quantization=mul2_quant
- ),
+ scale_quant = ifm.quantization.clone()
+ scale_quant.scale_f32 = beta
+ mul2_quant = ofm.quantization.clone()
+ mul2_quant.scale_f32 = mul2_out_range
+ scale = create_const_tensor(
+ f"{name}_scale_const", [1, 1, 1, 1], DataType.int32, [mul2_scale], np.int32, quantization=scale_quant
)
- mul2_ofm = Tensor(ifm.shape, DataType.int32, f"{self.op.name}_mul{pass_number}")
- mul2_ofm.quantization = ofm.quantization.clone()
- mul2_ofm.quantization.scale_f32 = mul2_out_range
- mul2_op.set_output_tensor(mul2_ofm)
- DebugDatabase.add_optimised(self.op, mul2_op)
- pass_number += 1
+ mul2_ofm = add_op_get_ofm(create_mul(name, sub1_ofm, scale, mul2_quant))
# PASS 3 - Add+LUT(exp)
- add_op = Operation(Op.Add, f"{self.op.name}_add{pass_number}")
- add_op.add_input_tensor(mul2_ofm)
- add_op.add_input_tensor(
- create_const_tensor(
- f"{add_op.name}_const", [1, 1, 1, 1], DataType.int32, [32767], np.int32, quantization=no_scale_quant
- ),
+ name = f"{self.op.name}_add{pass_number}"
+ const_add = create_const_tensor(
+ f"{name}_const", [1, 1, 1, 1], DataType.int32, [32767], np.int32, quantization=no_scale_quant
)
+ add_op = create_add(name, mul2_ofm, const_add, mul2_ofm.quantization.clone(), dtype=DataType.int16)
add_op.set_activation_lut(
create_const_tensor(
- f"{add_op.name}_lut", [1, 1, 1, 512], DataType.int32, self.EXP_LUT, np.int32, TensorPurpose.LUT
+ f"{name}_exp_lut", [1, 1, 1, 512], DataType.int32, self.EXP_LUT, np.int32, TensorPurpose.LUT
)
)
- exp_ofm = Tensor(mul2_ofm.shape, DataType.int16, f"{add_op.name}_0")
- exp_ofm.quantization = mul2_ofm.quantization.clone()
- add_op.set_output_tensor(exp_ofm)
- DebugDatabase.add_optimised(self.op, add_op)
- pass_number += 1
+ ifm_exp = add_op_get_ofm(add_op)
# PASS 4 - Reduce sum
- reduce_sum_op = Operation(Op.ReduceSum, self.op.name + "_reduce_sum4")
- reduce_sum_op.attrs["padding"] = b"VALID"
- reduce_sum_op.attrs["stride_w"] = 1
- reduce_sum_op.attrs["stride_h"] = 1
- reduce_sum_op.attrs["filter_width"] = 1
- reduce_sum_op.attrs["filter_height"] = 1
- reduce_sum_op.attrs["strides"] = [1, reduce_sum_op.attrs["stride_h"], reduce_sum_op.attrs["stride_w"], 1]
- reduce_sum_op.attrs["ksize"] = [1, reduce_sum_op.attrs["filter_height"], reduce_sum_op.attrs["filter_width"], 1]
- reduce_sum_op.add_input_tensor(exp_ofm)
-
- reduce_sum_shape = [1, exp_ofm.shape[1], exp_ofm.shape[2], 1]
- sum_of_exp = Tensor(reduce_sum_shape, DataType.int32, f"{reduce_sum_op.name}_0")
- sum_of_exp.quantization = no_scale_quant
- reduce_sum_op.set_output_tensor(sum_of_exp)
- DebugDatabase.add_optimised(self.op, reduce_sum_op)
- pass_number += 1
+ sum_of_exp = add_op_get_ofm(
+ create_reduce_sum(f"{self.op.name}_reduce_sum{pass_number}", ifm_exp, no_scale_quant)
+ )
# PASS 5 - CLZ
- clz_op = Operation(Op.CLZ, f"{self.op.name}_clz{pass_number}")
- clz_op.add_input_tensor(sum_of_exp)
- headroom_plus_one = Tensor(sum_of_exp.shape, DataType.int32, f"{clz_op.name}_0")
- headroom_plus_one.quantization = no_scale_quant
- clz_op.set_output_tensor(headroom_plus_one)
- DebugDatabase.add_optimised(self.op, clz_op)
- pass_number += 1
+ headroom_plus_one = add_op_get_ofm(create_clz(f"{self.op.name}_clz{pass_number}", sum_of_exp, no_scale_quant))
# PASS 6 - Sub
- sub6_op = Operation(Op.Sub, f"{self.op.name}_sub{pass_number}")
- sub6_op.add_input_tensor(
- create_const_tensor(
- f"{sub6_op.name}_const", [1, 1, 1, 1], DataType.int32, [31], np.int32, quantization=no_scale_quant
- ),
+ name = f"{self.op.name}_sub{pass_number}"
+ const_31 = create_const_tensor(
+ f"{name}_const", [1, 1, 1, 1], DataType.int32, [31], np.int32, quantization=no_scale_quant
)
- sub6_op.add_input_tensor(headroom_plus_one)
- reciprocal_right_shift = Tensor(sum_of_exp.shape, DataType.int32, f"{sub6_op.name}_0")
- reciprocal_right_shift.quantization = no_scale_quant
- sub6_op.set_output_tensor(reciprocal_right_shift)
- DebugDatabase.add_optimised(self.op, sub6_op)
- pass_number += 1
+ reciprocal_right_shift = add_op_get_ofm(create_sub(name, const_31, headroom_plus_one, no_scale_quant))
# PASS 7 - SHL
- shl7_op = Operation(Op.SHL, f"{self.op.name}_shl{pass_number}")
- shl7_op.add_input_tensor(
- create_const_tensor(
- f"{shl7_op.name}_const", [1, 1, 1, 1], DataType.int32, [1], np.int32, quantization=no_scale_quant
- ),
+ one = create_const_tensor(
+ f"one_const", [1, 1, 1, 1], DataType.int32, [1], np.int32, quantization=no_scale_quant
+ )
+ constant_one = add_op_get_ofm(
+ create_shl(f"{self.op.name}_shl{pass_number}", one, reciprocal_right_shift, no_scale_quant)
)
- shl7_op.add_input_tensor(reciprocal_right_shift)
- constant_one = Tensor(sum_of_exp.shape, DataType.int32, f"{shl7_op.name}_0")
- constant_one.quantization = no_scale_quant
- shl7_op.set_output_tensor(constant_one)
- DebugDatabase.add_optimised(self.op, shl7_op)
- pass_number += 1
# PASS 8 - Sub
- sub8_op = Operation(Op.Sub, f"{self.op.name}_sub{pass_number}")
- sub8_op.add_input_tensor(sum_of_exp)
- sub8_op.add_input_tensor(constant_one)
- sum_of_exps_minus_one = Tensor(sum_of_exp.shape, DataType.int32, f"{sub8_op.name}_0")
- sum_of_exps_minus_one.quantization = no_scale_quant
- sub8_op.set_output_tensor(sum_of_exps_minus_one)
- DebugDatabase.add_optimised(self.op, sub8_op)
- pass_number += 1
+ sum_of_exps_minus_one = add_op_get_ofm(
+ create_sub(f"{self.op.name}_sub{pass_number}", sum_of_exp, constant_one, no_scale_quant)
+ )
# PASS 9 - SHL
- shl9_op = Operation(Op.SHL, f"{self.op.name}_shl{pass_number}")
- shl9_op.add_input_tensor(sum_of_exps_minus_one)
- shl9_op.add_input_tensor(headroom_plus_one)
- shifted_sum_minus_one = Tensor(sum_of_exp.shape, DataType.int32, f"{shl9_op.name}_0")
- shifted_sum_minus_one.quantization = no_scale_quant
- shl9_op.set_output_tensor(shifted_sum_minus_one)
- DebugDatabase.add_optimised(self.op, shl9_op)
- pass_number += 1
+ shifted_sum_minus_one = add_op_get_ofm(
+ create_shl(f"{self.op.name}_shl{pass_number}", sum_of_exps_minus_one, headroom_plus_one, no_scale_quant)
+ )
# PASS 10 - SHR
- shr10_op = Operation(Op.SHR, f"{self.op.name}_shr{pass_number}")
- shr10_op.add_input_tensor(shifted_sum_minus_one)
- shr10_op.add_input_tensor(
- create_const_tensor(
- f"{shr10_op.name}_const", [1, 1, 1, 1], DataType.int32, [15], np.int32, quantization=no_scale_quant
- ),
+ name = f"{self.op.name}_shr{pass_number}"
+ shift = create_const_tensor(
+ f"{name}_const", [1, 1, 1, 1], DataType.int32, [15], np.int32, quantization=no_scale_quant
)
- shifted_sum_minus_one_16 = Tensor(sum_of_exp.shape, DataType.int32, f"{shr10_op.name}_0")
- shifted_sum_minus_one_16.quantization = shifted_sum_minus_one.quantization.clone()
- shr10_op.set_output_tensor(shifted_sum_minus_one_16)
- DebugDatabase.add_optimised(self.op, shr10_op)
- pass_number += 1
+ shifted_sum_minus_one_16 = add_op_get_ofm(create_shr(name, shifted_sum_minus_one, shift, no_scale_quant))
# PASS 11 - Sub+LUT(one over one plus x)
- sub11_op = Operation(Op.Sub, f"{self.op.name}_sub{pass_number}")
- sub11_op.add_input_tensor(shifted_sum_minus_one_16)
- sub11_op.add_input_tensor(
- create_const_tensor(
- f"{sub11_op.name}_const", [1, 1, 1, 1], DataType.int32, [32768], np.int32, quantization=no_scale_quant
- ),
+ name = f"{self.op.name}_sub{pass_number}"
+ sub11_const = create_const_tensor(
+ f"{name}_const", [1, 1, 1, 1], DataType.int32, [32768], np.int32, quantization=no_scale_quant
)
+ sub11_op = create_sub(name, shifted_sum_minus_one_16, sub11_const, no_scale_quant, dtype=DataType.int16)
sub11_op.set_activation_lut(
create_const_tensor(
- f"{sub11_op.name}_lut",
+ f"{name}_one_over_one_plus_x_lut",
[1, 1, 1, 512],
DataType.int32,
self.ONE_OVER_ONE_PLUS_X_LUT,
@@ -724,21 +524,14 @@ class SoftMax:
TensorPurpose.LUT,
)
)
- reciprocal_scale = Tensor(sum_of_exp.shape, DataType.int16, f"{sub11_op.name}_0")
- reciprocal_scale.quantization = no_scale_quant
- sub11_op.set_output_tensor(reciprocal_scale)
- DebugDatabase.add_optimised(self.op, sub11_op)
- pass_number += 1
+ reciprocal_scale = add_op_get_ofm(sub11_op)
# PASS 12 - Multiply
- mul_op = Operation(Op.Mul, f"{self.op.name}_mul{pass_number}")
- mul_op.add_input_tensor(exp_ofm)
- mul_op.add_input_tensor(reciprocal_scale)
- mul_ofm = Tensor(exp_ofm.shape, DataType.int32, f"{mul_op.name}_0")
- mul_ofm.quantization = no_scale_quant
- mul_op.set_output_tensor(mul_ofm)
- DebugDatabase.add_optimised(self.op, mul_op)
- pass_number += 1
+ mul_ofm = add_op_get_ofm(
+ create_mul(
+ f"{self.op.name}_mul{pass_number}", ifm_exp, reciprocal_scale, no_scale_quant, dtype=DataType.int32
+ )
+ )
# PASS 13 - SHR
shr13_op = Operation(Op.SHR, f"{self.op.name}_shr{pass_number}")
@@ -746,6 +539,5 @@ class SoftMax:
shr13_op.add_input_tensor(reciprocal_right_shift)
shr13_op.set_output_tensor(ofm)
DebugDatabase.add_optimised(self.op, shr13_op)
- pass_number += 1
return shr13_op
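
Both graph builders now share the small add_op_get_ofm closure: it records each generated op in DebugDatabase, bumps pass_number (which the next op's name embeds via the f-strings), and returns the op's OFM so each pass reads as a single expression. A standalone toy of that pattern; FakeOp and the optimised list are stand-ins, not vela classes:

    class FakeOp:
        # Stand-in for a vela Operation: just a name and an output handle.
        def __init__(self, name):
            self.name = name
            self.ofm = f"{name}_ofm"

    def build_toy_graph():
        optimised = []   # stands in for DebugDatabase.add_optimised records
        pass_number = 0

        def add_op_get_ofm(op):
            nonlocal pass_number
            optimised.append(op)  # keep debug traceability per generated op
            pass_number += 1      # the next op's name picks up the next number
            return op.ofm         # lets passes chain in one expression

        ifm_max = add_op_get_ofm(FakeOp(f"toy_maxpool{pass_number}"))  # pass 0
        ifm_exp = add_op_get_ofm(FakeOp(f"toy_sub{pass_number}"))      # pass 1
        assert (ifm_max, ifm_exp) == ("toy_maxpool0_ofm", "toy_sub1_ofm")
        assert pass_number == 2
        return optimised

    build_toy_graph()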