From d9c2c4258c50f04ef3a6c3849508d317249e8ebf Mon Sep 17 00:00:00 2001
From: Fredrik Svedberg
Date: Tue, 1 Dec 2020 16:33:45 +0100
Subject: [MLBEDSW-3690] Refactor Softmax

Move operator generation code to common functions.

Signed-off-by: Fredrik Svedberg
Change-Id: I02e185fd793a96ae435fa7d235c9d1e97f388a03
---
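The new operation_util helpers bundle what each softmax pass previously
did by hand: create the Operation, wire up the input tensors, set the
activation and attach a freshly created, quantized OFM. The local
add_op_get_ofm(op) helper in softmax.py then registers the op with the
DebugDatabase, bumps the pass counter and returns op.ofm, so most passes
reduce to a single expression. A minimal usage sketch, assuming the
ethosu.vela package is importable standalone; the tensor names and shapes
below are made up for illustration:

    from ethosu.vela.data_type import DataType
    from ethosu.vela.operation_util import create_add
    from ethosu.vela.tensor import QuantizationParameters, Tensor

    # Unit quantization, as used for the intermediate softmax tensors.
    quant = QuantizationParameters()
    quant.scale_f32 = 1.0
    quant.zero_point = 0

    ifm = Tensor([1, 8, 8, 16], DataType.int32, "example_ifm")
    ifm.quantization = quant.clone()
    ifm2 = Tensor([1, 8, 8, 16], DataType.int32, "example_ifm2")
    ifm2.quantization = quant.clone()

    # One call wires both inputs and creates the OFM; dtype defaults to
    # ifm.dtype, and extra attributes can be passed as a dict.
    add_op = create_add("example_add", ifm, ifm2, quant.clone())
    assert add_op.ofm.shape == [1, 8, 8, 16]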
 ethosu/vela/graph_optimiser.py |   2 +-
 ethosu/vela/operation.py       |  14 --
 ethosu/vela/operation_util.py  | 192 ++++++++++++++
 ethosu/vela/pass_packing.py    |   2 +-
 ethosu/vela/softmax.py         | 554 +++++++++++++----------------------------
 5 files changed, 367 insertions(+), 397 deletions(-)
 create mode 100644 ethosu/vela/operation_util.py

diff --git a/ethosu/vela/graph_optimiser.py b/ethosu/vela/graph_optimiser.py
index 85c2b602..4a857507 100644
--- a/ethosu/vela/graph_optimiser.py
+++ b/ethosu/vela/graph_optimiser.py
@@ -32,10 +32,10 @@ from .numeric_util import clamp_sigmoid
 from .numeric_util import full_shape
 from .numeric_util import round_away_zero
 from .operation import create_activation_function
-from .operation import create_avgpool_nop
 from .operation import NpuBlockType
 from .operation import Op
 from .operation import Operation
+from .operation_util import create_avgpool_nop
 from .softmax import SoftMax
 from .tensor import check_quantized_tens_scaling_equal
 from .tensor import create_const_tensor
diff --git a/ethosu/vela/operation.py b/ethosu/vela/operation.py
index 5cb4b6ae..45fae217 100644
--- a/ethosu/vela/operation.py
+++ b/ethosu/vela/operation.py
@@ -355,20 +355,6 @@ def create_activation_function(op_type: Op) -> ActivationFunction:
     return act
 
 
-def create_avgpool_nop(name):
-    op = Operation(Op.AvgPool, name)
-    op.attrs["padding"] = b"VALID"
-    op.attrs["stride_w"] = 1
-    op.attrs["stride_h"] = 1
-    op.attrs["filter_width"] = 1
-    op.attrs["filter_height"] = 1
-    op.attrs["strides"] = [1, 1, 1, 1]
-    op.attrs["ksize"] = [1, 1, 1, 1]
-    op.attrs["skirt"] = [0, 0, 0, 0]
-    op.attrs["explicit_padding"] = [0, 0, 0, 0]
-    return op
-
-
 def get_slice_offsets(input_shape, offset_tens, offset_mask, is_begin=True):
     # For strided slice operator: get start or end offsets
     offsets = len(input_shape) * [0] if is_begin else input_shape[:]
diff --git a/ethosu/vela/operation_util.py b/ethosu/vela/operation_util.py
new file mode 100644
index 00000000..2fc7622c
--- /dev/null
+++ b/ethosu/vela/operation_util.py
@@ -0,0 +1,192 @@
+# Copyright (C) 2020 Arm Limited or its affiliates. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the License); you may
+# not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an AS IS BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# Description:
+# Utility functions for creating Network Operations.
+from typing import Optional
+
+from .data_type import DataType
+from .high_level_command_to_npu_op import ifm_ifm2_correct_order
+from .operation import ActivationFunction
+from .operation import Op
+from .operation import Operation
+from .tensor import create_reshape_tensor
+from .tensor import QuantizationParameters
+from .tensor import Tensor
+
+
+def create_avgpool_nop(name: str) -> Operation:
+    op = Operation(Op.AvgPool, name)
+    op.attrs["padding"] = b"VALID"
+    op.attrs["stride_w"] = 1
+    op.attrs["stride_h"] = 1
+    op.attrs["filter_width"] = 1
+    op.attrs["filter_height"] = 1
+    op.attrs["strides"] = [1, 1, 1, 1]
+    op.attrs["ksize"] = [1, 1, 1, 1]
+    op.attrs["skirt"] = [0, 0, 0, 0]
+    op.attrs["explicit_padding"] = [0, 0, 0, 0]
+    return op
+
+
+def create_depthwise_maxpool(
+    name: str, ifm: Tensor, quantization: QuantizationParameters, activation: Optional[ActivationFunction] = None
+) -> Operation:
+    op = Operation(Op.MaxPool, name)
+    height = ifm.shape[1] * ifm.shape[2]
+    width = ifm.shape[3]
+    ifm_shape = [1, height, width, 1]
+    op.attrs["padding"] = b"VALID"
+    op.attrs["stride_w"] = 1
+    op.attrs["stride_h"] = 1
+    op.attrs["filter_width"] = width
+    op.attrs["filter_height"] = 1
+    op.attrs["strides"] = [1, op.attrs["stride_h"], op.attrs["stride_w"], 1]
+    op.attrs["ksize"] = [1, op.attrs["filter_height"], op.attrs["filter_width"], 1]
+    op.activation = activation
+    op.inputs = [create_reshape_tensor(ifm, ifm_shape)]
+    ofm = Tensor([1, height, 1, 1], ifm.dtype, op.name + "_tens0")
+    ofm.quantization = quantization
+    op.set_output_tensor(ofm)
+    return op
+
+
+def create_reduce_sum(
+    name: str, ifm: Tensor, quantization: QuantizationParameters, activation: Optional[ActivationFunction] = None
+) -> Operation:
+    op = Operation(Op.ReduceSum, name)
+    op.attrs["padding"] = b"VALID"
+    op.attrs["stride_w"] = 1
+    op.attrs["stride_h"] = 1
+    op.attrs["filter_width"] = 1
+    op.attrs["filter_height"] = 1
+    op.attrs["strides"] = [1, op.attrs["stride_h"], op.attrs["stride_w"], 1]
+    op.attrs["ksize"] = [1, op.attrs["filter_height"], op.attrs["filter_width"], 1]
+    op.add_input_tensor(ifm)
+    op.activation = activation
+    ofm_shape = [1, ifm.shape[1], ifm.shape[2], 1]
+    sum_of_exp = Tensor(ofm_shape, DataType.int32, op.name + "_tens0")
+    sum_of_exp.quantization = quantization
+    op.set_output_tensor(sum_of_exp)
+    return op
+
+
+def create_add(
+    name: str,
+    ifm: Tensor,
+    ifm2: Tensor,
+    quantization: QuantizationParameters,
+    activation: Optional[ActivationFunction] = None,
+    dtype: Optional[DataType] = None,
+    attrs: Optional[dict] = None,
+) -> Operation:
+    return create_binary_elementwise(Op.Add, name, ifm, ifm2, quantization, activation, dtype, attrs)
+
+
+def create_clz(
+    name: str,
+    ifm: Tensor,
+    quantization: QuantizationParameters,
+    activation: Optional[ActivationFunction] = None,
+    dtype: Optional[DataType] = None,
+    attrs: Optional[dict] = None,
+) -> Operation:
+    return create_unary_elementwise(Op.CLZ, name, ifm, quantization, activation, dtype, attrs)
+
+
+def create_mul(
+    name: str,
+    ifm: Tensor,
+    ifm2: Tensor,
+    quantization: QuantizationParameters,
+    activation: Optional[ActivationFunction] = None,
+    dtype: Optional[DataType] = None,
+    attrs: Optional[dict] = None,
+) -> Operation:
+    return create_binary_elementwise(Op.Mul, name, ifm, ifm2, quantization, activation, dtype, attrs)
+
+
+def create_shl(
+    name: str,
+    ifm: Tensor,
+    ifm2: Tensor,
+    quantization: QuantizationParameters,
+    activation: Optional[ActivationFunction] = None,
+    dtype: Optional[DataType] = None,
+    attrs: Optional[dict] = None,
+) -> Operation:
+    return create_binary_elementwise(Op.SHL, name, ifm, ifm2, quantization, activation, dtype, attrs)
+
+
+def create_shr(
+    name: str,
+    ifm: Tensor,
+    ifm2: Tensor,
+    quantization: QuantizationParameters,
+    activation: Optional[ActivationFunction] = None,
+    dtype: Optional[DataType] = None,
+    attrs: Optional[dict] = None,
+) -> Operation:
+    return create_binary_elementwise(Op.SHR, name, ifm, ifm2, quantization, activation, dtype, attrs)
+
+
+def create_sub(
+    name: str,
+    ifm: Tensor,
+    ifm2: Tensor,
+    quantization: QuantizationParameters,
+    activation: Optional[ActivationFunction] = None,
+    dtype: Optional[DataType] = None,
+    attrs: Optional[dict] = None,
+) -> Operation:
+    return create_binary_elementwise(Op.Sub, name, ifm, ifm2, quantization, activation, dtype, attrs)
+
+
+def create_unary_elementwise(
+    op_type: Op,
+    name: str,
+    ifm: Tensor,
+    quantization: QuantizationParameters,
+    activation: Optional[ActivationFunction] = None,
+    dtype: Optional[DataType] = None,
+    attrs: Optional[dict] = None,
+) -> Operation:
+    return create_binary_elementwise(op_type, name, ifm, None, quantization, activation, dtype, attrs)
+
+
+def create_binary_elementwise(
+    op_type: Op,
+    name: str,
+    ifm: Tensor,
+    ifm2: Optional[Tensor],
+    quantization: QuantizationParameters,
+    activation: Optional[ActivationFunction] = None,
+    dtype: Optional[DataType] = None,
+    attrs: Optional[dict] = None,
+) -> Operation:
+    op = Operation(op_type, name)
+    op.add_input_tensor(ifm)
+    if ifm2:
+        op.add_input_tensor(ifm2)
+    op.activation = activation
+    if not dtype:
+        dtype = ifm.dtype
+    if attrs:
+        op.attrs.update(attrs)
+    ofm_shape = ifm.shape if ifm2 is None or ifm_ifm2_correct_order(ifm.shape, ifm2.shape) else ifm2.shape
+    ofm = Tensor(ofm_shape, dtype, f"{op.name}_tens0")
+    ofm.quantization = quantization
+    op.set_output_tensor(ofm)
+    return op
diff --git a/ethosu/vela/pass_packing.py b/ethosu/vela/pass_packing.py
index 59376a85..ea2eaa4f 100644
--- a/ethosu/vela/pass_packing.py
+++ b/ethosu/vela/pass_packing.py
@@ -21,9 +21,9 @@ import enum
 from .debug_database import DebugDatabase
 from .nn_graph import Pass
 from .nn_graph import PassPlacement
-from .operation import create_avgpool_nop
 from .operation import NpuBlockType
 from .operation import Op
+from .operation_util import create_avgpool_nop
 from .tensor import TensorPurpose
diff --git a/ethosu/vela/softmax.py b/ethosu/vela/softmax.py
index 8c980ad4..1bdab740 100644
--- a/ethosu/vela/softmax.py
+++ b/ethosu/vela/softmax.py
@@ -30,9 +30,16 @@ from .debug_database import DebugDatabase
 from .operation import ActivationFunction
 from .operation import Op
 from .operation import Operation
+from .operation_util import create_add
+from .operation_util import create_clz
+from .operation_util import create_depthwise_maxpool
+from .operation_util import create_mul
+from .operation_util import create_reduce_sum
+from .operation_util import create_shl
+from .operation_util import create_shr
+from .operation_util import create_sub
 from .tensor import create_const_tensor
 from .tensor import create_reshape_tensor
-from .tensor import Tensor
 from .tensor import TensorPurpose
@@ -238,215 +245,124 @@
         one_scale_quant = ifm.quantization.clone()
         one_scale_quant.scale_f32 = 1.0
         one_scale_quant.zero_point = 0
+        two_scale_quant = one_scale_quant.clone()
+        two_scale_quant.scale_f32 = 2.0
         ifm.quantization.zero_point = 0
         pass_number = 0
 
+        def add_op_get_ofm(op):
+            DebugDatabase.add_optimised(self.op, op)
+            nonlocal pass_number
+            pass_number += 1
+            return op.ofm
+
         # PASS 0 - Depthwise Maxpool
-        maxpool_op = self.op.clone(f"_maxpool{pass_number}")
-        maxpool_op.type = Op.MaxPool
-        maxpool_h = ifm.shape[1] * ifm.shape[2]
-        maxpool_w = ifm.shape[3]
-        maxpool_ifm_shape = [1, maxpool_h, maxpool_w, 1]
-        maxpool_op.attrs["padding"] = b"VALID"
-        maxpool_op.attrs["stride_w"] = 1
-        maxpool_op.attrs["stride_h"] = 1
-        maxpool_op.attrs["filter_width"] = maxpool_w
-        maxpool_op.attrs["filter_height"] = 1
-        maxpool_op.attrs["strides"] = [1, maxpool_op.attrs["stride_h"], maxpool_op.attrs["stride_w"], 1]
-        maxpool_op.attrs["ksize"] = [1, maxpool_op.attrs["filter_height"], maxpool_op.attrs["filter_width"], 1]
-        maxpool_op.inputs = [create_reshape_tensor(ifm, maxpool_ifm_shape)]
-        ifm_max = Tensor([1, maxpool_h, 1, 1], ifm.dtype, f"{maxpool_op.name}_0")
-        ifm_max.quantization = no_scale_quant
-        maxpool_op.set_output_tensor(ifm_max)
-        DebugDatabase.add_optimised(self.op, maxpool_op)
-        pass_number += 1
+        ifm_max = add_op_get_ofm(create_depthwise_maxpool(f"{self.op.name}_maxpool{pass_number}", ifm, no_scale_quant))
 
         # PASS 1 - Sub+LUT(exp)
-        sub_op = Operation(Op.Sub, f"{self.op.name}_sub{pass_number}")
-        sub_op.add_input_tensor(ifm)
-        sub_op.add_input_tensor(create_reshape_tensor(ifm_max, [1, ifm.shape[1], ifm.shape[2], 1]))
+        sub_op_quantization = one_scale_quant.clone()
+        sub_op_quantization.zero_point = 127
+        ifm_max = create_reshape_tensor(ifm_max, [1, ifm.shape[1], ifm.shape[2], 1])
+        sub_op = create_sub(f"{self.op.name}_sub{pass_number}", ifm, ifm_max, sub_op_quantization, dtype=DataType.int32)
         sub_op.set_activation_lut(
             create_const_tensor(
-                f"{sub_op.name}_lut", [1, 1, 1, 256], DataType.int32, exp_lut, np.int32, TensorPurpose.LUT
+                f"{sub_op.name}_exp_lut", [1, 1, 1, 256], DataType.int32, exp_lut, np.int32, TensorPurpose.LUT
             )
         )
-        ifm_exp = Tensor(ifm.shape, DataType.int32, f"{sub_op.name}_0")
-        ifm_exp.quantization = one_scale_quant.clone()
-        ifm_exp.quantization.zero_point = 127
-        sub_op.activation = ActivationFunction(Op.LUT)
+        ifm_exp = add_op_get_ofm(sub_op)
         # Note: activation.min/max are non-quantized values
         sub_op.activation.min = -128 - ifm_exp.quantization.zero_point
         sub_op.activation.max = 127 - ifm_exp.quantization.zero_point
-        sub_op.set_output_tensor(ifm_exp)
-        DebugDatabase.add_optimised(self.op, sub_op)
-        pass_number += 1
 
         # PASS 2 - SHR
-        shr2_op = Operation(Op.SHR, f"{self.op.name}_shr{pass_number}")
-        shr2_op.attrs["rounding_mode"] = NpuRoundingMode.NATURAL
-        shr2_op.add_input_tensor(ifm_exp)
-        shr2_op.add_input_tensor(
-            create_const_tensor(
-                f"{shr2_op.name}_const", [1, 1, 1, 1], DataType.int32, [12], np.int32, quantization=no_scale_quant
-            ),
+        name = f"{self.op.name}_shr{pass_number}"
+        shift = create_const_tensor(
+            f"{name}_const", [1, 1, 1, 1], DataType.int32, [12], np.int32, quantization=no_scale_quant
+        )
+        rescaled_exp = add_op_get_ofm(
+            create_shr(
+                name, ifm_exp, shift, no_scale_quant, activation, attrs={"rounding_mode": NpuRoundingMode.NATURAL},
+            )
         )
-        shr2_op.activation = activation.clone()
-        rescaled_exp = Tensor(ifm.shape, ifm_exp.dtype, f"{shr2_op.name}_0")
-        rescaled_exp.quantization = no_scale_quant
-        shr2_op.set_output_tensor(rescaled_exp)
-        DebugDatabase.add_optimised(self.op, shr2_op)
-        pass_number += 1
 
         # PASS 3 - Reduce sum
-        reduce_sum_op = Operation(Op.ReduceSum, f"{self.op.name}_reduce_sum3")
-        reduce_sum_op.attrs["padding"] = b"VALID"
-        reduce_sum_op.attrs["stride_w"] = 1
-        reduce_sum_op.attrs["stride_h"] = 1
-        reduce_sum_op.attrs["filter_width"] = 1
-        reduce_sum_op.attrs["filter_height"] = 1
reduce_sum_op.attrs["strides"] = [1, reduce_sum_op.attrs["stride_h"], reduce_sum_op.attrs["stride_w"], 1] - reduce_sum_op.attrs["ksize"] = [1, reduce_sum_op.attrs["filter_height"], reduce_sum_op.attrs["filter_width"], 1] - reduce_sum_op.add_input_tensor(rescaled_exp) - reduce_sum_op.activation = activation.clone() - - reduce_sum_shape = [1, rescaled_exp.shape[1], rescaled_exp.shape[2], 1] - sum_of_exp = Tensor(reduce_sum_shape, DataType.int32, f"{reduce_sum_op.name}_0") - sum_of_exp.quantization = no_scale_quant - reduce_sum_op.set_output_tensor(sum_of_exp) - DebugDatabase.add_optimised(self.op, reduce_sum_op) - pass_number += 1 + sum_of_exp = add_op_get_ofm( + create_reduce_sum(f"{self.op.name}_reduce_sum{pass_number}", rescaled_exp, no_scale_quant, activation) + ) # PASS 4 - CLZ - clz_op = Operation(Op.CLZ, f"{self.op.name}_clz{pass_number}") - clz_op.add_input_tensor(sum_of_exp) - clz_op.activation = activation.clone() - headroom_plus_one = Tensor(sum_of_exp.shape, DataType.int32, f"{clz_op.name}_0") - headroom_plus_one.quantization = no_scale_quant - clz_op.set_output_tensor(headroom_plus_one) - DebugDatabase.add_optimised(self.op, clz_op) - pass_number += 1 + headroom_plus_one = add_op_get_ofm( + create_clz(f"{self.op.name}_clz{pass_number}", sum_of_exp, no_scale_quant, activation) + ) # PASS 5 - Sub - sub5_op = Operation(Op.Sub, f"{self.op.name}_sub{pass_number}") - sub5_op.add_input_tensor( - create_const_tensor( - "headroom_offset_const", - [1, 1, 1, 1], - DataType.int32, - [12 + 31 - 8], - np.int32, - quantization=no_scale_quant, - ), + headroom_offset = create_const_tensor( + "headroom_offset_const", [1, 1, 1, 1], DataType.int32, [12 + 31 - 8], np.int32, quantization=no_scale_quant, + ) + right_shift = add_op_get_ofm( + create_sub( + f"{self.op.name}_sub{pass_number}", headroom_offset, headroom_plus_one, no_scale_quant, activation, + ) ) - sub5_op.add_input_tensor(headroom_plus_one) - sub5_op.activation = activation.clone() - right_shift = Tensor(sum_of_exp.shape, DataType.int32, f"{sub5_op.name}_0") - right_shift.quantization = no_scale_quant - sub5_op.set_output_tensor(right_shift) - DebugDatabase.add_optimised(self.op, sub5_op) - pass_number += 1 # PASS 6 - Sub one = create_const_tensor("one_const", [1, 1, 1, 1], DataType.int32, [1], np.int32, quantization=no_scale_quant) - sub6_op = Operation(Op.Sub, f"{self.op.name}_sub{pass_number}") - sub6_op.add_input_tensor(headroom_plus_one) - sub6_op.add_input_tensor(one) - sub6_op.activation = activation.clone() - headroom = Tensor(sum_of_exp.shape, DataType.int32, f"{sub6_op.name}_0") - headroom.quantization = no_scale_quant - sub6_op.set_output_tensor(headroom) - DebugDatabase.add_optimised(self.op, sub6_op) - pass_number += 1 + headroom = add_op_get_ofm( + create_sub(f"{self.op.name}_sub{pass_number}", headroom_plus_one, one, no_scale_quant, activation) + ) # PASS 7 - SHL - shl7_op = Operation(Op.SHL, f"{self.op.name}_shl{pass_number}") - shl7_op.add_input_tensor(sum_of_exp) - shl7_op.add_input_tensor(headroom) - shl7_op.activation = activation.clone() - shifted_sum = Tensor(sum_of_exp.shape, DataType.int32, f"{shl7_op.name}_0") - shifted_sum.quantization = no_scale_quant - shl7_op.set_output_tensor(shifted_sum) - DebugDatabase.add_optimised(self.op, shl7_op) - pass_number += 1 + shifted_sum = add_op_get_ofm( + create_shl(f"{self.op.name}_shl{pass_number}", sum_of_exp, headroom, no_scale_quant, activation) + ) # PASS 8 - Sub - sub8_op = Operation(Op.Sub, f"{self.op.name}_sub{pass_number}") - sub8_op.add_input_tensor(shifted_sum) - 
-        sub8_op.add_input_tensor(
-            create_const_tensor(
-                "shifted_one_const", [1, 1, 1, 1], DataType.int32, [1 << 30], np.int32, quantization=no_scale_quant
-            ),
+        shifted_one = create_const_tensor(
+            "shifted_one_const", [1, 1, 1, 1], DataType.int32, [1 << 30], np.int32, quantization=no_scale_quant
+        )
+        shifted_sum_minus_one = add_op_get_ofm(
+            create_sub(f"{self.op.name}_sub{pass_number}", shifted_sum, shifted_one, no_scale_quant, activation)
         )
-        sub8_op.activation = activation.clone()
-        shifted_sum_minus_one = Tensor(sum_of_exp.shape, DataType.int32, f"{sub8_op.name}_0")
-        shifted_sum_minus_one.quantization = no_scale_quant
-        sub8_op.set_output_tensor(shifted_sum_minus_one)
-        DebugDatabase.add_optimised(self.op, sub8_op)
-        pass_number += 1
 
         # PASS 9 - SHL
-        shl9_op = Operation(Op.SHL, f"{self.op.name}_shl{pass_number}")
-        shl9_op.add_input_tensor(shifted_sum_minus_one)
-        shl9_op.add_input_tensor(one)
-        shl9_op.activation = activation.clone()
-        shifted_sum_minus_one = Tensor(sum_of_exp.shape, DataType.int32, f"{shl9_op.name}_0")
-        shifted_sum_minus_one.quantization = no_scale_quant
-        shl9_op.set_output_tensor(shifted_sum_minus_one)
-        DebugDatabase.add_optimised(self.op, shl9_op)
-        pass_number += 1
+        shifted_sum_minus_one = add_op_get_ofm(
+            create_shl(f"{self.op.name}_shl{pass_number}", shifted_sum_minus_one, one, no_scale_quant, activation,)
+        )
 
         # PASS 10 - Add
-        add10_op = Operation(Op.Add, f"{self.op.name}_add{pass_number}")
-        add10_op.add_input_tensor(
-            create_const_tensor(
-                "F0_one_const", [1, 1, 1, 1], DataType.int32, [(1 << 31) - 1], np.int32, quantization=no_scale_quant
-            ),
-        )
-        add10_op.add_input_tensor(shifted_sum_minus_one)
-        add10_op.activation = activation.clone()
-        add10_op.attrs["rescale"] = (1, 1)
-        half_denominator = Tensor(sum_of_exp.shape, DataType.int32, f"{add10_op.name}_0")
-        half_denominator.quantization = one_scale_quant
-        add10_op.set_output_tensor(half_denominator)
-        DebugDatabase.add_optimised(self.op, add10_op)
-        pass_number += 1
+        f0_one_const = create_const_tensor(
+            "F0_one_const", [1, 1, 1, 1], DataType.int32, [(1 << 31) - 1], np.int32, quantization=no_scale_quant
+        )
+        half_denominator = add_op_get_ofm(
+            create_add(
+                f"{self.op.name}_add{pass_number}",
+                f0_one_const,
+                shifted_sum_minus_one,
+                one_scale_quant,
+                activation,
+                attrs={"rescale": (1, 1)},
+            )
+        )
 
         # PASS 11 - Multiply
-        mul11_op = Operation(Op.Mul, f"{self.op.name}_mul{pass_number}")
-        mul11_op.add_input_tensor(half_denominator)
-        mul11_op.add_input_tensor(
-            create_const_tensor(
-                "neg_32_over_17_const",
-                [1, 1, 1, 1],
-                DataType.int32,
-                [-1010580540],
-                np.int32,
-                quantization=one_scale_quant,
-            ),
+        neg_32_over_17 = create_const_tensor(
+            "neg_32_over_17_const", [1, 1, 1, 1], DataType.int32, [-1010580540], np.int32, quantization=one_scale_quant
+        )
+        rescaled = add_op_get_ofm(
+            create_mul(
+                f"{self.op.name}_mul{pass_number}", half_denominator, neg_32_over_17, two_scale_quant, activation2,
+            )
         )
-        rescaled = Tensor(sum_of_exp.shape, DataType.int32, f"{mul11_op.name}_0")
-        rescaled.quantization = one_scale_quant.clone()
-        rescaled.quantization.scale_f32 = 2.0
-        mul11_op.activation = activation2.clone()
-        mul11_op.set_output_tensor(rescaled)
-        DebugDatabase.add_optimised(self.op, mul11_op)
-        pass_number += 1
 
         # PASS 12 - Add
-        add12_op = Operation(Op.Add, f"{self.op.name}_add{pass_number}")
-        add12_op.add_input_tensor(rescaled)
-        add12_op.add_input_tensor(
-            create_const_tensor(
-                "48_over_17_const", [1, 1, 1, 1], DataType.int32, [1515870810], np.int32, quantization=no_scale_quant
-            ),
+        const_48_over_17 = create_const_tensor(
+            "48_over_17_const", [1, 1, 1, 1], DataType.int32, [1515870810], np.int32, quantization=no_scale_quant
+        )
+        rescale_w_offset = add_op_get_ofm(
+            create_add(f"{self.op.name}_add{pass_number}", rescaled, const_48_over_17, one_scale_quant, activation,)
         )
-        add12_op.activation = activation.clone()
-        rescale_w_offset = Tensor(sum_of_exp.shape, DataType.int32, f"{add12_op.name}_0")
-        rescale_w_offset.quantization = one_scale_quant
-        add12_op.set_output_tensor(rescale_w_offset)
-        DebugDatabase.add_optimised(self.op, add12_op)
-        pass_number += 1
+
         # PASS 13 - 27
         nr_x = rescale_w_offset
         F2_one = create_const_tensor(
             "F2_one_const", [1, 1, 1, 1], DataType.int32, [(1 << 29)], np.int32, quantization=no_scale_quant
@@ -456,80 +372,44 @@ class SoftMax:
         )
         for _ in range(3):
             # PASS 13, 18, 23 - MUL
-            mul_op = Operation(Op.Mul, f"{self.op.name}_mul{pass_number}")
-            mul_op.add_input_tensor(nr_x)
-            mul_op.add_input_tensor(half_denominator)
-            mul_op.activation = activation2.clone()
-            half_denominator_times_x = Tensor(sum_of_exp.shape, DataType.int32, f"{mul_op.name}_0")
-            half_denominator_times_x.quantization = one_scale_quant.clone()
-            half_denominator_times_x.quantization.scale_f32 = 2.0
-            mul_op.set_output_tensor(half_denominator_times_x)
-            pass_number += 1
+            half_denominator_times_x = add_op_get_ofm(
+                create_mul(f"{self.op.name}_mul{pass_number}", nr_x, half_denominator, two_scale_quant, activation2,)
+            )
 
             # PASS 14, 19, 24 - SUB
-            sub_op = Operation(Op.Sub, f"{self.op.name}_sub{pass_number}")
-            sub_op.add_input_tensor(F2_one)
-            sub_op.add_input_tensor(half_denominator_times_x)
-            sub_op.activation = activation.clone()
-            one_minus_half_denominator_times_x = Tensor(sum_of_exp.shape, DataType.int32, f"{sub_op.name}_0")
-            one_minus_half_denominator_times_x.quantization = one_scale_quant
-            sub_op.set_output_tensor(one_minus_half_denominator_times_x)
-            DebugDatabase.add_optimised(self.op, sub_op)
-            pass_number += 1
+            one_minus_half_denominator_times_x = add_op_get_ofm(
+                create_sub(
+                    f"{self.op.name}_sub{pass_number}", F2_one, half_denominator_times_x, one_scale_quant, activation,
+                )
+            )
 
             # PASS 15, 20, 25 - MUL
-            mul_op = Operation(Op.Mul, f"{self.op.name}_mul{pass_number}")
-            mul_op.add_input_tensor(nr_x)
-            mul_op.add_input_tensor(one_minus_half_denominator_times_x)
-            mul_op.activation = activation2.clone()
-            to_rescale = Tensor(sum_of_exp.shape, DataType.int32, f"{mul_op.name}_0")
-            to_rescale.quantization = one_scale_quant.clone()
-            to_rescale.quantization.scale_f32 = 2.0
-            mul_op.set_output_tensor(to_rescale)
-            pass_number += 1
+            to_rescale = add_op_get_ofm(
+                create_mul(
+                    f"{self.op.name}_mul{pass_number}",
+                    nr_x,
+                    one_minus_half_denominator_times_x,
+                    two_scale_quant,
+                    activation2,
+                )
+            )
 
             # PASS 16, 21, 26 - MUL
-            shl_op = Operation(Op.Mul, f"{self.op.name}_mul{pass_number}")
-            shl_op.add_input_tensor(to_rescale)
-            shl_op.add_input_tensor(four)
-            shl_op.activation = activation.clone()
-            to_add = Tensor(sum_of_exp.shape, DataType.int32, f"{shl_op.name}_0")
-            to_add.quantization = no_scale_quant
-            shl_op.set_output_tensor(to_add)
-            DebugDatabase.add_optimised(self.op, shl_op)
-            pass_number += 1
+            to_add = add_op_get_ofm(
+                create_mul(f"{self.op.name}_mul{pass_number}", to_rescale, four, no_scale_quant, activation)
+            )
 
             # PASS 17, 22, 27 - ADD
-            add_op = Operation(Op.Add, f"{self.op.name}_add{pass_number}")
-            add_op.add_input_tensor(nr_x)
-            add_op.add_input_tensor(to_add)
-            add_op.activation = activation.clone()
-            nr_x = Tensor(sum_of_exp.shape, DataType.int32, f"{add_op.name}_0")
f"{add_op.name}_0") - nr_x.quantization = one_scale_quant - add_op.set_output_tensor(nr_x) - DebugDatabase.add_optimised(self.op, add_op) - pass_number += 1 + nr_x = add_op_get_ofm( + create_add(f"{self.op.name}_add{pass_number}", nr_x, to_add, one_scale_quant, activation) + ) # PASS 28 - Multiply - mul28_op = Operation(Op.Mul, f"{self.op.name}_mul{pass_number}") - mul28_op.add_input_tensor(nr_x) - mul28_op.add_input_tensor( - create_const_tensor("two_const", [1, 1, 1, 1], DataType.int32, [2], np.int32, quantization=no_scale_quant) - ) - mul28_op.activation = activation.clone() - scale_factor = Tensor(sum_of_exp.shape, DataType.int32, f"{mul28_op.name}_0") - scale_factor.quantization = one_scale_quant - mul28_op.set_output_tensor(scale_factor) - DebugDatabase.add_optimised(self.op, mul28_op) - pass_number += 1 + two = create_const_tensor("two_const", [1, 1, 1, 1], DataType.int32, [2], np.int32, quantization=no_scale_quant) + scale_factor = add_op_get_ofm( + create_mul(f"{self.op.name}_mul{pass_number}", nr_x, two, one_scale_quant, activation) + ) # PASS 29 - Multiply - mul_op = Operation(Op.Mul, f"{self.op.name}_mul{pass_number}") - mul_op.add_input_tensor(ifm_exp) - mul_op.add_input_tensor(scale_factor) - mul_op.activation = activation2.clone() - scaled_exp = Tensor(ifm_exp.shape, DataType.int32, f"{mul_op.name}_0") - scaled_exp.quantization = one_scale_quant.clone() - scaled_exp.quantization.scale_f32 = 2.0 - mul_op.set_output_tensor(scaled_exp) - DebugDatabase.add_optimised(self.op, mul_op) - pass_number += 1 + scaled_exp = add_op_get_ofm( + create_mul(f"{self.op.name}_mul{pass_number}", ifm_exp, scale_factor, two_scale_quant, activation2) + ) # PASS 30 - SHR shr30_op = Operation(Op.SHR, f"{self.op.name}_shr{pass_number}") @@ -538,7 +418,6 @@ class SoftMax: shr30_op.add_input_tensor(right_shift) shr30_op.set_output_tensor(ofm) DebugDatabase.add_optimised(self.op, shr30_op) - pass_number += 1 return shr30_op @@ -547,176 +426,97 @@ class SoftMax: no_scale_quant.scale_f32 = None pass_number = 0 + def add_op_get_ofm(op): + DebugDatabase.add_optimised(self.op, op) + nonlocal pass_number + pass_number += 1 + return op.ofm + # PASS 0 - Depthwise Maxpool - maxpool_op = self.op.clone(f"_maxpool{pass_number}") - maxpool_op.type = Op.MaxPool - DebugDatabase.add_optimised(self.op, maxpool_op) - maxpool_h = ifm.shape[1] * ifm.shape[2] - maxpool_w = ifm.shape[3] - maxpool_ifm_shape = [1, maxpool_h, maxpool_w, 1] - maxpool_op.attrs["padding"] = b"VALID" - maxpool_op.attrs["stride_w"] = 1 - maxpool_op.attrs["stride_h"] = 1 - maxpool_op.attrs["filter_width"] = maxpool_w - maxpool_op.attrs["filter_height"] = 1 - maxpool_op.attrs["strides"] = [1, maxpool_op.attrs["stride_h"], maxpool_op.attrs["stride_w"], 1] - maxpool_op.attrs["ksize"] = [1, maxpool_op.attrs["filter_height"], maxpool_op.attrs["filter_width"], 1] - maxpool_op.inputs = [create_reshape_tensor(ifm, maxpool_ifm_shape)] - ifm_max = Tensor([1, maxpool_h, 1, 1], ifm.dtype, f"{maxpool_op.name}_0") - ifm_max.quantization = no_scale_quant - maxpool_op.set_output_tensor(ifm_max) - DebugDatabase.add_optimised(self.op, maxpool_op) - pass_number += 1 + ifm_max = add_op_get_ofm(create_depthwise_maxpool(f"{self.op.name}_maxpool{pass_number}", ifm, no_scale_quant)) # PASS 1 - Sub - sub1_op = Operation(Op.Sub, f"{self.op.name}_sub{pass_number}") - sub1_op.add_input_tensor(ifm) - sub1_op.add_input_tensor(create_reshape_tensor(ifm_max, [1, ifm.shape[1], ifm.shape[2], 1])) - sub1_ofm = Tensor(ifm.shape, DataType.int32, f"{sub1_op.name}_0") - 
-        sub1_ofm.quantization = ifm.quantization.clone()
-        sub1_op.set_output_tensor(sub1_ofm)
-        DebugDatabase.add_optimised(self.op, sub1_op)
-        pass_number += 1
+        ifm_max = create_reshape_tensor(ifm_max, [1, ifm.shape[1], ifm.shape[2], 1])
+        sub1_ofm = add_op_get_ofm(
+            create_sub(f"{self.op.name}_sub{pass_number}", ifm, ifm_max, ifm.quantization.clone(), dtype=DataType.int32)
+        )
 
         # PASS 2 - Mul
+        name = f"{self.op.name}_mul{pass_number}"
         beta = self.op.attrs.get("beta", 1.0)
         mul2_out_range = 10.0 / 65535.0
         mul2_scale, _ = scaling.elementwise_mul_scale(sub1_ofm.quantization.scale_f32, beta, mul2_out_range)
-        mul2_quant = ifm.quantization.clone()
-        mul2_quant.scale_f32 = beta
-        mul2_op = Operation(Op.Mul, f"{self.op.name}_mul{pass_number}")
-        mul2_op.add_input_tensor(sub1_ofm)
-        mul2_op.add_input_tensor(
-            create_const_tensor(
-                f"{mul2_op.name}_const", [1, 1, 1, 1], DataType.int32, [mul2_scale], np.int32, quantization=mul2_quant
-            ),
+        scale_quant = ifm.quantization.clone()
+        scale_quant.scale_f32 = beta
+        mul2_quant = ofm.quantization.clone()
+        mul2_quant.scale_f32 = mul2_out_range
+        scale = create_const_tensor(
+            f"{name}_scale_const", [1, 1, 1, 1], DataType.int32, [mul2_scale], np.int32, quantization=scale_quant
         )
-        mul2_ofm = Tensor(ifm.shape, DataType.int32, f"{self.op.name}_mul{pass_number}")
-        mul2_ofm.quantization = ofm.quantization.clone()
-        mul2_ofm.quantization.scale_f32 = mul2_out_range
-        mul2_op.set_output_tensor(mul2_ofm)
-        DebugDatabase.add_optimised(self.op, mul2_op)
-        pass_number += 1
+        mul2_ofm = add_op_get_ofm(create_mul(name, sub1_ofm, scale, mul2_quant))
 
         # PASS 3 - Add+LUT(exp)
-        add_op = Operation(Op.Add, f"{self.op.name}_add{pass_number}")
-        add_op.add_input_tensor(mul2_ofm)
-        add_op.add_input_tensor(
-            create_const_tensor(
-                f"{add_op.name}_const", [1, 1, 1, 1], DataType.int32, [32767], np.int32, quantization=no_scale_quant
-            ),
+        name = f"{self.op.name}_add{pass_number}"
+        const_add = create_const_tensor(
+            f"{name}_const", [1, 1, 1, 1], DataType.int32, [32767], np.int32, quantization=no_scale_quant
         )
+        add_op = create_add(name, mul2_ofm, const_add, mul2_ofm.quantization.clone(), dtype=DataType.int16)
         add_op.set_activation_lut(
             create_const_tensor(
-                f"{add_op.name}_lut", [1, 1, 1, 512], DataType.int32, self.EXP_LUT, np.int32, TensorPurpose.LUT
+                f"{name}_exp_lut", [1, 1, 1, 512], DataType.int32, self.EXP_LUT, np.int32, TensorPurpose.LUT
             )
         )
-        exp_ofm = Tensor(mul2_ofm.shape, DataType.int16, f"{add_op.name}_0")
-        exp_ofm.quantization = mul2_ofm.quantization.clone()
-        add_op.set_output_tensor(exp_ofm)
-        DebugDatabase.add_optimised(self.op, add_op)
-        pass_number += 1
+        ifm_exp = add_op_get_ofm(add_op)
 
         # PASS 4 - Reduce sum
-        reduce_sum_op = Operation(Op.ReduceSum, self.op.name + "_reduce_sum4")
-        reduce_sum_op.attrs["padding"] = b"VALID"
-        reduce_sum_op.attrs["stride_w"] = 1
-        reduce_sum_op.attrs["stride_h"] = 1
-        reduce_sum_op.attrs["filter_width"] = 1
-        reduce_sum_op.attrs["filter_height"] = 1
-        reduce_sum_op.attrs["strides"] = [1, reduce_sum_op.attrs["stride_h"], reduce_sum_op.attrs["stride_w"], 1]
-        reduce_sum_op.attrs["ksize"] = [1, reduce_sum_op.attrs["filter_height"], reduce_sum_op.attrs["filter_width"], 1]
-        reduce_sum_op.add_input_tensor(exp_ofm)
-
-        reduce_sum_shape = [1, exp_ofm.shape[1], exp_ofm.shape[2], 1]
-        sum_of_exp = Tensor(reduce_sum_shape, DataType.int32, f"{reduce_sum_op.name}_0")
-        sum_of_exp.quantization = no_scale_quant
-        reduce_sum_op.set_output_tensor(sum_of_exp)
-        DebugDatabase.add_optimised(self.op, reduce_sum_op)
-        pass_number += 1
+        sum_of_exp = add_op_get_ofm(
+            create_reduce_sum(f"{self.op.name}_reduce_sum{pass_number}", ifm_exp, no_scale_quant)
+        )
 
         # PASS 5 - CLZ
-        clz_op = Operation(Op.CLZ, f"{self.op.name}_clz{pass_number}")
-        clz_op.add_input_tensor(sum_of_exp)
-        headroom_plus_one = Tensor(sum_of_exp.shape, DataType.int32, f"{clz_op.name}_0")
-        headroom_plus_one.quantization = no_scale_quant
-        clz_op.set_output_tensor(headroom_plus_one)
-        DebugDatabase.add_optimised(self.op, clz_op)
-        pass_number += 1
+        headroom_plus_one = add_op_get_ofm(create_clz(f"{self.op.name}_clz{pass_number}", sum_of_exp, no_scale_quant))
 
         # PASS 6 - Sub
-        sub6_op = Operation(Op.Sub, f"{self.op.name}_sub{pass_number}")
-        sub6_op.add_input_tensor(
-            create_const_tensor(
-                f"{sub6_op.name}_const", [1, 1, 1, 1], DataType.int32, [31], np.int32, quantization=no_scale_quant
-            ),
+        name = f"{self.op.name}_sub{pass_number}"
+        const_31 = create_const_tensor(
+            f"{name}_const", [1, 1, 1, 1], DataType.int32, [31], np.int32, quantization=no_scale_quant
        )
-        sub6_op.add_input_tensor(headroom_plus_one)
-        reciprocal_right_shift = Tensor(sum_of_exp.shape, DataType.int32, f"{sub6_op.name}_0")
-        reciprocal_right_shift.quantization = no_scale_quant
-        sub6_op.set_output_tensor(reciprocal_right_shift)
-        DebugDatabase.add_optimised(self.op, sub6_op)
-        pass_number += 1
+        reciprocal_right_shift = add_op_get_ofm(create_sub(name, const_31, headroom_plus_one, no_scale_quant))
 
         # PASS 7 - SHL
-        shl7_op = Operation(Op.SHL, f"{self.op.name}_shl{pass_number}")
-        shl7_op.add_input_tensor(
-            create_const_tensor(
-                f"{shl7_op.name}_const", [1, 1, 1, 1], DataType.int32, [1], np.int32, quantization=no_scale_quant
-            ),
+        one = create_const_tensor(
+            "one_const", [1, 1, 1, 1], DataType.int32, [1], np.int32, quantization=no_scale_quant
+        )
+        constant_one = add_op_get_ofm(
+            create_shl(f"{self.op.name}_shl{pass_number}", one, reciprocal_right_shift, no_scale_quant)
         )
-        shl7_op.add_input_tensor(reciprocal_right_shift)
-        constant_one = Tensor(sum_of_exp.shape, DataType.int32, f"{shl7_op.name}_0")
-        constant_one.quantization = no_scale_quant
-        shl7_op.set_output_tensor(constant_one)
-        DebugDatabase.add_optimised(self.op, shl7_op)
-        pass_number += 1
 
         # PASS 8 - Sub
-        sub8_op = Operation(Op.Sub, f"{self.op.name}_sub{pass_number}")
-        sub8_op.add_input_tensor(sum_of_exp)
-        sub8_op.add_input_tensor(constant_one)
-        sum_of_exps_minus_one = Tensor(sum_of_exp.shape, DataType.int32, f"{sub8_op.name}_0")
-        sum_of_exps_minus_one.quantization = no_scale_quant
-        sub8_op.set_output_tensor(sum_of_exps_minus_one)
-        DebugDatabase.add_optimised(self.op, sub8_op)
-        pass_number += 1
+        sum_of_exps_minus_one = add_op_get_ofm(
+            create_sub(f"{self.op.name}_sub{pass_number}", sum_of_exp, constant_one, no_scale_quant)
+        )
 
         # PASS 9 - SHL
-        shl9_op = Operation(Op.SHL, f"{self.op.name}_shl{pass_number}")
-        shl9_op.add_input_tensor(sum_of_exps_minus_one)
-        shl9_op.add_input_tensor(headroom_plus_one)
-        shifted_sum_minus_one = Tensor(sum_of_exp.shape, DataType.int32, f"{shl9_op.name}_0")
-        shifted_sum_minus_one.quantization = no_scale_quant
-        shl9_op.set_output_tensor(shifted_sum_minus_one)
-        DebugDatabase.add_optimised(self.op, shl9_op)
-        pass_number += 1
+        shifted_sum_minus_one = add_op_get_ofm(
+            create_shl(f"{self.op.name}_shl{pass_number}", sum_of_exps_minus_one, headroom_plus_one, no_scale_quant)
+        )
 
         # PASS 10 - SHR
-        shr10_op = Operation(Op.SHR, f"{self.op.name}_shr{pass_number}")
-        shr10_op.add_input_tensor(shifted_sum_minus_one)
-        shr10_op.add_input_tensor(
-            create_const_tensor(
-                f"{shr10_op.name}_const", [1, 1, 1, 1], DataType.int32, [15], np.int32, quantization=no_scale_quant
-            ),
+        name = f"{self.op.name}_shr{pass_number}"
+        shift = create_const_tensor(
+            f"{name}_const", [1, 1, 1, 1], DataType.int32, [15], np.int32, quantization=no_scale_quant
         )
-        shifted_sum_minus_one_16 = Tensor(sum_of_exp.shape, DataType.int32, f"{shr10_op.name}_0")
-        shifted_sum_minus_one_16.quantization = shifted_sum_minus_one.quantization.clone()
-        shr10_op.set_output_tensor(shifted_sum_minus_one_16)
-        DebugDatabase.add_optimised(self.op, shr10_op)
-        pass_number += 1
+        shifted_sum_minus_one_16 = add_op_get_ofm(create_shr(name, shifted_sum_minus_one, shift, no_scale_quant))
 
         # PASS 11 - Sub+LUT(one over one plus x)
-        sub11_op = Operation(Op.Sub, f"{self.op.name}_sub{pass_number}")
-        sub11_op.add_input_tensor(shifted_sum_minus_one_16)
-        sub11_op.add_input_tensor(
-            create_const_tensor(
-                f"{sub11_op.name}_const", [1, 1, 1, 1], DataType.int32, [32768], np.int32, quantization=no_scale_quant
-            ),
+        name = f"{self.op.name}_sub{pass_number}"
+        sub11_const = create_const_tensor(
+            f"{name}_const", [1, 1, 1, 1], DataType.int32, [32768], np.int32, quantization=no_scale_quant
        )
+        sub11_op = create_sub(name, shifted_sum_minus_one_16, sub11_const, no_scale_quant, dtype=DataType.int16)
         sub11_op.set_activation_lut(
             create_const_tensor(
-                f"{sub11_op.name}_lut",
+                f"{name}_one_over_one_plus_x_lut",
                 [1, 1, 1, 512],
                 DataType.int32,
                 self.ONE_OVER_ONE_PLUS_X_LUT,
@@ -724,21 +524,14 @@
                 np.int32,
                 TensorPurpose.LUT,
             )
         )
-        reciprocal_scale = Tensor(sum_of_exp.shape, DataType.int16, f"{sub11_op.name}_0")
-        reciprocal_scale.quantization = no_scale_quant
-        sub11_op.set_output_tensor(reciprocal_scale)
-        DebugDatabase.add_optimised(self.op, sub11_op)
-        pass_number += 1
+        reciprocal_scale = add_op_get_ofm(sub11_op)
 
         # PASS 12 - Multiply
-        mul_op = Operation(Op.Mul, f"{self.op.name}_mul{pass_number}")
-        mul_op.add_input_tensor(exp_ofm)
-        mul_op.add_input_tensor(reciprocal_scale)
-        mul_ofm = Tensor(exp_ofm.shape, DataType.int32, f"{mul_op.name}_0")
-        mul_ofm.quantization = no_scale_quant
-        mul_op.set_output_tensor(mul_ofm)
-        DebugDatabase.add_optimised(self.op, mul_op)
-        pass_number += 1
+        mul_ofm = add_op_get_ofm(
+            create_mul(
+                f"{self.op.name}_mul{pass_number}", ifm_exp, reciprocal_scale, no_scale_quant, dtype=DataType.int32
+            )
+        )
 
         # PASS 13 - SHR
         shr13_op = Operation(Op.SHR, f"{self.op.name}_shr{pass_number}")
@@ -746,6 +539,5 @@
         shr13_op.add_input_tensor(reciprocal_right_shift)
         shr13_op.set_output_tensor(ofm)
         DebugDatabase.add_optimised(self.op, shr13_op)
-        pass_number += 1
 
         return shr13_op
-- 
cgit v1.2.1