diff options
author | Fredrik Svedberg <fredrik.svedberg@arm.com> | 2020-12-01 16:33:45 +0100 |
---|---|---|
committer | patrik.gustavsson <patrik.gustavsson@arm.com> | 2020-12-08 14:00:17 +0000 |
commit | d9c2c4258c50f04ef3a6c3849508d317249e8ebf (patch) | |
tree | 6604f3fef997fb946884bf5bd2f54236b523393f /ethosu/vela/softmax.py | |
parent | d5cf7650b6bcfa5d81321661fa300763660d31ab (diff) | |
download | ethos-u-vela-d9c2c4258c50f04ef3a6c3849508d317249e8ebf.tar.gz |
[MLBEDSW-3690] Refactor Softmax
Move operator generation code to common functions.
Signed-off-by: Fredrik Svedberg <fredrik.svedberg@arm.com>
Change-Id: I02e185fd793a96ae435fa7d235c9d1e97f388a03
Diffstat (limited to 'ethosu/vela/softmax.py')
-rw-r--r-- | ethosu/vela/softmax.py | 554 |
1 file changed, 173 insertions, 381 deletions
diff --git a/ethosu/vela/softmax.py b/ethosu/vela/softmax.py index 8c980ad4..1bdab740 100644 --- a/ethosu/vela/softmax.py +++ b/ethosu/vela/softmax.py @@ -30,9 +30,16 @@ from .debug_database import DebugDatabase from .operation import ActivationFunction from .operation import Op from .operation import Operation +from .operation_util import create_add +from .operation_util import create_clz +from .operation_util import create_depthwise_maxpool +from .operation_util import create_mul +from .operation_util import create_reduce_sum +from .operation_util import create_shl +from .operation_util import create_shr +from .operation_util import create_sub from .tensor import create_const_tensor from .tensor import create_reshape_tensor -from .tensor import Tensor from .tensor import TensorPurpose @@ -238,215 +245,124 @@ class SoftMax: one_scale_quant = ifm.quantization.clone() one_scale_quant.scale_f32 = 1.0 one_scale_quant.zero_point = 0 + two_scale_quant = one_scale_quant.clone() + two_scale_quant.scale_f32 = 2.0 ifm.quantization.zero_point = 0 pass_number = 0 + def add_op_get_ofm(op): + DebugDatabase.add_optimised(self.op, op) + nonlocal pass_number + pass_number += 1 + return op.ofm + # PASS 0 - Depthwise Maxpool - maxpool_op = self.op.clone(f"_maxpool{pass_number}") - maxpool_op.type = Op.MaxPool - maxpool_h = ifm.shape[1] * ifm.shape[2] - maxpool_w = ifm.shape[3] - maxpool_ifm_shape = [1, maxpool_h, maxpool_w, 1] - maxpool_op.attrs["padding"] = b"VALID" - maxpool_op.attrs["stride_w"] = 1 - maxpool_op.attrs["stride_h"] = 1 - maxpool_op.attrs["filter_width"] = maxpool_w - maxpool_op.attrs["filter_height"] = 1 - maxpool_op.attrs["strides"] = [1, maxpool_op.attrs["stride_h"], maxpool_op.attrs["stride_w"], 1] - maxpool_op.attrs["ksize"] = [1, maxpool_op.attrs["filter_height"], maxpool_op.attrs["filter_width"], 1] - maxpool_op.inputs = [create_reshape_tensor(ifm, maxpool_ifm_shape)] - ifm_max = Tensor([1, maxpool_h, 1, 1], ifm.dtype, f"{maxpool_op.name}_0") - 
ifm_max.quantization = no_scale_quant - maxpool_op.set_output_tensor(ifm_max) - DebugDatabase.add_optimised(self.op, maxpool_op) - pass_number += 1 + ifm_max = add_op_get_ofm(create_depthwise_maxpool(f"{self.op.name}_maxpool{pass_number}", ifm, no_scale_quant)) # PASS 1 - Sub+LUT(exp) - sub_op = Operation(Op.Sub, f"{self.op.name}_sub{pass_number}") - sub_op.add_input_tensor(ifm) - sub_op.add_input_tensor(create_reshape_tensor(ifm_max, [1, ifm.shape[1], ifm.shape[2], 1])) + sub_op_quantization = one_scale_quant.clone() + sub_op_quantization.zero_point = 127 + ifm_max = create_reshape_tensor(ifm_max, [1, ifm.shape[1], ifm.shape[2], 1]) + sub_op = create_sub(f"{self.op.name}_sub{pass_number}", ifm, ifm_max, sub_op_quantization, dtype=DataType.int32) sub_op.set_activation_lut( create_const_tensor( - f"{sub_op.name}_lut", [1, 1, 1, 256], DataType.int32, exp_lut, np.int32, TensorPurpose.LUT + f"{sub_op.name}_exp_lut", [1, 1, 1, 256], DataType.int32, exp_lut, np.int32, TensorPurpose.LUT ) ) - ifm_exp = Tensor(ifm.shape, DataType.int32, f"{sub_op.name}_0") - ifm_exp.quantization = one_scale_quant.clone() - ifm_exp.quantization.zero_point = 127 - sub_op.activation = ActivationFunction(Op.LUT) + ifm_exp = add_op_get_ofm(sub_op) # Note: activation.min/max are non-quantized values sub_op.activation.min = -128 - ifm_exp.quantization.zero_point sub_op.activation.max = 127 - ifm_exp.quantization.zero_point - sub_op.set_output_tensor(ifm_exp) - DebugDatabase.add_optimised(self.op, sub_op) - pass_number += 1 # PASS 2 - SHR - shr2_op = Operation(Op.SHR, f"{self.op.name}_shr{pass_number}") - shr2_op.attrs["rounding_mode"] = NpuRoundingMode.NATURAL - shr2_op.add_input_tensor(ifm_exp) - shr2_op.add_input_tensor( - create_const_tensor( - f"{shr2_op.name}_const", [1, 1, 1, 1], DataType.int32, [12], np.int32, quantization=no_scale_quant - ), + name = f"{self.op.name}_shr{pass_number}" + shift = create_const_tensor( + f"{name}_const", [1, 1, 1, 1], DataType.int32, [12], np.int32, 
quantization=no_scale_quant + ) + rescaled_exp = add_op_get_ofm( + create_shr( + name, ifm_exp, shift, no_scale_quant, activation, attrs={"rounding_mode": NpuRoundingMode.NATURAL}, + ) ) - shr2_op.activation = activation.clone() - rescaled_exp = Tensor(ifm.shape, ifm_exp.dtype, f"{shr2_op.name}_0") - rescaled_exp.quantization = no_scale_quant - shr2_op.set_output_tensor(rescaled_exp) - DebugDatabase.add_optimised(self.op, shr2_op) - pass_number += 1 # PASS 3 - Reduce sum - reduce_sum_op = Operation(Op.ReduceSum, f"{self.op.name}_reduce_sum3") - reduce_sum_op.attrs["padding"] = b"VALID" - reduce_sum_op.attrs["stride_w"] = 1 - reduce_sum_op.attrs["stride_h"] = 1 - reduce_sum_op.attrs["filter_width"] = 1 - reduce_sum_op.attrs["filter_height"] = 1 - reduce_sum_op.attrs["strides"] = [1, reduce_sum_op.attrs["stride_h"], reduce_sum_op.attrs["stride_w"], 1] - reduce_sum_op.attrs["ksize"] = [1, reduce_sum_op.attrs["filter_height"], reduce_sum_op.attrs["filter_width"], 1] - reduce_sum_op.add_input_tensor(rescaled_exp) - reduce_sum_op.activation = activation.clone() - - reduce_sum_shape = [1, rescaled_exp.shape[1], rescaled_exp.shape[2], 1] - sum_of_exp = Tensor(reduce_sum_shape, DataType.int32, f"{reduce_sum_op.name}_0") - sum_of_exp.quantization = no_scale_quant - reduce_sum_op.set_output_tensor(sum_of_exp) - DebugDatabase.add_optimised(self.op, reduce_sum_op) - pass_number += 1 + sum_of_exp = add_op_get_ofm( + create_reduce_sum(f"{self.op.name}_reduce_sum{pass_number}", rescaled_exp, no_scale_quant, activation) + ) # PASS 4 - CLZ - clz_op = Operation(Op.CLZ, f"{self.op.name}_clz{pass_number}") - clz_op.add_input_tensor(sum_of_exp) - clz_op.activation = activation.clone() - headroom_plus_one = Tensor(sum_of_exp.shape, DataType.int32, f"{clz_op.name}_0") - headroom_plus_one.quantization = no_scale_quant - clz_op.set_output_tensor(headroom_plus_one) - DebugDatabase.add_optimised(self.op, clz_op) - pass_number += 1 + headroom_plus_one = add_op_get_ofm( + 
create_clz(f"{self.op.name}_clz{pass_number}", sum_of_exp, no_scale_quant, activation) + ) # PASS 5 - Sub - sub5_op = Operation(Op.Sub, f"{self.op.name}_sub{pass_number}") - sub5_op.add_input_tensor( - create_const_tensor( - "headroom_offset_const", - [1, 1, 1, 1], - DataType.int32, - [12 + 31 - 8], - np.int32, - quantization=no_scale_quant, - ), + headroom_offset = create_const_tensor( + "headroom_offset_const", [1, 1, 1, 1], DataType.int32, [12 + 31 - 8], np.int32, quantization=no_scale_quant, + ) + right_shift = add_op_get_ofm( + create_sub( + f"{self.op.name}_sub{pass_number}", headroom_offset, headroom_plus_one, no_scale_quant, activation, + ) ) - sub5_op.add_input_tensor(headroom_plus_one) - sub5_op.activation = activation.clone() - right_shift = Tensor(sum_of_exp.shape, DataType.int32, f"{sub5_op.name}_0") - right_shift.quantization = no_scale_quant - sub5_op.set_output_tensor(right_shift) - DebugDatabase.add_optimised(self.op, sub5_op) - pass_number += 1 # PASS 6 - Sub one = create_const_tensor("one_const", [1, 1, 1, 1], DataType.int32, [1], np.int32, quantization=no_scale_quant) - sub6_op = Operation(Op.Sub, f"{self.op.name}_sub{pass_number}") - sub6_op.add_input_tensor(headroom_plus_one) - sub6_op.add_input_tensor(one) - sub6_op.activation = activation.clone() - headroom = Tensor(sum_of_exp.shape, DataType.int32, f"{sub6_op.name}_0") - headroom.quantization = no_scale_quant - sub6_op.set_output_tensor(headroom) - DebugDatabase.add_optimised(self.op, sub6_op) - pass_number += 1 + headroom = add_op_get_ofm( + create_sub(f"{self.op.name}_sub{pass_number}", headroom_plus_one, one, no_scale_quant, activation) + ) # PASS 7 - SHL - shl7_op = Operation(Op.SHL, f"{self.op.name}_shl{pass_number}") - shl7_op.add_input_tensor(sum_of_exp) - shl7_op.add_input_tensor(headroom) - shl7_op.activation = activation.clone() - shifted_sum = Tensor(sum_of_exp.shape, DataType.int32, f"{shl7_op.name}_0") - shifted_sum.quantization = no_scale_quant - 
shl7_op.set_output_tensor(shifted_sum) - DebugDatabase.add_optimised(self.op, shl7_op) - pass_number += 1 + shifted_sum = add_op_get_ofm( + create_shl(f"{self.op.name}_shl{pass_number}", sum_of_exp, headroom, no_scale_quant, activation) + ) # PASS 8 - Sub - sub8_op = Operation(Op.Sub, f"{self.op.name}_sub{pass_number}") - sub8_op.add_input_tensor(shifted_sum) - sub8_op.add_input_tensor( - create_const_tensor( - "shifted_one_const", [1, 1, 1, 1], DataType.int32, [1 << 30], np.int32, quantization=no_scale_quant - ), + shifted_one = create_const_tensor( + "shifted_one_const", [1, 1, 1, 1], DataType.int32, [1 << 30], np.int32, quantization=no_scale_quant + ) + shifted_sum_minus_one = add_op_get_ofm( + create_sub(f"{self.op.name}_sub{pass_number}", shifted_sum, shifted_one, no_scale_quant, activation) ) - sub8_op.activation = activation.clone() - shifted_sum_minus_one = Tensor(sum_of_exp.shape, DataType.int32, f"{sub8_op.name}_0") - shifted_sum_minus_one.quantization = no_scale_quant - sub8_op.set_output_tensor(shifted_sum_minus_one) - DebugDatabase.add_optimised(self.op, sub8_op) - pass_number += 1 # PASS 9 - SHL - shl9_op = Operation(Op.SHL, f"{self.op.name}_shl{pass_number}") - shl9_op.add_input_tensor(shifted_sum_minus_one) - shl9_op.add_input_tensor(one) - shl9_op.activation = activation.clone() - shifted_sum_minus_one = Tensor(sum_of_exp.shape, DataType.int32, f"{shl9_op.name}_0") - shifted_sum_minus_one.quantization = no_scale_quant - shl9_op.set_output_tensor(shifted_sum_minus_one) - DebugDatabase.add_optimised(self.op, shl9_op) - pass_number += 1 + shifted_sum_minus_one = add_op_get_ofm( + create_shl(f"{self.op.name}_shl{pass_number}", shifted_sum_minus_one, one, no_scale_quant, activation,) + ) # PASS 10 - Add - add10_op = Operation(Op.Add, f"{self.op.name}_add{pass_number}") - add10_op.add_input_tensor( - create_const_tensor( - "F0_one_const", [1, 1, 1, 1], DataType.int32, [(1 << 31) - 1], np.int32, quantization=no_scale_quant - ), - ) - 
add10_op.add_input_tensor(shifted_sum_minus_one) - add10_op.activation = activation.clone() - add10_op.attrs["rescale"] = (1, 1) - half_denominator = Tensor(sum_of_exp.shape, DataType.int32, f"{add10_op.name}_0") - half_denominator.quantization = one_scale_quant - add10_op.set_output_tensor(half_denominator) - DebugDatabase.add_optimised(self.op, add10_op) - pass_number += 1 + f0_one_const = create_const_tensor( + "F0_one_const", [1, 1, 1, 1], DataType.int32, [(1 << 31) - 1], np.int32, quantization=no_scale_quant + ) + half_denominator = add_op_get_ofm( + create_add( + f"{self.op.name}_add{pass_number}", + f0_one_const, + shifted_sum_minus_one, + one_scale_quant, + activation, + attrs={"rescale": (1, 1)}, + ) + ) # PASS 11 - Multiply - mul11_op = Operation(Op.Mul, f"{self.op.name}_mul{pass_number}") - mul11_op.add_input_tensor(half_denominator) - mul11_op.add_input_tensor( - create_const_tensor( - "neg_32_over_17_const", - [1, 1, 1, 1], - DataType.int32, - [-1010580540], - np.int32, - quantization=one_scale_quant, - ), + neg_32_over_17 = create_const_tensor( + "neg_32_over_17_const", [1, 1, 1, 1], DataType.int32, [-1010580540], np.int32, quantization=one_scale_quant + ) + rescaled = add_op_get_ofm( + create_mul( + f"{self.op.name}_mul{pass_number}", half_denominator, neg_32_over_17, two_scale_quant, activation2, + ) ) - rescaled = Tensor(sum_of_exp.shape, DataType.int32, f"{mul11_op.name}_0") - rescaled.quantization = one_scale_quant.clone() - rescaled.quantization.scale_f32 = 2.0 - mul11_op.activation = activation2.clone() - mul11_op.set_output_tensor(rescaled) - DebugDatabase.add_optimised(self.op, mul11_op) - pass_number += 1 # PASS 12 - Add - add12_op = Operation(Op.Add, f"{self.op.name}_add{pass_number}") - add12_op.add_input_tensor(rescaled) - add12_op.add_input_tensor( - create_const_tensor( - "48_over_17_const", [1, 1, 1, 1], DataType.int32, [1515870810], np.int32, quantization=no_scale_quant - ), + const_48_over_17 = create_const_tensor( + 
"48_over_17_const", [1, 1, 1, 1], DataType.int32, [1515870810], np.int32, quantization=no_scale_quant + ) + rescale_w_offset = add_op_get_ofm( + create_add(f"{self.op.name}_add{pass_number}", rescaled, const_48_over_17, one_scale_quant, activation,) ) - add12_op.activation = activation.clone() - rescale_w_offset = Tensor(sum_of_exp.shape, DataType.int32, f"{add12_op.name}_0") - rescale_w_offset.quantization = one_scale_quant - add12_op.set_output_tensor(rescale_w_offset) - DebugDatabase.add_optimised(self.op, add12_op) - pass_number += 1 + # PASS 13 - 27 nr_x = rescale_w_offset F2_one = create_const_tensor( "F2_one_const", [1, 1, 1, 1], DataType.int32, [(1 << 29)], np.int32, quantization=no_scale_quant @@ -456,80 +372,44 @@ class SoftMax: ) for _ in range(3): # PASS 13, 18, 23 - MUL - mul_op = Operation(Op.Mul, f"{self.op.name}_mul{pass_number}") - mul_op.add_input_tensor(nr_x) - mul_op.add_input_tensor(half_denominator) - mul_op.activation = activation2.clone() - half_denominator_times_x = Tensor(sum_of_exp.shape, DataType.int32, f"{mul_op.name}_0") - half_denominator_times_x.quantization = one_scale_quant.clone() - half_denominator_times_x.quantization.scale_f32 = 2.0 - mul_op.set_output_tensor(half_denominator_times_x) - pass_number += 1 + half_denominator_times_x = add_op_get_ofm( + create_mul(f"{self.op.name}_mul{pass_number}", nr_x, half_denominator, two_scale_quant, activation2,) + ) # PASS 14, 19, 24 - SUB - sub_op = Operation(Op.Sub, f"{self.op.name}_sub{pass_number}") - sub_op.add_input_tensor(F2_one) - sub_op.add_input_tensor(half_denominator_times_x) - sub_op.activation = activation.clone() - one_minus_half_denominator_times_x = Tensor(sum_of_exp.shape, DataType.int32, f"{sub_op.name}_0") - one_minus_half_denominator_times_x.quantization = one_scale_quant - sub_op.set_output_tensor(one_minus_half_denominator_times_x) - DebugDatabase.add_optimised(self.op, sub_op) - pass_number += 1 + one_minus_half_denominator_times_x = add_op_get_ofm( + create_sub( + 
f"{self.op.name}_sub{pass_number}", F2_one, half_denominator_times_x, one_scale_quant, activation, + ) + ) # PASS 15, 20, 25 - MUL - mul_op = Operation(Op.Mul, f"{self.op.name}_mul{pass_number}") - mul_op.add_input_tensor(nr_x) - mul_op.add_input_tensor(one_minus_half_denominator_times_x) - mul_op.activation = activation2.clone() - to_rescale = Tensor(sum_of_exp.shape, DataType.int32, f"{mul_op.name}_0") - to_rescale.quantization = one_scale_quant.clone() - to_rescale.quantization.scale_f32 = 2.0 - mul_op.set_output_tensor(to_rescale) - pass_number += 1 + to_rescale = add_op_get_ofm( + create_mul( + f"{self.op.name}_mul{pass_number}", + nr_x, + one_minus_half_denominator_times_x, + two_scale_quant, + activation2, + ) + ) # PASS 16, 21, 26 - MUL - shl_op = Operation(Op.Mul, f"{self.op.name}_mul{pass_number}") - shl_op.add_input_tensor(to_rescale) - shl_op.add_input_tensor(four) - shl_op.activation = activation.clone() - to_add = Tensor(sum_of_exp.shape, DataType.int32, f"{shl_op.name}_0") - to_add.quantization = no_scale_quant - shl_op.set_output_tensor(to_add) - DebugDatabase.add_optimised(self.op, shl_op) - pass_number += 1 + to_add = add_op_get_ofm( + create_mul(f"{self.op.name}_mul{pass_number}", to_rescale, four, no_scale_quant, activation) + ) # PASS 17, 22, 27 - ADD - add_op = Operation(Op.Add, f"{self.op.name}_add{pass_number}") - add_op.add_input_tensor(nr_x) - add_op.add_input_tensor(to_add) - add_op.activation = activation.clone() - nr_x = Tensor(sum_of_exp.shape, DataType.int32, f"{add_op.name}_0") - nr_x.quantization = one_scale_quant - add_op.set_output_tensor(nr_x) - DebugDatabase.add_optimised(self.op, add_op) - pass_number += 1 + nr_x = add_op_get_ofm( + create_add(f"{self.op.name}_add{pass_number}", nr_x, to_add, one_scale_quant, activation) + ) # PASS 28 - Multiply - mul28_op = Operation(Op.Mul, f"{self.op.name}_mul{pass_number}") - mul28_op.add_input_tensor(nr_x) - mul28_op.add_input_tensor( - create_const_tensor("two_const", [1, 1, 1, 1], 
DataType.int32, [2], np.int32, quantization=no_scale_quant) - ) - mul28_op.activation = activation.clone() - scale_factor = Tensor(sum_of_exp.shape, DataType.int32, f"{mul28_op.name}_0") - scale_factor.quantization = one_scale_quant - mul28_op.set_output_tensor(scale_factor) - DebugDatabase.add_optimised(self.op, mul28_op) - pass_number += 1 + two = create_const_tensor("two_const", [1, 1, 1, 1], DataType.int32, [2], np.int32, quantization=no_scale_quant) + scale_factor = add_op_get_ofm( + create_mul(f"{self.op.name}_mul{pass_number}", nr_x, two, one_scale_quant, activation) + ) # PASS 29 - Multiply - mul_op = Operation(Op.Mul, f"{self.op.name}_mul{pass_number}") - mul_op.add_input_tensor(ifm_exp) - mul_op.add_input_tensor(scale_factor) - mul_op.activation = activation2.clone() - scaled_exp = Tensor(ifm_exp.shape, DataType.int32, f"{mul_op.name}_0") - scaled_exp.quantization = one_scale_quant.clone() - scaled_exp.quantization.scale_f32 = 2.0 - mul_op.set_output_tensor(scaled_exp) - DebugDatabase.add_optimised(self.op, mul_op) - pass_number += 1 + scaled_exp = add_op_get_ofm( + create_mul(f"{self.op.name}_mul{pass_number}", ifm_exp, scale_factor, two_scale_quant, activation2) + ) # PASS 30 - SHR shr30_op = Operation(Op.SHR, f"{self.op.name}_shr{pass_number}") @@ -538,7 +418,6 @@ class SoftMax: shr30_op.add_input_tensor(right_shift) shr30_op.set_output_tensor(ofm) DebugDatabase.add_optimised(self.op, shr30_op) - pass_number += 1 return shr30_op @@ -547,176 +426,97 @@ class SoftMax: no_scale_quant.scale_f32 = None pass_number = 0 + def add_op_get_ofm(op): + DebugDatabase.add_optimised(self.op, op) + nonlocal pass_number + pass_number += 1 + return op.ofm + # PASS 0 - Depthwise Maxpool - maxpool_op = self.op.clone(f"_maxpool{pass_number}") - maxpool_op.type = Op.MaxPool - DebugDatabase.add_optimised(self.op, maxpool_op) - maxpool_h = ifm.shape[1] * ifm.shape[2] - maxpool_w = ifm.shape[3] - maxpool_ifm_shape = [1, maxpool_h, maxpool_w, 1] - maxpool_op.attrs["padding"] = 
b"VALID" - maxpool_op.attrs["stride_w"] = 1 - maxpool_op.attrs["stride_h"] = 1 - maxpool_op.attrs["filter_width"] = maxpool_w - maxpool_op.attrs["filter_height"] = 1 - maxpool_op.attrs["strides"] = [1, maxpool_op.attrs["stride_h"], maxpool_op.attrs["stride_w"], 1] - maxpool_op.attrs["ksize"] = [1, maxpool_op.attrs["filter_height"], maxpool_op.attrs["filter_width"], 1] - maxpool_op.inputs = [create_reshape_tensor(ifm, maxpool_ifm_shape)] - ifm_max = Tensor([1, maxpool_h, 1, 1], ifm.dtype, f"{maxpool_op.name}_0") - ifm_max.quantization = no_scale_quant - maxpool_op.set_output_tensor(ifm_max) - DebugDatabase.add_optimised(self.op, maxpool_op) - pass_number += 1 + ifm_max = add_op_get_ofm(create_depthwise_maxpool(f"{self.op.name}_maxpool{pass_number}", ifm, no_scale_quant)) # PASS 1 - Sub - sub1_op = Operation(Op.Sub, f"{self.op.name}_sub{pass_number}") - sub1_op.add_input_tensor(ifm) - sub1_op.add_input_tensor(create_reshape_tensor(ifm_max, [1, ifm.shape[1], ifm.shape[2], 1])) - sub1_ofm = Tensor(ifm.shape, DataType.int32, f"{sub1_op.name}_0") - sub1_ofm.quantization = ifm.quantization.clone() - sub1_op.set_output_tensor(sub1_ofm) - DebugDatabase.add_optimised(self.op, sub1_op) - pass_number += 1 + ifm_max = create_reshape_tensor(ifm_max, [1, ifm.shape[1], ifm.shape[2], 1]) + sub1_ofm = add_op_get_ofm( + create_sub(f"{self.op.name}_sub{pass_number}", ifm, ifm_max, ifm.quantization.clone(), dtype=DataType.int32) + ) # PASS 2 - Mul + name = f"{self.op.name}_mul{pass_number}" beta = self.op.attrs.get("beta", 1.0) mul2_out_range = 10.0 / 65535.0 mul2_scale, _ = scaling.elementwise_mul_scale(sub1_ofm.quantization.scale_f32, beta, mul2_out_range) - mul2_quant = ifm.quantization.clone() - mul2_quant.scale_f32 = beta - mul2_op = Operation(Op.Mul, f"{self.op.name}_mul{pass_number}") - mul2_op.add_input_tensor(sub1_ofm) - mul2_op.add_input_tensor( - create_const_tensor( - f"{mul2_op.name}_const", [1, 1, 1, 1], DataType.int32, [mul2_scale], np.int32, quantization=mul2_quant - ), 
+ scale_quant = ifm.quantization.clone() + scale_quant.scale_f32 = beta + mul2_quant = ofm.quantization.clone() + mul2_quant.scale_f32 = mul2_out_range + scale = create_const_tensor( + f"{name}_scale_const", [1, 1, 1, 1], DataType.int32, [mul2_scale], np.int32, quantization=scale_quant ) - mul2_ofm = Tensor(ifm.shape, DataType.int32, f"{self.op.name}_mul{pass_number}") - mul2_ofm.quantization = ofm.quantization.clone() - mul2_ofm.quantization.scale_f32 = mul2_out_range - mul2_op.set_output_tensor(mul2_ofm) - DebugDatabase.add_optimised(self.op, mul2_op) - pass_number += 1 + mul2_ofm = add_op_get_ofm(create_mul(name, sub1_ofm, scale, mul2_quant)) # PASS 3 - Add+LUT(exp) - add_op = Operation(Op.Add, f"{self.op.name}_add{pass_number}") - add_op.add_input_tensor(mul2_ofm) - add_op.add_input_tensor( - create_const_tensor( - f"{add_op.name}_const", [1, 1, 1, 1], DataType.int32, [32767], np.int32, quantization=no_scale_quant - ), + name = f"{self.op.name}_add{pass_number}" + const_add = create_const_tensor( + f"{name}_const", [1, 1, 1, 1], DataType.int32, [32767], np.int32, quantization=no_scale_quant ) + add_op = create_add(name, mul2_ofm, const_add, mul2_ofm.quantization.clone(), dtype=DataType.int16) add_op.set_activation_lut( create_const_tensor( - f"{add_op.name}_lut", [1, 1, 1, 512], DataType.int32, self.EXP_LUT, np.int32, TensorPurpose.LUT + f"{name}_exp_lut", [1, 1, 1, 512], DataType.int32, self.EXP_LUT, np.int32, TensorPurpose.LUT ) ) - exp_ofm = Tensor(mul2_ofm.shape, DataType.int16, f"{add_op.name}_0") - exp_ofm.quantization = mul2_ofm.quantization.clone() - add_op.set_output_tensor(exp_ofm) - DebugDatabase.add_optimised(self.op, add_op) - pass_number += 1 + ifm_exp = add_op_get_ofm(add_op) # PASS 4 - Reduce sum - reduce_sum_op = Operation(Op.ReduceSum, self.op.name + "_reduce_sum4") - reduce_sum_op.attrs["padding"] = b"VALID" - reduce_sum_op.attrs["stride_w"] = 1 - reduce_sum_op.attrs["stride_h"] = 1 - reduce_sum_op.attrs["filter_width"] = 1 - 
reduce_sum_op.attrs["filter_height"] = 1 - reduce_sum_op.attrs["strides"] = [1, reduce_sum_op.attrs["stride_h"], reduce_sum_op.attrs["stride_w"], 1] - reduce_sum_op.attrs["ksize"] = [1, reduce_sum_op.attrs["filter_height"], reduce_sum_op.attrs["filter_width"], 1] - reduce_sum_op.add_input_tensor(exp_ofm) - - reduce_sum_shape = [1, exp_ofm.shape[1], exp_ofm.shape[2], 1] - sum_of_exp = Tensor(reduce_sum_shape, DataType.int32, f"{reduce_sum_op.name}_0") - sum_of_exp.quantization = no_scale_quant - reduce_sum_op.set_output_tensor(sum_of_exp) - DebugDatabase.add_optimised(self.op, reduce_sum_op) - pass_number += 1 + sum_of_exp = add_op_get_ofm( + create_reduce_sum(f"{self.op.name}_reduce_sum{pass_number}", ifm_exp, no_scale_quant) + ) # PASS 5 - CLZ - clz_op = Operation(Op.CLZ, f"{self.op.name}_clz{pass_number}") - clz_op.add_input_tensor(sum_of_exp) - headroom_plus_one = Tensor(sum_of_exp.shape, DataType.int32, f"{clz_op.name}_0") - headroom_plus_one.quantization = no_scale_quant - clz_op.set_output_tensor(headroom_plus_one) - DebugDatabase.add_optimised(self.op, clz_op) - pass_number += 1 + headroom_plus_one = add_op_get_ofm(create_clz(f"{self.op.name}_clz{pass_number}", sum_of_exp, no_scale_quant)) # PASS 6 - Sub - sub6_op = Operation(Op.Sub, f"{self.op.name}_sub{pass_number}") - sub6_op.add_input_tensor( - create_const_tensor( - f"{sub6_op.name}_const", [1, 1, 1, 1], DataType.int32, [31], np.int32, quantization=no_scale_quant - ), + name = f"{self.op.name}_sub{pass_number}" + const_31 = create_const_tensor( + f"{name}_const", [1, 1, 1, 1], DataType.int32, [31], np.int32, quantization=no_scale_quant ) - sub6_op.add_input_tensor(headroom_plus_one) - reciprocal_right_shift = Tensor(sum_of_exp.shape, DataType.int32, f"{sub6_op.name}_0") - reciprocal_right_shift.quantization = no_scale_quant - sub6_op.set_output_tensor(reciprocal_right_shift) - DebugDatabase.add_optimised(self.op, sub6_op) - pass_number += 1 + reciprocal_right_shift = add_op_get_ofm(create_sub(name, 
const_31, headroom_plus_one, no_scale_quant)) # PASS 7 - SHL - shl7_op = Operation(Op.SHL, f"{self.op.name}_shl{pass_number}") - shl7_op.add_input_tensor( - create_const_tensor( - f"{shl7_op.name}_const", [1, 1, 1, 1], DataType.int32, [1], np.int32, quantization=no_scale_quant - ), + one = create_const_tensor( + f"one_const", [1, 1, 1, 1], DataType.int32, [1], np.int32, quantization=no_scale_quant + ) + constant_one = add_op_get_ofm( + create_shl(f"{self.op.name}_shl{pass_number}", one, reciprocal_right_shift, no_scale_quant) ) - shl7_op.add_input_tensor(reciprocal_right_shift) - constant_one = Tensor(sum_of_exp.shape, DataType.int32, f"{shl7_op.name}_0") - constant_one.quantization = no_scale_quant - shl7_op.set_output_tensor(constant_one) - DebugDatabase.add_optimised(self.op, shl7_op) - pass_number += 1 # PASS 8 - Sub - sub8_op = Operation(Op.Sub, f"{self.op.name}_sub{pass_number}") - sub8_op.add_input_tensor(sum_of_exp) - sub8_op.add_input_tensor(constant_one) - sum_of_exps_minus_one = Tensor(sum_of_exp.shape, DataType.int32, f"{sub8_op.name}_0") - sum_of_exps_minus_one.quantization = no_scale_quant - sub8_op.set_output_tensor(sum_of_exps_minus_one) - DebugDatabase.add_optimised(self.op, sub8_op) - pass_number += 1 + sum_of_exps_minus_one = add_op_get_ofm( + create_sub(f"{self.op.name}_sub{pass_number}", sum_of_exp, constant_one, no_scale_quant) + ) # PASS 9 - SHL - shl9_op = Operation(Op.SHL, f"{self.op.name}_shl{pass_number}") - shl9_op.add_input_tensor(sum_of_exps_minus_one) - shl9_op.add_input_tensor(headroom_plus_one) - shifted_sum_minus_one = Tensor(sum_of_exp.shape, DataType.int32, f"{shl9_op.name}_0") - shifted_sum_minus_one.quantization = no_scale_quant - shl9_op.set_output_tensor(shifted_sum_minus_one) - DebugDatabase.add_optimised(self.op, shl9_op) - pass_number += 1 + shifted_sum_minus_one = add_op_get_ofm( + create_shl(f"{self.op.name}_shl{pass_number}", sum_of_exps_minus_one, headroom_plus_one, no_scale_quant) + ) # PASS 10 - SHR - shr10_op = 
Operation(Op.SHR, f"{self.op.name}_shr{pass_number}") - shr10_op.add_input_tensor(shifted_sum_minus_one) - shr10_op.add_input_tensor( - create_const_tensor( - f"{shr10_op.name}_const", [1, 1, 1, 1], DataType.int32, [15], np.int32, quantization=no_scale_quant - ), + name = f"{self.op.name}_shr{pass_number}" + shift = create_const_tensor( + f"{name}_const", [1, 1, 1, 1], DataType.int32, [15], np.int32, quantization=no_scale_quant ) - shifted_sum_minus_one_16 = Tensor(sum_of_exp.shape, DataType.int32, f"{shr10_op.name}_0") - shifted_sum_minus_one_16.quantization = shifted_sum_minus_one.quantization.clone() - shr10_op.set_output_tensor(shifted_sum_minus_one_16) - DebugDatabase.add_optimised(self.op, shr10_op) - pass_number += 1 + shifted_sum_minus_one_16 = add_op_get_ofm(create_shr(name, shifted_sum_minus_one, shift, no_scale_quant)) # PASS 11 - Sub+LUT(one over one plus x) - sub11_op = Operation(Op.Sub, f"{self.op.name}_sub{pass_number}") - sub11_op.add_input_tensor(shifted_sum_minus_one_16) - sub11_op.add_input_tensor( - create_const_tensor( - f"{sub11_op.name}_const", [1, 1, 1, 1], DataType.int32, [32768], np.int32, quantization=no_scale_quant - ), + name = f"{self.op.name}_sub{pass_number}" + sub11_const = create_const_tensor( + f"{name}_const", [1, 1, 1, 1], DataType.int32, [32768], np.int32, quantization=no_scale_quant ) + sub11_op = create_sub(name, shifted_sum_minus_one_16, sub11_const, no_scale_quant, dtype=DataType.int16) sub11_op.set_activation_lut( create_const_tensor( - f"{sub11_op.name}_lut", + f"{name}_one_over_one_plus_x_lut", [1, 1, 1, 512], DataType.int32, self.ONE_OVER_ONE_PLUS_X_LUT, @@ -724,21 +524,14 @@ class SoftMax: TensorPurpose.LUT, ) ) - reciprocal_scale = Tensor(sum_of_exp.shape, DataType.int16, f"{sub11_op.name}_0") - reciprocal_scale.quantization = no_scale_quant - sub11_op.set_output_tensor(reciprocal_scale) - DebugDatabase.add_optimised(self.op, sub11_op) - pass_number += 1 + reciprocal_scale = add_op_get_ofm(sub11_op) # PASS 12 - 
Multiply - mul_op = Operation(Op.Mul, f"{self.op.name}_mul{pass_number}") - mul_op.add_input_tensor(exp_ofm) - mul_op.add_input_tensor(reciprocal_scale) - mul_ofm = Tensor(exp_ofm.shape, DataType.int32, f"{mul_op.name}_0") - mul_ofm.quantization = no_scale_quant - mul_op.set_output_tensor(mul_ofm) - DebugDatabase.add_optimised(self.op, mul_op) - pass_number += 1 + mul_ofm = add_op_get_ofm( + create_mul( + f"{self.op.name}_mul{pass_number}", ifm_exp, reciprocal_scale, no_scale_quant, dtype=DataType.int32 + ) + ) # PASS 13 - SHR shr13_op = Operation(Op.SHR, f"{self.op.name}_shr{pass_number}") @@ -746,6 +539,5 @@ class SoftMax: shr13_op.add_input_tensor(reciprocal_right_shift) shr13_op.set_output_tensor(ofm) DebugDatabase.add_optimised(self.op, shr13_op) - pass_number += 1 return shr13_op |