diff options
Diffstat (limited to 'ethosu/vela/softmax.py')
-rw-r--r-- | ethosu/vela/softmax.py | 40 |
1 files changed, 40 insertions, 0 deletions
diff --git a/ethosu/vela/softmax.py b/ethosu/vela/softmax.py index 12c20164..efd91a35 100644 --- a/ethosu/vela/softmax.py +++ b/ethosu/vela/softmax.py @@ -25,6 +25,7 @@ import numpy as np from . import fp_math from . import scaling from .data_type import DataType +from .debug_database import DebugDatabase from .operation import Op from .operation import Operation from .tensor import create_const_tensor @@ -220,6 +221,9 @@ class SoftMax: def get_graph_8bit(self, ifm, ofm): exp_lut = self.generate_exp_table(self.op.attrs.get("beta", 1.0), ifm.quantization.scale_f32) + ifm = create_reshape_tensor(ifm, ifm.get_full_shape()) + DebugDatabase.add_optimised(self.op, ifm.ops[0]) + ofm = create_reshape_tensor(ofm, ofm.get_full_shape(), False) no_scale_quant = ifm.quantization.clone() no_scale_quant.scale_f32 = None no_scale_quant.zero_point = 0 @@ -245,6 +249,7 @@ class SoftMax: ifm_max = Tensor([1, maxpool_h, 1, 1], ifm.dtype, maxpool_op.name + "_0") ifm_max.quantization = no_scale_quant maxpool_op.set_output_tensor(ifm_max) + DebugDatabase.add_optimised(self.op, maxpool_op) # PASS 1 - Sub+LUT(exp) sub_op = Operation(Op.Sub, self.op.name + "_sub1") @@ -261,6 +266,7 @@ class SoftMax: ifm_exp.quantization.quant_min = -128 ifm_exp.quantization.quant_max = 127 sub_op.set_output_tensor(ifm_exp) + DebugDatabase.add_optimised(self.op, sub_op) # PASS 2 - SHR shr2_op = Operation(Op.SHR, self.op.name + "_shr2") @@ -274,6 +280,7 @@ class SoftMax: rescaled_exp = Tensor(ifm.shape, ifm_exp.dtype, shr2_op.name + "_0") rescaled_exp.quantization = no_scale_quant shr2_op.set_output_tensor(rescaled_exp) + DebugDatabase.add_optimised(self.op, shr2_op) # PASS 3 - Reduce sum reduce_sum_op = Operation(Op.ReduceSum, self.op.name + "_reduce_sum3") @@ -290,6 +297,7 @@ class SoftMax: sum_of_exp = Tensor(reduce_sum_shape, DataType.int32, reduce_sum_op.name + "_0") sum_of_exp.quantization = no_scale_quant reduce_sum_op.set_output_tensor(sum_of_exp) + DebugDatabase.add_optimised(self.op, reduce_sum_op) # PASS 4 - CLZ clz_op = Operation(Op.CLZ, self.op.name + "_clz4") @@ -297,6 +305,7 @@ class SoftMax: headroom_plus_one = Tensor(reduce_sum_shape, DataType.int32, clz_op.name + "_0") headroom_plus_one.quantization = no_scale_quant clz_op.set_output_tensor(headroom_plus_one) + DebugDatabase.add_optimised(self.op, clz_op) # PASS 5 - Sub sub5_op = Operation(Op.Sub, self.op.name + "_sub5") @@ -314,6 +323,7 @@ class SoftMax: right_shift = Tensor(reduce_sum_shape, DataType.int32, sub5_op.name + "_0") right_shift.quantization = no_scale_quant sub5_op.set_output_tensor(right_shift) + DebugDatabase.add_optimised(self.op, sub5_op) # PASS 6 - Sub one = create_const_tensor("one_const", [1, 1, 1, 1], DataType.int32, [1], np.int32, quantization=no_scale_quant) @@ -323,6 +333,7 @@ class SoftMax: headroom = Tensor(reduce_sum_shape, DataType.int32, sub6_op.name + "_0") headroom.quantization = no_scale_quant sub6_op.set_output_tensor(headroom) + DebugDatabase.add_optimised(self.op, sub6_op) # PASS 7 - SHL shl7_op = Operation(Op.SHL, self.op.name + "_shl7") @@ -331,6 +342,7 @@ class SoftMax: shifted_sum = Tensor(reduce_sum_shape, DataType.int32, shl7_op.name + "_0") shifted_sum.quantization = no_scale_quant shl7_op.set_output_tensor(shifted_sum) + DebugDatabase.add_optimised(self.op, shl7_op) # PASS 8 - Sub sub8_op = Operation(Op.Sub, self.op.name + "_sub8") @@ -343,6 +355,7 @@ class SoftMax: shifted_sum_minus_one = Tensor(reduce_sum_shape, DataType.int32, sub8_op.name + "_0") shifted_sum_minus_one.quantization = no_scale_quant sub8_op.set_output_tensor(shifted_sum_minus_one) + DebugDatabase.add_optimised(self.op, sub8_op) # PASS 9 - SHL shl9_op = Operation(Op.SHL, self.op.name + "_shl9") @@ -351,6 +364,7 @@ class SoftMax: shifted_sum_minus_one = Tensor(reduce_sum_shape, DataType.int32, shl9_op.name + "_0") shifted_sum_minus_one.quantization = no_scale_quant shl9_op.set_output_tensor(shifted_sum_minus_one) + DebugDatabase.add_optimised(self.op, shl9_op) # PASS 10 - Add add10_op = Operation(Op.Add, self.op.name + "_add10") @@ -364,6 +378,7 @@ class SoftMax: half_denominator = Tensor(reduce_sum_shape, DataType.int32, add10_op.name + "_0") half_denominator.quantization = one_scale_quant add10_op.set_output_tensor(half_denominator) + DebugDatabase.add_optimised(self.op, add10_op) # PASS 11 - Multiply mul11_op = Operation(Op.Mul, self.op.name + "_mul11") @@ -382,6 +397,7 @@ class SoftMax: rescaled.quantization = one_scale_quant.clone() rescaled.quantization.scale_f32 = 2.0 mul11_op.set_output_tensor(rescaled) + DebugDatabase.add_optimised(self.op, mul11_op) # PASS 12 - Add add12_op = Operation(Op.Add, self.op.name + "_add12") @@ -394,6 +410,7 @@ class SoftMax: rescale_w_offset = Tensor(reduce_sum_shape, DataType.int32, add12_op.name + "_0") rescale_w_offset.quantization = one_scale_quant add12_op.set_output_tensor(rescale_w_offset) + DebugDatabase.add_optimised(self.op, add12_op) nr_x = rescale_w_offset F2_one = create_const_tensor( @@ -411,6 +428,7 @@ class SoftMax: half_denominator_times_x.quantization = one_scale_quant.clone() half_denominator_times_x.quantization.scale_f32 = 2.0 mul_op.set_output_tensor(half_denominator_times_x) + DebugDatabase.add_optimised(self.op, mul_op) # PASS 14, 19, 24 - SUB sub_op = Operation(Op.Sub, self.op.name + "_sub%d" % (14 + i * 5)) sub_op.add_input_tensor(F2_one) @@ -418,6 +436,7 @@ class SoftMax: one_minus_half_denominator_times_x = Tensor(reduce_sum_shape, DataType.int32, sub_op.name + "_0") one_minus_half_denominator_times_x.quantization = one_scale_quant sub_op.set_output_tensor(one_minus_half_denominator_times_x) + DebugDatabase.add_optimised(self.op, sub_op) # PASS 15, 20, 25 - MUL mul_op = Operation(Op.Mul, self.op.name + "_mul%d" % (15 + i * 5)) mul_op.add_input_tensor(nr_x) @@ -426,6 +445,7 @@ class SoftMax: to_rescale.quantization = one_scale_quant.clone() to_rescale.quantization.scale_f32 = 2.0 mul_op.set_output_tensor(to_rescale) + DebugDatabase.add_optimised(self.op, mul_op) # PASS 16, 21, 26 - MUL shl_op = Operation(Op.Mul, self.op.name + "_mul%d" % (16 + i * 5)) shl_op.add_input_tensor(to_rescale) @@ -433,6 +453,7 @@ class SoftMax: to_add = Tensor(reduce_sum_shape, DataType.int32, shl_op.name + "_0") to_add.quantization = no_scale_quant shl_op.set_output_tensor(to_add) + DebugDatabase.add_optimised(self.op, shl_op) # PASS 17, 22, 27 - ADD add_op = Operation(Op.Add, self.op.name + "_add%d" % (17 + i * 5)) add_op.add_input_tensor(nr_x) @@ -440,6 +461,7 @@ class SoftMax: nr_x = Tensor(reduce_sum_shape, DataType.int32, add_op.name + "_0") nr_x.quantization = one_scale_quant add_op.set_output_tensor(nr_x) + DebugDatabase.add_optimised(self.op, add_op) # PASS 28 - Multiply mul28_op = Operation(Op.Mul, self.op.name + "_mul28") @@ -450,6 +472,7 @@ class SoftMax: scale_factor = Tensor(reduce_sum_shape, DataType.int32, mul28_op.name + "_0") scale_factor.quantization = one_scale_quant mul28_op.set_output_tensor(scale_factor) + DebugDatabase.add_optimised(self.op, mul28_op) # PASS 29 - Multiply mul_op = Operation(Op.Mul, self.op.name + "_mul29") @@ -459,6 +482,7 @@ class SoftMax: scaled_exp.quantization = one_scale_quant.clone() scaled_exp.quantization.scale_f32 = 2.0 mul_op.set_output_tensor(scaled_exp) + DebugDatabase.add_optimised(self.op, mul_op) # PASS 30 - SHR shr30_op = Operation(Op.SHR, self.op.name + "_shr30") @@ -466,6 +490,7 @@ class SoftMax: shr30_op.add_input_tensor(scaled_exp) shr30_op.add_input_tensor(right_shift) shr30_op.set_output_tensor(ofm) + DebugDatabase.add_optimised(self.op, shr30_op) return shr30_op @@ -476,6 +501,7 @@ class SoftMax: # PASS 0 - Depthwise Maxpool maxpool_op = self.op.clone("_maxpool0") maxpool_op.type = Op.MaxPool + DebugDatabase.add_optimised(self.op, maxpool_op) maxpool_h = ifm.shape[1] * ifm.shape[2] maxpool_w = ifm.shape[3] maxpool_ifm_shape = [1, maxpool_h, maxpool_w, 1] @@ -490,6 +516,7 @@ class SoftMax: maxpool_ofm = Tensor([1, maxpool_h, 1, 1], ifm.dtype, maxpool_op.name + "_0") maxpool_ofm.quantization = no_scale_quant maxpool_op.set_output_tensor(maxpool_ofm) + DebugDatabase.add_optimised(self.op, maxpool_op) # PASS 1 - Sub sub1_op = Operation(Op.Sub, self.op.name + "_sub1") @@ -498,6 +525,7 @@ class SoftMax: sub1_ofm = Tensor(ifm.shape, DataType.int32, sub1_op.name + "_0") sub1_ofm.quantization = ifm.quantization.clone() sub1_op.set_output_tensor(sub1_ofm) + DebugDatabase.add_optimised(self.op, sub1_op) # PASS 2 - Mul beta = self.op.attrs.get("beta", 1.0) @@ -516,6 +544,7 @@ class SoftMax: mul2_ofm.quantization = ofm.quantization.clone() mul2_ofm.quantization.scale_f32 = mul2_out_range mul2_op.set_output_tensor(mul2_ofm) + DebugDatabase.add_optimised(self.op, mul2_op) # PASS 3 - Add+LUT(exp) add_op = Operation(Op.Add, self.op.name + "_add3") @@ -533,6 +562,7 @@ class SoftMax: exp_ofm = Tensor(mul2_ofm.shape, DataType.int16, add_op.name + "_0") exp_ofm.quantization = mul2_ofm.quantization.clone() add_op.set_output_tensor(exp_ofm) + DebugDatabase.add_optimised(self.op, add_op) # PASS 4 - Reduce sum reduce_sum_op = Operation(Op.ReduceSum, self.op.name + "_reduce_sum4") @@ -549,6 +579,7 @@ class SoftMax: sum_of_exp = Tensor(reduce_sum_shape, DataType.int32, reduce_sum_op.name + "_0") sum_of_exp.quantization = no_scale_quant reduce_sum_op.set_output_tensor(sum_of_exp) + DebugDatabase.add_optimised(self.op, reduce_sum_op) # PASS 5 - CLZ clz_op = Operation(Op.CLZ, self.op.name + "_clz5") @@ -556,6 +587,7 @@ class SoftMax: headroom_plus_one = Tensor(reduce_sum_shape, DataType.int32, clz_op.name + "_0") headroom_plus_one.quantization = no_scale_quant clz_op.set_output_tensor(headroom_plus_one) + DebugDatabase.add_optimised(self.op, clz_op) # PASS 6 - Sub sub6_op = Operation(Op.Sub, self.op.name + "_sub6") @@ -568,6 +600,7 @@ class SoftMax: reciprocal_right_shift = Tensor(reduce_sum_shape, DataType.int32, sub6_op.name + "_0") reciprocal_right_shift.quantization = no_scale_quant sub6_op.set_output_tensor(reciprocal_right_shift) + DebugDatabase.add_optimised(self.op, sub6_op) # PASS 7 - SHL shl7_op = Operation(Op.SHL, self.op.name + "_shl7") @@ -580,6 +613,7 @@ class SoftMax: constant_one = Tensor(reduce_sum_shape, DataType.int32, shl7_op.name + "_0") constant_one.quantization = no_scale_quant shl7_op.set_output_tensor(constant_one) + DebugDatabase.add_optimised(self.op, shl7_op) # PASS 8 - Sub sub8_op = Operation(Op.Sub, self.op.name + "_sub8") @@ -588,6 +622,7 @@ class SoftMax: sum_of_exps_minus_one = Tensor(reduce_sum_shape, DataType.int32, sub8_op.name + "_0") sum_of_exps_minus_one.quantization = no_scale_quant sub8_op.set_output_tensor(sum_of_exps_minus_one) + DebugDatabase.add_optimised(self.op, sub8_op) # PASS 9 - SHL shl9_op = Operation(Op.SHL, self.op.name + "_shl9") @@ -596,6 +631,7 @@ class SoftMax: shifted_sum_minus_one = Tensor(reduce_sum_shape, DataType.int32, shl9_op.name + "_0") shifted_sum_minus_one.quantization = no_scale_quant shl9_op.set_output_tensor(shifted_sum_minus_one) + DebugDatabase.add_optimised(self.op, shl9_op) # PASS 10 - SHR shr10_op = Operation(Op.SHR, self.op.name + "_shr10") @@ -608,6 +644,7 @@ class SoftMax: shifted_sum_minus_one_16 = Tensor(reduce_sum_shape, DataType.int32, shr10_op.name + "_0") shifted_sum_minus_one_16.quantization = shifted_sum_minus_one.quantization.clone() shr10_op.set_output_tensor(shifted_sum_minus_one_16) + DebugDatabase.add_optimised(self.op, shr10_op) # PASS 11 - Sub+LUT(one over one plus x) sub11_op = Operation(Op.Sub, self.op.name + "_sub11") @@ -630,6 +667,7 @@ class SoftMax: reciprocal_scale = Tensor(reduce_sum_shape, DataType.int16, sub11_op.name + "_0") reciprocal_scale.quantization = no_scale_quant sub11_op.set_output_tensor(reciprocal_scale) + DebugDatabase.add_optimised(self.op, sub11_op) # PASS 12 - Multiply mul_op = Operation(Op.Mul, self.op.name + "_mul12") @@ -638,11 +676,13 @@ class SoftMax: mul_ofm = Tensor(exp_ofm.shape, DataType.int32, mul_op.name + "_0") mul_ofm.quantization = no_scale_quant mul_op.set_output_tensor(mul_ofm) + DebugDatabase.add_optimised(self.op, mul_op) # PASS 13 - SHR shr13_op = Operation(Op.SHR, self.op.name + "_shr13") shr13_op.add_input_tensor(mul_ofm) shr13_op.add_input_tensor(reciprocal_right_shift) shr13_op.set_output_tensor(ofm) + DebugDatabase.add_optimised(self.op, shr13_op) return shr13_op |