From e6ccd87a2f40877cacdd9721a5116a6853dfe573 Mon Sep 17 00:00:00 2001
From: Tim Hall
Date: Mon, 9 Nov 2020 16:46:37 +0000
Subject: MLBEDSW-3019: Add profiling debug database

 - Added mechanism to track input to output graph transforms for
   debugging the resultant command stream.
 - Provides base implementation for MLBEDSW-2661

Signed-off-by: Tim Hall
Change-Id: I2dfe8a409fbde7ad0282bfab5acb11ba1c8b82d8
---
 ethosu/vela/graph_optimiser.py | 31 +++++++++++++++++++++++++++++++
 1 file changed, 31 insertions(+)

(limited to 'ethosu/vela/graph_optimiser.py')

diff --git a/ethosu/vela/graph_optimiser.py b/ethosu/vela/graph_optimiser.py
index e31348b5..73046302 100644
--- a/ethosu/vela/graph_optimiser.py
+++ b/ethosu/vela/graph_optimiser.py
@@ -25,6 +25,7 @@ from . import lut
 from . import rewrite_graph
 from . import scaling
 from .data_type import DataType
+from .debug_database import DebugDatabase
 from .errors import UnsupportedFeatureError
 from .ethos_u55_regs.ethos_u55_regs import resampling_mode
 from .numeric_util import clamp_sigmoid
@@ -77,6 +78,7 @@ def rewrite_concat(tens, arch, nng):
             new_op.attrs["concat_end"] = offset
             new_op.run_on_npu = True
             tens.ops.append(new_op)
+            DebugDatabase.add_optimised(concat_op, new_op)
         assert tens.shape[axis] == offset
 
         # If axis corresponds to C-dimension, NHCWB16 can only be used in the output if all the concat_start's are a
@@ -128,6 +130,7 @@ def rewrite_split(tens, arch, nng):
         new_op.attrs["split_end"] = offset_end
         new_op.run_on_npu = True
         new_op.set_output_tensor(tens)
+        DebugDatabase.add_optimised(split_op, new_op)
 
     return tens
 
@@ -399,6 +402,7 @@ def fixup_pack_input(op, arch, nng):
             reshape_op.attrs["new_shape"] = desired_shape
             reshape_op.inputs = [inp, new_shape_tens]
             reshape_op.set_output_tensor(reshape_out)
+            DebugDatabase.add_optimised(op, reshape_op)
 
             op.inputs[idx] = reshape_out
 
@@ -492,6 +496,7 @@ def fixup_unpack_output(tens, arch, nng):
             reshape_op.attrs["new_shape"] = reshape_input_shape
             reshape_op.inputs = [reshape_in, new_shape_tens]
             reshape_op.set_output_tensor(out_tens)
+            DebugDatabase.add_optimised(op, reshape_op)
 
             op.outputs[idx] = reshape_in
 
@@ -568,6 +573,7 @@ def convert_depthwise_to_conv(op, arch, nng):
                     op.attrs["depth_multiplier"], ifm_tensor.shape[3], ofm_tensor.shape[3]
                 )
             )
+        DebugDatabase.add_optimised(op, op)
     return op
 
 
@@ -616,6 +622,9 @@ def convert_conv_to_fc(op, arch, nng):
             reshape_op.set_output_tensor(orig_ofm_tensor)
             # Replace this ops OFM to point to the 2D tensor
             op.outputs[0] = fc_ofm_tensor
+            # Record optimisation in debug database
+            DebugDatabase.add_optimised(op, reshape_op)
+            DebugDatabase.add_optimised(op, op)
     return op
 
 
@@ -670,6 +679,10 @@ def fixup_act_reorder(op, arch, nng):
 
             # Mark the op so that it will be removed as passthrough later on
             op.type = Op.Identity
+
+            # Record optimisation in debug database
+            DebugDatabase.add_optimised(op, act_op)
+            DebugDatabase.add_optimised(op, op)
     return op
 
 
@@ -788,6 +801,10 @@ def convert_mul_max_to_abs_or_lrelu(op, arch, nng):
     op.name = op.name.replace("Maximum", new_op.name)
     op.outputs[0].name = op.outputs[0].name.replace("Maximum", new_op.name)
     op.inputs = [shared_in]
+
+    # Record optimisation in debug database
+    DebugDatabase.add_optimised(op, op)
+
     return op
 
 
@@ -812,6 +829,7 @@ def convert_lrelu_to_mul_max(op, arch):
     mul_alpha.add_input_tensor(alpha_tens)
     fm_alpha = ofm.clone(op.name + "_alpha")
     mul_alpha.set_output_tensor(fm_alpha)
+    DebugDatabase.add_optimised(op, mul_alpha)
 
     if check_quantized_tens_scaling_equal(ifm, ofm):
         # No identity multiplication is needed
@@ -832,6 +850,7 @@ def convert_lrelu_to_mul_max(op, arch):
         mul_identity.add_input_tensor(identity_tens)
         fm_id = ofm.clone(op.name + "_id")
         mul_identity.set_output_tensor(fm_id)
+        DebugDatabase.add_optimised(op, mul_alpha)
 
     # Convert LeakyRelu to Max, add the results of the multiplication(s) as inputs
     op.type = Op.Maximum
@@ -840,6 +859,8 @@ def convert_lrelu_to_mul_max(op, arch):
     ifm.consumer_list.remove(op)
     op.add_input_tensor(fm_alpha)
     op.add_input_tensor(fm_id)
+
+    DebugDatabase.add_optimised(op, op)
     return op
 
 
@@ -1012,6 +1033,7 @@ def fuse_activation_function_with_prev(op, arch, nng):
         prev_op.set_activation_lut(op.activation_lut)
     # Bypass op
     prev_op.set_output_tensor(ofm)
+    DebugDatabase.add_optimised(op, prev_op)
     return op
 
 
@@ -1052,6 +1074,11 @@ def supported_operator_check(op, arch, nng):
     return op
 
 
+def _record_optimised(op, arch):
+    if op.type != Op.Const:
+        DebugDatabase.add_optimised(op, op)
+
+
 def optimise_graph_a(nng, arch, verbose_graph=False):
     if verbose_graph:
         nng.print_graph()
@@ -1093,6 +1120,10 @@ def optimise_graph_a(nng, arch, verbose_graph=False):
             nng, sg, arch, [remove_passthrough_tensor], [fuse_activation_function_with_prev, add_padding_fields]
         )
 
+    # Post-optimisation operator debug tracing
+    for sg in nng.subgraphs:
+        rewrite_graph.visit_graph_post_order(sg.output_tensors, arch, [], [_record_optimised])
+
    if verbose_graph:
         nng.print_graph()
     return nng
-- 
cgit v1.2.1
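Editor's note: the sketch below illustrates the pattern this patch introduces, where each graph-optimisation pass records a link from the original (source) operation to the operation that replaces it via DebugDatabase.add_optimised(), so the final command stream can be traced back to the input network. DebugDatabase, Op, and rewrite_graph.visit_graph_post_order are names taken from the patch itself; DebugDatabaseSketch, source_of, and the toy Op class here are hypothetical stand-ins for illustration only, not the Vela implementation.

# Minimal standalone sketch (assumed names, not ethosu/vela/debug_database.py)
class DebugDatabaseSketch:
    _optimised = {}  # maps id(optimised op) -> the source op it was derived from

    @classmethod
    def add_optimised(cls, parent_op, optimised_op):
        # Record that optimised_op was produced from parent_op. Passing the same
        # object twice (add_optimised(op, op)) marks an op that survived a pass
        # unchanged, mirroring _record_optimised() in the patch.
        cls._optimised[id(optimised_op)] = parent_op

    @classmethod
    def source_of(cls, optimised_op):
        # Walk back through chained rewrites to the original source operation.
        op = optimised_op
        while id(op) in cls._optimised and cls._optimised[id(op)] is not op:
            op = cls._optimised[id(op)]
        return op


if __name__ == "__main__":
    class Op:  # toy stand-in for Vela's Operation class
        def __init__(self, name):
            self.name = name

    concat = Op("concat")
    slice_write = Op("concat0_ConcatSliceWrite")

    # A rewrite pass replaces 'concat' with 'slice_write' and records the link,
    # as rewrite_concat() does in the patch.
    DebugDatabaseSketch.add_optimised(concat, slice_write)
    print(DebugDatabaseSketch.source_of(slice_write).name)  # -> "concat"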