diff options
author | Johan Alfvén <johan.alfven@arm.com> | 2022-04-19 16:07:05 +0200 |
---|---|---|
committer | tim.hall <tim.hall@arm.com> | 2022-05-18 17:36:00 +0000 |
commit | 0b20781caf7e4fa22f9b9bf2c080c3dbeff8c643 (patch) | |
tree | 5080a97c35aaeb1c3430eb33e4cda59c062dcdd7 /ethosu | |
parent | 1538dce9f07a310587f057aee5fbe25509963879 (diff) | |
download | ethos-u-vela-0b20781caf7e4fa22f9b9bf2c080c3dbeff8c643.tar.gz |
MLBEDSW-6430: MLCE: Vela splitting network into two ethos operators
- Due to how the graph is traversed, the final pass list could contain more
Ethos-U operators than necessary. Functionally this is not a problem, but it
adds extra context switching between CPU and NPU.
- By applying sorting rules to the pass list, it is possible to create a more
optimal pass list that reduces the number of Ethos-U operators.
Signed-off-by: Johan Alfven <johan.alfven@arm.com>
Change-Id: Ib556f902e1f321b5c50238fada7aa92b9810b27a
Diffstat (limited to 'ethosu')
-rw-r--r-- | ethosu/vela/pass_packing.py | 63 |
1 files changed, 62 insertions, 1 deletions
diff --git a/ethosu/vela/pass_packing.py b/ethosu/vela/pass_packing.py index 8535fa06..74e1f344 100644 --- a/ethosu/vela/pass_packing.py +++ b/ethosu/vela/pass_packing.py @@ -461,7 +461,68 @@ def pack_into_passes(nng, arch, verbose_packing=False): startup_ps.outputs = [op.outputs[0] for op in startup_list] # Need to fixup the outputs startup_ps.name = "startup_weight_initialisation" - sg.passes = list(reversed(reverse_pass_list)) + # Graphs with both CPU and NPU ops might not have an optimal order in + # the pass list due to how the graph is traversed (depth first search). + # This can result in more context switching between CPU and NPU. + # Try to optmize this by moving/grouping CPU ops where that is possible. + # Criteria for CPU pass to be moved: + # + # 1) CPU passes that only depends on sg.input_tensor can be + # moved to the top of the list. + # + # 2) A CPU pass X is allowed to be grouped together with CPU pass Y + # if there is no NPU pass between pass X and pass Y that depends + # on output from pass X or a MemoryOnly pass. + # + # Criteria 2 will try to move as many CPU passes towards the bottom of + # the list. + + pass_list_top = [] + pass_list = [] + + # Filter out early passes from the rest + for ps in list(reversed(reverse_pass_list)): + if startup_ps == ps: + # startup pass belongs in the top + pass_list_top.insert(0, ps) + continue + + if ( + ps.placement == PassPlacement.Cpu + and ps.ops[0].ifm in sg.input_tensors + and (ps.ops[0].ifm2 in sg.input_tensors or ps.ops[0].ifm2 is None) + ): + # This CPU pass only depends on sg.input_tensors + pass_list_top.append(ps) + else: + # Add pass to the list that will be sorted in the next step + pass_list.append(ps) + + # Sort the rest of the list based on critera 2. + # Search from bottom of list and when a CPU pass is found + # search forward in the list and see if it is possible to join another CPU pass. 
+ for cpu_ps in reversed(pass_list): + if cpu_ps.placement != PassPlacement.Cpu: + continue + # CPU pass found, search forward and move pass if possible + idx = pass_list.index(cpu_ps) + for next_ps in pass_list[idx + 1 :]: + if next_ps.placement == PassPlacement.Cpu: + # It is possible to move the CPU pass + pass_list.remove(cpu_ps) + insert_index = pass_list.index(next_ps) + pass_list.insert(insert_index, cpu_ps) + break + if ( + cpu_ps.ops[0].ofm not in [next_ps.ops[0].ifm, next_ps.ops[0].ifm2] + and next_ps.placement != PassPlacement.MemoryOnly + ): + continue + + break + pass_list_top.extend(pass_list) + + sg.passes = pass_list_top sg.build_pass_links() if verbose_packing: |