Diffstat (limited to 'ethosu/vela/compiler_driver.py')
-rw-r--r--  ethosu/vela/compiler_driver.py  16
1 file changed, 15 insertions(+), 1 deletion(-)
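
The patch imports a progress_print helper from .utils and calls it before each major compiler stage. The helper itself is not part of this diff; a minimal sketch, assuming it does nothing more than gate a stage message on the verbosity flag, could look like this:

    # Hypothetical sketch of the imported helper; the real ethosu/vela/utils.py
    # implementation is not shown in this patch and may differ.
    def progress_print(enabled, message):
        # Print the stage message only when verbose progress reporting is enabled.
        if enabled:
            print(message)
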
diff --git a/ethosu/vela/compiler_driver.py b/ethosu/vela/compiler_driver.py
index d2892096..51c97070 100644
--- a/ethosu/vela/compiler_driver.py
+++ b/ethosu/vela/compiler_driver.py
@@ -40,6 +40,7 @@ from .scheduler import OptimizationStrategy
from .tensor import MemArea
from .tensor import MemType
from .tensor import Tensor
+from .utils import progress_print
class CompilerOptions:
@@ -62,6 +63,7 @@ class CompilerOptions:
verbose_operators=False,
verbose_weights=False,
verbose_performance=False,
+ verbose_progress=False,
show_cpu_operations=False,
tensor_allocator=TensorAllocator.Greedy,
timing=False,
@@ -82,6 +84,7 @@ class CompilerOptions:
self.verbose_operators = verbose_operators
self.verbose_weights = verbose_weights
self.verbose_performance = verbose_performance
+ self.verbose_progress = verbose_progress
self.show_cpu_operations = show_cpu_operations
self.tensor_allocator = tensor_allocator
self.timing = timing
@@ -154,11 +157,13 @@ def _check_schedule(nng, arch, scheduler_options):
def compiler_driver(nng, arch, options, scheduler_options, network_type, output_basename):
assert verify_graph_health(nng)
+ verbose_progress = scheduler_options.verbose_progress
# Pre-optimisation operator tracking
for sg in nng.subgraphs:
visit_graph_post_order(sg.output_tensors, arch, [], [_record_operator])
+ progress_print(verbose_progress, "Performing graph optimisation")
nng = graph_optimiser.optimise_graph(
nng, arch, network_type, options.verbose_graph, options.force_symmetric_int_weights
)
@@ -167,17 +172,22 @@ def compiler_driver(nng, arch, options, scheduler_options, network_type, output_
if options.verbose_quantization:
nng.print_graph_with_tensor_quantization()
+ progress_print(verbose_progress, "Defining tensor purpose")
nng = mark_tensors.mark_tensor_purpose(nng, arch, options.verbose_tensor_purpose)
assert verify_graph_health(nng)
+
+ progress_print(verbose_progress, "Performing pass packing")
pass_packing.pack_into_passes(nng, arch, options.verbose_packing)
assert verify_graph_health(nng)
+    progress_print(verbose_progress, "Extracting NPU subgraphs")
extract_npu_subgraphs.extract_npu_subgraphs(nng, arch)
assert verify_graph_health(nng)
if options.timing:
start = time.time()
+ progress_print(verbose_progress, "Scheduling passes")
# Run the scheduler
scheduler.schedule_passes(nng, arch, options, scheduler_options)
_check_schedule(nng, arch, scheduler_options)
@@ -199,6 +209,7 @@ def compiler_driver(nng, arch, options, scheduler_options, network_type, output_
# Create list of NPU subgraphs with same order as the list of all subgraphs
npu_subgraphs = [sg for sg in nng.subgraphs if sg.placement == PassPlacement.Npu]
+ progress_print(verbose_progress, "Calculating live ranges for constant NPU tensors")
# Calculate live ranges for all constant Npu tensors, in permanent storage
for sg in npu_subgraphs:
lr_graph_flash = live_range.create_linear_live_range_graph(
@@ -209,6 +220,7 @@ def compiler_driver(nng, arch, options, scheduler_options, network_type, output_
)
if npu_subgraphs:
+ progress_print(verbose_progress, "Allocating NPU constant tensors to the first NPU subgraph")
# Allocate all Npu constant tensors to the first Npu subgraph since it is
# processed first during serialization into tensors
first_npu_sg = npu_subgraphs[0]
@@ -225,6 +237,7 @@ def compiler_driver(nng, arch, options, scheduler_options, network_type, output_
root_sg = nng.get_root_subgraph()
+ progress_print(verbose_progress, "Generating command stream")
# Generate command streams and serialise Npu-ops into tensors
for sg in npu_subgraphs:
high_level_command_stream_generator.generate_high_level_command_stream_for_schedule(
@@ -249,6 +262,7 @@ def compiler_driver(nng, arch, options, scheduler_options, network_type, output_
if scratch_fast_tens is not None:
scratch_fast_tens.set_all_shapes([root_sg.memory_used_per_type.get(MemType.Scratch_fast, 0)])
+ progress_print(verbose_progress, "Allocating CPU constant tensors")
# Allocate all Cpu constant tensors, this is done last because the Npu-ops
# have to be serialized into flash and scratch tensors first
tensor_allocation.allocate_tensors(
@@ -261,7 +275,7 @@ def compiler_driver(nng, arch, options, scheduler_options, network_type, output_
verbose_allocation=options.verbose_allocation,
cpu_tensor_alignment=options.cpu_tensor_alignment,
)
-
+ progress_print(verbose_progress, "Calculating new performance for the network")
npu_performance.calc_new_performance_for_network(
nng, arch, network_type, options.verbose_performance, output_basename
)
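
Note that compiler_driver reads the flag from scheduler_options.verbose_progress, while the new CompilerOptions.verbose_progress field is left to the caller to plumb through; that wiring sits outside this file. A rough caller-side sketch, assuming SchedulerOptions exposes a matching verbose_progress attribute (an assumption, as it is not part of this diff):

    # Hypothetical caller-side sketch; SchedulerOptions.verbose_progress is assumed
    # to exist and is not defined in this patch.
    compiler_options = CompilerOptions(verbose_progress=True)
    scheduler_options.verbose_progress = compiler_options.verbose_progress
    # compiler_driver(nng, arch, compiler_options, scheduler_options, network_type,
    #                 output_basename) then prints one line per stage, e.g.
    # "Performing graph optimisation", "Scheduling passes", ...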