From 1c54ac1499da4b1c0de39336c1a9b22e506388b1 Mon Sep 17 00:00:00 2001
From: Raul Farkas
Date: Wed, 26 Apr 2023 07:49:15 +0100
Subject: MLBEDSW-7390: Add verbose progress option

Add --verbose-progress CLI option used to enable printing progress
information in the compiler driver and scheduler.

Change-Id: I99ac8c6a654e60391d5c11e28b89250405daa53a
Signed-off-by: Raul Farkas
---
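Illustration only — this note sits between the `---` marker and the diffstat, a position that `git am` ignores. The sketch below mirrors the throttling logic of the `progress_print` helper added in ethosu/vela/utils.py and shows which iterations of a hot loop actually print. `run_demo_passes` and its 101-element list are made-up stand-ins; only `progress_print` and its parameters come from this patch, and the import assumes the patch has been applied.

```python
# Demo of progress_print's output throttling (hypothetical driver code).
from ethosu.vela.utils import progress_print  # assumes this patch is applied


def run_demo_passes(verbose_progress: bool = True) -> None:
    demo_passes = list(range(101))  # stand-in for e.g. sg.cascaded_passes
    for index, _ in enumerate(demo_passes):
        progress_print(
            verbose_progress,
            "Processing cascaded pass",
            progress_counter=index,
            progress_total=demo_passes,  # a Sized collection; len() is taken internally
            progress_granularity=0.25,  # print roughly every 25% of iterations
        )


run_demo_passes()
# Prints (the prefix is the calling function's name, found via inspect.stack()):
#   run_demo_passes: Processing cascaded pass 0/100
#   run_demo_passes: Processing cascaded pass 25/100
#   run_demo_passes: Processing cascaded pass 50/100
#   run_demo_passes: Processing cascaded pass 75/100
#   run_demo_passes: Processing cascaded pass 100/100
```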
 OPTIONS.md                       |  8 ++++
 ethosu/vela/compiler_driver.py   | 16 +++++++-
 ethosu/vela/live_range.py        | 10 +++--
 ethosu/vela/scheduler.py         | 75 +++++++++++++++++++++-------------
 ethosu/vela/tensor_allocation.py |  6 ++-
 ethosu/vela/utils.py             | 86 ++++++++++++++++++++++++++++++++++++++++
 ethosu/vela/vela.py              |  3 ++
 7 files changed, 169 insertions(+), 35 deletions(-)
 create mode 100644 ethosu/vela/utils.py

diff --git a/OPTIONS.md b/OPTIONS.md
index 36cd1722..9eaff235 100644
--- a/OPTIONS.md
+++ b/OPTIONS.md
@@ -432,6 +432,14 @@ Verbose weights information.
 vela network.tflite --verbose-weights
 ```
 
+### Verbose Progress
+
+Verbose progress information from the compiler driver and scheduler.
+
+```bash
+vela network.tflite --verbose-progress
+```
+
 ## Configuration File
 
 This is used to describe various properties of the Ethos-U embedded system. The
diff --git a/ethosu/vela/compiler_driver.py b/ethosu/vela/compiler_driver.py
index d2892096..51c97070 100644
--- a/ethosu/vela/compiler_driver.py
+++ b/ethosu/vela/compiler_driver.py
@@ -40,6 +40,7 @@ from .scheduler import OptimizationStrategy
 from .tensor import MemArea
 from .tensor import MemType
 from .tensor import Tensor
+from .utils import progress_print
 
 
 class CompilerOptions:
@@ -62,6 +63,7 @@ class CompilerOptions:
         verbose_operators=False,
         verbose_weights=False,
         verbose_performance=False,
+        verbose_progress=False,
         show_cpu_operations=False,
         tensor_allocator=TensorAllocator.Greedy,
         timing=False,
@@ -82,6 +84,7 @@
         self.verbose_operators = verbose_operators
         self.verbose_weights = verbose_weights
         self.verbose_performance = verbose_performance
+        self.verbose_progress = verbose_progress
         self.show_cpu_operations = show_cpu_operations
         self.tensor_allocator = tensor_allocator
         self.timing = timing
@@ -154,11 +157,13 @@ def _check_schedule(nng, arch, scheduler_options):
 
 def compiler_driver(nng, arch, options, scheduler_options, network_type, output_basename):
     assert verify_graph_health(nng)
+    verbose_progress = scheduler_options.verbose_progress
 
     # Pre-optimisation operator tracking
     for sg in nng.subgraphs:
         visit_graph_post_order(sg.output_tensors, arch, [], [_record_operator])
 
+    progress_print(verbose_progress, "Performing graph optimisation")
     nng = graph_optimiser.optimise_graph(
         nng, arch, network_type, options.verbose_graph, options.force_symmetric_int_weights
     )
@@ -167,17 +172,22 @@ def compiler_driver(nng, arch, options, scheduler_options, network_type, output_
     if options.verbose_quantization:
         nng.print_graph_with_tensor_quantization()
 
+    progress_print(verbose_progress, "Defining tensor purpose")
     nng = mark_tensors.mark_tensor_purpose(nng, arch, options.verbose_tensor_purpose)
     assert verify_graph_health(nng)
+
+    progress_print(verbose_progress, "Performing pass packing")
     pass_packing.pack_into_passes(nng, arch, options.verbose_packing)
     assert verify_graph_health(nng)
+    progress_print(verbose_progress, "Extracting npu subgraphs")
     extract_npu_subgraphs.extract_npu_subgraphs(nng, arch)
 
     assert verify_graph_health(nng)
     if options.timing:
         start = time.time()
 
+    progress_print(verbose_progress, "Scheduling passes")
     # Run the scheduler
     scheduler.schedule_passes(nng, arch, options, scheduler_options)
     _check_schedule(nng, arch, scheduler_options)
 
@@ -199,6 +209,7 @@ def compiler_driver(nng, arch, options, scheduler_options, network_type, output_
     # Create list of NPU subgraphs with same order as the list of all subgraphs
     npu_subgraphs = [sg for sg in nng.subgraphs if sg.placement == PassPlacement.Npu]
 
+    progress_print(verbose_progress, "Calculating live ranges for constant NPU tensors")
     # Calculate live ranges for all constant Npu tensors, in permanent storage
     for sg in npu_subgraphs:
         lr_graph_flash = live_range.create_linear_live_range_graph(
@@ -209,6 +220,7 @@ def compiler_driver(nng, arch, options, scheduler_options, network_type, output_
         )
 
     if npu_subgraphs:
+        progress_print(verbose_progress, "Allocating NPU constant tensors to the first NPU subgraph")
         # Allocate all Npu constant tensors to the first Npu subgraph since it is
         # processed first during serialization into tensors
         first_npu_sg = npu_subgraphs[0]
@@ -225,6 +237,7 @@ def compiler_driver(nng, arch, options, scheduler_options, network_type, output_
 
     root_sg = nng.get_root_subgraph()
 
+    progress_print(verbose_progress, "Generating command stream")
     # Generate command streams and serialise Npu-ops into tensors
     for sg in npu_subgraphs:
         high_level_command_stream_generator.generate_high_level_command_stream_for_schedule(
@@ -249,6 +262,7 @@ def compiler_driver(nng, arch, options, scheduler_options, network_type, output_
     if scratch_fast_tens is not None:
         scratch_fast_tens.set_all_shapes([root_sg.memory_used_per_type.get(MemType.Scratch_fast, 0)])
 
+    progress_print(verbose_progress, "Allocating CPU constant tensors")
     # Allocate all Cpu constant tensors, this is done last because the Npu-ops
     # have to be serialized into flash and scratch tensors first
     tensor_allocation.allocate_tensors(
@@ -261,7 +275,7 @@ def compiler_driver(nng, arch, options, scheduler_options, network_type, output_
         verbose_allocation=options.verbose_allocation,
         cpu_tensor_alignment=options.cpu_tensor_alignment,
     )
-
+    progress_print(verbose_progress, "Calculating new performance for the network")
     npu_performance.calc_new_performance_for_network(
         nng, arch, network_type, options.verbose_performance, output_basename
     )
diff --git a/ethosu/vela/live_range.py b/ethosu/vela/live_range.py
index d64f68e0..9f94dd63 100644
--- a/ethosu/vela/live_range.py
+++ b/ethosu/vela/live_range.py
@@ -27,6 +27,7 @@ from .tensor import MemArea
 from .tensor import MemType
 from .tensor import Tensor
 from .tensor import TensorPurpose
+from .utils import progress_print
 
 
 class LiveRange:
@@ -231,6 +232,7 @@ def extract_live_ranges_from_cascaded_passes(
     target_mem_type_set,
     lr_graph=None,
     cpu_tensor_alignment=Tensor.AllocationQuantum,
+    verbose_progress: bool = False,
 ):
     if lr_graph is None:
         lr_graph = LiveRangeGraph()
@@ -239,7 +241,8 @@
         # if subgraph has been processed already, return the lr_graph as is
         return lr_graph
 
-    for cps in sg.cascaded_passes:
+    for index, cps in enumerate(sg.cascaded_passes):
+        progress_print(verbose_progress, "Processing cascaded pass", index, sg.cascaded_passes)
         cps.time = lr_graph.current_time
 
         time_for_pass = cps.time
@@ -320,9 +323,10 @@ def create_linear_live_range_graph(sg, target_mem_area, target_mem_type_set, lr_
     return lr_graph
 
 
-def extract_live_ranges_from_schedule(sg, target_mem_area, target_mem_type_set, lr_graph):
+def extract_live_ranges_from_schedule(sg, target_mem_area, target_mem_type_set, lr_graph, verbose_progress=False):
     time_for_cascade = {}
-    for sched_op in sg.sched_ops:
+    for index, sched_op in enumerate(sg.sched_ops):
+        progress_print(verbose_progress, "Processing SchedulerOp", index, sg.sched_ops)
         op_info = sg.schedule.cost_map[sched_op]
         cascade = op_info.cascade
         cascade_info = sg.schedule.cascades.get(cascade, None)
diff --git a/ethosu/vela/scheduler.py b/ethosu/vela/scheduler.py
index cbd7ce44..6e2cd4a6 100644
--- a/ethosu/vela/scheduler.py
+++ b/ethosu/vela/scheduler.py
@@ -31,6 +31,8 @@ from typing import Optional
 from typing import Tuple
 from typing import TYPE_CHECKING
 
+from .utils import progress_print
+
 # Import needed for Type annotations. Only import for Type checking to avoid run-time errors due to cyclic import.
 if TYPE_CHECKING:
     from .npu_performance import CycleCost
@@ -148,10 +150,12 @@
         optimization_strategy,
         sram_target,
         verbose_schedule,
+        verbose_progress=False,
     ):
         self.optimization_strategy = optimization_strategy
         self.optimization_sram_limit = sram_target
         self.verbose_schedule = verbose_schedule
+        self.verbose_progress = verbose_progress
 
     def __str__(self) -> str:
         return f"{type(self).__name__}: {str(self.__dict__)}"
@@ -531,7 +535,9 @@ class Scheduler:
     def create_initial_schedule(self) -> Schedule:
         """Creates an initial schedule with no cascading or buffering of any kind"""
         schedule = Schedule(self.sg, "MAX")
-        for op in self.sched_ops:
+        verbose_progress = self.scheduler_options.verbose_progress
+        for index, op in enumerate(self.sched_ops):
+            progress_print(verbose_progress, "Processing SchedulerOp", index, self.sched_ops)
             cost = op.create_scheduler_info(self.nng, op.ofm.shape)
             cost.cycles = self.estimate_op_performance(op, cost.block_config, op.ofm.shape.depth)
             schedule.cost_map[op] = cost
@@ -540,16 +546,12 @@
 
     def update_op_memory_snapshot(self, schedule: Schedule):
         memories_list = [(self.arch.fast_storage_mem_area, set((MemType.Scratch, MemType.Scratch_fast)))]
-
+        verbose_progress = self.scheduler_options.verbose_progress
+        progress_print(verbose_progress, "")
         # Collect live ranges from tensors
         lr_graph = live_range.LiveRangeGraph()
         for mem_area, mem_type_set in memories_list:
-            live_range.extract_live_ranges_from_schedule(
-                self.sg,
-                mem_area,
-                mem_type_set,
-                lr_graph,
-            )
+            live_range.extract_live_ranges_from_schedule(self.sg, mem_area, mem_type_set, lr_graph, verbose_progress)
 
         # Populate time-array with memory used by live ranges
         temporal_usage = lr_graph.get_temporal_memory_usage(self.arch.fast_storage_mem_area)
@@ -607,9 +609,10 @@
     def propose_schedule_buffering(self, ref_schedule: Schedule, staging_limit_bytes):
         """Create a buffered schedule"""
         buffered_schedule = Schedule(self.sg, f"{ref_schedule.label}_BUFFERED")
-
+        verbose_progress = self.scheduler_options.verbose_progress
         prev_op = None
-        for sched_op in self.sched_ops:
+        for index, sched_op in enumerate(self.sched_ops):
+            progress_print(verbose_progress, "Processing SchedulerOp", index, self.sched_ops)
             if sched_op not in ref_schedule.cost_map:
                 # sched_op is not part of this sub-schedule - skip
                 continue
@@ -871,10 +874,11 @@
         next operators stride"""
         min_schedule = Schedule(self.sg, "MIN")
         cost_map = min_schedule.cost_map
-
+        verbose_progress = self.scheduler_options.verbose_progress
         # Keep track of the previous Op - which consumes the current Op's OFM
         prev_op: Optional[SchedulerOperation] = None
-        for sched_op in reversed(self.sched_ops):
+        for index, sched_op in enumerate(reversed(self.sched_ops)):
+            progress_print(verbose_progress, "Processing SchedulerOp", index, self.sched_ops)
             min_stripe_height = prev_op.kernel.stride.y if prev_op else 1
             min_stripe = sched_op.ofm.shape.with_height(min_stripe_height)
 
@@ -968,13 +972,15 @@ class Scheduler:
         return peak_mem_usage
 
     def build_cascades_for_min_schedule(self, min_schedule: Schedule, max_template: Schedule, memory_limit: int):
+        verbose_progress = self.scheduler_options.verbose_progress
         # Update memory snapshot
         self.sg.schedule = min_schedule
         self.update_op_memory_snapshot(min_schedule)
 
         # Calculate residual memory for Min schedule
         non_local_mem_usage = {}
-        for sched_op in self.sched_ops:
+        for index, sched_op in enumerate(self.sched_ops):
+            progress_print(verbose_progress, "Processing SchedulerOp", index, self.sched_ops)
             time_index = min_schedule.cost_map[sched_op].time_index
 
             if self.arch.is_spilling_enabled():
@@ -1089,13 +1095,16 @@ class Scheduler:
         options: SchedulerOptions,
     ) -> Schedule:
         """Extracts sub-schedules based on the cascades and optimizes them and applies them to the final schedule"""
+        verbose_progress = options.verbose_progress
         sram_limit = options.optimization_sram_limit
         if max_sched.fast_storage_peak_usage < sram_limit and not self.arch.is_spilling_enabled():
             # Maximum performance schedule fits within the SRAM target
             return max_sched
 
         # Iterate over a copy of the cascades since they may change during the loop
-        for cascade_info in list(schedule.cascades.values()):
+        cascades = list(schedule.cascades.values())
+        for index, cascade_info in enumerate(cascades):
+            progress_print(verbose_progress, "Processing cascade", index, cascades)
             # Optimize the sub-schedule in this cascade
             opt_sub_schedule = self.optimize_sub_schedule(cascade_info, schedule, max_template, sram_limit)
             if opt_sub_schedule:
@@ -1119,6 +1128,7 @@ class Scheduler:
         min_schedule: Schedule,
         options: SchedulerOptions,
     ):
+        verbose_progress = options.verbose_progress
        default_schedule = self.sg.schedule
        npu_performance.calc_new_performance_for_network(self.nng, self.arch, None, False)
        default_tot_cycles = self.nng.cycles[npu_performance.PassCycles.Total]
@@ -1135,12 +1145,7 @@
         memories_list = [(self.arch.fast_storage_mem_area, set((MemType.Scratch, MemType.Scratch_fast)))]
         lr_graph = live_range.LiveRangeGraph()
         for mem_area, mem_type_set in memories_list:
-            live_range.extract_live_ranges_from_schedule(
-                self.sg,
-                mem_area,
-                mem_type_set,
-                lr_graph,
-            )
+            live_range.extract_live_ranges_from_schedule(self.sg, mem_area, mem_type_set, lr_graph, verbose_progress)
 
         # Find the relation between the sched_op and the buffering tensor
         weight_ops = {}
@@ -1416,7 +1421,9 @@
                 print(f"\t\t{i}: {cascade.start} -> {cascade.end}, size: {cascade.mem_usage}")
 
 
-def _update_memory_snapshot_for_all_npu_graphs(nng: Graph, arch: ArchitectureFeatures, schedulers):
+def _update_memory_snapshot_for_all_npu_graphs(
+    nng: Graph, arch: ArchitectureFeatures, schedulers, verbose_progress: bool = False
+):
     mem_area = arch.fast_storage_mem_area
     mem_type_set = set((MemType.Scratch, MemType.Scratch_fast))
 
@@ -1426,11 +1433,7 @@ def _update_memory_snapshot_for_all_npu_graphs(nng: Graph, arch: ArchitectureFea
     # will be set for all the tensors.
     lr_graph = live_range.LiveRangeGraph()
     live_range.extract_live_ranges_from_cascaded_passes(
-        nng.get_root_subgraph(),
-        mem_area,
-        mem_type_set,
-        lr_graph,
-        Tensor.AllocationQuantum,
+        nng.get_root_subgraph(), mem_area, mem_type_set, lr_graph, Tensor.AllocationQuantum, verbose_progress
     )
     # Populate time-array with memory used by live ranges
     temporal_usage = lr_graph.get_temporal_memory_usage(arch.fast_storage_mem_area)
@@ -1471,6 +1474,7 @@ def _update_tensor_allocation(nng: Graph, arch: ArchitectureFeatures, options):
             mem_type_set,
             tensor_allocator=options.tensor_allocator,
             verbose_allocation=options.verbose_allocation,
+            verbose_progress=options.verbose_progress,
             cpu_tensor_alignment=options.cpu_tensor_alignment,
             hillclimb_max_iterations=options.hillclimb_max_iterations,
         )
@@ -1570,14 +1574,17 @@ class FastStorageComponentAllocator:
 
 def schedule_passes(nng: Graph, arch: ArchitectureFeatures, options, scheduler_options: SchedulerOptions):
     """Entry point for the Scheduler"""
+    verbose_progress = scheduler_options.verbose_progress
     # Initialize CPU subgraphs
     schedulers = dict()
     # Initialize schedulers with max schedule. Only schedule NPU subgraphs
-    for sg in nng.subgraphs:
+    for sg_idx, sg in enumerate(nng.subgraphs):
+        progress_print(verbose_progress, "Processing subgraph", sg_idx, nng.subgraphs)
         if sg.placement != PassPlacement.Npu:
             # Create cascaded passes for CPU Ops
             cascaded_passes = []
-            for idx, ps in enumerate(sg.passes):
+            for pass_idx, ps in enumerate(sg.passes):
+                progress_print(verbose_progress, "Creating cascaded passes for CPU op", pass_idx, sg.passes)
                 cps = CascadedPass(
                     ps.name,
                     SchedulingStrategy.WeightStream,
@@ -1589,7 +1596,7 @@ def schedule_passes(nng: Graph, arch: ArchitectureFeatures, options, scheduler_o
                     False,
                 )
 
-                cps.time = idx
+                cps.time = pass_idx
                 ps.cascade = cps
                 cascaded_passes.append(cps)
 
@@ -1599,6 +1606,7 @@ def schedule_passes(nng: Graph, arch: ArchitectureFeatures, options, scheduler_o
             scheduler = Scheduler(nng, sg, arch, scheduler_options)
             schedulers[sg] = scheduler
 
+            progress_print(verbose_progress, "Creating scheduler representation")
             scheduler.create_scheduler_representation(arch)
             sg.sched_ops = scheduler.sched_ops
 
@@ -1606,6 +1614,7 @@ def schedule_passes(nng: Graph, arch: ArchitectureFeatures, options, scheduler_o
             max_schedule_template = scheduler.create_initial_schedule()
             scheduler.max_schedule = max_schedule_template
 
+            progress_print(verbose_progress, "Creating optimised max schedule")
             # Create the optimised Max schedule
             sg.schedule = max_schedule_template
             scheduler.update_op_memory_snapshot(max_schedule_template)
@@ -1613,6 +1622,7 @@ def schedule_passes(nng: Graph, arch: ArchitectureFeatures, options, scheduler_o
             sg.schedule = opt_max_schedule
             scheduler.update_op_memory_snapshot(opt_max_schedule)
 
+            progress_print(verbose_progress, "Creating minimal schedule")
             # Create Min schedule
             min_schedule = scheduler.propose_minimal_schedule()
             initial_sram_limit = scheduler_options.optimization_sram_limit
@@ -1620,11 +1630,13 @@ def schedule_passes(nng: Graph, arch: ArchitectureFeatures, options, scheduler_o
                 initial_sram_limit = scheduler.min_memory_req
 
             # Build cascades for Min schedule
+            progress_print(verbose_progress, "Building cascades for minimal schedule")
             scheduler.build_cascades_for_min_schedule(min_schedule, max_schedule_template, initial_sram_limit)
             sg.schedule = min_schedule
             scheduler.update_op_memory_snapshot(min_schedule)
 
             if scheduler_options.optimization_strategy == OptimizationStrategy.Performance:
+                progress_print(verbose_progress, "Creating schedule optimized for performance")
"Creating schedule optimized for performance") # Create an optimized schedule sg.schedule = scheduler.optimize_schedule( min_schedule, opt_max_schedule, max_schedule_template, scheduler_options @@ -1635,6 +1647,7 @@ def schedule_passes(nng: Graph, arch: ArchitectureFeatures, options, scheduler_o scheduler.use_fast_storage_for_feature_maps(sg.schedule, scheduler_options.optimization_sram_limit) if scheduler_options.optimization_strategy == OptimizationStrategy.Performance and scheduler.evicted_fms: + progress_print(verbose_progress, "Optimizing weight buffering size") # It might be possible to gain performance by reducing # weight buffer size and instead fit fms in fast storage scheduler.optimize_weight_buffering_size(min_schedule, scheduler_options) @@ -1642,8 +1655,10 @@ def schedule_passes(nng: Graph, arch: ArchitectureFeatures, options, scheduler_o if scheduler_options.verbose_schedule: scheduler.print_schedule(sg.schedule) + progress_print(verbose_progress, "Update memory snapshot for all NPU graphs") # Make a full live range calculation starting from the root sg - _update_memory_snapshot_for_all_npu_graphs(nng, arch, schedulers) + _update_memory_snapshot_for_all_npu_graphs(nng, arch, schedulers, verbose_progress) + progress_print(verbose_progress, "Update tensor allocation") # Evaluate schedule _update_tensor_allocation(nng, arch, options) diff --git a/ethosu/vela/tensor_allocation.py b/ethosu/vela/tensor_allocation.py index 8c91e2ed..fa9ace17 100644 --- a/ethosu/vela/tensor_allocation.py +++ b/ethosu/vela/tensor_allocation.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright 2020-2022 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2023 Arm Limited and/or its affiliates # # SPDX-License-Identifier: Apache-2.0 # @@ -201,6 +201,7 @@ def allocate( lr_graph=None, cpu_tensor_alignment=Tensor.AllocationQuantum, hillclimb_max_iterations=None, + verbose_progress=False, ): # Allocates addresses to tensors, returns False if tensors could not be fit within max_size lrs = live_range.extract_live_ranges_from_cascaded_passes( @@ -209,6 +210,7 @@ def allocate( mem_type_set, lr_graph=lr_graph, cpu_tensor_alignment=cpu_tensor_alignment, + verbose_progress=verbose_progress, ) total_sz = 0 if lrs.ranges: @@ -235,6 +237,7 @@ def allocate_tensors( mem_type_set, tensor_allocator=TensorAllocator.Greedy, verbose_allocation=False, + verbose_progress=False, lr_graph=None, cpu_tensor_alignment=Tensor.AllocationQuantum, hillclimb_max_iterations=None, @@ -251,6 +254,7 @@ def allocate_tensors( lr_graph=lr_graph, cpu_tensor_alignment=cpu_tensor_alignment, hillclimb_max_iterations=hillclimb_max_iterations, + verbose_progress=verbose_progress, ) if lrs.ranges: diff --git a/ethosu/vela/utils.py b/ethosu/vela/utils.py new file mode 100644 index 00000000..386ba354 --- /dev/null +++ b/ethosu/vela/utils.py @@ -0,0 +1,86 @@ +# SPDX-FileCopyrightText: Copyright 2023 Arm Limited and/or its affiliates +# +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+#
+# Description:
+# Contains various utility functions used across the codebase.
+from __future__ import annotations
+
+import collections.abc
+import inspect
+
+
+def progress_print(
+    enabled: bool,
+    message: str,
+    progress_counter: int = -1,
+    progress_total: int | collections.abc.Sized = 0,
+    progress_granularity: float = 0.20,
+):
+    """Print progress information.
+
+    :param enabled: boolean indicating whether message should be printed.
+    :param message: message to be printed
+    :param progress_counter: the value of the incremental counter that indicates the progress
+    :param progress_total: integer value or sized data structure to use to extract the total number of elements that
+        progress is measured against
+    :param progress_granularity: floating point percentage indicating how often progress information should be printed
+
+    Example
+    -------
+    def example_function(verbose_progress: bool = True):
+        a_list = [x for x in range(101)]
+        for index, value in enumerate(a_list):
+            progress_print(verbose_progress,
+                           message="Processing",
+                           progress_counter=index,
+                           progress_total=a_list,
+                           progress_granularity=0.25)
+
+    **Output**
+    example_function: Processing 0/100
+    example_function: Processing 25/100
+    example_function: Processing 50/100
+    example_function: Processing 75/100
+    example_function: Processing 100/100
+    """
+    if not enabled:
+        return
+
+    # Use the calling function's name as context for the printed message
+    context_str = inspect.stack()[1].function
+    context_str += ": " if message else ""
+    display_total = progress_total
+    # If a sized collection is provided, extract its size to use as progress total
+    if isinstance(progress_total, collections.abc.Sized):
+        progress_total = len(progress_total)
+        display_total = progress_total - 1
+
+    # Print progress information with "counter/total" information
+    if progress_counter > -1 and progress_total > 0 and 0 < progress_granularity < 1:
+        # Extract progress frequency and ensure it is not equal to 0 (avoid zero division)
+        progress_frequency = int(progress_total * progress_granularity)
+        progress_frequency = progress_frequency if progress_frequency else 1
+        # Check whether information should be printed based on computed progress frequency
+        if (
+            progress_counter % progress_frequency == 0 and progress_counter <= progress_total - progress_frequency
+        ) or progress_counter == display_total:
+            print(f"{context_str}{message} {progress_counter}/{display_total}")
+        return
+
+    print(f"{context_str}{message}")
diff --git a/ethosu/vela/vela.py b/ethosu/vela/vela.py
index c44c7894..fbf1d370 100644
--- a/ethosu/vela/vela.py
+++ b/ethosu/vela/vela.py
@@ -372,6 +372,7 @@ def main(args=None):
     parser.add_argument("--verbose-operators", action="store_true", help="Verbose operator list")
     parser.add_argument("--verbose-weights", action="store_true", help="Verbose weights information")
     parser.add_argument("--verbose-performance", action="store_true", help="Verbose performance information")
+    parser.add_argument("--verbose-progress", action="store_true", help="Verbose progress information")
     parser.add_argument(
         "--show-cpu-operations", action="store_true", help="Show the operations that fall back to the CPU"
     )
@@ -555,6 +556,7 @@
         verbose_operators=args.verbose_operators,
         verbose_weights=args.verbose_weights,
         verbose_performance=args.verbose_performance,
+        verbose_progress=args.verbose_progress,
         show_cpu_operations=args.show_cpu_operations,
         tensor_allocator=args.tensor_allocator,
         timing=args.timing,
@@ -568,6 +570,7 @@ def main(args=None):
         optimization_strategy=args.optimise,
         sram_target=arch.arena_cache_size,
         verbose_schedule=args.verbose_schedule,
+        verbose_progress=args.verbose_progress,
     )
 
     model_reader_options = model_reader.ModelReaderOptions()
-- 
cgit v1.2.1