From b9b515ca996e1ec5835a2c35033cc6f818f71f12 Mon Sep 17 00:00:00 2001
From: Tim Hall
Date: Sun, 1 Nov 2020 21:27:19 +0000
Subject: vela: Remove and change CLI options

 - Removed unused --show-minimum-possible-allocation
 - Changed --allocation-alignment to --cpu-tensor-alignment

Signed-off-by: Tim Hall
Change-Id: I00e367c3190aeea08a3f136332711e9accc85ba3
---
 OPTIONS.md                       | 21 ++++++++-------------
 ethosu/vela/compiler_driver.py   | 16 +++++-----------
 ethosu/vela/live_range.py        |  8 ++++----
 ethosu/vela/tensor_allocation.py | 20 ++++++--------------
 ethosu/vela/vela.py              | 28 +++++++++++++---------------
 5 files changed, 36 insertions(+), 57 deletions(-)

diff --git a/OPTIONS.md b/OPTIONS.md
index baf6c5a3..10bfea55 100644
--- a/OPTIONS.md
+++ b/OPTIONS.md
@@ -280,11 +280,14 @@ SRAM, albeit at the cost of performance (inference speed).
 vela network.tflite --weight-estimation-scaling=1.2
 ```

-### Allocation alignment
-
-Controls the allocation byte alignment. Only affects CPU tensors, NPU tensors
-will remain 16-byte aligned independent of this option. Alignment has to be a
-power of two and greater or equal to 16.
+### CPU Tensor Alignment
+
+Controls the allocation byte alignment. This affects all CPU tensors, including
+Ethos-U Custom operator inputs and outputs. In this instance, a CPU tensor is
+defined as any tensor that is explicitly listed in the resulting `.tflite` file.
+The Ethos-U NPU internal tensors will remain 16-byte aligned independent of this
+option; these tensors are contained within the command stream. Alignment has to
+be a power of two and greater than or equal to 16.

 **Type: Integer**
 **Default: 16**
@@ -305,14 +308,6 @@ Prints a summary of all the subgraphs and their inputs and outputs.
 vela network.tflite --show-subgraph-io-summary
 ```

-### Show Minimum Possible Allocation
-
-Prints the minimum possible allocation.
-
-```bash
-vela network.tflite --show-minimum-possible-allocation
-```
-
 ### Show Cpu Operations

 Show the operations that fall back to the CPU.
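The new OPTIONS.md text above constrains `--cpu-tensor-alignment` to powers of two no smaller than 16. A minimal sketch of that rule, for illustration only (the helper name is made up here and is not part of vela):

```python
# Sketch of the constraint described in the CPU Tensor Alignment section:
# a valid value is a power of two that is greater than or equal to 16.
def is_valid_cpu_tensor_alignment(alignment: int) -> bool:
    # A power of two has exactly one bit set, so n & (n - 1) == 0 for n > 0.
    return alignment >= 16 and (alignment & (alignment - 1)) == 0


assert is_valid_cpu_tensor_alignment(16)      # the default
assert is_valid_cpu_tensor_alignment(64)      # a larger power of two
assert not is_valid_cpu_tensor_alignment(24)  # not a power of two
assert not is_valid_cpu_tensor_alignment(8)   # below the 16-byte minimum
```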
diff --git a/ethosu/vela/compiler_driver.py b/ethosu/vela/compiler_driver.py
index 0739133b..a2b20e47 100644
--- a/ethosu/vela/compiler_driver.py
+++ b/ethosu/vela/compiler_driver.py
@@ -61,12 +61,11 @@ Note the difference between ArchitectureFeatures and CompilerOptions
         verbose_high_level_command_stream=False,
         verbose_register_command_stream=False,
         verbose_operators=False,
-        show_minimum_possible_allocation=False,
         show_cpu_operations=False,
         tensor_allocator=TensorAllocator.Greedy,
         timing=False,
         output_dir="outputs",
-        allocation_alignment=Tensor.AllocationQuantum,
+        cpu_tensor_alignment=Tensor.AllocationQuantum,
     ):

         self.verbose_graph = verbose_graph
@@ -78,12 +77,11 @@ Note the difference between ArchitectureFeatures and CompilerOptions
         self.verbose_high_level_command_stream = verbose_high_level_command_stream
         self.verbose_register_command_stream = verbose_register_command_stream
         self.verbose_operators = verbose_operators
-        self.show_minimum_possible_allocation = show_minimum_possible_allocation
         self.show_cpu_operations = show_cpu_operations
         self.tensor_allocator = tensor_allocator
         self.timing = timing
         self.output_dir = output_dir
-        self.allocation_alignment = allocation_alignment
+        self.cpu_tensor_alignment = cpu_tensor_alignment

     def __str__(self):
         return type(self).__name__ + ": " + str(self.__dict__)
@@ -209,7 +207,6 @@ def compiler_driver(nng, arch, options, scheduler_options):
         set((MemType.Permanent_NPU,)),
         tensor_allocator=TensorAllocator.LinearAlloc,
         verbose_allocation=options.verbose_allocation,
-        show_minimum_possible_allocation=options.show_minimum_possible_allocation,
         lr_graph=lr_graph_flash,
     )

@@ -259,8 +256,7 @@ def compiler_driver(nng, arch, options, scheduler_options):
             dry_test=dry_test,
             tensor_allocator=options.tensor_allocator,
             verbose_allocation=options.verbose_allocation,
-            show_minimum_possible_allocation=options.show_minimum_possible_allocation,
-            allocation_alignment=options.allocation_alignment,
+            cpu_tensor_alignment=options.cpu_tensor_alignment,
         )
         if dry_test or not alloc_success:
             for sg in nng.subgraphs:
@@ -281,8 +277,7 @@ def compiler_driver(nng, arch, options, scheduler_options):
             mem_type_set,
             tensor_allocator=options.tensor_allocator,
             verbose_allocation=options.verbose_allocation,
-            show_minimum_possible_allocation=options.show_minimum_possible_allocation,
-            allocation_alignment=options.allocation_alignment,
+            cpu_tensor_alignment=options.cpu_tensor_alignment,
         )

     # Generate command streams and serialise Npu-ops into tensors
@@ -316,8 +311,7 @@ def compiler_driver(nng, arch, options, scheduler_options):
         set((MemType.Permanent_CPU,)),
         tensor_allocator=TensorAllocator.LinearAlloc,
         verbose_allocation=options.verbose_allocation,
-        show_minimum_possible_allocation=options.show_minimum_possible_allocation,
-        allocation_alignment=options.allocation_alignment,
+        cpu_tensor_alignment=options.cpu_tensor_alignment,
     )

     npu_performance.calc_performance_for_network(nng, arch)
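The hunks above rename the `CompilerOptions` keyword `allocation_alignment` to `cpu_tensor_alignment`. A minimal sketch of constructing the options object after this change, assuming the module path `ethosu.vela.compiler_driver` and that the remaining constructor arguments keep the defaults visible in the diff:

```python
# Only the renamed keyword is overridden; every other CompilerOptions argument
# falls back to its default (see the constructor signature in the hunk above).
from ethosu.vela.compiler_driver import CompilerOptions

opts = CompilerOptions(cpu_tensor_alignment=32)  # previously allocation_alignment=32
print(opts)  # __str__ (unchanged above) dumps the full option dictionary
```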
diff --git a/ethosu/vela/live_range.py b/ethosu/vela/live_range.py
index a29cafe0..dbc0ce40 100644
--- a/ethosu/vela/live_range.py
+++ b/ethosu/vela/live_range.py
@@ -236,7 +236,7 @@ def extract_live_ranges_from_cascaded_passes(
     target_mem_type_set,
     ignore_subgraph_input_output_tensors=False,
     lr_graph=None,
-    allocation_alignment=Tensor.AllocationQuantum,
+    cpu_tensor_alignment=Tensor.AllocationQuantum,
 ):
     if lr_graph is None:
         lr_graph = LiveRangeGraph()
@@ -261,7 +261,7 @@ def extract_live_ranges_from_cascaded_passes(
         for tens in cps.inputs:
             if tensor_should_be_ignored(lr_graph, tens, target_mem_area, target_mem_type_set):
                 continue
-            rng = lr_graph.get_or_create_range(tens, allocation_alignment)
+            rng = lr_graph.get_or_create_range(tens, cpu_tensor_alignment)
             rng.mark_usage(time_for_pass)

         cps_primary_op = cps.passes[0].primary_op
@@ -285,7 +285,7 @@ def extract_live_ranges_from_cascaded_passes(
         for tens in cps.intermediates + cps.outputs:
             if tensor_should_be_ignored(lr_graph, tens, target_mem_area, target_mem_type_set):
                 continue
-            rng = lr_graph.get_or_create_range(tens, allocation_alignment)
+            rng = lr_graph.get_or_create_range(tens, cpu_tensor_alignment)
             rng.mark_usage(time_for_pass)

         lr_graph.current_time += 2
@@ -298,7 +298,7 @@ def extract_live_ranges_from_cascaded_passes(
     for tens in sg.output_tensors:
         if tensor_should_be_ignored(lr_graph, tens, target_mem_area, target_mem_type_set):
             continue
-        rng = lr_graph.get_or_create_range(tens, allocation_alignment)
+        rng = lr_graph.get_or_create_range(tens, cpu_tensor_alignment)
         rng.mark_usage(end_time)

     # Add subgraph to set of processed subgraphs
diff --git a/ethosu/vela/tensor_allocation.py b/ethosu/vela/tensor_allocation.py
index 8329a617..d1a33728 100644
--- a/ethosu/vela/tensor_allocation.py
+++ b/ethosu/vela/tensor_allocation.py
@@ -85,7 +85,7 @@ def mark_sram_used_for_cascaded_passes(sg, lrs):
         ps.sram_used = sram_used


-def print_allocation(lrs, mem_area, mem_type_set, sg, verbose_allocation, show_minimum_possible_allocation):
+def print_allocation(lrs, mem_area, mem_type_set, sg, verbose_allocation):
     if verbose_allocation:
         if mem_type_set == set((MemType.Permanent_NPU,)) or mem_type_set == set((MemType.Permanent_CPU,)):
             print("allocation for", mem_area, "- constant tensors in", sg.placement.name, "subgraph(s)")
@@ -108,13 +108,6 @@ def print_allocation(lrs, mem_area, mem_type_set, sg, verbose_allocation, show_m
         print("Memory usage: {} ({:#x}) bytes / {:.1f} KB".format(mem_usage, mem_usage, mem_usage / 1024))
         print()

-    if show_minimum_possible_allocation and mem_area == MemArea.Sram:
-        min_possible_allocation = max(cps.sram_used for cps in sg.cascaded_passes)
-        print(
-            "Min possible allocation %d bytes / %.1f KB / %.1f MB"
-            % (min_possible_allocation, min_possible_allocation / 1024, min_possible_allocation / 1024 / 1024)
-        )
-

 def allocate_tensors(
     nng,
     sg,
     arch,
     mem_area,
     mem_type_set,
     tensor_allocator=TensorAllocator.Greedy,
     verbose_allocation=False,
-    show_minimum_possible_allocation=False,
     lr_graph=None,
-    allocation_alignment=Tensor.AllocationQuantum,
+    cpu_tensor_alignment=Tensor.AllocationQuantum,
     max_size=None,
     dry_test=False,
 ):
@@ -138,15 +130,15 @@ def allocate_tensors(
         mem_type_set,
         ignore_subgraph_input_output_tensors=ignore_subgraph_input_output_tensors,
         lr_graph=lr_graph,
-        allocation_alignment=allocation_alignment,
+        cpu_tensor_alignment=cpu_tensor_alignment,
     )

     if lrs.ranges:
         tens_alloc = tensor_allocator
         if tens_alloc == TensorAllocator.Greedy:
-            total_sz = greedy_allocate_live_ranges(sg, arch, lrs, mem_area, allocation_alignment, verbose_allocation)
+            total_sz = greedy_allocate_live_ranges(sg, arch, lrs, mem_area, cpu_tensor_alignment, verbose_allocation)
         elif tens_alloc == TensorAllocator.LinearAlloc:
-            total_sz = linear_allocate_live_ranges(lrs, allocation_alignment)
+            total_sz = linear_allocate_live_ranges(lrs, cpu_tensor_alignment)
         else:
             assert 0
         alloc_ok = max_size is None or total_sz <= max_size
@@ -171,7 +163,7 @@ def allocate_tensors(
         nng.total_size[mem_area] = nng.total_size.get(mem_area, 0) + sum(tens.storage_size() for tens in lrs.ranges)
         nng.total_elements[mem_area] = nng.total_elements.get(mem_area, 0) + sum(tens.elements() for tens in lrs.ranges)

-        print_allocation(lrs, mem_area, mem_type_set, sg, verbose_allocation, show_minimum_possible_allocation)
+        print_allocation(lrs, mem_area, mem_type_set, sg, verbose_allocation)

         if mem_area == MemArea.Sram:
             # Mark Sram usage for all subgraphs
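The branch removed from print_allocation computed the "minimum possible allocation" as the largest SRAM footprint of any single cascaded pass. If that figure is still wanted after this change, it can be recomputed from a scheduled subgraph; a hedged sketch, assuming (as in the code above) that each cascaded pass carries a sram_used attribute:

```python
# Sketch of the number the removed --show-minimum-possible-allocation branch
# used to print: the peak per-cascaded-pass SRAM usage, in bytes.
def minimum_possible_sram_allocation(sg):
    return max(cps.sram_used for cps in sg.cascaded_passes)


# Hypothetical usage with a scheduled vela subgraph `sg`:
# print("Min possible allocation %d bytes" % minimum_possible_sram_allocation(sg))
```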
diff --git a/ethosu/vela/vela.py b/ethosu/vela/vela.py
index 4f632d56..05bd9ec1 100644
--- a/ethosu/vela/vela.py
+++ b/ethosu/vela/vela.py
@@ -234,10 +234,6 @@ def main(args=None):
         "--verbose-register-command-stream", action="store_true", help="Verbose register command stream"
     )
     parser.add_argument("--verbose-operators", action="store_true", help="Verbose operator list")
-
-    parser.add_argument(
-        "--show-minimum-possible-allocation", action="store_true", help="Show the minimum possible allocation"
-    )
     parser.add_argument(
         "--show-cpu-operations", action="store_true", help="Show the operations that fall back to the CPU"
     )
@@ -316,8 +312,8 @@ def main(args=None):
         default=architecture_features.ArchitectureFeatures.MAX_BLOCKDEP,
         choices=range(0, architecture_features.ArchitectureFeatures.MAX_BLOCKDEP + 1),
         help=(
-            "Set the maximum value that can be used for the block dependency between npu kernel operations "
-            "(default: %(default)s)"
+            "Set the maximum value that can be used for the block dependency between npu kernel operations"
+            " (default: %(default)s)"
         ),
     )
     parser.add_argument(
@@ -334,10 +330,13 @@ def main(args=None):
         help=("Performs an additional scaling of weight compression scale estimate (default: %(default)s)"),
     )
     parser.add_argument(
-        "--allocation-alignment",
+        "--cpu-tensor-alignment",
         type=int,
         default=Tensor.AllocationQuantum,
-        help=("Controls the allocation byte alignment of cpu tensors (default: %(default)s)"),
+        help=(
+            "Controls the allocation byte alignment of cpu tensors including Ethos-U Custom operator inputs and outputs"
+            " (default: %(default)s)"
+        ),
     )
     args = parser.parse_args(args=args)

@@ -362,11 +361,11 @@ def main(args=None):
     else:
         force_block_config = None

-    alignment = args.allocation_alignment
-    if alignment < 16:
-        parser.error("the following argument needs to be greater or equal to 16: ALLOCATION_ALIGNMENT")
-    if alignment & (alignment - 1) != 0:
-        parser.error("the following argument needs to be a power of 2: ALLOCATION_ALIGNMENT")
+    if args.cpu_tensor_alignment < 16 or args.cpu_tensor_alignment & (args.cpu_tensor_alignment - 1) != 0:
+        parser.error(
+            "Invalid argument to --cpu-tensor-alignment = {} (must be greater than or equal to 16 and a power of 2)"
+            "".format(args.cpu_tensor_alignment)
+        )

     arch = architecture_features.ArchitectureFeatures(
         vela_config_files=args.config,
@@ -390,12 +389,11 @@ def main(args=None):
         verbose_high_level_command_stream=args.verbose_high_level_command_stream,
         verbose_register_command_stream=args.verbose_register_command_stream,
         verbose_operators=args.verbose_operators,
-        show_minimum_possible_allocation=args.show_minimum_possible_allocation,
         show_cpu_operations=args.show_cpu_operations,
         tensor_allocator=args.tensor_allocator,
         timing=args.timing,
         output_dir=args.output_dir,
-        allocation_alignment=alignment,
+        cpu_tensor_alignment=args.cpu_tensor_alignment,
     )

     scheduler_options = scheduler.SchedulerOptions(
--
cgit v1.2.1
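For reference, a stand-alone sketch (not vela's actual parser) of how the renamed option and the new validation from the final hunks fit together; the example invocation and the 32-byte value are illustrative only:

```python
import argparse

# Stand-alone sketch of --cpu-tensor-alignment plus the validation added in
# vela.py above; roughly `vela network.tflite --cpu-tensor-alignment=32`.
parser = argparse.ArgumentParser(prog="vela")
parser.add_argument(
    "--cpu-tensor-alignment",
    type=int,
    default=16,  # Tensor.AllocationQuantum in vela
    help="Controls the allocation byte alignment of cpu tensors (default: %(default)s)",
)
args = parser.parse_args(["--cpu-tensor-alignment", "32"])

# Same check as the new code: reject values below 16 or not a power of two.
if args.cpu_tensor_alignment < 16 or args.cpu_tensor_alignment & (args.cpu_tensor_alignment - 1) != 0:
    parser.error(
        "Invalid argument to --cpu-tensor-alignment = {} (must be greater than or equal to 16 and a power of 2)"
        "".format(args.cpu_tensor_alignment)
    )

print(args.cpu_tensor_alignment)  # 32
```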