aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorTim Hall <tim.hall@arm.com>2020-11-01 21:27:19 +0000
committerTim Hall <tim.hall@arm.com>2020-11-20 12:55:47 +0000
commitb9b515ca996e1ec5835a2c35033cc6f818f71f12 (patch)
tree25d9810755f8fff896cc1d523d0fad90651bc399
parent1bd531dec0b4eb745fb8856d14c1aba2b8a73026 (diff)
downloadethos-u-vela-b9b515ca996e1ec5835a2c35033cc6f818f71f12.tar.gz
vela: Remove and change CLI options
- Removed unused --show-minimum-possible-allocation - Changed --allocation-alignment to --cpu-tensor-alignment Signed-off-by: Tim Hall <tim.hall@arm.com> Change-Id: I00e367c3190aeea08a3f136332711e9accc85ba3
-rw-r--r--OPTIONS.md21
-rw-r--r--ethosu/vela/compiler_driver.py16
-rw-r--r--ethosu/vela/live_range.py8
-rw-r--r--ethosu/vela/tensor_allocation.py20
-rw-r--r--ethosu/vela/vela.py28
5 files changed, 36 insertions, 57 deletions
diff --git a/OPTIONS.md b/OPTIONS.md
index baf6c5a..10bfea5 100644
--- a/OPTIONS.md
+++ b/OPTIONS.md
@@ -280,11 +280,14 @@ SRAM, albeit at the cost of performance (inference speed).
vela network.tflite --weight-estimation-scaling=1.2
```
-### Allocation alignment
-
-Controls the allocation byte alignment. Only affects CPU tensors, NPU tensors
-will remain 16-byte aligned independent of this option. Alignment has to be a
-power of two and greater or equal to 16.
+### CPU Tensor Alignment
+
+Controls the allocation byte alignment. This affects all CPU tensors including
+Ethos-U Custom operator inputs and outputs. In this instance a CPU tensor is
+defined as any tensor that is explicitly listed in the resulting `.tflite` file.
+The Ethos-U NPU internal tensors will remain 16-byte aligned independent of this
+option; these tensors are contained within the command stream. Alignment has to
+be a power of two and greater than or equal to 16.
**Type: Integer**
**Default: 16**
@@ -305,14 +308,6 @@ Prints a summary of all the subgraphs and their inputs and outputs.
vela network.tflite --show-subgraph-io-summary
```
-### Show Minimum Possible Allocation
-
-Prints the minimum possible allocation.
-
-```bash
-vela network.tflite --show-minimum-possible-allocation
-```
-
### Show Cpu Operations
Show the operations that fall back to the CPU.
diff --git a/ethosu/vela/compiler_driver.py b/ethosu/vela/compiler_driver.py
index 0739133..a2b20e4 100644
--- a/ethosu/vela/compiler_driver.py
+++ b/ethosu/vela/compiler_driver.py
@@ -61,12 +61,11 @@ Note the difference between ArchitectureFeatures and CompilerOptions
verbose_high_level_command_stream=False,
verbose_register_command_stream=False,
verbose_operators=False,
- show_minimum_possible_allocation=False,
show_cpu_operations=False,
tensor_allocator=TensorAllocator.Greedy,
timing=False,
output_dir="outputs",
- allocation_alignment=Tensor.AllocationQuantum,
+ cpu_tensor_alignment=Tensor.AllocationQuantum,
):
self.verbose_graph = verbose_graph
@@ -78,12 +77,11 @@ Note the difference between ArchitectureFeatures and CompilerOptions
self.verbose_high_level_command_stream = verbose_high_level_command_stream
self.verbose_register_command_stream = verbose_register_command_stream
self.verbose_operators = verbose_operators
- self.show_minimum_possible_allocation = show_minimum_possible_allocation
self.show_cpu_operations = show_cpu_operations
self.tensor_allocator = tensor_allocator
self.timing = timing
self.output_dir = output_dir
- self.allocation_alignment = allocation_alignment
+ self.cpu_tensor_alignment = cpu_tensor_alignment
def __str__(self):
return type(self).__name__ + ": " + str(self.__dict__)
@@ -209,7 +207,6 @@ def compiler_driver(nng, arch, options, scheduler_options):
set((MemType.Permanent_NPU,)),
tensor_allocator=TensorAllocator.LinearAlloc,
verbose_allocation=options.verbose_allocation,
- show_minimum_possible_allocation=options.show_minimum_possible_allocation,
lr_graph=lr_graph_flash,
)
@@ -259,8 +256,7 @@ def compiler_driver(nng, arch, options, scheduler_options):
dry_test=dry_test,
tensor_allocator=options.tensor_allocator,
verbose_allocation=options.verbose_allocation,
- show_minimum_possible_allocation=options.show_minimum_possible_allocation,
- allocation_alignment=options.allocation_alignment,
+ cpu_tensor_alignment=options.cpu_tensor_alignment,
)
if dry_test or not alloc_success:
for sg in nng.subgraphs:
@@ -281,8 +277,7 @@ def compiler_driver(nng, arch, options, scheduler_options):
mem_type_set,
tensor_allocator=options.tensor_allocator,
verbose_allocation=options.verbose_allocation,
- show_minimum_possible_allocation=options.show_minimum_possible_allocation,
- allocation_alignment=options.allocation_alignment,
+ cpu_tensor_alignment=options.cpu_tensor_alignment,
)
# Generate command streams and serialise Npu-ops into tensors
@@ -316,8 +311,7 @@ def compiler_driver(nng, arch, options, scheduler_options):
set((MemType.Permanent_CPU,)),
tensor_allocator=TensorAllocator.LinearAlloc,
verbose_allocation=options.verbose_allocation,
- show_minimum_possible_allocation=options.show_minimum_possible_allocation,
- allocation_alignment=options.allocation_alignment,
+ cpu_tensor_alignment=options.cpu_tensor_alignment,
)
npu_performance.calc_performance_for_network(nng, arch)
diff --git a/ethosu/vela/live_range.py b/ethosu/vela/live_range.py
index a29cafe..dbc0ce4 100644
--- a/ethosu/vela/live_range.py
+++ b/ethosu/vela/live_range.py
@@ -236,7 +236,7 @@ def extract_live_ranges_from_cascaded_passes(
target_mem_type_set,
ignore_subgraph_input_output_tensors=False,
lr_graph=None,
- allocation_alignment=Tensor.AllocationQuantum,
+ cpu_tensor_alignment=Tensor.AllocationQuantum,
):
if lr_graph is None:
lr_graph = LiveRangeGraph()
@@ -261,7 +261,7 @@ def extract_live_ranges_from_cascaded_passes(
for tens in cps.inputs:
if tensor_should_be_ignored(lr_graph, tens, target_mem_area, target_mem_type_set):
continue
- rng = lr_graph.get_or_create_range(tens, allocation_alignment)
+ rng = lr_graph.get_or_create_range(tens, cpu_tensor_alignment)
rng.mark_usage(time_for_pass)
cps_primary_op = cps.passes[0].primary_op
@@ -285,7 +285,7 @@ def extract_live_ranges_from_cascaded_passes(
for tens in cps.intermediates + cps.outputs:
if tensor_should_be_ignored(lr_graph, tens, target_mem_area, target_mem_type_set):
continue
- rng = lr_graph.get_or_create_range(tens, allocation_alignment)
+ rng = lr_graph.get_or_create_range(tens, cpu_tensor_alignment)
rng.mark_usage(time_for_pass)
lr_graph.current_time += 2
@@ -298,7 +298,7 @@ def extract_live_ranges_from_cascaded_passes(
for tens in sg.output_tensors:
if tensor_should_be_ignored(lr_graph, tens, target_mem_area, target_mem_type_set):
continue
- rng = lr_graph.get_or_create_range(tens, allocation_alignment)
+ rng = lr_graph.get_or_create_range(tens, cpu_tensor_alignment)
rng.mark_usage(end_time)
# Add subgraph to set of processed subgraphs
diff --git a/ethosu/vela/tensor_allocation.py b/ethosu/vela/tensor_allocation.py
index 8329a61..d1a3372 100644
--- a/ethosu/vela/tensor_allocation.py
+++ b/ethosu/vela/tensor_allocation.py
@@ -85,7 +85,7 @@ def mark_sram_used_for_cascaded_passes(sg, lrs):
ps.sram_used = sram_used
-def print_allocation(lrs, mem_area, mem_type_set, sg, verbose_allocation, show_minimum_possible_allocation):
+def print_allocation(lrs, mem_area, mem_type_set, sg, verbose_allocation):
if verbose_allocation:
if mem_type_set == set((MemType.Permanent_NPU,)) or mem_type_set == set((MemType.Permanent_CPU,)):
print("allocation for", mem_area, "- constant tensors in", sg.placement.name, "subgraph(s)")
@@ -108,13 +108,6 @@ def print_allocation(lrs, mem_area, mem_type_set, sg, verbose_allocation, show_m
print("Memory usage: {} ({:#x}) bytes / {:.1f} KB".format(mem_usage, mem_usage, mem_usage / 1024))
print()
- if show_minimum_possible_allocation and mem_area == MemArea.Sram:
- min_possible_allocation = max(cps.sram_used for cps in sg.cascaded_passes)
- print(
- "Min possible allocation %d bytes / %.1f KB / %.1f MB"
- % (min_possible_allocation, min_possible_allocation / 1024, min_possible_allocation / 1024 / 1024)
- )
-
def allocate_tensors(
nng,
@@ -124,9 +117,8 @@ def allocate_tensors(
mem_type_set,
tensor_allocator=TensorAllocator.Greedy,
verbose_allocation=False,
- show_minimum_possible_allocation=False,
lr_graph=None,
- allocation_alignment=Tensor.AllocationQuantum,
+ cpu_tensor_alignment=Tensor.AllocationQuantum,
max_size=None,
dry_test=False,
):
@@ -138,15 +130,15 @@ def allocate_tensors(
mem_type_set,
ignore_subgraph_input_output_tensors=ignore_subgraph_input_output_tensors,
lr_graph=lr_graph,
- allocation_alignment=allocation_alignment,
+ cpu_tensor_alignment=cpu_tensor_alignment,
)
if lrs.ranges:
tens_alloc = tensor_allocator
if tens_alloc == TensorAllocator.Greedy:
- total_sz = greedy_allocate_live_ranges(sg, arch, lrs, mem_area, allocation_alignment, verbose_allocation)
+ total_sz = greedy_allocate_live_ranges(sg, arch, lrs, mem_area, cpu_tensor_alignment, verbose_allocation)
elif tens_alloc == TensorAllocator.LinearAlloc:
- total_sz = linear_allocate_live_ranges(lrs, allocation_alignment)
+ total_sz = linear_allocate_live_ranges(lrs, cpu_tensor_alignment)
else:
assert 0
alloc_ok = max_size is None or total_sz <= max_size
@@ -171,7 +163,7 @@ def allocate_tensors(
nng.total_size[mem_area] = nng.total_size.get(mem_area, 0) + sum(tens.storage_size() for tens in lrs.ranges)
nng.total_elements[mem_area] = nng.total_elements.get(mem_area, 0) + sum(tens.elements() for tens in lrs.ranges)
- print_allocation(lrs, mem_area, mem_type_set, sg, verbose_allocation, show_minimum_possible_allocation)
+ print_allocation(lrs, mem_area, mem_type_set, sg, verbose_allocation)
if mem_area == MemArea.Sram:
# Mark Sram usage for all subgraphs
diff --git a/ethosu/vela/vela.py b/ethosu/vela/vela.py
index 4f632d5..05bd9ec 100644
--- a/ethosu/vela/vela.py
+++ b/ethosu/vela/vela.py
@@ -234,10 +234,6 @@ def main(args=None):
"--verbose-register-command-stream", action="store_true", help="Verbose register command stream"
)
parser.add_argument("--verbose-operators", action="store_true", help="Verbose operator list")
-
- parser.add_argument(
- "--show-minimum-possible-allocation", action="store_true", help="Show the minimum possible allocation"
- )
parser.add_argument(
"--show-cpu-operations", action="store_true", help="Show the operations that fall back to the CPU"
)
@@ -316,8 +312,8 @@ def main(args=None):
default=architecture_features.ArchitectureFeatures.MAX_BLOCKDEP,
choices=range(0, architecture_features.ArchitectureFeatures.MAX_BLOCKDEP + 1),
help=(
- "Set the maximum value that can be used for the block dependency between npu kernel operations "
- "(default: %(default)s)"
+ "Set the maximum value that can be used for the block dependency between npu kernel operations"
+ " (default: %(default)s)"
),
)
parser.add_argument(
@@ -334,10 +330,13 @@ def main(args=None):
help=("Performs an additional scaling of weight compression scale estimate (default: %(default)s)"),
)
parser.add_argument(
- "--allocation-alignment",
+ "--cpu-tensor-alignment",
type=int,
default=Tensor.AllocationQuantum,
- help=("Controls the allocation byte alignment of cpu tensors (default: %(default)s)"),
+ help=(
+ "Controls the allocation byte alignment of cpu tensors including Ethos-U Custom operator inputs and outputs"
+ " (default: %(default)s)"
+ ),
)
args = parser.parse_args(args=args)
@@ -362,11 +361,11 @@ def main(args=None):
else:
force_block_config = None
- alignment = args.allocation_alignment
- if alignment < 16:
- parser.error("the following argument needs to be greater or equal to 16: ALLOCATION_ALIGNMENT")
- if alignment & (alignment - 1) != 0:
- parser.error("the following argument needs to be a power of 2: ALLOCATION_ALIGNMENT")
+ if args.cpu_tensor_alignment < 16 or args.cpu_tensor_alignment & (args.cpu_tensor_alignment - 1) != 0:
+ parser.error(
+ "Invalid argument to --cpu-tensor-alignment = {} (must be greater than or equal to 16 and a power of 2)"
+ "".format(args.cpu_tensor_alignment)
+ )
arch = architecture_features.ArchitectureFeatures(
vela_config_files=args.config,
@@ -390,12 +389,11 @@ def main(args=None):
verbose_high_level_command_stream=args.verbose_high_level_command_stream,
verbose_register_command_stream=args.verbose_register_command_stream,
verbose_operators=args.verbose_operators,
- show_minimum_possible_allocation=args.show_minimum_possible_allocation,
show_cpu_operations=args.show_cpu_operations,
tensor_allocator=args.tensor_allocator,
timing=args.timing,
output_dir=args.output_dir,
- allocation_alignment=alignment,
+ cpu_tensor_alignment=args.cpu_tensor_alignment,
)
scheduler_options = scheduler.SchedulerOptions(