diff options
author | Patrik Gustavsson <patrik.gustavsson@arm.com> | 2020-05-27 09:15:11 +0200 |
---|---|---|
committer | Patrik Gustavsson <patrik.gustavsson@arm.com> | 2020-06-25 11:42:56 +0200 |
commit | eca2e95e1fea150d8a942f8b5f0a4d9d7aefebc1 (patch) | |
tree | 438b385f1ded3c18c3b84d2204a57c39be6be34a /ethosu/vela/compiler_driver.py | |
parent | eec4e50e19cb5522640eae5fd4566917dc2a7b9d (diff) | |
download | ethos-u-vela-eca2e95e1fea150d8a942f8b5f0a4d9d7aefebc1.tar.gz |
MLBEDSW-2306 Added more supported mem-cfgs
Additional supported memory configurations:
-Permanent_storage = DRAM
-Tensor arena either in DRAM or SRAM
Signed-off-by: Patrik Gustavsson <patrik.gustavsson@arm.com>
Change-Id: I20beb7151e306bfdba540e7c0b2a7b478b4d94e1
Diffstat (limited to 'ethosu/vela/compiler_driver.py')
-rw-r--r-- | ethosu/vela/compiler_driver.py | 55 |
1 file changed, 37 insertions, 18 deletions
diff --git a/ethosu/vela/compiler_driver.py b/ethosu/vela/compiler_driver.py index 9c345dba..e495f1ce 100644 --- a/ethosu/vela/compiler_driver.py +++ b/ethosu/vela/compiler_driver.py @@ -33,7 +33,7 @@ from . import weight_compressor from .nn_graph import PassPlacement from .nn_graph import TensorAllocator from .rewrite_graph import verify_graph_health -from .tensor import MemArea +from .tensor import MemType class CompilerOptions: @@ -120,9 +120,6 @@ def compiler_driver(nng, arch, options, scheduler_options): # block config, and calc and pack the scales and biases weight_compressor.update_pass_weight_and_scale_tensors(nng, arch) - # Memory area for all non-constant tensors (Cpu and Npu) - non_const_mem_area = MemArea.Sram - # LiveRanges for constant tensors for all Npu subgraphs permanent_storage = arch.permanent_storage_mem_area lr_graph_flash = live_range.LiveRangeGraph() @@ -135,7 +132,11 @@ def compiler_driver(nng, arch, options, scheduler_options): for sg in nng.subgraphs: if sg.placement == PassPlacement.Npu: lr_graph_flash = live_range.extract_live_ranges_from_cascaded_passes( - sg, permanent_storage, ignore_subgraph_input_output_tensors=True, lr_graph=lr_graph_flash + sg, + permanent_storage, + MemType.Permanent_NPU, + ignore_subgraph_input_output_tensors=True, + lr_graph=lr_graph_flash, ) if len(nng.subgraphs) > 1: @@ -143,12 +144,12 @@ def compiler_driver(nng, arch, options, scheduler_options): # processed first during serialization into tensors first_npu_sg = nng.subgraphs[1] assert first_npu_sg.placement == PassPlacement.Npu - # Use the linear allocator for constant tensors tensor_allocation.allocate_tensors( nng, first_npu_sg, arch, permanent_storage, + set((MemType.Permanent_NPU,)), scheduler_options.use_ifm_ofm_overlap, TensorAllocator.LinearAlloc, options.verbose_allocation, @@ -159,19 +160,36 @@ def compiler_driver(nng, arch, options, scheduler_options): # Allocate all non-constant tensors to the root, i.e. Cpu, subgraph. 
This step # will start at the root subgraph's input and traverse from top to bottom. When # it comes across an Npu-op it will extract live ranges for it's corresponding - # Npu subgraph and add them to the root's live range graph. Finally, all of the - # non-constant tensors are allocated together + # Npu subgraph and add them to the root's live range graph. + # The non-constant tensors are stored either in arch.feature_map_storage_mem_area or + # arch.fast_storage_mem_area. + # When these memory areas are the same, all non-constant tensors are allocated together. + # Otherwise they are allocated separately. + root_sg = nng.get_root_subgraph() - tensor_allocation.allocate_tensors( - nng, - root_sg, - arch, - non_const_mem_area, - scheduler_options.use_ifm_ofm_overlap, - options.tensor_allocator, - options.verbose_allocation, - options.show_minimum_possible_allocation, - ) + + alloc_list = [] + if arch.feature_map_storage_mem_area == arch.fast_storage_mem_area: + mem_alloc_scratch = (arch.feature_map_storage_mem_area, set((MemType.Scratch, MemType.Scratch_fast))) + alloc_list.append(mem_alloc_scratch) + else: + mem_alloc_scratch = (arch.feature_map_storage_mem_area, set((MemType.Scratch,))) + mem_alloc_scratch_fast = (arch.fast_storage_mem_area, set((MemType.Scratch_fast,))) + alloc_list.append(mem_alloc_scratch) + alloc_list.append(mem_alloc_scratch_fast) + + for alloc in alloc_list: + tensor_allocation.allocate_tensors( + nng, + root_sg, + arch, + alloc[0], + alloc[1], + scheduler_options.use_ifm_ofm_overlap, + options.tensor_allocator, + options.verbose_allocation, + options.show_minimum_possible_allocation, + ) # Generate command streams and serialise Npu-ops into tensors for sg in nng.subgraphs: @@ -194,6 +212,7 @@ def compiler_driver(nng, arch, options, scheduler_options): root_sg, arch, permanent_storage, + set((MemType.Permanent_CPU,)), scheduler_options.use_ifm_ofm_overlap, TensorAllocator.LinearAlloc, options.verbose_allocation, |