diff options
Diffstat (limited to 'ethosu/vela/compiler_driver.py')
-rw-r--r-- | ethosu/vela/compiler_driver.py | 139 |
1 files changed, 104 insertions, 35 deletions
def next_sram_factor(alloc_results, *, upper=1.0, lower=0.7, max_iterations=8):
    """Pick the next SRAM usage factor to try while bisecting for the largest one that fits.

    The function is stateless: it replays the whole search from the recorded
    outcomes every time it is called.

    Args:
        alloc_results: Outcomes (truthy means the allocation succeeded) of all
            attempts made so far, in the order they were made. Entry 0 is the
            attempt at ``upper``, entry 1 the attempt at ``lower`` and entries
            2 onwards the attempts at successive interval midpoints.
        upper: Factor used for the very first attempt and as the upper end of
            the search interval.
        lower: Factor used for the second attempt. If that attempt fails, the
            search interval becomes ``0 .. lower`` instead of ``lower .. upper``.
        max_iterations: Number of recorded attempts after which the best
            factor found is returned for one final, non-dry attempt.

    Returns:
        A tuple ``(factor, dry_test)``. ``factor`` is ``None`` when the search
        is over, otherwise the next factor to try (``0 <= factor <= 1`` with
        the default bounds). ``dry_test`` is True while still bisecting and
        False for the first attempt and for the final attempt at the best
        factor found.
    """
    attempts = len(alloc_results)
    if attempts == 0:
        # First iteration: try the maximum and keep the result if it succeeds.
        return (upper, False)
    if attempts == 1:
        if alloc_results[0]:
            # The first attempt succeeded; nothing left to search for.
            return (None, False)
        # Start bisecting with the lower bound.
        return (lower, True)
    if attempts > max_iterations:
        # The final attempt has already been made; stop.
        return (None, False)

    if alloc_results[1]:
        low, high = lower, upper
    else:
        # Even the lower bound failed; search the interval 0 .. lower instead.
        low, high = 0, lower

    # Replay every recorded midpoint attempt to rebuild the current interval.
    best = low
    for success in alloc_results[2:]:
        middle = (low + high) / 2
        if success:
            best = max(best, middle)
            low = middle
        else:
            high = middle

    if attempts == max_iterations:
        # Done bisecting: repeat the best match, this time not as a dry test.
        return (best, False)
    # Next midpoint, still only a dry test.
    return ((low + high) / 2, True)
use_ifm_ofm_overlap=scheduler_options.use_ifm_ofm_overlap, + tensor_allocator=TensorAllocator.LinearAlloc, + verbose_allocation=options.verbose_allocation, + show_minimum_possible_allocation=options.show_minimum_possible_allocation, + lr_graph=lr_graph_flash, ) # Allocate all non-constant tensors to the root, i.e. Cpu, subgraph. This step @@ -175,28 +214,68 @@ def compiler_driver(nng, arch, options, scheduler_options): root_sg = nng.get_root_subgraph() alloc_list = [] - if arch.feature_map_storage_mem_area == arch.fast_storage_mem_area: + feature_maps_in_fast_storage = arch.feature_map_storage_mem_area == arch.fast_storage_mem_area + if feature_maps_in_fast_storage: mem_alloc_scratch = (arch.feature_map_storage_mem_area, set((MemType.Scratch, MemType.Scratch_fast))) alloc_list.append(mem_alloc_scratch) else: - mem_alloc_scratch = (arch.feature_map_storage_mem_area, set((MemType.Scratch,))) mem_alloc_scratch_fast = (arch.fast_storage_mem_area, set((MemType.Scratch_fast,))) - alloc_list.append(mem_alloc_scratch) + mem_alloc_scratch = (arch.feature_map_storage_mem_area, set((MemType.Scratch,))) + # Order is important alloc_list.append(mem_alloc_scratch_fast) + alloc_list.append(mem_alloc_scratch) - for alloc in alloc_list: - tensor_allocation.allocate_tensors( - nng, - root_sg, - arch, - alloc[0], - alloc[1], - scheduler_options.use_ifm_ofm_overlap, - options.tensor_allocator, - options.verbose_allocation, - options.show_minimum_possible_allocation, - allocation_alignment=options.allocation_alignment, - ) + for mem_area, mem_type_set in alloc_list: + if feature_maps_in_fast_storage or mem_area != arch.fast_storage_mem_area: + tensor_allocation.allocate_tensors( + nng, + root_sg, + arch, + mem_area, + mem_type_set, + use_ifm_ofm_overlap=scheduler_options.use_ifm_ofm_overlap, + tensor_allocator=options.tensor_allocator, + verbose_allocation=options.verbose_allocation, + show_minimum_possible_allocation=options.show_minimum_possible_allocation, + 
allocation_alignment=options.allocation_alignment, + ) + else: + # For the case where scratch_fast != scratch: attempt to place feature maps used between + # cascaded passes in fast storage. Bisection is used to find the max possible usage of SRAM. + alloc_results = [] + while True: + assert len(alloc_results) < 10, "Infinite allocator loop" + sram_factor, dry_test = next_sram_factor(alloc_results) + if sram_factor is None: + break + # Try to move as many feature maps as possible to SRAM before allocating + sram_limit = sram_factor * arch.sram_size + for sg in nng.subgraphs: + scheduler.use_fast_storage_for_feature_maps(sg, sram_limit, arch) + alloc_success = tensor_allocation.allocate_tensors( + nng, + root_sg, + arch, + mem_area, + mem_type_set, + max_size=arch.sram_size, + dry_test=dry_test, + use_ifm_ofm_overlap=scheduler_options.use_ifm_ofm_overlap, + tensor_allocator=options.tensor_allocator, + verbose_allocation=options.verbose_allocation, + show_minimum_possible_allocation=options.show_minimum_possible_allocation, + allocation_alignment=options.allocation_alignment, + ) + if dry_test or not alloc_success: + for sg in nng.subgraphs: + scheduler.undo_use_fast_storage(sg, arch) + alloc_results.append(alloc_success) + if not alloc_results[-1]: + raise VelaError( + "Sram limit {} bytes, has been exceeded by the scratch fast tensor. " + "Increasing the value of --weight-estimation-scaling may help to resolve the issue. 
" + "See OPTIONS.md for more information.".format(arch.sram_size) + ) # Generate command streams and serialise Npu-ops into tensors for sg in nng.subgraphs: @@ -213,16 +292,6 @@ def compiler_driver(nng, arch, options, scheduler_options): npu_serialisation.rewrite_npu_call_ops(nng, root_sg, arch) - if root_sg is not None and (arch.feature_map_storage_mem_area != arch.fast_storage_mem_area): - if root_sg.memory_used_per_type.get(MemType.Scratch_fast, 0) > arch.sram_size: - raise VelaError( - "Sram limit {} bytes, has been exceeded by the scratch fast tensor {} bytes. " - "Increasing the value of --weight-estimation-scaling may help to resolve the issue. " - "See OPTIONS.md for more information.".format( - arch.sram_size, root_sg.memory_used_per_type.get(MemType.Scratch_fast, 0) - ) - ) - # Allocate all Cpu constant tensors, this is done last because the Npu-ops # have to be serialized into flash and scratch tensors first tensor_allocation.allocate_tensors( @@ -231,10 +300,10 @@ def compiler_driver(nng, arch, options, scheduler_options): arch, permanent_storage, set((MemType.Permanent_CPU,)), - scheduler_options.use_ifm_ofm_overlap, - TensorAllocator.LinearAlloc, - options.verbose_allocation, - options.show_minimum_possible_allocation, + use_ifm_ofm_overlap=scheduler_options.use_ifm_ofm_overlap, + tensor_allocator=TensorAllocator.LinearAlloc, + verbose_allocation=options.verbose_allocation, + show_minimum_possible_allocation=options.show_minimum_possible_allocation, allocation_alignment=options.allocation_alignment, ) |