path: root/ethosu/vela/compiler_driver.py
author     Louis Verhaard <louis.verhaard@arm.com>    2020-09-15 14:05:38 +0200
committer  Louis Verhaard <louis.verhaard@arm.com>    2020-09-25 08:32:55 +0200
commit     0b9c9a3873da3d368e184308f4f9a4c202e3fb67 (patch)
tree       f057bfeacd6c6323cd0e7f33cd7ec33f941b4f8d /ethosu/vela/compiler_driver.py
parent     8854dc9088586e1bb0bf2640b2289903dfa3c822 (diff)
download   ethos-u-vela-0b9c9a3873da3d368e184308f4f9a4c202e3fb67.tar.gz
MLBEDSW-2337: Intermediate feature maps in fast storage
Attempts to use fast storage for feature maps used in between cascaded passes. This is only relevant for system configurations where feature maps are by default not placed in SRAM, but there is SRAM for fast storage.

Change-Id: I207b7cf32cfcb5bea3e6b93c2da1161c4af5221d
Signed-off-by: Louis Verhaard <louis.verhaard@arm.com>
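The bisection search this patch introduces can be exercised on its own. Below is a minimal sketch that drives next_sram_factor (added to compiler_driver.py by this commit) with a stand-in allocator; the fits() predicate, its 0.85 threshold and the print call are illustrative assumptions, not part of the patch.

    # Stand-alone driver for the bisection helper introduced by this commit.
    # fits() is a hypothetical stand-in for tensor_allocation.allocate_tensors:
    # it simply pretends that anything at or below 85% of SRAM can be allocated.
    from ethosu.vela.compiler_driver import next_sram_factor  # module-level helper after this patch

    def fits(sram_factor):
        return sram_factor <= 0.85

    alloc_results = []
    while True:
        sram_factor, dry_test = next_sram_factor(alloc_results)
        if sram_factor is None:
            break
        success = fits(sram_factor)
        alloc_results.append(success)
        print("factor=%.4f dry_test=%s success=%s" % (sram_factor, dry_test, success))

With this threshold the factors tried are 1.0, 0.7, 0.85, 0.925, 0.8875, ... closing in on 0.85 from above; once MAX_ITERATIONS results have been collected, the best known good factor (here 0.85) is tried one more time with dry_test=False so that allocation is the one that is kept.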
Diffstat (limited to 'ethosu/vela/compiler_driver.py')
-rw-r--r--   ethosu/vela/compiler_driver.py   139
1 file changed, 104 insertions(+), 35 deletions(-)
diff --git a/ethosu/vela/compiler_driver.py b/ethosu/vela/compiler_driver.py
index 92fe5840..6c1142d1 100644
--- a/ethosu/vela/compiler_driver.py
+++ b/ethosu/vela/compiler_driver.py
@@ -88,6 +88,45 @@ Note the difference between ArchitectureFeatures and CompilerOptions
     __repr__ = __str__
+def next_sram_factor(alloc_results):
+    # Bisects to find the max SRAM usage that successfully can be fitted with the tensor allocator.
+    # Returns tuple (factor, dry_test), with factor is None (stop) or 0 <= factor <= 1 (next SRAM factor to try),
+    # dry_test is True while still bisecting.
+    upper = 1.0
+    lower = 0.7
+    MAX_ITERATIONS = 8
+    if len(alloc_results) == 0:
+        # First iteration, try max SRAM, keep the result if it succeeds
+        return (upper, False)
+    elif len(alloc_results) == 1:
+        if alloc_results[0]:
+            # The allocator succeeded at first try; stop
+            return (None, False)
+        else:
+            # Start bisecting, try lowerbound SRAM
+            return (lower, True)
+    elif len(alloc_results) > MAX_ITERATIONS:
+        # Stop
+        return (None, False)
+    if not alloc_results[1]:
+        # Allocation at lower failed; search interval 0 - lower
+        upper = lower
+        lower = 0
+    best = lower
+    for success in alloc_results[2:]:
+        middle = (lower + upper) / 2
+        if success:
+            best = max(best, middle)
+            lower = middle
+        else:
+            upper = middle
+    if len(alloc_results) == MAX_ITERATIONS:
+        # Done bisecting; repeat the best match, but not as dry test
+        return (best, False)
+    # Next try; run only as dry test
+    return ((lower + upper) / 2, True)
+
+
 def compiler_driver(nng, arch, options, scheduler_options):
     assert verify_graph_health(nng)
     nng = graph_optimiser.optimise_graph_a(nng, arch, options.verbose_graph)
@@ -156,11 +195,11 @@ def compiler_driver(nng, arch, options, scheduler_options):
             arch,
             permanent_storage,
             set((MemType.Permanent_NPU,)),
-            scheduler_options.use_ifm_ofm_overlap,
-            TensorAllocator.LinearAlloc,
-            options.verbose_allocation,
-            options.show_minimum_possible_allocation,
-            lr_graph_flash,
+            use_ifm_ofm_overlap=scheduler_options.use_ifm_ofm_overlap,
+            tensor_allocator=TensorAllocator.LinearAlloc,
+            verbose_allocation=options.verbose_allocation,
+            show_minimum_possible_allocation=options.show_minimum_possible_allocation,
+            lr_graph=lr_graph_flash,
         )
     # Allocate all non-constant tensors to the root, i.e. Cpu, subgraph. This step
@@ -175,28 +214,68 @@ def compiler_driver(nng, arch, options, scheduler_options):
     root_sg = nng.get_root_subgraph()
     alloc_list = []
-    if arch.feature_map_storage_mem_area == arch.fast_storage_mem_area:
+    feature_maps_in_fast_storage = arch.feature_map_storage_mem_area == arch.fast_storage_mem_area
+    if feature_maps_in_fast_storage:
         mem_alloc_scratch = (arch.feature_map_storage_mem_area, set((MemType.Scratch, MemType.Scratch_fast)))
         alloc_list.append(mem_alloc_scratch)
     else:
-        mem_alloc_scratch = (arch.feature_map_storage_mem_area, set((MemType.Scratch,)))
         mem_alloc_scratch_fast = (arch.fast_storage_mem_area, set((MemType.Scratch_fast,)))
-        alloc_list.append(mem_alloc_scratch)
+        mem_alloc_scratch = (arch.feature_map_storage_mem_area, set((MemType.Scratch,)))
+        # Order is important
         alloc_list.append(mem_alloc_scratch_fast)
+        alloc_list.append(mem_alloc_scratch)
-    for alloc in alloc_list:
-        tensor_allocation.allocate_tensors(
-            nng,
-            root_sg,
-            arch,
-            alloc[0],
-            alloc[1],
-            scheduler_options.use_ifm_ofm_overlap,
-            options.tensor_allocator,
-            options.verbose_allocation,
-            options.show_minimum_possible_allocation,
-            allocation_alignment=options.allocation_alignment,
-        )
+    for mem_area, mem_type_set in alloc_list:
+        if feature_maps_in_fast_storage or mem_area != arch.fast_storage_mem_area:
+            tensor_allocation.allocate_tensors(
+                nng,
+                root_sg,
+                arch,
+                mem_area,
+                mem_type_set,
+                use_ifm_ofm_overlap=scheduler_options.use_ifm_ofm_overlap,
+                tensor_allocator=options.tensor_allocator,
+                verbose_allocation=options.verbose_allocation,
+                show_minimum_possible_allocation=options.show_minimum_possible_allocation,
+                allocation_alignment=options.allocation_alignment,
+            )
+        else:
+            # For the case where scratch_fast != scratch: attempt to place feature maps used between
+            # cascaded passes in fast storage. Bisection is used to find the max possible usage of SRAM.
+            alloc_results = []
+            while True:
+                assert len(alloc_results) < 10, "Infinite allocator loop"
+                sram_factor, dry_test = next_sram_factor(alloc_results)
+                if sram_factor is None:
+                    break
+                # Try to move as many feature maps as possible to SRAM before allocating
+                sram_limit = sram_factor * arch.sram_size
+                for sg in nng.subgraphs:
+                    scheduler.use_fast_storage_for_feature_maps(sg, sram_limit, arch)
+                alloc_success = tensor_allocation.allocate_tensors(
+                    nng,
+                    root_sg,
+                    arch,
+                    mem_area,
+                    mem_type_set,
+                    max_size=arch.sram_size,
+                    dry_test=dry_test,
+                    use_ifm_ofm_overlap=scheduler_options.use_ifm_ofm_overlap,
+                    tensor_allocator=options.tensor_allocator,
+                    verbose_allocation=options.verbose_allocation,
+                    show_minimum_possible_allocation=options.show_minimum_possible_allocation,
+                    allocation_alignment=options.allocation_alignment,
+                )
+                if dry_test or not alloc_success:
+                    for sg in nng.subgraphs:
+                        scheduler.undo_use_fast_storage(sg, arch)
+                alloc_results.append(alloc_success)
+            if not alloc_results[-1]:
+                raise VelaError(
+                    "Sram limit {} bytes, has been exceeded by the scratch fast tensor. "
+                    "Increasing the value of --weight-estimation-scaling may help to resolve the issue. "
+                    "See OPTIONS.md for more information.".format(arch.sram_size)
+                )
     # Generate command streams and serialise Npu-ops into tensors
     for sg in nng.subgraphs:
@@ -213,16 +292,6 @@ def compiler_driver(nng, arch, options, scheduler_options):
     npu_serialisation.rewrite_npu_call_ops(nng, root_sg, arch)
-    if root_sg is not None and (arch.feature_map_storage_mem_area != arch.fast_storage_mem_area):
-        if root_sg.memory_used_per_type.get(MemType.Scratch_fast, 0) > arch.sram_size:
-            raise VelaError(
-                "Sram limit {} bytes, has been exceeded by the scratch fast tensor {} bytes. "
-                "Increasing the value of --weight-estimation-scaling may help to resolve the issue. "
-                "See OPTIONS.md for more information.".format(
-                    arch.sram_size, root_sg.memory_used_per_type.get(MemType.Scratch_fast, 0)
-                )
-            )
-
     # Allocate all Cpu constant tensors, this is done last because the Npu-ops
     # have to be serialized into flash and scratch tensors first
     tensor_allocation.allocate_tensors(
@@ -231,10 +300,10 @@ def compiler_driver(nng, arch, options, scheduler_options):
         arch,
         permanent_storage,
         set((MemType.Permanent_CPU,)),
-        scheduler_options.use_ifm_ofm_overlap,
-        TensorAllocator.LinearAlloc,
-        options.verbose_allocation,
-        options.show_minimum_possible_allocation,
+        use_ifm_ofm_overlap=scheduler_options.use_ifm_ofm_overlap,
+        tensor_allocator=TensorAllocator.LinearAlloc,
+        verbose_allocation=options.verbose_allocation,
+        show_minimum_possible_allocation=options.show_minimum_possible_allocation,
         allocation_alignment=options.allocation_alignment,
     )
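Read in isolation, the new else-branch in compiler_driver follows a dry-run/undo pattern. The sketch below mirrors that control flow with hypothetical stand-ins (move_feature_maps_to_sram, undo_move, try_allocate) for scheduler.use_fast_storage_for_feature_maps, scheduler.undo_use_fast_storage and tensor_allocation.allocate_tensors; it only illustrates the pattern under those assumptions and is not the real Vela code.

    # Hypothetical stand-ins, NOT the real Vela APIs: they only show how dry-run
    # allocations drive the bisection and are rolled back.
    from ethosu.vela.compiler_driver import next_sram_factor  # added by this commit

    SRAM_SIZE = 512 * 1024  # example SRAM size in bytes (assumption)

    def move_feature_maps_to_sram(limit_bytes):
        pass  # stands in for scheduler.use_fast_storage_for_feature_maps

    def undo_move():
        pass  # stands in for scheduler.undo_use_fast_storage

    def try_allocate(dry_test):
        return True  # stands in for tensor_allocation.allocate_tensors; pretend it always fits

    alloc_results = []
    while True:
        sram_factor, dry_test = next_sram_factor(alloc_results)
        if sram_factor is None:
            break
        move_feature_maps_to_sram(sram_factor * SRAM_SIZE)
        success = try_allocate(dry_test)
        if dry_test or not success:
            # A dry test never keeps its allocation, and a failed real attempt is undone too.
            undo_move()
        alloc_results.append(success)

    if not alloc_results[-1]:
        raise RuntimeError("scratch fast tensor exceeds the SRAM limit")

The key point of the pattern: only the first attempt at full SRAM and the final attempt at the best surviving factor run with dry_test=False; every intermediate bisection step is a dry test whose feature-map placement is measured and then undone.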