 ethosu/vela/compiler_driver.py           | 139
 ethosu/vela/nn_graph.py                  |   2
 ethosu/vela/scheduler.py                 | 172
 ethosu/vela/tensor.py                    |   2
 ethosu/vela/tensor_allocation.py         |  10
 ethosu/vela/test/test_compiler_driver.py |  44
 6 files changed, 290 insertions(+), 79 deletions(-)
diff --git a/ethosu/vela/compiler_driver.py b/ethosu/vela/compiler_driver.py
index 92fe5840..6c1142d1 100644
--- a/ethosu/vela/compiler_driver.py
+++ b/ethosu/vela/compiler_driver.py
@@ -88,6 +88,45 @@ Note the difference between ArchitectureFeatures and CompilerOptions
__repr__ = __str__
+def next_sram_factor(alloc_results):
+ # Bisects to find the max SRAM usage that can successfully be fitted by the tensor allocator.
+ # Returns a tuple (factor, dry_test), where factor is either None (stop) or the next SRAM
+ # factor to try (0 <= factor <= 1), and dry_test is True while still bisecting.
+ upper = 1.0
+ lower = 0.7
+ MAX_ITERATIONS = 8
+ if len(alloc_results) == 0:
+ # First iteration, try max SRAM, keep the result if it succeeds
+ return (upper, False)
+ elif len(alloc_results) == 1:
+ if alloc_results[0]:
+ # The allocator succeeded on the first try; stop
+ return (None, False)
+ else:
+ # Start bisecting; try the lower-bound SRAM factor
+ return (lower, True)
+ elif len(alloc_results) > MAX_ITERATIONS:
+ # Stop
+ return (None, False)
+ if not alloc_results[1]:
+ # Allocation at lower failed; search the interval [0, lower]
+ upper = lower
+ lower = 0
+ best = lower
+ for success in alloc_results[2:]:
+ middle = (lower + upper) / 2
+ if success:
+ best = max(best, middle)
+ lower = middle
+ else:
+ upper = middle
+ if len(alloc_results) == MAX_ITERATIONS:
+ # Done bisecting; repeat the best match, but not as a dry test
+ return (best, False)
+ # Next try; run as a dry test only
+ return ((lower + upper) / 2, True)
+
+
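
A minimal sketch of how a caller can drive next_sram_factor; allocator_fits is a hypothetical
predicate standing in for a real allocation attempt (the driver loop added below follows the
same shape):

    def bisect_sram_factor(allocator_fits):
        # Feed pass/fail results back into next_sram_factor until it signals a stop.
        alloc_results = []
        best = 0.0
        while True:
            factor, dry_test = next_sram_factor(alloc_results)
            if factor is None:
                return best
            success = allocator_fits(factor)
            if success and not dry_test:
                # Only non-dry successes are kept as the final answer.
                best = factor
            alloc_results.append(success)

    # e.g. bisect_sram_factor(lambda f: f < 0.85) converges to roughly 0.845
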
def compiler_driver(nng, arch, options, scheduler_options):
assert verify_graph_health(nng)
nng = graph_optimiser.optimise_graph_a(nng, arch, options.verbose_graph)
@@ -156,11 +195,11 @@ def compiler_driver(nng, arch, options, scheduler_options):
arch,
permanent_storage,
set((MemType.Permanent_NPU,)),
- scheduler_options.use_ifm_ofm_overlap,
- TensorAllocator.LinearAlloc,
- options.verbose_allocation,
- options.show_minimum_possible_allocation,
- lr_graph_flash,
+ use_ifm_ofm_overlap=scheduler_options.use_ifm_ofm_overlap,
+ tensor_allocator=TensorAllocator.LinearAlloc,
+ verbose_allocation=options.verbose_allocation,
+ show_minimum_possible_allocation=options.show_minimum_possible_allocation,
+ lr_graph=lr_graph_flash,
)
# Allocate all non-constant tensors to the root, i.e. Cpu, subgraph. This step
@@ -175,28 +214,68 @@ def compiler_driver(nng, arch, options, scheduler_options):
root_sg = nng.get_root_subgraph()
alloc_list = []
- if arch.feature_map_storage_mem_area == arch.fast_storage_mem_area:
+ feature_maps_in_fast_storage = arch.feature_map_storage_mem_area == arch.fast_storage_mem_area
+ if feature_maps_in_fast_storage:
mem_alloc_scratch = (arch.feature_map_storage_mem_area, set((MemType.Scratch, MemType.Scratch_fast)))
alloc_list.append(mem_alloc_scratch)
else:
- mem_alloc_scratch = (arch.feature_map_storage_mem_area, set((MemType.Scratch,)))
mem_alloc_scratch_fast = (arch.fast_storage_mem_area, set((MemType.Scratch_fast,)))
- alloc_list.append(mem_alloc_scratch)
+ mem_alloc_scratch = (arch.feature_map_storage_mem_area, set((MemType.Scratch,)))
+ # Order is important: feature maps are moved into Scratch_fast before the Scratch allocation runs
alloc_list.append(mem_alloc_scratch_fast)
+ alloc_list.append(mem_alloc_scratch)
- for alloc in alloc_list:
- tensor_allocation.allocate_tensors(
- nng,
- root_sg,
- arch,
- alloc[0],
- alloc[1],
- scheduler_options.use_ifm_ofm_overlap,
- options.tensor_allocator,
- options.verbose_allocation,
- options.show_minimum_possible_allocation,
- allocation_alignment=options.allocation_alignment,
- )
+ for mem_area, mem_type_set in alloc_list:
+ if feature_maps_in_fast_storage or mem_area != arch.fast_storage_mem_area:
+ tensor_allocation.allocate_tensors(
+ nng,
+ root_sg,
+ arch,
+ mem_area,
+ mem_type_set,
+ use_ifm_ofm_overlap=scheduler_options.use_ifm_ofm_overlap,
+ tensor_allocator=options.tensor_allocator,
+ verbose_allocation=options.verbose_allocation,
+ show_minimum_possible_allocation=options.show_minimum_possible_allocation,
+ allocation_alignment=options.allocation_alignment,
+ )
+ else:
+ # For the case where scratch_fast != scratch: attempt to place feature maps used between
+ # cascaded passes in fast storage. Bisection is used to find the maximum SRAM usage that still fits.
+ alloc_results = []
+ while True:
+ assert len(alloc_results) < 10, "Infinite allocator loop"
+ sram_factor, dry_test = next_sram_factor(alloc_results)
+ if sram_factor is None:
+ break
+ # Try to move as many feature maps as possible to SRAM before allocating
+ sram_limit = sram_factor * arch.sram_size
+ for sg in nng.subgraphs:
+ scheduler.use_fast_storage_for_feature_maps(sg, sram_limit, arch)
+ alloc_success = tensor_allocation.allocate_tensors(
+ nng,
+ root_sg,
+ arch,
+ mem_area,
+ mem_type_set,
+ max_size=arch.sram_size,
+ dry_test=dry_test,
+ use_ifm_ofm_overlap=scheduler_options.use_ifm_ofm_overlap,
+ tensor_allocator=options.tensor_allocator,
+ verbose_allocation=options.verbose_allocation,
+ show_minimum_possible_allocation=options.show_minimum_possible_allocation,
+ allocation_alignment=options.allocation_alignment,
+ )
+ if dry_test or not alloc_success:
+ for sg in nng.subgraphs:
+ scheduler.undo_use_fast_storage(sg, arch)
+ alloc_results.append(alloc_success)
+ if not alloc_results[-1]:
+ raise VelaError(
+ "Sram limit {} bytes, has been exceeded by the scratch fast tensor. "
+ "Increasing the value of --weight-estimation-scaling may help to resolve the issue. "
+ "See OPTIONS.md for more information.".format(arch.sram_size)
+ )
# Generate command streams and serialise Npu-ops into tensors
for sg in nng.subgraphs:
@@ -213,16 +292,6 @@ def compiler_driver(nng, arch, options, scheduler_options):
npu_serialisation.rewrite_npu_call_ops(nng, root_sg, arch)
- if root_sg is not None and (arch.feature_map_storage_mem_area != arch.fast_storage_mem_area):
- if root_sg.memory_used_per_type.get(MemType.Scratch_fast, 0) > arch.sram_size:
- raise VelaError(
- "Sram limit {} bytes, has been exceeded by the scratch fast tensor {} bytes. "
- "Increasing the value of --weight-estimation-scaling may help to resolve the issue. "
- "See OPTIONS.md for more information.".format(
- arch.sram_size, root_sg.memory_used_per_type.get(MemType.Scratch_fast, 0)
- )
- )
-
# Allocate all Cpu constant tensors, this is done last because the Npu-ops
# have to be serialized into flash and scratch tensors first
tensor_allocation.allocate_tensors(
@@ -231,10 +300,10 @@ def compiler_driver(nng, arch, options, scheduler_options):
arch,
permanent_storage,
set((MemType.Permanent_CPU,)),
- scheduler_options.use_ifm_ofm_overlap,
- TensorAllocator.LinearAlloc,
- options.verbose_allocation,
- options.show_minimum_possible_allocation,
+ use_ifm_ofm_overlap=scheduler_options.use_ifm_ofm_overlap,
+ tensor_allocator=TensorAllocator.LinearAlloc,
+ verbose_allocation=options.verbose_allocation,
+ show_minimum_possible_allocation=options.show_minimum_possible_allocation,
allocation_alignment=options.allocation_alignment,
)
diff --git a/ethosu/vela/nn_graph.py b/ethosu/vela/nn_graph.py
index 21cd80b9..58aab611 100644
--- a/ethosu/vela/nn_graph.py
+++ b/ethosu/vela/nn_graph.py
@@ -141,6 +141,8 @@ class Subgraph:
self.placement = placement
self.command_stream_tensor = None
self.flash_tensor = None
+ # Scheduling information used locally by the scheduler
+ self.scheduling_info = {}
self.memory_used = {}
self.memory_used_per_type = {}
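
The scheduler changes further down in this patch store two entries in scheduling_info; an
illustrative sketch of its shape, with placeholder tensor/pass names:

    sg.scheduling_info = {
        # feature map -> its no-op Reshape consumers, whose outputs are moved along with it
        "feature_map_rewrites": {fm_tensor: [reshape_op]},
        # feature map -> cascaded passes whose SRAM budget it occupies (cached by _calc_tens_to_cps)
        "tens_to_cps": {fm_tensor: [cps_a, cps_b, cps_c]},
    }
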
diff --git a/ethosu/vela/scheduler.py b/ethosu/vela/scheduler.py
index 24453d8c..5c2ddabb 100644
--- a/ethosu/vela/scheduler.py
+++ b/ethosu/vela/scheduler.py
@@ -959,52 +959,66 @@ class DynamicProgrammingScheduler:
self.sg.cascaded_passes = cascaded_passes
self.sg.build_cascaded_pass_links()
- if self.options.use_nhcwb16_between_cascaded_passes:
- # Check if NHCWB16 can be used in between cascaded passes
- # (NHCWB16 within cascaded passes has been handled earlier in this function)
- if self.sg.placement == PassPlacement.Npu:
- last_op_in_subgraph = self.sg.cascaded_passes[-1].passes[-1].primary_op
- for ps in self.sg.cascaded_passes:
- if ps.placement != PassPlacement.Npu:
+ # Check if NHCWB16 and/or fast storage can be used in between cascaded passes
+ # (NHCWB16 within cascaded passes has been handled earlier in this function)
+ if self.sg.placement == PassPlacement.Npu:
+ # Dictionary tensor -> list of no-op Reshape consumers, for feature maps that are
+ # candidates to be moved to fast storage
+ fast_storage_tensor_rewrites = {}
+ last_op_in_subgraph = self.sg.cascaded_passes[-1].passes[-1].primary_op
+ for ps in self.sg.cascaded_passes:
+ if ps.placement != PassPlacement.Npu:
+ continue
+ for output in ps.outputs:
+ if output.purpose != TensorPurpose.FeatureMap or output.avoid_NHCWB16:
continue
- for output in ps.outputs:
- if output.purpose != TensorPurpose.FeatureMap or output.avoid_NHCWB16:
- continue
- use_NHCWB16 = True
- rewrites = []
- for op in output.consumer_list:
- if op is None or (op.type == "ReduceSum" and output.dtype == DataType.int32):
- use_NHCWB16 = False
- elif op.type == "Reshape":
- # Detect no-op reshapes by comparing their full input and output tensor shapes.
- inshape = full_shape(4, op.inputs[0].shape, 1)
- outshape = full_shape(4, op.outputs[0].shape, 1)
- # Using NHCWB16 format for a no-op reshape is only an option if subsequent
- # consumers do not also need to perform a reshape or if the OFM is going to
- # be processed by CPU operations. No-op reshape consumers with empty lists
- # (those that have no consumers, or null-consumers used as list terminators)
- # must use normal NHWC output.
- incompatible_consumers = [
- (
- not consumer.run_on_npu
- or consumer.type == "Reshape"
- or (consumer is last_op_in_subgraph)
- )
- for consumer in op.outputs[0].consumer_list
- if consumer is not None
- ]
- if (outshape == inshape) and incompatible_consumers and not any(incompatible_consumers):
- rewrites.append(op)
- else:
- use_NHCWB16 = False
+ use_NHCWB16 = True
+ use_fast_storage = True
+ rewrites = []
+ for op in output.consumer_list:
+ if op is None:
+ use_NHCWB16 = False
+ use_fast_storage = False
+ continue
+ if op.type == "ReduceSum" and output.dtype == DataType.int32:
+ use_NHCWB16 = False
+ elif op.type == "Reshape":
+ # Detect no-op reshapes by comparing their full input and output tensor shapes.
+ inshape = full_shape(4, op.inputs[0].shape, 1)
+ outshape = full_shape(4, op.outputs[0].shape, 1)
+ # Using NHCWB16 format for a no-op reshape is only an option if subsequent
+ # consumers do not also need to perform a reshape or if the OFM is going to
+ # be processed by CPU operations. No-op reshape consumers with empty lists
+ # (those that have no consumers, or null-consumers used as list terminators)
+ # must use normal NHWC output.
+ incompatible_consumers = [
+ (
+ not consumer.run_on_npu
+ or consumer.type == "Reshape"
+ or (consumer is last_op_in_subgraph)
+ )
+ for consumer in op.outputs[0].consumer_list
+ if consumer is not None
+ ]
+ if (outshape == inshape) and incompatible_consumers and not any(incompatible_consumers):
+ rewrites.append(op)
else:
- use_NHCWB16 &= op.run_on_npu
-
- if use_NHCWB16:
- output.set_format(TensorFormat.NHCWB16, arch)
- for rewrite_op in rewrites:
- rewrite_op.outputs[0].set_format(TensorFormat.NHCWB16, arch)
+ use_NHCWB16 = False
+ use_fast_storage = False
+ use_NHCWB16 &= op.run_on_npu
+ use_fast_storage &= op.run_on_npu
+
+ if use_fast_storage:
+ fast_storage_tensor_rewrites[output] = rewrites
+ if use_NHCWB16 and self.options.use_nhcwb16_between_cascaded_passes:
+ output.set_format(TensorFormat.NHCWB16, arch)
+ for rewrite_op in rewrites:
+ rewrite_op.outputs[0].set_format(TensorFormat.NHCWB16, arch)
+ if self.feature_maps_not_in_fast_storage:
+ # Remember feature maps that can be moved to fast storage for later use
+ # in use_fast_storage_for_feature_maps
+ self.sg.scheduling_info["feature_map_rewrites"] = fast_storage_tensor_rewrites
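
In summary, as read from the conditions above: a null consumer or a Reshape that is not a
compatible no-op disqualifies both NHCWB16 and fast storage; a ReduceSum consumer with an int32
OFM disqualifies NHCWB16 only; and any consumer that does not run on the NPU disqualifies both.
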
def schedule_passes(nng, arch, options: SchedulerOptions):
@@ -1027,3 +1041,75 @@ def schedule_passes(nng, arch, options: SchedulerOptions):
if options.verbose_schedule:
sg.print_cascaded_passes()
+
+
+def _calc_tens_to_cps(sg, tensor_rewrites):
+ # Determines, for each tensor, the list of cascaded passes whose SRAM consumption the tensor affects.
+ # Returns a dictionary tensor -> list of cascaded passes
+ # Note: if cascaded passes are A, B, C, D, and a tensor is output
+ # of A and input to D, then it also consumes SRAM in passes B and C.
+ if "tens_to_cps" in sg.scheduling_info:
+ return sg.scheduling_info["tens_to_cps"]
+ # Determine life-time of tensors
+ min_index = {}
+ max_index = {}
+ index = 0
+ cps_list = [cps for cps in sg.cascaded_passes if cps.placement == PassPlacement.Npu]
+ for cps in cps_list:
+ for tens in cps.inputs + cps.outputs:
+ if tens in tensor_rewrites:
+ min_index[tens] = min(index, min_index.get(tens, len(cps_list)))
+ max_index[tens] = index
+ index += 1
+ # Convert to affected cps-es
+ tens_to_cps = {}
+ for tens in min_index:
+ tens_to_cps[tens] = cps_list[min_index[tens] : max_index[tens] + 1]
+ sg.scheduling_info["tens_to_cps"] = tens_to_cps
+ return tens_to_cps
+
+
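
A small worked example of the lifetime computation above (pass and tensor names are
illustrative):

    # cps_list = [A, B, C, D]; tensor t is an output of A and an input to D
    # -> min_index[t] = 0, max_index[t] = 3
    # -> tens_to_cps[t] = cps_list[0:4] = [A, B, C, D]
    # i.e. t also counts against the SRAM budget of B and C, even though neither of
    #      those passes references it directly.
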
+def use_fast_storage_for_feature_maps(sg, sram_limit, arch):
+ # Attempts to use as much fast storage as possible for feature maps shared between cascaded passes.
+ tensor_rewrites = sg.scheduling_info.get("feature_map_rewrites", {})
+ tens_to_cps = _calc_tens_to_cps(sg, tensor_rewrites)
+ # Sort tensors first on life-time (shortest first), then on size (largest first)
+ tens_list = sorted([(len(tens_to_cps[tens]), -tens.storage_size(), tens.name, tens) for tens in tens_to_cps])
+ for _, _, _, tens in tens_list:
+ cps_list = tens_to_cps[tens]
+ if len(cps_list) <= 1:
+ continue
+ sz = tens.storage_size()
+ fits_in_fast_storage = all([cps.sram_used + sz <= sram_limit for cps in cps_list])
+ if fits_in_fast_storage:
+ tens.mem_area = arch.fast_storage_mem_area
+ tens.mem_type = MemType.Scratch_fast
+ tens.set_new_sub_purpose(TensorSubPurpose.Standard, None, None)
+ assert tens in tensor_rewrites
+ # Also rewrite reshapes
+ for rewrite_op in tensor_rewrites[tens]:
+ tens2 = rewrite_op.outputs[0]
+ tens2.mem_area = arch.fast_storage_mem_area
+ tens2.mem_type = MemType.Scratch_fast
+ tens2.set_new_sub_purpose(TensorSubPurpose.Standard, None, None)
+ for cps in cps_list:
+ cps.sram_used += sz
+
+
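
The sort key above orders candidates by ascending life-time, then by descending size (via the
negated storage size), with the tensor name as a deterministic tie-break; an illustrative
comparison with made-up values:

    # (lifetime, -size, name)
    # (2, -4096, "fm_a") sorts before (2, -1024, "fm_b")   # equal life-time: larger tensor first
    # (2, -1024, "fm_b") sorts before (5, -8192, "fm_c")   # shorter life-time wins regardless of size
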
+def undo_use_fast_storage(sg, arch):
+ # Undoes the effects of a previous call to use_fast_storage_for_feature_maps
+ tensor_rewrites = sg.scheduling_info.get("feature_map_rewrites", {})
+ tens_to_cps = _calc_tens_to_cps(sg, tensor_rewrites)
+ mem_area = arch.tensor_storage_mem_area[TensorPurpose.FeatureMap]
+ for tens, cps_list in tens_to_cps.items():
+ if tens.mem_type == MemType.Scratch_fast:
+ sz = tens.storage_size()
+ tens.mem_area = mem_area
+ tens.mem_type = MemType.Scratch
+ # Also undo reshapes
+ for rewrite_op in tensor_rewrites[tens]:
+ tens2 = rewrite_op.outputs[0]
+ tens2.mem_area = mem_area
+ tens2.mem_type = MemType.Scratch
+ for cps in cps_list:
+ cps.sram_used -= sz
diff --git a/ethosu/vela/tensor.py b/ethosu/vela/tensor.py
index eedbadad..c0786bfc 100644
--- a/ethosu/vela/tensor.py
+++ b/ethosu/vela/tensor.py
@@ -278,7 +278,7 @@ class TensorAddressMap:
def set_address_for_tens(cls, tens_id, mem_type, address):
# Check previous address if there is one
previous_address = cls.address_map[tens_id].get(mem_type)
- if previous_address is not None:
+ if address is not None and previous_address is not None:
assert previous_address == address, "Two different addresses cannot be assigned to the same tensor."
# Set tensor's address for memory type
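
This relaxation allows the dry-test rollback in tensor_allocation.py (below), which clears
addresses with lr.set_address(None), to reset a tensor that already holds a real address
without tripping the same-address assertion.
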
diff --git a/ethosu/vela/tensor_allocation.py b/ethosu/vela/tensor_allocation.py
index d53babc3..1efcd686 100644
--- a/ethosu/vela/tensor_allocation.py
+++ b/ethosu/vela/tensor_allocation.py
@@ -128,7 +128,10 @@ def allocate_tensors(
show_minimum_possible_allocation=False,
lr_graph=None,
allocation_alignment=Tensor.AllocationQuantum,
+ max_size=None,
+ dry_test=False,
):
+ # Allocates addresses to tensors; returns False if the tensors do not fit within max_size
ignore_subgraph_input_output_tensors = False
lrs = live_range.extract_live_ranges_from_cascaded_passes(
sg,
@@ -149,6 +152,12 @@ def allocate_tensors(
total_sz = linear_allocate_live_ranges(lrs, allocation_alignment)
else:
assert 0
+ alloc_ok = max_size is None or total_sz <= max_size
+ if dry_test or not alloc_ok:
+ # Dry test or allocation failed; undo allocation
+ for lr in lrs.ranges.values():
+ lr.set_address(None)
+ return alloc_ok
if sg.memory_used.get(mem_area, 0) == 0:
sg.memory_used[mem_area] = total_sz
@@ -179,3 +188,4 @@ def allocate_tensors(
nng.bits_per_element[mem_area] = nng.total_size[mem_area] * 8 / nng.total_elements[mem_area]
except ZeroDivisionError:
nng.bits_per_element[mem_area] = 0.0
+ return True
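
A minimal sketch of the probe/commit pattern the new max_size and dry_test parameters enable,
mirroring the loop in compiler_driver; the remaining keyword arguments are omitted here on the
assumption that their defaults are acceptable:

    # Probe: addresses are assigned, compared against max_size, then rolled back.
    fits = allocate_tensors(nng, sg, arch, mem_area, mem_type_set,
                            max_size=arch.sram_size, dry_test=True)
    if fits:
        # Commit: the same call without dry_test keeps the addresses.
        allocate_tensors(nng, sg, arch, mem_area, mem_type_set,
                         max_size=arch.sram_size, dry_test=False)
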
diff --git a/ethosu/vela/test/test_compiler_driver.py b/ethosu/vela/test/test_compiler_driver.py
new file mode 100644
index 00000000..56a90c47
--- /dev/null
+++ b/ethosu/vela/test/test_compiler_driver.py
@@ -0,0 +1,44 @@
+# Copyright (C) 2020 Arm Limited or its affiliates. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the License); you may
+# not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an AS IS BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# Description:
+# Unit tests for compiler driver
+from ethosu.vela.compiler_driver import next_sram_factor
+
+
+def test_next_sram_factor():
+ lower = 0.7
+ assert (1.0, False) == next_sram_factor([])
+ assert (None, False) == next_sram_factor([True])
+ assert (lower, True) == next_sram_factor([False])
+ assert ((1 + lower) / 2, True) == next_sram_factor([False, True])
+ assert (lower / 2, True) == next_sram_factor([False, False])
+ # Tests next_sram_factor for a range of simulated allocator efficiencies
+ for i in range(20):
+ allocator_factor = i / 20.0 # The simulated allocator efficiency
+ alloc_results = []
+ bisected_factor = 0 # The end result of the bisect search
+ while True:
+ factor, dry_test = next_sram_factor(alloc_results)
+ if factor is None:
+ break
+ alloc_result = factor < allocator_factor
+ if alloc_result and not dry_test:
+ bisected_factor = factor
+ alloc_results.append(alloc_result)
+ assert len(alloc_results) < 100
+ assert bisected_factor <= allocator_factor
+ assert abs(bisected_factor - allocator_factor) < 0.02
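
A note on the 0.02 tolerance: next_sram_factor performs two fixed probes (factors 1.0 and 0.7)
followed by at most six dry bisection steps before the final non-dry repeat, so the search
interval shrinks to at most 0.7 / 2^6 ≈ 0.011 (or 0.3 / 2^6 ≈ 0.005 when the 0.7 probe
succeeds), comfortably inside the asserted tolerance.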