-rw-r--r--   ethosu/vela/compiler_driver.py             | 139
-rw-r--r--   ethosu/vela/nn_graph.py                    |   2
-rw-r--r--   ethosu/vela/scheduler.py                   | 172
-rw-r--r--   ethosu/vela/tensor.py                      |   2
-rw-r--r--   ethosu/vela/tensor_allocation.py           |  10
-rw-r--r--   ethosu/vela/test/test_compiler_driver.py   |  44
6 files changed, 290 insertions(+), 79 deletions(-)
diff --git a/ethosu/vela/compiler_driver.py b/ethosu/vela/compiler_driver.py
index 92fe5840..6c1142d1 100644
--- a/ethosu/vela/compiler_driver.py
+++ b/ethosu/vela/compiler_driver.py
@@ -88,6 +88,45 @@ Note the difference between ArchitectureFeatures and CompilerOptions
     __repr__ = __str__
 
 
+def next_sram_factor(alloc_results):
+    # Bisects to find the max SRAM usage that successfully can be fitted with the tensor allocator.
+    # Returns tuple (factor, dry_test), with factor is None (stop) or 0 <= factor <= 1 (next SRAM factor to try),
+    # dry_test is True while still bisecting.
+    upper = 1.0
+    lower = 0.7
+    MAX_ITERATIONS = 8
+    if len(alloc_results) == 0:
+        # First iteration, try max SRAM, keep the result if it succeeds
+        return (upper, False)
+    elif len(alloc_results) == 1:
+        if alloc_results[0]:
+            # The allocator succeeded at first try; stop
+            return (None, False)
+        else:
+            # Start bisecting, try lowerbound SRAM
+            return (lower, True)
+    elif len(alloc_results) > MAX_ITERATIONS:
+        # Stop
+        return (None, False)
+    if not alloc_results[1]:
+        # Allocation at lower failed; search interval 0 - lower
+        upper = lower
+        lower = 0
+    best = lower
+    for success in alloc_results[2:]:
+        middle = (lower + upper) / 2
+        if success:
+            best = max(best, middle)
+            lower = middle
+        else:
+            upper = middle
+    if len(alloc_results) == MAX_ITERATIONS:
+        # Done bisecting; repeat the best match, but not as dry test
+        return (best, False)
+    # Next try; run only as dry test
+    return ((lower + upper) / 2, True)
+
+
 def compiler_driver(nng, arch, options, scheduler_options):
     assert verify_graph_health(nng)
     nng = graph_optimiser.optimise_graph_a(nng, arch, options.verbose_graph)
@@ -156,11 +195,11 @@ def compiler_driver(nng, arch, options, scheduler_options):
         arch,
         permanent_storage,
         set((MemType.Permanent_NPU,)),
-        scheduler_options.use_ifm_ofm_overlap,
-        TensorAllocator.LinearAlloc,
-        options.verbose_allocation,
-        options.show_minimum_possible_allocation,
-        lr_graph_flash,
+        use_ifm_ofm_overlap=scheduler_options.use_ifm_ofm_overlap,
+        tensor_allocator=TensorAllocator.LinearAlloc,
+        verbose_allocation=options.verbose_allocation,
+        show_minimum_possible_allocation=options.show_minimum_possible_allocation,
+        lr_graph=lr_graph_flash,
     )
 
     # Allocate all non-constant tensors to the root, i.e. Cpu, subgraph. This step
@@ -175,28 +214,68 @@ def compiler_driver(nng, arch, options, scheduler_options):
     root_sg = nng.get_root_subgraph()
 
     alloc_list = []
-    if arch.feature_map_storage_mem_area == arch.fast_storage_mem_area:
+    feature_maps_in_fast_storage = arch.feature_map_storage_mem_area == arch.fast_storage_mem_area
+    if feature_maps_in_fast_storage:
         mem_alloc_scratch = (arch.feature_map_storage_mem_area, set((MemType.Scratch, MemType.Scratch_fast)))
         alloc_list.append(mem_alloc_scratch)
     else:
-        mem_alloc_scratch = (arch.feature_map_storage_mem_area, set((MemType.Scratch,)))
         mem_alloc_scratch_fast = (arch.fast_storage_mem_area, set((MemType.Scratch_fast,)))
-        alloc_list.append(mem_alloc_scratch)
+        mem_alloc_scratch = (arch.feature_map_storage_mem_area, set((MemType.Scratch,)))
+        # Order is important
         alloc_list.append(mem_alloc_scratch_fast)
+        alloc_list.append(mem_alloc_scratch)
 
-    for alloc in alloc_list:
-        tensor_allocation.allocate_tensors(
-            nng,
-            root_sg,
-            arch,
-            alloc[0],
-            alloc[1],
-            scheduler_options.use_ifm_ofm_overlap,
-            options.tensor_allocator,
-            options.verbose_allocation,
-            options.show_minimum_possible_allocation,
-            allocation_alignment=options.allocation_alignment,
-        )
+    for mem_area, mem_type_set in alloc_list:
+        if feature_maps_in_fast_storage or mem_area != arch.fast_storage_mem_area:
+            tensor_allocation.allocate_tensors(
+                nng,
+                root_sg,
+                arch,
+                mem_area,
+                mem_type_set,
+                use_ifm_ofm_overlap=scheduler_options.use_ifm_ofm_overlap,
+                tensor_allocator=options.tensor_allocator,
+                verbose_allocation=options.verbose_allocation,
+                show_minimum_possible_allocation=options.show_minimum_possible_allocation,
+                allocation_alignment=options.allocation_alignment,
+            )
+        else:
+            # For the case where scratch_fast != scratch: attempt to place feature maps used between
+            # cascaded passes in fast storage. Bisection is used to find the max possible usage of SRAM.
+            alloc_results = []
+            while True:
+                assert len(alloc_results) < 10, "Infinite allocator loop"
+                sram_factor, dry_test = next_sram_factor(alloc_results)
+                if sram_factor is None:
+                    break
+                # Try to move as many feature maps as possible to SRAM before allocating
+                sram_limit = sram_factor * arch.sram_size
+                for sg in nng.subgraphs:
+                    scheduler.use_fast_storage_for_feature_maps(sg, sram_limit, arch)
+                alloc_success = tensor_allocation.allocate_tensors(
+                    nng,
+                    root_sg,
+                    arch,
+                    mem_area,
+                    mem_type_set,
+                    max_size=arch.sram_size,
+                    dry_test=dry_test,
+                    use_ifm_ofm_overlap=scheduler_options.use_ifm_ofm_overlap,
+                    tensor_allocator=options.tensor_allocator,
+                    verbose_allocation=options.verbose_allocation,
+                    show_minimum_possible_allocation=options.show_minimum_possible_allocation,
+                    allocation_alignment=options.allocation_alignment,
+                )
+                if dry_test or not alloc_success:
+                    for sg in nng.subgraphs:
+                        scheduler.undo_use_fast_storage(sg, arch)
+                alloc_results.append(alloc_success)
+            if not alloc_results[-1]:
+                raise VelaError(
+                    "Sram limit {} bytes, has been exceeded by the scratch fast tensor. "
+                    "Increasing the value of --weight-estimation-scaling may help to resolve the issue. "
+                    "See OPTIONS.md for more information.".format(arch.sram_size)
+                )
 
     # Generate command streams and serialise Npu-ops into tensors
     for sg in nng.subgraphs:
@@ -213,16 +292,6 @@ def compiler_driver(nng, arch, options, scheduler_options):
 
     npu_serialisation.rewrite_npu_call_ops(nng, root_sg, arch)
 
-    if root_sg is not None and (arch.feature_map_storage_mem_area != arch.fast_storage_mem_area):
-        if root_sg.memory_used_per_type.get(MemType.Scratch_fast, 0) > arch.sram_size:
-            raise VelaError(
-                "Sram limit {} bytes, has been exceeded by the scratch fast tensor {} bytes. "
-                "Increasing the value of --weight-estimation-scaling may help to resolve the issue. "
-                "See OPTIONS.md for more information.".format(
-                    arch.sram_size, root_sg.memory_used_per_type.get(MemType.Scratch_fast, 0)
-                )
-            )
-
     # Allocate all Cpu constant tensors, this is done last because the Npu-ops
     # have to be serialized into flash and scratch tensors first
     tensor_allocation.allocate_tensors(
@@ -231,10 +300,10 @@ def compiler_driver(nng, arch, options, scheduler_options):
         arch,
         permanent_storage,
         set((MemType.Permanent_CPU,)),
-        scheduler_options.use_ifm_ofm_overlap,
-        TensorAllocator.LinearAlloc,
-        options.verbose_allocation,
-        options.show_minimum_possible_allocation,
+        use_ifm_ofm_overlap=scheduler_options.use_ifm_ofm_overlap,
+        tensor_allocator=TensorAllocator.LinearAlloc,
+        verbose_allocation=options.verbose_allocation,
+        show_minimum_possible_allocation=options.show_minimum_possible_allocation,
         allocation_alignment=options.allocation_alignment,
     )
 
diff --git a/ethosu/vela/nn_graph.py b/ethosu/vela/nn_graph.py
index 21cd80b9..58aab611 100644
--- a/ethosu/vela/nn_graph.py
+++ b/ethosu/vela/nn_graph.py
@@ -141,6 +141,8 @@ class Subgraph:
         self.placement = placement
         self.command_stream_tensor = None
         self.flash_tensor = None
+        # Scratch information locally used in the scheduler
+        self.scheduling_info = {}
 
         self.memory_used = {}
         self.memory_used_per_type = {}
diff --git a/ethosu/vela/scheduler.py b/ethosu/vela/scheduler.py
index 24453d8c..5c2ddabb 100644
--- a/ethosu/vela/scheduler.py
+++ b/ethosu/vela/scheduler.py
@@ -959,52 +959,66 @@ class DynamicProgrammingScheduler:
 
         self.sg.cascaded_passes = cascaded_passes
         self.sg.build_cascaded_pass_links()
 
-        if self.options.use_nhcwb16_between_cascaded_passes:
-            # Check if NHCWB16 can be used in between cascaded passes
-            # (NHCWB16 within cascaded passes has been handled earlier in this function)
-            if self.sg.placement == PassPlacement.Npu:
-                last_op_in_subgraph = self.sg.cascaded_passes[-1].passes[-1].primary_op
-                for ps in self.sg.cascaded_passes:
-                    if ps.placement != PassPlacement.Npu:
+        # Check if NHCWB16 and/or fast storage can be used in between cascaded passes
+        # (NHCWB16 within cascaded passes has been handled earlier in this function)
+        if self.sg.placement == PassPlacement.Npu:
+            # Dictionary tensor -> list of ops, containing feature maps that can be attempted
+            # to be moved to fast storage
+            fast_storage_tensor_rewrites = {}
+            last_op_in_subgraph = self.sg.cascaded_passes[-1].passes[-1].primary_op
+            for ps in self.sg.cascaded_passes:
+                if ps.placement != PassPlacement.Npu:
+                    continue
+                for output in ps.outputs:
+                    if output.purpose != TensorPurpose.FeatureMap or output.avoid_NHCWB16:
                         continue
-                    for output in ps.outputs:
-                        if output.purpose != TensorPurpose.FeatureMap or output.avoid_NHCWB16:
-                            continue
-                        use_NHCWB16 = True
-                        rewrites = []
-                        for op in output.consumer_list:
-                            if op is None or (op.type == "ReduceSum" and output.dtype == DataType.int32):
-                                use_NHCWB16 = False
-                            elif op.type == "Reshape":
-                                # Detect no-op reshapes by comparing their full input and output tensor shapes.
-                                inshape = full_shape(4, op.inputs[0].shape, 1)
-                                outshape = full_shape(4, op.outputs[0].shape, 1)
-                                # Using NHCWB16 format for a no-op reshape is only an option if subsequent
-                                # consumers do not also need to perform a reshape or if the OFM is going to
-                                # be processed by CPU operations. No-op reshape consumers with empty lists
-                                # (those that have no consumers, or null-consumers used as list terminators)
-                                # must use normal NHWC output.
-                                incompatible_consumers = [
-                                    (
-                                        not consumer.run_on_npu
-                                        or consumer.type == "Reshape"
-                                        or (consumer is last_op_in_subgraph)
-                                    )
-                                    for consumer in op.outputs[0].consumer_list
-                                    if consumer is not None
-                                ]
-                                if (outshape == inshape) and incompatible_consumers and not any(incompatible_consumers):
-                                    rewrites.append(op)
-                                else:
-                                    use_NHCWB16 = False
+                    use_NHCWB16 = True
+                    use_fast_storage = True
+                    rewrites = []
+                    for op in output.consumer_list:
+                        if op is None:
+                            use_NHCWB16 = False
+                            use_fast_storage = False
+                            continue
+                        if op.type == "ReduceSum" and output.dtype == DataType.int32:
+                            use_NHCWB16 = False
+                        elif op.type == "Reshape":
+                            # Detect no-op reshapes by comparing their full input and output tensor shapes.
+                            inshape = full_shape(4, op.inputs[0].shape, 1)
+                            outshape = full_shape(4, op.outputs[0].shape, 1)
+                            # Using NHCWB16 format for a no-op reshape is only an option if subsequent
+                            # consumers do not also need to perform a reshape or if the OFM is going to
+                            # be processed by CPU operations. No-op reshape consumers with empty lists
+                            # (those that have no consumers, or null-consumers used as list terminators)
+                            # must use normal NHWC output.
+                            incompatible_consumers = [
+                                (
+                                    not consumer.run_on_npu
+                                    or consumer.type == "Reshape"
+                                    or (consumer is last_op_in_subgraph)
+                                )
+                                for consumer in op.outputs[0].consumer_list
+                                if consumer is not None
+                            ]
+                            if (outshape == inshape) and incompatible_consumers and not any(incompatible_consumers):
+                                rewrites.append(op)
                             else:
-                                use_NHCWB16 &= op.run_on_npu
-
-                        if use_NHCWB16:
-                            output.set_format(TensorFormat.NHCWB16, arch)
-                            for rewrite_op in rewrites:
-                                rewrite_op.outputs[0].set_format(TensorFormat.NHCWB16, arch)
+                                use_NHCWB16 = False
+                                use_fast_storage = False
+                        use_NHCWB16 &= op.run_on_npu
+                        use_fast_storage &= op.run_on_npu
+
+                    if use_fast_storage:
+                        fast_storage_tensor_rewrites[output] = rewrites
+                    if use_NHCWB16 and self.options.use_nhcwb16_between_cascaded_passes:
+                        output.set_format(TensorFormat.NHCWB16, arch)
+                        for rewrite_op in rewrites:
+                            rewrite_op.outputs[0].set_format(TensorFormat.NHCWB16, arch)
+            if self.feature_maps_not_in_fast_storage:
+                # Remember feature maps that can be moved to fast storage for later use
+                # in use_fast_storage_for_feature_maps
+                self.sg.scheduling_info["feature_map_rewrites"] = fast_storage_tensor_rewrites
 
 
 def schedule_passes(nng, arch, options: SchedulerOptions):
@@ -1027,3 +1041,75 @@
 
         if options.verbose_schedule:
             sg.print_cascaded_passes()
+
+
+def _calc_tens_to_cps(sg, tensor_rewrites):
+    # Determines for each tensor the list of affected cascaded passes, in terms of SRAM consumption.
+    # Returns dictionary tensor -> list of cascaded passes
+    # Note: if cascaded passes are A, B, C, D, and a tensor is output
+    # of A and input to D, then it also consumes SRAM in passes B and C.
+    if "tens_to_cps" in sg.scheduling_info:
+        return sg.scheduling_info["tens_to_cps"]
+    # Determine life-time of tensors
+    min_index = {}
+    max_index = {}
+    index = 0
+    cps_list = [cps for cps in sg.cascaded_passes if cps.placement == PassPlacement.Npu]
+    for cps in cps_list:
+        for tens in cps.inputs + cps.outputs:
+            if tens in tensor_rewrites:
+                min_index[tens] = min(index, min_index.get(tens, len(cps_list)))
+                max_index[tens] = index
+        index += 1
+    # Convert to affected cps-es
+    tens_to_cps = {}
+    for tens in min_index:
+        tens_to_cps[tens] = cps_list[min_index[tens] : max_index[tens] + 1]
+    sg.scheduling_info["tens_to_cps"] = tens_to_cps
+    return tens_to_cps
+
+
+def use_fast_storage_for_feature_maps(sg, sram_limit, arch):
+    # Attempts to use as much fast storage as possible for feature maps shared between cascaded passes.
+    tensor_rewrites = sg.scheduling_info.get("feature_map_rewrites", {})
+    tens_to_cps = _calc_tens_to_cps(sg, tensor_rewrites)
+    # Sort tensors first on life-time (smallest first), then on size (biggest first)
+    tens_list = sorted([(len(tens_to_cps[tens]), -tens.storage_size(), tens.name, tens) for tens in tens_to_cps])
+    for _, _, _, tens in tens_list:
+        cps_list = tens_to_cps[tens]
+        if len(cps_list) <= 1:
+            continue
+        sz = tens.storage_size()
+        fits_in_fast_storage = all([cps.sram_used + sz <= sram_limit for cps in cps_list])
+        if fits_in_fast_storage:
+            tens.mem_area = arch.fast_storage_mem_area
+            tens.mem_type = MemType.Scratch_fast
+            tens.set_new_sub_purpose(TensorSubPurpose.Standard, None, None)
+            assert tens in tensor_rewrites
+            # Also rewrite reshapes
+            for rewrite_op in tensor_rewrites[tens]:
+                tens2 = rewrite_op.outputs[0]
+                tens2.mem_area = arch.fast_storage_mem_area
+                tens2.mem_type = MemType.Scratch_fast
+                tens2.set_new_sub_purpose(TensorSubPurpose.Standard, None, None)
+            for cps in cps_list:
+                cps.sram_used += sz
+
+
+def undo_use_fast_storage(sg, arch):
+    # Undoes the effects of a previous call to use_fast_storage_for_feature_maps
+    tensor_rewrites = sg.scheduling_info.get("feature_map_rewrites", {})
+    tens_to_cps = _calc_tens_to_cps(sg, tensor_rewrites)
+    mem_area = arch.tensor_storage_mem_area[TensorPurpose.FeatureMap]
+    for tens, cps_list in tens_to_cps.items():
+        if tens.mem_type == MemType.Scratch_fast:
+            sz = tens.storage_size()
+            tens.mem_area = mem_area
+            tens.mem_type = MemType.Scratch
+            # Also undo reshapes
+            for rewrite_op in tensor_rewrites[tens]:
+                tens2 = rewrite_op.outputs[0]
+                tens2.mem_area = mem_area
+                tens2.mem_type = MemType.Scratch
+            for cps in cps_list:
+                cps.sram_used -= sz
diff --git a/ethosu/vela/tensor.py b/ethosu/vela/tensor.py
index eedbadad..c0786bfc 100644
--- a/ethosu/vela/tensor.py
+++ b/ethosu/vela/tensor.py
@@ -278,7 +278,7 @@ class TensorAddressMap:
     def set_address_for_tens(cls, tens_id, mem_type, address):
         # Check previous address if there is one
        previous_address = cls.address_map[tens_id].get(mem_type)
-        if previous_address is not None:
+        if address is not None and previous_address is not None:
             assert previous_address == address, "Two different addresses cannot be assigned to the same tensor."
 
         # Set tensor's address for memory type
diff --git a/ethosu/vela/tensor_allocation.py b/ethosu/vela/tensor_allocation.py
index d53babc3..1efcd686 100644
--- a/ethosu/vela/tensor_allocation.py
+++ b/ethosu/vela/tensor_allocation.py
@@ -128,7 +128,10 @@ def allocate_tensors(
     show_minimum_possible_allocation=False,
     lr_graph=None,
     allocation_alignment=Tensor.AllocationQuantum,
+    max_size=None,
+    dry_test=False,
 ):
+    # Allocates addresses to tensors, returns False if tensors could not be fit within max_size
     ignore_subgraph_input_output_tensors = False
     lrs = live_range.extract_live_ranges_from_cascaded_passes(
         sg,
@@ -149,6 +152,12 @@ def allocate_tensors(
             total_sz = linear_allocate_live_ranges(lrs, allocation_alignment)
         else:
             assert 0
+        alloc_ok = max_size is None or total_sz <= max_size
+        if dry_test or not alloc_ok:
+            # Dry test or allocation failed; undo allocation
+            for lr in lrs.ranges.values():
+                lr.set_address(None)
+            return alloc_ok
 
         if sg.memory_used.get(mem_area, 0) == 0:
             sg.memory_used[mem_area] = total_sz
@@ -179,3 +188,4 @@ def allocate_tensors(
             nng.bits_per_element[mem_area] = nng.total_size[mem_area] * 8 / nng.total_elements[mem_area]
         except ZeroDivisionError:
             nng.bits_per_element[mem_area] = 0.0
+    return True
diff --git a/ethosu/vela/test/test_compiler_driver.py b/ethosu/vela/test/test_compiler_driver.py
new file mode 100644
index 00000000..56a90c47
--- /dev/null
+++ b/ethosu/vela/test/test_compiler_driver.py
@@ -0,0 +1,44 @@
+# Copyright (C) 2020 Arm Limited or its affiliates. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the License); you may
+# not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an AS IS BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# Description:
+# Unit tests for compiler driver
+from ethosu.vela.compiler_driver import next_sram_factor
+
+
+def test_next_sram_factor():
+    lower = 0.7
+    assert (1.0, False) == next_sram_factor([])
+    assert (None, False) == next_sram_factor([True])
+    assert (lower, True) == next_sram_factor([False])
+    assert ((1 + lower) / 2, True) == next_sram_factor([False, True])
+    assert (lower / 2, True) == next_sram_factor([False, False])
+    # Tests next_sram_factor for a range of simulated allocator efficiencies
+    for i in range(20):
+        allocator_factor = i / 20.0  # The simulated allocator efficiency
+        alloc_results = []
+        bisected_factor = 0  # The end result of the bisect search
+        while True:
+            factor, dry_test = next_sram_factor(alloc_results)
+            if factor is None:
+                break
+            alloc_result = factor < allocator_factor
+            if alloc_result and not dry_test:
+                bisected_factor = factor
+            alloc_results.append(alloc_result)
+        assert len(alloc_results) < 100
+        assert bisected_factor <= allocator_factor
+        assert abs(bisected_factor - allocator_factor) < 0.02
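To make the control flow of the new next_sram_factor easier to follow, here is a small standalone sketch (not part of the commit) that drives the function the same way the allocation loop in compiler_driver does. The succeeds_below threshold is purely illustrative: it stands in for whether the real allocator fits the tensors within sram_factor * arch.sram_size.

# Sketch: drive next_sram_factor() as compiler_driver's allocation loop does,
# against a simulated allocator that only succeeds below a chosen threshold.
from ethosu.vela.compiler_driver import next_sram_factor

def trace_bisection(succeeds_below=0.75):
    alloc_results = []  # one bool per attempted allocation, newest last
    while True:
        factor, dry_test = next_sram_factor(alloc_results)
        if factor is None:
            break  # search finished; the last non-dry attempt is the keeper
        success = factor < succeeds_below  # stand-in for allocate_tensors()
        print(f"factor={factor:.4f} dry_test={dry_test} success={success}")
        alloc_results.append(success)

trace_bisection()
# Attempt 1 tries factor 1.0 and is kept if it succeeds. On failure, the
# search bisects within [0.7, 1.0] (or [0.0, 0.7] if 0.7 also fails) using
# dry-test allocations, then replays the best succeeding factor for real.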
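The comment in _calc_tens_to_cps states the key lifetime rule: a tensor produced by cascaded pass A and consumed by pass D also occupies SRAM during B and C. Below is a toy version of that computation using plain dicts instead of Vela's cascaded-pass and tensor objects; the pass and tensor names are made up, and the tensor_rewrites filter from the real function is omitted.

# Toy illustration of the lifetime rule used by _calc_tens_to_cps
passes = [
    {"name": "A", "inputs": [], "outputs": ["t0"]},
    {"name": "B", "inputs": ["t0"], "outputs": ["t1"]},
    {"name": "C", "inputs": ["t1"], "outputs": ["t2"]},
    {"name": "D", "inputs": ["t0", "t2"], "outputs": ["t3"]},
]

min_index, max_index = {}, {}
for index, cps in enumerate(passes):
    for tens in cps["inputs"] + cps["outputs"]:
        min_index[tens] = min(index, min_index.get(tens, len(passes)))
        max_index[tens] = index

# Convert first/last use into the list of affected passes, as in the commit
tens_to_cps = {t: passes[min_index[t] : max_index[t] + 1] for t in min_index}
print([p["name"] for p in tens_to_cps["t0"]])  # ['A', 'B', 'C', 'D']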
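Finally, use_fast_storage_for_feature_maps admits tensors greedily in the order given by its sort key: shortest lifetime first, then largest size first (hence the negated size), with the tensor name as a deterministic tie-breaker. The same key behaviour in isolation, with invented (lifetime, size, name) triples:

# The (lifetime, -size, name) sort key from use_fast_storage_for_feature_maps
candidates = [
    (3, 1024, "fm1"),
    (2, 512, "fm2"),
    (2, 2048, "fm3"),
]
ordered = sorted((life, -size, name) for life, size, name in candidates)
print([name for _, _, name in ordered])  # ['fm3', 'fm2', 'fm1']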