diff options
author | Patrik Gustavsson <patrik.gustavsson@arm.com> | 2020-06-29 17:36:55 +0200 |
---|---|---|
committer | tim.hall <tim.hall@arm.com> | 2020-07-07 15:23:20 +0000 |
commit | 3ab9452881a15d88710f4b5d7c14ba5069e74948 (patch) | |
tree | 5f942504d52094f728d974c5bb66ae67aa83eedb | |
parent | e843d3311b8945baa32654af0dccb229b6861438 (diff) | |
download | ethos-u-vela-3ab9452881a15d88710f4b5d7c14ba5069e74948.tar.gz |
MLBEDSW-2551 Add support for more mem-cfgs
Added support for one more memory configuration.
Change-Id: Iac19992386e3e9b80bd519acb1b0a399c47d736f
Signed-off-by: Patrik Gustavsson <patrik.gustavsson@arm.com>
-rw-r--r-- | ethosu/vela/architecture_features.py | 8 | ||||
-rw-r--r-- | ethosu/vela/compiler_driver.py | 9 | ||||
-rw-r--r-- | ethosu/vela/npu_serialisation.py | 34 | ||||
-rw-r--r-- | ethosu/vela/scheduler.py | 3 | ||||
-rw-r--r-- | ethosu/vela/tflite_writer.py | 15 |
5 files changed, 52 insertions, 17 deletions
diff --git a/ethosu/vela/architecture_features.py b/ethosu/vela/architecture_features.py index 1dce435e..6460c527 100644 --- a/ethosu/vela/architecture_features.py +++ b/ethosu/vela/architecture_features.py @@ -274,8 +274,8 @@ Note the difference between ArchitectureFeatures and CompilerOptions self.cycles_weight = 40 self.max_sram_used_weight = 1000 - if self.is_yoda_system: - self.max_sram_used_weight = 1000 + if self.is_yoda_system and (self.fast_storage_mem_area != self.feature_map_storage_mem_area): + self.max_sram_used_weight = 0 # Shared Buffer Block allocations self.shram_bank_size = 1024 # bytes @@ -587,10 +587,6 @@ Note the difference between ArchitectureFeatures and CompilerOptions self.fast_storage_mem_area = MemArea[self.__sys_config("fast_storage_mem_area", "Sram")] self.feature_map_storage_mem_area = MemArea[self.__sys_config("feature_map_storage_mem_area", "Sram")] - if self.fast_storage_mem_area != self.feature_map_storage_mem_area: - raise Exception( - "Invalid memory configuration fast_storage_mem_area must be same as feature_map_storage_mem_area" - ) self.permanent_storage_mem_area = MemArea[self.__sys_config("permanent_storage_mem_area", "OffChipFlash")] if is_yoda_system: if self.permanent_storage_mem_area is not MemArea.Dram: diff --git a/ethosu/vela/compiler_driver.py b/ethosu/vela/compiler_driver.py index e495f1ce..b5a6c42d 100644 --- a/ethosu/vela/compiler_driver.py +++ b/ethosu/vela/compiler_driver.py @@ -126,6 +126,7 @@ def compiler_driver(nng, arch, options, scheduler_options): # Placeholders for scratch and flash tensors that are common for all Npu subgraphs scratch_tens = None + scratch_fast_tens = None flash_tens = None # Calculate live ranges for all constant Npu tensors, in permanent storage @@ -199,12 +200,16 @@ def compiler_driver(nng, arch, options, scheduler_options): register_command_stream_generator.generate_register_command_stream( nng, sg, arch, options.verbose_register_command_stream ) - scratch_tens, flash_tens = 
npu_serialisation.serialise_npu_subgraph_into_tensors( - nng, sg, arch, scratch_tens, flash_tens + scratch_tens, scratch_fast_tens, flash_tens = npu_serialisation.serialise_npu_subgraph_into_tensors( + nng, sg, arch, scratch_tens, scratch_fast_tens, flash_tens ) npu_serialisation.rewrite_npu_call_ops(nng, root_sg, arch) + if root_sg is not None and (arch.feature_map_storage_mem_area != arch.fast_storage_mem_area): + if root_sg.memory_used_per_type.get(MemType.Scratch_fast, 0) > arch.sram_size: + print("Warning: Sram limit has been exceeded, by the scratch fast tensor") + # Allocate all Cpu constant tensors, this is done last because the Npu-ops # have to be serialized into flash and scratch tensors first tensor_allocation.allocate_tensors( diff --git a/ethosu/vela/npu_serialisation.py b/ethosu/vela/npu_serialisation.py index bd13a3ec..2d1c6b10 100644 --- a/ethosu/vela/npu_serialisation.py +++ b/ethosu/vela/npu_serialisation.py @@ -55,12 +55,13 @@ def copy_ifm_values_to_memory_tensor(memory_tensor, src_tensor): memory_tensor.values[start_addr:end_addr] = src_tensor.quant_values -def serialise_npu_subgraph_into_tensors(nng, sg, arch, scratch_tens, flash_tens): +def serialise_npu_subgraph_into_tensors(nng, sg, arch, scratch_tens, scratch_fast_tens, flash_tens): if sg.placement != PassPlacement.Npu: - return scratch_tens, flash_tens + return scratch_tens, scratch_fast_tens, flash_tens flash_area = arch.permanent_storage_mem_area scratch_area = arch.feature_map_storage_mem_area + scratch_fast_area = arch.fast_storage_mem_area flash_size = sg.memory_used.get(flash_area, 0) scratch_size = sg.memory_used.get(scratch_area, 0) @@ -85,6 +86,10 @@ def serialise_npu_subgraph_into_tensors(nng, sg, arch, scratch_tens, flash_tens) nng.total_size[scratch_area] = nng.total_size.get(scratch_area, 0) - scratch_size nng.total_elements[scratch_area] = nng.total_elements.get(scratch_area, 0) - scratch_size + if scratch_area != scratch_fast_area: + nng.total_size[scratch_fast_area] = 
nng.total_size.get(scratch_fast_area, 0) + nng.total_elements[scratch_fast_area] = nng.total_elements.get(scratch_fast_area, 0) + if flash_tens == scratch_tens is None: # First Npu subgraph, create scratch and flash tensors sg.scratch_tensor = make_memory_tensor( @@ -94,12 +99,22 @@ def serialise_npu_subgraph_into_tensors(nng, sg, arch, scratch_tens, flash_tens) sg.flash_tensor = make_memory_tensor( sg.name + "_flash", flash_area, MemType.Permanent_CPU, flash_size, True, arch ) + # Scratch fast tensor size set to 0. This forces a minimal allocation in the tensor arena + # which causes a slot in the basep registers to be reserved, so that the scratch fast tensor + # address can be overridden. + sg.scratch_fast_tensor = make_memory_tensor( + sg.name + "_scratch_fast", scratch_fast_area, MemType.Scratch, 0, False, arch + ) + sg.scratch_fast_tensor.purpose = TensorPurpose.Scratch else: sg.scratch_tensor = scratch_tens sg.scratch_tensor.shape[0] += scratch_size sg.flash_tensor = flash_tens sg.flash_tensor.shape[0] += flash_size + sg.scratch_fast_tensor = scratch_fast_tens + sg.scratch_fast_tensor.shape[0] = 0 + for cps in sg.cascaded_passes: for ps in cps.passes: if ps.placement == PassPlacement.Npu: @@ -126,7 +141,7 @@ def serialise_npu_subgraph_into_tensors(nng, sg, arch, scratch_tens, flash_tens) ) sg.command_stream_tensor.values = np.frombuffer(payload_bytes, dtype=np.uint8) - return sg.scratch_tensor, sg.flash_tensor + return sg.scratch_tensor, sg.scratch_fast_tensor, sg.flash_tensor def add_const_tens_to_startup_cascaded_pass(startup_cps, tens): @@ -152,11 +167,16 @@ def rewrite_npu_call_ops(nng, sg, arch): op.attrs["custom_type"] = op.type sz = 0 - for tens in [callee.scratch_tensor, callee.flash_tensor, callee.command_stream_tensor]: + for tens in [ + callee.scratch_fast_tensor, + callee.scratch_tensor, + callee.flash_tensor, + callee.command_stream_tensor, + ]: op.inputs.insert(0, tens) ps.inputs.insert(0, tens) cps.inputs.insert(0, tens) - if tens != 
callee.scratch_tensor: + if tens != callee.scratch_tensor and tens != callee.scratch_fast_tensor: add_const_tens_to_startup_cascaded_pass(startup_cps, tens) sz += tens.storage_size() @@ -166,3 +186,7 @@ def rewrite_npu_call_ops(nng, sg, arch): if callee.scratch_tensor is not None: if callee.scratch_tensor.mem_area == MemArea.Sram: cps.sram_used += callee.scratch_tensor.storage_size() + + if callee.scratch_fast_tensor is not None: + if callee.scratch_fast_tensor.mem_area == MemArea.Sram: + cps.sram_used += callee.scratch_fast_tensor.storage_size() diff --git a/ethosu/vela/scheduler.py b/ethosu/vela/scheduler.py index be104b88..36bb3c27 100644 --- a/ethosu/vela/scheduler.py +++ b/ethosu/vela/scheduler.py @@ -232,7 +232,8 @@ class DynamicProgrammingScheduler: if self.arch.feature_map_storage_mem_area != MemArea.Sram: self.use_ifm_ofm_overlap = False # force off IFM/OFM overlap if IFMs and OFMs are not in the SRAM - self.use_ifm_ofm_overlap = options.use_ifm_ofm_overlap + else: + self.use_ifm_ofm_overlap = options.use_ifm_ofm_overlap self.verbose_schedule = options.verbose_schedule self.verbose_pareto_frontier_schedules = options.verbose_pareto_frontier_schedules diff --git a/ethosu/vela/tflite_writer.py b/ethosu/vela/tflite_writer.py index 4aa23b5f..cf40b5b5 100644 --- a/ethosu/vela/tflite_writer.py +++ b/ethosu/vela/tflite_writer.py @@ -142,7 +142,7 @@ class TFLiteSerialiser: buffer_map = {} - buf_idx = 1 + buf_idx = 2 for tens in tensors: # Set buffer ids depending on allocation @@ -314,7 +314,11 @@ class TFLiteSerialiser: all_tensors = [tens for nm, idx, tens in sorted((tens.name, idx, tens) for idx, tens in enumerate(tensor_set))] - scratch_tensors = [tens for tens in all_tensors if tens.purpose == TensorPurpose.Scratch] + scratch_tensors = [tens for tens in all_tensors if tens.name.endswith("scratch")] + + for tens in all_tensors: + if tens.name.endswith("scratch_fast"): + scratch_fast_tensor = tens if len(scratch_tensors) == 0: scratch_tensor = None @@ -331,11 
+335,16 @@ class TFLiteSerialiser: assert all(inp in sg.original_inputs for inp in sg.input_tensors) inputs = [self.tensor_map[tens] for tens in sg.original_inputs] - # Add the Scratch Tensor as input to the NPU subgraph to get it allocated by TensorFlow Lite Micro + # Add the Scratch Tensors as input to the NPU subgraph to get them allocated by TensorFlow Lite Micro scratch_tensor_idx = self.tensor_map.get(scratch_tensor, None) + scratch_fast_tensor_idx = self.tensor_map.get(scratch_fast_tensor, None) + if scratch_tensor_idx is not None and scratch_tensor_idx not in inputs: inputs.append(scratch_tensor_idx) + if scratch_fast_tensor_idx is not None and scratch_fast_tensor_idx not in inputs: + inputs.append(scratch_fast_tensor_idx) + inputs_offset = self.write_int_vector(inputs) outputs_offset = self.write_int_vector([self.tensor_map[tens] for tens in sg.output_tensors]) |