about summary refs log tree commit diff
path: root/ethosu
diff options
context:
space:
mode:
authorPatrik Gustavsson <patrik.gustavsson@arm.com>2020-06-29 17:36:55 +0200
committertim.hall <tim.hall@arm.com>2020-07-07 15:23:20 +0000
commit3ab9452881a15d88710f4b5d7c14ba5069e74948 (patch)
tree5f942504d52094f728d974c5bb66ae67aa83eedb /ethosu
parente843d3311b8945baa32654af0dccb229b6861438 (diff)
downloadethos-u-vela-3ab9452881a15d88710f4b5d7c14ba5069e74948.tar.gz
MLBEDSW-2551 Add support for more mem-cfgs
Added support for one more memory configuration.

Change-Id: Iac19992386e3e9b80bd519acb1b0a399c47d736f
Signed-off-by: Patrik Gustavsson <patrik.gustavsson@arm.com>
Diffstat (limited to 'ethosu')
-rw-r--r--ethosu/vela/architecture_features.py8
-rw-r--r--ethosu/vela/compiler_driver.py9
-rw-r--r--ethosu/vela/npu_serialisation.py34
-rw-r--r--ethosu/vela/scheduler.py3
-rw-r--r--ethosu/vela/tflite_writer.py15
5 files changed, 52 insertions, 17 deletions
diff --git a/ethosu/vela/architecture_features.py b/ethosu/vela/architecture_features.py
index 1dce435e..6460c527 100644
--- a/ethosu/vela/architecture_features.py
+++ b/ethosu/vela/architecture_features.py
@@ -274,8 +274,8 @@ Note the difference between ArchitectureFeatures and CompilerOptions
self.cycles_weight = 40
self.max_sram_used_weight = 1000
- if self.is_yoda_system:
- self.max_sram_used_weight = 1000
+ if self.is_yoda_system and (self.fast_storage_mem_area != self.feature_map_storage_mem_area):
+ self.max_sram_used_weight = 0
# Shared Buffer Block allocations
self.shram_bank_size = 1024 # bytes
@@ -587,10 +587,6 @@ Note the difference between ArchitectureFeatures and CompilerOptions
self.fast_storage_mem_area = MemArea[self.__sys_config("fast_storage_mem_area", "Sram")]
self.feature_map_storage_mem_area = MemArea[self.__sys_config("feature_map_storage_mem_area", "Sram")]
- if self.fast_storage_mem_area != self.feature_map_storage_mem_area:
- raise Exception(
- "Invalid memory configuration fast_storage_mem_area must be same as feature_map_storage_mem_area"
- )
self.permanent_storage_mem_area = MemArea[self.__sys_config("permanent_storage_mem_area", "OffChipFlash")]
if is_yoda_system:
if self.permanent_storage_mem_area is not MemArea.Dram:
diff --git a/ethosu/vela/compiler_driver.py b/ethosu/vela/compiler_driver.py
index e495f1ce..b5a6c42d 100644
--- a/ethosu/vela/compiler_driver.py
+++ b/ethosu/vela/compiler_driver.py
@@ -126,6 +126,7 @@ def compiler_driver(nng, arch, options, scheduler_options):
# Placeholders for scratch and flash tensors that are common for all Npu subgraphs
scratch_tens = None
+ scratch_fast_tens = None
flash_tens = None
# Calculate live ranges for all constant Npu tensors, in permanent storage
@@ -199,12 +200,16 @@ def compiler_driver(nng, arch, options, scheduler_options):
register_command_stream_generator.generate_register_command_stream(
nng, sg, arch, options.verbose_register_command_stream
)
- scratch_tens, flash_tens = npu_serialisation.serialise_npu_subgraph_into_tensors(
- nng, sg, arch, scratch_tens, flash_tens
+ scratch_tens, scratch_fast_tens, flash_tens = npu_serialisation.serialise_npu_subgraph_into_tensors(
+ nng, sg, arch, scratch_tens, scratch_fast_tens, flash_tens
)
npu_serialisation.rewrite_npu_call_ops(nng, root_sg, arch)
+ if root_sg is not None and (arch.feature_map_storage_mem_area != arch.fast_storage_mem_area):
+ if root_sg.memory_used_per_type.get(MemType.Scratch_fast, 0) > arch.sram_size:
+ print("Warning: Sram limit has been exceeded, by the scratch fast tensor")
+
# Allocate all Cpu constant tensors, this is done last because the Npu-ops
# have to be serialized into flash and scratch tensors first
tensor_allocation.allocate_tensors(
diff --git a/ethosu/vela/npu_serialisation.py b/ethosu/vela/npu_serialisation.py
index bd13a3ec..2d1c6b10 100644
--- a/ethosu/vela/npu_serialisation.py
+++ b/ethosu/vela/npu_serialisation.py
@@ -55,12 +55,13 @@ def copy_ifm_values_to_memory_tensor(memory_tensor, src_tensor):
memory_tensor.values[start_addr:end_addr] = src_tensor.quant_values
-def serialise_npu_subgraph_into_tensors(nng, sg, arch, scratch_tens, flash_tens):
+def serialise_npu_subgraph_into_tensors(nng, sg, arch, scratch_tens, scratch_fast_tens, flash_tens):
if sg.placement != PassPlacement.Npu:
- return scratch_tens, flash_tens
+ return scratch_tens, scratch_fast_tens, flash_tens
flash_area = arch.permanent_storage_mem_area
scratch_area = arch.feature_map_storage_mem_area
+ scratch_fast_area = arch.fast_storage_mem_area
flash_size = sg.memory_used.get(flash_area, 0)
scratch_size = sg.memory_used.get(scratch_area, 0)
@@ -85,6 +86,10 @@ def serialise_npu_subgraph_into_tensors(nng, sg, arch, scratch_tens, flash_tens)
nng.total_size[scratch_area] = nng.total_size.get(scratch_area, 0) - scratch_size
nng.total_elements[scratch_area] = nng.total_elements.get(scratch_area, 0) - scratch_size
+ if scratch_area != scratch_fast_area:
+ nng.total_size[scratch_fast_area] = nng.total_size.get(scratch_fast_area, 0)
+ nng.total_elements[scratch_fast_area] = nng.total_elements.get(scratch_fast_area, 0)
+
if flash_tens == scratch_tens is None:
# First Npu subgraph, create scratch and flash tensors
sg.scratch_tensor = make_memory_tensor(
@@ -94,12 +99,22 @@ def serialise_npu_subgraph_into_tensors(nng, sg, arch, scratch_tens, flash_tens)
sg.flash_tensor = make_memory_tensor(
sg.name + "_flash", flash_area, MemType.Permanent_CPU, flash_size, True, arch
)
+ # Scratch fast tensor size set to 0. This forces a minimal allocation in the tensor arena
+ # which causes a slot in the basep registers to be reserved, so that the scratch fast tensor
+ # address can be overridden.
+ sg.scratch_fast_tensor = make_memory_tensor(
+ sg.name + "_scratch_fast", scratch_fast_area, MemType.Scratch, 0, False, arch
+ )
+ sg.scratch_fast_tensor.purpose = TensorPurpose.Scratch
else:
sg.scratch_tensor = scratch_tens
sg.scratch_tensor.shape[0] += scratch_size
sg.flash_tensor = flash_tens
sg.flash_tensor.shape[0] += flash_size
+ sg.scratch_fast_tensor = scratch_fast_tens
+ sg.scratch_fast_tensor.shape[0] = 0
+
for cps in sg.cascaded_passes:
for ps in cps.passes:
if ps.placement == PassPlacement.Npu:
@@ -126,7 +141,7 @@ def serialise_npu_subgraph_into_tensors(nng, sg, arch, scratch_tens, flash_tens)
)
sg.command_stream_tensor.values = np.frombuffer(payload_bytes, dtype=np.uint8)
- return sg.scratch_tensor, sg.flash_tensor
+ return sg.scratch_tensor, sg.scratch_fast_tensor, sg.flash_tensor
def add_const_tens_to_startup_cascaded_pass(startup_cps, tens):
@@ -152,11 +167,16 @@ def rewrite_npu_call_ops(nng, sg, arch):
op.attrs["custom_type"] = op.type
sz = 0
- for tens in [callee.scratch_tensor, callee.flash_tensor, callee.command_stream_tensor]:
+ for tens in [
+ callee.scratch_fast_tensor,
+ callee.scratch_tensor,
+ callee.flash_tensor,
+ callee.command_stream_tensor,
+ ]:
op.inputs.insert(0, tens)
ps.inputs.insert(0, tens)
cps.inputs.insert(0, tens)
- if tens != callee.scratch_tensor:
+ if tens != callee.scratch_tensor and tens != callee.scratch_fast_tensor:
add_const_tens_to_startup_cascaded_pass(startup_cps, tens)
sz += tens.storage_size()
@@ -166,3 +186,7 @@ def rewrite_npu_call_ops(nng, sg, arch):
if callee.scratch_tensor is not None:
if callee.scratch_tensor.mem_area == MemArea.Sram:
cps.sram_used += callee.scratch_tensor.storage_size()
+
+ if callee.scratch_fast_tensor is not None:
+ if callee.scratch_fast_tensor.mem_area == MemArea.Sram:
+ cps.sram_used += callee.scratch_fast_tensor.storage_size()
diff --git a/ethosu/vela/scheduler.py b/ethosu/vela/scheduler.py
index be104b88..36bb3c27 100644
--- a/ethosu/vela/scheduler.py
+++ b/ethosu/vela/scheduler.py
@@ -232,7 +232,8 @@ class DynamicProgrammingScheduler:
if self.arch.feature_map_storage_mem_area != MemArea.Sram:
self.use_ifm_ofm_overlap = False # force off IFM/OFM overlap if IFMs and OFMs are not in the SRAM
- self.use_ifm_ofm_overlap = options.use_ifm_ofm_overlap
+ else:
+ self.use_ifm_ofm_overlap = options.use_ifm_ofm_overlap
self.verbose_schedule = options.verbose_schedule
self.verbose_pareto_frontier_schedules = options.verbose_pareto_frontier_schedules
diff --git a/ethosu/vela/tflite_writer.py b/ethosu/vela/tflite_writer.py
index 4aa23b5f..cf40b5b5 100644
--- a/ethosu/vela/tflite_writer.py
+++ b/ethosu/vela/tflite_writer.py
@@ -142,7 +142,7 @@ class TFLiteSerialiser:
buffer_map = {}
- buf_idx = 1
+ buf_idx = 2
for tens in tensors:
# Set buffer ids depending on allocation
@@ -314,7 +314,11 @@ class TFLiteSerialiser:
all_tensors = [tens for nm, idx, tens in sorted((tens.name, idx, tens) for idx, tens in enumerate(tensor_set))]
- scratch_tensors = [tens for tens in all_tensors if tens.purpose == TensorPurpose.Scratch]
+ scratch_tensors = [tens for tens in all_tensors if tens.name.endswith("scratch")]
+
+ for tens in all_tensors:
+ if tens.name.endswith("scratch_fast"):
+ scratch_fast_tensor = tens
if len(scratch_tensors) == 0:
scratch_tensor = None
@@ -331,11 +335,16 @@ class TFLiteSerialiser:
assert all(inp in sg.original_inputs for inp in sg.input_tensors)
inputs = [self.tensor_map[tens] for tens in sg.original_inputs]
- # Add the Scratch Tensor as input to the NPU subgraph to get it allocated by TensorFlow Lite Micro
+ # Add the Scratch Tensors as input to the NPU subgraph to get them allocated by TensorFlow Lite Micro
scratch_tensor_idx = self.tensor_map.get(scratch_tensor, None)
+ scratch_fast_tensor_idx = self.tensor_map.get(scratch_fast_tensor, None)
+
if scratch_tensor_idx is not None and scratch_tensor_idx not in inputs:
inputs.append(scratch_tensor_idx)
+ if scratch_fast_tensor_idx is not None and scratch_fast_tensor_idx not in inputs:
+ inputs.append(scratch_fast_tensor_idx)
+
inputs_offset = self.write_int_vector(inputs)
outputs_offset = self.write_int_vector([self.tensor_map[tens] for tens in sg.output_tensors])