author    Patrik Gustavsson <patrik.gustavsson@arm.com>  2020-05-27 09:15:11 +0200
committer Patrik Gustavsson <patrik.gustavsson@arm.com>  2020-06-25 11:42:56 +0200
commit    eca2e95e1fea150d8a942f8b5f0a4d9d7aefebc1 (patch)
tree      438b385f1ded3c18c3b84d2204a57c39be6be34a
parent    eec4e50e19cb5522640eae5fd4566917dc2a7b9d (diff)
download  ethos-u-vela-eca2e95e1fea150d8a942f8b5f0a4d9d7aefebc1.tar.gz
MLBEDSW-2306 Added more supported mem-cfgs
Additional supported memory configurations:
 - Permanent_storage = DRAM
 - Tensor arena either in DRAM or SRAM

Signed-off-by: Patrik Gustavsson <patrik.gustavsson@arm.com>
Change-Id: I20beb7151e306bfdba540e7c0b2a7b478b4d94e1
-rw-r--r--  ethosu/vela/architecture_features.py              | 32
-rw-r--r--  ethosu/vela/compiler_driver.py                     | 55
-rw-r--r--  ethosu/vela/insert_dma.py                          | 48
-rw-r--r--  ethosu/vela/live_range.py                          | 40
-rw-r--r--  ethosu/vela/mark_tensors.py                        |  3
-rw-r--r--  ethosu/vela/nn_graph.py                            | 11
-rw-r--r--  ethosu/vela/npu_serialisation.py                   | 25
-rw-r--r--  ethosu/vela/register_command_stream_generator.py   | 51
-rw-r--r--  ethosu/vela/scheduler.py                           |  2
-rw-r--r--  ethosu/vela/stats_writer.py                        |  5
-rw-r--r--  ethosu/vela/tensor.py                              | 30
-rw-r--r--  ethosu/vela/tensor_allocation.py                   | 28
-rw-r--r--  ethosu/vela/tflite_writer.py                       | 17
13 files changed, 234 insertions(+), 113 deletions(-)
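
The new memory configurations are driven by the system-config options read in architecture_features.py below. A minimal sketch of that parsing and validation in plain Python: the option keys and defaults are taken from the diff, while get_option() and the cfg dict are illustrative stand-ins for the private __sys_config() helper and the vela configuration file.

from ethosu.vela.tensor import MemArea

def read_memory_config(get_option):
    # get_option(key, default) stands in for ArchitectureFeatures.__sys_config()
    fast_storage = MemArea[get_option("fast_storage_mem_area", "Sram")]
    feature_map_storage = MemArea[get_option("feature_map_storage_mem_area", "Sram")]
    if fast_storage != feature_map_storage:
        raise Exception(
            "Invalid memory configuration: fast_storage_mem_area must be the same "
            "as feature_map_storage_mem_area"
        )
    permanent_storage = MemArea[get_option("permanent_storage_mem_area", "OffChipFlash")]
    if permanent_storage not in (MemArea.OnChipFlash, MemArea.OffChipFlash, MemArea.Dram):
        raise Exception("Invalid permanent_storage_mem_area = " + str(permanent_storage))
    sram_size = 1024 * int(get_option("sram_size_kb", "204800"))  # new option, given in KiB
    return fast_storage, feature_map_storage, permanent_storage, sram_size

# Example: constants and the tensor arena both placed in DRAM
cfg = {"permanent_storage_mem_area": "Dram", "feature_map_storage_mem_area": "Dram",
       "fast_storage_mem_area": "Dram", "sram_size_kb": "256"}
print(read_memory_config(lambda key, default: cfg.get(key, default)))
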
diff --git a/ethosu/vela/architecture_features.py b/ethosu/vela/architecture_features.py
index fef2c40f..e33c5d55 100644
--- a/ethosu/vela/architecture_features.py
+++ b/ethosu/vela/architecture_features.py
@@ -28,6 +28,7 @@ from .numeric_util import round_up_divide
from .operation import NpuBlockType
from .supported_operators import SupportedOperators
from .tensor import MemArea
+from .tensor import MemType
from .tensor import TensorFormat
from .tensor import TensorPurpose
@@ -168,11 +169,6 @@ Note the difference between ArchitectureFeatures and CompilerOptions
is_yoda_system = "yoda-" in self.accelerator_config
- if is_yoda_system:
- self.sram_size = 256 * 1024
- else:
- self.sram_size = 200 * 1024 * 1024
-
self.ncores = accel_config.cores
self.ofm_ublock = accel_config.ofm_ublock
self.ifm_ublock = accel_config.ifm_ublock
@@ -233,7 +229,8 @@ Note the difference between ArchitectureFeatures and CompilerOptions
self.default_weight_format = TensorFormat.WeightsCompressed
self.default_feature_map_format = TensorFormat.NHWC
- if permanent_storage != MemArea.OffChipFlash:
+ # Ignore permanent_storage = On-/OffChipFlash for Yoda systems
+ if not is_yoda_system and permanent_storage != MemArea.OffChipFlash:
self.permanent_storage_mem_area = permanent_storage
self.tensor_storage_mem_area = {
@@ -243,10 +240,10 @@ Note the difference between ArchitectureFeatures and CompilerOptions
TensorPurpose.FeatureMap: self.feature_map_storage_mem_area,
}
- self.tensor_load_mem_area = dict(self.tensor_storage_mem_area)
-
- if self.tensor_storage_mem_area[TensorPurpose.Weights] in (MemArea.OffChipFlash,):
- self.tensor_load_mem_area[TensorPurpose.Weights] = MemArea.Sram
+ self.tensor_storage_mem_type = {
+ TensorPurpose.Weights: MemType.Permanent_NPU,
+ TensorPurpose.FeatureMap: MemType.Scratch,
+ }
self.min_block_sizes = {
NpuBlockType.Default: (dpu_min_height, dpu_min_width),
@@ -278,7 +275,7 @@ Note the difference between ArchitectureFeatures and CompilerOptions
self.max_sram_used_weight = 1000
if is_yoda_system:
- self.max_sram_used_weight = 0
+ self.max_sram_used_weight = 1000
# Shared Buffer Block allocations
self.shram_bank_size = 1024 # bytes
@@ -589,14 +586,21 @@ Note the difference between ArchitectureFeatures and CompilerOptions
self.fast_storage_mem_area = MemArea[self.__sys_config("fast_storage_mem_area", "Sram")]
self.feature_map_storage_mem_area = MemArea[self.__sys_config("feature_map_storage_mem_area", "Sram")]
+
+ if self.fast_storage_mem_area != self.feature_map_storage_mem_area:
+ raise Exception(
+ "Invalid memory configuration fast_storage_mem_area must be same as feature_map_storage_mem_area"
+ )
self.permanent_storage_mem_area = MemArea[self.__sys_config("permanent_storage_mem_area", "OffChipFlash")]
- if self.permanent_storage_mem_area not in set((MemArea.OnChipFlash, MemArea.OffChipFlash)):
+ if self.permanent_storage_mem_area not in set((MemArea.OnChipFlash, MemArea.OffChipFlash, MemArea.Dram)):
raise Exception(
"Invalid permanent_storage_mem_area = "
+ str(self.permanent_storage_mem_area)
- + " (must be 'OnChipFlash' or 'OffChipFlash'). To store the weights and other constant data in SRAM"
- " select 'OnChipFlash'"
+ + " (must be 'OnChipFlash', 'OffChipFlash' or 'DRAM')."
+ " To store the weights and other constant data in SRAM on ethosu-55 select 'OnChipFlash'"
)
+ self.sram_size = 1024 * int(self.__sys_config("sram_size_kb", "204800"))
+
except Exception:
print("Error: Reading System Configuration in vela configuration file, section {}".format(section_key))
raise
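
The split above is the core of the change: MemArea still describes which physical memory a tensor lives in, while the new MemType (defined in tensor.py further down) describes its role, permanent data versus scratch, so constants and the tensor arena can both be placed in DRAM. A small sketch of the default purpose-to-MemType classification added here:

from ethosu.vela.tensor import MemType, TensorPurpose

tensor_storage_mem_type = {
    TensorPurpose.Weights: MemType.Permanent_NPU,   # constant data
    TensorPurpose.FeatureMap: MemType.Scratch,      # tensor arena
}
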
diff --git a/ethosu/vela/compiler_driver.py b/ethosu/vela/compiler_driver.py
index 9c345dba..e495f1ce 100644
--- a/ethosu/vela/compiler_driver.py
+++ b/ethosu/vela/compiler_driver.py
@@ -33,7 +33,7 @@ from . import weight_compressor
from .nn_graph import PassPlacement
from .nn_graph import TensorAllocator
from .rewrite_graph import verify_graph_health
-from .tensor import MemArea
+from .tensor import MemType
class CompilerOptions:
@@ -120,9 +120,6 @@ def compiler_driver(nng, arch, options, scheduler_options):
# block config, and calc and pack the scales and biases
weight_compressor.update_pass_weight_and_scale_tensors(nng, arch)
- # Memory area for all non-constant tensors (Cpu and Npu)
- non_const_mem_area = MemArea.Sram
-
# LiveRanges for constant tensors for all Npu subgraphs
permanent_storage = arch.permanent_storage_mem_area
lr_graph_flash = live_range.LiveRangeGraph()
@@ -135,7 +132,11 @@ def compiler_driver(nng, arch, options, scheduler_options):
for sg in nng.subgraphs:
if sg.placement == PassPlacement.Npu:
lr_graph_flash = live_range.extract_live_ranges_from_cascaded_passes(
- sg, permanent_storage, ignore_subgraph_input_output_tensors=True, lr_graph=lr_graph_flash
+ sg,
+ permanent_storage,
+ MemType.Permanent_NPU,
+ ignore_subgraph_input_output_tensors=True,
+ lr_graph=lr_graph_flash,
)
if len(nng.subgraphs) > 1:
@@ -143,12 +144,12 @@ def compiler_driver(nng, arch, options, scheduler_options):
# processed first during serialization into tensors
first_npu_sg = nng.subgraphs[1]
assert first_npu_sg.placement == PassPlacement.Npu
- # Use the linear allocator for constant tensors
tensor_allocation.allocate_tensors(
nng,
first_npu_sg,
arch,
permanent_storage,
+ set((MemType.Permanent_NPU,)),
scheduler_options.use_ifm_ofm_overlap,
TensorAllocator.LinearAlloc,
options.verbose_allocation,
@@ -159,19 +160,36 @@ def compiler_driver(nng, arch, options, scheduler_options):
# Allocate all non-constant tensors to the root, i.e. Cpu, subgraph. This step
# will start at the root subgraph's input and traverse from top to bottom. When
# it comes across an Npu-op it will extract live ranges for it's corresponding
- # Npu subgraph and add them to the root's live range graph. Finally, all of the
- # non-constant tensors are allocated together
+ # Npu subgraph and add them to the root's live range graph.
+ # The non-constant tensors are stored either in arch.feature_map_storage_mem_area or
+ # arch.fast_storage_mem_area.
+ # When these memory areas are the same, all non-constant tensors are allocated together.
+ # Otherwise they are allocated separately.
+
root_sg = nng.get_root_subgraph()
- tensor_allocation.allocate_tensors(
- nng,
- root_sg,
- arch,
- non_const_mem_area,
- scheduler_options.use_ifm_ofm_overlap,
- options.tensor_allocator,
- options.verbose_allocation,
- options.show_minimum_possible_allocation,
- )
+
+ alloc_list = []
+ if arch.feature_map_storage_mem_area == arch.fast_storage_mem_area:
+ mem_alloc_scratch = (arch.feature_map_storage_mem_area, set((MemType.Scratch, MemType.Scratch_fast)))
+ alloc_list.append(mem_alloc_scratch)
+ else:
+ mem_alloc_scratch = (arch.feature_map_storage_mem_area, set((MemType.Scratch,)))
+ mem_alloc_scratch_fast = (arch.fast_storage_mem_area, set((MemType.Scratch_fast,)))
+ alloc_list.append(mem_alloc_scratch)
+ alloc_list.append(mem_alloc_scratch_fast)
+
+ for alloc in alloc_list:
+ tensor_allocation.allocate_tensors(
+ nng,
+ root_sg,
+ arch,
+ alloc[0],
+ alloc[1],
+ scheduler_options.use_ifm_ofm_overlap,
+ options.tensor_allocator,
+ options.verbose_allocation,
+ options.show_minimum_possible_allocation,
+ )
# Generate command streams and serialise Npu-ops into tensors
for sg in nng.subgraphs:
@@ -194,6 +212,7 @@ def compiler_driver(nng, arch, options, scheduler_options):
root_sg,
arch,
permanent_storage,
+ set((MemType.Permanent_CPU,)),
scheduler_options.use_ifm_ofm_overlap,
TensorAllocator.LinearAlloc,
options.verbose_allocation,
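
The compiler driver now allocates non-constant tensors in one or two passes, depending on whether the tensor arena and the fast storage share a memory area. A hedged illustration of which (mem_area, MemType set) pairs get allocated for two configurations; the helper name and the example MemArea values are illustrative, and the pairing follows the alloc_list logic above.

from ethosu.vela.tensor import MemArea, MemType

def build_alloc_list(feature_map_storage_mem_area, fast_storage_mem_area):
    if feature_map_storage_mem_area == fast_storage_mem_area:
        # Scratch and Scratch_fast are co-allocated in a single pass
        return [(feature_map_storage_mem_area, set((MemType.Scratch, MemType.Scratch_fast)))]
    # Otherwise the tensor arena and the fast storage are allocated separately
    return [
        (feature_map_storage_mem_area, set((MemType.Scratch,))),
        (fast_storage_mem_area, set((MemType.Scratch_fast,))),
    ]

print(build_alloc_list(MemArea.Sram, MemArea.Sram))   # one combined allocation
print(build_alloc_list(MemArea.Dram, MemArea.Sram))   # two separate allocations
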
diff --git a/ethosu/vela/insert_dma.py b/ethosu/vela/insert_dma.py
index 7049a05f..5c05fc8f 100644
--- a/ethosu/vela/insert_dma.py
+++ b/ethosu/vela/insert_dma.py
@@ -19,6 +19,7 @@ from . import rewrite_graph
from .operation import NpuBlockType
from .operation import Operation
from .tensor import MemArea
+from .tensor import MemType
from .tensor import TensorPurpose
@@ -30,29 +31,34 @@ def insert_dma_cmd(op, arch):
return op # Already rewritten
for idx, tens in enumerate(op.inputs):
- if tens.mem_area in (MemArea.Dram, MemArea.OffChipFlash) and tens.mem_area != arch.fast_storage_mem_area:
- if tens.purpose == TensorPurpose.Weights or (
- tens.purpose == TensorPurpose.FeatureMap and op.type in binary_elementwise_op and tens.shape != []
+ if tens.mem_type not in (MemType.Scratch, MemType.Scratch_fast):
+ # Tensor is in permanent storage
+ # Moving the data is only worthwhile when permanent storage differs from fast storage
+ if tens.mem_area in (MemArea.Dram, MemArea.OffChipFlash) and (
+ arch.permanent_storage_mem_area != arch.fast_storage_mem_area
):
- only_vector_product_consumers = True
- for oper in tens.consumers():
- if oper is None or oper.attrs.get("npu_block_type") != NpuBlockType.VectorProduct:
- only_vector_product_consumers = False
- break
+ if tens.purpose == TensorPurpose.Weights or (
+ tens.purpose == TensorPurpose.FeatureMap and op.type in binary_elementwise_op and tens.shape != []
+ ):
+ only_vector_product_consumers = True
+ for oper in tens.consumers():
+ if oper is None or oper.attrs.get("npu_block_type") != NpuBlockType.VectorProduct:
+ only_vector_product_consumers = False
+ break
- # Tensor products has no need for DMA, tensors are only read once and can be in flash.
- # Other operations re-reads tensors, this is better done from SRAM.
- if not only_vector_product_consumers:
- # Insert a DMA command here, as well as a new tensor situated in SRAM of the same size.
- new_tens = tens.clone_into_fast_storage(arch)
- dma_cmd = Operation("DMA", tens.ops[0].name + "_dma")
- dma_cmd.inputs = [tens]
- dma_cmd.outputs = [new_tens]
- dma_cmd.attrs["source"] = tens.mem_area
- dma_cmd.attrs["destination"] = new_tens.mem_area
- dma_cmd.run_on_npu = True
- new_tens.ops = [dma_cmd]
- op.inputs[idx] = new_tens
+ # Tensor products have no need for DMA; their tensors are only read once and can stay in flash.
+ # Other operations re-read tensors, so this is better done from SRAM.
+ if not only_vector_product_consumers:
+ # Insert a DMA command here, as well as a new tensor situated in SRAM of the same size.
+ new_tens = tens.clone_into_fast_storage(arch)
+ dma_cmd = Operation("DMA", tens.ops[0].name + "_dma")
+ dma_cmd.inputs = [tens]
+ dma_cmd.outputs = [new_tens]
+ dma_cmd.attrs["source"] = tens.mem_area
+ dma_cmd.attrs["destination"] = new_tens.mem_area
+ dma_cmd.run_on_npu = True
+ new_tens.ops = [dma_cmd]
+ op.inputs[idx] = new_tens
return op
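
The rewritten insert_dma_cmd() above keeps the old purpose and consumer checks but adds an outer guard based on MemType. A condensed sketch of that outer guard only (the function name is illustrative; the weight/feature-map and vector-product checks from the diff still apply on top of it):

from ethosu.vela.tensor import MemArea, MemType

def may_need_dma_to_fast_storage(tens, arch):
    # Only tensors in permanent storage are candidates for a DMA copy
    in_permanent_storage = tens.mem_type not in (MemType.Scratch, MemType.Scratch_fast)
    # Moving data only pays off when it sits in DRAM or off-chip flash and the
    # fast storage is a different memory area
    worth_moving = (
        tens.mem_area in (MemArea.Dram, MemArea.OffChipFlash)
        and arch.permanent_storage_mem_area != arch.fast_storage_mem_area
    )
    return in_permanent_storage and worth_moving
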
diff --git a/ethosu/vela/live_range.py b/ethosu/vela/live_range.py
index 2a35a119..8fe3d571 100644
--- a/ethosu/vela/live_range.py
+++ b/ethosu/vela/live_range.py
@@ -18,7 +18,7 @@
# Can work with either a pass packed subgraph or a scheduled subgraph.
from .high_level_command_stream_generator import calc_allowed_ofm_ifm_overlap_for_cascaded_pass
from .nn_graph import PassPlacement
-from .tensor import MemArea
+from .tensor import MemType
from .tensor import Tensor
@@ -220,6 +220,7 @@ def extract_live_ranges_from_passes(
def extract_live_ranges_from_cascaded_passes(
sg,
target_mem_area,
+ target_mem_type_set,
mark_output_tensors_overlapping_with_input_tensors=False,
use_ifm_ofm_overlap=True,
ignore_subgraph_input_output_tensors=False,
@@ -236,8 +237,8 @@ def extract_live_ranges_from_cascaded_passes(
lr_graph.ignore_tensors.update(sg.input_tensors)
lr_graph.ignore_tensors.update(sg.output_tensors)
- def tensor_should_be_ignored(tens, target_mem_area):
- if tens.mem_area != target_mem_area:
+ def tensor_should_be_ignored(tens, target_mem_area, target_mem_type_set):
+ if tens.mem_area != target_mem_area or tens.mem_type not in target_mem_type_set:
return True
if tens in lr_graph.ignore_tensors:
return True
@@ -247,9 +248,24 @@ def extract_live_ranges_from_cascaded_passes(
return True
return False
+ def merge_memory_op_ranges(sg, lr_graph, tensor_should_be_ignored, target_mem_area, target_mem_type_set):
+ for ps in sg.passes:
+ if ps.placement == PassPlacement.MemoryOnly:
+ # For memory-only passes, e.g. Reshape, add the input and output tensors to the same LiveRange
+ input_tensor = ps.inputs[0]
+ output_tensor = ps.outputs[0]
+ # If the input or output tensor is tied to a Cpu tensor, i.e. a subgraph input
+ # or output, fuse the live-range with the Cpu tensors' live-range instead.
+ input_tensor = input_tensor.cpu_tensor if input_tensor.cpu_tensor is not None else input_tensor
+ output_tensor = output_tensor.cpu_tensor if output_tensor.cpu_tensor is not None else output_tensor
+ if not tensor_should_be_ignored(input_tensor, target_mem_area, target_mem_type_set) and not (
+ tensor_should_be_ignored(output_tensor, target_mem_area, target_mem_type_set)
+ ):
+ lr_graph.fuse_ranges(input_tensor, output_tensor)
+
# Merge only memory operations in the NPU subgraphs
if sg.placement == PassPlacement.Npu:
- merge_memory_op_ranges(sg, lr_graph, tensor_should_be_ignored, target_mem_area)
+ merge_memory_op_ranges(sg, lr_graph, tensor_should_be_ignored, target_mem_area, target_mem_type_set)
for cps in sg.cascaded_passes:
cps.time = lr_graph.current_time
@@ -259,19 +275,21 @@ def extract_live_ranges_from_cascaded_passes(
is_element_wise = cps.is_element_wise
for tens in cps.inputs:
- if tensor_should_be_ignored(tens, target_mem_area):
+ if tensor_should_be_ignored(tens, target_mem_area, target_mem_type_set):
continue
rng = lr_graph.get_or_create_range(tens)
rng.mark_usage(time_for_pass)
cps_primary_op = cps.passes[0].primary_op
- if cps_primary_op and cps_primary_op.type == "NpuOp" and target_mem_area in set((MemArea.Sram, MemArea.Dram)):
+
+ if cps_primary_op and cps_primary_op.type == "NpuOp" and MemType.Permanent_CPU not in target_mem_type_set:
# If the primary-op is an NpuOp that means this is where an Npu subgraph
# is called. Go into said subgraph and extract live ranges before continuing.
npu_sg = cps_primary_op.attrs["subgraph"]
lr_graph = extract_live_ranges_from_cascaded_passes(
npu_sg,
target_mem_area,
+ target_mem_type_set,
mark_output_tensors_overlapping_with_input_tensors,
use_ifm_ofm_overlap,
False,
@@ -282,13 +300,13 @@ def extract_live_ranges_from_cascaded_passes(
cps.time = time_for_pass
for tens in cps.intermediates:
- if tensor_should_be_ignored(tens, target_mem_area):
+ if tensor_should_be_ignored(tens, target_mem_area, target_mem_type_set):
continue
rng = lr_graph.get_or_create_range(tens)
rng.mark_usage(time_for_pass)
for tens in cps.outputs:
- if tensor_should_be_ignored(tens, target_mem_area):
+ if tensor_should_be_ignored(tens, target_mem_area, target_mem_type_set):
continue
rng = lr_graph.get_or_create_range(tens)
output_time = time_for_pass
@@ -303,8 +321,8 @@ def extract_live_ranges_from_cascaded_passes(
if (
ifm_tensor is not None
and ofm_tensor is not None
- and not tensor_should_be_ignored(ifm_tensor, target_mem_area)
- and not tensor_should_be_ignored(ofm_tensor, target_mem_area)
+ and not tensor_should_be_ignored(ifm_tensor, target_mem_area, target_mem_type_set)
+ and not tensor_should_be_ignored(ofm_tensor, target_mem_area, target_mem_type_set)
):
lr_graph.allowed_overlaps[(ifm_tensor, ofm_tensor)] = calc_allowed_ofm_ifm_overlap_for_cascaded_pass(
cps
@@ -318,7 +336,7 @@ def extract_live_ranges_from_cascaded_passes(
end_time = max(end_time, rng.end_time)
for tens in sg.output_tensors:
- if tensor_should_be_ignored(tens, target_mem_area):
+ if tensor_should_be_ignored(tens, target_mem_area, target_mem_type_set):
continue
rng = lr_graph.get_or_create_range(tens)
rng.mark_usage(end_time)
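
Live-range extraction is now filtered on both the target MemArea and a set of MemTypes. A sketch of the constant-tensor call made from compiler_driver.py, wrapped in a helper for readability (npu_sg and arch are assumed to be an NPU subgraph and its ArchitectureFeatures):

from ethosu.vela import live_range
from ethosu.vela.tensor import MemType

def extract_constant_live_ranges(npu_sg, arch):
    lr_graph = live_range.LiveRangeGraph()
    return live_range.extract_live_ranges_from_cascaded_passes(
        npu_sg,
        arch.permanent_storage_mem_area,
        set((MemType.Permanent_NPU,)),
        ignore_subgraph_input_output_tensors=True,
        lr_graph=lr_graph,
    )
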
diff --git a/ethosu/vela/mark_tensors.py b/ethosu/vela/mark_tensors.py
index c4f2bae2..705f839b 100644
--- a/ethosu/vela/mark_tensors.py
+++ b/ethosu/vela/mark_tensors.py
@@ -18,6 +18,7 @@
from . import rewrite_graph
from . import weight_compressor
from .errors import OperatorError
+from .tensor import MemType
from .tensor import TensorFormat
from .tensor import TensorPurpose
from .tflite_mapping import custom_prefix
@@ -254,11 +255,13 @@ def mark_tensor_purpose(nng, arch, verbose_tensor_purpose=False):
else:
assert 0, "Cannot resolve tensor purpose %s and %s for tensor %s" % (tens.purpose, purpose, tens)
tens.mem_area = arch.tensor_storage_mem_area[tens.purpose]
+ tens.mem_type = arch.tensor_storage_mem_type[tens.purpose]
if len(tens.ops) == 1 and tens.ops[0].type == "Const":
tens.mem_area = (
arch.permanent_storage_mem_area
) # special case constants, as they must be in permanent storage
+ tens.mem_type = MemType.Permanent_NPU
def rewrite_mark_tensor_purpose(op, arch):
# find disconnected outputs and mark as parameters
diff --git a/ethosu/vela/nn_graph.py b/ethosu/vela/nn_graph.py
index ea35c087..247e6cce 100644
--- a/ethosu/vela/nn_graph.py
+++ b/ethosu/vela/nn_graph.py
@@ -137,6 +137,7 @@ class Subgraph:
self.flash_tensor = None
self.memory_used = {}
+ self.memory_used_per_type = {}
def __str__(self):
return "<nng.Subgraph '%s', n_passes=%d, n_cascaded_passes=%d>" % (
@@ -349,9 +350,15 @@ class Subgraph:
for idx, op in enumerate(all_ops):
print(idx, op.type, op.name)
for idx, tens in enumerate(op.inputs):
- print(" Input %02d %20s %20s %s" % (idx, tens.purpose.name, tens.mem_area.name, tens))
+ print(
+ " Input %02d %20s %20s %20s %s"
+ % (idx, tens.purpose.name, tens.mem_area.name, tens.mem_type.name, tens)
+ )
for idx, tens in enumerate(op.outputs):
- print(" Output %02d %20s %20s %s" % (idx, tens.purpose.name, tens.mem_area.name, tens))
+ print(
+ " Output %02d %20s %20s %20s %s"
+ % (idx, tens.purpose.name, tens.mem_area.name, tens.mem_type.name, tens)
+ )
print()
def print_graph_with_tensor_quantization(self):
diff --git a/ethosu/vela/npu_serialisation.py b/ethosu/vela/npu_serialisation.py
index 18d38f3f..bd13a3ec 100644
--- a/ethosu/vela/npu_serialisation.py
+++ b/ethosu/vela/npu_serialisation.py
@@ -24,14 +24,16 @@ from .data_type import DataType
from .nn_graph import PassPlacement
from .operation import Operation
from .tensor import MemArea
+from .tensor import MemType
from .tensor import Tensor
from .tensor import TensorFormat
from .tensor import TensorPurpose
-def make_memory_tensor(name, mem_area, sz, want_values, arch):
+def make_memory_tensor(name, mem_area, mem_type, sz, want_values, arch):
tens = Tensor([sz], DataType.uint8, name)
tens.mem_area = mem_area
+ tens.mem_type = mem_type
tens.purpose = TensorPurpose.FeatureMap
tens.set_format(TensorFormat.NHWC, arch)
if want_values:
@@ -58,7 +60,7 @@ def serialise_npu_subgraph_into_tensors(nng, sg, arch, scratch_tens, flash_tens)
return scratch_tens, flash_tens
flash_area = arch.permanent_storage_mem_area
- scratch_area = MemArea.Sram
+ scratch_area = arch.feature_map_storage_mem_area
flash_size = sg.memory_used.get(flash_area, 0)
scratch_size = sg.memory_used.get(scratch_area, 0)
@@ -85,9 +87,13 @@ def serialise_npu_subgraph_into_tensors(nng, sg, arch, scratch_tens, flash_tens)
if flash_tens == scratch_tens is None:
# First Npu subgraph, create scratch and flash tensors
- sg.scratch_tensor = make_memory_tensor(sg.name + "_scratch", scratch_area, scratch_size, False, arch)
+ sg.scratch_tensor = make_memory_tensor(
+ sg.name + "_scratch", scratch_area, MemType.Scratch, scratch_size, False, arch
+ )
sg.scratch_tensor.purpose = TensorPurpose.Scratch
- sg.flash_tensor = make_memory_tensor(sg.name + "_flash", flash_area, flash_size, True, arch)
+ sg.flash_tensor = make_memory_tensor(
+ sg.name + "_flash", flash_area, MemType.Permanent_CPU, flash_size, True, arch
+ )
else:
sg.scratch_tensor = scratch_tens
sg.scratch_tensor.shape[0] += scratch_size
@@ -108,13 +114,15 @@ def serialise_npu_subgraph_into_tensors(nng, sg, arch, scratch_tens, flash_tens)
copy_compressed_values_to_memory_tensor(sg.flash_tensor, ps.scale_tensor)
- if ps.ifm_tensor is not None and ps.ifm_tensor.mem_area != MemArea.Sram:
+ if ps.ifm_tensor is not None and ps.ifm_tensor.mem_type not in (MemType.Scratch, MemType.Scratch_fast):
copy_ifm_values_to_memory_tensor(sg.flash_tensor, ps.ifm_tensor)
- if ps.ifm2_tensor is not None and ps.ifm2_tensor.mem_area != MemArea.Sram:
+ if ps.ifm2_tensor is not None and (
+ ps.ifm2_tensor.mem_type not in (MemType.Scratch, MemType.Scratch_fast)
+ ):
copy_ifm_values_to_memory_tensor(sg.flash_tensor, ps.ifm2_tensor)
sg.command_stream_tensor = make_memory_tensor(
- sg.name + "_command_stream", flash_area, command_stream_size_bytes, True, arch
+ sg.name + "_command_stream", flash_area, MemType.Permanent_CPU, command_stream_size_bytes, True, arch
)
sg.command_stream_tensor.values = np.frombuffer(payload_bytes, dtype=np.uint8)
@@ -156,4 +164,5 @@ def rewrite_npu_call_ops(nng, sg, arch):
prev_cps.sram_used += sz
if callee.scratch_tensor is not None:
- cps.sram_used += callee.scratch_tensor.storage_size()
+ if callee.scratch_tensor.mem_area == MemArea.Sram:
+ cps.sram_used += callee.scratch_tensor.storage_size()
diff --git a/ethosu/vela/register_command_stream_generator.py b/ethosu/vela/register_command_stream_generator.py
index c46016d7..9dd290a9 100644
--- a/ethosu/vela/register_command_stream_generator.py
+++ b/ethosu/vela/register_command_stream_generator.py
@@ -50,7 +50,7 @@ from .numeric_util import round_up
from .numeric_util import round_up_to_int
from .operation import NpuBlockType
from .shared_buffer_allocation import SharedBufferAllocation
-from .tensor import MemArea
+from .tensor import MemType
from .tensor import TensorBlockTraversal
from .tensor import TensorFormat
@@ -79,8 +79,9 @@ class CmdMode(IntEnum):
class BasePointerIndex(IntEnum):
- ReadOnly = 0 # base address slot index for weights and scaling
- Scratch = 1 # base address slot index for scratch memory area
+ WeightTensor = 0 # base address index for the Weight tensor
+ ScratchTensor = 1 # base address index for the Scratch_tensor in the TensorArena
+ ScratchFastTensor = 2 # base address for the Scratch_fast_tensor
# TODO: Replace with definitions from ethos_u55_regs
@@ -322,12 +323,20 @@ def get_op_padding_lt(cmd):
def generate_register_command_stream(nng, sg, arch, verbose=False):
emit = CommandStreamEmitter()
- base_ptr_idx_map = {
- MemArea.Sram: BasePointerIndex.Scratch,
- MemArea.OnChipFlash: BasePointerIndex.ReadOnly,
- MemArea.OffChipFlash: BasePointerIndex.ReadOnly,
- MemArea.Dram: BasePointerIndex.ReadOnly,
- }
+ if arch.feature_map_storage_mem_area == arch.fast_storage_mem_area:
+ base_ptr_idx_map = {
+ MemType.Permanent_NPU: BasePointerIndex.WeightTensor,
+ MemType.Permanent_CPU: BasePointerIndex.WeightTensor,
+ MemType.Scratch: BasePointerIndex.ScratchTensor,
+ MemType.Scratch_fast: BasePointerIndex.ScratchTensor,
+ }
+ else:
+ base_ptr_idx_map = {
+ MemType.Permanent_NPU: BasePointerIndex.WeightTensor,
+ MemType.Permanent_CPU: BasePointerIndex.WeightTensor,
+ MemType.Scratch: BasePointerIndex.ScratchTensor,
+ MemType.Scratch_fast: BasePointerIndex.ScratchFastTensor,
+ }
# Maps an AccumulatorType enum to the corresponding acc_format value
acc_format_map = {
@@ -377,8 +386,8 @@ def generate_register_command_stream(nng, sg, arch, verbose=False):
param = min(param, 0xFFFF) # Clamp to allowable wait amount
if relative_dep[CommandType.DMA] is not None:
- param = relative_dep[CommandType.DMA][0]
- param = min(param, 0xF) # Clamp to allowable wait amount
+ # TODO: This can be optimized for Yoda
+ param = 0
emit.cmd_wait(cmd0.NPU_OP_DMA_WAIT, param, absolute_dep[CommandType.DMA][0])
for cmd in cmd_stream:
@@ -394,10 +403,10 @@ def generate_register_command_stream(nng, sg, arch, verbose=False):
else:
sz = cmd.in_tensor.address_for_coordinate(cmd.box.end_coord, is_top_box=True) - src_addr
- # TODO: Yoda support needs to use feature_maps_not_in_fast_storage and force_outputs_to_fast_storage
- emit.cmd0_with_param(cmd0.NPU_SET_DMA0_SRC_REGION, base_ptr_idx_map[cmd.in_tensor.mem_area])
+ emit.cmd0_with_param(cmd0.NPU_SET_DMA0_SRC_REGION, base_ptr_idx_map[cmd.in_tensor.mem_type])
emit.cmd1_with_offset(cmd1.NPU_SET_DMA0_SRC, src_addr)
- emit.cmd0_with_param(cmd0.NPU_SET_DMA0_DST_REGION, base_ptr_idx_map[cmd.out_tensor.mem_area])
+ emit.cmd0_with_param(cmd0.NPU_SET_DMA0_DST_REGION, base_ptr_idx_map[cmd.out_tensor.mem_type])
+
emit.cmd1_with_offset(cmd1.NPU_SET_DMA0_DST, dst_addr)
emit.cmd1_with_offset(cmd1.NPU_SET_DMA0_LEN, sz)
dma_channel = 0
@@ -682,10 +691,7 @@ def generate_register_command_stream(nng, sg, arch, verbose=False):
stream_index = cmd.weight_tensor.compressed_stream_index_from_coord(cmd.weight_box.start_coord)
weight_addr = cmd.weight_tensor.address_for_coordinate(cmd.weight_box.start_coord)
weight_len = cmd.weight_tensor.size_of_compressed_stream(stream_index)
- # Select weight/scale region depending on where permanent storage was defined
- weight_region = base_ptr_idx_map[cmd.weight_tensor.mem_area]
- if arch.permanent_storage_mem_area == MemArea.Sram:
- weight_region = BasePointerIndex.ReadOnly
+ weight_region = base_ptr_idx_map[cmd.weight_tensor.mem_type]
emit.cmd0_with_param(cmd0.NPU_SET_WEIGHT_REGION, weight_region)
emit.cmd1_with_offset(cmd1.NPU_SET_WEIGHT_BASE, weight_addr)
emit.cmd1_with_offset(cmd1.NPU_SET_WEIGHT_LENGTH, weight_len)
@@ -699,9 +705,7 @@ def generate_register_command_stream(nng, sg, arch, verbose=False):
cmd.scale_tensor.address_for_coordinate(cmd.weight_box.end_coord[-1:], True) - scale_addr
)
# Emit base address for NPU to access scale & bias data
- scale_region = base_ptr_idx_map[cmd.scale_tensor.mem_area]
- if arch.permanent_storage_mem_area == MemArea.Sram:
- scale_region = BasePointerIndex.ReadOnly
+ scale_region = base_ptr_idx_map[cmd.scale_tensor.mem_type]
emit.cmd0_with_param(cmd0.NPU_SET_SCALE_REGION, scale_region)
emit.cmd1_with_offset(cmd1.NPU_SET_SCALE_BASE, scale_addr)
emit.cmd1_with_offset(cmd1.NPU_SET_SCALE_LENGTH, round_up(scale_len, 16))
@@ -850,10 +854,7 @@ def generate_register_command_stream(nng, sg, arch, verbose=False):
else:
assert False
- if tens.mem_area == MemArea.Sram:
- emit.cmd0_with_param(region_op, BasePointerIndex.Scratch)
- else:
- emit.cmd0_with_param(region_op, BasePointerIndex.ReadOnly)
+ emit.cmd0_with_param(region_op, base_ptr_idx_map[tens.mem_type])
for idx, addr in enumerate(addresses):
if addr is None:
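
The base pointer (region) selection above can be summarised as: weights always use region 0, the tensor arena uses region 1, and Scratch_fast only gets its own region 2 when it is allocated outside the tensor arena. A hedged stand-alone sketch of that mapping (function name illustrative, index values from the BasePointerIndex change above):

from ethosu.vela.tensor import MemType

def region_for(mem_type, feature_map_storage_mem_area, fast_storage_mem_area):
    if mem_type in (MemType.Permanent_NPU, MemType.Permanent_CPU):
        return 0  # BasePointerIndex.WeightTensor
    if mem_type == MemType.Scratch:
        return 1  # BasePointerIndex.ScratchTensor
    # MemType.Scratch_fast
    if feature_map_storage_mem_area == fast_storage_mem_area:
        return 1  # co-allocated with the tensor arena
    return 2      # BasePointerIndex.ScratchFastTensor
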
diff --git a/ethosu/vela/scheduler.py b/ethosu/vela/scheduler.py
index 0b594310..be104b88 100644
--- a/ethosu/vela/scheduler.py
+++ b/ethosu/vela/scheduler.py
@@ -38,6 +38,7 @@ from .operation import NpuBlockType
from .shared_buffer_allocation import find_block_configs_suitable_for_pass_and_shared_buffer
from .shared_buffer_allocation import shared_buffer_allocation_for_pass_and_block_config
from .tensor import MemArea
+from .tensor import MemType
from .tensor import TensorFormat
from .tensor import TensorPurpose
from .tensor import TensorSubPurpose
@@ -833,6 +834,7 @@ class DynamicProgrammingScheduler:
for rewrite_op, tens, sub_purpose, param_a, param_b, ps in strat.rewrite_list:
if rewrite_op == SchedulerRewrite.ChangeTensorSubPurpose:
tens.mem_area = self.arch.fast_storage_mem_area
+ tens.mem_type = MemType.Scratch_fast
tens.set_new_sub_purpose(sub_purpose, param_a, param_b)
else:
assert 0, "unknown rewrite_op " + str(rewrite_op)
diff --git a/ethosu/vela/stats_writer.py b/ethosu/vela/stats_writer.py
index 9bbb9db5..c90d9876 100644
--- a/ethosu/vela/stats_writer.py
+++ b/ethosu/vela/stats_writer.py
@@ -201,7 +201,10 @@ def write_pass_metrics_csv(nng, pass_filename):
for k in indices[2]:
res += round_up_to_int(ps.bandwidths[i, j, k])
stats.append(res)
- stats += [ps.sram_used]
+ try:
+ stats += [ps.sram_used]
+ except AttributeError:
+ stats += [0]
writer.writerow(stats)
diff --git a/ethosu/vela/tensor.py b/ethosu/vela/tensor.py
index 42d95262..3990164d 100644
--- a/ethosu/vela/tensor.py
+++ b/ethosu/vela/tensor.py
@@ -26,6 +26,27 @@ from .numeric_util import round_up_divide
from .range_set import MemoryRangeSet
+class MemType(enum.IntFlag):
+ Unknown = 0
+ Permanent_NPU = 1
+ Permanent_CPU = 2
+ Scratch = 3
+ Scratch_fast = 4
+ Size = Scratch_fast + 1
+
+ def display_name(self):
+ return ("Unknown", "Permanent_NPU", "Permanent_CPU", "Scratch", "Scratch_fast", "Size")[self.value]
+
+ def identifier_name(self):
+ return ("unknown", "permanent_npu", "permanent_cpu", "scratch", "scratch_fast", "size")[self.value]
+
+ def all():
+ return (MemType.Permanent_NPU, MemType.Permanent_CPU, MemType.Scratch, MemType.Scratch_fast)
+
+ def __str__(self):
+ return self.name
+
+
class MemArea(enum.IntFlag):
Unknown = 0
Sram = 1
@@ -209,6 +230,7 @@ class Tensor:
"quant_values",
"compressed_values",
"mem_area",
+ "mem_type",
"format",
"purpose",
"sub_purpose",
@@ -252,6 +274,7 @@ class Tensor:
self.quant_values = None
self.compressed_values = None
self.mem_area = MemArea.Unknown
+ self.mem_type = MemType.Unknown
self.format = TensorFormat.Unknown
self.purpose = TensorPurpose.Unknown
self.sub_purpose = TensorSubPurpose.Standard
@@ -291,6 +314,7 @@ class Tensor:
res.values = self.values
res.quant_values = self.quant_values
res.mem_area = self.mem_area
+ res.mem_type = self.mem_type
res.format = self.format
res.purpose = self.purpose
res.sub_purpose = self.sub_purpose
@@ -312,6 +336,7 @@ class Tensor:
def clone_into_fast_storage(self, arch):
res = self.clone(suffix="_fast_storage")
res.mem_area = arch.fast_storage_mem_area
+ res.mem_type = MemType.Scratch_fast
return res
def copy_compressed_weight_info(self, src_tens):
@@ -641,6 +666,11 @@ class Tensor:
assert address_offset <= self.storage_size()
return address_offset
+ def is_allocated_in_tensor_arena(self, scratch_tensor_mem_area):
+ if self.mem_area == scratch_tensor_mem_area and (self.mem_type in set((MemType.Scratch, MemType.Scratch_fast))):
+ return True
+ return False
+
def __str__(self):
return "<nng.Tensor '%s' shape=%s dtype=%s>" % (self.name, self.shape, self.dtype)
diff --git a/ethosu/vela/tensor_allocation.py b/ethosu/vela/tensor_allocation.py
index e3952df3..f29296d1 100644
--- a/ethosu/vela/tensor_allocation.py
+++ b/ethosu/vela/tensor_allocation.py
@@ -25,6 +25,7 @@ from . import numeric_util
from .greedy_allocation import allocate_live_ranges as greedy_allocate_live_ranges
from .nn_graph import TensorAllocator
from .tensor import MemArea
+from .tensor import MemType
def linear_allocate_live_ranges(live_ranges, alloc_granularity=16):
@@ -66,12 +67,13 @@ def mark_sram_used_for_cascaded_passes(sg, lrs):
ps.sram_used = sram_used
-def print_allocation(lrs, mem_area, sg, verbose_allocation, show_minimum_possible_allocation):
+def print_allocation(lrs, mem_area, mem_type_set, sg, verbose_allocation, show_minimum_possible_allocation):
if verbose_allocation:
- if mem_area == MemArea.Sram:
- print("allocation for", mem_area, "- non-constant tensors in Cpu and Npu subgraphs")
- else:
+ if mem_type_set == set((MemType.Permanent_NPU,)) or mem_type_set == set((MemType.Permanent_CPU,)):
print("allocation for", mem_area, "- constant tensors in", sg.placement.name, "subgraph(s)")
+ else:
+ print("allocation for", mem_area, "- non-constant tensors in Cpu and Npu subgraphs")
+
for start_time, start, end, name, end_time in sorted(
(
lr.start_time,
@@ -99,6 +101,7 @@ def allocate_tensors(
sg,
arch,
mem_area,
+ mem_type_set,
use_ifm_ofm_overlap=True,
tensor_allocator=TensorAllocator.Greedy,
verbose_allocation=False,
@@ -109,6 +112,7 @@ def allocate_tensors(
lrs = live_range.extract_live_ranges_from_cascaded_passes(
sg,
mem_area,
+ mem_type_set,
mark_output_tensors_overlapping_with_input_tensors=False,
use_ifm_ofm_overlap=use_ifm_ofm_overlap,
ignore_subgraph_input_output_tensors=ignore_subgraph_input_output_tensors,
@@ -120,16 +124,26 @@ def allocate_tensors(
if tens_alloc == TensorAllocator.Greedy:
total_sz = greedy_allocate_live_ranges(sg, arch, lrs, mem_area, verbose_allocation)
elif tens_alloc == TensorAllocator.LinearAlloc:
- total_sz = linear_allocate_live_ranges(lrs)
+ total_sz = linear_allocate_live_ranges(lrs, 16)
else:
assert 0
- sg.memory_used[mem_area] = total_sz
+ if sg.memory_used.get(mem_area, 0) == 0:
+ sg.memory_used[mem_area] = total_sz
+ else:
+ sg.memory_used[mem_area] += total_sz
+
+ # Keep track of how much memory is used for scratch or permanent storage for the NPU
+ for mem_type in mem_type_set:
+ if sg.memory_used_per_type.get(mem_type, 0) == 0:
+ sg.memory_used_per_type[mem_type] = total_sz
+ else:
+ sg.memory_used_per_type[mem_type] += total_sz
nng.total_size[mem_area] = nng.total_size.get(mem_area, 0) + sum(tens.storage_size() for tens in lrs.ranges)
nng.total_elements[mem_area] = nng.total_elements.get(mem_area, 0) + sum(tens.elements() for tens in lrs.ranges)
- print_allocation(lrs, mem_area, sg, verbose_allocation, show_minimum_possible_allocation)
+ print_allocation(lrs, mem_area, mem_type_set, sg, verbose_allocation, show_minimum_possible_allocation)
if mem_area == MemArea.Sram:
# Mark Sram usage for all subgraphs
diff --git a/ethosu/vela/tflite_writer.py b/ethosu/vela/tflite_writer.py
index 8db3e5b8..7e805e31 100644
--- a/ethosu/vela/tflite_writer.py
+++ b/ethosu/vela/tflite_writer.py
@@ -22,7 +22,7 @@ from flatbuffers import encode
from flatbuffers.builder import UOffsetTFlags
from .nn_graph import PassPlacement
-from .tensor import MemArea
+from .tensor import MemType
from .tensor import TensorPurpose
from .tflite import Buffer
from .tflite import Metadata
@@ -74,6 +74,7 @@ class TFLiteSerialiser:
self.nng = nng
self.scratch_buf_id = 0 # Always assign scratch to buffer 0
+ self.scratch_fast_buf_id = 1 # Always assign scratch_fast to buffer 1
self.buffer_offsets_map = {}
self.buffers_to_write = [] # have an empty array there
@@ -140,11 +141,16 @@ class TFLiteSerialiser:
scratch_tensor_mem_area = None # all tensors are initialised to MemArea.Unknown
buffer_map = {}
+
buf_idx = 1
for tens in tensors:
- if tens.mem_area == scratch_tensor_mem_area:
+ # Set buffer ids depending on allocation
+ if tens.is_allocated_in_tensor_arena(scratch_tensor_mem_area):
buffer_map[tens] = self.scratch_buf_id
+ elif tens.mem_type == MemType.Scratch_fast:
+ # For Scratch_fast when not co-allocated with scratch in the TensorArena:
+ buffer_map[tens] = self.scratch_fast_buf_id
else:
buffer_map[tens] = buf_idx
buf_idx += 1
@@ -229,11 +235,9 @@ class TFLiteSerialiser:
if tens.purpose == TensorPurpose.Scratch:
tens_shape = [0]
- self.buffers_to_write[self.scratch_buf_id] = values.flatten().view(np.uint8)
buf_id = self.buffer_map[tens]
- if buf_id != self.scratch_buf_id:
- self.buffers_to_write[buf_id] = values.flatten().view(np.uint8)
+ self.buffers_to_write[buf_id] = values.flatten().view(np.uint8)
shape = self.write_int_vector(tens_shape)
@@ -396,7 +400,8 @@ class TFLiteSerialiser:
# Ensure that the order of the offsets match the order of the tensors
for tens, idx in self.tensor_map.items():
- if tens.mem_area == MemArea.Sram:
+ # Set offsets for tensors allocated in the Tensor Arena or in the scratch_fast area
+ if tens.mem_type in set((MemType.Scratch, MemType.Scratch_fast)):
offsets[idx] = np.int32(tens.address)
metadata_buffer = np.array([version, subgraph_idx, nbr_tensors] + offsets)
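
The buffer-id selection in tflite_writer.py can be condensed as follows: tensors allocated in the tensor arena share the scratch buffer (0), separately allocated Scratch_fast tensors share the scratch_fast buffer (1), and every other tensor gets its own buffer. A hedged sketch mirroring the loop above (the function name is illustrative; the buffer ids are those from the diff):

from ethosu.vela.tensor import MemType

def buffer_id_for(tens, scratch_tensor_mem_area, buf_idx,
                  scratch_buf_id=0, scratch_fast_buf_id=1):
    # Returns (assigned buffer id, next free buffer index)
    if tens.is_allocated_in_tensor_arena(scratch_tensor_mem_area):
        return scratch_buf_id, buf_idx
    if tens.mem_type == MemType.Scratch_fast:
        return scratch_fast_buf_id, buf_idx
    return buf_idx, buf_idx + 1
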