13 files changed, 234 insertions, 113 deletions
diff --git a/ethosu/vela/architecture_features.py b/ethosu/vela/architecture_features.py
index fef2c40f..e33c5d55 100644
--- a/ethosu/vela/architecture_features.py
+++ b/ethosu/vela/architecture_features.py
@@ -28,6 +28,7 @@ from .numeric_util import round_up_divide
 from .operation import NpuBlockType
 from .supported_operators import SupportedOperators
 from .tensor import MemArea
+from .tensor import MemType
 from .tensor import TensorFormat
 from .tensor import TensorPurpose
 
@@ -168,11 +169,6 @@ Note the difference between ArchitectureFeatures and CompilerOptions
 
         is_yoda_system = "yoda-" in self.accelerator_config
 
-        if is_yoda_system:
-            self.sram_size = 256 * 1024
-        else:
-            self.sram_size = 200 * 1024 * 1024
-
         self.ncores = accel_config.cores
         self.ofm_ublock = accel_config.ofm_ublock
         self.ifm_ublock = accel_config.ifm_ublock
@@ -233,7 +229,8 @@ Note the difference between ArchitectureFeatures and CompilerOptions
         self.default_weight_format = TensorFormat.WeightsCompressed
         self.default_feature_map_format = TensorFormat.NHWC
 
-        if permanent_storage != MemArea.OffChipFlash:
+        # This is to ignore permanent_storage = On/OffChipflash for Yoda
+        if not is_yoda_system and permanent_storage != MemArea.OffChipFlash:
             self.permanent_storage_mem_area = permanent_storage
 
         self.tensor_storage_mem_area = {
@@ -243,10 +240,10 @@ Note the difference between ArchitectureFeatures and CompilerOptions
             TensorPurpose.FeatureMap: self.feature_map_storage_mem_area,
         }
 
-        self.tensor_load_mem_area = dict(self.tensor_storage_mem_area)
-
-        if self.tensor_storage_mem_area[TensorPurpose.Weights] in (MemArea.OffChipFlash,):
-            self.tensor_load_mem_area[TensorPurpose.Weights] = MemArea.Sram
+        self.tensor_storage_mem_type = {
+            TensorPurpose.Weights: MemType.Permanent_NPU,
+            TensorPurpose.FeatureMap: MemType.Scratch,
+        }
 
         self.min_block_sizes = {
             NpuBlockType.Default: (dpu_min_height, dpu_min_width),
@@ -278,7 +275,7 @@ Note the difference between ArchitectureFeatures and CompilerOptions
         self.max_sram_used_weight = 1000
 
         if is_yoda_system:
-            self.max_sram_used_weight = 0
+            self.max_sram_used_weight = 1000
 
         # Shared Buffer Block allocations
         self.shram_bank_size = 1024  # bytes
@@ -589,14 +586,21 @@ Note the difference between ArchitectureFeatures and CompilerOptions
 
             self.fast_storage_mem_area = MemArea[self.__sys_config("fast_storage_mem_area", "Sram")]
             self.feature_map_storage_mem_area = MemArea[self.__sys_config("feature_map_storage_mem_area", "Sram")]
+
+            if self.fast_storage_mem_area != self.feature_map_storage_mem_area:
+                raise Exception(
+                    "Invalid memory configuration fast_storage_mem_area must be same as feature_map_storage_mem_area"
+                )
             self.permanent_storage_mem_area = MemArea[self.__sys_config("permanent_storage_mem_area", "OffChipFlash")]
-            if self.permanent_storage_mem_area not in set((MemArea.OnChipFlash, MemArea.OffChipFlash)):
+            if self.permanent_storage_mem_area not in set((MemArea.OnChipFlash, MemArea.OffChipFlash, MemArea.Dram)):
                 raise Exception(
                     "Invalid permanent_storage_mem_area = "
                     + str(self.permanent_storage_mem_area)
-                    + " (must be 'OnChipFlash' or 'OffChipFlash'). To store the weights and other constant data in SRAM"
-                    " select 'OnChipFlash'"
+                    + " (must be 'OnChipFlash', 'OffChipFlash' or 'DRAM')."
+                    " To store the weights and other constant data in SRAM on ethosu-55 select 'OnChipFlash'"
                 )
+            self.sram_size = 1024 * int(self.__sys_config("sram_size_kb", "204800"))
+
         except Exception:
             print("Error: Reading System Configuration in vela configuration file, section {}".format(section_key))
             raise
diff --git a/ethosu/vela/compiler_driver.py b/ethosu/vela/compiler_driver.py
index 9c345dba..e495f1ce 100644
--- a/ethosu/vela/compiler_driver.py
+++ b/ethosu/vela/compiler_driver.py
@@ -33,7 +33,7 @@ from . import weight_compressor
 from .nn_graph import PassPlacement
 from .nn_graph import TensorAllocator
 from .rewrite_graph import verify_graph_health
-from .tensor import MemArea
+from .tensor import MemType
 
 
 class CompilerOptions:
@@ -120,9 +120,6 @@ def compiler_driver(nng, arch, options, scheduler_options):
     # block config, and calc and pack the scales and biases
     weight_compressor.update_pass_weight_and_scale_tensors(nng, arch)
 
-    # Memory area for all non-constant tensors (Cpu and Npu)
-    non_const_mem_area = MemArea.Sram
-
     # LiveRanges for constant tensors for all Npu subgraphs
     permanent_storage = arch.permanent_storage_mem_area
     lr_graph_flash = live_range.LiveRangeGraph()
@@ -135,7 +132,11 @@ def compiler_driver(nng, arch, options, scheduler_options):
     for sg in nng.subgraphs:
         if sg.placement == PassPlacement.Npu:
             lr_graph_flash = live_range.extract_live_ranges_from_cascaded_passes(
-                sg, permanent_storage, ignore_subgraph_input_output_tensors=True, lr_graph=lr_graph_flash
+                sg,
+                permanent_storage,
+                MemType.Permanent_NPU,
+                ignore_subgraph_input_output_tensors=True,
+                lr_graph=lr_graph_flash,
             )
 
     if len(nng.subgraphs) > 1:
@@ -143,12 +144,12 @@ def compiler_driver(nng, arch, options, scheduler_options):
         # processed first during serialization into tensors
         first_npu_sg = nng.subgraphs[1]
         assert first_npu_sg.placement == PassPlacement.Npu
-        # Use the linear allocator for constant tensors
         tensor_allocation.allocate_tensors(
             nng,
             first_npu_sg,
             arch,
             permanent_storage,
+            set((MemType.Permanent_NPU,)),
             scheduler_options.use_ifm_ofm_overlap,
             TensorAllocator.LinearAlloc,
             options.verbose_allocation,
@@ -159,19 +160,36 @@ def compiler_driver(nng, arch, options, scheduler_options):
     # Allocate all non-constant tensors to the root, i.e. Cpu, subgraph. This step
     # will start at the root subgraph's input and traverse from top to bottom. When
     # it comes across an Npu-op it will extract live ranges for it's corresponding
-    # Npu subgraph and add them to the root's live range graph. Finally, all of the
-    # non-constant tensors are allocated together
+    # Npu subgraph and add them to the root's live range graph.
+    # The non-constant tensors are stored either in arch.feature_map_storage_mem_area or
+    # arch.fast_storage_mem_area.
+    # When these memory areas are the same, all non-constant tensors are allocated together.
+    # Otherwise they are allocated separately.
+
     root_sg = nng.get_root_subgraph()
-    tensor_allocation.allocate_tensors(
-        nng,
-        root_sg,
-        arch,
-        non_const_mem_area,
-        scheduler_options.use_ifm_ofm_overlap,
-        options.tensor_allocator,
-        options.verbose_allocation,
-        options.show_minimum_possible_allocation,
-    )
+
+    alloc_list = []
+    if arch.feature_map_storage_mem_area == arch.fast_storage_mem_area:
+        mem_alloc_scratch = (arch.feature_map_storage_mem_area, set((MemType.Scratch, MemType.Scratch_fast)))
+        alloc_list.append(mem_alloc_scratch)
+    else:
+        mem_alloc_scratch = (arch.feature_map_storage_mem_area, set((MemType.Scratch,)))
+        mem_alloc_scratch_fast = (arch.fast_storage_mem_area, set((MemType.Scratch_fast,)))
+        alloc_list.append(mem_alloc_scratch)
+        alloc_list.append(mem_alloc_scratch_fast)
+
+    for alloc in alloc_list:
+        tensor_allocation.allocate_tensors(
+            nng,
+            root_sg,
+            arch,
+            alloc[0],
+            alloc[1],
+            scheduler_options.use_ifm_ofm_overlap,
+            options.tensor_allocator,
+            options.verbose_allocation,
+            options.show_minimum_possible_allocation,
+        )
 
     # Generate command streams and serialise Npu-ops into tensors
     for sg in nng.subgraphs:
@@ -194,6 +212,7 @@ def compiler_driver(nng, arch, options, scheduler_options):
         root_sg,
         arch,
         permanent_storage,
+        set((MemType.Permanent_CPU,)),
         scheduler_options.use_ifm_ofm_overlap,
         TensorAllocator.LinearAlloc,
         options.verbose_allocation,
diff --git a/ethosu/vela/insert_dma.py b/ethosu/vela/insert_dma.py
index 7049a05f..5c05fc8f 100644
--- a/ethosu/vela/insert_dma.py
+++ b/ethosu/vela/insert_dma.py
@@ -19,6 +19,7 @@ from . import rewrite_graph
 from .operation import NpuBlockType
 from .operation import Operation
 from .tensor import MemArea
+from .tensor import MemType
 from .tensor import TensorPurpose
 
 
@@ -30,29 +31,34 @@ def insert_dma_cmd(op, arch):
         return op  # Already rewritten
     for idx, tens in enumerate(op.inputs):
 
-        if tens.mem_area in (MemArea.Dram, MemArea.OffChipFlash) and tens.mem_area != arch.fast_storage_mem_area:
-            if tens.purpose == TensorPurpose.Weights or (
-                tens.purpose == TensorPurpose.FeatureMap and op.type in binary_elementwise_op and tens.shape != []
+        if tens.mem_type not in (MemType.Scratch, MemType.Scratch_fast):
+            # Tensor is in permanent storage
+            # Only when permanent storage differs from fast storage, there is a point moving the data
+            if tens.mem_area in (MemArea.Dram, MemArea.OffChipFlash) and (
+                arch.permanent_storage_mem_area != arch.fast_storage_mem_area
             ):
-                only_vector_product_consumers = True
-                for oper in tens.consumers():
-                    if oper is None or oper.attrs.get("npu_block_type") != NpuBlockType.VectorProduct:
-                        only_vector_product_consumers = False
-                        break
+                if tens.purpose == TensorPurpose.Weights or (
+                    tens.purpose == TensorPurpose.FeatureMap and op.type in binary_elementwise_op and tens.shape != []
+                ):
+                    only_vector_product_consumers = True
+                    for oper in tens.consumers():
+                        if oper is None or oper.attrs.get("npu_block_type") != NpuBlockType.VectorProduct:
+                            only_vector_product_consumers = False
+                            break
 
-                # Tensor products has no need for DMA, tensors are only read once and can be in flash.
-                # Other operations re-reads tensors, this is better done from SRAM.
-                if not only_vector_product_consumers:
-                    # Insert a DMA command here, as well as a new tensor situated in SRAM of the same size.
-                    new_tens = tens.clone_into_fast_storage(arch)
-                    dma_cmd = Operation("DMA", tens.ops[0].name + "_dma")
-                    dma_cmd.inputs = [tens]
-                    dma_cmd.outputs = [new_tens]
-                    dma_cmd.attrs["source"] = tens.mem_area
-                    dma_cmd.attrs["destination"] = new_tens.mem_area
-                    dma_cmd.run_on_npu = True
-                    new_tens.ops = [dma_cmd]
-                    op.inputs[idx] = new_tens
+                    # Tensor products has no need for DMA, tensors are only read once and can be in flash.
+                    # Other operations re-reads tensors, this is better done from SRAM.
+                    if not only_vector_product_consumers:
+                        # Insert a DMA command here, as well as a new tensor situated in SRAM of the same size.
+                        new_tens = tens.clone_into_fast_storage(arch)
+                        dma_cmd = Operation("DMA", tens.ops[0].name + "_dma")
+                        dma_cmd.inputs = [tens]
+                        dma_cmd.outputs = [new_tens]
+                        dma_cmd.attrs["source"] = tens.mem_area
+                        dma_cmd.attrs["destination"] = new_tens.mem_area
+                        dma_cmd.run_on_npu = True
+                        new_tens.ops = [dma_cmd]
+                        op.inputs[idx] = new_tens
     return op
 
 
diff --git a/ethosu/vela/live_range.py b/ethosu/vela/live_range.py
index 2a35a119..8fe3d571 100644
--- a/ethosu/vela/live_range.py
+++ b/ethosu/vela/live_range.py
@@ -18,7 +18,7 @@
 # Can work with either a pass packed subgraph or a scheduled subgraph.
 from .high_level_command_stream_generator import calc_allowed_ofm_ifm_overlap_for_cascaded_pass
 from .nn_graph import PassPlacement
-from .tensor import MemArea
+from .tensor import MemType
 from .tensor import Tensor
 
 
@@ -220,6 +220,7 @@ def extract_live_ranges_from_passes(
 def extract_live_ranges_from_cascaded_passes(
     sg,
     target_mem_area,
+    target_mem_type_set,
     mark_output_tensors_overlapping_with_input_tensors=False,
     use_ifm_ofm_overlap=True,
     ignore_subgraph_input_output_tensors=False,
@@ -236,8 +237,8 @@ def extract_live_ranges_from_cascaded_passes(
         lr_graph.ignore_tensors.update(sg.input_tensors)
         lr_graph.ignore_tensors.update(sg.output_tensors)
 
-    def tensor_should_be_ignored(tens, target_mem_area):
-        if tens.mem_area != target_mem_area:
+    def tensor_should_be_ignored(tens, target_mem_area, target_mem_type_set):
+        if tens.mem_area != target_mem_area or tens.mem_type not in target_mem_type_set:
             return True
         if tens in lr_graph.ignore_tensors:
             return True
@@ -247,9 +248,24 @@ def extract_live_ranges_from_cascaded_passes(
             return True
         return False
 
+    def merge_memory_op_ranges(sg, lr_graph, tensor_should_be_ignored, target_mem_area, target_mem_type_set):
+        for ps in sg.passes:
+            if ps.placement == PassPlacement.MemoryOnly:
+                # For memory only passes, e.g. Reshape. Add input and output tensor to the same LiveRange
+                input_tensor = ps.inputs[0]
+                output_tensor = ps.outputs[0]
+                # If the input or output tensor is tied to a Cpu tensor, i.e. a subgraph input
+                # or output, fuse the live-range with the Cpu tensors' live-range instead.
+                input_tensor = input_tensor.cpu_tensor if input_tensor.cpu_tensor is not None else input_tensor
+                output_tensor = output_tensor.cpu_tensor if output_tensor.cpu_tensor is not None else output_tensor
+                if not tensor_should_be_ignored(input_tensor, target_mem_area, target_mem_type_set) and not (
+                    tensor_should_be_ignored(output_tensor, target_mem_area, target_mem_type_set)
+                ):
+                    lr_graph.fuse_ranges(input_tensor, output_tensor)
+
     # Merge only memory operations in the NPU subgraphs
     if sg.placement == PassPlacement.Npu:
-        merge_memory_op_ranges(sg, lr_graph, tensor_should_be_ignored, target_mem_area)
+        merge_memory_op_ranges(sg, lr_graph, tensor_should_be_ignored, target_mem_area, target_mem_type_set)
 
     for cps in sg.cascaded_passes:
         cps.time = lr_graph.current_time
@@ -259,19 +275,21 @@ def extract_live_ranges_from_cascaded_passes(
         is_element_wise = cps.is_element_wise
 
         for tens in cps.inputs:
-            if tensor_should_be_ignored(tens, target_mem_area):
+            if tensor_should_be_ignored(tens, target_mem_area, target_mem_type_set):
                 continue
             rng = lr_graph.get_or_create_range(tens)
             rng.mark_usage(time_for_pass)
 
         cps_primary_op = cps.passes[0].primary_op
-        if cps_primary_op and cps_primary_op.type == "NpuOp" and target_mem_area in set((MemArea.Sram, MemArea.Dram)):
+
+        if cps_primary_op and cps_primary_op.type == "NpuOp" and MemType.Permanent_CPU not in target_mem_type_set:
             # If the primary-op is an NpuOp that means this is where an Npu subgraph
             # is called. Go into said subgraph and extract live ranges before continuing.
             npu_sg = cps_primary_op.attrs["subgraph"]
             lr_graph = extract_live_ranges_from_cascaded_passes(
                 npu_sg,
                 target_mem_area,
+                target_mem_type_set,
                 mark_output_tensors_overlapping_with_input_tensors,
                 use_ifm_ofm_overlap,
                 False,
@@ -282,13 +300,13 @@ def extract_live_ranges_from_cascaded_passes(
             cps.time = time_for_pass
 
         for tens in cps.intermediates:
-            if tensor_should_be_ignored(tens, target_mem_area):
+            if tensor_should_be_ignored(tens, target_mem_area, target_mem_type_set):
                 continue
             rng = lr_graph.get_or_create_range(tens)
             rng.mark_usage(time_for_pass)
 
         for tens in cps.outputs:
-            if tensor_should_be_ignored(tens, target_mem_area):
+            if tensor_should_be_ignored(tens, target_mem_area, target_mem_type_set):
                 continue
             rng = lr_graph.get_or_create_range(tens)
             output_time = time_for_pass
@@ -303,8 +321,8 @@ def extract_live_ranges_from_cascaded_passes(
             if (
                 ifm_tensor is not None
                 and ofm_tensor is not None
-                and not tensor_should_be_ignored(ifm_tensor, target_mem_area)
-                and not tensor_should_be_ignored(ofm_tensor, target_mem_area)
+                and not tensor_should_be_ignored(ifm_tensor, target_mem_area, target_mem_type_set)
+                and not tensor_should_be_ignored(ofm_tensor, target_mem_area, target_mem_type_set)
             ):
                 lr_graph.allowed_overlaps[(ifm_tensor, ofm_tensor)] = calc_allowed_ofm_ifm_overlap_for_cascaded_pass(
                     cps
@@ -318,7 +336,7 @@ def extract_live_ranges_from_cascaded_passes(
         end_time = max(end_time, rng.end_time)
 
     for tens in sg.output_tensors:
-        if tensor_should_be_ignored(tens, target_mem_area):
+        if tensor_should_be_ignored(tens, target_mem_area, target_mem_type_set):
             continue
         rng = lr_graph.get_or_create_range(tens)
         rng.mark_usage(end_time)
diff --git a/ethosu/vela/mark_tensors.py b/ethosu/vela/mark_tensors.py
index c4f2bae2..705f839b 100644
--- a/ethosu/vela/mark_tensors.py
+++ b/ethosu/vela/mark_tensors.py
@@ -18,6 +18,7 @@
 from . import rewrite_graph
 from . import weight_compressor
 from .errors import OperatorError
+from .tensor import MemType
 from .tensor import TensorFormat
 from .tensor import TensorPurpose
 from .tflite_mapping import custom_prefix
@@ -254,11 +255,13 @@ def mark_tensor_purpose(nng, arch, verbose_tensor_purpose=False):
         else:
             assert 0, "Cannot resolve tensor purpose %s and %s for tensor %s" % (tens.purpose, purpose, tens)
         tens.mem_area = arch.tensor_storage_mem_area[tens.purpose]
+        tens.mem_type = arch.tensor_storage_mem_type[tens.purpose]
 
         if len(tens.ops) == 1 and tens.ops[0].type == "Const":
             tens.mem_area = (
                 arch.permanent_storage_mem_area
             )  # special case constants, as they must be in permanent storage
+            tens.mem_type = MemType.Permanent_NPU
 
     def rewrite_mark_tensor_purpose(op, arch):
         # find disconnected outputs and mark as parameters
diff --git a/ethosu/vela/nn_graph.py b/ethosu/vela/nn_graph.py
index ea35c087..247e6cce 100644
--- a/ethosu/vela/nn_graph.py
+++ b/ethosu/vela/nn_graph.py
@@ -137,6 +137,7 @@ class Subgraph:
         self.flash_tensor = None
 
         self.memory_used = {}
+        self.memory_used_per_type = {}
 
     def __str__(self):
         return "<nng.Subgraph '%s',  n_passes=%d, n_cascaded_passes=%d>" % (
@@ -349,9 +350,15 @@ class Subgraph:
         for idx, op in enumerate(all_ops):
             print(idx, op.type, op.name)
             for idx, tens in enumerate(op.inputs):
-                print("    Input  %02d %20s %20s %s" % (idx, tens.purpose.name, tens.mem_area.name, tens))
+                print(
+                    "    Input  %02d %20s %20s %20s %s"
+                    % (idx, tens.purpose.name, tens.mem_area.name, tens.mem_type.name, tens)
+                )
             for idx, tens in enumerate(op.outputs):
-                print("    Output %02d %20s %20s %s" % (idx, tens.purpose.name, tens.mem_area.name, tens))
+                print(
+                    "    Output %02d %20s %20s %20s %s"
+                    % (idx, tens.purpose.name, tens.mem_area.name, tens.mem_type.name, tens)
+                )
             print()
 
     def print_graph_with_tensor_quantization(self):
diff --git a/ethosu/vela/npu_serialisation.py b/ethosu/vela/npu_serialisation.py
index 18d38f3f..bd13a3ec 100644
--- a/ethosu/vela/npu_serialisation.py
+++ b/ethosu/vela/npu_serialisation.py
@@ -24,14 +24,16 @@ from .data_type import DataType
 from .nn_graph import PassPlacement
 from .operation import Operation
 from .tensor import MemArea
+from .tensor import MemType
 from .tensor import Tensor
 from .tensor import TensorFormat
 from .tensor import TensorPurpose
 
 
-def make_memory_tensor(name, mem_area, sz, want_values, arch):
+def make_memory_tensor(name, mem_area, mem_type, sz, want_values, arch):
     tens = Tensor([sz], DataType.uint8, name)
     tens.mem_area = mem_area
+    tens.mem_type = mem_type
     tens.purpose = TensorPurpose.FeatureMap
     tens.set_format(TensorFormat.NHWC, arch)
     if want_values:
@@ -58,7 +60,7 @@ def serialise_npu_subgraph_into_tensors(nng, sg, arch, scratch_tens, flash_tens)
         return scratch_tens, flash_tens
 
     flash_area = arch.permanent_storage_mem_area
-    scratch_area = MemArea.Sram
+    scratch_area = arch.feature_map_storage_mem_area
 
     flash_size = sg.memory_used.get(flash_area, 0)
     scratch_size = sg.memory_used.get(scratch_area, 0)
@@ -85,9 +87,13 @@ def serialise_npu_subgraph_into_tensors(nng, sg, arch, scratch_tens, flash_tens)
 
     if flash_tens == scratch_tens is None:
         # First Npu subgraph, create scratch and flash tensors
-        sg.scratch_tensor = make_memory_tensor(sg.name + "_scratch", scratch_area, scratch_size, False, arch)
+        sg.scratch_tensor = make_memory_tensor(
+            sg.name + "_scratch", scratch_area, MemType.Scratch, scratch_size, False, arch
+        )
         sg.scratch_tensor.purpose = TensorPurpose.Scratch
-        sg.flash_tensor = make_memory_tensor(sg.name + "_flash", flash_area, flash_size, True, arch)
+        sg.flash_tensor = make_memory_tensor(
+            sg.name + "_flash", flash_area, MemType.Permanent_CPU, flash_size, True, arch
+        )
     else:
         sg.scratch_tensor = scratch_tens
         sg.scratch_tensor.shape[0] += scratch_size
@@ -108,13 +114,15 @@ def serialise_npu_subgraph_into_tensors(nng, sg, arch, scratch_tens, flash_tens)
 
                     copy_compressed_values_to_memory_tensor(sg.flash_tensor, ps.scale_tensor)
 
-                if ps.ifm_tensor is not None and ps.ifm_tensor.mem_area != MemArea.Sram:
+                if ps.ifm_tensor is not None and ps.ifm_tensor.mem_type not in (MemType.Scratch, MemType.Scratch_fast):
                     copy_ifm_values_to_memory_tensor(sg.flash_tensor, ps.ifm_tensor)
-                if ps.ifm2_tensor is not None and ps.ifm2_tensor.mem_area != MemArea.Sram:
+                if ps.ifm2_tensor is not None and (
+                    ps.ifm2_tensor.mem_type not in (MemType.Scratch, MemType.Scratch_fast)
+                ):
                     copy_ifm_values_to_memory_tensor(sg.flash_tensor, ps.ifm2_tensor)
 
     sg.command_stream_tensor = make_memory_tensor(
-        sg.name + "_command_stream", flash_area, command_stream_size_bytes, True, arch
+        sg.name + "_command_stream", flash_area, MemType.Permanent_CPU, command_stream_size_bytes, True, arch
     )
     sg.command_stream_tensor.values = np.frombuffer(payload_bytes, dtype=np.uint8)
 
@@ -156,4 +164,5 @@ def rewrite_npu_call_ops(nng, sg, arch):
                         prev_cps.sram_used += sz
 
                     if callee.scratch_tensor is not None:
-                        cps.sram_used += callee.scratch_tensor.storage_size()
+                        if callee.scratch_tensor.mem_area == MemArea.Sram:
+                            cps.sram_used += callee.scratch_tensor.storage_size()
diff --git a/ethosu/vela/register_command_stream_generator.py b/ethosu/vela/register_command_stream_generator.py
index c46016d7..9dd290a9 100644
--- a/ethosu/vela/register_command_stream_generator.py
+++ b/ethosu/vela/register_command_stream_generator.py
@@ -50,7 +50,7 @@ from .numeric_util import round_up
 from .numeric_util import round_up_to_int
 from .operation import NpuBlockType
 from .shared_buffer_allocation import SharedBufferAllocation
-from .tensor import MemArea
+from .tensor import MemType
 from .tensor import TensorBlockTraversal
 from .tensor import TensorFormat
 
@@ -79,8 +79,9 @@ class CmdMode(IntEnum):
 
 
 class BasePointerIndex(IntEnum):
-    ReadOnly = 0  # base address slot index for weights and scaling
-    Scratch = 1  # base address slot index for scratch memory area
+    WeightTensor = 0  # base address index for the Weight tensor
+    ScratchTensor = 1  # base address index for the Scratch_tensor in the TensorArena
+    ScratchFastTensor = 2  # base address for the Scratch_fast_tensor
 
 
 # TODO: Replace with definitions from ethos_u55_regs
@@ -322,12 +323,20 @@ def get_op_padding_lt(cmd):
 def generate_register_command_stream(nng, sg, arch, verbose=False):
     emit = CommandStreamEmitter()
 
-    base_ptr_idx_map = {
-        MemArea.Sram: BasePointerIndex.Scratch,
-        MemArea.OnChipFlash: BasePointerIndex.ReadOnly,
-        MemArea.OffChipFlash: BasePointerIndex.ReadOnly,
-        MemArea.Dram: BasePointerIndex.ReadOnly,
-    }
+    if arch.feature_map_storage_mem_area == arch.fast_storage_mem_area:
+        base_ptr_idx_map = {
+            MemType.Permanent_NPU: BasePointerIndex.WeightTensor,
+            MemType.Permanent_CPU: BasePointerIndex.WeightTensor,
+            MemType.Scratch: BasePointerIndex.ScratchTensor,
+            MemType.Scratch_fast: BasePointerIndex.ScratchTensor,
+        }
+    else:
+        base_ptr_idx_map = {
+            MemType.Permanent_NPU: BasePointerIndex.WeightTensor,
+            MemType.Permanent_CPU: BasePointerIndex.WeightTensor,
+            MemType.Scratch: BasePointerIndex.ScratchTensor,
+            MemType.Scratch_fast: BasePointerIndex.ScratchFastTensor,
+        }
 
     # Maps an AccumulatorType enum to the corresponding acc_format value
     acc_format_map = {
@@ -377,8 +386,8 @@ def generate_register_command_stream(nng, sg, arch, verbose=False):
                 param = min(param, 0xFFFF)  # Clamp to allowable wait amount
 
         if relative_dep[CommandType.DMA] is not None:
-            param = relative_dep[CommandType.DMA][0]
-            param = min(param, 0xF)  # Clamp to allowable wait amount
+            # TODO This can be optimized for yoda
+            param = 0
             emit.cmd_wait(cmd0.NPU_OP_DMA_WAIT, param, absolute_dep[CommandType.DMA][0])
 
     for cmd in cmd_stream:
@@ -394,10 +403,10 @@ def generate_register_command_stream(nng, sg, arch, verbose=False):
             else:
                 sz = cmd.in_tensor.address_for_coordinate(cmd.box.end_coord, is_top_box=True) - src_addr
 
-            # TODO: Yoda support needs to use feature_maps_not_in_fast_storage and force_outputs_to_fast_storage
-            emit.cmd0_with_param(cmd0.NPU_SET_DMA0_SRC_REGION, base_ptr_idx_map[cmd.in_tensor.mem_area])
+            emit.cmd0_with_param(cmd0.NPU_SET_DMA0_SRC_REGION, base_ptr_idx_map[cmd.in_tensor.mem_type])
             emit.cmd1_with_offset(cmd1.NPU_SET_DMA0_SRC, src_addr)
-            emit.cmd0_with_param(cmd0.NPU_SET_DMA0_DST_REGION, base_ptr_idx_map[cmd.out_tensor.mem_area])
+            emit.cmd0_with_param(cmd0.NPU_SET_DMA0_DST_REGION, base_ptr_idx_map[cmd.out_tensor.mem_type])
+
             emit.cmd1_with_offset(cmd1.NPU_SET_DMA0_DST, dst_addr)
             emit.cmd1_with_offset(cmd1.NPU_SET_DMA0_LEN, sz)
             dma_channel = 0
@@ -682,10 +691,7 @@ def generate_register_command_stream(nng, sg, arch, verbose=False):
                 stream_index = cmd.weight_tensor.compressed_stream_index_from_coord(cmd.weight_box.start_coord)
                 weight_addr = cmd.weight_tensor.address_for_coordinate(cmd.weight_box.start_coord)
                 weight_len = cmd.weight_tensor.size_of_compressed_stream(stream_index)
-                # Select weight/scale region depending on where permanent storage was defined
-                weight_region = base_ptr_idx_map[cmd.weight_tensor.mem_area]
-                if arch.permanent_storage_mem_area == MemArea.Sram:
-                    weight_region = BasePointerIndex.ReadOnly
+                weight_region = base_ptr_idx_map[cmd.weight_tensor.mem_type]
                 emit.cmd0_with_param(cmd0.NPU_SET_WEIGHT_REGION, weight_region)
                 emit.cmd1_with_offset(cmd1.NPU_SET_WEIGHT_BASE, weight_addr)
                 emit.cmd1_with_offset(cmd1.NPU_SET_WEIGHT_LENGTH, weight_len)
@@ -699,9 +705,7 @@ def generate_register_command_stream(nng, sg, arch, verbose=False):
                         cmd.scale_tensor.address_for_coordinate(cmd.weight_box.end_coord[-1:], True) - scale_addr
                     )
                     # Emit base address for NPU to access scale & bias data
-                    scale_region = base_ptr_idx_map[cmd.scale_tensor.mem_area]
-                    if arch.permanent_storage_mem_area == MemArea.Sram:
-                        scale_region = BasePointerIndex.ReadOnly
+                    scale_region = base_ptr_idx_map[cmd.scale_tensor.mem_type]
                     emit.cmd0_with_param(cmd0.NPU_SET_SCALE_REGION, scale_region)
                     emit.cmd1_with_offset(cmd1.NPU_SET_SCALE_BASE, scale_addr)
                     emit.cmd1_with_offset(cmd1.NPU_SET_SCALE_LENGTH, round_up(scale_len, 16))
@@ -850,10 +854,7 @@ def generate_register_command_stream(nng, sg, arch, verbose=False):
                     else:
                         assert False
 
-                if tens.mem_area == MemArea.Sram:
-                    emit.cmd0_with_param(region_op, BasePointerIndex.Scratch)
-                else:
-                    emit.cmd0_with_param(region_op, BasePointerIndex.ReadOnly)
+                emit.cmd0_with_param(region_op, base_ptr_idx_map[tens.mem_type])
 
                 for idx, addr in enumerate(addresses):
                     if addr is None:
diff --git a/ethosu/vela/scheduler.py b/ethosu/vela/scheduler.py
index 0b594310..be104b88 100644
--- a/ethosu/vela/scheduler.py
+++ b/ethosu/vela/scheduler.py
@@ -38,6 +38,7 @@ from .operation import NpuBlockType
 from .shared_buffer_allocation import find_block_configs_suitable_for_pass_and_shared_buffer
 from .shared_buffer_allocation import shared_buffer_allocation_for_pass_and_block_config
 from .tensor import MemArea
+from .tensor import MemType
 from .tensor import TensorFormat
 from .tensor import TensorPurpose
 from .tensor import TensorSubPurpose
@@ -833,6 +834,7 @@ class DynamicProgrammingScheduler:
             for rewrite_op, tens, sub_purpose, param_a, param_b, ps in strat.rewrite_list:
                 if rewrite_op == SchedulerRewrite.ChangeTensorSubPurpose:
                     tens.mem_area = self.arch.fast_storage_mem_area
+                    tens.mem_type = MemType.Scratch_fast
                     tens.set_new_sub_purpose(sub_purpose, param_a, param_b)
                 else:
                     assert 0, "unknown rewrite_op " + str(rewrite_op)
diff --git a/ethosu/vela/stats_writer.py b/ethosu/vela/stats_writer.py
index 9bbb9db5..c90d9876 100644
--- a/ethosu/vela/stats_writer.py
+++ b/ethosu/vela/stats_writer.py
@@ -201,7 +201,10 @@ def write_pass_metrics_csv(nng, pass_filename):
                             for k in indices[2]:
                                 res += round_up_to_int(ps.bandwidths[i, j, k])
                         stats.append(res)
-                    stats += [ps.sram_used]
+                    try:
+                        stats += [ps.sram_used]
+                    except AttributeError:
+                        stats += [0]
 
                     writer.writerow(stats)
 
diff --git a/ethosu/vela/tensor.py b/ethosu/vela/tensor.py
index 42d95262..3990164d 100644
--- a/ethosu/vela/tensor.py
+++ b/ethosu/vela/tensor.py
@@ -26,6 +26,27 @@ from .numeric_util import round_up_divide
 from .range_set import MemoryRangeSet
 
 
+class MemType(enum.IntFlag):
+    Unknown = 0
+    Permanent_NPU = 1
+    Permanent_CPU = 2
+    Scratch = 3
+    Scratch_fast = 4
+    Size = Scratch_fast + 1
+
+    def display_name(self):
+        return ("Unknown", "Permanent_NPU", "Permanent_CPU", "Scratch", "Scratch_fast", "Size")[self.value]
+
+    def identifier_name(self):
+        return ("unknown", "permanent_npu", "permanent_cpu", "scratch", "scratch_fast", "size")[self.value]
+
+    def all():
+        return (MemType.Permanent_NPU, MemType.Permanent_CPU, MemType.Scratch, MemType.Scratch_fast)
+
+    def __str__(self):
+        return self.name
+
+
 class MemArea(enum.IntFlag):
     Unknown = 0
     Sram = 1
@@ -209,6 +230,7 @@ class Tensor:
         "quant_values",
         "compressed_values",
         "mem_area",
+        "mem_type",
         "format",
         "purpose",
         "sub_purpose",
@@ -252,6 +274,7 @@ class Tensor:
         self.quant_values = None
         self.compressed_values = None
         self.mem_area = MemArea.Unknown
+        self.mem_type = MemType.Unknown
         self.format = TensorFormat.Unknown
         self.purpose = TensorPurpose.Unknown
         self.sub_purpose = TensorSubPurpose.Standard
@@ -291,6 +314,7 @@ class Tensor:
         res.values = self.values
         res.quant_values = self.quant_values
         res.mem_area = self.mem_area
+        res.mem_type = self.mem_type
         res.format = self.format
         res.purpose = self.purpose
         res.sub_purpose = self.sub_purpose
@@ -312,6 +336,7 @@ class Tensor:
     def clone_into_fast_storage(self, arch):
         res = self.clone(suffix="_fast_storage")
         res.mem_area = arch.fast_storage_mem_area
+        res.mem_type = MemType.Scratch_fast
         return res
 
     def copy_compressed_weight_info(self, src_tens):
@@ -641,6 +666,11 @@ class Tensor:
         assert address_offset <= self.storage_size()
         return address_offset
 
+    def is_allocated_in_tensor_arena(self, scratch_tensor_mem_area):
+        if self.mem_area == scratch_tensor_mem_area and (self.mem_type in set((MemType.Scratch, MemType.Scratch_fast))):
+            return True
+        return False
+
     def __str__(self):
         return "<nng.Tensor '%s' shape=%s dtype=%s>" % (self.name, self.shape, self.dtype)
 
diff --git a/ethosu/vela/tensor_allocation.py b/ethosu/vela/tensor_allocation.py
index e3952df3..f29296d1 100644
--- a/ethosu/vela/tensor_allocation.py
+++ b/ethosu/vela/tensor_allocation.py
@@ -25,6 +25,7 @@ from . import numeric_util
 from .greedy_allocation import allocate_live_ranges as greedy_allocate_live_ranges
 from .nn_graph import TensorAllocator
 from .tensor import MemArea
+from .tensor import MemType
 
 
 def linear_allocate_live_ranges(live_ranges, alloc_granularity=16):
@@ -66,12 +67,13 @@ def mark_sram_used_for_cascaded_passes(sg, lrs):
             ps.sram_used = sram_used
 
 
-def print_allocation(lrs, mem_area, sg, verbose_allocation, show_minimum_possible_allocation):
+def print_allocation(lrs, mem_area, mem_type_set, sg, verbose_allocation, show_minimum_possible_allocation):
     if verbose_allocation:
-        if mem_area == MemArea.Sram:
-            print("allocation for", mem_area, "- non-constant tensors in Cpu and Npu subgraphs")
-        else:
+        if mem_type_set == set((MemType.Permanent_NPU,)) or mem_type_set == set((MemType.Permanent_CPU,)):
             print("allocation for", mem_area, "- constant tensors in", sg.placement.name, "subgraph(s)")
+        else:
+            print("allocation for", mem_area, "- non-constant tensors in Cpu and Npu subgraphs")
+
         for start_time, start, end, name, end_time in sorted(
             (
                 lr.start_time,
@@ -99,6 +101,7 @@ def allocate_tensors(
     sg,
     arch,
     mem_area,
+    mem_type_set,
     use_ifm_ofm_overlap=True,
     tensor_allocator=TensorAllocator.Greedy,
     verbose_allocation=False,
@@ -109,6 +112,7 @@ def allocate_tensors(
     lrs = live_range.extract_live_ranges_from_cascaded_passes(
         sg,
         mem_area,
+        mem_type_set,
         mark_output_tensors_overlapping_with_input_tensors=False,
         use_ifm_ofm_overlap=use_ifm_ofm_overlap,
         ignore_subgraph_input_output_tensors=ignore_subgraph_input_output_tensors,
@@ -120,16 +124,26 @@ def allocate_tensors(
         if tens_alloc == TensorAllocator.Greedy:
             total_sz = greedy_allocate_live_ranges(sg, arch, lrs, mem_area, verbose_allocation)
         elif tens_alloc == TensorAllocator.LinearAlloc:
-            total_sz = linear_allocate_live_ranges(lrs)
+            total_sz = linear_allocate_live_ranges(lrs, 16)
         else:
             assert 0
 
-        sg.memory_used[mem_area] = total_sz
+        if sg.memory_used.get(mem_area, 0) == 0:
+            sg.memory_used[mem_area] = total_sz
+        else:
+            sg.memory_used[mem_area] += total_sz
+
+        # Keep track of how much should be used for scratch or permanent storage for NPU
+        for mem_type in mem_type_set:
+            if sg.memory_used_per_type.get(mem_type, 0) == 0:
+                sg.memory_used_per_type[mem_type] = total_sz
+            else:
+                sg.memory_used_per_type[mem_type] += total_sz
 
         nng.total_size[mem_area] = nng.total_size.get(mem_area, 0) + sum(tens.storage_size() for tens in lrs.ranges)
         nng.total_elements[mem_area] = nng.total_elements.get(mem_area, 0) + sum(tens.elements() for tens in lrs.ranges)
 
-        print_allocation(lrs, mem_area, sg, verbose_allocation, show_minimum_possible_allocation)
+        print_allocation(lrs, mem_area, mem_type_set, sg, verbose_allocation, show_minimum_possible_allocation)
 
         if mem_area == MemArea.Sram:
             # Mark Sram usage for all subgraphs
diff --git a/ethosu/vela/tflite_writer.py b/ethosu/vela/tflite_writer.py
index 8db3e5b8..7e805e31 100644
--- a/ethosu/vela/tflite_writer.py
+++ b/ethosu/vela/tflite_writer.py
@@ -22,7 +22,7 @@ from flatbuffers import encode
 from flatbuffers.builder import UOffsetTFlags
 
 from .nn_graph import PassPlacement
-from .tensor import MemArea
+from .tensor import MemType
 from .tensor import TensorPurpose
 from .tflite import Buffer
 from .tflite import Metadata
@@ -74,6 +74,7 @@ class TFLiteSerialiser:
         self.nng = nng
 
         self.scratch_buf_id = 0  # Always assign scratch to buffer 0
+        self.scratch_fast_buf_id = 1  # Always assign scratch_fast to buffer 1
         self.buffer_offsets_map = {}
         self.buffers_to_write = []  # have an empty array there
 
@@ -140,11 +141,16 @@ class TFLiteSerialiser:
             scratch_tensor_mem_area = None  # all tensors are initialised to MemArea.Unknown
 
         buffer_map = {}
+
         buf_idx = 1
 
         for tens in tensors:
-            if tens.mem_area == scratch_tensor_mem_area:
+            # Set buffer ids depending on allocation
+            if tens.is_allocated_in_tensor_arena(scratch_tensor_mem_area):
                 buffer_map[tens] = self.scratch_buf_id
+            elif tens.mem_type == MemType.Scratch_fast:
+                # For Scratch_fast when not co-allocated with scratch in the TensorArena:
+                buffer_map[tens] = self.scratch_fast_buf_id
             else:
                 buffer_map[tens] = buf_idx
                 buf_idx += 1
@@ -229,11 +235,9 @@ class TFLiteSerialiser:
 
         if tens.purpose == TensorPurpose.Scratch:
             tens_shape = [0]
-            self.buffers_to_write[self.scratch_buf_id] = values.flatten().view(np.uint8)
 
         buf_id = self.buffer_map[tens]
-        if buf_id != self.scratch_buf_id:
-            self.buffers_to_write[buf_id] = values.flatten().view(np.uint8)
+        self.buffers_to_write[buf_id] = values.flatten().view(np.uint8)
 
         shape = self.write_int_vector(tens_shape)
 
@@ -396,7 +400,8 @@ class TFLiteSerialiser:
 
         # Ensure that the order of the offsets match the order of the tensors
         for tens, idx in self.tensor_map.items():
-            if tens.mem_area == MemArea.Sram:
+            # Set offsets for tensor allocated in Tensor Arena or in the scratch_fast area
+            if tens.mem_type in set((MemType.Scratch, MemType.Scratch_fast)):
                 offsets[idx] = np.int32(tens.address)
 
         metadata_buffer = np.array([version, subgraph_idx, nbr_tensors] + offsets)