From 268394d797db60d07eeace05a2c57e927da0ea15 Mon Sep 17 00:00:00 2001
From: Jacob Bohlin <jacob.bohlin@arm.com>
Date: Thu, 13 Aug 2020 13:24:59 +0200
Subject: MLBEDSW-1974: Set Scratch buffers size

Set the actual sizes of the Scratch and Fast Scratch buffers and remove both
Scratch buffers from the subgraph inputs.

Signed-off-by: Jacob Bohlin <jacob.bohlin@arm.com>
Change-Id: I9e4213f48289d9136cdd4cd43c668d37c6af8530
---
 ethosu/vela/compiler_driver.py   |  6 ++++++
 ethosu/vela/npu_serialisation.py |  5 +----
 ethosu/vela/tflite_writer.py     | 23 ++---------------------
 3 files changed, 9 insertions(+), 25 deletions(-)

diff --git a/ethosu/vela/compiler_driver.py b/ethosu/vela/compiler_driver.py
index 05bf65a4..1d7521b1 100644
--- a/ethosu/vela/compiler_driver.py
+++ b/ethosu/vela/compiler_driver.py
@@ -291,6 +291,12 @@ def compiler_driver(nng, arch, options, scheduler_options):
 
     npu_serialisation.rewrite_npu_call_ops(nng, root_sg, arch)
 
+    # Set Scratch and Fast_scratch Tensor size
+    if scratch_tens is not None:
+        scratch_tens.set_all_shapes([root_sg.memory_used_per_type.get(MemType.Scratch, 0)])
+    if scratch_fast_tens is not None:
+        scratch_fast_tens.set_all_shapes([root_sg.memory_used_per_type.get(MemType.Scratch_fast, 0)])
+
     # Allocate all Cpu constant tensors, this is done last because the Npu-ops
     # have to be serialized into flash and scratch tensors first
     tensor_allocation.allocate_tensors(
diff --git a/ethosu/vela/npu_serialisation.py b/ethosu/vela/npu_serialisation.py
index 7989fa90..0bd03001 100644
--- a/ethosu/vela/npu_serialisation.py
+++ b/ethosu/vela/npu_serialisation.py
@@ -103,11 +103,8 @@ def serialise_npu_subgraph_into_tensors(nng, sg, arch, scratch_tens, scratch_fas
         sg.flash_tensor = make_memory_tensor(
             sg.name + "_flash", flash_area, MemType.Permanent_CPU, flash_size, True, arch
         )
-        # Scratch fast tensor size set to 0. This forces a minimal allocation in the tensor arena
-        # which causes a slot in the basep registers to be reserved, so that the scratch fast tensor
-        # address can be overridden.
         sg.scratch_fast_tensor = make_memory_tensor(
-            sg.name + "_scratch_fast", scratch_fast_area, MemType.Scratch, 0, False, arch
+            sg.name + "_scratch_fast", scratch_fast_area, MemType.Scratch_fast, 0, False, arch
         )
         sg.scratch_fast_tensor.purpose = TensorPurpose.Scratch
     else:
diff --git a/ethosu/vela/tflite_writer.py b/ethosu/vela/tflite_writer.py
index e24aa282..0f20878d 100644
--- a/ethosu/vela/tflite_writer.py
+++ b/ethosu/vela/tflite_writer.py
@@ -24,7 +24,6 @@ from flatbuffers.builder import UOffsetTFlags
 from .nn_graph import PassPlacement
 from .operation import Op
 from .tensor import MemType
-from .tensor import TensorPurpose
 from .tflite import Buffer
 from .tflite import Metadata
 from .tflite import Model
@@ -234,9 +233,6 @@ class TFLiteSerialiser:
             tens_shape = [tens_shape[idx] for idx in reorder]
             values = values.transpose(reorder)
 
-        if tens.purpose == TensorPurpose.Scratch:
-            tens_shape = [0]
-
         buf_id = self.buffer_map[tens]
         self.buffers_to_write[buf_id] = values.flatten().view(np.uint8)
 
@@ -327,11 +323,6 @@ class TFLiteSerialiser:
 
         scratch_tensors = [tens for tens in all_tensors if tens.name.endswith("scratch")]
 
-        scratch_fast_tensor = None
-        for tens in all_tensors:
-            if tens.name.endswith("scratch_fast"):
-                scratch_fast_tensor = tens
-
         if len(scratch_tensors) == 0:
             scratch_tensor = None
         else:
@@ -347,16 +338,6 @@ class TFLiteSerialiser:
         assert all(inp in sg.original_inputs for inp in sg.input_tensors)
         inputs = [self.tensor_map[tens] for tens in sg.original_inputs if tens in self.tensor_map]
 
-        # Add the Scratch Tensors as input to the NPU subgraph to get them allocated by TensorFlow Lite Micro
-        scratch_tensor_idx = self.tensor_map.get(scratch_tensor, None)
-        scratch_fast_tensor_idx = self.tensor_map.get(scratch_fast_tensor, None)
-
-        if scratch_tensor_idx is not None and scratch_tensor_idx not in inputs:
-            inputs.append(scratch_tensor_idx)
-
-        if scratch_fast_tensor_idx is not None and scratch_fast_tensor_idx not in inputs:
-            inputs.append(scratch_fast_tensor_idx)
-
         inputs_offset = self.write_int_vector(inputs)
         outputs_offset = self.write_int_vector(
             [self.tensor_map[tens] for tens in sg.output_tensors if tens in self.tensor_map]
@@ -424,8 +405,8 @@ class TFLiteSerialiser:
         # Ensure that the order of the offsets match the order of the tensors
         for tens, idx in self.tensor_map.items():
             # Set offsets for tensor allocated in Tensor Arena or in the scratch_fast area
-            if tens.mem_type in set((MemType.Scratch, MemType.Scratch_fast)) and tens.address is not None:
-                offsets[idx] = np.int32(tens.address)
+            if tens.mem_type in set((MemType.Scratch, MemType.Scratch_fast)):
+                offsets[idx] = np.int32(tens.address) if tens.address is not None else np.int32(0)
 
         self.nng.metadata.append(("OfflineMemoryAllocation", np.array([version, subgraph_idx, nbr_tensors] + offsets)))
 
-- 
cgit v1.2.1