diff options
-rw-r--r-- | ethosu/vela/scheduler.py | 56
-rw-r--r-- | ethosu/vela/tensor.py | 7
-rw-r--r-- | ethosu/vela/test/test_lut.py | 6
3 files changed, 14 insertions, 55 deletions
diff --git a/ethosu/vela/scheduler.py b/ethosu/vela/scheduler.py index a50f262e..16531c2c 100644 --- a/ethosu/vela/scheduler.py +++ b/ethosu/vela/scheduler.py @@ -60,7 +60,6 @@ from .nn_graph import Subgraph from .live_range import ofm_can_reuse_ifm from .numeric_util import round_down from .numeric_util import round_up -from .operation import NpuBlockType from .operation import Op from .shape4d import Shape4D from .tensor import MemArea @@ -213,6 +212,14 @@ class SchedulerOperation: ps.ofm_tensor.format, ) + # LUT must be placed in shram area. The copy is done by DMA + # generated by the high level command stream generator. + for idx, tens in enumerate(self.parent_op.inputs): + if tens.purpose == TensorPurpose.LUT: + new_tens = tens.clone_into_shram(self.arch) + new_tens.consumer_list.append(self.parent_op) + self.parent_op.inputs[idx] = new_tens + # Input volume width and height required to produce the smallest possible stripe self.min_stripe_input_w, self.min_stripe_input_h = self._calculate_min_stripe_input() @@ -1379,52 +1386,6 @@ class Scheduler: ) assert max(max_mem_usage) <= staging_limit, "Allocation exceeds staging limit" - def move_constant_data(self): - """Determine if data can be moved from permanent storage to another memory area. 
A move will generate a DMA - command in the high-level command stream""" - for sched_op in self.sched_ops: - parent_op = sched_op.parent_op - is_lut_used = any(inp.purpose == TensorPurpose.LUT for inp in parent_op.inputs) - max_ifm_shram_avail = ( - (self.arch.available_shram_banks(is_lut_used) - self.arch.shram_reserved_output_banks) - * self.arch.shram_bank_size - // 2 - ) - - for idx, tens in enumerate(parent_op.inputs): - if tens.mem_type not in (MemType.Scratch, MemType.Scratch_fast): - # Tensor is in permanent storage - # Only when permanent storage differs from feature map storage, there is a point moving the data - if ( - tens.mem_area in self.arch.permanent_storage_mem_area - and self.arch.permanent_storage_mem_area != self.arch.feature_map_storage_mem_area - ) or tens.purpose == TensorPurpose.LUT: - if tens.purpose == TensorPurpose.LUT or ( - # For elementwise broadcast - tens.purpose == TensorPurpose.FeatureMap - and sched_op.op_type.is_binary_elementwise_op() - and tens.shape != [] - and sched_op.ifm.shape != sched_op.ofm.shape - and parent_op.write_shape is None - and tens.storage_size() > max_ifm_shram_avail - ): - only_vector_product_consumers = all( - oper and oper.type.npu_block_type == NpuBlockType.VectorProduct - for oper in tens.consumers() - ) - - if (not only_vector_product_consumers) or tens.purpose == TensorPurpose.LUT: - new_tens = tens.clone_into_fast_storage(self.arch) - if tens.purpose == TensorPurpose.LUT: - new_tens.mem_area = MemArea.Shram - - new_tens.consumer_list.append(parent_op) - parent_op.inputs[idx] = new_tens - # If the index is out of range, IFM and IFM2 are the same tensor - # and pass inputs don't have duplicates - if idx < len(sched_op.parent_ps.inputs): - sched_op.parent_ps.inputs[idx] = new_tens - def print_schedule(self, schedule: Schedule): print(f"Schedule: '{schedule.name}'") for sched_op in self.sched_ops: @@ -1634,7 +1595,6 @@ def schedule_passes(nng: Graph, arch: ArchitectureFeatures, options, scheduler_o 
scheduler.create_scheduler_representation(arch) sg.sched_ops = scheduler.sched_ops - scheduler.move_constant_data() # Create the Max schedule template max_schedule_template = scheduler.create_initial_schedule() diff --git a/ethosu/vela/tensor.py b/ethosu/vela/tensor.py index 008cd05e..86306cad 100644 --- a/ethosu/vela/tensor.py +++ b/ethosu/vela/tensor.py @@ -501,10 +501,9 @@ class Tensor: return res - def clone_into_fast_storage(self, arch) -> "Tensor": - res = self.clone(suffix="_fast_storage") - res.mem_area = arch.fast_storage_mem_area - res.mem_type = MemType.Scratch_fast + def clone_into_shram(self, arch) -> "Tensor": + res = self.clone(suffix="_shram") + res.mem_area = MemArea.Shram res.src_tensor = self return res diff --git a/ethosu/vela/test/test_lut.py b/ethosu/vela/test/test_lut.py index 58e72bbf..e52b4896 100644 --- a/ethosu/vela/test/test_lut.py +++ b/ethosu/vela/test/test_lut.py @@ -36,7 +36,7 @@ def set_256_lut(op, key, arch): random.seed(key) values = random.choices(range(256), k=256) lut_tensor = create_const_tensor(op.name + "_lut", [1, 1, 1, 256], DataType.uint8, values, TensorPurpose.LUT) - scratch_lut_tensor = lut_tensor.clone_into_fast_storage(arch) + scratch_lut_tensor = lut_tensor.clone_into_shram(arch) op.set_activation_lut(scratch_lut_tensor) @@ -44,7 +44,7 @@ def set_1K_lut(op, key, arch): random.seed(key) values = random.choices(range(256), k=256) lut_tensor = create_const_tensor(op.name + "_lut", [1, 1, 1, 256], DataType.int32, values, TensorPurpose.LUT) - scratch_lut_tensor = lut_tensor.clone_into_fast_storage(arch) + scratch_lut_tensor = lut_tensor.clone_into_shram(arch) op.set_activation_lut(scratch_lut_tensor) @@ -52,7 +52,7 @@ def set_2K_lut(op, key, arch): random.seed(key) values = random.choices(range(512), k=512) lut_tensor = create_const_tensor(op.name + "_lut", [1, 1, 1, 512], DataType.int32, values, TensorPurpose.LUT) - scratch_lut_tensor = lut_tensor.clone_into_fast_storage(arch) + scratch_lut_tensor = 
lut_tensor.clone_into_shram(arch) op.set_activation_lut(scratch_lut_tensor)