author     Johan Alfven <johan.alfven@arm.com>            2023-03-09 08:36:10 +0100
committer  Fredrik Svedberg <fredrik.svedberg@arm.com>    2023-03-16 16:12:36 +0000
commit     126558e26df26830c2d331ec0041dc9a4f1a0d38 (patch)
tree       597de0b202cfcd3950faf9c7f4e71e56ed0d867d
parent     a5e1b6224d8436365e7f0bdb0afef060423fba57 (diff)
MLBEDSW-7352: Refactoring move_constant_data
Refactoring move_constant_data in the scheduler. The use case currently only works for LUT tensors, so the logic has been simplified. In order to make it work for other tensors, one would also have to take memory usage into consideration when building cascades, and use_fast_storage_for_feature_maps would also be affected.

Change-Id: Ic8de53b65a2c17d34515002d7f184d0ab1830222
Signed-off-by: Johan Alfven <johan.alfven@arm.com>
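For context, the simplified flow is a small per-operation rewrite step: when a SchedulerOperation is built, each LUT input is cloned into a SHRAM-resident tensor, and the original constant stays in permanent storage as the source of the DMA emitted later by the high level command stream generator. The sketch below illustrates the shape of that logic with stand-in types; StubTensor, StubOp, move_luts_to_shram, PURPOSE_LUT and MEM_AREA_SHRAM are illustrative names only, not Vela's real API, and the authoritative change is the scheduler.py and tensor.py hunks further down.

# Minimal sketch of the simplified LUT handling (stand-in types, not Vela's API).
from dataclasses import dataclass, field
from typing import List, Optional

PURPOSE_LUT = "LUT"       # stands in for TensorPurpose.LUT
MEM_AREA_SHRAM = "Shram"  # stands in for MemArea.Shram


@dataclass
class StubTensor:
    name: str
    purpose: str
    mem_area: str = "Flash"  # constants start out in permanent storage
    consumer_list: List["StubOp"] = field(default_factory=list)
    src_tensor: Optional["StubTensor"] = None

    def clone_into_shram(self) -> "StubTensor":
        # Mirrors the intent of Tensor.clone_into_shram: a "_shram" copy
        # placed in SHRAM, remembering the source tensor for the DMA copy.
        clone = StubTensor(self.name + "_shram", self.purpose, MEM_AREA_SHRAM)
        clone.src_tensor = self
        return clone


@dataclass
class StubOp:
    name: str
    inputs: List[StubTensor]


def move_luts_to_shram(op: StubOp) -> None:
    # LUT inputs must live in SHRAM; the high level command stream
    # generator later emits the DMA that copies src_tensor into place.
    for idx, tens in enumerate(op.inputs):
        if tens.purpose == PURPOSE_LUT:
            new_tens = tens.clone_into_shram()
            new_tens.consumer_list.append(op)
            op.inputs[idx] = new_tens


if __name__ == "__main__":
    op = StubOp("tanh", [StubTensor("ifm", "FeatureMap"), StubTensor("tanh_lut", PURPOSE_LUT)])
    move_luts_to_shram(op)
    print([(t.name, t.mem_area) for t in op.inputs])
    # [('ifm', 'Flash'), ('tanh_lut_shram', 'Shram')]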
-rw-r--r--  ethosu/vela/scheduler.py      | 56
-rw-r--r--  ethosu/vela/tensor.py         |  7
-rw-r--r--  ethosu/vela/test/test_lut.py  |  6
3 files changed, 14 insertions, 55 deletions
diff --git a/ethosu/vela/scheduler.py b/ethosu/vela/scheduler.py
index a50f262e..16531c2c 100644
--- a/ethosu/vela/scheduler.py
+++ b/ethosu/vela/scheduler.py
@@ -60,7 +60,6 @@ from .nn_graph import Subgraph
from .live_range import ofm_can_reuse_ifm
from .numeric_util import round_down
from .numeric_util import round_up
-from .operation import NpuBlockType
from .operation import Op
from .shape4d import Shape4D
from .tensor import MemArea
@@ -213,6 +212,14 @@ class SchedulerOperation:
ps.ofm_tensor.format,
)
+ # LUT must be placed in shram area. The copy is done by DMA
+ # generated by the high level command stream generator.
+ for idx, tens in enumerate(self.parent_op.inputs):
+ if tens.purpose == TensorPurpose.LUT:
+ new_tens = tens.clone_into_shram(self.arch)
+ new_tens.consumer_list.append(self.parent_op)
+ self.parent_op.inputs[idx] = new_tens
+
# Input volume width and height required to produce the smallest possible stripe
self.min_stripe_input_w, self.min_stripe_input_h = self._calculate_min_stripe_input()
@@ -1379,52 +1386,6 @@ class Scheduler:
)
assert max(max_mem_usage) <= staging_limit, "Allocation exceeds staging limit"
- def move_constant_data(self):
- """Determine if data can be moved from permanent storage to another memory area. A move will generate a DMA
- command in the high-level command stream"""
- for sched_op in self.sched_ops:
- parent_op = sched_op.parent_op
- is_lut_used = any(inp.purpose == TensorPurpose.LUT for inp in parent_op.inputs)
- max_ifm_shram_avail = (
- (self.arch.available_shram_banks(is_lut_used) - self.arch.shram_reserved_output_banks)
- * self.arch.shram_bank_size
- // 2
- )
-
- for idx, tens in enumerate(parent_op.inputs):
- if tens.mem_type not in (MemType.Scratch, MemType.Scratch_fast):
- # Tensor is in permanent storage
- # Only when permanent storage differs from feature map storage, there is a point moving the data
- if (
- tens.mem_area in self.arch.permanent_storage_mem_area
- and self.arch.permanent_storage_mem_area != self.arch.feature_map_storage_mem_area
- ) or tens.purpose == TensorPurpose.LUT:
- if tens.purpose == TensorPurpose.LUT or (
- # For elementwise broadcast
- tens.purpose == TensorPurpose.FeatureMap
- and sched_op.op_type.is_binary_elementwise_op()
- and tens.shape != []
- and sched_op.ifm.shape != sched_op.ofm.shape
- and parent_op.write_shape is None
- and tens.storage_size() > max_ifm_shram_avail
- ):
- only_vector_product_consumers = all(
- oper and oper.type.npu_block_type == NpuBlockType.VectorProduct
- for oper in tens.consumers()
- )
-
- if (not only_vector_product_consumers) or tens.purpose == TensorPurpose.LUT:
- new_tens = tens.clone_into_fast_storage(self.arch)
- if tens.purpose == TensorPurpose.LUT:
- new_tens.mem_area = MemArea.Shram
-
- new_tens.consumer_list.append(parent_op)
- parent_op.inputs[idx] = new_tens
- # If the index is out of range, IFM and IFM2 are the same tensor
- # and pass inputs don't have duplicates
- if idx < len(sched_op.parent_ps.inputs):
- sched_op.parent_ps.inputs[idx] = new_tens
-
def print_schedule(self, schedule: Schedule):
print(f"Schedule: '{schedule.name}'")
for sched_op in self.sched_ops:
@@ -1634,7 +1595,6 @@ def schedule_passes(nng: Graph, arch: ArchitectureFeatures, options, scheduler_o
scheduler.create_scheduler_representation(arch)
sg.sched_ops = scheduler.sched_ops
- scheduler.move_constant_data()
# Create the Max schedule template
max_schedule_template = scheduler.create_initial_schedule()
diff --git a/ethosu/vela/tensor.py b/ethosu/vela/tensor.py
index 008cd05e..86306cad 100644
--- a/ethosu/vela/tensor.py
+++ b/ethosu/vela/tensor.py
@@ -501,10 +501,9 @@ class Tensor:
return res
- def clone_into_fast_storage(self, arch) -> "Tensor":
- res = self.clone(suffix="_fast_storage")
- res.mem_area = arch.fast_storage_mem_area
- res.mem_type = MemType.Scratch_fast
+ def clone_into_shram(self, arch) -> "Tensor":
+ res = self.clone(suffix="_shram")
+ res.mem_area = MemArea.Shram
res.src_tensor = self
return res
diff --git a/ethosu/vela/test/test_lut.py b/ethosu/vela/test/test_lut.py
index 58e72bbf..e52b4896 100644
--- a/ethosu/vela/test/test_lut.py
+++ b/ethosu/vela/test/test_lut.py
@@ -36,7 +36,7 @@ def set_256_lut(op, key, arch):
random.seed(key)
values = random.choices(range(256), k=256)
lut_tensor = create_const_tensor(op.name + "_lut", [1, 1, 1, 256], DataType.uint8, values, TensorPurpose.LUT)
- scratch_lut_tensor = lut_tensor.clone_into_fast_storage(arch)
+ scratch_lut_tensor = lut_tensor.clone_into_shram(arch)
op.set_activation_lut(scratch_lut_tensor)
@@ -44,7 +44,7 @@ def set_1K_lut(op, key, arch):
random.seed(key)
values = random.choices(range(256), k=256)
lut_tensor = create_const_tensor(op.name + "_lut", [1, 1, 1, 256], DataType.int32, values, TensorPurpose.LUT)
- scratch_lut_tensor = lut_tensor.clone_into_fast_storage(arch)
+ scratch_lut_tensor = lut_tensor.clone_into_shram(arch)
op.set_activation_lut(scratch_lut_tensor)
@@ -52,7 +52,7 @@ def set_2K_lut(op, key, arch):
random.seed(key)
values = random.choices(range(512), k=512)
lut_tensor = create_const_tensor(op.name + "_lut", [1, 1, 1, 512], DataType.int32, values, TensorPurpose.LUT)
- scratch_lut_tensor = lut_tensor.clone_into_fast_storage(arch)
+ scratch_lut_tensor = lut_tensor.clone_into_shram(arch)
op.set_activation_lut(scratch_lut_tensor)