author     Johan Alfven <johan.alfven@arm.com>            2023-03-09 08:36:10 +0100
committer  Fredrik Svedberg <fredrik.svedberg@arm.com>    2023-03-16 16:12:36 +0000
commit     126558e26df26830c2d331ec0041dc9a4f1a0d38 (patch)
tree       597de0b202cfcd3950faf9c7f4e71e56ed0d867d
parent     a5e1b6224d8436365e7f0bdb0afef060423fba57 (diff)
MLBEDSW-7352: Refactoring move_constant_data
Refactoring move_constant_data in the scheduler. The use case currently only works for LUT tensors, so the logic has been simplified. In order to make it work for other tensors, one would also have to take memory usage into consideration when building cascades, and use_fast_storage_for_feature_maps would also be affected.

Change-Id: Ic8de53b65a2c17d34515002d7f184d0ab1830222
Signed-off-by: Johan Alfven <johan.alfven@arm.com>
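For context, the simplified flow is a small per-operation rewrite step: when a SchedulerOperation is built, each LUT input is cloned into a SHRAM-resident tensor, and the original constant stays in permanent storage as the source of the DMA emitted later by the high level command stream generator. The sketch below illustrates the shape of that logic with stand-in types; StubTensor, StubOp, move_luts_to_shram, PURPOSE_LUT and MEM_AREA_SHRAM are illustrative names only, not Vela's real API, and the authoritative change is the scheduler.py and tensor.py hunks further down.

# Minimal sketch of the simplified LUT handling (stand-in types, not Vela's API).
from dataclasses import dataclass, field
from typing import List, Optional

PURPOSE_LUT = "LUT"       # stands in for TensorPurpose.LUT
MEM_AREA_SHRAM = "Shram"  # stands in for MemArea.Shram


@dataclass
class StubTensor:
    name: str
    purpose: str
    mem_area: str = "Flash"  # constants start out in permanent storage
    consumer_list: List["StubOp"] = field(default_factory=list)
    src_tensor: Optional["StubTensor"] = None

    def clone_into_shram(self) -> "StubTensor":
        # Mirrors the intent of Tensor.clone_into_shram: a "_shram" copy
        # placed in SHRAM, remembering the source tensor for the DMA copy.
        clone = StubTensor(self.name + "_shram", self.purpose, MEM_AREA_SHRAM)
        clone.src_tensor = self
        return clone


@dataclass
class StubOp:
    name: str
    inputs: List[StubTensor]


def move_luts_to_shram(op: StubOp) -> None:
    # LUT inputs must live in SHRAM; the high level command stream
    # generator later emits the DMA that copies src_tensor into place.
    for idx, tens in enumerate(op.inputs):
        if tens.purpose == PURPOSE_LUT:
            new_tens = tens.clone_into_shram()
            new_tens.consumer_list.append(op)
            op.inputs[idx] = new_tens


if __name__ == "__main__":
    op = StubOp("tanh", [StubTensor("ifm", "FeatureMap"), StubTensor("tanh_lut", PURPOSE_LUT)])
    move_luts_to_shram(op)
    print([(t.name, t.mem_area) for t in op.inputs])
    # [('ifm', 'Flash'), ('tanh_lut_shram', 'Shram')]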
-rw-r--r--  ethosu/vela/scheduler.py      | 56
-rw-r--r--  ethosu/vela/tensor.py         |  7
-rw-r--r--  ethosu/vela/test/test_lut.py  |  6
3 files changed, 14 insertions, 55 deletions
diff --git a/ethosu/vela/scheduler.py b/ethosu/vela/scheduler.py
index a50f262e..16531c2c 100644
--- a/ethosu/vela/scheduler.py
+++ b/ethosu/vela/scheduler.py
@@ -60,7 +60,6 @@ from .nn_graph import Subgraph
from .live_range import ofm_can_reuse_ifm
from .numeric_util import round_down
from .numeric_util import round_up
-from .operation import NpuBlockType
from .operation import Op
from .shape4d import Shape4D
from .tensor import MemArea
@@ -213,6 +212,14 @@ class SchedulerOperation:
ps.ofm_tensor.format,
)
+ # LUT must be placed in shram area. The copy is done by DMA
+ # generated by the high level command stream generator.
+ for idx, tens in enumerate(self.parent_op.inputs):
+ if tens.purpose == TensorPurpose.LUT:
+ new_tens = tens.clone_into_shram(self.arch)
+ new_tens.consumer_list.append(self.parent_op)
+ self.parent_op.inputs[idx] = new_tens
+
# Input volume width and height required to produce the smallest possible stripe
self.min_stripe_input_w, self.min_stripe_input_h = self._calculate_min_stripe_input()
@@ -1379,52 +1386,6 @@ class Scheduler:
)
assert max(max_mem_usage) <= staging_limit, "Allocation exceeds staging limit"
- def move_constant_data(self):
- """Determine if data can be moved from permanent storage to another memory area. A move will generate a DMA
- command in the high-level command stream"""
- for sched_op in self.sched_ops:
- parent_op = sched_op.parent_op
- is_lut_used = any(inp.purpose == TensorPurpose.LUT for inp in parent_op.inputs)
- max_ifm_shram_avail = (
- (self.arch.available_shram_banks(is_lut_used) - self.arch.shram_reserved_output_banks)
- * self.arch.shram_bank_size
- // 2
- )
-
- for idx, tens in enumerate(parent_op.inputs):
- if tens.mem_type not in (MemType.Scratch, MemType.Scratch_fast):
- # Tensor is in permanent storage
- # Only when permanent storage differs from feature map storage, there is a point moving the data
- if (
- tens.mem_area in self.arch.permanent_storage_mem_area
- and self.arch.permanent_storage_mem_area != self.arch.feature_map_storage_mem_area
- ) or tens.purpose == TensorPurpose.LUT:
- if tens.purpose == TensorPurpose.LUT or (
- # For elementwise broadcast
- tens.purpose == TensorPurpose.FeatureMap
- and sched_op.op_type.is_binary_elementwise_op()
- and tens.shape != []
- and sched_op.ifm.shape != sched_op.ofm.shape
- and parent_op.write_shape is None
- and tens.storage_size() > max_ifm_shram_avail
- ):
- only_vector_product_consumers = all(
- oper and oper.type.npu_block_type == NpuBlockType.VectorProduct
- for oper in tens.consumers()
- )
-
- if (not only_vector_product_consumers) or tens.purpose == TensorPurpose.LUT:
- new_tens = tens.clone_into_fast_storage(self.arch)
- if tens.purpose == TensorPurpose.LUT:
- new_tens.mem_area = MemArea.Shram
-
- new_tens.consumer_list.append(parent_op)
- parent_op.inputs[idx] = new_tens
- # If the index is out of range, IFM and IFM2 are the same tensor
- # and pass inputs don't have duplicates
- if idx < len(sched_op.parent_ps.inputs):
- sched_op.parent_ps.inputs[idx] = new_tens
-
def print_schedule(self, schedule: Schedule):
print(f"Schedule: '{schedule.name}'")
for sched_op in self.sched_ops:
@@ -1634,7 +1595,6 @@ def schedule_passes(nng: Graph, arch: ArchitectureFeatures, options, scheduler_o
scheduler.create_scheduler_representation(arch)
sg.sched_ops = scheduler.sched_ops
- scheduler.move_constant_data()
# Create the Max schedule template
max_schedule_template = scheduler.create_initial_schedule()
diff --git a/ethosu/vela/tensor.py b/ethosu/vela/tensor.py
index 008cd05e..86306cad 100644
--- a/ethosu/vela/tensor.py
+++ b/ethosu/vela/tensor.py
@@ -501,10 +501,9 @@ class Tensor:
return res
- def clone_into_fast_storage(self, arch) -> "Tensor":
- res = self.clone(suffix="_fast_storage")
- res.mem_area = arch.fast_storage_mem_area
- res.mem_type = MemType.Scratch_fast
+ def clone_into_shram(self, arch) -> "Tensor":
+ res = self.clone(suffix="_shram")
+ res.mem_area = MemArea.Shram
res.src_tensor = self
return res
diff --git a/ethosu/vela/test/test_lut.py b/ethosu/vela/test/test_lut.py
index 58e72bbf..e52b4896 100644
--- a/ethosu/vela/test/test_lut.py
+++ b/ethosu/vela/test/test_lut.py
@@ -36,7 +36,7 @@ def set_256_lut(op, key, arch):
random.seed(key)
values = random.choices(range(256), k=256)
lut_tensor = create_const_tensor(op.name + "_lut", [1, 1, 1, 256], DataType.uint8, values, TensorPurpose.LUT)
- scratch_lut_tensor = lut_tensor.clone_into_fast_storage(arch)
+ scratch_lut_tensor = lut_tensor.clone_into_shram(arch)
op.set_activation_lut(scratch_lut_tensor)
@@ -44,7 +44,7 @@ def set_1K_lut(op, key, arch):
random.seed(key)
values = random.choices(range(256), k=256)
lut_tensor = create_const_tensor(op.name + "_lut", [1, 1, 1, 256], DataType.int32, values, TensorPurpose.LUT)
- scratch_lut_tensor = lut_tensor.clone_into_fast_storage(arch)
+ scratch_lut_tensor = lut_tensor.clone_into_shram(arch)
op.set_activation_lut(scratch_lut_tensor)
@@ -52,7 +52,7 @@ def set_2K_lut(op, key, arch):
random.seed(key)
values = random.choices(range(512), k=512)
lut_tensor = create_const_tensor(op.name + "_lut", [1, 1, 1, 512], DataType.int32, values, TensorPurpose.LUT)
- scratch_lut_tensor = lut_tensor.clone_into_fast_storage(arch)
+ scratch_lut_tensor = lut_tensor.clone_into_shram(arch)
op.set_activation_lut(scratch_lut_tensor)