diff options
author | Johan Alfven <johan.alfven@arm.com> | 2023-03-09 08:36:10 +0100 |
---|---|---|
committer | Fredrik Svedberg <fredrik.svedberg@arm.com> | 2023-03-16 16:12:36 +0000 |
commit | 126558e26df26830c2d331ec0041dc9a4f1a0d38 (patch) | |
tree | 597de0b202cfcd3950faf9c7f4e71e56ed0d867d | |
parent | a5e1b6224d8436365e7f0bdb0afef060423fba57 (diff) | |
download | ethos-u-vela-126558e26df26830c2d331ec0041dc9a4f1a0d38.tar.gz |
MLBEDSW-7352: Refactoring move_constant_data
Refactoring move_constant_data in the scheduler. The use case currently
only works for LUT tensors, so the logic is simplified. In order to make it
work for other tensors one would also have to take memory usage into
consideration when building cascades, and
use_fast_storage_for_feature_maps would also be affected.
Change-Id: Ic8de53b65a2c17d34515002d7f184d0ab1830222
Signed-off-by: Johan Alfven <johan.alfven@arm.com>
-rw-r--r-- | ethosu/vela/scheduler.py | 56 | ||||
-rw-r--r-- | ethosu/vela/tensor.py | 7 | ||||
-rw-r--r-- | ethosu/vela/test/test_lut.py | 6 |
3 files changed, 14 insertions, 55 deletions
diff --git a/ethosu/vela/scheduler.py b/ethosu/vela/scheduler.py index a50f262e..16531c2c 100644 --- a/ethosu/vela/scheduler.py +++ b/ethosu/vela/scheduler.py @@ -60,7 +60,6 @@ from .nn_graph import Subgraph from .live_range import ofm_can_reuse_ifm from .numeric_util import round_down from .numeric_util import round_up -from .operation import NpuBlockType from .operation import Op from .shape4d import Shape4D from .tensor import MemArea @@ -213,6 +212,14 @@ class SchedulerOperation: ps.ofm_tensor.format, ) + # LUT must be placed in shram area. The copy is done by DMA + # generated by the high level command stream generator. + for idx, tens in enumerate(self.parent_op.inputs): + if tens.purpose == TensorPurpose.LUT: + new_tens = tens.clone_into_shram(self.arch) + new_tens.consumer_list.append(self.parent_op) + self.parent_op.inputs[idx] = new_tens + # Input volume width and height required to produce the smallest possible stripe self.min_stripe_input_w, self.min_stripe_input_h = self._calculate_min_stripe_input() @@ -1379,52 +1386,6 @@ class Scheduler: ) assert max(max_mem_usage) <= staging_limit, "Allocation exceeds staging limit" - def move_constant_data(self): - """Determine if data can be moved from permanent storage to another memory area. 
A move will generate a DMA - command in the high-level command stream""" - for sched_op in self.sched_ops: - parent_op = sched_op.parent_op - is_lut_used = any(inp.purpose == TensorPurpose.LUT for inp in parent_op.inputs) - max_ifm_shram_avail = ( - (self.arch.available_shram_banks(is_lut_used) - self.arch.shram_reserved_output_banks) - * self.arch.shram_bank_size - // 2 - ) - - for idx, tens in enumerate(parent_op.inputs): - if tens.mem_type not in (MemType.Scratch, MemType.Scratch_fast): - # Tensor is in permanent storage - # Only when permanent storage differs from feature map storage, there is a point moving the data - if ( - tens.mem_area in self.arch.permanent_storage_mem_area - and self.arch.permanent_storage_mem_area != self.arch.feature_map_storage_mem_area - ) or tens.purpose == TensorPurpose.LUT: - if tens.purpose == TensorPurpose.LUT or ( - # For elementwise broadcast - tens.purpose == TensorPurpose.FeatureMap - and sched_op.op_type.is_binary_elementwise_op() - and tens.shape != [] - and sched_op.ifm.shape != sched_op.ofm.shape - and parent_op.write_shape is None - and tens.storage_size() > max_ifm_shram_avail - ): - only_vector_product_consumers = all( - oper and oper.type.npu_block_type == NpuBlockType.VectorProduct - for oper in tens.consumers() - ) - - if (not only_vector_product_consumers) or tens.purpose == TensorPurpose.LUT: - new_tens = tens.clone_into_fast_storage(self.arch) - if tens.purpose == TensorPurpose.LUT: - new_tens.mem_area = MemArea.Shram - - new_tens.consumer_list.append(parent_op) - parent_op.inputs[idx] = new_tens - # If the index is out of range, IFM and IFM2 are the same tensor - # and pass inputs don't have duplicates - if idx < len(sched_op.parent_ps.inputs): - sched_op.parent_ps.inputs[idx] = new_tens - def print_schedule(self, schedule: Schedule): print(f"Schedule: '{schedule.name}'") for sched_op in self.sched_ops: @@ -1634,7 +1595,6 @@ def schedule_passes(nng: Graph, arch: ArchitectureFeatures, options, scheduler_o 
scheduler.create_scheduler_representation(arch) sg.sched_ops = scheduler.sched_ops - scheduler.move_constant_data() # Create the Max schedule template max_schedule_template = scheduler.create_initial_schedule() diff --git a/ethosu/vela/tensor.py b/ethosu/vela/tensor.py index 008cd05e..86306cad 100644 --- a/ethosu/vela/tensor.py +++ b/ethosu/vela/tensor.py @@ -501,10 +501,9 @@ class Tensor: return res - def clone_into_fast_storage(self, arch) -> "Tensor": - res = self.clone(suffix="_fast_storage") - res.mem_area = arch.fast_storage_mem_area - res.mem_type = MemType.Scratch_fast + def clone_into_shram(self, arch) -> "Tensor": + res = self.clone(suffix="_shram") + res.mem_area = MemArea.Shram res.src_tensor = self return res diff --git a/ethosu/vela/test/test_lut.py b/ethosu/vela/test/test_lut.py index 58e72bbf..e52b4896 100644 --- a/ethosu/vela/test/test_lut.py +++ b/ethosu/vela/test/test_lut.py @@ -36,7 +36,7 @@ def set_256_lut(op, key, arch): random.seed(key) values = random.choices(range(256), k=256) lut_tensor = create_const_tensor(op.name + "_lut", [1, 1, 1, 256], DataType.uint8, values, TensorPurpose.LUT) - scratch_lut_tensor = lut_tensor.clone_into_fast_storage(arch) + scratch_lut_tensor = lut_tensor.clone_into_shram(arch) op.set_activation_lut(scratch_lut_tensor) @@ -44,7 +44,7 @@ def set_1K_lut(op, key, arch): random.seed(key) values = random.choices(range(256), k=256) lut_tensor = create_const_tensor(op.name + "_lut", [1, 1, 1, 256], DataType.int32, values, TensorPurpose.LUT) - scratch_lut_tensor = lut_tensor.clone_into_fast_storage(arch) + scratch_lut_tensor = lut_tensor.clone_into_shram(arch) op.set_activation_lut(scratch_lut_tensor) @@ -52,7 +52,7 @@ def set_2K_lut(op, key, arch): random.seed(key) values = random.choices(range(512), k=512) lut_tensor = create_const_tensor(op.name + "_lut", [1, 1, 1, 512], DataType.int32, values, TensorPurpose.LUT) - scratch_lut_tensor = lut_tensor.clone_into_fast_storage(arch) + scratch_lut_tensor = 
lut_tensor.clone_into_shram(arch) op.set_activation_lut(scratch_lut_tensor) |