diff options
-rw-r--r-- | ethosu/vela/scheduler.py | 56
-rw-r--r-- | ethosu/vela/tensor.py | 7
-rw-r--r-- | ethosu/vela/test/test_lut.py | 6
3 files changed, 14 insertions, 55 deletions
diff --git a/ethosu/vela/scheduler.py b/ethosu/vela/scheduler.py index a50f262e..16531c2c 100644 --- a/ethosu/vela/scheduler.py +++ b/ethosu/vela/scheduler.py @@ -60,7 +60,6 @@ from .nn_graph import Subgraph from .live_range import ofm_can_reuse_ifm from .numeric_util import round_down from .numeric_util import round_up -from .operation import NpuBlockType from .operation import Op from .shape4d import Shape4D from .tensor import MemArea @@ -213,6 +212,14 @@ class SchedulerOperation: ps.ofm_tensor.format, ) + # LUT must be placed in shram area. The copy is done by DMA + # generated by the high level command stream generator. + for idx, tens in enumerate(self.parent_op.inputs): + if tens.purpose == TensorPurpose.LUT: + new_tens = tens.clone_into_shram(self.arch) + new_tens.consumer_list.append(self.parent_op) + self.parent_op.inputs[idx] = new_tens + # Input volume width and height required to produce the smallest possible stripe self.min_stripe_input_w, self.min_stripe_input_h = self._calculate_min_stripe_input() @@ -1379,52 +1386,6 @@ class Scheduler: ) assert max(max_mem_usage) <= staging_limit, "Allocation exceeds staging limit" - def move_constant_data(self): - """Determine if data can be moved from permanent storage to another memory area. 
A move will generate a DMA - command in the high-level command stream""" - for sched_op in self.sched_ops: - parent_op = sched_op.parent_op - is_lut_used = any(inp.purpose == TensorPurpose.LUT for inp in parent_op.inputs) - max_ifm_shram_avail = ( - (self.arch.available_shram_banks(is_lut_used) - self.arch.shram_reserved_output_banks) - * self.arch.shram_bank_size - // 2 - ) - - for idx, tens in enumerate(parent_op.inputs): - if tens.mem_type not in (MemType.Scratch, MemType.Scratch_fast): - # Tensor is in permanent storage - # Only when permanent storage differs from feature map storage, there is a point moving the data - if ( - tens.mem_area in self.arch.permanent_storage_mem_area - and self.arch.permanent_storage_mem_area != self.arch.feature_map_storage_mem_area - ) or tens.purpose == TensorPurpose.LUT: - if tens.purpose == TensorPurpose.LUT or ( - # For elementwise broadcast - tens.purpose == TensorPurpose.FeatureMap - and sched_op.op_type.is_binary_elementwise_op() - and tens.shape != [] - and sched_op.ifm.shape != sched_op.ofm.shape - and parent_op.write_shape is None - and tens.storage_size() > max_ifm_shram_avail - ): - only_vector_product_consumers = all( - oper and oper.type.npu_block_type == NpuBlockType.VectorProduct - for oper in tens.consumers() - ) - - if (not only_vector_product_consumers) or tens.purpose == TensorPurpose.LUT: - new_tens = tens.clone_into_fast_storage(self.arch) - if tens.purpose == TensorPurpose.LUT: - new_tens.mem_area = MemArea.Shram - - new_tens.consumer_list.append(parent_op) - parent_op.inputs[idx] = new_tens - # If the index is out of range, IFM and IFM2 are the same tensor - # and pass inputs don't have duplicates - if idx < len(sched_op.parent_ps.inputs): - sched_op.parent_ps.inputs[idx] = new_tens - def print_schedule(self, schedule: Schedule): print(f"Schedule: '{schedule.name}'") for sched_op in self.sched_ops: @@ -1634,7 +1595,6 @@ def schedule_passes(nng: Graph, arch: ArchitectureFeatures, options, scheduler_o 
scheduler.create_scheduler_representation(arch) sg.sched_ops = scheduler.sched_ops - scheduler.move_constant_data() # Create the Max schedule template max_schedule_template = scheduler.create_initial_schedule() diff --git a/ethosu/vela/tensor.py b/ethosu/vela/tensor.py index 008cd05e..86306cad 100644 --- a/ethosu/vela/tensor.py +++ b/ethosu/vela/tensor.py @@ -501,10 +501,9 @@ class Tensor: return res - def clone_into_fast_storage(self, arch) -> "Tensor": - res = self.clone(suffix="_fast_storage") - res.mem_area = arch.fast_storage_mem_area - res.mem_type = MemType.Scratch_fast + def clone_into_shram(self, arch) -> "Tensor": + res = self.clone(suffix="_shram") + res.mem_area = MemArea.Shram res.src_tensor = self return res diff --git a/ethosu/vela/test/test_lut.py b/ethosu/vela/test/test_lut.py index 58e72bbf..e52b4896 100644 --- a/ethosu/vela/test/test_lut.py +++ b/ethosu/vela/test/test_lut.py @@ -36,7 +36,7 @@ def set_256_lut(op, key, arch): random.seed(key) values = random.choices(range(256), k=256) lut_tensor = create_const_tensor(op.name + "_lut", [1, 1, 1, 256], DataType.uint8, values, TensorPurpose.LUT) - scratch_lut_tensor = lut_tensor.clone_into_fast_storage(arch) + scratch_lut_tensor = lut_tensor.clone_into_shram(arch) op.set_activation_lut(scratch_lut_tensor) @@ -44,7 +44,7 @@ def set_1K_lut(op, key, arch): random.seed(key) values = random.choices(range(256), k=256) lut_tensor = create_const_tensor(op.name + "_lut", [1, 1, 1, 256], DataType.int32, values, TensorPurpose.LUT) - scratch_lut_tensor = lut_tensor.clone_into_fast_storage(arch) + scratch_lut_tensor = lut_tensor.clone_into_shram(arch) op.set_activation_lut(scratch_lut_tensor) @@ -52,7 +52,7 @@ def set_2K_lut(op, key, arch): random.seed(key) values = random.choices(range(512), k=512) lut_tensor = create_const_tensor(op.name + "_lut", [1, 1, 1, 512], DataType.int32, values, TensorPurpose.LUT) - scratch_lut_tensor = lut_tensor.clone_into_fast_storage(arch) + scratch_lut_tensor = 
lut_tensor.clone_into_shram(arch) op.set_activation_lut(scratch_lut_tensor)