From a71efe00bcbac0e601a0e3812bba89da452f2aff Mon Sep 17 00:00:00 2001 From: William Isaksson Date: Wed, 12 Jul 2023 12:28:05 +0000 Subject: MLBEDSW-7754: Performance estimator is not using write/read shapes - npu_performance now uses write/read shapes instead of using ifm/ofms for memory cycle estimations. - also fixes a would be bug in the tflite_graph_optimiser, where one read shape is not Shape4D. Change-Id: I2067069a713d2cf9e65a5cc227e803de79940fff Signed-off-by: William Isaksson --- ethosu/vela/npu_performance.py | 6 +++--- ethosu/vela/scheduler.py | 33 +++++++++++++++++++++++++++------ ethosu/vela/tflite_graph_optimiser.py | 2 +- 3 files changed, 31 insertions(+), 10 deletions(-) diff --git a/ethosu/vela/npu_performance.py b/ethosu/vela/npu_performance.py index eb9f66c6..dfb7006b 100644 --- a/ethosu/vela/npu_performance.py +++ b/ethosu/vela/npu_performance.py @@ -618,15 +618,15 @@ def estimate_full_op_performance( macs = 0 query = PerformanceQuery(op.op_type.npu_block_type) - query.ifm_shape = op.ifm.shape + query.ifm_shape = op.ifm_read_shape query.ifm_format = op.ifm.format query.ifm_memory_area = op.ifm.connection.parent_tens.mem_area # Mem Area is set directly on parent_tens query.ifm_bits = op.ifm.dtype.size_in_bits() - query.ifm2_shape = op.ifm2 and op.ifm2.shape + query.ifm2_shape = op.ifm2_read_shape query.ifm2_format = op.ifm2 and op.ifm2.format query.ifm2_memory_area = op.ifm2 and op.ifm2.connection.parent_tens.mem_area query.ifm2_bits = op.ifm2 and op.ifm2.dtype.size_in_bits() - query.ofm_shape = op.ofm.shape + query.ofm_shape = op.ofm_write_shape query.ofm_memory_area = op.ofm.connection.parent_tens.mem_area query.ofm_bits = op.ofm.dtype.size_in_bits() query.ofm_format = op.ofm.format diff --git a/ethosu/vela/scheduler.py b/ethosu/vela/scheduler.py index 8188b5bb..cd716ef5 100644 --- a/ethosu/vela/scheduler.py +++ b/ethosu/vela/scheduler.py @@ -260,6 +260,27 @@ class SchedulerOperation: self.parent_ps.ifm_tensor, ) + @property + def ofm_write_shape(self): + if self.ofm: + ofm_write_shape = self.parent_op.write_shape + return ofm_write_shape if ofm_write_shape else self.ofm.shape + return None + + @property + def ifm_read_shape(self): + if self.ifm: + ifm_read_shape = self.parent_op.read_shapes[1] if self.reversed_operands else self.parent_op.read_shapes[0] + return ifm_read_shape if ifm_read_shape else self.ifm.shape + return None + + @property + def ifm2_read_shape(self): + if self.ifm2: + ifm2_read_shape = self.parent_op.read_shapes[0] if self.reversed_operands else self.parent_op.read_shapes[1] + return ifm2_read_shape if ifm2_read_shape else self.ifm2.shape + return None + def add_ifm_connection(self, conn: "Connection"): """Add input connection to another SchedulerOperation or Subgraph Input""" conn.consumers.append(self) @@ -565,15 +586,15 @@ class Scheduler: def estimate_op_performance(self, op: SchedulerOperation, block_config, ofm_depth): query = npu_performance.PerformanceQuery(op.op_type.npu_block_type) - query.ifm_shape = op.ifm.shape + query.ifm_shape = op.ifm_read_shape query.ifm_memory_area = op.ifm.connection.parent_tens.mem_area query.ifm_bits = op.ifm.dtype.size_in_bits() query.ifm_format = op.ifm.format - query.ifm2_shape = op.ifm2 and op.ifm2.shape + query.ifm2_shape = op.ifm2_read_shape query.ifm2_memory_area = op.ifm2 and op.ifm2.connection.parent_tens.mem_area query.ifm2_bits = op.ifm2 and op.ifm2.dtype.size_in_bits() query.ifm2_format = op.ifm2 and op.ifm2.format - query.ofm_shape = op.ofm.shape.with_depth(ofm_depth) + query.ofm_shape = op.ofm_write_shape.with_depth(ofm_depth) query.ofm_memory_area = op.ofm.connection.parent_tens.mem_area query.ofm_bits = op.ofm.dtype.size_in_bits() query.ofm_format = op.ofm.format @@ -588,15 +609,15 @@ class Scheduler: def estimate_element_access(self, op: SchedulerOperation, block_config, ofm_depth): query = npu_performance.PerformanceQuery(op.op_type.npu_block_type) - query.ifm_shape = op.ifm.shape + query.ifm_shape = op.ifm_read_shape query.ifm_memory_area = op.ifm.connection.parent_tens.mem_area query.ifm_bits = op.ifm.dtype.size_in_bits() query.ifm_format = op.ifm.format - query.ifm2_shape = op.ifm2 and op.ifm2.shape + query.ifm2_shape = op.ifm2_read_shape query.ifm2_memory_area = op.ifm2 and op.ifm2.connection.parent_tens.mem_area query.ifm2_bits = op.ifm2 and op.ifm2.dtype.size_in_bits() query.ifm2_format = op.ifm2 and op.ifm2.format - query.ofm_shape = op.ofm.shape.with_depth(ofm_depth) + query.ofm_shape = op.ofm_write_shape.with_depth(ofm_depth) query.ofm_memory_area = op.ofm.connection.parent_tens.mem_area query.ofm_bits = op.ofm.dtype.size_in_bits() query.ofm_format = op.ofm.format diff --git a/ethosu/vela/tflite_graph_optimiser.py b/ethosu/vela/tflite_graph_optimiser.py index c7fe6cd9..ef6b90b5 100644 --- a/ethosu/vela/tflite_graph_optimiser.py +++ b/ethosu/vela/tflite_graph_optimiser.py @@ -150,7 +150,7 @@ def rewrite_split_ops(tens, arch, nng): read_shape = None else: # the read shape is relative to each start offset - read_shape = [oe - os for oe, os in zip(offset_end, offset_start)] + read_shape = Shape4D([oe - os for oe, os in zip(offset_end, offset_start)]) # For Split the offset cannot be extracted from the tensor so it has to # be calculated from the index of the output tensor -- cgit v1.2.1