From 2ff8c45528cb38c28c7b7ac40bbeb3c8a374934d Mon Sep 17 00:00:00 2001 From: wilisa01 Date: Wed, 3 May 2023 11:26:22 +0000 Subject: MLBEDSW-7397: Wrong mem_area used in scheduler Performance estimation now uses the parent_tensor mem_area instead of the scheduler_op mem_area, because the mem_area is only set on the parent_tensor by the scheduler. Signed-off-by: wilisa01 Change-Id: I11f73686bfbd6958a8920c5e264a5f95cc3f23d1 --- ethosu/vela/npu_performance.py | 35 +++++++++++++++++++++-------------- ethosu/vela/scheduler.py | 12 ++++++------ 2 files changed, 27 insertions(+), 20 deletions(-) diff --git a/ethosu/vela/npu_performance.py b/ethosu/vela/npu_performance.py index 2325a9c3..eb9f66c6 100644 --- a/ethosu/vela/npu_performance.py +++ b/ethosu/vela/npu_performance.py @@ -194,7 +194,7 @@ def _strides_for_shape(shape: Shape4D, format: TensorFormat, element_bits): def _estimate_memory_transfer_efficiency( - arch, is_read, mem_area, format: TensorFormat, element_bits, block_size, shape4D, to_transfer + arch, is_read, mem_area, format, element_bits, block_size, shape4D, to_transfer ): burst_len = 8 @@ -620,14 +620,14 @@ def estimate_full_op_performance( query = PerformanceQuery(op.op_type.npu_block_type) query.ifm_shape = op.ifm.shape query.ifm_format = op.ifm.format - query.ifm_memory_area = op.ifm.mem_area + query.ifm_memory_area = op.ifm.connection.parent_tens.mem_area # Mem Area is set directly on parent_tens query.ifm_bits = op.ifm.dtype.size_in_bits() query.ifm2_shape = op.ifm2 and op.ifm2.shape query.ifm2_format = op.ifm2 and op.ifm2.format - query.ifm2_memory_area = op.ifm2 and op.ifm2.mem_area + query.ifm2_memory_area = op.ifm2 and op.ifm2.connection.parent_tens.mem_area query.ifm2_bits = op.ifm2 and op.ifm2.dtype.size_in_bits() query.ofm_shape = op.ofm.shape - query.ofm_memory_area = op.ofm.mem_area + query.ofm_memory_area = op.ofm.connection.parent_tens.mem_area query.ofm_bits = op.ofm.dtype.size_in_bits() query.ofm_format = op.ofm.format query.kernel = op.kernel @@ -715,31 +715,38 @@ def estimate_full_op_performance( cycles_a[PassCycles.Npu] += max(dma_transfer_cycles - slack_cycles, 0) # OFM write - ofm = op.parent_op.ofm + ofm = op.ofm.connection.parent_tens bw = access.ofm_write * ofm.element_size() bws[query.ofm_memory_area][ofm.purpose][BandwidthDirection.Write] += bw - scaled_bws[ofm.mem_area][ofm.purpose][BandwidthDirection.Write] += _estimate_memory_transfer_efficiency( - arch, False, query.ofm_memory_area, ofm.format, query.ofm_bits, query.config.ofm_block, query.ofm_shape, bw + scaled_bws[query.ofm_memory_area][ofm.purpose][BandwidthDirection.Write] += _estimate_memory_transfer_efficiency( + arch, + False, + query.ofm_memory_area, + query.ofm_format, + query.ofm_bits, + query.config.ofm_block, + query.ofm_shape, + bw, ) # IFM read - ifm = op.parent_op.ifm2 if op.reversed_operands else op.parent_op.ifm + ifm = op.ifm.connection.parent_tens bw = access.ifm_read[0] * ifm.element_size() - bws[ifm.mem_area][ifm.purpose][BandwidthDirection.Read] += bw - scaled_bws[ifm.mem_area][ifm.purpose][BandwidthDirection.Read] += _estimate_memory_transfer_efficiency( - arch, True, query.ifm_memory_area, ifm.format, query.ifm_bits, query.config.ifm_block, query.ifm_shape, bw + bws[query.ifm_memory_area][ifm.purpose][BandwidthDirection.Read] += bw + scaled_bws[query.ifm_memory_area][ifm.purpose][BandwidthDirection.Read] += _estimate_memory_transfer_efficiency( + arch, True, query.ifm_memory_area, query.ifm_format, query.ifm_bits, query.config.ifm_block, query.ifm_shape, bw ) if query.ifm2_shape: - ifm2 = op.parent_op.ifm if op.reversed_operands else op.parent_op.ifm2 + ifm2 = op.ifm2.connection.parent_tens bw = access.ifm_read[1] * ifm2.element_size() bws[ifm2.mem_area][ifm2.purpose][BandwidthDirection.Read] += bw scaled_bws[ifm2.mem_area][ifm2.purpose][BandwidthDirection.Read] += _estimate_memory_transfer_efficiency( arch, True, query.ifm2_memory_area, - ifm2.format, - op.ifm2.dtype.size_in_bits(), + query.ifm2_format, + query.ifm2_bits, query.config.ifm_block, query.ifm2_shape, bw, diff --git a/ethosu/vela/scheduler.py b/ethosu/vela/scheduler.py index 4befad49..8188b5bb 100644 --- a/ethosu/vela/scheduler.py +++ b/ethosu/vela/scheduler.py @@ -566,15 +566,15 @@ class Scheduler: def estimate_op_performance(self, op: SchedulerOperation, block_config, ofm_depth): query = npu_performance.PerformanceQuery(op.op_type.npu_block_type) query.ifm_shape = op.ifm.shape - query.ifm_memory_area = op.ifm.mem_area + query.ifm_memory_area = op.ifm.connection.parent_tens.mem_area query.ifm_bits = op.ifm.dtype.size_in_bits() query.ifm_format = op.ifm.format query.ifm2_shape = op.ifm2 and op.ifm2.shape - query.ifm2_memory_area = op.ifm2 and op.ifm2.mem_area + query.ifm2_memory_area = op.ifm2 and op.ifm2.connection.parent_tens.mem_area query.ifm2_bits = op.ifm2 and op.ifm2.dtype.size_in_bits() query.ifm2_format = op.ifm2 and op.ifm2.format query.ofm_shape = op.ofm.shape.with_depth(ofm_depth) - query.ofm_memory_area = op.ofm.mem_area + query.ofm_memory_area = op.ofm.connection.parent_tens.mem_area query.ofm_bits = op.ofm.dtype.size_in_bits() query.ofm_format = op.ofm.format if op.parent_op.bias: @@ -589,15 +589,15 @@ class Scheduler: def estimate_element_access(self, op: SchedulerOperation, block_config, ofm_depth): query = npu_performance.PerformanceQuery(op.op_type.npu_block_type) query.ifm_shape = op.ifm.shape - query.ifm_memory_area = op.ifm.mem_area + query.ifm_memory_area = op.ifm.connection.parent_tens.mem_area query.ifm_bits = op.ifm.dtype.size_in_bits() query.ifm_format = op.ifm.format query.ifm2_shape = op.ifm2 and op.ifm2.shape - query.ifm2_memory_area = op.ifm2 and op.ifm2.mem_area + query.ifm2_memory_area = op.ifm2 and op.ifm2.connection.parent_tens.mem_area query.ifm2_bits = op.ifm2 and op.ifm2.dtype.size_in_bits() query.ifm2_format = op.ifm2 and op.ifm2.format query.ofm_shape = op.ofm.shape.with_depth(ofm_depth) - query.ofm_memory_area = op.ofm.mem_area + query.ofm_memory_area = op.ofm.connection.parent_tens.mem_area query.ofm_bits = op.ofm.dtype.size_in_bits() query.ofm_format = op.ofm.format if op.parent_op.bias: -- cgit v1.2.1