aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author wilisa01 <william.isaksson@arm.com> 2023-05-03 11:26:22 +0000
committer Fredrik Svedberg <fredrik.svedberg@arm.com> 2023-07-31 14:12:59 +0000
commit 2ff8c45528cb38c28c7b7ac40bbeb3c8a374934d (patch)
tree 5985c2d2da8e88d831c0bb4b64b75a14867be26f
parent a4f8411f870defaba52175717b40afdd41ae0d40 (diff)
download ethos-u-vela-2ff8c45528cb38c28c7b7ac40bbeb3c8a374934d.tar.gz
MLBEDSW-7397: Wrong mem_area used in scheduler
Performance estimation now uses the parent_tensor mem_area instead of the scheduler_op mem_area, because the mem_area is only set on the parent_tensor by the scheduler.

Signed-off-by: wilisa01 <william.isaksson@arm.com>
Change-Id: I11f73686bfbd6958a8920c5e264a5f95cc3f23d1
-rw-r--r-- ethosu/vela/npu_performance.py 35
-rw-r--r-- ethosu/vela/scheduler.py 12
2 files changed, 27 insertions, 20 deletions
diff --git a/ethosu/vela/npu_performance.py b/ethosu/vela/npu_performance.py
index 2325a9c..eb9f66c 100644
--- a/ethosu/vela/npu_performance.py
+++ b/ethosu/vela/npu_performance.py
@@ -194,7 +194,7 @@ def _strides_for_shape(shape: Shape4D, format: TensorFormat, element_bits):
def _estimate_memory_transfer_efficiency(
- arch, is_read, mem_area, format: TensorFormat, element_bits, block_size, shape4D, to_transfer
+ arch, is_read, mem_area, format, element_bits, block_size, shape4D, to_transfer
):
burst_len = 8
@@ -620,14 +620,14 @@ def estimate_full_op_performance(
query = PerformanceQuery(op.op_type.npu_block_type)
query.ifm_shape = op.ifm.shape
query.ifm_format = op.ifm.format
- query.ifm_memory_area = op.ifm.mem_area
+ query.ifm_memory_area = op.ifm.connection.parent_tens.mem_area # Mem Area is set directly on parent_tens
query.ifm_bits = op.ifm.dtype.size_in_bits()
query.ifm2_shape = op.ifm2 and op.ifm2.shape
query.ifm2_format = op.ifm2 and op.ifm2.format
- query.ifm2_memory_area = op.ifm2 and op.ifm2.mem_area
+ query.ifm2_memory_area = op.ifm2 and op.ifm2.connection.parent_tens.mem_area
query.ifm2_bits = op.ifm2 and op.ifm2.dtype.size_in_bits()
query.ofm_shape = op.ofm.shape
- query.ofm_memory_area = op.ofm.mem_area
+ query.ofm_memory_area = op.ofm.connection.parent_tens.mem_area
query.ofm_bits = op.ofm.dtype.size_in_bits()
query.ofm_format = op.ofm.format
query.kernel = op.kernel
@@ -715,31 +715,38 @@ def estimate_full_op_performance(
cycles_a[PassCycles.Npu] += max(dma_transfer_cycles - slack_cycles, 0)
# OFM write
- ofm = op.parent_op.ofm
+ ofm = op.ofm.connection.parent_tens
bw = access.ofm_write * ofm.element_size()
bws[query.ofm_memory_area][ofm.purpose][BandwidthDirection.Write] += bw
- scaled_bws[ofm.mem_area][ofm.purpose][BandwidthDirection.Write] += _estimate_memory_transfer_efficiency(
- arch, False, query.ofm_memory_area, ofm.format, query.ofm_bits, query.config.ofm_block, query.ofm_shape, bw
+ scaled_bws[query.ofm_memory_area][ofm.purpose][BandwidthDirection.Write] += _estimate_memory_transfer_efficiency(
+ arch,
+ False,
+ query.ofm_memory_area,
+ query.ofm_format,
+ query.ofm_bits,
+ query.config.ofm_block,
+ query.ofm_shape,
+ bw,
)
# IFM read
- ifm = op.parent_op.ifm2 if op.reversed_operands else op.parent_op.ifm
+ ifm = op.ifm.connection.parent_tens
bw = access.ifm_read[0] * ifm.element_size()
- bws[ifm.mem_area][ifm.purpose][BandwidthDirection.Read] += bw
- scaled_bws[ifm.mem_area][ifm.purpose][BandwidthDirection.Read] += _estimate_memory_transfer_efficiency(
- arch, True, query.ifm_memory_area, ifm.format, query.ifm_bits, query.config.ifm_block, query.ifm_shape, bw
+ bws[query.ifm_memory_area][ifm.purpose][BandwidthDirection.Read] += bw
+ scaled_bws[query.ifm_memory_area][ifm.purpose][BandwidthDirection.Read] += _estimate_memory_transfer_efficiency(
+ arch, True, query.ifm_memory_area, query.ifm_format, query.ifm_bits, query.config.ifm_block, query.ifm_shape, bw
)
if query.ifm2_shape:
- ifm2 = op.parent_op.ifm if op.reversed_operands else op.parent_op.ifm2
+ ifm2 = op.ifm2.connection.parent_tens
bw = access.ifm_read[1] * ifm2.element_size()
bws[ifm2.mem_area][ifm2.purpose][BandwidthDirection.Read] += bw
scaled_bws[ifm2.mem_area][ifm2.purpose][BandwidthDirection.Read] += _estimate_memory_transfer_efficiency(
arch,
True,
query.ifm2_memory_area,
- ifm2.format,
- op.ifm2.dtype.size_in_bits(),
+ query.ifm2_format,
+ query.ifm2_bits,
query.config.ifm_block,
query.ifm2_shape,
bw,
diff --git a/ethosu/vela/scheduler.py b/ethosu/vela/scheduler.py
index 4befad4..8188b5b 100644
--- a/ethosu/vela/scheduler.py
+++ b/ethosu/vela/scheduler.py
@@ -566,15 +566,15 @@ class Scheduler:
def estimate_op_performance(self, op: SchedulerOperation, block_config, ofm_depth):
query = npu_performance.PerformanceQuery(op.op_type.npu_block_type)
query.ifm_shape = op.ifm.shape
- query.ifm_memory_area = op.ifm.mem_area
+ query.ifm_memory_area = op.ifm.connection.parent_tens.mem_area
query.ifm_bits = op.ifm.dtype.size_in_bits()
query.ifm_format = op.ifm.format
query.ifm2_shape = op.ifm2 and op.ifm2.shape
- query.ifm2_memory_area = op.ifm2 and op.ifm2.mem_area
+ query.ifm2_memory_area = op.ifm2 and op.ifm2.connection.parent_tens.mem_area
query.ifm2_bits = op.ifm2 and op.ifm2.dtype.size_in_bits()
query.ifm2_format = op.ifm2 and op.ifm2.format
query.ofm_shape = op.ofm.shape.with_depth(ofm_depth)
- query.ofm_memory_area = op.ofm.mem_area
+ query.ofm_memory_area = op.ofm.connection.parent_tens.mem_area
query.ofm_bits = op.ofm.dtype.size_in_bits()
query.ofm_format = op.ofm.format
if op.parent_op.bias:
@@ -589,15 +589,15 @@ class Scheduler:
def estimate_element_access(self, op: SchedulerOperation, block_config, ofm_depth):
query = npu_performance.PerformanceQuery(op.op_type.npu_block_type)
query.ifm_shape = op.ifm.shape
- query.ifm_memory_area = op.ifm.mem_area
+ query.ifm_memory_area = op.ifm.connection.parent_tens.mem_area
query.ifm_bits = op.ifm.dtype.size_in_bits()
query.ifm_format = op.ifm.format
query.ifm2_shape = op.ifm2 and op.ifm2.shape
- query.ifm2_memory_area = op.ifm2 and op.ifm2.mem_area
+ query.ifm2_memory_area = op.ifm2 and op.ifm2.connection.parent_tens.mem_area
query.ifm2_bits = op.ifm2 and op.ifm2.dtype.size_in_bits()
query.ifm2_format = op.ifm2 and op.ifm2.format
query.ofm_shape = op.ofm.shape.with_depth(ofm_depth)
- query.ofm_memory_area = op.ofm.mem_area
+ query.ofm_memory_area = op.ofm.connection.parent_tens.mem_area
query.ofm_bits = op.ofm.dtype.size_in_bits()
query.ofm_format = op.ofm.format
if op.parent_op.bias: