aboutsummaryrefslogtreecommitdiff
path: root/ethosu/vela/npu_performance.py
diff options
context:
space:
mode:
author Rickard Bolin <rickard.bolin@arm.com> 2022-05-16 09:11:06 +0000
committer Rickard Bolin <rickard.bolin@arm.com> 2022-05-16 15:20:20 +0000
commit fd8b500085d1ac1cca54a71631d21713a3c21f09 (patch)
tree 4a8d1c7809dc1eb748f0f0b9ba2736e5d7bb5e69 /ethosu/vela/npu_performance.py
parent 6f4cb0362a2f00b3045565de2c27f72997b2998b (diff)
download ethos-u-vela-fd8b500085d1ac1cca54a71631d21713a3c21f09.tar.gz
MLBEDSW-6263: Use separate tensors for double buffering
Uses separate tensors for the individual weight buffers in case of weight double buffering. Each weight buffer tensor gets its own individual live range. This patch is a clone of a previously reverted patch, but with some additional bug fixes applied.

Signed-off-by: Rickard Bolin <rickard.bolin@arm.com>
Change-Id: I868c70d15821eb9f1399186f2da6e7345f6ee343
Diffstat (limited to 'ethosu/vela/npu_performance.py')
-rw-r--r-- ethosu/vela/npu_performance.py | 14
1 file changed, 7 insertions, 7 deletions
diff --git a/ethosu/vela/npu_performance.py b/ethosu/vela/npu_performance.py
index 81d0be7e..0c8a9073 100644
--- a/ethosu/vela/npu_performance.py
+++ b/ethosu/vela/npu_performance.py
@@ -620,8 +620,8 @@ def estimate_full_op_performance(
prev_cost = schedule.cost_map[prev_op] if prev_op else None
if op.parent_op.bias:
query.const_shape = Shape4D(1, 1, 1, op.ofm.shape.depth)
- if cost.buffered_weight_tensor:
- query.const_memory_area = cost.buffered_weight_tensor.mem_area
+ if cost.buffered_weight_tensors:
+ query.const_memory_area = cost.buffered_weight_tensors[0].mem_area
else:
query.const_memory_area = cost.npu_weights_tensor.mem_area
@@ -649,7 +649,7 @@ def estimate_full_op_performance(
# LUT read from SHRAM TODO remove?
scaled_bws[lut_tensor.mem_area][lut_tensor.purpose][BandwidthDirection.Read] += bw
- if cost.npu_weights_tensor and cost.buffered_weight_tensor:
+ if cost.npu_weights_tensor and cost.buffered_weight_tensors:
# DMA Weight Transfer
sz = 0
# Get the size of the first DMA
@@ -661,10 +661,10 @@ def estimate_full_op_performance(
total_sz = len(cost.npu_weights_tensor.buffer)
bws[cost.npu_weights_tensor.mem_area][TensorPurpose.Weights][BandwidthDirection.Read] += total_sz
- bws[cost.buffered_weight_tensor.mem_area][TensorPurpose.Weights][BandwidthDirection.Write] += total_sz
+ bws[cost.buffered_weight_tensors[0].mem_area][TensorPurpose.Weights][BandwidthDirection.Write] += total_sz
ws_first_transfer_cycles = measure_mem2mem_cycles(
- arch, cost.npu_weights_tensor.mem_area, cost.buffered_weight_tensor.mem_area, sz
+ arch, cost.npu_weights_tensor.mem_area, cost.buffered_weight_tensors[0].mem_area, sz
)
# Add cycles for Weight + Scale Transfer
@@ -720,7 +720,7 @@ def estimate_full_op_performance(
bw = access.const_read[0] * bandwidth_compression_scale_approx
bws[query.const_memory_area][TensorPurpose.Weights][BandwidthDirection.Read] += bw
- if not cost.buffered_weight_tensor:
+ if not cost.buffered_weight_tensors:
scaled_bws[query.const_memory_area][TensorPurpose.Weights][BandwidthDirection.Read] += bw
if access.const_read[1] > 0:
@@ -728,7 +728,7 @@ def estimate_full_op_performance(
bw = access.const_read[1] * op.parent_op.bias.element_size()
bws[query.const_memory_area][TensorPurpose.FSBias][BandwidthDirection.Read] += bw
- if not cost.buffered_weight_tensor:
+ if not cost.buffered_weight_tensors:
scaled_bws[query.const_memory_area][TensorPurpose.FSBias][BandwidthDirection.Read] += bw
update_summary_cycles(arch, scaled_bws, cycles_a)