From fd8b500085d1ac1cca54a71631d21713a3c21f09 Mon Sep 17 00:00:00 2001
From: Rickard Bolin
Date: Mon, 16 May 2022 09:11:06 +0000
Subject: MLBEDSW-6263: Use separate tensors for double buffering

Uses separate tensors for the individual weight buffers in the case of
weight double buffering. Each weight buffer tensor gets its own
individual live range.

This patch is a clone of a previously reverted patch, but with some
additional bug fixes applied.

Signed-off-by: Rickard Bolin
Change-Id: I868c70d15821eb9f1399186f2da6e7345f6ee343
---
 ethosu/vela/npu_performance.py | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/ethosu/vela/npu_performance.py b/ethosu/vela/npu_performance.py
index 81d0be7e..0c8a9073 100644
--- a/ethosu/vela/npu_performance.py
+++ b/ethosu/vela/npu_performance.py
@@ -620,8 +620,8 @@ def estimate_full_op_performance(
     prev_cost = schedule.cost_map[prev_op] if prev_op else None
     if op.parent_op.bias:
         query.const_shape = Shape4D(1, 1, 1, op.ofm.shape.depth)
-        if cost.buffered_weight_tensor:
-            query.const_memory_area = cost.buffered_weight_tensor.mem_area
+        if cost.buffered_weight_tensors:
+            query.const_memory_area = cost.buffered_weight_tensors[0].mem_area
         else:
             query.const_memory_area = cost.npu_weights_tensor.mem_area

@@ -649,7 +649,7 @@ def estimate_full_op_performance(
         # LUT read from SHRAM TODO remove?
         scaled_bws[lut_tensor.mem_area][lut_tensor.purpose][BandwidthDirection.Read] += bw

-    if cost.npu_weights_tensor and cost.buffered_weight_tensor:
+    if cost.npu_weights_tensor and cost.buffered_weight_tensors:
         # DMA Weight Transfer
         sz = 0
         # Get the size of the first DMA
@@ -661,10 +661,10 @@ def estimate_full_op_performance(
         total_sz = len(cost.npu_weights_tensor.buffer)
         bws[cost.npu_weights_tensor.mem_area][TensorPurpose.Weights][BandwidthDirection.Read] += total_sz
-        bws[cost.buffered_weight_tensor.mem_area][TensorPurpose.Weights][BandwidthDirection.Write] += total_sz
+        bws[cost.buffered_weight_tensors[0].mem_area][TensorPurpose.Weights][BandwidthDirection.Write] += total_sz

         ws_first_transfer_cycles = measure_mem2mem_cycles(
-            arch, cost.npu_weights_tensor.mem_area, cost.buffered_weight_tensor.mem_area, sz
+            arch, cost.npu_weights_tensor.mem_area, cost.buffered_weight_tensors[0].mem_area, sz
         )

         # Add cycles for Weight + Scale Transfer
@@ -720,7 +720,7 @@ def estimate_full_op_performance(
             bw = access.const_read[0] * bandwidth_compression_scale_approx
             bws[query.const_memory_area][TensorPurpose.Weights][BandwidthDirection.Read] += bw

-            if not cost.buffered_weight_tensor:
+            if not cost.buffered_weight_tensors:
                 scaled_bws[query.const_memory_area][TensorPurpose.Weights][BandwidthDirection.Read] += bw

         if access.const_read[1] > 0:
@@ -728,7 +728,7 @@ def estimate_full_op_performance(
             bw = access.const_read[1] * op.parent_op.bias.element_size()
             bws[query.const_memory_area][TensorPurpose.FSBias][BandwidthDirection.Read] += bw

-            if not cost.buffered_weight_tensor:
+            if not cost.buffered_weight_tensors:
                 scaled_bws[query.const_memory_area][TensorPurpose.FSBias][BandwidthDirection.Read] += bw

     update_summary_cycles(arch, scaled_bws, cycles_a)
--
cgit v1.2.1
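
The core of the change visible in this diff is that cost.buffered_weight_tensor
(a single optional tensor) becomes cost.buffered_weight_tensors (a list that is
empty when weights are not buffered), so each half of a double buffer is a
distinct tensor that can be given its own live range. Below is a minimal,
illustrative sketch of that idea; the names BufferTensor, SchedulerCost and
create_weight_buffers are hypothetical stand-ins, not the actual Vela classes.

from dataclasses import dataclass, field
from typing import List


@dataclass
class BufferTensor:
    # Stand-in for a weight buffer tensor placed in fast on-chip memory.
    name: str
    size: int
    mem_area: str = "SRAM"


@dataclass
class SchedulerCost:
    # Before this patch the cost record held a single optional tensor;
    # after it, a list. With double buffering the list holds two tensors,
    # each of which gets its own individual live range.
    buffered_weight_tensors: List[BufferTensor] = field(default_factory=list)


def create_weight_buffers(op_name: str, buffer_size: int, double_buffer: bool) -> List[BufferTensor]:
    # One separate tensor per buffer slot instead of a single
    # double-size tensor covering both slots.
    n_buffers = 2 if double_buffer else 1
    return [BufferTensor(f"{op_name}_weight_buffer{i}", buffer_size) for i in range(n_buffers)]


cost = SchedulerCost(create_weight_buffers("conv1", 4096, double_buffer=True))

# Downstream code, like the performance estimation in this diff, now tests
# the list for truthiness and uses element [0] as the destination of the
# first DMA weight transfer:
if cost.buffered_weight_tensors:
    first = cost.buffered_weight_tensors[0]
    print(f"first weight DMA writes {first.size} bytes to {first.name} in {first.mem_area}")

Modelling the buffers as separate tensors, as the commit message notes, lets
the allocator reason about each buffer's lifetime independently instead of
treating the whole double buffer as one region that is live throughout.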