From fd8b500085d1ac1cca54a71631d21713a3c21f09 Mon Sep 17 00:00:00 2001
From: Rickard Bolin
Date: Mon, 16 May 2022 09:11:06 +0000
Subject: MLBEDSW-6263: Use separate tensors for double buffering

Uses separate tensors for the individual weight buffers in the case of
weight double buffering. Each weight buffer tensor gets its own
individual live range.

This patch is a clone of a previously reverted patch, but with some
additional bug fixes applied.

Signed-off-by: Rickard Bolin
Change-Id: I868c70d15821eb9f1399186f2da6e7345f6ee343
---
 ethosu/vela/npu_performance.py | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/ethosu/vela/npu_performance.py b/ethosu/vela/npu_performance.py
index 81d0be7e..0c8a9073 100644
--- a/ethosu/vela/npu_performance.py
+++ b/ethosu/vela/npu_performance.py
@@ -620,8 +620,8 @@ def estimate_full_op_performance(
     prev_cost = schedule.cost_map[prev_op] if prev_op else None
     if op.parent_op.bias:
         query.const_shape = Shape4D(1, 1, 1, op.ofm.shape.depth)
-        if cost.buffered_weight_tensor:
-            query.const_memory_area = cost.buffered_weight_tensor.mem_area
+        if cost.buffered_weight_tensors:
+            query.const_memory_area = cost.buffered_weight_tensors[0].mem_area
         else:
             query.const_memory_area = cost.npu_weights_tensor.mem_area

@@ -649,7 +649,7 @@ def estimate_full_op_performance(
         # LUT read from SHRAM TODO remove?
         scaled_bws[lut_tensor.mem_area][lut_tensor.purpose][BandwidthDirection.Read] += bw

-    if cost.npu_weights_tensor and cost.buffered_weight_tensor:
+    if cost.npu_weights_tensor and cost.buffered_weight_tensors:
         # DMA Weight Transfer
         sz = 0
         # Get the size of the first DMA
@@ -661,10 +661,10 @@ def estimate_full_op_performance(
         total_sz = len(cost.npu_weights_tensor.buffer)
         bws[cost.npu_weights_tensor.mem_area][TensorPurpose.Weights][BandwidthDirection.Read] += total_sz
-        bws[cost.buffered_weight_tensor.mem_area][TensorPurpose.Weights][BandwidthDirection.Write] += total_sz
+        bws[cost.buffered_weight_tensors[0].mem_area][TensorPurpose.Weights][BandwidthDirection.Write] += total_sz

         ws_first_transfer_cycles = measure_mem2mem_cycles(
-            arch, cost.npu_weights_tensor.mem_area, cost.buffered_weight_tensor.mem_area, sz
+            arch, cost.npu_weights_tensor.mem_area, cost.buffered_weight_tensors[0].mem_area, sz
         )

         # Add cycles for Weight + Scale Transfer
@@ -720,7 +720,7 @@ def estimate_full_op_performance(
             bw = access.const_read[0] * bandwidth_compression_scale_approx
             bws[query.const_memory_area][TensorPurpose.Weights][BandwidthDirection.Read] += bw

-            if not cost.buffered_weight_tensor:
+            if not cost.buffered_weight_tensors:
                 scaled_bws[query.const_memory_area][TensorPurpose.Weights][BandwidthDirection.Read] += bw

         if access.const_read[1] > 0:
@@ -728,7 +728,7 @@ def estimate_full_op_performance(
             bw = access.const_read[1] * op.parent_op.bias.element_size()
             bws[query.const_memory_area][TensorPurpose.FSBias][BandwidthDirection.Read] += bw

-            if not cost.buffered_weight_tensor:
+            if not cost.buffered_weight_tensors:
                 scaled_bws[query.const_memory_area][TensorPurpose.FSBias][BandwidthDirection.Read] += bw

     update_summary_cycles(arch, scaled_bws, cycles_a)
--
cgit v1.2.1
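
The core of the change visible in this diff is that cost.buffered_weight_tensor
(a single optional tensor) becomes cost.buffered_weight_tensors (a list that is
empty when weights are not buffered), so each half of a double buffer is a
distinct tensor that can be given its own live range. Below is a minimal,
illustrative sketch of that idea; the names BufferTensor, SchedulerCost and
create_weight_buffers are hypothetical stand-ins, not the actual Vela classes.

from dataclasses import dataclass, field
from typing import List


@dataclass
class BufferTensor:
    # Stand-in for a weight buffer tensor placed in fast on-chip memory.
    name: str
    size: int
    mem_area: str = "SRAM"


@dataclass
class SchedulerCost:
    # Before this patch the cost record held a single optional tensor;
    # after it, a list. With double buffering the list holds two tensors,
    # each of which gets its own individual live range.
    buffered_weight_tensors: List[BufferTensor] = field(default_factory=list)


def create_weight_buffers(op_name: str, buffer_size: int, double_buffer: bool) -> List[BufferTensor]:
    # One separate tensor per buffer slot instead of a single
    # double-size tensor covering both slots.
    n_buffers = 2 if double_buffer else 1
    return [BufferTensor(f"{op_name}_weight_buffer{i}", buffer_size) for i in range(n_buffers)]


cost = SchedulerCost(create_weight_buffers("conv1", 4096, double_buffer=True))

# Downstream code, like the performance estimation in this diff, now tests
# the list for truthiness and uses element [0] as the destination of the
# first DMA weight transfer:
if cost.buffered_weight_tensors:
    first = cost.buffered_weight_tensors[0]
    print(f"first weight DMA writes {first.size} bytes to {first.name} in {first.mem_area}")

Modelling the buffers as separate tensors, as the commit message notes, lets
the allocator reason about each buffer's lifetime independently instead of
treating the whole double buffer as one region that is live throughout.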