diff options
author | Louis Verhaard <louis.verhaard@arm.com> | 2022-03-01 11:26:58 +0100 |
---|---|---|
committer | Fredrik Svedberg <fredrik.svedberg@arm.com> | 2022-03-30 13:00:15 +0000 |
commit | cc5f4de1c35ba44fca7ff6295c6ae846f8242344 (patch) | |
tree | 68c4f8124a3ee6ec6f7fceb32a1d8aec11ac9a86 /ethosu/vela/npu_performance.py | |
parent | a19b4671dd0594181a2789930cc98bf5dc41ded4 (diff) | |
download | ethos-u-vela-cc5f4de1c35ba44fca7ff6295c6ae846f8242344.tar.gz |
MLBEDSW-6263: Use separate tensors for double buffering
Uses separate tensors for the individual weight buffers
in case of weight double buffering.
Each weight buffer tensor gets its own individual live range.
Change-Id: I724a8c61a7045615fbd2ed9535663076ac8edd13
Signed-off-by: Louis Verhaard <louis.verhaard@arm.com>
Diffstat (limited to 'ethosu/vela/npu_performance.py')
-rw-r--r-- | ethosu/vela/npu_performance.py | 14 |
1 file changed, 7 insertions, 7 deletions
diff --git a/ethosu/vela/npu_performance.py b/ethosu/vela/npu_performance.py index 8c4aee63..4ffca496 100644 --- a/ethosu/vela/npu_performance.py +++ b/ethosu/vela/npu_performance.py @@ -608,8 +608,8 @@ def estimate_full_op_performance( prev_cost = schedule.cost_map[prev_op] if prev_op else None if op.parent_op.bias: query.const_shape = Shape4D(1, 1, 1, op.ofm.shape.depth) - if cost.buffered_weight_tensor: - query.const_memory_area = cost.buffered_weight_tensor.mem_area + if cost.buffered_weight_tensors: + query.const_memory_area = cost.buffered_weight_tensors[0].mem_area else: query.const_memory_area = cost.npu_weights_tensor.mem_area @@ -637,7 +637,7 @@ def estimate_full_op_performance( # LUT read from SHRAM TODO remove? scaled_bws[lut_tensor.mem_area][lut_tensor.purpose][BandwidthDirection.Read] += bw - if cost.npu_weights_tensor and cost.buffered_weight_tensor: + if cost.npu_weights_tensor and cost.buffered_weight_tensors: # DMA Weight Transfer sz = 0 # Get the size of the first DMA @@ -649,10 +649,10 @@ def estimate_full_op_performance( total_sz = len(cost.npu_weights_tensor.buffer) bws[cost.npu_weights_tensor.mem_area][TensorPurpose.Weights][BandwidthDirection.Read] += total_sz - bws[cost.buffered_weight_tensor.mem_area][TensorPurpose.Weights][BandwidthDirection.Write] += total_sz + bws[cost.buffered_weight_tensors[0].mem_area][TensorPurpose.Weights][BandwidthDirection.Write] += total_sz ws_first_transfer_cycles = measure_mem2mem_cycles( - arch, cost.npu_weights_tensor.mem_area, cost.buffered_weight_tensor.mem_area, sz + arch, cost.npu_weights_tensor.mem_area, cost.buffered_weight_tensors[0].mem_area, sz ) # Add cycles for Weight + Scale Transfer @@ -708,7 +708,7 @@ def estimate_full_op_performance( bw = access.const_read[0] * bandwidth_compression_scale_approx bws[query.const_memory_area][TensorPurpose.Weights][BandwidthDirection.Read] += bw - if not cost.buffered_weight_tensor: + if not cost.buffered_weight_tensors: 
scaled_bws[query.const_memory_area][TensorPurpose.Weights][BandwidthDirection.Read] += bw if access.const_read[1] > 0: @@ -716,7 +716,7 @@ def estimate_full_op_performance( bw = access.const_read[1] * op.parent_op.bias.element_size() bws[query.const_memory_area][TensorPurpose.FSBias][BandwidthDirection.Read] += bw - if not cost.buffered_weight_tensor: + if not cost.buffered_weight_tensors: scaled_bws[query.const_memory_area][TensorPurpose.FSBias][BandwidthDirection.Read] += bw update_summary_cycles(arch, scaled_bws, cycles_a) |