aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Johan Alfvén <johan.alfven@arm.com> 2022-05-15 14:54:51 +0200
committer tim.hall <tim.hall@arm.com> 2022-05-19 14:44:01 +0000
commit 0f98de67b2f929a1297326721eb421f0a44ef216 (patch)
tree 3f76d7776aa391c42ed5900e5aff96c23d519c34
parent 1e363b10a6d4ce0fc062e34df0182b847b08850d (diff)
download ethos-u-vela-0f98de67b2f929a1297326721eb421f0a44ef216.tar.gz
MLBEDSW-6384: Updated weight buffering cycle calculation
- The NPU cycles are not correctly calculated when only one weight buffer is used, since weights cannot be fetched in parallel. - Added a new calculation for the single buffer case. Signed-off-by: Johan Alfven <johan.alfven@arm.com> Change-Id: I8568912d11d137a298225ab77b8b3272613c76f6
-rw-r--r--ethosu/vela/npu_performance.py17
1 files changed, 13 insertions, 4 deletions
diff --git a/ethosu/vela/npu_performance.py b/ethosu/vela/npu_performance.py
index b7607e6..0e2e3ca 100644
--- a/ethosu/vela/npu_performance.py
+++ b/ethosu/vela/npu_performance.py
@@ -50,6 +50,7 @@ from .shape4d import Shape4D
from .tensor import BandwidthDirection
from .tensor import MemArea
from .tensor import TensorPurpose
+from .tensor import TensorSubPurpose
from .tflite_mapping import optype_to_builtintype as tflite_optype_to_builtintype
from .tosa_mapping import optype_to_tosa_op_type as tosa_optype_to_tosa_op_type
from .weight_compressor import WeightKey
@@ -674,10 +675,18 @@ def estimate_full_op_performance(
)
# Add cycles for Weight + Scale Transfer
- cycles_a[PassCycles.Npu] = max(
- cost.full_weight_transfer_cycles - slack_cycles + cost.slack_buffering_cycles,
- cycles.op_cycles + max(ws_first_transfer_cycles - slack_cycles, 0),
- )
+ if cost.buffered_weight_tensors[0].sub_purpose == TensorSubPurpose.DoubleBuffer:
+ # Double buffer - weights can be fetched in parallel
+ cycles_a[PassCycles.Npu] = max(
+ cost.full_weight_transfer_cycles - slack_cycles + cost.slack_buffering_cycles,
+ cycles.op_cycles + max(ws_first_transfer_cycles - slack_cycles, 0),
+ )
+ else:
+ # Standard buffer - weights can not be fetched in parallel so weight transfer
+ # must be included in the result
+ cycles_a[PassCycles.Npu] = (
+ cycles.op_cycles + cost.full_weight_transfer_cycles - min(ws_first_transfer_cycles, slack_cycles)
+ )
# Add cycles for LUT Transfer
cycles_a[PassCycles.Npu] += lut_transfer_cycles