diff options
author | Johan Alfvén <johan.alfven@arm.com> | 2022-05-15 14:54:51 +0200 |
---|---|---|
committer | tim.hall <tim.hall@arm.com> | 2022-05-19 14:44:01 +0000 |
commit | 0f98de67b2f929a1297326721eb421f0a44ef216 (patch) | |
tree | 3f76d7776aa391c42ed5900e5aff96c23d519c34 | |
parent | 1e363b10a6d4ce0fc062e34df0182b847b08850d (diff) | |
download | ethos-u-vela-0f98de67b2f929a1297326721eb421f0a44ef216.tar.gz |
MLBEDSW-6384: Updated weight buffering cycle calculation
- The NPU cycles are not calculated correctly when only
one weight buffer is used, since weights cannot
be fetched in parallel.
- Added a new calculation for the single-buffer case.
Signed-off-by: Johan Alfven <johan.alfven@arm.com>
Change-Id: I8568912d11d137a298225ab77b8b3272613c76f6
-rw-r--r-- | ethosu/vela/npu_performance.py | 17 |
1 files changed, 13 insertions, 4 deletions
diff --git a/ethosu/vela/npu_performance.py b/ethosu/vela/npu_performance.py index b7607e6d..0e2e3ca2 100644 --- a/ethosu/vela/npu_performance.py +++ b/ethosu/vela/npu_performance.py @@ -50,6 +50,7 @@ from .shape4d import Shape4D from .tensor import BandwidthDirection from .tensor import MemArea from .tensor import TensorPurpose +from .tensor import TensorSubPurpose from .tflite_mapping import optype_to_builtintype as tflite_optype_to_builtintype from .tosa_mapping import optype_to_tosa_op_type as tosa_optype_to_tosa_op_type from .weight_compressor import WeightKey @@ -674,10 +675,18 @@ def estimate_full_op_performance( ) # Add cycles for Weight + Scale Transfer - cycles_a[PassCycles.Npu] = max( - cost.full_weight_transfer_cycles - slack_cycles + cost.slack_buffering_cycles, - cycles.op_cycles + max(ws_first_transfer_cycles - slack_cycles, 0), - ) + if cost.buffered_weight_tensors[0].sub_purpose == TensorSubPurpose.DoubleBuffer: + # Double buffer - weights can be fetched in parallel + cycles_a[PassCycles.Npu] = max( + cost.full_weight_transfer_cycles - slack_cycles + cost.slack_buffering_cycles, + cycles.op_cycles + max(ws_first_transfer_cycles - slack_cycles, 0), + ) + else: + # Standard buffer - weights can not be fetched in parallel so weight transfer + # must be included in the result + cycles_a[PassCycles.Npu] = ( + cycles.op_cycles + cost.full_weight_transfer_cycles - min(ws_first_transfer_cycles, slack_cycles) + ) # Add cycles for LUT Transfer cycles_a[PassCycles.Npu] += lut_transfer_cycles |