path: root/ethosu/vela/npu_performance.py
author     Patrik Gustavsson <patrik.gustavsson@arm.com>   2021-01-21 08:28:55 +0100
committer  patrik.gustavsson <patrik.gustavsson@arm.com>   2021-01-28 09:09:58 +0000
commit     3a26920b7cd302364d68830eb6e374311ce17f22 (patch)
tree       359913c1fbfc443611865d628414182554b0710e /ethosu/vela/npu_performance.py
parent     e22ba8cb3090886b2d80a2df0e599dbf4cd7f483 (diff)
download   ethos-u-vela-3a26920b7cd302364d68830eb6e374311ce17f22.tar.gz
MLBEDSW-3772 Reshape removal
-Removed reshapes in the original graph
-Removed the addition of reshapes to the optimized graph
-Reshapes with different ifm/ofm quantisation will remain

Signed-off-by: Patrik Gustavsson <patrik.gustavsson@arm.com>
Change-Id: I94862be53dac0d7434815e2aee5ca678228495f8
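In effect, a reshape whose input and output share the same quantisation is bypassed by rewiring its consumers to read the producer tensor directly, while a requantising reshape stays in the graph. A minimal sketch of that rule (illustrative only, assuming vela's Tensor.consumer_list and Operation.set_input_tensor APIs; this is not the actual optimisation pass):

    def try_bypass_reshape(op):
        ifm, ofm = op.ifm, op.ofm
        # quantisation check simplified for illustration; vela compares
        # scales/zero points with dedicated helpers
        if ifm.quantization != ofm.quantization:
            return False  # requantising reshapes must remain in the graph
        for cons_op in list(ofm.consumer_list):
            # reroute each consumer of the reshape's output to its input
            for idx, inp in enumerate(cons_op.inputs):
                if inp == ofm:
                    cons_op.set_input_tensor(ifm, idx)
        return True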
Diffstat (limited to 'ethosu/vela/npu_performance.py')
-rw-r--r--  ethosu/vela/npu_performance.py | 56
1 file changed, 44 insertions(+), 12 deletions(-)
diff --git a/ethosu/vela/npu_performance.py b/ethosu/vela/npu_performance.py
index c2418d73..3acd5e6c 100644
--- a/ethosu/vela/npu_performance.py
+++ b/ethosu/vela/npu_performance.py
@@ -117,15 +117,21 @@ def get_ifm_block_depth(npu_block_type, ifm_depth, ifm_elemwidth, block_traversa
return min(ifm_depth, ifm_blk_depth)
-def get_minimal_cmd_cycles(arch, ifm_tensor, ofm_tensor, ifm_blk: Block, ofm_blk: Block, output_cycles, dpu_cycles=0):
+def get_minimal_cmd_cycles(
+ arch, ifm_tensor, ofm_tensor, ifm_blk: Block, ofm_blk: Block, output_cycles, ifm_shape4D, ofm_shape4D, dpu_cycles=0
+):
ifm_tens_blk = Tensor((1, ifm_blk.height, ifm_blk.width, ifm_blk.depth), ifm_tensor.dtype, "ifm_blk")
ofm_tens_blk = Tensor((1, ofm_blk.height, ofm_blk.width, ofm_blk.depth), ofm_tensor.dtype, "ofm_blk")
cycles_ifm_blk = (
- estimate_memory_transfer_efficiency(arch, ifm_tensor.mem_area, BandwidthDirection.Read, ifm_tens_blk, ifm_blk)
+ estimate_memory_transfer_efficiency(
+ arch, ifm_tensor.mem_area, BandwidthDirection.Read, ifm_tens_blk, ifm_blk, shape4D=ifm_shape4D
+ )
/ arch.memory_bandwidths_per_cycle[ifm_tensor.mem_area]
)
cycles_ofm_blk = (
- estimate_memory_transfer_efficiency(arch, ofm_tensor.mem_area, BandwidthDirection.Write, ofm_tens_blk, ofm_blk)
+ estimate_memory_transfer_efficiency(
+ arch, ofm_tensor.mem_area, BandwidthDirection.Write, ofm_tens_blk, ofm_blk, shape4D=ofm_shape4D
+ )
/ arch.memory_bandwidths_per_cycle[ofm_tensor.mem_area]
)
return (
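Why the new ifm_shape4D/ofm_shape4D parameters: once reshapes are removed, the tensor attached to an op can keep its original storage shape while the op addresses it through a different 4D view, so strides (and with them transfer efficiency) must be derived from the op's view. A hedged illustration reusing names from this diff:

    # Assumed shapes: a buffer stored as (1, 1, 1, 1024) that an op now
    # addresses as (1, 32, 32, 1) because the reshape between them is gone.
    strides = tens.get_strides(shape4D=ifm_shape4D)  # strides of the op's view
    # get_strides(shape4D=None) keeps the pre-commit behaviour and derives
    # the strides from the tensor's own storage shape.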
@@ -204,7 +210,14 @@ def estimate_output_cycles(
if primary_op.type.is_elementwise_op() and block_config is not None:
num_elems_blk = block_config.width * block_config.height * block_config.depth
cycle_cmd = get_minimal_cmd_cycles(
- arch, ifm_tensor, ofm_tensor, block_config, block_config, num_elems_blk * cycle_per_elem
+ arch,
+ ifm_tensor,
+ ofm_tensor,
+ block_config,
+ block_config,
+ num_elems_blk * cycle_per_elem,
+ primary_op.ifm_shapes[0],
+ primary_op.ofm_shapes[0],
)
cycle_per_elem = max(cycle_per_elem, cycle_cmd / num_elems_blk)
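The max() above bounds the per-element estimate by the cost of moving one block through memory; a worked example with assumed numbers:

    # Assumed: a block of 8*8*8 = 512 elements at 0.25 cycles/elem gives 128
    # compute cycles; if get_minimal_cmd_cycles reports 300 cycles to move
    # the ifm/ofm blocks, the transfer cost dominates:
    #   cycle_per_elem = max(0.25, 300 / 512) ~= 0.59 cycles/elem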
@@ -343,7 +356,15 @@ def estimate_conv_pooling_cycles(
cycles_output_blk = max(cycles_output_blk, cycles_bias_blk)
cycles_cmd = get_minimal_cmd_cycles(
- arch, ifm_tensor, ofm_tensor, ifm_block, ofm_block, cycles_dpu_blk, cycles_output_blk
+ arch,
+ ifm_tensor,
+ ofm_tensor,
+ ifm_block,
+ ofm_block,
+ cycles_dpu_blk,
+ ifm_tens_shape,
+ ofm_tens_shape,
+ cycles_output_blk,
)
cycles_dpu_blk = max(cycles_dpu_blk, cycles_cmd)
cycles_output_blk = max(cycles_output_blk, cycles_cmd)
@@ -356,7 +377,9 @@ def estimate_conv_pooling_cycles(
return total_cycles
-def estimate_memory_transfer_efficiency(arch, mem_area, direction, tensor, block_size: Block, replace_bw=None):
+def estimate_memory_transfer_efficiency(
+ arch, mem_area, direction, tensor, block_size: Block, replace_bw=None, shape4D=None
+):
if tensor.format not in (TensorFormat.NHWC, TensorFormat.NHCWB16):
return tensor.bandwidth() if replace_bw is None else replace_bw
@@ -368,9 +391,10 @@ def estimate_memory_transfer_efficiency(arch, mem_area, direction, tensor, block
tens = tensor.clone()
if not tens.avoid_NHCWB16:
tens.set_format(TensorFormat.NHCWB16, arch)
+ strides = tens.get_strides(shape4D=shape4D)
if tens.format == TensorFormat.NHCWB16:
- if tens.get_strides()[1] == block_size.depth:
+ if strides[1] == block_size.depth:
burst_len = elem_size * block_size.depth * block_size.width
elif is_ifm:
burst_len = 16 * elem_size * block_size.width
@@ -379,12 +403,12 @@ def estimate_memory_transfer_efficiency(arch, mem_area, direction, tensor, block
else:
assert tens.format == TensorFormat.NHWC
if is_ifm:
- if tens.get_strides()[3] == block_size.depth:
+ if strides[3] == block_size.depth:
burst_len = elem_size * block_size.depth * block_size.width
else:
burst_len = elem_size * block_size.depth
else:
- if block_size.depth <= 16 and tens.get_strides()[3] == block_size.depth:
+ if block_size.depth <= 16 and strides[3] == block_size.depth:
burst_len = elem_size * block_size.depth * block_size.width
else:
burst_len = min(64, 16 * elem_size * arch.ncores, block_size.depth * elem_size)
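A worked example (assumed values) of the NHWC write branch above, showing how strongly the burst length, and hence the efficiency scaling applied to the bandwidth, depends on the depth stride matching the block depth:

    # Assumed: int8 OFM (elem_size = 1), block = 8x8x8, arch.ncores = 1.
    # strides[3] == block depth (channels contiguous in this view):
    #   burst_len = 1 * 8 * 8 = 64 bytes            -> efficient full bursts
    # strides[3] != block depth:
    #   burst_len = min(64, 16 * 1 * 1, 8 * 1) = 8  -> short, costly bursts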
@@ -585,12 +609,12 @@ def performance_metrics_for_pass(arch, ps, block_config=None, rewrite_list=None,
scaled_bws[arch.fast_storage_mem_area][tens.purpose][
BandwidthDirection.Write
] += estimate_memory_transfer_efficiency(
- arch, arch.fast_storage_mem_area, BandwidthDirection.Write, tens, ofm_block
+ arch, arch.fast_storage_mem_area, BandwidthDirection.Write, tens, ofm_block, shape4D=ps.ofm_shapes[0],
)
else:
bws[tens.mem_area][tens.purpose][BandwidthDirection.Write] += tens.bandwidth()
scaled_bws[tens.mem_area][tens.purpose][BandwidthDirection.Write] += estimate_memory_transfer_efficiency(
- arch, tens.mem_area, BandwidthDirection.Write, tens, ofm_block
+ arch, tens.mem_area, BandwidthDirection.Write, tens, ofm_block, shape4D=ps.ofm_shapes[0]
)
for tens in ps.intermediates:
@@ -612,8 +636,16 @@ def performance_metrics_for_pass(arch, ps, block_config=None, rewrite_list=None,
bw = tens.bandwidth()
bws[tens.mem_area][tens.purpose][BandwidthDirection.Read] += bw
+
+ op_shape = None
+ if ps.placement == PassPlacement.Npu and primary_op:
+ if tens == ps.ifm_tensor:
+ op_shape = ps.ifm_shapes[0]
+ elif tens == ps.ifm2_tensor:
+ op_shape = ps.ifm_shapes[1]
+
scaled_bws[tens.mem_area][tens.purpose][BandwidthDirection.Read] += estimate_memory_transfer_efficiency(
- arch, tens.mem_area, BandwidthDirection.Read, tens, ifm_block, bw
+ arch, tens.mem_area, BandwidthDirection.Read, tens, ifm_block, bw, op_shape
)
# quick build access counts for only current pass, even though these aren't the final numbers
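The operand matching in the last hunk can be read as a small selection rule; the helper below restates it for clarity (illustrative only, not code from the commit), making the fallback explicit: any tensor that is not the pass's ifm/ifm2 (weights, scales) yields None, so estimate_memory_transfer_efficiency derives strides from the tensor's stored shape exactly as before.

    def read_shape_for(ps, tens):
        if ps.placement == PassPlacement.Npu and ps.primary_op:
            if tens == ps.ifm_tensor:
                return ps.ifm_shapes[0]
            if tens == ps.ifm2_tensor:
                return ps.ifm_shapes[1]
        return None  # weights/scales: stored-shape strides, unchanged costing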