aboutsummaryrefslogtreecommitdiff
path: root/ethosu/vela/npu_performance.py
diff options
context:
space:
mode:
authorPatrik Gustavsson <patrik.gustavsson@arm.com>2020-12-16 13:08:06 +0100
committerPatrik Gustavsson <patrik.gustavsson@arm.com>2020-12-21 07:34:05 +0100
commitbf31d647dc5df47410ee577b12427ddf076d816b (patch)
tree85ddd620916565aa8565d072b764ca4918b405a1 /ethosu/vela/npu_performance.py
parent2349d429d926e258e9a61d34c7fd97660ab9fb98 (diff)
downloadethos-u-vela-bf31d647dc5df47410ee577b12427ddf076d816b.tar.gz
MLBEDSW-3645 4D class for op ifm/ofm shapes
Add 4D shape class for op Ifm/ofm shapes Signed-off-by: Patrik Gustavsson <patrik.gustavsson@arm.com> Change-Id: Ic0a98da9d2f9d085605e39a9ab5a26bad6e702a3
Diffstat (limited to 'ethosu/vela/npu_performance.py')
-rw-r--r--ethosu/vela/npu_performance.py41
1 files changed, 21 insertions, 20 deletions
diff --git a/ethosu/vela/npu_performance.py b/ethosu/vela/npu_performance.py
index c2ec4424..4ca46831 100644
--- a/ethosu/vela/npu_performance.py
+++ b/ethosu/vela/npu_performance.py
@@ -48,7 +48,7 @@ def rolling_buffer_dims_from_passes(arch, ps1, block_config_ps1, ps2, block_conf
if ps2.npu_block_type in (NpuBlockType.ConvolutionMxN, NpuBlockType.VectorProduct):
op = ps2.primary_op
- ifm_block_depth = arch.calc_ifm_block_depth(op.ifm_shapes[0][-1], op.ifm.dtype.size_in_bits())
+ ifm_block_depth = arch.calc_ifm_block_depth(op.ifm_shapes[0].depth, op.ifm.dtype.size_in_bits())
else:
ifm_block_depth = block_config_ps2[-1]
@@ -231,9 +231,9 @@ def estimate_conv_pooling_cycles(
arch.config.ofm_ublock.height == 2
and npu_block_type
in (NpuBlockType.ConvolutionMxN, NpuBlockType.ConvolutionDepthWise, NpuBlockType.VectorProduct)
- and ofm_tens_shape[1] == 1
+ and ofm_tens_shape.height == 1
# Optimisation only applies for even width tensors
- and ofm_tens_shape[2] % 2 == 0
+ and ofm_tens_shape.width % 2 == 0
and kernel_dims[0] == 1
):
ofm_ublock.width = 4
@@ -319,14 +319,14 @@ def estimate_conv_pooling_cycles(
cycles_dpu_blk += delay_cycles
if npu_block_type in (NpuBlockType.ConvolutionMxN, NpuBlockType.VectorProduct, NpuBlockType.ReduceSum):
- cycles_dpu_blk *= numeric_util.round_up_divide(ifm_tens_shape[3], ifm_block.depth)
+ cycles_dpu_blk *= numeric_util.round_up_divide(ifm_tens_shape.depth, ifm_block.depth)
cycles_dpu_blk /= arch.ncores
num_ofm_blk = (
- numeric_util.round_up_divide(ofm_tens_shape[1], ofm_block.height)
- * numeric_util.round_up_divide(ofm_tens_shape[2], ofm_block.width)
- * numeric_util.round_up_divide(ofm_tens_shape[3], ofm_block.depth)
+ numeric_util.round_up_divide(ofm_tens_shape.height, ofm_block.height)
+ * numeric_util.round_up_divide(ofm_tens_shape.width, ofm_block.width)
+ * numeric_util.round_up_divide(ofm_tens_shape.depth, ofm_block.depth)
)
cycles_output_blk = estimate_output_cycles(
@@ -336,7 +336,7 @@ def estimate_conv_pooling_cycles(
if scale_tensor:
cycles_bias_blk = (
10
- * min(ofm_block.depth, ofm_tens_shape[3])
+ * min(ofm_block.depth, ofm_tens_shape.depth)
* arch.memory_latency[scale_tensor.mem_area][BandwidthDirection.Read]
/ 256
)
@@ -420,8 +420,8 @@ def performance_metrics_for_pass(arch, ps, block_config=None, rewrite_list=None,
npu_block_type = primary_op.type.npu_block_type
ifm_tensor, _, weight_tensor, ofm_tensor = ps.get_primary_op_ifm_ifm2_weights_ofm()
- ifm_tensor_shape = list(ps.primary_op.ifm_shapes[0])
- ofm_tensor_shape = list(ps.primary_op.ofm_shapes[0])
+ ifm_tensor_shape = ps.primary_op.ifm_shapes[0].clone()
+ ofm_tensor_shape = ps.primary_op.ofm_shapes[0].clone()
if npu_block_type == NpuBlockType.ReduceSum:
block_traversal = TensorBlockTraversal.DepthFirst
@@ -434,7 +434,7 @@ def performance_metrics_for_pass(arch, ps, block_config=None, rewrite_list=None,
else:
block_traversal = TensorBlockTraversal.Default
ifm_block_depth = get_ifm_block_depth(
- npu_block_type, ifm_tensor_shape[3], ifm_tensor.dtype.size_in_bits(), block_traversal, ofm_block.depth
+ npu_block_type, ifm_tensor_shape.depth, ifm_tensor.dtype.size_in_bits(), block_traversal, ofm_block.depth
)
ifm_block = arch.get_ifm_block_size(
ifm_block_depth, ofm_block, primary_op.kernel, ifm_resampling_mode=ifm_tensor.resampling_mode
@@ -448,11 +448,12 @@ def performance_metrics_for_pass(arch, ps, block_config=None, rewrite_list=None,
NpuBlockType.ReduceSum,
):
# extent the ifm to full dimension
- batch_size = ifm_tensor_shape[0]
+
+ batch_size = ifm_tensor_shape.batch
# add in padding
- ifm_tensor_shape[1] += explicit_padding[0] + explicit_padding[2] # height += top and bottom
- ifm_tensor_shape[2] += explicit_padding[1] + explicit_padding[3] # width += left and right
+ ifm_tensor_shape.height += explicit_padding[0] + explicit_padding[2] # height += top and bottom
+ ifm_tensor_shape.width += explicit_padding[1] + explicit_padding[3] # width += left and right
if npu_block_type != NpuBlockType.Pooling:
if npu_block_type == NpuBlockType.ReduceSum:
@@ -468,9 +469,9 @@ def performance_metrics_for_pass(arch, ps, block_config=None, rewrite_list=None,
weight_tensor_bandwidth_compression_scale = weight_tensor.bandwidth_compression_scale
nn_ops = (
- int(ofm_tensor_shape[0])
- * int(ofm_tensor_shape[1])
- * int(ofm_tensor_shape[2])
+ int(ofm_tensor_shape.batch)
+ * int(ofm_tensor_shape.height)
+ * int(ofm_tensor_shape.width)
* int(weight_tensor_shape[0])
* int(weight_tensor_shape[1])
* int(weight_tensor_shape[2])
@@ -481,7 +482,7 @@ def performance_metrics_for_pass(arch, ps, block_config=None, rewrite_list=None,
primary_op.attrs["ksize"][1],
primary_op.attrs["ksize"][2],
1,
- ifm_tensor_shape[3],
+ ifm_tensor_shape.depth,
]
weight_tensor_bandwidth_shape = weight_tensor_shape
weight_tensor_element_size = 0
@@ -504,8 +505,8 @@ def performance_metrics_for_pass(arch, ps, block_config=None, rewrite_list=None,
replacement_read_bws[ifm_tensor] = ifm_tensor.bandwidth() * ifm_read_multiple
weight_read_multiple = numeric_util.round_up_divide(
- ofm_tensor_shape[1], ofm_block.height
- ) * numeric_util.round_up_divide(ofm_tensor_shape[2], ofm_block.width)
+ ofm_tensor_shape.height, ofm_block.height
+ ) * numeric_util.round_up_divide(ofm_tensor_shape.width, ofm_block.width)
replacement_read_bws[weight_tensor] = (
batch_size
* shape_num_elements(weight_tensor_bandwidth_shape)