From e5204a6d1837e2d4e9601b4da5a0c47e713257bd Mon Sep 17 00:00:00 2001 From: Diqing Zhong Date: Tue, 13 Oct 2020 11:42:37 +0200 Subject: Vela: Fix perf estimation for conv 1D reshape Change-Id: I8f139381d0e01e8ac70d89c4a312ee3000fb5fa1 Signed-off-by: Diqing Zhong --- ethosu/vela/npu_performance.py | 26 +++++++++++++++++++++----- 1 file changed, 21 insertions(+), 5 deletions(-) diff --git a/ethosu/vela/npu_performance.py b/ethosu/vela/npu_performance.py index 19579520..41d75f45 100644 --- a/ethosu/vela/npu_performance.py +++ b/ethosu/vela/npu_performance.py @@ -283,16 +283,32 @@ def estimate_output_cycles( def estimate_conv_pooling_cycles( arch, npu_block_type, primary_op, block_config: Block, block_traversal, kernel_dims, ifm_tensor, ofm_tensor ): + ofm_ublock = Block(arch.config.ofm_ublock.width, arch.config.ofm_ublock.height, arch.config.ofm_ublock.depth) + ifm_tens_shape = numeric_util.full_shape(4, ifm_tensor.shape, 1) + ofm_tens_shape = numeric_util.full_shape(4, ofm_tensor.shape, 1) + + if ( + arch.config.ofm_ublock.height == 2 + and npu_block_type + in (NpuBlockType.ConvolutionMxN, NpuBlockType.ConvolutionDepthWise, NpuBlockType.VectorProduct) + and ofm_tens_shape[1] == 1 + # Optimisation only applies for even width tensors + and ofm_tens_shape[2] % 2 == 0 + and kernel_dims[0] == 1 + ): + ofm_ublock.width = 4 + ofm_ublock.height = 1 + block_config.height = 1 + num_ublk = ( - (block_config.width // arch.config.ofm_ublock.width) - * (block_config.height // arch.config.ofm_ublock.height) - * (block_config.depth // arch.config.ofm_ublock.depth) + numeric_util.round_up_divide(block_config.width, ofm_ublock.width) + * (block_config.height // ofm_ublock.height) + * (block_config.depth // ofm_ublock.depth) ) num_ofm_blk = 0 total_cycles = 0 num_elems_blk = block_config.width * block_config.height * block_config.depth - ifm_tens_shape = numeric_util.full_shape(4, ifm_tensor.shape, 1) - ofm_tens_shape = numeric_util.full_shape(4, ofm_tensor.shape, 1) + use_acc_40bits = is_acc_40bits_used(npu_block_type, ifm_tensor, ofm_tensor) sub_kernel_limits = arch.sub_kernel_limits[npu_block_type] -- cgit v1.2.1