diff options
-rw-r--r-- | ethosu/vela/npu_performance.py | 26 |
1 files changed, 21 insertions, 5 deletions
diff --git a/ethosu/vela/npu_performance.py b/ethosu/vela/npu_performance.py index 19579520..41d75f45 100644 --- a/ethosu/vela/npu_performance.py +++ b/ethosu/vela/npu_performance.py @@ -283,16 +283,32 @@ def estimate_output_cycles( def estimate_conv_pooling_cycles( arch, npu_block_type, primary_op, block_config: Block, block_traversal, kernel_dims, ifm_tensor, ofm_tensor ): + ofm_ublock = Block(arch.config.ofm_ublock.width, arch.config.ofm_ublock.height, arch.config.ofm_ublock.depth) + ifm_tens_shape = numeric_util.full_shape(4, ifm_tensor.shape, 1) + ofm_tens_shape = numeric_util.full_shape(4, ofm_tensor.shape, 1) + + if ( + arch.config.ofm_ublock.height == 2 + and npu_block_type + in (NpuBlockType.ConvolutionMxN, NpuBlockType.ConvolutionDepthWise, NpuBlockType.VectorProduct) + and ofm_tens_shape[1] == 1 + # Optimisation only applies for even width tensors + and ofm_tens_shape[2] % 2 == 0 + and kernel_dims[0] == 1 + ): + ofm_ublock.width = 4 + ofm_ublock.height = 1 + block_config.height = 1 + num_ublk = ( - (block_config.width // arch.config.ofm_ublock.width) - * (block_config.height // arch.config.ofm_ublock.height) - * (block_config.depth // arch.config.ofm_ublock.depth) + numeric_util.round_up_divide(block_config.width, ofm_ublock.width) + * (block_config.height // ofm_ublock.height) + * (block_config.depth // ofm_ublock.depth) ) num_ofm_blk = 0 total_cycles = 0 num_elems_blk = block_config.width * block_config.height * block_config.depth - ifm_tens_shape = numeric_util.full_shape(4, ifm_tensor.shape, 1) - ofm_tens_shape = numeric_util.full_shape(4, ofm_tensor.shape, 1) + use_acc_40bits = is_acc_40bits_used(npu_block_type, ifm_tensor, ofm_tensor) sub_kernel_limits = arch.sub_kernel_limits[npu_block_type] |