about | summary | refs | log | tree | commit | diff
path: root/ethosu/vela/npu_performance.py
diff options
context:
space:
mode:
Diffstat (limited to 'ethosu/vela/npu_performance.py')
-rw-r--r-- ethosu/vela/npu_performance.py 26
1 files changed, 21 insertions, 5 deletions
diff --git a/ethosu/vela/npu_performance.py b/ethosu/vela/npu_performance.py
index 19579520..41d75f45 100644
--- a/ethosu/vela/npu_performance.py
+++ b/ethosu/vela/npu_performance.py
@@ -283,16 +283,32 @@ def estimate_output_cycles(
def estimate_conv_pooling_cycles(
arch, npu_block_type, primary_op, block_config: Block, block_traversal, kernel_dims, ifm_tensor, ofm_tensor
):
+ ofm_ublock = Block(arch.config.ofm_ublock.width, arch.config.ofm_ublock.height, arch.config.ofm_ublock.depth)
+ ifm_tens_shape = numeric_util.full_shape(4, ifm_tensor.shape, 1)
+ ofm_tens_shape = numeric_util.full_shape(4, ofm_tensor.shape, 1)
+
+ if (
+ arch.config.ofm_ublock.height == 2
+ and npu_block_type
+ in (NpuBlockType.ConvolutionMxN, NpuBlockType.ConvolutionDepthWise, NpuBlockType.VectorProduct)
+ and ofm_tens_shape[1] == 1
+ # Optimisation only applies for even width tensors
+ and ofm_tens_shape[2] % 2 == 0
+ and kernel_dims[0] == 1
+ ):
+ ofm_ublock.width = 4
+ ofm_ublock.height = 1
+ block_config.height = 1
+
num_ublk = (
- (block_config.width // arch.config.ofm_ublock.width)
- * (block_config.height // arch.config.ofm_ublock.height)
- * (block_config.depth // arch.config.ofm_ublock.depth)
+ numeric_util.round_up_divide(block_config.width, ofm_ublock.width)
+ * (block_config.height // ofm_ublock.height)
+ * (block_config.depth // ofm_ublock.depth)
)
num_ofm_blk = 0
total_cycles = 0
num_elems_blk = block_config.width * block_config.height * block_config.depth
- ifm_tens_shape = numeric_util.full_shape(4, ifm_tensor.shape, 1)
- ofm_tens_shape = numeric_util.full_shape(4, ofm_tensor.shape, 1)
+
use_acc_40bits = is_acc_40bits_used(npu_block_type, ifm_tensor, ofm_tensor)
sub_kernel_limits = arch.sub_kernel_limits[npu_block_type]