author     Diqing Zhong <diqing.zhong@arm.com>    2020-10-13 11:42:37 +0200
committer  tim.hall <tim.hall@arm.com>            2020-11-11 11:14:53 +0000
commit     e5204a6d1837e2d4e9601b4da5a0c47e713257bd (patch)
tree       72a8b1a2931a39391e7851e03f22d2bbcbd23623
parent     42e833d64918b666e81f957c56919d01bb6212cd (diff)
download   ethos-u-vela-e5204a6d1837e2d4e9601b4da5a0c47e713257bd.tar.gz
Vela: Fix perf estimation for conv 1D reshape
Change-Id: I8f139381d0e01e8ac70d89c4a312ee3000fb5fa1
Signed-off-by: Diqing Zhong <diqing.zhong@arm.com>
-rw-r--r--  ethosu/vela/npu_performance.py | 26
1 file changed, 21 insertions, 5 deletions
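The change below adjusts the cycle estimate for the conv 1D reshape case: when the architecture's ofm_ublock height is 2, the OFM has height 1 and an even width, and the kernel height is 1, the estimate switches to a 4x1 output micro-block with a block height of 1, and rounds the width division up instead of truncating. A minimal sketch of the resulting per-block micro-block count, assuming numeric_util.round_up_divide is plain ceiling division; the helper names here are illustrative only, not Vela's API:

    def round_up_divide(n, d):
        # Ceiling division; assumed to match numeric_util.round_up_divide.
        return (n + d - 1) // d

    def ublocks_per_block(block_w, block_h, block_d, ublk_w, ublk_h, ublk_d):
        # Output micro-blocks covering one block config; the width term is
        # rounded up so a partially filled micro-block still costs a full one.
        return (
            round_up_divide(block_w, ublk_w)
            * (block_h // ublk_h)
            * (block_d // ublk_d)
        )

    # A 1D-reshaped convolution (OFM height 1, even width, kernel height 1)
    # is modelled with a 4x1 micro-block instead of the 2x2 shape:
    print(ublocks_per_block(16, 2, 8, 2, 2, 8))  # 8 micro-blocks, 2x2 ublock
    print(ublocks_per_block(16, 1, 8, 4, 1, 8))  # 4 micro-blocks, 4x1 ublock

With the same number of OFM elements per block, the 4x1 shape covers the single output row in fewer micro-blocks, which is what the estimate in the diff reflects.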
diff --git a/ethosu/vela/npu_performance.py b/ethosu/vela/npu_performance.py
index 1957952..41d75f4 100644
--- a/ethosu/vela/npu_performance.py
+++ b/ethosu/vela/npu_performance.py
@@ -283,16 +283,32 @@ def estimate_output_cycles(
def estimate_conv_pooling_cycles(
arch, npu_block_type, primary_op, block_config: Block, block_traversal, kernel_dims, ifm_tensor, ofm_tensor
):
+ ofm_ublock = Block(arch.config.ofm_ublock.width, arch.config.ofm_ublock.height, arch.config.ofm_ublock.depth)
+ ifm_tens_shape = numeric_util.full_shape(4, ifm_tensor.shape, 1)
+ ofm_tens_shape = numeric_util.full_shape(4, ofm_tensor.shape, 1)
+
+ if (
+ arch.config.ofm_ublock.height == 2
+ and npu_block_type
+ in (NpuBlockType.ConvolutionMxN, NpuBlockType.ConvolutionDepthWise, NpuBlockType.VectorProduct)
+ and ofm_tens_shape[1] == 1
+ # Optimisation only applies for even width tensors
+ and ofm_tens_shape[2] % 2 == 0
+ and kernel_dims[0] == 1
+ ):
+ ofm_ublock.width = 4
+ ofm_ublock.height = 1
+ block_config.height = 1
+
num_ublk = (
- (block_config.width // arch.config.ofm_ublock.width)
- * (block_config.height // arch.config.ofm_ublock.height)
- * (block_config.depth // arch.config.ofm_ublock.depth)
+ numeric_util.round_up_divide(block_config.width, ofm_ublock.width)
+ * (block_config.height // ofm_ublock.height)
+ * (block_config.depth // ofm_ublock.depth)
)
num_ofm_blk = 0
total_cycles = 0
num_elems_blk = block_config.width * block_config.height * block_config.depth
- ifm_tens_shape = numeric_util.full_shape(4, ifm_tensor.shape, 1)
- ofm_tens_shape = numeric_util.full_shape(4, ofm_tensor.shape, 1)
+
use_acc_40bits = is_acc_40bits_used(npu_block_type, ifm_tensor, ofm_tensor)
sub_kernel_limits = arch.sub_kernel_limits[npu_block_type]