aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorDiqing Zhong <diqing.zhong@arm.com>2020-11-16 16:15:56 +0100
committerpatrik.gustavsson <patrik.gustavsson@arm.com>2020-11-19 13:30:03 +0000
commit986e3198d18d857112f15c2236fceec2397b5801 (patch)
treecdc7140d922e65fd894de5de5c685547e414c7ed
parentc8a22f198c4fb23735962fee00c062b0325ca6a6 (diff)
downloadethos-u-vela-986e3198d18d857112f15c2236fceec2397b5801.tar.gz
MLBEDSW-3476: Fix performance regression
- Improve the conv estimation when the block size is very small - Estimate cycles on bias/scale channel Signed-off-by: Diqing Zhong <diqing.zhong@arm.com> Change-Id: I275770b7f013b0812fc1ffe91f42ad07727c9dc7
-rw-r--r--ethosu/vela/npu_performance.py56
1 files changed, 45 insertions, 11 deletions
diff --git a/ethosu/vela/npu_performance.py b/ethosu/vela/npu_performance.py
index 80f2c27..29e0df9 100644
--- a/ethosu/vela/npu_performance.py
+++ b/ethosu/vela/npu_performance.py
@@ -276,11 +276,20 @@ def estimate_output_cycles(
cycle_per_elem = max(
arch.output_cycles_per_elem[output_perf_index], arch.activation_cycles_per_elem[activation_perf_index]
)
+
return num_elems * cycle_per_elem
def estimate_conv_pooling_cycles(
- arch, npu_block_type, primary_op, block_config: Block, block_traversal, kernel_dims, ifm_tensor, ofm_tensor
+ arch,
+ npu_block_type,
+ primary_op,
+ block_config: Block,
+ block_traversal,
+ kernel_dims,
+ ifm_tensor,
+ ofm_tensor,
+ scale_tensor=None,
):
ofm_ublock = Block(arch.config.ofm_ublock.width, arch.config.ofm_ublock.height, arch.config.ofm_ublock.depth)
ifm_tens_shape = numeric_util.full_shape(4, ifm_tensor.shape, 1)
@@ -299,11 +308,11 @@ def estimate_conv_pooling_cycles(
ofm_ublock.height = 1
block_config.height = 1
- num_ublk = (
- numeric_util.round_up_divide(block_config.width, ofm_ublock.width)
- * (block_config.height // ofm_ublock.height)
- * (block_config.depth // ofm_ublock.depth)
+ num_ublk_xy = numeric_util.round_up_divide(block_config.width, ofm_ublock.width) * (
+ block_config.height // ofm_ublock.height
)
+ num_ublk_z = block_config.depth // ofm_ublock.depth
+
num_ofm_blk = 0
total_cycles = 0
num_elems_blk = block_config.width * block_config.height * block_config.depth
@@ -325,29 +334,36 @@ def estimate_conv_pooling_cycles(
npu_block_type, ifm_tens_shape[3], ifm_tensor.dtype.size_in_bits(), block_traversal, block_config.depth
)
cycles_dpu_blk = 0
+ cycles_wb = 32 * ofm_ublock.depth // 8
for num_kernel_elems in sub_kernel_size:
if npu_block_type == NpuBlockType.Pooling:
- cycles = max(4, num_kernel_elems) * num_ublk
+ cycles = max(4, num_kernel_elems) * num_ublk_xy * num_ublk_z
if ifm_tensor.dtype.size_in_bits() == 16 and arch.accelerator_config != Accelerator.Ethos_U55_32:
cycles *= 2
elif npu_block_type == NpuBlockType.ConvolutionDepthWise:
- cycles = 4 * numeric_util.round_up_divide(num_kernel_elems, 4) * num_ublk
+ cycles = 4 * num_ublk_xy
if ifm_tensor.dtype.size_in_bits() == 16:
cycles *= 2
+ cycles = max(cycles_wb, cycles) * numeric_util.round_up_divide(num_kernel_elems, 4) * num_ublk_z
elif (
(npu_block_type == NpuBlockType.ConvolutionMxN and block_traversal != TensorBlockTraversal.PartKernelFirst)
or npu_block_type == NpuBlockType.VectorProduct
or npu_block_type == NpuBlockType.ReduceSum
):
- cycles = 4 * num_kernel_elems * num_ublk * numeric_util.round_up_divide(ifm_tens_shape[3], ifm_blk_depth)
+ cycles = (
+ max(cycles_wb, 4 * num_ublk_xy)
+ * num_kernel_elems
+ * num_ublk_z
+ * numeric_util.round_up_divide(ifm_tens_shape[3], ifm_blk_depth)
+ )
else:
assert block_traversal == TensorBlockTraversal.PartKernelFirst
divider = 2 if ifm_tensor.dtype.size_in_bits() == 16 else 4
- cycles = 4 * (
+ cycles = max(cycles_wb, 4 * num_ublk_xy) * (
numeric_util.round_up_divide(num_kernel_elems, divider)
* numeric_util.round_up_divide(ifm_blk_depth, 8)
- * num_ublk
+ * num_ublk_z
* numeric_util.round_up_divide(ifm_tens_shape[3], ifm_blk_depth)
)
cycles_dpu_blk += cycles
@@ -364,6 +380,16 @@ def estimate_conv_pooling_cycles(
arch, npu_block_type, primary_op, num_elems_blk, ifm_tensor, ofm_tensor, None, use_acc_40bits
)
+ if scale_tensor:
+ if scale_tensor.mem_area is MemArea.Sram:
+ latency = 32
+ elif scale_tensor.mem_area is MemArea.Dram:
+ latency = 500
+ else:
+ latency = 64
+ cycles_bias_blk = 10 * min(block_config.depth, ofm_tens_shape[3]) * latency / 256
+ cycles_output_blk = max(cycles_output_blk, cycles_bias_blk)
+
if cycles_dpu_blk > cycles_output_blk:
total_cycles = cycles_dpu_blk * num_ofm_blk + cycles_output_blk
else:
@@ -576,7 +602,15 @@ def performance_metrics_for_pass(arch, ps, block_config=None, rewrite_list=[], f
macs[MacCount.NeuralNetworkMacs] += nn_ops
macs[MacCount.HardwareMacs] += num_mac_ops
cycles[PassCycles.Npu] = estimate_conv_pooling_cycles(
- arch, npu_block_type, primary_op, ofm_block, block_traversal, kernel_dims, ifm_tensor, ofm_tensor,
+ arch,
+ npu_block_type,
+ primary_op,
+ ofm_block,
+ block_traversal,
+ kernel_dims,
+ ifm_tensor,
+ ofm_tensor,
+ ps.scale_tensor,
)
elif npu_block_type == NpuBlockType.VectorProduct:
nn_macs = (