author     Diqing Zhong <diqing.zhong@arm.com>  2020-11-24 14:38:20 +0100
committer  patrik.gustavsson <patrik.gustavsson@arm.com>  2020-11-25 15:33:08 +0000
commit     ef0c7fe4d8c4e4d8f0b5af08111c93df9476a857 (patch)
tree       f8bb210fd6a5ee04d0be1dec3a0ee79e26d8ef9a
parent     9785cc7cc3a6f853f9d52228e1a651b41f8f189f (diff)
download   ethos-u-vela-ef0c7fe4d8c4e4d8f0b5af08111c93df9476a857.tar.gz
MLBEDSW-3530: Fix performance issue
- Improve conv estimation by adding delay cycles
- Estimate minimal block cmd cycles

Change-Id: Ibea818e8e820731fc7d05c948d5d1abd22e17089
Signed-off-by: Diqing Zhong <diqing.zhong@arm.com>
-rw-r--r--  ethosu/vela/npu_performance.py  165
1 file changed, 124 insertions(+), 41 deletions(-)
diff --git a/ethosu/vela/npu_performance.py b/ethosu/vela/npu_performance.py
index d1be5a5..9a6e8cd 100644
--- a/ethosu/vela/npu_performance.py
+++ b/ethosu/vela/npu_performance.py
@@ -35,6 +35,7 @@ from .operation import Op
from .shared_buffer_allocation import is_acc_40bits_used
from .tensor import MemArea
from .tensor import shape_num_elements
+from .tensor import Tensor
from .tensor import TensorBlockTraversal
from .tensor import TensorFormat
from .tensor import TensorPurpose
@@ -206,8 +207,39 @@ def get_ifm_block_depth(npu_block_type, ifm_depth, ifm_elemwidth, block_traversal
return min(ifm_depth, ifm_blk_depth)
+def get_minimal_cmd_cycles(arch, ifm_tensor, ofm_tensor, ifm_blk: Block, ofm_blk: Block, output_cycles, dpu_cycles=0):
+ latencies_rd = {MemArea.Sram: 32, MemArea.Dram: 500, MemArea.OnChipFlash: 64, MemArea.OffChipFlash: 64}
+ latencies_wr = {MemArea.Sram: 32, MemArea.Dram: 250, MemArea.OnChipFlash: 64, MemArea.OffChipFlash: 64}
+ ifm_tens_blk = Tensor((1, ifm_blk.height, ifm_blk.width, ifm_blk.depth), ifm_tensor.dtype, "ifm_blk")
+ ofm_tens_blk = Tensor((1, ofm_blk.height, ofm_blk.width, ofm_blk.depth), ofm_tensor.dtype, "ofm_blk")
+ cycles_ifm_blk = (
+ estimate_memory_bandwidth(arch, ifm_tensor.mem_area, BandwidthDirection.Read, ifm_tens_blk, ifm_blk)
+ / arch.memory_bandwidths_per_cycle[ifm_tensor.mem_area]
+ )
+ cycles_ofm_blk = (
+ estimate_memory_bandwidth(arch, ofm_tensor.mem_area, BandwidthDirection.Write, ofm_tens_blk, ofm_blk)
+ / arch.memory_bandwidths_per_cycle[ofm_tensor.mem_area]
+ )
+ return (
+ latencies_rd[ifm_tensor.mem_area]
+ + cycles_ifm_blk
+ + dpu_cycles
+ + output_cycles
+ + latencies_wr[ofm_tensor.mem_area]
+ + cycles_ofm_blk
+ ) / 4
+
+
def estimate_output_cycles(
- arch, npu_block_type, primary_op, num_elems, ifm_tensor, ofm_tensor, ifm2_tensor, use_acc_40bits=False
+ arch,
+ npu_block_type,
+ primary_op,
+ num_elems,
+ ifm_tensor,
+ ofm_tensor,
+ use_acc_40bits=False,
+ ifm2_tensor=None,
+ block_config: Block = None,
):
faf = None if primary_op.activation is None else primary_op.activation.op_type
if npu_block_type == NpuBlockType.ElementWise and ifm_tensor.dtype == DataType.int32:
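
Note: get_minimal_cmd_cycles, added above, puts a floor under the cost of each NPU block command: a fixed read latency for the IFM's memory area, the cycles to stream one IFM block in, the compute itself, a write latency for the OFM's memory area, and the cycles to stream one OFM block out, all divided by four (presumably amortizing over commands in flight). A minimal standalone sketch of the same arithmetic, where the bytes-per-cycle figures are assumptions standing in for arch.memory_bandwidths_per_cycle and plain byte counts stand in for estimate_memory_bandwidth:

# Sketch only; the latencies mirror the patch, the bandwidths are assumed.
LATENCY_RD = {"Sram": 32, "Dram": 500, "OnChipFlash": 64, "OffChipFlash": 64}
LATENCY_WR = {"Sram": 32, "Dram": 250, "OnChipFlash": 64, "OffChipFlash": 64}
BYTES_PER_CYCLE = {"Sram": 8, "Dram": 4, "OnChipFlash": 8, "OffChipFlash": 4}  # assumed

def minimal_cmd_cycles(ifm_mem, ofm_mem, ifm_blk_bytes, ofm_blk_bytes, output_cycles, dpu_cycles=0.0):
    # Latency + block transfer + compute, amortized over 4 commands in flight.
    cycles_ifm_blk = ifm_blk_bytes / BYTES_PER_CYCLE[ifm_mem]
    cycles_ofm_blk = ofm_blk_bytes / BYTES_PER_CYCLE[ofm_mem]
    return (LATENCY_RD[ifm_mem] + cycles_ifm_blk + dpu_cycles
            + output_cycles + LATENCY_WR[ofm_mem] + cycles_ofm_blk) / 4

# Example: an 8x8x16 int8 block streamed through SRAM -> 144.0 cycles.
print(minimal_cmd_cycles("Sram", "Sram", 8 * 8 * 16, 8 * 8 * 16, output_cycles=256.0))
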
@@ -261,6 +293,13 @@ def estimate_output_cycles(
arch.output_cycles_per_elem[output_perf_index], arch.activation_cycles_per_elem[activation_perf_index]
)
+ if primary_op.type.is_elementwise_op() and block_config is not None:
+ num_elems_blk = block_config.width * block_config.height * block_config.depth
+ cycle_cmd = get_minimal_cmd_cycles(
+ arch, ifm_tensor, ofm_tensor, block_config, block_config, num_elems_blk * cycle_per_elem
+ )
+ cycle_per_elem = max(cycle_per_elem, cycle_cmd / num_elems_blk)
+
return num_elems * cycle_per_elem
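
Note: for elementwise ops the floor is applied per block, clamping the per-element cost so that a whole block is never estimated below one minimal command. A toy illustration with made-up numbers:

# Hypothetical numbers; the clamp has the same shape as the patch above.
def clamp_cycle_per_elem(cycle_per_elem, blk_w, blk_h, blk_d, cycle_cmd):
    num_elems_blk = blk_w * blk_h * blk_d
    return max(cycle_per_elem, cycle_cmd / num_elems_blk)

# A cheap op (0.25 cycles/elem) on a 4x4x8 block with a 200-cycle command
# floor is command-bound: 200 / 128 = 1.5625 cycles/elem.
print(clamp_cycle_per_elem(0.25, 4, 4, 8, 200.0))
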
@@ -268,7 +307,8 @@ def estimate_conv_pooling_cycles(
arch,
npu_block_type,
primary_op,
- block_config: Block,
+ ifm_block: Block,
+ ofm_block: Block,
block_traversal,
kernel_dims,
ifm_tensor,
@@ -290,17 +330,15 @@ def estimate_conv_pooling_cycles(
):
ofm_ublock.width = 4
ofm_ublock.height = 1
- block_config.height = 1
-
- num_ublk_xy = numeric_util.round_up_divide(block_config.width, ofm_ublock.width) * (
- block_config.height // ofm_ublock.height
- )
- num_ublk_z = block_config.depth // ofm_ublock.depth
+ ofm_block.height = 1
+ num_ublk_x = numeric_util.round_up_divide(ofm_block.width, ofm_ublock.width)
+ num_ublk_y = ofm_block.height // ofm_ublock.height
+ num_ublk_xy = num_ublk_x * num_ublk_y
+ num_ublk_z = ofm_block.depth // ofm_ublock.depth
num_ofm_blk = 0
total_cycles = 0
- num_elems_blk = block_config.width * block_config.height * block_config.depth
-
+ num_elems_blk = ofm_block.width * ofm_block.height * ofm_block.depth
use_acc_40bits = is_acc_40bits_used(npu_block_type, ifm_tensor, ofm_tensor)
sub_kernel_limits = arch.sub_kernel_limits[npu_block_type]
@@ -314,14 +352,12 @@ def estimate_conv_pooling_cycles(
]
sub_kernel_size = (x * y for y in sub_kernel_y for x in sub_kernel_x)
- ifm_blk_depth = get_ifm_block_depth(
- npu_block_type, ifm_tens_shape[3], ifm_tensor.dtype.size_in_bits(), block_traversal, block_config.depth
- )
cycles_dpu_blk = 0
cycles_wb = 32 * ofm_ublock.depth // 8
for num_kernel_elems in sub_kernel_size:
if npu_block_type == NpuBlockType.Pooling:
+ num_kernel_steps = 1
cycles = max(4, num_kernel_elems) * num_ublk_xy * num_ublk_z
if ifm_tensor.dtype.size_in_bits() == 16 and arch.accelerator_config != Accelerator.Ethos_U55_32:
cycles *= 2
@@ -329,39 +365,64 @@ def estimate_conv_pooling_cycles(
cycles = 4 * num_ublk_xy
if ifm_tensor.dtype.size_in_bits() == 16:
cycles *= 2
- cycles = max(cycles_wb, cycles) * numeric_util.round_up_divide(num_kernel_elems, 4) * num_ublk_z
+ num_kernel_steps = numeric_util.round_up_divide(num_kernel_elems, 4)
+ cycles = max(cycles_wb, cycles) * num_kernel_steps * num_ublk_z
elif (
(npu_block_type == NpuBlockType.ConvolutionMxN and block_traversal != TensorBlockTraversal.PartKernelFirst)
or npu_block_type == NpuBlockType.VectorProduct
or npu_block_type == NpuBlockType.ReduceSum
):
- cycles = (
- max(cycles_wb, 4 * num_ublk_xy)
- * num_kernel_elems
- * num_ublk_z
- * numeric_util.round_up_divide(ifm_tens_shape[3], ifm_blk_depth)
- )
+ num_kernel_steps = num_kernel_elems
+ cycles = max(cycles_wb, 4 * num_ublk_xy) * num_kernel_steps * num_ublk_z
else:
assert block_traversal == TensorBlockTraversal.PartKernelFirst
divider = 2 if ifm_tensor.dtype.size_in_bits() == 16 else 4
+ num_kernel_steps = numeric_util.round_up_divide(num_kernel_elems, divider)
cycles = max(cycles_wb, 4 * num_ublk_xy) * (
- numeric_util.round_up_divide(num_kernel_elems, divider)
- * numeric_util.round_up_divide(ifm_blk_depth, 8)
- * num_ublk_z
- * numeric_util.round_up_divide(ifm_tens_shape[3], ifm_blk_depth)
+ num_kernel_steps * numeric_util.round_up_divide(ifm_block.depth, 8) * num_ublk_z
)
+
+ delay_cycles = 0
+ if arch.accelerator_config is Accelerator.Ethos_U55_32:
+ delay = 7 if use_acc_40bits else 3
+ if num_ublk_x == 1 and num_ublk_y == 1:
+ if num_ublk_z == 1:
+ delay_cycles = delay * num_kernel_steps
+ elif num_kernel_steps > 1:
+ delay_cycles = delay * (num_kernel_steps - 1) * num_ublk_z
+ if (num_ublk_x == 1 or num_ublk_y == 1) and num_ublk_z > 1 and use_acc_40bits:
+ delay_cycles += delay * num_ublk_z
+ else:
+ delay = (
+ 3
+ if use_acc_40bits and arch.accelerator_config in (Accelerator.Ethos_U55_64, Accelerator.Ethos_U55_128)
+ else 2
+ )
+ if num_ublk_x == 1 and num_ublk_y == 1:
+ if num_ublk_z == 1:
+ delay_cycles = delay * num_kernel_steps
+ elif num_kernel_steps > 1:
+ delay_cycles = delay * (num_kernel_steps - 1) * num_ublk_z
+
+ if npu_block_type == NpuBlockType.ConvolutionMxN and block_traversal == TensorBlockTraversal.PartKernelFirst:
+ delay_cycles *= numeric_util.round_up_divide(ifm_block.depth, 8)
+
cycles_dpu_blk += cycles
+ cycles_dpu_blk += delay_cycles
+
+ if npu_block_type in (NpuBlockType.ConvolutionMxN, NpuBlockType.VectorProduct, NpuBlockType.ReduceSum):
+ cycles_dpu_blk *= numeric_util.round_up_divide(ifm_tens_shape[3], ifm_block.depth)
cycles_dpu_blk /= arch.ncores
num_ofm_blk = (
- numeric_util.round_up_divide(ofm_tens_shape[1], block_config.height)
- * numeric_util.round_up_divide(ofm_tens_shape[2], block_config.width)
- * numeric_util.round_up_divide(ofm_tens_shape[3], block_config.depth)
+ numeric_util.round_up_divide(ofm_tens_shape[1], ofm_block.height)
+ * numeric_util.round_up_divide(ofm_tens_shape[2], ofm_block.width)
+ * numeric_util.round_up_divide(ofm_tens_shape[3], ofm_block.depth)
)
cycles_output_blk = estimate_output_cycles(
- arch, npu_block_type, primary_op, num_elems_blk, ifm_tensor, ofm_tensor, None, use_acc_40bits
+ arch, npu_block_type, primary_op, num_elems_blk, ifm_tensor, ofm_tensor, use_acc_40bits
)
if scale_tensor:
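
Note: the delay cycles introduced above are the other half of the fix. When a block covers only a single microblock in x and y, consecutive kernel steps cannot overlap in the accumulator pipeline, and Ethos-U55-32 pays a larger per-step penalty (7 cycles with 40-bit accumulators, 3 otherwise) than the wider configurations (3 or 2). A condensed, self-contained sketch of that branch structure, with plain strings standing in for Vela's Accelerator enum:

def dpu_delay_cycles(config, use_acc_40bits, num_ublk_x, num_ublk_y, num_ublk_z, num_kernel_steps):
    # Mirrors the patch; num_kernel_steps comes from the per-block-type branches above.
    delay_cycles = 0
    if config == "ethos-u55-32":
        delay = 7 if use_acc_40bits else 3
        if num_ublk_x == 1 and num_ublk_y == 1:
            if num_ublk_z == 1:
                delay_cycles = delay * num_kernel_steps
            elif num_kernel_steps > 1:
                delay_cycles = delay * (num_kernel_steps - 1) * num_ublk_z
        if (num_ublk_x == 1 or num_ublk_y == 1) and num_ublk_z > 1 and use_acc_40bits:
            delay_cycles += delay * num_ublk_z
    else:
        delay = 3 if use_acc_40bits and config in ("ethos-u55-64", "ethos-u55-128") else 2
        if num_ublk_x == 1 and num_ublk_y == 1:
            if num_ublk_z == 1:
                delay_cycles = delay * num_kernel_steps
            elif num_kernel_steps > 1:
                delay_cycles = delay * (num_kernel_steps - 1) * num_ublk_z
    # The patch additionally scales the result by ceil(ifm_block.depth / 8)
    # for part-kernel-first ConvolutionMxN.
    return delay_cycles

# 1x1 microblock footprint, 4 deep, 3 kernel steps, 40-bit acc on U55-32:
print(dpu_delay_cycles("ethos-u55-32", True, 1, 1, 4, 3))  # 7*2*4 + 7*4 = 84
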
@@ -371,9 +432,15 @@ def estimate_conv_pooling_cycles(
latency = 500
else:
latency = 64
- cycles_bias_blk = 10 * min(block_config.depth, ofm_tens_shape[3]) * latency / 256
+ cycles_bias_blk = 10 * min(ofm_block.depth, ofm_tens_shape[3]) * latency / 256
cycles_output_blk = max(cycles_output_blk, cycles_bias_blk)
+ cycles_cmd = get_minimal_cmd_cycles(
+ arch, ifm_tensor, ofm_tensor, ifm_block, ofm_block, cycles_dpu_blk, cycles_output_blk
+ )
+ cycles_dpu_blk = max(cycles_dpu_blk, cycles_cmd)
+ cycles_output_blk = max(cycles_output_blk, cycles_cmd)
+
if cycles_dpu_blk > cycles_output_blk:
total_cycles = cycles_dpu_blk * num_ofm_blk + cycles_output_blk
else:
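
Note: on the convolution/pooling path the same command floor clamps both pipelines before the dominant one is scaled across all OFM blocks; the slower pipeline runs once per block while the other contributes a single lead-in or tail block. A small sketch of that composition, with illustrative numbers:

def total_conv_cycles(cycles_dpu_blk, cycles_output_blk, cycles_cmd, num_ofm_blk):
    # Neither pipeline may undercut the per-command minimum.
    cycles_dpu_blk = max(cycles_dpu_blk, cycles_cmd)
    cycles_output_blk = max(cycles_output_blk, cycles_cmd)
    if cycles_dpu_blk > cycles_output_blk:
        # DPU-bound: the DPU runs for every block, plus one trailing output block.
        return cycles_dpu_blk * num_ofm_blk + cycles_output_blk
    # Output-bound: the output unit runs for every block, plus one leading DPU block.
    return cycles_output_blk * num_ofm_blk + cycles_dpu_blk

# With a 1200-cycle command floor, both the 900- and 400-cycle pipelines are
# clamped to 1200, so the total is 1200 * 32 + 1200 = 39600.
print(total_conv_cycles(900.0, 400.0, 1200.0, num_ofm_blk=32))
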
@@ -449,11 +516,27 @@ def performance_metrics_for_pass(arch, ps, block_config=None, rewrite_list=[], f
explicit_padding = primary_op.attrs.get("explicit_padding", explicit_padding)
assert primary_op.type.npu_block_type == ps.npu_block_type
npu_block_type = primary_op.type.npu_block_type
- block_traversal = TensorBlockTraversal.Default
ifm_tensor, _, weight_tensor, ofm_tensor = ps.get_primary_op_ifm_ifm2_weights_ofm()
ifm_tensor_shape = numeric_util.full_shape(4, ifm_tensor.shape, 1)
+ if npu_block_type == NpuBlockType.ReduceSum:
+ block_traversal = TensorBlockTraversal.DepthFirst
+ elif npu_block_type in (
+ NpuBlockType.ConvolutionMxN,
+ NpuBlockType.ConvolutionDepthWise,
+ NpuBlockType.VectorProduct,
+ ):
+ block_traversal = weight_tensor.block_traversal
+ else:
+ block_traversal = TensorBlockTraversal.Default
+ ifm_block_depth = get_ifm_block_depth(
+ npu_block_type, ifm_tensor_shape[3], ifm_tensor.dtype.size_in_bits(), block_traversal, ofm_block.depth
+ )
+ ifm_block = arch.get_ifm_block_size(
+ ifm_block_depth, ofm_block, primary_op.kernel, ifm_resampling_mode=ifm_tensor.resampling_mode
+ )
+
if npu_block_type in set(
(
NpuBlockType.ConvolutionMxN,
@@ -476,13 +559,11 @@ def performance_metrics_for_pass(arch, ps, block_config=None, rewrite_list=[], f
strides = primary_op.attrs["strides"]
if npu_block_type != NpuBlockType.Pooling:
if npu_block_type == NpuBlockType.ReduceSum:
- block_traversal = TensorBlockTraversal.DepthFirst
weight_tensor_shape = [1, 1, ifm_tensor.shape[3], ofm_tensor.shape[3]]
weight_tensor_bandwidth_shape = [0] * 4
weight_tensor_element_size = 0
weight_tensor_bandwidth_compression_scale = 0.0
else:
- block_traversal = weight_tensor.block_traversal
weight_tensor_shape = weight_tensor.shape
weight_tensor_bandwidth_shape = weight_tensor.bandwidth_shape
weight_tensor_element_size = weight_tensor.element_size()
@@ -580,13 +661,13 @@ def performance_metrics_for_pass(arch, ps, block_config=None, rewrite_list=[], f
* numeric_util.round_up(weight_tensor_shape[3], ofm_block.depth)
* n_kernel_xy
)
-
macs[MacCount.NeuralNetworkMacs] += nn_ops
macs[MacCount.HardwareMacs] += num_mac_ops
cycles[PassCycles.Npu] = estimate_conv_pooling_cycles(
arch,
npu_block_type,
primary_op,
+ ifm_block,
ofm_block,
block_traversal,
kernel_dims,
@@ -601,10 +682,9 @@ def performance_metrics_for_pass(arch, ps, block_config=None, rewrite_list=[], f
* numeric_util.round_up(weight_tensor.shape[-1], block_config[3])
)
num_mac_ops = nn_macs
- block_traversal = weight_tensor.block_traversal
cycles[PassCycles.Npu] = estimate_conv_pooling_cycles(
- arch, npu_block_type, primary_op, ofm_block, block_traversal, [1, 1], ifm_tensor, ofm_tensor,
+ arch, npu_block_type, primary_op, ifm_block, ofm_block, block_traversal, [1, 1], ifm_tensor, ofm_tensor,
)
macs[MacCount.NeuralNetworkMacs] += nn_macs
macs[MacCount.HardwareMacs] += num_mac_ops
@@ -623,14 +703,17 @@ def performance_metrics_for_pass(arch, ps, block_config=None, rewrite_list=[], f
elif npu_block_type == NpuBlockType.ElementWise:
# Work out how many elements we have and calculate performance.
cycles[PassCycles.Npu] = estimate_output_cycles(
- arch, npu_block_type, primary_op, ofm_tensor.elements(), ps.ifm_tensor, ps.ofm_tensor, ps.ifm2_tensor
+ arch,
+ npu_block_type,
+ primary_op,
+ ofm_tensor.elements(),
+ ps.ifm_tensor,
+ ps.ofm_tensor,
+ None,
+ ps.ifm2_tensor,
+ ofm_block,
)
- ifm_block_depth = get_ifm_block_depth(
- npu_block_type, ifm_tensor_shape[3], ifm_tensor.dtype.size_in_bits(), block_traversal, ofm_block.depth
- )
- ifm_block = arch.get_ifm_block_size(ifm_block_depth, ofm_block, primary_op.kernel)
-
prev_npu_pass = next((npu_ps for npu_ps in ps.dag_predecessors if npu_ps.placement is PassPlacement.Npu), None)
if prev_npu_pass is None:
# cycles for DMA ops in first pass