about summary refs log tree commit diff
diff options
context:
space:
mode:
author Diqing Zhong <diqing.zhong@arm.com> 2020-10-02 13:18:42 +0200
committer tim.hall <tim.hall@arm.com> 2020-11-11 11:14:53 +0000
commit 42e833d64918b666e81f957c56919d01bb6212cd (patch)
tree 7aab6627226a996e8b9bc89654f7b93b2670fbb2
parent 09387e207aa736c464cf95c8a57609aa21b65d44 (diff)
download ethos-u-vela-42e833d64918b666e81f957c56919d01bb6212cd.tar.gz
MLBEDSW-3146: memory transfers cycle estimation
- DMA ops cycle estimation for the first pass
- fix a bug in ifm_blk_depth calculation
- fix a bug in sram bandwidth calculation
- merge dpu and elementwise cycles into npu cycles
- use str.format() in performance print

Change-Id: I78895416f47fc3c652743c5da13fc45630322371
Signed-off-by: Diqing Zhong <diqing.zhong@arm.com>
(cherry picked from commit 5245e97a62c2fe54250f99b06e778f3e0c6dc376)
(cherry picked from commit 16e415677403fc04a90b1a7ec554761d38315640)
-rw-r--r-- ethosu/vela/npu_performance.py 81
-rw-r--r-- ethosu/vela/stats_writer.py 78
2 files changed, 91 insertions, 68 deletions
diff --git a/ethosu/vela/npu_performance.py b/ethosu/vela/npu_performance.py
index 4d221be..1957952 100644
--- a/ethosu/vela/npu_performance.py
+++ b/ethosu/vela/npu_performance.py
@@ -57,21 +57,19 @@ def rolling_buffer_dims_from_passes(arch, ps1, block_config_ps1, ps2, block_conf
class PassCycles(enum.IntEnum):
- Dpu = 0
- ElementWise = 1
- Cpu = 2
- SramAccess = 3
- TotalPerPass = 4
- DramAccess = 5
- OnChipFlashAccess = 6
- OffChipFlashAccess = 7
- Total = 8
- Size = 9
+ Npu = 0
+ Cpu = 1
+ SramAccess = 2
+ TotalPerPass = 3
+ DramAccess = 4
+ OnChipFlashAccess = 5
+ OffChipFlashAccess = 6
+ Total = 7
+ Size = 8
def display_name(self):
return (
- "DPU",
- "Element wise",
+ "NPU",
"CPU",
"SRAM Access",
"Total per Pass",
@@ -84,8 +82,7 @@ class PassCycles(enum.IntEnum):
def identifier_name(self):
return (
- "dpu",
- "element_wise",
+ "npu",
"cpu",
"sram_access",
"total_per_pass",
@@ -99,8 +96,7 @@ class PassCycles(enum.IntEnum):
@staticmethod
def all():
return (
- PassCycles.Dpu,
- PassCycles.ElementWise,
+ PassCycles.Npu,
PassCycles.Cpu,
PassCycles.SramAccess,
PassCycles.DramAccess,
@@ -213,7 +209,21 @@ def get_n_blocks_and_area(
return total_blocks, total_area, block_setup
-def get_output_cycle_estimate(
+def get_ifm_block_depth(npu_block_type, ifm_depth, ifm_elemwidth, block_traversal, ofm_blk_depth):
+ ifm_blk_depth = ofm_blk_depth
+
+ if npu_block_type == NpuBlockType.ConvolutionMxN or npu_block_type == NpuBlockType.ReduceSum:
+ if ifm_elemwidth == 16 or block_traversal == TensorBlockTraversal.PartKernelFirst:
+ ifm_blk_depth = 16
+ elif ifm_elemwidth == 8:
+ ifm_blk_depth = 32
+ else:
+ ifm_blk_depth = 8
+
+ return min(ifm_depth, ifm_blk_depth)
+
+
+def estimate_output_cycles(
arch, npu_block_type, primary_op, num_elems, ifm_tensor, ofm_tensor, ifm2_tensor, use_acc_40bits=False
):
faf = primary_op.activation
@@ -270,7 +280,7 @@ def get_output_cycle_estimate(
return num_elems * cycle_per_elem
-def get_conv_pooling_cycle_estimate(
+def estimate_conv_pooling_cycles(
arch, npu_block_type, primary_op, block_config: Block, block_traversal, kernel_dims, ifm_tensor, ofm_tensor
):
num_ublk = (
@@ -296,15 +306,9 @@ def get_conv_pooling_cycle_estimate(
]
sub_kernel_size = (x * y for y in sub_kernel_y for x in sub_kernel_x)
- ifm_blk_depth = 0
- if npu_block_type != NpuBlockType.Pooling:
- if ifm_tensor.dtype.size_in_bits() == 16 or block_traversal == TensorBlockTraversal.PartKernelFirst:
- ifm_blk_depth = 16
- elif ifm_tensor.dtype.size_in_bits() == 8:
- ifm_blk_depth = 32
- else:
- ifm_blk_depth = 8
-
+ ifm_blk_depth = get_ifm_block_depth(
+ npu_block_type, ifm_tens_shape[3], ifm_tensor.dtype.size_in_bits(), block_traversal, block_config.depth
+ )
cycles_dpu_blk = 0
for num_kernel_elems in sub_kernel_size:
@@ -341,7 +345,7 @@ def get_conv_pooling_cycle_estimate(
* numeric_util.round_up_divide(ofm_tens_shape[3], block_config.depth)
)
- cycles_output_blk = get_output_cycle_estimate(
+ cycles_output_blk = estimate_output_cycles(
arch, npu_block_type, primary_op, num_elems_blk, ifm_tensor, ofm_tensor, None, use_acc_40bits
)
@@ -379,6 +383,7 @@ def performance_metrics_for_pass(arch, ps, block_config=None, rewrite_list=[], f
explicit_padding = primary_op.attrs.get("explicit_padding", explicit_padding)
assert primary_op.type.npu_block_type == ps.npu_block_type
npu_block_type = primary_op.type.npu_block_type
+ block_traversal = TensorBlockTraversal.Default
ifm_tensor, _, weight_tensor, ofm_tensor = ps.get_primary_op_ifm_ifm2_weights_ofm()
@@ -395,15 +400,13 @@ def performance_metrics_for_pass(arch, ps, block_config=None, rewrite_list=[], f
ifm_tensor_shape = numeric_util.full_shape(4, ifm_tensor.shape, 1)
ifm_tensor_bandwidth_shape = numeric_util.full_shape(4, ifm_tensor.bandwidth_shape, 1)
- batch_size = ifm_tensor.shape[0]
+ batch_size = ifm_tensor_shape[0]
ifm_depth = ifm_tensor_bandwidth_shape[3]
# add in padding
ifm_tensor_shape[1] += explicit_padding[0] + explicit_padding[2] # height += top and bottom
ifm_tensor_shape[2] += explicit_padding[1] + explicit_padding[3] # width += left and right
- block_traversal = TensorBlockTraversal.Default
-
strides = primary_op.attrs["strides"]
if npu_block_type != NpuBlockType.Pooling:
if npu_block_type == NpuBlockType.ReduceSum:
@@ -514,7 +517,7 @@ def performance_metrics_for_pass(arch, ps, block_config=None, rewrite_list=[], f
macs[MacCount.NeuralNetworkMacs] += nn_ops
macs[MacCount.HardwareMacs] += num_mac_ops
- cycles[PassCycles.Dpu] = get_conv_pooling_cycle_estimate(
+ cycles[PassCycles.Npu] = estimate_conv_pooling_cycles(
arch,
npu_block_type,
primary_op,
@@ -532,7 +535,7 @@ def performance_metrics_for_pass(arch, ps, block_config=None, rewrite_list=[], f
)
num_mac_ops = nn_macs
- cycles[PassCycles.Dpu] = get_conv_pooling_cycle_estimate(
+ cycles[PassCycles.Npu] = estimate_conv_pooling_cycles(
arch,
npu_block_type,
primary_op,
@@ -558,9 +561,19 @@ def performance_metrics_for_pass(arch, ps, block_config=None, rewrite_list=[], f
weight_read_multiple = non_zero_fraction
elif npu_block_type == NpuBlockType.ElementWise:
# Work out how many elements we have and calculate performance.
- cycles[PassCycles.ElementWise] = get_output_cycle_estimate(
+ cycles[PassCycles.Npu] = estimate_output_cycles(
arch, npu_block_type, primary_op, ofm_tensor.elements(), ps.ifm_tensor, ps.ofm_tensor, ps.ifm2_tensor
)
+
+ prev_npu_pass = next((npu_ps for npu_ps in ps.dag_predecessors if npu_ps.placement is PassPlacement.Npu), None)
+ if prev_npu_pass is None:
+ # cycles for DMA ops in first pass
+ dma_ops = (op for op in ps.ops if op.type == Op.DMA)
+ for dma_op in dma_ops:
+ mem_area = dma_op.attrs["source"]
+ for tens in dma_op.inputs:
+ cycles[PassCycles.Npu] += tens.storage_size() / arch.memory_bandwidths_per_cycle[mem_area]
+
# apply the desired rewrites
for rewrite_op, tens, _, _, _, ps_to_rewrite in rewrite_list:
if ps != ps_to_rewrite:
diff --git a/ethosu/vela/stats_writer.py b/ethosu/vela/stats_writer.py
index 6fd68f8..3cd769f 100644
--- a/ethosu/vela/stats_writer.py
+++ b/ethosu/vela/stats_writer.py
@@ -154,15 +154,14 @@ def write_pass_metrics_csv(nng, pass_filename):
for mem_area in mem_areas_to_report():
for purpose, purpose_candidates in purpose_list:
for direction, direction_candidates in direction_list:
- label = "bytes_%s_%s_%s" % (mem_area.identifier_name(), purpose, direction)
+ label = "bytes_{}_{}_{}".format(mem_area.identifier_name(), purpose, direction)
bandwidth_names.append(label)
bandwidth_indices.append((mem_area, purpose_candidates, direction_candidates))
all_macs = MacCount.all()
all_cycles = (
PassCycles.Total,
- PassCycles.Dpu,
- PassCycles.ElementWise,
+ PassCycles.Npu,
PassCycles.Cpu,
PassCycles.SramAccess,
PassCycles.DramAccess,
@@ -253,16 +252,16 @@ def print_performance_metrics_for_strat(
if name:
print("", file=f)
print("Network summary for", name, file=f)
- print("Accelerator configuration %20s" % (arch.accelerator_config,), file=f)
- print("System configuration %20s" % (arch.system_config,), file=f)
- print("Accelerator clock %12d MHz" % (arch.npu_clock / 1e6,), file=f)
+ print("Accelerator configuration {:20}".format(arch.accelerator_config), file=f)
+ print("System configuration {:20}".format(arch.system_config), file=f)
+ print("Accelerator clock {:12d} MHz".format(int(arch.npu_clock / 1e6)), file=f)
for mem_area, label in mem_area_labels:
print(
- "Design peak %-25s %12.2f GB/s"
- % (label + " bandwidth", arch.memory_bandwidths_per_second[mem_area] / 1000.0 / 1000 / 1000,),
+ "Design peak {:25} {:12.2f} GB/s".format(
+ label + " bandwidth", arch.memory_bandwidths_per_second[mem_area] / 1000.0 / 1000 / 1000
+ ),
file=f,
)
-
print(file=f)
for mem_area, label in mem_area_labels:
if mem_area not in memory_used:
@@ -272,18 +271,19 @@ def print_performance_metrics_for_strat(
extra = ""
if (mem_area == MemArea.OnChipFlash or mem_area == MemArea.OffChipFlash) and bits_per_element is not None:
- extra = " (%.2f bits per element)" % (bits_per_element[mem_area],)
+ extra = " ({:.2f} bits per element)".format(bits_per_element[mem_area])
- print("Total %-25s %12.2f KiB%s" % (aug_label, memory_used[mem_area] / 1024.0, extra), file=f)
+ print("Total {:25} {:12.2f} KiB{}".format(aug_label, memory_used[mem_area] / 1024.0, extra), file=f)
print(file=f)
- print("%d passes fused into %d" % (num_passes, num_cascaded_passes), file=f)
+ print("{:d} passes fused into {:d}".format(num_passes, num_cascaded_passes), file=f)
n_cpu_operations = len(cpu_operations)
if n_operations > 0:
print(
- "%d/%d (%4.1f %%) operations falling back to the CPU"
- % (n_cpu_operations, n_operations, n_cpu_operations / n_operations * 100),
+ "{:d}/{:d} ({:4.1%}) operations falling back to the CPU".format(
+ n_cpu_operations, n_operations, n_cpu_operations / n_operations * 100
+ ),
file=f,
)
@@ -294,8 +294,9 @@ def print_performance_metrics_for_strat(
return " ".join(str(list(tens.shape)) for tens in lst)
print(
- "CPU operation: %s, inputs %s, outputs %s"
- % (op.type, format_tens_list(op.inputs), format_tens_list(op.outputs)),
+ "CPU operation: {} inputs {}, outputs {}".format(
+ op.type, format_tens_list(op.inputs), format_tens_list(op.outputs)
+ ),
file=f,
)
@@ -308,38 +309,46 @@ def print_performance_metrics_for_strat(
fm_bws = bws[TensorPurpose.FeatureMap]
aug_label = label + " bandwidth"
print(
- "Average %-25s %12.2f GB/s" % (aug_label, total_bw * midpoint_fps / 1000.0 / 1000.0 / 1000.0,),
+ "Average {:25} {:12.2f} GB/s".format(aug_label, total_bw * midpoint_fps / 1000.0 / 1000.0 / 1000.0),
file=f,
)
print(
- "Input %-25s %12.2f MB/batch"
- % (aug_label, np.sum(fm_bws[BandwidthDirection.Read]) / 1000.0 / 1000.0,),
+ "Input {:25} {:12.2f} MB/batch".format(
+ aug_label, np.sum(fm_bws[BandwidthDirection.Read]) / 1000.0 / 1000.0
+ ),
file=f,
)
- print("Weight %-25s %12.2f MB/batch" % (aug_label, np.sum(weight_bws) / 1000.0 / 1000.0,), file=f)
+ print("Weight {:25} {:12.2f} MB/batch".format(aug_label, np.sum(weight_bws) / 1000.0 / 1000.0), file=f)
print(
- "Output %-25s %12.2f MB/batch"
- % (aug_label, np.sum(fm_bws[BandwidthDirection.Write]) / 1000.0 / 1000.0,),
+ "Output {:25} {:12.2f} MB/batch".format(
+ aug_label, np.sum(fm_bws[BandwidthDirection.Write]) / 1000.0 / 1000.0
+ ),
file=f,
)
- print("Total %-25s %12.2f MB/batch" % (aug_label, total_bw / 1000.0 / 1000.0,), file=f)
+ print("Total {:25} {:12.2f} MB/batch".format(aug_label, total_bw / 1000.0 / 1000.0), file=f)
print(
- "Total %-25s per input %9.2f MB/inference (batch size %d)"
- % (aug_label, total_bw / 1000.0 / 1000.0 / batch_size, batch_size),
+ "Total {:25} per input {:9.2f} MB/inference (batch size {:d})".format(
+ aug_label, total_bw / 1000.0 / 1000.0 / batch_size, batch_size
+ ),
file=f,
)
print(file=f)
- print("Neural network macs %12d MACs/batch" % (macs[MacCount.NeuralNetworkMacs],), file=f)
- print("Hardware macs %12d MACs/batch" % (macs[MacCount.HardwareMacs],), file=f)
print(
- "Network Tops/s %12.2f Tops/s"
- % (macs[MacCount.NeuralNetworkMacs] * 2 * midpoint_fps / 1e12),
+ "Neural network macs {:12d} MACs/batch".format(int(macs[MacCount.NeuralNetworkMacs])),
+ file=f,
+ )
+ print("Hardware macs {:12d} MACs/batch".format(int(macs[MacCount.HardwareMacs])), file=f)
+ print(
+ "Network Tops/s {:12.2f} Tops/s".format(
+ macs[MacCount.NeuralNetworkMacs] * 2 * midpoint_fps / 1e12
+ ),
file=f,
)
print(
- "Hardware Tops/s %12.2f Tops/s"
- % (macs[MacCount.HardwareMacs] * 2 * midpoint_fps / 1e12),
+ "Hardware Tops/s {:12.2f} Tops/s".format(
+ macs[MacCount.HardwareMacs] * 2 * midpoint_fps / 1e12
+ ),
file=f,
)
print(file=f)
@@ -347,12 +356,13 @@ def print_performance_metrics_for_strat(
for kind in PassCycles.all():
aug_label = kind.display_name() + " cycles"
cyc = cycles[kind]
- print("%-30s %12d cycles/batch" % (aug_label, cyc,), file=f)
+ print("{:30} {:12d} cycles/batch".format(aug_label, int(cyc)), file=f)
print(file=f)
print(
- "Batch Inference time %7.2f ms, %7.2f inferences/s (batch size %d)"
- % (midpoint_inference_time * 1000, midpoint_fps, batch_size),
+ "Batch Inference time {:7.2f} ms, {:7.2f} inferences/s (batch size {:d})".format(
+ midpoint_inference_time * 1000, midpoint_fps, batch_size
+ ),
file=f,
)
print(file=f)