From 42e833d64918b666e81f957c56919d01bb6212cd Mon Sep 17 00:00:00 2001 From: Diqing Zhong Date: Fri, 2 Oct 2020 13:18:42 +0200 Subject: MLBEDSW-3146: memory transfers cycle estimation - DMA ops cycle estimation for the first pass - fix a bug in ifm_blk_depth calculation - fix a bug in sram bandwidth calculation - merge dpu and elementwise cycles into npu cycles - use str.format() in performance print Change-Id: I78895416f47fc3c652743c5da13fc45630322371 Signed-off-by: Diqing Zhong (cherry picked from commit 5245e97a62c2fe54250f99b06e778f3e0c6dc376) (cherry picked from commit 16e415677403fc04a90b1a7ec554761d38315640) --- ethosu/vela/npu_performance.py | 81 ++++++++++++++++++++++++------------------ ethosu/vela/stats_writer.py | 78 ++++++++++++++++++++++------------------ 2 files changed, 91 insertions(+), 68 deletions(-) diff --git a/ethosu/vela/npu_performance.py b/ethosu/vela/npu_performance.py index 4d221bea..19579520 100644 --- a/ethosu/vela/npu_performance.py +++ b/ethosu/vela/npu_performance.py @@ -57,21 +57,19 @@ def rolling_buffer_dims_from_passes(arch, ps1, block_config_ps1, ps2, block_conf class PassCycles(enum.IntEnum): - Dpu = 0 - ElementWise = 1 - Cpu = 2 - SramAccess = 3 - TotalPerPass = 4 - DramAccess = 5 - OnChipFlashAccess = 6 - OffChipFlashAccess = 7 - Total = 8 - Size = 9 + Npu = 0 + Cpu = 1 + SramAccess = 2 + TotalPerPass = 3 + DramAccess = 4 + OnChipFlashAccess = 5 + OffChipFlashAccess = 6 + Total = 7 + Size = 8 def display_name(self): return ( - "DPU", - "Element wise", + "NPU", "CPU", "SRAM Access", "Total per Pass", @@ -84,8 +82,7 @@ class PassCycles(enum.IntEnum): def identifier_name(self): return ( - "dpu", - "element_wise", + "npu", "cpu", "sram_access", "total_per_pass", @@ -99,8 +96,7 @@ class PassCycles(enum.IntEnum): @staticmethod def all(): return ( - PassCycles.Dpu, - PassCycles.ElementWise, + PassCycles.Npu, PassCycles.Cpu, PassCycles.SramAccess, PassCycles.DramAccess, @@ -213,7 +209,21 @@ def get_n_blocks_and_area( return total_blocks, total_area, block_setup -def get_output_cycle_estimate( +def get_ifm_block_depth(npu_block_type, ifm_depth, ifm_elemwidth, block_traversal, ofm_blk_depth): + ifm_blk_depth = ofm_blk_depth + + if npu_block_type == NpuBlockType.ConvolutionMxN or npu_block_type == NpuBlockType.ReduceSum: + if ifm_elemwidth == 16 or block_traversal == TensorBlockTraversal.PartKernelFirst: + ifm_blk_depth = 16 + elif ifm_elemwidth == 8: + ifm_blk_depth = 32 + else: + ifm_blk_depth = 8 + + return min(ifm_depth, ifm_blk_depth) + + +def estimate_output_cycles( arch, npu_block_type, primary_op, num_elems, ifm_tensor, ofm_tensor, ifm2_tensor, use_acc_40bits=False ): faf = primary_op.activation @@ -270,7 +280,7 @@ def get_output_cycle_estimate( return num_elems * cycle_per_elem -def get_conv_pooling_cycle_estimate( +def estimate_conv_pooling_cycles( arch, npu_block_type, primary_op, block_config: Block, block_traversal, kernel_dims, ifm_tensor, ofm_tensor ): num_ublk = ( @@ -296,15 +306,9 @@ def get_conv_pooling_cycle_estimate( ] sub_kernel_size = (x * y for y in sub_kernel_y for x in sub_kernel_x) - ifm_blk_depth = 0 - if npu_block_type != NpuBlockType.Pooling: - if ifm_tensor.dtype.size_in_bits() == 16 or block_traversal == TensorBlockTraversal.PartKernelFirst: - ifm_blk_depth = 16 - elif ifm_tensor.dtype.size_in_bits() == 8: - ifm_blk_depth = 32 - else: - ifm_blk_depth = 8 - + ifm_blk_depth = get_ifm_block_depth( + npu_block_type, ifm_tens_shape[3], ifm_tensor.dtype.size_in_bits(), block_traversal, block_config.depth + ) cycles_dpu_blk = 0 for num_kernel_elems in sub_kernel_size: @@ -341,7 +345,7 @@ def get_conv_pooling_cycle_estimate( * numeric_util.round_up_divide(ofm_tens_shape[3], block_config.depth) ) - cycles_output_blk = get_output_cycle_estimate( + cycles_output_blk = estimate_output_cycles( arch, npu_block_type, primary_op, num_elems_blk, ifm_tensor, ofm_tensor, None, use_acc_40bits ) @@ -379,6 +383,7 @@ def performance_metrics_for_pass(arch, ps, block_config=None, rewrite_list=[], f explicit_padding = primary_op.attrs.get("explicit_padding", explicit_padding) assert primary_op.type.npu_block_type == ps.npu_block_type npu_block_type = primary_op.type.npu_block_type + block_traversal = TensorBlockTraversal.Default ifm_tensor, _, weight_tensor, ofm_tensor = ps.get_primary_op_ifm_ifm2_weights_ofm() @@ -395,15 +400,13 @@ def performance_metrics_for_pass(arch, ps, block_config=None, rewrite_list=[], f ifm_tensor_shape = numeric_util.full_shape(4, ifm_tensor.shape, 1) ifm_tensor_bandwidth_shape = numeric_util.full_shape(4, ifm_tensor.bandwidth_shape, 1) - batch_size = ifm_tensor.shape[0] + batch_size = ifm_tensor_shape[0] ifm_depth = ifm_tensor_bandwidth_shape[3] # add in padding ifm_tensor_shape[1] += explicit_padding[0] + explicit_padding[2] # height += top and bottom ifm_tensor_shape[2] += explicit_padding[1] + explicit_padding[3] # width += left and right - block_traversal = TensorBlockTraversal.Default - strides = primary_op.attrs["strides"] if npu_block_type != NpuBlockType.Pooling: if npu_block_type == NpuBlockType.ReduceSum: @@ -514,7 +517,7 @@ def performance_metrics_for_pass(arch, ps, block_config=None, rewrite_list=[], f macs[MacCount.NeuralNetworkMacs] += nn_ops macs[MacCount.HardwareMacs] += num_mac_ops - cycles[PassCycles.Dpu] = get_conv_pooling_cycle_estimate( + cycles[PassCycles.Npu] = estimate_conv_pooling_cycles( arch, npu_block_type, primary_op, @@ -532,7 +535,7 @@ def performance_metrics_for_pass(arch, ps, block_config=None, rewrite_list=[], f ) num_mac_ops = nn_macs - cycles[PassCycles.Dpu] = get_conv_pooling_cycle_estimate( + cycles[PassCycles.Npu] = estimate_conv_pooling_cycles( arch, npu_block_type, primary_op, @@ -558,9 +561,19 @@ def performance_metrics_for_pass(arch, ps, block_config=None, rewrite_list=[], f weight_read_multiple = non_zero_fraction elif npu_block_type == NpuBlockType.ElementWise: # Work out how many elements we have and calculate performance. - cycles[PassCycles.ElementWise] = get_output_cycle_estimate( + cycles[PassCycles.Npu] = estimate_output_cycles( arch, npu_block_type, primary_op, ofm_tensor.elements(), ps.ifm_tensor, ps.ofm_tensor, ps.ifm2_tensor ) + + prev_npu_pass = next((npu_ps for npu_ps in ps.dag_predecessors if npu_ps.placement is PassPlacement.Npu), None) + if prev_npu_pass is None: + # cycles for DMA ops in first pass + dma_ops = (op for op in ps.ops if op.type == Op.DMA) + for dma_op in dma_ops: + mem_area = dma_op.attrs["source"] + for tens in dma_op.inputs: + cycles[PassCycles.Npu] += tens.storage_size() / arch.memory_bandwidths_per_cycle[mem_area] + # apply the desired rewrites for rewrite_op, tens, _, _, _, ps_to_rewrite in rewrite_list: if ps != ps_to_rewrite: diff --git a/ethosu/vela/stats_writer.py b/ethosu/vela/stats_writer.py index 6fd68f85..3cd769f0 100644 --- a/ethosu/vela/stats_writer.py +++ b/ethosu/vela/stats_writer.py @@ -154,15 +154,14 @@ def write_pass_metrics_csv(nng, pass_filename): for mem_area in mem_areas_to_report(): for purpose, purpose_candidates in purpose_list: for direction, direction_candidates in direction_list: - label = "bytes_%s_%s_%s" % (mem_area.identifier_name(), purpose, direction) + label = "bytes_{}_{}_{}".format(mem_area.identifier_name(), purpose, direction) bandwidth_names.append(label) bandwidth_indices.append((mem_area, purpose_candidates, direction_candidates)) all_macs = MacCount.all() all_cycles = ( PassCycles.Total, - PassCycles.Dpu, - PassCycles.ElementWise, + PassCycles.Npu, PassCycles.Cpu, PassCycles.SramAccess, PassCycles.DramAccess, @@ -253,16 +252,16 @@ def print_performance_metrics_for_strat( if name: print("", file=f) print("Network summary for", name, file=f) - print("Accelerator configuration %20s" % (arch.accelerator_config,), file=f) - print("System configuration %20s" % (arch.system_config,), file=f) - print("Accelerator clock %12d MHz" % (arch.npu_clock / 1e6,), file=f) + print("Accelerator configuration {:20}".format(arch.accelerator_config), file=f) + print("System configuration {:20}".format(arch.system_config), file=f) + print("Accelerator clock {:12d} MHz".format(int(arch.npu_clock / 1e6)), file=f) for mem_area, label in mem_area_labels: print( - "Design peak %-25s %12.2f GB/s" - % (label + " bandwidth", arch.memory_bandwidths_per_second[mem_area] / 1000.0 / 1000 / 1000,), + "Design peak {:25} {:12.2f} GB/s".format( + label + " bandwidth", arch.memory_bandwidths_per_second[mem_area] / 1000.0 / 1000 / 1000 + ), file=f, ) - print(file=f) for mem_area, label in mem_area_labels: if mem_area not in memory_used: @@ -272,18 +271,19 @@ def print_performance_metrics_for_strat( extra = "" if (mem_area == MemArea.OnChipFlash or mem_area == MemArea.OffChipFlash) and bits_per_element is not None: - extra = " (%.2f bits per element)" % (bits_per_element[mem_area],) + extra = " ({:.2f} bits per element)".format(bits_per_element[mem_area]) - print("Total %-25s %12.2f KiB%s" % (aug_label, memory_used[mem_area] / 1024.0, extra), file=f) + print("Total {:25} {:12.2f} KiB{}".format(aug_label, memory_used[mem_area] / 1024.0, extra), file=f) print(file=f) - print("%d passes fused into %d" % (num_passes, num_cascaded_passes), file=f) + print("{:d} passes fused into {:d}".format(num_passes, num_cascaded_passes), file=f) n_cpu_operations = len(cpu_operations) if n_operations > 0: print( - "%d/%d (%4.1f %%) operations falling back to the CPU" - % (n_cpu_operations, n_operations, n_cpu_operations / n_operations * 100), + "{:d}/{:d} ({:4.1%}) operations falling back to the CPU".format( + n_cpu_operations, n_operations, n_cpu_operations / n_operations * 100 + ), file=f, ) @@ -294,8 +294,9 @@ def print_performance_metrics_for_strat( return " ".join(str(list(tens.shape)) for tens in lst) print( - "CPU operation: %s, inputs %s, outputs %s" - % (op.type, format_tens_list(op.inputs), format_tens_list(op.outputs)), + "CPU operation: {} inputs {}, outputs {}".format( + op.type, format_tens_list(op.inputs), format_tens_list(op.outputs) + ), file=f, ) @@ -308,38 +309,46 @@ def print_performance_metrics_for_strat( fm_bws = bws[TensorPurpose.FeatureMap] aug_label = label + " bandwidth" print( - "Average %-25s %12.2f GB/s" % (aug_label, total_bw * midpoint_fps / 1000.0 / 1000.0 / 1000.0,), + "Average {:25} {:12.2f} GB/s".format(aug_label, total_bw * midpoint_fps / 1000.0 / 1000.0 / 1000.0), file=f, ) print( - "Input %-25s %12.2f MB/batch" - % (aug_label, np.sum(fm_bws[BandwidthDirection.Read]) / 1000.0 / 1000.0,), + "Input {:25} {:12.2f} MB/batch".format( + aug_label, np.sum(fm_bws[BandwidthDirection.Read]) / 1000.0 / 1000.0 + ), file=f, ) - print("Weight %-25s %12.2f MB/batch" % (aug_label, np.sum(weight_bws) / 1000.0 / 1000.0,), file=f) + print("Weight {:25} {:12.2f} MB/batch".format(aug_label, np.sum(weight_bws) / 1000.0 / 1000.0), file=f) print( - "Output %-25s %12.2f MB/batch" - % (aug_label, np.sum(fm_bws[BandwidthDirection.Write]) / 1000.0 / 1000.0,), + "Output {:25} {:12.2f} MB/batch".format( + aug_label, np.sum(fm_bws[BandwidthDirection.Write]) / 1000.0 / 1000.0 + ), file=f, ) - print("Total %-25s %12.2f MB/batch" % (aug_label, total_bw / 1000.0 / 1000.0,), file=f) + print("Total {:25} {:12.2f} MB/batch".format(aug_label, total_bw / 1000.0 / 1000.0), file=f) print( - "Total %-25s per input %9.2f MB/inference (batch size %d)" - % (aug_label, total_bw / 1000.0 / 1000.0 / batch_size, batch_size), + "Total {:25} per input {:9.2f} MB/inference (batch size {:d})".format( + aug_label, total_bw / 1000.0 / 1000.0 / batch_size, batch_size + ), file=f, ) print(file=f) - print("Neural network macs %12d MACs/batch" % (macs[MacCount.NeuralNetworkMacs],), file=f) - print("Hardware macs %12d MACs/batch" % (macs[MacCount.HardwareMacs],), file=f) print( - "Network Tops/s %12.2f Tops/s" - % (macs[MacCount.NeuralNetworkMacs] * 2 * midpoint_fps / 1e12), + "Neural network macs {:12d} MACs/batch".format(int(macs[MacCount.NeuralNetworkMacs])), + file=f, + ) + print("Hardware macs {:12d} MACs/batch".format(int(macs[MacCount.HardwareMacs])), file=f) + print( + "Network Tops/s {:12.2f} Tops/s".format( + macs[MacCount.NeuralNetworkMacs] * 2 * midpoint_fps / 1e12 + ), file=f, ) print( - "Hardware Tops/s %12.2f Tops/s" - % (macs[MacCount.HardwareMacs] * 2 * midpoint_fps / 1e12), + "Hardware Tops/s {:12.2f} Tops/s".format( + macs[MacCount.HardwareMacs] * 2 * midpoint_fps / 1e12 + ), file=f, ) print(file=f) @@ -347,12 +356,13 @@ def print_performance_metrics_for_strat( for kind in PassCycles.all(): aug_label = kind.display_name() + " cycles" cyc = cycles[kind] - print("%-30s %12d cycles/batch" % (aug_label, cyc,), file=f) + print("{:30} {:12d} cycles/batch".format(aug_label, int(cyc)), file=f) print(file=f) print( - "Batch Inference time %7.2f ms, %7.2f inferences/s (batch size %d)" - % (midpoint_inference_time * 1000, midpoint_fps, batch_size), + "Batch Inference time {:7.2f} ms, {:7.2f} inferences/s (batch size {:d})".format( + midpoint_inference_time * 1000, midpoint_fps, batch_size + ), file=f, ) print(file=f) -- cgit v1.2.1