From 69aadd052588eb53a257e8f7431ed858161b3286 Mon Sep 17 00:00:00 2001
From: Diqing Zhong
Date: Tue, 8 Dec 2020 13:08:48 +0100
Subject: Vela: bandwidth calculation improvements

 - Combine conv and vector_product calculation
 - Remove internal bandwidth
 - Remove blocks and hw_macs from report
 - Use scaled_bws for cycle estimation

Related to: MLBEDSW-3598
Change-Id: I1927a8311ec563f68115e0f2ed077806b86fd717
Signed-off-by: Diqing Zhong
---
 ethosu/vela/npu_performance.py | 248 +++++++++--------------------------------
 ethosu/vela/scheduler.py       |   5 +-
 ethosu/vela/stats_writer.py    |  92 ++++++---------
 3 files changed, 89 insertions(+), 256 deletions(-)

diff --git a/ethosu/vela/npu_performance.py b/ethosu/vela/npu_performance.py
index 2d7a1b09..8ada1e23 100644
--- a/ethosu/vela/npu_performance.py
+++ b/ethosu/vela/npu_performance.py
@@ -90,22 +90,6 @@ class PassCycles(IntEnum):
         )


-class MacCount(IntEnum):
-    NeuralNetworkMacs = 0
-    HardwareMacs = auto()
-    Size = auto()
-
-    def display_name(self):
-        return ("Neural Network Macs", "Hardware Macs", "Size")[self.value]
-
-    def identifier_name(self):
-        return ("nn_macs", "hardware_macs", "size")[self.value]
-
-    @staticmethod
-    def all():
-        return (MacCount.NeuralNetworkMacs, MacCount.HardwareMacs)
-
-
 class BandwidthDirection(IntEnum):
     Read = 0
     Write = auto()
@@ -126,77 +110,18 @@ def make_bandwidth_array():
     return np.zeros((MemArea.Size, TensorPurpose.Size, BandwidthDirection.Size))


-def make_macs_array():
-    return np.zeros(MacCount.Size, np.int)
-
-
 def make_cycles_array():
     return np.zeros(PassCycles.Size)


 def make_metrics_arrays():
-    return (make_bandwidth_array(), make_macs_array(), make_cycles_array())
-
-
-def get_n_blocks_and_area(
-    ifm_brick_size, ifm_height_width, orig_skirt, clamped_skirt, block_config, min_block_size, strides
-):
-
-    ifm_block_config = (block_config[0] * strides[1], block_config[1] * strides[2])
-
-    n_normal_blocks = []
-    remainder_size = []
-    for i in range(2):
-        non_skirt_dim = ifm_height_width[i] - orig_skirt[i] - orig_skirt[2 + i]
-        n_blocks = non_skirt_dim // ifm_block_config[i]
-        n_normal_blocks.append(n_blocks)
-        remainder_dim = numeric_util.round_up(
-            ((non_skirt_dim - n_blocks * ifm_block_config[i] - 1) // strides[i + 1]) + 1, min_block_size[i]
-        )
-        remainder_size.append(remainder_dim)
-
-    # this will actually calculate reads into the edge padding.
-
-    # there are four cases in total, handling the edges that will not fill a complete block.
-
-    # 0000000001
-    # 0000000001
-    # 0000000001
-    # 0000000001
-    # 0000000001
-    # 0000000001
-    # 2222222223
-    total_blocks = 0
-    total_area = 0
-
-    block_setup = (
-        (n_normal_blocks[0] * n_normal_blocks[1], block_config),
-        (1 * n_normal_blocks[1], (remainder_size[0], block_config[1])),
-        (n_normal_blocks[0] * 1, (block_config[0], remainder_size[1])),
-        (1 * 1, remainder_size),
-    )
-
-    for n_blocks, block_size in block_setup:
-        if block_size[0] == 0 or block_size[1] == 0:
-            continue
-        read_dims = [0, 0]
-        for i in range(2):
-            read_dims[i] = (
-                numeric_util.round_up(clamped_skirt[i], ifm_brick_size[i + 1])
-                + block_size[i] * strides[i + 1]
-                + numeric_util.round_up(clamped_skirt[2 + i], ifm_brick_size[i + 1])
-            )
-        assert n_blocks >= 0
-        total_blocks += n_blocks
-        total_area += n_blocks * read_dims[0] * read_dims[1]
-    assert total_blocks >= 1
-    return total_blocks, total_area, block_setup
+    return (make_bandwidth_array(), 0, make_cycles_array())


 def get_ifm_block_depth(npu_block_type, ifm_depth, ifm_elemwidth, block_traversal, ofm_blk_depth):
     ifm_blk_depth = ofm_blk_depth

-    if npu_block_type == NpuBlockType.ConvolutionMxN or npu_block_type == NpuBlockType.ReduceSum:
+    if npu_block_type in (NpuBlockType.ConvolutionMxN, NpuBlockType.VectorProduct, NpuBlockType.ReduceSum):
         if ifm_elemwidth == 16 or block_traversal == TensorBlockTraversal.PartKernelFirst:
             ifm_blk_depth = 16
         elif ifm_elemwidth == 8:
@@ -213,11 +138,11 @@ def get_minimal_cmd_cycles(arch, ifm_tensor, ofm_tensor, ifm_blk: Block, ofm_blk
     ifm_tens_blk = Tensor((1, ifm_blk.height, ifm_blk.width, ifm_blk.depth), ifm_tensor.dtype, "ifm_blk")
     ofm_tens_blk = Tensor((1, ofm_blk.height, ofm_blk.width, ofm_blk.depth), ofm_tensor.dtype, "ofm_blk")
     cycles_ifm_blk = (
-        estimate_memory_bandwidth(arch, ifm_tensor.mem_area, BandwidthDirection.Read, ifm_tens_blk, ifm_blk)
+        estimate_memory_transfer_efficiency(arch, ifm_tensor.mem_area, BandwidthDirection.Read, ifm_tens_blk, ifm_blk)
         / arch.memory_bandwidths_per_cycle[ifm_tensor.mem_area]
     )
     cycles_ofm_blk = (
-        estimate_memory_bandwidth(arch, ofm_tensor.mem_area, BandwidthDirection.Write, ofm_tens_blk, ofm_blk)
+        estimate_memory_transfer_efficiency(arch, ofm_tensor.mem_area, BandwidthDirection.Write, ofm_tens_blk, ofm_blk)
         / arch.memory_bandwidths_per_cycle[ofm_tensor.mem_area]
     )
     return (
@@ -449,7 +374,7 @@ def estimate_conv_pooling_cycles(
     return total_cycles


-def estimate_memory_bandwidth(arch, mem_area, direction, tensor, block_size: Block, replace_bw=None):
+def estimate_memory_transfer_efficiency(arch, mem_area, direction, tensor, block_size: Block, replace_bw=None):
     if tensor.format not in (TensorFormat.NHWC, TensorFormat.NHCWB16):
         return tensor.bandwidth() if replace_bw is None else replace_bw

@@ -493,18 +418,15 @@ def performance_metrics_for_pass(arch, ps, block_config=None, rewrite_list=None,
     if block_config is None:
         block_config = ps.block_config
     bws = make_bandwidth_array()
-    macs = make_macs_array()
+    scaled_bws = make_bandwidth_array()  # scaled bw with memory transfer efficiency
+    macs = 0
     cycles = make_cycles_array()
-    blocks = 0
     ifm_read_multiple = 1
     weight_read_multiple = 0

     if ps.placement in (PassPlacement.MemoryOnly, PassPlacement.StartupInit):
-        return bws, macs, cycles, blocks, ifm_read_multiple, weight_read_multiple  # nothing real happening in this pass
-
-    min_block_size = arch.min_block_sizes[ps.npu_block_type]
+        return bws, macs, cycles, ifm_read_multiple, weight_read_multiple  # nothing real happening in this pass

-    skirt = (0, 0, 0, 0)
     explicit_padding = (0, 0, 0, 0)
     primary_op = ps.primary_op
     replacement_read_bws = {}
@@ -512,13 +434,13 @@ def performance_metrics_for_pass(arch, ps, block_config=None, rewrite_list=None,
     ifm_block = Block(block_config[1], block_config[0], block_config[3])

     if ps.placement == PassPlacement.Npu and primary_op:
-        skirt = primary_op.attrs.get("skirt", skirt)
         explicit_padding = primary_op.attrs.get("explicit_padding", explicit_padding)
         assert primary_op.type.npu_block_type == ps.npu_block_type
         npu_block_type = primary_op.type.npu_block_type

         ifm_tensor, _, weight_tensor, ofm_tensor = ps.get_primary_op_ifm_ifm2_weights_ofm()
         ifm_tensor_shape = numeric_util.full_shape(4, ifm_tensor.shape, 1)
+        ofm_tensor_shape = numeric_util.full_shape(4, ofm_tensor.shape, 1)

         if npu_block_type == NpuBlockType.ReduceSum:
             block_traversal = TensorBlockTraversal.DepthFirst
@@ -540,21 +462,17 @@ def performance_metrics_for_pass(arch, ps, block_config=None, rewrite_list=None,
         if npu_block_type in (
             NpuBlockType.ConvolutionMxN,
             NpuBlockType.ConvolutionDepthWise,
+            NpuBlockType.VectorProduct,
             NpuBlockType.Pooling,
             NpuBlockType.ReduceSum,
         ):
             # extent the ifm to full dimension
-            ifm_tensor_brick_size = tuple(numeric_util.full_shape(4, list(ifm_tensor.brick_size), 1))
-            ifm_tensor_bandwidth_shape = numeric_util.full_shape(4, ifm_tensor.bandwidth_shape, 1)
-
             batch_size = ifm_tensor_shape[0]
-            ifm_depth = ifm_tensor_bandwidth_shape[3]

             # add in padding
             ifm_tensor_shape[1] += explicit_padding[0] + explicit_padding[2]  # height += top and bottom
             ifm_tensor_shape[2] += explicit_padding[1] + explicit_padding[3]  # width += left and right

-            strides = primary_op.attrs["strides"]
             if npu_block_type != NpuBlockType.Pooling:
                 if npu_block_type == NpuBlockType.ReduceSum:
                     weight_tensor_shape = [1, 1, ifm_tensor.shape[3], ofm_tensor.shape[3]]
@@ -562,14 +480,16 @@ def performance_metrics_for_pass(arch, ps, block_config=None, rewrite_list=None,
                     weight_tensor_element_size = 0
                     weight_tensor_bandwidth_compression_scale = 0.0
                 else:
-                    weight_tensor_shape = weight_tensor.shape
-                    weight_tensor_bandwidth_shape = weight_tensor.bandwidth_shape
+                    # For Vector product, weight format of IO is extended to HWIO, with H=W=1
+                    weight_tensor_shape = numeric_util.full_shape(4, weight_tensor.shape, 1)
+                    weight_tensor_bandwidth_shape = numeric_util.full_shape(4, weight_tensor.bandwidth_shape, 1)
                     weight_tensor_element_size = weight_tensor.element_size()
                     weight_tensor_bandwidth_compression_scale = weight_tensor.bandwidth_compression_scale
+
                 nn_ops = (
-                    int(ofm_tensor.shape[0])
-                    * int(ofm_tensor.shape[1])
-                    * int(ofm_tensor.shape[2])
+                    int(ofm_tensor_shape[0])
+                    * int(ofm_tensor_shape[1])
+                    * int(ofm_tensor_shape[2])
                     * int(weight_tensor_shape[0])
                     * int(weight_tensor_shape[1])
                     * int(weight_tensor_shape[2])
@@ -595,72 +515,25 @@ def performance_metrics_for_pass(arch, ps, block_config=None, rewrite_list=None,
                 n_sub_kernels_x = numeric_util.round_up_divide(kernel_dims[1], sub_kernel_limits[1])
                 n_sub_kernels = n_sub_kernels_y * n_sub_kernels_x

-                clamped_skirt = list(skirt)
-                clamped_skirt[2] = min(clamped_skirt[2], sub_kernel_limits[0] - 1 - clamped_skirt[0])
-                clamped_skirt[3] = min(clamped_skirt[3], sub_kernel_limits[1] - 1 - clamped_skirt[1])
-                n_blocks, area, block_setup = get_n_blocks_and_area(
-                    ifm_tensor_brick_size,
-                    ifm_tensor_shape[1:3],
-                    skirt,
-                    clamped_skirt,
-                    block_config,
-                    min_block_size,
-                    strides,
-                )
-
-                blocks = n_blocks * numeric_util.round_up_divide(weight_tensor_shape[3], ofm_block.depth)
-
-                n_weight_stages = numeric_util.round_up_divide(weight_tensor_bandwidth_shape[3], ofm_block.depth)
-                if npu_block_type == NpuBlockType.ConvolutionDepthWise or npu_block_type == NpuBlockType.Pooling:
-                    n_weight_stages = 1  # force to no reread
+                n_full_depth_stages = numeric_util.round_up_divide(weight_tensor_bandwidth_shape[3], ofm_block.depth)
+                if npu_block_type in (NpuBlockType.ConvolutionDepthWise, NpuBlockType.Pooling):
+                    n_full_depth_stages = 1  # force to no reread

-                ifm_tensor_bw = (
-                    n_sub_kernels
-                    * batch_size
-                    * area
-                    * ifm_depth
-                    * n_weight_stages
-                    * ifm_tensor.element_size()
-                    * ifm_tensor.bandwidth_compression_scale
-                )
-                replacement_read_bws[ifm_tensor] = ifm_tensor_bw
-                ifm_read_multiple = n_weight_stages
+                ifm_read_multiple = n_sub_kernels * n_full_depth_stages
+                replacement_read_bws[ifm_tensor] = ifm_tensor.bandwidth() * ifm_read_multiple
+                weight_read_multiple = numeric_util.round_up_divide(
+                    ofm_tensor_shape[1], ofm_block.height
+                ) * numeric_util.round_up_divide(ofm_tensor_shape[2], ofm_block.width)
                 replacement_read_bws[weight_tensor] = (
                     batch_size
                     * shape_num_elements(weight_tensor_bandwidth_shape)
                     * weight_tensor_element_size
                     * weight_tensor_bandwidth_compression_scale
-                    * n_blocks
-                )  # read once per block and batch
-                weight_read_multiple = n_blocks
-
-                n_kernel_xy = kernel_dims[0] * kernel_dims[1]
-                n_input_channels_at_a_time = block_config[2]
-
-                if (npu_block_type == NpuBlockType.Pooling) or (
-                    block_traversal in (TensorBlockTraversal.PartKernelFirst, TensorBlockTraversal.DepthWise)
-                ):
-                    n_input_channels_at_a_time = numeric_util.round_up_divide(n_input_channels_at_a_time, 4)
-                    n_kernel_xy = max(
-                        n_kernel_xy, 4
-                    )  # need at least 4, as this is the minimum duty cycle for secondary accumulator writes
-                    if weight_tensor is not None:
-                        n_kernel_xy = numeric_util.round_up(n_kernel_xy, 4)  # weights need to be read in blocks of 4
-
-                num_mac_ops = 0
-                for n_blocks_for_size, block_size in block_setup:
-                    num_mac_ops += (
-                        batch_size
-                        * n_blocks_for_size
-                        * block_size[0]
-                        * block_size[1]
-                        * numeric_util.round_up(weight_tensor_shape[2], n_input_channels_at_a_time)
-                        * numeric_util.round_up(weight_tensor_shape[3], ofm_block.depth)
-                        * n_kernel_xy
-                    )
-                macs[MacCount.NeuralNetworkMacs] += nn_ops
-                macs[MacCount.HardwareMacs] += num_mac_ops
+                    * weight_read_multiple
+                )
+
+                macs += nn_ops

             cycles[PassCycles.Npu] = estimate_conv_pooling_cycles(
                 arch,
                 npu_block_type,
@@ -673,31 +546,6 @@ def performance_metrics_for_pass(arch, ps, block_config=None, rewrite_list=None,
                 ofm_tensor,
                 ps.scale_tensor,
             )
-        elif npu_block_type == NpuBlockType.VectorProduct:
-            nn_macs = (
-                ifm_tensor.shape[0]
-                * numeric_util.round_up(weight_tensor.shape[-2], block_config[2])
-                * numeric_util.round_up(weight_tensor.shape[-1], block_config[3])
-            )
-            num_mac_ops = nn_macs
-
-            cycles[PassCycles.Npu] = estimate_conv_pooling_cycles(
-                arch, npu_block_type, primary_op, ifm_block, ofm_block, block_traversal, [1, 1], ifm_tensor, ofm_tensor,
-            )
-            macs[MacCount.NeuralNetworkMacs] += nn_macs
-            macs[MacCount.HardwareMacs] += num_mac_ops
-
-            blocks = 1 * numeric_util.round_up_divide(weight_tensor.shape[-1], ofm_block.depth)
-
-            non_zero_fraction = 1.0
-            if ifm_tensor.values is not None:
-                nz_vector = np.amax(ifm_tensor.values != 0, axis=0)  # max across batch axis
-                non_zero_fraction = np.average(nz_vector)
-
-            replacement_read_bws[ifm_tensor] = ifm_tensor.bandwidth()
-            replacement_read_bws[weight_tensor] = weight_tensor.bandwidth() * non_zero_fraction
-            ifm_read_multiple = 1
-            weight_read_multiple = non_zero_fraction
         elif npu_block_type == NpuBlockType.ElementWise:
             # Work out how many elements we have and calculate performance.
             cycles[PassCycles.Npu] = estimate_output_cycles(
@@ -729,8 +577,9 @@ def performance_metrics_for_pass(arch, ps, block_config=None, rewrite_list=None,
         if rewrite_op == SchedulerRewrite.Nop:
             pass  # these are fine, no bandwidth changes
         elif rewrite_op in (SchedulerRewrite.ChangeTensorSubPurpose,):
+            bws[arch.fast_storage_mem_area][tens.purpose][BandwidthDirection.Read] += replacement_read_bws[tens]
             if tens.purpose == TensorPurpose.FeatureMap:
-                bw = estimate_memory_bandwidth(
+                scaled_bw = estimate_memory_transfer_efficiency(
                     arch,
                     arch.fast_storage_mem_area,
                     BandwidthDirection.Read,
@@ -739,22 +588,27 @@ def performance_metrics_for_pass(arch, ps, block_config=None, rewrite_list=None,
                     replacement_read_bws[tens],
                 )
             else:
-                bw = replacement_read_bws[tens]
-            bws[arch.fast_storage_mem_area][tens.purpose][BandwidthDirection.Read] += bw
+                scaled_bw = replacement_read_bws[tens]
+            scaled_bws[arch.fast_storage_mem_area][tens.purpose][BandwidthDirection.Read] += scaled_bw
             replacement_read_bws[tens] = 0

     for tens in ps.outputs:
         if force_outputs_to_fast_storage:
-            bws[arch.fast_storage_mem_area][tens.purpose][BandwidthDirection.Write] += estimate_memory_bandwidth(
+            bws[arch.fast_storage_mem_area][tens.purpose][BandwidthDirection.Write] += tens.bandwidth()
+            scaled_bws[arch.fast_storage_mem_area][tens.purpose][
+                BandwidthDirection.Write
+            ] += estimate_memory_transfer_efficiency(
                 arch, arch.fast_storage_mem_area, BandwidthDirection.Write, tens, ofm_block
             )
         else:
-            bws[tens.mem_area][tens.purpose][BandwidthDirection.Write] += estimate_memory_bandwidth(
+            bws[tens.mem_area][tens.purpose][BandwidthDirection.Write] += tens.bandwidth()
+            scaled_bws[tens.mem_area][tens.purpose][BandwidthDirection.Write] += estimate_memory_transfer_efficiency(
                 arch, tens.mem_area, BandwidthDirection.Write, tens, ofm_block
            )

     for tens in ps.intermediates:
         bws[tens.mem_area][tens.purpose][BandwidthDirection.Write] += tens.bandwidth()
+        scaled_bws[tens.mem_area][tens.purpose][BandwidthDirection.Write] += tens.bandwidth()

         if tens in replacement_read_bws:
             bw = replacement_read_bws[tens]
@@ -762,16 +616,23 @@ def performance_metrics_for_pass(arch, ps, block_config=None, rewrite_list=None,
             bw = tens.bandwidth()

         bws[tens.mem_area][tens.purpose][BandwidthDirection.Read] += bw
+        scaled_bws[tens.mem_area][tens.purpose][BandwidthDirection.Read] += bw

     for tens in ps.inputs:
-        bws[tens.mem_area][tens.purpose][BandwidthDirection.Read] += estimate_memory_bandwidth(
-            arch, tens.mem_area, BandwidthDirection.Read, tens, ifm_block, replacement_read_bws.get(tens)
+        if tens in replacement_read_bws:
+            bw = replacement_read_bws[tens]
+        else:
+            bw = tens.bandwidth()
+
+        bws[tens.mem_area][tens.purpose][BandwidthDirection.Read] += bw
+        scaled_bws[tens.mem_area][tens.purpose][BandwidthDirection.Read] += estimate_memory_transfer_efficiency(
+            arch, tens.mem_area, BandwidthDirection.Read, tens, ifm_block, bw
         )

     # quick build access counts for only current pass, even though these aren't the final numbers
-    update_summary_cycles(arch, bws, cycles)
+    update_summary_cycles(arch, scaled_bws, cycles)

-    return bws, macs, cycles, blocks, ifm_read_multiple, weight_read_multiple
+    return bws, macs, cycles, ifm_read_multiple, weight_read_multiple


 def update_summary_cycles(arch, bws, cycles):
@@ -794,15 +655,14 @@ def collate_stats_for_cascaded_pass(arch, bws, macs, cycles):

 def performance_for_cascaded_pass(arch, cps):
     total_bws = make_bandwidth_array()
-    total_macs = make_macs_array()
+    total_macs = 0
     total_cycles = make_cycles_array()

     for ps in cps.passes:
-        bws, macs, cycles, blocks, _, _ = performance_metrics_for_pass(arch, ps)
+        bws, macs, cycles, _, _ = performance_metrics_for_pass(arch, ps)
         ps.bandwidths = bws
         ps.macs = macs
         ps.cycles = cycles
-        ps.n_blocks = blocks
         total_bws += bws
         total_macs += macs
         total_cycles += cycles
@@ -816,7 +676,7 @@ def calc_performance_for_network(nng, arch):
     total_bws = make_bandwidth_array()
-    total_macs = np.zeros(MacCount.Size)
+    total_macs = 0
     total_cycles = np.zeros(PassCycles.Size)

     for sg in nng.subgraphs:
diff --git a/ethosu/vela/scheduler.py b/ethosu/vela/scheduler.py
index 977eb58e..2c10640b 100644
--- a/ethosu/vela/scheduler.py
+++ b/ethosu/vela/scheduler.py
@@ -32,7 +32,6 @@ from .nn_graph import SchedulerRewrite
 from .nn_graph import SchedulingStrategy
 from .npu_performance import make_bandwidth_array
 from .npu_performance import make_cycles_array
-from .npu_performance import make_macs_array
 from .npu_performance import make_metrics_arrays
 from .npu_performance import PassCycles
 from .numeric_util import full_shape
@@ -108,7 +107,7 @@ class Strategy:
             return False
         if (self.bws != other.bws).any():
             return False
-        if (self.macs != other.macs).any():
+        if self.macs != other.macs:
             return False
         if (self.cycles != other.cycles).any():
             return False
@@ -211,7 +210,7 @@ class StrategySet:


 empty_strategy = Strategy(
-    SchedulingStrategy.Unknown, None, [], [], [], make_bandwidth_array(), make_macs_array(), make_cycles_array(), 0
+    SchedulingStrategy.Unknown, None, [], [], [], make_bandwidth_array(), 0, make_cycles_array(), 0
 )

 INFINITY = 1e30
diff --git a/ethosu/vela/stats_writer.py b/ethosu/vela/stats_writer.py
index 494b25e7..02d95d81 100644
--- a/ethosu/vela/stats_writer.py
+++ b/ethosu/vela/stats_writer.py
@@ -22,7 +22,6 @@ import numpy as np

 from .nn_graph import PassPlacement
 from .npu_performance import BandwidthDirection
-from .npu_performance import MacCount
 from .npu_performance import PassCycles
 from .numeric_util import round_up_to_int
 from .operation import Op
@@ -70,7 +69,7 @@ def write_summary_metrics_csv(nng, summary_filename, arch):
             mem_area.identifier_name() + "_total_bytes",
         ]

-    labels += ["nn_macs", "hardware_macs", "nn_tops", "hardware_tops"]
+    labels += ["nn_macs", "nn_tops"]

     labels += ["cycles_" + kind.identifier_name() for kind in PassCycles.all()]
@@ -128,10 +127,8 @@ def write_summary_metrics_csv(nng, summary_filename, arch):
         ]

         data_items += [
-            nng.macs[MacCount.NeuralNetworkMacs],
-            nng.macs[MacCount.HardwareMacs],
-            nng.macs[MacCount.NeuralNetworkMacs] * 2 * midpoint_fps / 1e12,
-            nng.macs[MacCount.HardwareMacs] * 2 * midpoint_fps / 1e12,
+            nng.macs,
+            nng.macs * 2 * midpoint_fps / 1e12,
         ]

         data_items += [nng.cycles[kind] for kind in PassCycles.all()]
@@ -164,7 +161,6 @@ def write_pass_metrics_csv(nng, pass_filename):
                 bandwidth_names.append(label)
                 bandwidth_indices.append((mem_area, purpose_candidates, direction_candidates))

-    all_macs = MacCount.all()
     all_cycles = (
         PassCycles.Total,
         PassCycles.Npu,
@@ -183,10 +179,9 @@ def write_pass_metrics_csv(nng, pass_filename):
             "block_config_width",
             "block_config_input_channels",
             "block_config_output_channels",
-            "n_blocks_in_pass",
         ]
         + ["cycles_" + v.identifier_name() for v in all_cycles]
-        + [v.identifier_name() for v in all_macs]
+        + ["nn_macs"]
         + bandwidth_names
         + ["sram_used"]
     )
@@ -205,9 +200,8 @@ def write_pass_metrics_csv(nng, pass_filename):
             stats += [ps.placement.name]
             stats += [cps.strategy.name]
             stats += list(ps.block_config)
-            stats += [ps.n_blocks]
             stats += [round_up_to_int(ps.cycles[v]) for v in all_cycles]
-            stats += [round_up_to_int(ps.macs[v]) for v in all_macs]
+            stats += [round_up_to_int(ps.macs)]
             for indices in bandwidth_indices:
                 res = 0
                 i = indices[0]
@@ -256,17 +250,16 @@ def print_performance_metrics_for_strat(

     if name:
         print("", file=f)
-        print("Network summary for", name, file=f)
-    print("Accelerator configuration {:>20}".format(arch.accelerator_config.name), file=f)
-    print("System configuration {:>20}".format(arch.system_config), file=f)
-    print("Memory mode {:>20}".format(arch.memory_mode), file=f)
-    print("Accelerator clock {:12d} MHz".format(int(arch.core_clock / 1e6)), file=f)
+        print(f"Network summary for {name}", file=f)
+    print(f"Accelerator configuration {arch.accelerator_config.name:>20}", file=f)
+    print(f"System configuration {arch.system_config:>20}", file=f)
+    print(f"Memory mode {arch.memory_mode:>20}", file=f)
+    print(f"Accelerator clock {int(arch.core_clock / 1e6):12d} MHz", file=f)
     for mem_area, label in mem_area_labels:
+        label += " bandwidth"
+        bandwidth = arch.memory_bandwidths_per_second[mem_area] / 1000.0 / 1000 / 1000
         print(
-            "Design peak {:25} {:12.2f} GB/s".format(
-                label + " bandwidth", arch.memory_bandwidths_per_second[mem_area] / 1000.0 / 1000 / 1000
-            ),
-            file=f,
+            f"Design peak {label:25} {bandwidth:12.2f} GB/s", file=f,
         )
     print(file=f)
     for mem_area, label in mem_area_labels:
@@ -277,12 +270,12 @@ def print_performance_metrics_for_strat(
         extra = ""
         if (mem_area == MemArea.OnChipFlash or mem_area == MemArea.OffChipFlash) and bits_per_element is not None:
-            extra = " ({:.2f} bits per element)".format(bits_per_element[mem_area])
+            extra = f" ({bits_per_element[mem_area]:.2f} bits per element)"

-        print("Total {:25} {:12.2f} KiB{}".format(aug_label, memory_used[mem_area] / 1024.0, extra), file=f)
+        print(f"Total {aug_label:25} {memory_used[mem_area] / 1024.0:12.2f} KiB{extra}", file=f)
     print(file=f)

-    print("{:d} passes fused into {:d}".format(num_passes, num_cascaded_passes), file=f)
+    print(f"{num_passes:d} passes fused into {num_cascaded_passes:d}", file=f)

     if cpu_operations is None:
         cpu_operations = []
@@ -290,9 +283,8 @@ def print_performance_metrics_for_strat(
     n_cpu_operations = len(cpu_operations)
     if n_operations > 0:
         print(
-            "{:d}/{:d} ({:4.1%}) operations falling back to the CPU".format(
-                n_cpu_operations, n_operations, n_cpu_operations / n_operations * 100
-            ),
+            f"{n_cpu_operations:d}/{n_operations:d}"
+            f" ({n_cpu_operations / n_operations * 100:4.1%}) operations falling back to the CPU",
             file=f,
         )
@@ -303,9 +295,8 @@ def print_performance_metrics_for_strat(
             return " ".join(str(list(tens.shape)) for tens in lst)

         print(
-            "CPU operation: {} inputs {}, outputs {}".format(
-                op.type, format_tens_list(op.inputs), format_tens_list(op.outputs)
-            ),
+            f"CPU operation: {op.type}"
+            f" inputs {format_tens_list(op.inputs)}, outputs {format_tens_list(op.outputs)}",
             file=f,
         )
@@ -318,60 +309,43 @@ def print_performance_metrics_for_strat(
         fm_bws = bws[TensorPurpose.FeatureMap]
         aug_label = label + " bandwidth"
         print(
-            "Average {:25} {:12.2f} GB/s".format(aug_label, total_bw * midpoint_fps / 1000.0 / 1000.0 / 1000.0),
-            file=f,
+            f"Average {aug_label:25} {total_bw * midpoint_fps / 1000.0 / 1000.0 / 1000.0:12.2f} GB/s", file=f,
         )
         print(
-            "Input {:25} {:12.2f} MB/batch".format(
-                aug_label, np.sum(fm_bws[BandwidthDirection.Read]) / 1000.0 / 1000.0
-            ),
+            f"Input {aug_label:25} {np.sum(fm_bws[BandwidthDirection.Read]) / 1000.0 / 1000.0:12.2f} MB/batch",
             file=f,
         )
-        print("Weight {:25} {:12.2f} MB/batch".format(aug_label, np.sum(weight_bws) / 1000.0 / 1000.0), file=f)
+        print(f"Weight {aug_label:25} {np.sum(weight_bws) / 1000.0 / 1000.0:12.2f} MB/batch", file=f)
         print(
-            "Output {:25} {:12.2f} MB/batch".format(
-                aug_label, np.sum(fm_bws[BandwidthDirection.Write]) / 1000.0 / 1000.0
-            ),
+            f"Output {aug_label:25} "
+            f"{np.sum(fm_bws[BandwidthDirection.Write]) / 1000.0 / 1000.0:12.2f} MB/batch",
             file=f,
         )
-        print("Total {:25} {:12.2f} MB/batch".format(aug_label, total_bw / 1000.0 / 1000.0), file=f)
+        print(f"Total {aug_label:25} {total_bw / 1000.0 / 1000.0:12.2f} MB/batch", file=f)
         print(
-            "Total {:25} per input {:9.2f} MB/inference (batch size {:d})".format(
-                aug_label, total_bw / 1000.0 / 1000.0 / batch_size, batch_size
-            ),
+            f"Total {aug_label:25} per input "
+            f"{total_bw / 1000.0 / 1000.0 / batch_size:9.2f} MB/inference (batch size {batch_size:d})",
             file=f,
         )
         print(file=f)

     print(
-        "Neural network macs {:12d} MACs/batch".format(int(macs[MacCount.NeuralNetworkMacs])),
-        file=f,
+        f"Neural network macs {int(macs):12d} MACs/batch", file=f,
     )
-    print("Hardware macs {:12d} MACs/batch".format(int(macs[MacCount.HardwareMacs])), file=f)
     print(
-        "Network Tops/s {:12.2f} Tops/s".format(
-            macs[MacCount.NeuralNetworkMacs] * 2 * midpoint_fps / 1e12
-        ),
-        file=f,
-    )
-    print(
-        "Hardware Tops/s {:12.2f} Tops/s".format(
-            macs[MacCount.HardwareMacs] * 2 * midpoint_fps / 1e12
-        ),
-        file=f,
+        f"Network Tops/s {macs * 2 * midpoint_fps / 1e12:12.2f} Tops/s", file=f,
     )
     print(file=f)

     for kind in PassCycles.all():
         aug_label = kind.display_name() + " cycles"
         cyc = cycles[kind]
-        print("{:30} {:12d} cycles/batch".format(aug_label, int(cyc)), file=f)
+        print(f"{aug_label:30} {int(cyc):12d} cycles/batch", file=f)
     print(file=f)

     print(
-        "Batch Inference time {:7.2f} ms, {:7.2f} inferences/s (batch size {:d})".format(
-            midpoint_inference_time * 1000, midpoint_fps, batch_size
-        ),
+        f"Batch Inference time {midpoint_inference_time * 1000:7.2f} ms,"
+        f" {midpoint_fps:7.2f} inferences/s (batch size {batch_size:d})",
         file=f,
     )
     print(file=f)
--
cgit v1.2.1