Diffstat (limited to 'ethosu/vela/npu_performance.py')
-rw-r--r-- | ethosu/vela/npu_performance.py | 977
1 file changed, 502 insertions(+), 475 deletions(-)
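The patch below replaces the old pass-based estimators (`performance_metrics_for_pass`, `estimate_conv_pooling_cycles`, `get_minimal_cmd_cycles`, ...) with a query-driven API built around `PerformanceQuery`, `CycleCost` and `ElementAccess`. A minimal sketch of how that API could be driven outside the scheduler is shown here for orientation; the helper function, the SRAM placement and the example shapes are illustrative assumptions, and only the class and function names are taken from the patch.

```python
# Sketch only: shapes, memory placement and this helper are illustrative.
# `arch` is an ArchitectureFeatures instance and `block_config` a populated
# ArchitectureBlockConfig; both are normally produced by the Vela scheduler.
from ethosu.vela.architecture_features import NpuBlockType
from ethosu.vela.npu_performance import (
    PerformanceQuery,
    measure_cycle_cost,
    measure_element_access,
)
from ethosu.vela.operation import Kernel
from ethosu.vela.shape4d import Shape4D
from ethosu.vela.tensor import MemArea


def estimate_single_conv(arch, block_config, op_type, faf_type=None):
    query = PerformanceQuery(NpuBlockType.ConvolutionMxN)
    query.ifm_shape = Shape4D(1, 56, 56, 32)   # example feature-map shapes
    query.ofm_shape = Shape4D(1, 56, 56, 64)
    query.ifm_bits = query.ofm_bits = 8
    query.ifm_memory_area = query.ofm_memory_area = MemArea.Sram
    query.kernel = Kernel(3, 3)                 # 3x3 convolution window
    query.config = block_config                 # ifm/ofm block, accumulator type, etc.

    cycles = measure_cycle_cost(arch, op_type, faf_type, query)   # op_macs / op_cycles
    access = measure_element_access(arch, query)                  # raw element counts
    return cycles, access
```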
diff --git a/ethosu/vela/npu_performance.py b/ethosu/vela/npu_performance.py index c83f8f52..b1dae4e0 100644 --- a/ethosu/vela/npu_performance.py +++ b/ethosu/vela/npu_performance.py @@ -19,45 +19,28 @@ # # Called during scheduling to evaluate different proposals, as well as post-scheduling to provide a final performance # estimate. +import copy from enum import auto from enum import IntEnum import numpy as np from . import numeric_util +from .architecture_allocator import ArchitectureBlockConfig from .architecture_features import Accelerator -from .architecture_features import Block -from .data_type import DataType -from .nn_graph import PassPlacement -from .nn_graph import SchedulerRewrite -from .operation import NpuBlockType +from .architecture_features import NpuBlockType +from .architecture_features import SHRAMElements +from .architecture_features import TensorFormat +from .numeric_util import round_up +from .operation import Kernel from .operation import Op -from .shared_buffer_allocation import is_acc_40bits_used +from .scheduler import Schedule +from .scheduler import SchedulerOperation +from .shape4d import Shape4D from .tensor import BandwidthDirection from .tensor import MemArea -from .tensor import shape_num_elements -from .tensor import Tensor -from .tensor import TensorBlockTraversal -from .tensor import TensorFormat from .tensor import TensorPurpose - - -def rolling_buffer_dims_from_passes(arch, ps1, block_config_ps1, ps2, block_config_ps2): - ofm_block = Block(block_config_ps2[-3], block_config_ps2[-4], block_config_ps2[-1]) - kernel = ps2.primary_op.kernel - - if ps2.npu_block_type in (NpuBlockType.ConvolutionMxN, NpuBlockType.VectorProduct): - op = ps2.primary_op - ifm_block_depth = arch.calc_ifm_block_depth(op.ifm_shapes[0].depth, op.ifm.dtype.size_in_bits()) - else: - ifm_block_depth = block_config_ps2[-1] - - ifm_block = arch.get_ifm_block_size(ifm_block_depth, ofm_block, kernel, arch.ofm_block_max) - - # The performed height calculation is for worst case - height = numeric_util.round_up(ifm_block.height + block_config_ps1[0], block_config_ps1[0]) - width = ifm_block.width - return [height, width] +from .weight_compressor import WeightKey class PassCycles(IntEnum): @@ -91,82 +74,173 @@ class PassCycles(IntEnum): ) -def make_bandwidth_array(): - return np.zeros((MemArea.Size, TensorPurpose.Size, BandwidthDirection.Size)) - - -def make_cycles_array(): - return np.zeros(PassCycles.Size) - - -def make_metrics_arrays(): - return (make_bandwidth_array(), 0, make_cycles_array()) - +class PerformanceQuery: + def __init__(self, npu_block_type=0): + self.npu_block_type = npu_block_type + self.ifm_shape = Shape4D(0) + self.ifm_format = TensorFormat.NHWC + self.ifm_memory_area = MemArea.Unknown + self.ifm2_memory_area = MemArea.Unknown + self.ifm_bits = 0 + self.ifm2_bits = 0 + self.ifm2_shape = None + self.ifm2_format = TensorFormat.NHWC + self.ofm_shape = Shape4D(0) + self.ofm_format = TensorFormat.NHWC + self.ofm_memory_area = MemArea.Unknown + self.ofm_bits = 0 + self.const_shape = Shape4D(0) + self.const_memory_area = MemArea.Unknown + self.kernel = Kernel(1, 1) + self.config = ArchitectureBlockConfig() + + +class CycleCost: + def __init__(self): + self.op_macs = 0 + self.op_cycles = 0 + + def __mul__(self, scale): + out = CycleCost() + out.op_macs = self.op_macs * scale + out.op_cycles = self.op_cycles * scale + return out + + def __iadd__(self, rhs): + self.op_macs += rhs.op_macs + self.op_cycles += rhs.op_cycles + return self + + def __str__(self): + return "macs = {}, 
cycles = {}".format(self.op_macs, self.op_cycles) + + +class ElementAccess: + def __init__(self): + # List of ONLY element access counts, consumers + # need to scale these values by the correct bitwidths + # to calculated memory bandwidth + self.ifm_read = [0, 0] # ifm1, ifm2 + self.ofm_write = 0 + self.weights_refetch = 0 + self.const_read = [0, 0] # weights, scales + + def __mul__(self, scale): + out = ElementAccess() + out.ifm_read[0] = self.ifm_read[0] * scale + out.ifm_read[1] = self.ifm_read[1] * scale + out.ofm_write = self.ofm_write * scale + out.weights_refetch = self.weights_refetch * scale + out.const_read[0] = self.const_read[0] * scale + out.const_read[1] = self.const_read[1] * scale + return out + + def __iadd__(self, rhs): + self.ifm_read[0] += rhs.ifm_read[0] + self.ifm_read[1] += rhs.ifm_read[1] + self.ofm_write += rhs.ofm_write + self.weights_refetch += rhs.weights_refetch + self.const_read[0] += rhs.const_read[0] + self.const_read[1] += rhs.const_read[1] + return self + + def __str__(self): + return "ifm read = {}, ofm write = {}, const read={}".format(self.ifm_read, self.ofm_write, self.const_read) + + +def _strides_for_shape(shape: Shape4D, format: TensorFormat, element_bits): + if format == TensorFormat.NHWC: + strides = [0, 0, 0, 0] + strides[3] = element_bits / 8 # +Z + strides[2] = (element_bits * shape.depth) // 8 # +X + strides[1] = (element_bits * shape.depth * shape.width) // 8 # +Y + strides[0] = (element_bits * shape.depth * shape.width * shape.height) // 8 # +N + elif format == TensorFormat.NHCWB16: + strides = [0, 0, 0, 0, 0] + strides[4] = element_bits / 8 # +Z + strides[3] = (element_bits * 16) / 8 # +X + strides[2] = (element_bits * 16 * shape.width) / 8 # +C + strides[1] = (element_bits * shape.width * shape.depth) / 8 # +Y + strides[0] = (element_bits * shape.width * shape.depth) / 8 # +N + + return strides + + +def _estimate_memory_transfer_efficiency( + arch, is_read, mem_area, format: TensorFormat, element_bits, block_size, shape4D, to_transfer +): + burst_len = 8 -def get_ifm_block_depth(npu_block_type, ifm_depth, ifm_elemwidth, block_traversal, ofm_blk_depth): - ifm_blk_depth = ofm_blk_depth + strides = _strides_for_shape(shape4D, format, element_bits) - if npu_block_type in (NpuBlockType.ConvolutionMxN, NpuBlockType.VectorProduct, NpuBlockType.ReduceSum): - if ifm_elemwidth == 16 or block_traversal == TensorBlockTraversal.PartKernelFirst: - ifm_blk_depth = 16 - elif ifm_elemwidth == 8: - ifm_blk_depth = 32 + if format == TensorFormat.NHCWB16: + if strides[2] == block_size.depth: # TODO is this check corrrect for non 8-bit + burst_len = element_bits * block_size.depth * block_size.width + elif is_read: + burst_len = 16 * element_bits * block_size.width else: - ifm_blk_depth = 8 + burst_len = 16 * element_bits * block_size.width * arch.ncores + elif format == TensorFormat.NHWC: + if is_read: + if strides[3] == block_size.depth: + burst_len = element_bits * block_size.depth * block_size.width + else: + burst_len = element_bits * block_size.depth + else: + if block_size.depth <= 16 and strides[3] == block_size.depth: + burst_len = element_bits * block_size.depth * block_size.width + else: + burst_len = min(64 * 8, 16 * element_bits * arch.ncores, block_size.depth * element_bits) - return min(ifm_depth, ifm_blk_depth) + burst_len = burst_len // 8 # bits->bytes + burst_len = min(arch.memory_burst_length[mem_area], burst_len) + return to_transfer * (arch.memory_burst_length[mem_area] / burst_len) -def get_minimal_cmd_cycles( - arch, ifm_tensor, 
ofm_tensor, ifm_blk: Block, ofm_blk: Block, output_cycles, ifm_shape4D, ofm_shape4D, dpu_cycles=0 -): - ifm_tens_blk = Tensor((1, ifm_blk.height, ifm_blk.width, ifm_blk.depth), ifm_tensor.dtype, "ifm_blk") - ofm_tens_blk = Tensor((1, ofm_blk.height, ofm_blk.width, ofm_blk.depth), ofm_tensor.dtype, "ofm_blk") - cycles_ifm_blk = ( - estimate_memory_transfer_efficiency( - arch, ifm_tensor.mem_area, BandwidthDirection.Read, ifm_tens_blk, ifm_blk, shape4D=ifm_shape4D +def _estimate_minimum_memory_cycles(arch, query: PerformanceQuery): + # Input block HW transfer (only for elements present) + ifm_bytes = Shape4D.min(query.ifm_shape, query.config.ifm_block).elements() + cycles_ifm_blk = arch.memory_latency[query.ifm_memory_area][BandwidthDirection.Read] + cycles_ifm_blk = cycles_ifm_blk + ( + _estimate_memory_transfer_efficiency( + arch, + True, + query.ifm_memory_area, + query.ifm_format, + query.ifm_bits, + query.config.ifm_block, + query.ifm_shape, + ifm_bytes, ) - / arch.memory_bandwidths_per_cycle[ifm_tensor.mem_area] + / arch.memory_bandwidths_per_cycle[query.ifm_memory_area] ) - cycles_ofm_blk = ( - estimate_memory_transfer_efficiency( - arch, ofm_tensor.mem_area, BandwidthDirection.Write, ofm_tens_blk, ofm_blk, shape4D=ofm_shape4D + # Output block HW transfer (only for elements present) + ofm_bytes = Shape4D.min(query.ofm_shape, query.config.ofm_block).elements() + cycles_ofm_blk = arch.memory_latency[query.ofm_memory_area][BandwidthDirection.Write] + cycles_ofm_blk = cycles_ofm_blk + ( + _estimate_memory_transfer_efficiency( + arch, + False, + query.ofm_memory_area, + query.ofm_format, + query.ofm_bits, + query.config.ofm_block, + query.ofm_shape, + ofm_bytes, ) - / arch.memory_bandwidths_per_cycle[ofm_tensor.mem_area] + / arch.memory_bandwidths_per_cycle[query.ofm_memory_area] ) - return ( - arch.memory_latency[ifm_tensor.mem_area][BandwidthDirection.Read] - + cycles_ifm_blk - + dpu_cycles - + output_cycles - + arch.memory_latency[ofm_tensor.mem_area][BandwidthDirection.Write] - + cycles_ofm_blk - ) / 4 - - -def estimate_output_cycles( - arch, - npu_block_type, - primary_op, - num_elems, - ifm_tensor, - ofm_tensor, - use_acc_40bits=False, - ifm2_tensor=None, - block_config: Block = None, -): - faf = None if primary_op.activation is None else primary_op.activation.op_type - if npu_block_type == NpuBlockType.ElementWise and ifm_tensor.dtype == DataType.int32: - if ifm2_tensor is None: - # Unary op - output_perf_index = 0 - else: - # Binary op - output_perf_index = 1 - elif primary_op.type == Op.Mul and ofm_tensor.dtype == DataType.int32: + return cycles_ifm_blk, cycles_ofm_blk + + +def _estimate_output_cycles_per_element(arch, op_type: Op, faf_type: Op, query: PerformanceQuery): + if query.npu_block_type == NpuBlockType.ElementWise and query.ifm_bits == 32: + # Unary op else Binary op + output_perf_index = 0 if query.ifm2_shape is not None else 1 + elif op_type == Op.Mul and query.ofm_bits == 32: output_perf_index = 2 - elif primary_op.type == Op.Mul or ( - npu_block_type + elif op_type == Op.Mul or ( + query.npu_block_type in ( NpuBlockType.ConvolutionMxN, NpuBlockType.ConvolutionDepthWise, @@ -174,31 +248,24 @@ def estimate_output_cycles( NpuBlockType.ReduceSum, NpuBlockType.VectorProduct, ) - and use_acc_40bits + and query.config.acc_type == SHRAMElements.Acc40 ): output_perf_index = 3 - elif primary_op.type in (Op.Add, Op.Sub): - input_scale = ifm_tensor.quantization.scale_f32 - input2_scale = ifm2_tensor.quantization.scale_f32 - output_scale = ofm_tensor.quantization.scale_f32 - - if 
"resizebilinear" in primary_op.attrs: - output_scale = input2_scale - - if None in (input_scale, input2_scale, output_scale) or input_scale == input2_scale: + elif op_type in (Op.Add, Op.Sub): + if False: # Simple Add/Sub output_perf_index = 4 else: - # Advanced Add/Sub + # Advanced Add/Sub TODO: Add as perf selection as operator variant output_perf_index = 5 - elif primary_op.type.is_maxpool_op(): + elif op_type.is_maxpool_op(): output_perf_index = 6 else: output_perf_index = 7 - if faf in (Op.Sigmoid, Op.Tanh, Op.LUT): + if faf_type in (Op.Sigmoid, Op.Tanh, Op.LUT): activation_perf_index = 0 - elif faf in (Op.Relu, Op.Relu6, Op.ReluN1To1): + elif faf_type in (Op.Relu, Op.Relu6, Op.ReluN1To1): activation_perf_index = 1 else: activation_perf_index = 2 @@ -207,69 +274,48 @@ def estimate_output_cycles( arch.output_cycles_per_elem[output_perf_index], arch.activation_cycles_per_elem[activation_perf_index] ) - if primary_op.type.is_elementwise_op() and block_config is not None: - num_elems_blk = block_config.width * block_config.height * block_config.depth - cycle_cmd = get_minimal_cmd_cycles( - arch, - ifm_tensor, - ofm_tensor, - block_config, - block_config, - num_elems_blk * cycle_per_elem, - primary_op.ifm_shapes[0], - primary_op.ofm_shapes[0], - ) + if op_type.is_elementwise_op(): + num_elems_blk = query.config.ofm_block.elements() + ifm_blk_cycles, ofm_blk_cycles = _estimate_minimum_memory_cycles(arch, query) + cycle_cmd = ifm_blk_cycles + ofm_blk_cycles + cycle_cmd = (cycle_cmd + cycle_per_elem * num_elems_blk) / 4 # per DPU cycle_per_elem = max(cycle_per_elem, cycle_cmd / num_elems_blk) - return num_elems * cycle_per_elem + return cycle_per_elem -def estimate_conv_pooling_cycles( - arch, - npu_block_type, - primary_op, - ifm_block: Block, - ofm_block: Block, - block_traversal, - kernel_dims, - ifm_tensor, - ofm_tensor, - scale_tensor=None, -): - ofm_ublock = Block(arch.config.ofm_ublock.width, arch.config.ofm_ublock.height, arch.config.ofm_ublock.depth) - ifm_tens_shape = primary_op.ifm_shapes[0] - ofm_tens_shape = primary_op.ofm_shapes[0] +def _estimate_conv_cycles(arch, op_type: Op, faf_type: Op, query: PerformanceQuery): + ifm_block = Shape4D.min(query.ifm_shape, query.config.ifm_block) + ofm_block = Shape4D.min(query.ofm_shape, query.config.ofm_block) if ( arch.config.ofm_ublock.height == 2 - and npu_block_type + and query.npu_block_type in (NpuBlockType.ConvolutionMxN, NpuBlockType.ConvolutionDepthWise, NpuBlockType.VectorProduct) - and ofm_tens_shape.height == 1 + and query.ofm_shape.height == 1 # Optimisation only applies for even width tensors - and ofm_tens_shape.width % 2 == 0 - and kernel_dims[0] == 1 + and query.ofm_shape.width % 2 == 0 + and query.kernel.height == 1 ): - ofm_ublock.width = 4 - ofm_ublock.height = 1 - ofm_block.height = 1 + ofm_ublock = Shape4D(1, 1, 4, arch.config.ofm_ublock.depth) + ofm_block = ofm_block.with_height(1) + else: + ofm_ublock = Shape4D(arch.config.ofm_ublock.to_hwc()) num_ublk_x = numeric_util.round_up_divide(ofm_block.width, ofm_ublock.width) - num_ublk_y = ofm_block.height // ofm_ublock.height + num_ublk_y = numeric_util.round_up_divide(ofm_block.height, ofm_ublock.height) num_ublk_xy = num_ublk_x * num_ublk_y - num_ublk_z = ofm_block.depth // ofm_ublock.depth - num_ofm_blk = 0 - total_cycles = 0 - num_elems_blk = ofm_block.width * ofm_block.height * ofm_block.depth - use_acc_40bits = is_acc_40bits_used(npu_block_type, ifm_tensor, ofm_tensor) - - sub_kernel_limits = arch.sub_kernel_limits[npu_block_type] - n_sub_kernels_y = 
numeric_util.round_up_divide(kernel_dims[0], sub_kernel_limits[0]) - n_sub_kernels_x = numeric_util.round_up_divide(kernel_dims[1], sub_kernel_limits[1]) + num_ublk_z = numeric_util.round_up_divide(ofm_block.depth, ofm_ublock.depth) + use_acc_40bits = query.config.acc_type == SHRAMElements.Acc40 + + sub_kernel_limits = arch.sub_kernel_limits[query.npu_block_type] + n_sub_kernels_y = numeric_util.round_up_divide(query.kernel.height, sub_kernel_limits[0]) + n_sub_kernels_x = numeric_util.round_up_divide(query.kernel.width, sub_kernel_limits[1]) sub_kernel_x = [ - min((kernel_dims[1] - i * sub_kernel_limits[1]), sub_kernel_limits[1]) for i in range(n_sub_kernels_x) + min((query.kernel.width - i * sub_kernel_limits[1]), sub_kernel_limits[1]) for i in range(n_sub_kernels_x) ] sub_kernel_y = [ - min((kernel_dims[0] - i * sub_kernel_limits[0]), sub_kernel_limits[0]) for i in range(n_sub_kernels_y) + min((query.kernel.height - i * sub_kernel_limits[0]), sub_kernel_limits[0]) for i in range(n_sub_kernels_y) ] sub_kernel_size = (x * y for y in sub_kernel_y for x in sub_kernel_x) @@ -277,27 +323,27 @@ def estimate_conv_pooling_cycles( cycles_wb = 32 * ofm_ublock.depth // 8 for num_kernel_elems in sub_kernel_size: - if npu_block_type == NpuBlockType.Pooling: + if query.npu_block_type == NpuBlockType.Pooling: num_kernel_steps = 1 cycles = max(4, num_kernel_elems) * num_ublk_xy * num_ublk_z - if ifm_tensor.dtype.size_in_bits() == 16 and arch.accelerator_config != Accelerator.Ethos_U55_32: + if query.ifm_bits == 16 and arch.accelerator_config != Accelerator.Ethos_U55_32: cycles *= 2 - elif npu_block_type == NpuBlockType.ConvolutionDepthWise: + elif query.npu_block_type == NpuBlockType.ConvolutionDepthWise: cycles = 4 * num_ublk_xy - if ifm_tensor.dtype.size_in_bits() == 16: + if query.ifm_bits == 16: cycles *= 2 num_kernel_steps = numeric_util.round_up_divide(num_kernel_elems, 4) cycles = max(cycles_wb, cycles) * num_kernel_steps * num_ublk_z elif ( - (npu_block_type == NpuBlockType.ConvolutionMxN and block_traversal != TensorBlockTraversal.PartKernelFirst) - or npu_block_type == NpuBlockType.VectorProduct - or npu_block_type == NpuBlockType.ReduceSum + (query.npu_block_type == NpuBlockType.ConvolutionMxN and not query.config.is_partkernel) + or query.npu_block_type == NpuBlockType.VectorProduct + or query.npu_block_type == NpuBlockType.ReduceSum ): num_kernel_steps = num_kernel_elems cycles = max(cycles_wb, 4 * num_ublk_xy) * num_kernel_steps * num_ublk_z else: - assert block_traversal == TensorBlockTraversal.PartKernelFirst - divider = 2 if ifm_tensor.dtype.size_in_bits() == 16 else 4 + assert query.config.is_partkernel + divider = 2 if query.ifm_bits == 16 else 4 num_kernel_steps = numeric_util.round_up_divide(num_kernel_elems, divider) cycles = max(cycles_wb, 4 * num_ublk_xy) * ( num_kernel_steps * numeric_util.round_up_divide(ifm_block.depth, 8) * num_ublk_z @@ -314,345 +360,199 @@ def estimate_conv_pooling_cycles( if (num_ublk_x == 1 or num_ublk_y == 1) and num_ublk_z > 1 and use_acc_40bits: delay_cycles += delay * num_ublk_z else: - delay = ( - 3 - if use_acc_40bits and arch.accelerator_config in (Accelerator.Ethos_U55_64, Accelerator.Ethos_U55_128) - else 2 - ) + if use_acc_40bits and arch.accelerator_config in (Accelerator.Ethos_U55_64, Accelerator.Ethos_U55_128): + delay = 3 + else: + delay = 2 + if num_ublk_x == 1 and num_ublk_y == 1: if num_ublk_z == 1: delay_cycles = delay * num_kernel_steps elif num_kernel_steps > 1: delay_cycles = delay * (num_kernel_steps - 1) * num_ublk_z - if 
npu_block_type == NpuBlockType.ConvolutionMxN and block_traversal == TensorBlockTraversal.PartKernelFirst: + if query.npu_block_type == NpuBlockType.ConvolutionMxN and query.config.is_partkernel: delay_cycles *= numeric_util.round_up_divide(ifm_block.depth, 8) cycles_dpu_blk += cycles cycles_dpu_blk += delay_cycles - if npu_block_type in (NpuBlockType.ConvolutionMxN, NpuBlockType.VectorProduct, NpuBlockType.ReduceSum): - cycles_dpu_blk *= numeric_util.round_up_divide(ifm_tens_shape.depth, ifm_block.depth) + if query.npu_block_type in (NpuBlockType.ConvolutionMxN, NpuBlockType.VectorProduct, NpuBlockType.ReduceSum): + cycles_dpu_blk *= numeric_util.round_up_divide(query.ifm_shape.depth, ifm_block.depth) cycles_dpu_blk /= arch.ncores - num_ofm_blk = ( - numeric_util.round_up_divide(ofm_tens_shape.height, ofm_block.height) - * numeric_util.round_up_divide(ofm_tens_shape.width, ofm_block.width) - * numeric_util.round_up_divide(ofm_tens_shape.depth, ofm_block.depth) - ) - - cycles_output_blk = estimate_output_cycles( - arch, npu_block_type, primary_op, num_elems_blk, ifm_tensor, ofm_tensor, use_acc_40bits - ) + # Estimate output cycles + num_ofm_blks = query.ofm_shape.div_round_up(ofm_block).elements() + cycles_output_blk = _estimate_output_cycles_per_element(arch, op_type, faf_type, query) * ofm_block.elements() - if scale_tensor: + # Scale and bias tensor + if query.const_shape.depth > 0: cycles_bias_blk = ( - 10 - * min(ofm_block.depth, ofm_tens_shape.depth) - * arch.memory_latency[scale_tensor.mem_area][BandwidthDirection.Read] - / 256 + 10 * ofm_block.depth * arch.memory_latency[query.const_memory_area][BandwidthDirection.Read] / 256 ) cycles_output_blk = max(cycles_output_blk, cycles_bias_blk) - cycles_cmd = get_minimal_cmd_cycles( - arch, - ifm_tensor, - ofm_tensor, - ifm_block, - ofm_block, - cycles_dpu_blk, - ifm_tens_shape, - ofm_tens_shape, - cycles_output_blk, - ) + ifm_blk_cycles, ofm_blk_cycles = _estimate_minimum_memory_cycles(arch, query) + cycles_cmd = ifm_blk_cycles + ofm_blk_cycles + cycles_cmd = (cycles_cmd + cycles_output_blk + cycles_dpu_blk) / 4 # per DPU + cycles_dpu_blk = max(cycles_dpu_blk, cycles_cmd) cycles_output_blk = max(cycles_output_blk, cycles_cmd) if cycles_dpu_blk > cycles_output_blk: - total_cycles = cycles_dpu_blk * num_ofm_blk + cycles_output_blk + total_cycles = cycles_dpu_blk * num_ofm_blks + cycles_output_blk else: - total_cycles = cycles_output_blk * num_ofm_blk + cycles_dpu_blk + total_cycles = cycles_output_blk * num_ofm_blks + cycles_dpu_blk return total_cycles -def estimate_memory_transfer_efficiency( - arch, mem_area, direction, tensor, block_size: Block, replace_bw=None, shape4D=None -): - if tensor.format not in (TensorFormat.NHWC, TensorFormat.NHCWB16): - return tensor.bandwidth() if replace_bw is None else replace_bw - - # Estimate memory transfer efficiency by calculating the burst length - # this is related to data format, block shape, and tensor shape, etc. 
- burst_len = 0 - elem_size = tensor.dtype.size_in_bytes() - is_ifm = direction == BandwidthDirection.Read - tens = tensor.clone() - - if not tensor.needs_linear_format: - tens.set_format(TensorFormat.NHCWB16, arch) - strides = tens.get_strides(shape4D=shape4D) - - if tens.format == TensorFormat.NHCWB16: - if strides[1] == block_size.depth: - burst_len = elem_size * block_size.depth * block_size.width - elif is_ifm: - burst_len = 16 * elem_size * block_size.width - else: - burst_len = 16 * elem_size * block_size.width * arch.ncores - else: - assert tens.format == TensorFormat.NHWC - if is_ifm: - if strides[3] == block_size.depth: - burst_len = elem_size * block_size.depth * block_size.width - else: - burst_len = elem_size * block_size.depth - else: - if block_size.depth <= 16 and strides[3] == block_size.depth: - burst_len = elem_size * block_size.depth * block_size.width - else: - burst_len = min(64, 16 * elem_size * arch.ncores, block_size.depth * elem_size) - - burst_len = min(arch.memory_burst_length[mem_area], burst_len) - bw = tens.bandwidth() if replace_bw is None else replace_bw +def measure_mem2mem_cycles(arch, from_mem_area, to_mem_area, to_transfer): + from_cycles = to_transfer // arch.memory_bandwidths_per_cycle[from_mem_area] + to_cycles = to_transfer // arch.memory_bandwidths_per_cycle[to_mem_area] + return max(from_cycles, to_cycles) - return bw * (arch.memory_burst_length[mem_area] / burst_len) +def measure_cycle_cost(arch, op_type: Op, faf_type: Op, query: PerformanceQuery): + cycles = CycleCost() -def performance_metrics_for_pass(arch, ps, block_config=None, rewrite_list=None, force_outputs_to_fast_storage=False): - if block_config is None: - block_config = ps.block_config - bws = make_bandwidth_array() - scaled_bws = make_bandwidth_array() # scaled bw with memory transfer efficiency - macs = 0 - cycles = make_cycles_array() - ifm_read_multiple = 1 - weight_read_multiple = 0 - - if ps.placement in (PassPlacement.MemoryOnly, PassPlacement.StartupInit): - return bws, macs, cycles, ifm_read_multiple, weight_read_multiple # nothing real happening in this pass - - explicit_padding = (0, 0, 0, 0) - primary_op = ps.primary_op - replacement_read_bws = {} - ofm_block = Block(block_config[1], block_config[0], block_config[3]) - ifm_block = Block(block_config[1], block_config[0], block_config[3]) - - if ps.placement == PassPlacement.Npu and primary_op: - explicit_padding = primary_op.attrs.get("explicit_padding", explicit_padding) - assert primary_op.type.npu_block_type == ps.npu_block_type - npu_block_type = primary_op.type.npu_block_type - - ifm_tensor, _, weight_tensor, ofm_tensor = ps.get_primary_op_ifm_ifm2_weights_ofm() - ifm_tensor_shape = ps.primary_op.ifm_shapes[0] - ofm_tensor_shape = ps.primary_op.ofm_shapes[0] - ofm_block.width = min(ofm_block.width, ofm_tensor_shape.width) - ofm_block.height = min(ofm_block.height, ofm_tensor_shape.height) - ofm_block.depth = min(ofm_block.depth, ofm_tensor_shape.depth) - - if npu_block_type == NpuBlockType.ReduceSum: - block_traversal = TensorBlockTraversal.DepthFirst - elif npu_block_type in ( - NpuBlockType.ConvolutionMxN, - NpuBlockType.ConvolutionDepthWise, - NpuBlockType.VectorProduct, - ): - block_traversal = weight_tensor.block_traversal + # Convolution/Vector product cycle calculation + if query.npu_block_type in ( + NpuBlockType.ConvolutionMxN, + NpuBlockType.ConvolutionDepthWise, + NpuBlockType.VectorProduct, + NpuBlockType.Pooling, + NpuBlockType.ReduceSum, + ): + # cycles.op_macs and cycles.op_cycles should both handle 
>32-bits + if query.npu_block_type in (NpuBlockType.ConvolutionDepthWise, NpuBlockType.Pooling): + cycles.op_macs = int(query.kernel.elements_wh()) * 1 * int(query.ofm_shape.elements()) else: - block_traversal = TensorBlockTraversal.Default - ifm_block_depth = get_ifm_block_depth( - npu_block_type, ifm_tensor_shape.depth, ifm_tensor.dtype.size_in_bits(), block_traversal, ofm_block.depth - ) - ifm_block = arch.get_ifm_block_size( - ifm_block_depth, ofm_block, primary_op.kernel, ifm_resampling_mode=ifm_tensor.resampling_mode + cycles.op_macs = ( + int(query.kernel.elements_wh()) * int(query.ifm_shape.depth) * int(query.ofm_shape.elements()) + ) + + cycles.op_cycles = int(_estimate_conv_cycles(arch, op_type, faf_type, query)) + # Elementwise cycle calculation + elif query.npu_block_type == NpuBlockType.ElementWise: + cycles.op_macs = 0 + cycles.op_cycles = int(_estimate_output_cycles_per_element(arch, op_type, faf_type, query)) * int( + query.ofm_shape.elements() ) - ifm_block.width = min(ifm_block.width, ifm_tensor_shape.width) - ifm_block.height = min(ifm_block.height, ifm_tensor_shape.height) + else: + assert False - if npu_block_type in ( - NpuBlockType.ConvolutionMxN, - NpuBlockType.ConvolutionDepthWise, - NpuBlockType.VectorProduct, - NpuBlockType.Pooling, - NpuBlockType.ReduceSum, - ): - # extent the ifm to full dimension + return cycles - batch_size = ifm_tensor_shape.batch - # add in padding, height += top and bottom, width += left and right - ifm_tensor_shape = ifm_tensor_shape.add( - 0, explicit_padding[0] + explicit_padding[2], explicit_padding[1] + explicit_padding[3], 0 - ) +def measure_element_access(arch, query: PerformanceQuery): + access = ElementAccess() - if npu_block_type != NpuBlockType.Pooling: - if npu_block_type == NpuBlockType.ReduceSum: - weight_tensor_shape = [1, 1, ifm_tensor.shape[3], ofm_tensor.shape[3]] - weight_tensor_bandwidth_shape = [0] * 4 - weight_tensor_element_size = 0 - weight_tensor_bandwidth_compression_scale = 0.0 - else: - # For Vector product, weight format of IO is extended to HWIO, with H=W=1 - weight_tensor_shape = numeric_util.full_shape(4, weight_tensor.shape, 1) - weight_tensor_bandwidth_shape = numeric_util.full_shape(4, weight_tensor.bandwidth_shape, 1) - weight_tensor_element_size = weight_tensor.element_size() - weight_tensor_bandwidth_compression_scale = weight_tensor.bandwidth_compression_scale - - nn_ops = ( - int(ofm_tensor_shape.batch) - * int(ofm_tensor_shape.height) - * int(ofm_tensor_shape.width) - * int(weight_tensor_shape[0]) - * int(weight_tensor_shape[1]) - * int(weight_tensor_shape[2]) - * int(weight_tensor_shape[3]) - ) - else: - weight_tensor_shape = [ - *primary_op.get_kernel_size(), - 1, - ifm_tensor_shape.depth, - ] - weight_tensor_bandwidth_shape = weight_tensor_shape - weight_tensor_element_size = 0 - weight_tensor_bandwidth_compression_scale = 0.0 - nn_ops = 0 # pooling doesn't count as NN ops - - kernel_dims = weight_tensor_shape[:2] - - sub_kernel_limits = arch.sub_kernel_limits[npu_block_type] - # count the sub kernels; the IFM block needs to be refetched for each of them - n_sub_kernels_y = numeric_util.round_up_divide(kernel_dims[0], sub_kernel_limits[0]) - n_sub_kernels_x = numeric_util.round_up_divide(kernel_dims[1], sub_kernel_limits[1]) - n_sub_kernels = n_sub_kernels_y * n_sub_kernels_x - - n_full_depth_stages = numeric_util.round_up_divide(weight_tensor_bandwidth_shape[3], ofm_block.depth) - if npu_block_type in (NpuBlockType.ConvolutionDepthWise, NpuBlockType.Pooling): - n_full_depth_stages = 1 # force to 
no reread - - ifm_read_multiple = n_sub_kernels * n_full_depth_stages - replacement_read_bws[ifm_tensor] = ifm_tensor.bandwidth() * ifm_read_multiple - - weight_read_multiple = numeric_util.round_up_divide( - ofm_tensor_shape.height, ofm_block.height - ) * numeric_util.round_up_divide(ofm_tensor_shape.width, ofm_block.width) - replacement_read_bws[weight_tensor] = ( - batch_size - * shape_num_elements(weight_tensor_bandwidth_shape) - * weight_tensor_element_size - * weight_tensor_bandwidth_compression_scale - * weight_read_multiple - ) + ifm_block = Shape4D.min(query.ifm_shape, query.config.ifm_block) + ofm_block = Shape4D.min(query.ofm_shape, query.config.ofm_block) + ifm_rounding = Shape4D(list(arch.storage_rounding_quantums[query.ifm_format])) - macs += nn_ops - cycles[PassCycles.Npu] = estimate_conv_pooling_cycles( - arch, - npu_block_type, - primary_op, - ifm_block, - ofm_block, - block_traversal, - kernel_dims, - ifm_tensor, - ofm_tensor, - ps.scale_tensor, - ) - elif npu_block_type == NpuBlockType.ElementWise: - # Work out how many elements we have and calculate performance. - cycles[PassCycles.Npu] = estimate_output_cycles( - arch, - npu_block_type, - primary_op, - ofm_tensor.elements(), - ps.ifm_tensor, - ps.ofm_tensor, - None, - ps.ifm2_tensor, - ofm_block, - ) + # Number of ofm blocks in the overall output shape + ofm_blocks = query.ofm_shape.div_round_up(ofm_block) + ofm_block_depth = ofm_block.depth + if query.npu_block_type in (NpuBlockType.ConvolutionDepthWise, NpuBlockType.Pooling): + ofm_blocks = ofm_blocks.with_depth(1) + ofm_block_depth = query.ifm_shape.depth - prev_npu_pass = next((npu_ps for npu_ps in ps.dag_predecessors if npu_ps.placement is PassPlacement.Npu), None) - if prev_npu_pass is None: - # cycles for DMA ops in first pass - dma_ops = (op for op in ps.ops if op.type == Op.DMA) - for dma_op in dma_ops: - mem_area = dma_op.attrs["source"] - for tens in dma_op.inputs: - cycles[PassCycles.Npu] += tens.storage_size() / arch.memory_bandwidths_per_cycle[mem_area] - - if rewrite_list is not None: - # apply the desired rewrites - for rewrite_op, tens, _, _, _, ps_to_rewrite in rewrite_list: - if ps != ps_to_rewrite: - continue - if rewrite_op == SchedulerRewrite.Nop: - pass # these are fine, no bandwidth changes - elif rewrite_op in (SchedulerRewrite.ChangeTensorSubPurpose,): - bws[arch.fast_storage_mem_area][tens.purpose][BandwidthDirection.Read] += replacement_read_bws[tens] - if tens.purpose == TensorPurpose.FeatureMap: - scaled_bw = estimate_memory_transfer_efficiency( - arch, - arch.fast_storage_mem_area, - BandwidthDirection.Read, - tens, - ifm_block, - replacement_read_bws[tens], - ) - else: - scaled_bw = replacement_read_bws[tens] - scaled_bws[arch.fast_storage_mem_area][tens.purpose][BandwidthDirection.Read] += scaled_bw - replacement_read_bws[tens] = 0 - - for tens in ps.outputs: - if force_outputs_to_fast_storage: - bws[arch.fast_storage_mem_area][tens.purpose][BandwidthDirection.Write] += tens.bandwidth() - scaled_bws[arch.fast_storage_mem_area][tens.purpose][ - BandwidthDirection.Write - ] += estimate_memory_transfer_efficiency( - arch, arch.fast_storage_mem_area, BandwidthDirection.Write, tens, ofm_block, shape4D=ps.ofm_shapes[0], - ) - else: - bws[tens.mem_area][tens.purpose][BandwidthDirection.Write] += tens.bandwidth() - scaled_bws[tens.mem_area][tens.purpose][BandwidthDirection.Write] += estimate_memory_transfer_efficiency( - arch, tens.mem_area, BandwidthDirection.Write, tens, ofm_block, shape4D=ps.ofm_shapes[0] - ) + # Convolution & pooling + if 
query.npu_block_type in ( + NpuBlockType.ConvolutionMxN, + NpuBlockType.ConvolutionDepthWise, + NpuBlockType.VectorProduct, + NpuBlockType.Pooling, + NpuBlockType.ReduceSum, + ): + # Number of sub kernels + sub_kernel_limits = arch.sub_kernel_limits[query.npu_block_type] + subkernels = numeric_util.round_up_divide(query.kernel.width, sub_kernel_limits[0]) + subkernels *= numeric_util.round_up_divide(query.kernel.height, sub_kernel_limits[1]) - for tens in ps.intermediates: - bws[tens.mem_area][tens.purpose][BandwidthDirection.Write] += tens.bandwidth() - scaled_bws[tens.mem_area][tens.purpose][BandwidthDirection.Write] += tens.bandwidth() + ofm_block_count = ofm_blocks.elements() + + ifm_fetch = ( + Shape4D.round_up(ifm_block, ifm_rounding).elements_wh() + * Shape4D.round_up(query.ifm_shape, ifm_rounding).depth + ) - if tens in replacement_read_bws: - bw = replacement_read_bws[tens] + if query.npu_block_type in (NpuBlockType.ConvolutionDepthWise, NpuBlockType.Pooling): + kernel_read = query.kernel.elements_wh() * 1 # force to no reread else: - bw = tens.bandwidth() + kernel_read = query.kernel.elements_wh() * query.ifm_shape.depth + + weight_fetch = kernel_read * ofm_block_depth * ofm_block_count + + access.ifm_read[0] = ifm_fetch * subkernels * ofm_block_count + + if query.npu_block_type not in (NpuBlockType.Pooling, NpuBlockType.ReduceSum): + access.const_read[0] = weight_fetch + access.const_read[1] = query.ofm_shape.depth # Scales & biases + access.weights_refetch = ofm_blocks.elements_wh() + # Elementwise + elif query.npu_block_type == NpuBlockType.ElementWise: + if query.ifm_shape.elements() == 1: + if query.ifm_bits > 8: + # ifm is a non 8-bit scalar + access.ifm_read[0] = Shape4D.round_up(query.ifm_shape, ifm_rounding).elements() + if query.ifm2_shape: + access.ifm_read[1] = Shape4D.round_up(query.ofm_shape, ifm_rounding).elements() + else: + access.ifm_read[0] = Shape4D.round_up(query.ofm_shape, ifm_rounding).elements() + if query.ifm2_shape: + if query.ifm2_shape.elements() > 1: + access.ifm_read[1] = Shape4D.round_up(query.ofm_shape, ifm_rounding).elements() + elif query.ifm2_bits > 8: + # ifm2 is a non 8-bit scalar + access.ifm_read[1] = Shape4D.round_up(query.ifm2_shape, ifm_rounding).elements() + # Unknown + else: + assert False - bws[tens.mem_area][tens.purpose][BandwidthDirection.Read] += bw - scaled_bws[tens.mem_area][tens.purpose][BandwidthDirection.Read] += bw + ofm_rounding = Shape4D(list(arch.storage_rounding_quantums[query.ofm_format])) + access.ofm_write = Shape4D.round_up(query.ofm_shape, ofm_rounding).elements() + return access - for tens in ps.inputs: - if tens in replacement_read_bws: - bw = replacement_read_bws[tens] - else: - bw = tens.bandwidth() - bws[tens.mem_area][tens.purpose][BandwidthDirection.Read] += bw +def measure_performance_cost( + arch, op_type: Op, faf_type: Op, query: PerformanceQuery, offset: Shape4D, sub_shape: Shape4D +): + assert (query.ofm_bits > 0) and (query.ifm_bits > 0) + assert query.ofm_shape.elements() != 0 - op_shape = None - if ps.placement == PassPlacement.Npu and primary_op: - if tens == ps.ifm_tensor: - op_shape = ps.ifm_shapes[0] - elif tens == ps.ifm2_tensor: - op_shape = ps.ifm_shapes[1] + # Default to start if no offset provided + if offset is None: + offset = Shape4D(0, 0, 0, 0) - scaled_bws[tens.mem_area][tens.purpose][BandwidthDirection.Read] += estimate_memory_transfer_efficiency( - arch, tens.mem_area, BandwidthDirection.Read, tens, ifm_block, bw, op_shape - ) + # Default to entire area if no sub-shape provided + if 
sub_shape is None: + sub_shape = query.ofm_shape + else: + sub_shape = Shape4D.min(sub_shape, query.ofm_shape) + + sub_query = copy.deepcopy(query) + sub_query.ofm_shape = query.ofm_shape.clip(offset, sub_shape) - # quick build access counts for only current pass, even though these aren't the final numbers - update_summary_cycles(arch, scaled_bws, cycles) + access = ElementAccess() + cycles = CycleCost() - return bws, macs, cycles, ifm_read_multiple, weight_read_multiple + cycle_tmp = measure_cycle_cost(arch, op_type, faf_type, sub_query) + cycles += cycle_tmp + access = measure_element_access(arch, sub_query) + + return access, cycles + + +def make_bandwidth_array(): + return np.zeros((MemArea.Size, TensorPurpose.Size, BandwidthDirection.Size)) + + +def make_cycles_array(): + return np.zeros(PassCycles.Size) def update_summary_cycles(arch, bws, cycles): @@ -669,42 +569,169 @@ def update_summary_cycles(arch, bws, cycles): return cycles -def collate_stats_for_cascaded_pass(arch, bws, macs, cycles): - return bws, macs, cycles +def estimate_full_op_performance( + arch, schedule: Schedule, op: SchedulerOperation, prev_op: SchedulerOperation, block_config +): + cycles_a = make_cycles_array() + bws = make_bandwidth_array() + scaled_bws = make_bandwidth_array() # scaled bw with memory transfer efficiency + macs = 0 + query = PerformanceQuery(op.op_type.npu_block_type) + query.ifm_shape = op.ifm.shape + query.ifm_format = op.ifm.format + query.ifm_memory_area = op.ifm.mem_area + query.ifm_bits = op.ifm.dtype.size_in_bits() + query.ifm2_shape = op.ifm2 and op.ifm2.shape + query.ifm2_format = op.ifm2 and op.ifm2.format + query.ifm2_memory_area = op.ifm2 and op.ifm2.mem_area + query.ifm2_bits = op.ifm2 and op.ifm2.dtype.size_in_bits() + query.ofm_shape = op.ofm.shape + query.ofm_memory_area = op.ofm.mem_area + query.ofm_bits = op.ofm.dtype.size_in_bits() + query.ofm_format = op.ofm.format + query.kernel = op.kernel + query.config = block_config + + cost = schedule.cost_map[op] + prev_cost = schedule.cost_map[prev_op] if prev_op else None + if op.parent_op.bias: + query.const_shape = Shape4D(1, 1, 1, op.ofm.shape.depth) + if cost.buffered_weight_tensor: + query.const_memory_area = cost.buffered_weight_tensor.mem_area + else: + query.const_memory_area = cost.npu_weights_tensor.mem_area + + cycles = measure_cycle_cost(arch, op.op_type, op.parent_op.activation and op.parent_op.activation.op_type, query) + cycles_a[PassCycles.Npu] = cycles.op_cycles + macs = cycles.op_macs + + access = measure_element_access(arch, query) + + # How many NPU cycles are available under the previously executing + # operator for performing buffered DMA transfers + slack_cycles = prev_cost.slack_buffering_cycles if prev_cost else 0 + + # LUT Transfer + parent_op = op.parent_op + lut_transfer_cycles = 0 + if parent_op.activation_lut: + lut_tensor = [tens for tens in parent_op.inputs if tens.purpose == TensorPurpose.LUT][0] + src_tensor = lut_tensor.src_tensor + if src_tensor and lut_tensor.mem_area != src_tensor.mem_area: + bw = src_tensor.storage_size() + lut_transfer_cycles = measure_mem2mem_cycles(arch, src_tensor.mem_area, lut_tensor.mem_area, bw) + + bws[src_tensor.mem_area][lut_tensor.purpose][BandwidthDirection.Read] += bw + # LUT read from SHRAM TODO remove? 
+ scaled_bws[lut_tensor.mem_area][lut_tensor.purpose][ + BandwidthDirection.Read + ] += _estimate_memory_transfer_efficiency( + arch, + True, + lut_tensor.mem_area, + lut_tensor.format, + lut_tensor.element_size(), + query.config.ifm_block, + Shape4D(lut_tensor.shape), + bw, + ) -def performance_for_cascaded_pass(arch, cps): - total_bws = make_bandwidth_array() - total_macs = 0 - total_cycles = make_cycles_array() + if cost.npu_weights_tensor and cost.buffered_weight_tensor: + # DMA Weight Transfer + sz = 0 + # Get the size of the first DMA + for core in range(0, arch.ncores): + key = WeightKey(core, 0) + if key in cost.npu_weights_tensor.encoded_ranges: + weight_range = cost.npu_weights_tensor.encoded_ranges[key] + sz += round_up(weight_range.total_bytes, 16) + + total_sz = len(cost.npu_weights_tensor.buffer) + bws[cost.npu_weights_tensor.mem_area][TensorPurpose.Weights][BandwidthDirection.Read] += total_sz + bws[cost.buffered_weight_tensor.mem_area][TensorPurpose.Weights][BandwidthDirection.Write] += total_sz + + ws_first_transfer_cycles = measure_mem2mem_cycles( + arch, cost.npu_weights_tensor.mem_area, cost.buffered_weight_tensor.mem_area, sz + ) + + # Add cycles for Weight + Scale Transfer + cycles_a[PassCycles.Npu] = max( + cost.full_weight_transfer_cycles - slack_cycles + cost.slack_buffering_cycles, + cycles.op_cycles + max(ws_first_transfer_cycles - slack_cycles, 0), + ) + + # Add cycles for LUT Transfer + cycles_a[PassCycles.Npu] += lut_transfer_cycles + else: + # Add cycles for LUT Transfer + cycles_a[PassCycles.Npu] += max(lut_transfer_cycles - slack_cycles, 0) + + # OFM write + ofm = op.parent_op.ofm + bw = access.ofm_write * ofm.element_size() + bws[query.ofm_memory_area][ofm.purpose][BandwidthDirection.Write] += bw + scaled_bws[ofm.mem_area][ofm.purpose][BandwidthDirection.Write] += _estimate_memory_transfer_efficiency( + arch, False, query.ofm_memory_area, ofm.format, query.ofm_bits, query.config.ofm_block, query.ofm_shape, bw + ) + + # IFM read + ifm = op.parent_op.ifm + bw = access.ifm_read[0] * ifm.element_size() + bws[ifm.mem_area][ifm.purpose][BandwidthDirection.Read] += bw + scaled_bws[ifm.mem_area][ifm.purpose][BandwidthDirection.Read] += _estimate_memory_transfer_efficiency( + arch, True, query.ifm_memory_area, ifm.format, query.ifm_bits, query.config.ifm_block, query.ifm_shape, bw + ) + if query.ifm2_shape: + ifm2 = op.parent_op.ifm2 + bw = access.ifm_read[1] * ifm2.element_size() + bws[ifm2.mem_area][ifm2.purpose][BandwidthDirection.Read] += bw + scaled_bws[ifm2.mem_area][ifm2.purpose][BandwidthDirection.Read] += _estimate_memory_transfer_efficiency( + arch, + True, + query.ifm2_memory_area, + ifm2.format, + op.ifm2.dtype.size_in_bits(), + query.config.ifm_block, + query.ifm2_shape, + bw, + ) + + # Weight read + if access.const_read[0] > 0: + # alignment not accounted for in bandwidth_compression_scale_approx + encoded_size_approx = ( + cost.npu_weights_tensor.elements() - access.const_read[1] * op.parent_op.bias.element_size() + ) + orig_weight_size = parent_op.weights.elements() + bandwidth_compression_scale_approx = encoded_size_approx / orig_weight_size + bw = access.const_read[0] * bandwidth_compression_scale_approx + bws[query.const_memory_area][TensorPurpose.Weights][BandwidthDirection.Read] += bw + + if access.const_read[1] > 0: + # Scales & biases + bw = access.const_read[1] * op.parent_op.bias.element_size() + bws[query.const_memory_area][TensorPurpose.FSBias][BandwidthDirection.Read] += bw - for ps in cps.passes: - bws, macs, cycles, _, _ = 
performance_metrics_for_pass(arch, ps) - ps.bandwidths = bws - ps.macs = macs - ps.cycles = cycles - total_bws += bws - total_macs += macs - total_cycles += cycles + update_summary_cycles(arch, scaled_bws, cycles_a) - bws, macs, cycles = collate_stats_for_cascaded_pass(arch, total_bws, total_macs, total_cycles) - cps.bandwidths = bws - cps.macs = macs - cps.cycles = cycles - return bws, macs, cycles + return bws, macs, cycles_a -def calc_performance_for_network(nng, arch): +def calc_new_performance_for_network(nng, arch): total_bws = make_bandwidth_array() total_macs = 0 total_cycles = np.zeros(PassCycles.Size) for sg in nng.subgraphs: - for cps in sg.cascaded_passes: - bws, macs, cycles = performance_for_cascaded_pass(arch, cps) + prev_op = None + for sched_op in sg.sched_ops: + op_info = sg.schedule.cost_map[sched_op] + bws, macs, cycles = estimate_full_op_performance(arch, sg.schedule, sched_op, prev_op, op_info.block_config) total_bws += bws total_macs += macs total_cycles += cycles + prev_op = sched_op nng.bandwidths = total_bws nng.macs = total_macs |
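The accumulation operators defined on `CycleCost` and `ElementAccess` (in-place `+=` and scalar `*`) are what allow per-block or per-sub-shape results from `measure_performance_cost` to be scaled and summed into a whole-operator estimate. A small standalone illustration with made-up numbers, assuming only the two classes added by this patch:

```python
# Illustration of the accumulation semantics above; the counts are made up.
from ethosu.vela.npu_performance import CycleCost, ElementAccess

tile = CycleCost()
tile.op_macs, tile.op_cycles = 1_000, 250

total = CycleCost()
total += tile * 4            # __mul__ scales a per-tile cost, __iadd__ accumulates it
print(total)                 # macs = 4000, cycles = 1000

blk = ElementAccess()
blk.ifm_read[0] = 512        # elements read from ifm1 for one block
blk.ofm_write = 256

total_access = ElementAccess()
for _ in range(3):           # e.g. three identical sub-shapes
    total_access += blk
print(total_access)
```

Note that, as the comment in `ElementAccess.__init__` states, these are element counts only; callers scale them by the tensor bit-widths (as `estimate_full_op_performance` does with `element_size()`) to obtain memory bandwidth.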