author    Tim Hall <tim.hall@arm.com>   2021-05-27 18:49:40 +0100
committer Tim Hall <tim.hall@arm.com>   2021-05-27 18:57:39 +0100
commit    d8339a75c9b655c0507e34238078fdad068b4023 (patch)
tree      36a14726b30760169a83c0356803b480992fade8 /ethosu/vela/npu_performance.py
parent    64556f32ff7bfca6036a6598034464b13b64a4ef (diff)
MLBEDSW-4034: New Scheduler Size or Performance Optimisation

- Merged dev/scheduler at 83639f90e8c828f70de6e29142355a940224959b

Signed-off-by: Tim Hall <tim.hall@arm.com>
Change-Id: I0050529d4b42da93768c7264296434dd877fb5b4
Diffstat (limited to 'ethosu/vela/npu_performance.py')
-rw-r--r--  ethosu/vela/npu_performance.py  977
1 file changed, 502 insertions(+), 475 deletions(-)
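
Note: the patch replaces the old pass-based estimators with a query-driven API. A caller fills in a PerformanceQuery (shapes, formats, memory areas, kernel and block config) and then asks measure_cycle_cost() and measure_element_access() for MAC/cycle and element-access estimates. The sketch below is illustrative only and not part of the patch; the example shapes, the Op.Conv2D choice and the arch/block_config arguments are assumptions, where arch is expected to be an ArchitectureFeatures instance and block_config an ArchitectureBlockConfig produced by the architecture allocator.

# Illustrative sketch (not part of the patch): querying the new cost functions
# for a 3x3 convolution. All concrete values below are made-up examples.
from ethosu.vela.architecture_features import NpuBlockType, TensorFormat
from ethosu.vela.npu_performance import PerformanceQuery, measure_cycle_cost, measure_element_access
from ethosu.vela.operation import Kernel, Op
from ethosu.vela.shape4d import Shape4D
from ethosu.vela.tensor import MemArea


def estimate_conv_cost(arch, block_config):
    # arch: ArchitectureFeatures; block_config: ArchitectureBlockConfig (assumed inputs)
    query = PerformanceQuery(NpuBlockType.ConvolutionMxN)
    query.ifm_shape = Shape4D(1, 56, 56, 32)
    query.ifm_format = TensorFormat.NHWC
    query.ifm_memory_area = MemArea.Sram
    query.ifm_bits = 8
    query.ofm_shape = Shape4D(1, 56, 56, 64)
    query.ofm_format = TensorFormat.NHWC
    query.ofm_memory_area = MemArea.Sram
    query.ofm_bits = 8
    query.const_shape = Shape4D(1, 1, 1, 64)  # scales/biases present
    query.const_memory_area = MemArea.OffChipFlash
    query.kernel = Kernel(3, 3)
    query.config = block_config
    cycles = measure_cycle_cost(arch, Op.Conv2D, None, query)  # None: no fused activation
    access = measure_element_access(arch, query)
    return cycles, access  # CycleCost (op_macs/op_cycles) and ElementAccess counts
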
diff --git a/ethosu/vela/npu_performance.py b/ethosu/vela/npu_performance.py
index c83f8f52..b1dae4e0 100644
--- a/ethosu/vela/npu_performance.py
+++ b/ethosu/vela/npu_performance.py
@@ -19,45 +19,28 @@
#
# Called during scheduling to evaluate different proposals, as well as post-scheduling to provide a final performance
# estimate.
+import copy
from enum import auto
from enum import IntEnum
import numpy as np
from . import numeric_util
+from .architecture_allocator import ArchitectureBlockConfig
from .architecture_features import Accelerator
-from .architecture_features import Block
-from .data_type import DataType
-from .nn_graph import PassPlacement
-from .nn_graph import SchedulerRewrite
-from .operation import NpuBlockType
+from .architecture_features import NpuBlockType
+from .architecture_features import SHRAMElements
+from .architecture_features import TensorFormat
+from .numeric_util import round_up
+from .operation import Kernel
from .operation import Op
-from .shared_buffer_allocation import is_acc_40bits_used
+from .scheduler import Schedule
+from .scheduler import SchedulerOperation
+from .shape4d import Shape4D
from .tensor import BandwidthDirection
from .tensor import MemArea
-from .tensor import shape_num_elements
-from .tensor import Tensor
-from .tensor import TensorBlockTraversal
-from .tensor import TensorFormat
from .tensor import TensorPurpose
-
-
-def rolling_buffer_dims_from_passes(arch, ps1, block_config_ps1, ps2, block_config_ps2):
- ofm_block = Block(block_config_ps2[-3], block_config_ps2[-4], block_config_ps2[-1])
- kernel = ps2.primary_op.kernel
-
- if ps2.npu_block_type in (NpuBlockType.ConvolutionMxN, NpuBlockType.VectorProduct):
- op = ps2.primary_op
- ifm_block_depth = arch.calc_ifm_block_depth(op.ifm_shapes[0].depth, op.ifm.dtype.size_in_bits())
- else:
- ifm_block_depth = block_config_ps2[-1]
-
- ifm_block = arch.get_ifm_block_size(ifm_block_depth, ofm_block, kernel, arch.ofm_block_max)
-
- # The performed height calculation is for worst case
- height = numeric_util.round_up(ifm_block.height + block_config_ps1[0], block_config_ps1[0])
- width = ifm_block.width
- return [height, width]
+from .weight_compressor import WeightKey
class PassCycles(IntEnum):
@@ -91,82 +74,173 @@ class PassCycles(IntEnum):
)
-def make_bandwidth_array():
- return np.zeros((MemArea.Size, TensorPurpose.Size, BandwidthDirection.Size))
-
-
-def make_cycles_array():
- return np.zeros(PassCycles.Size)
-
-
-def make_metrics_arrays():
- return (make_bandwidth_array(), 0, make_cycles_array())
-
+class PerformanceQuery:
+ def __init__(self, npu_block_type=0):
+ self.npu_block_type = npu_block_type
+ self.ifm_shape = Shape4D(0)
+ self.ifm_format = TensorFormat.NHWC
+ self.ifm_memory_area = MemArea.Unknown
+ self.ifm2_memory_area = MemArea.Unknown
+ self.ifm_bits = 0
+ self.ifm2_bits = 0
+ self.ifm2_shape = None
+ self.ifm2_format = TensorFormat.NHWC
+ self.ofm_shape = Shape4D(0)
+ self.ofm_format = TensorFormat.NHWC
+ self.ofm_memory_area = MemArea.Unknown
+ self.ofm_bits = 0
+ self.const_shape = Shape4D(0)
+ self.const_memory_area = MemArea.Unknown
+ self.kernel = Kernel(1, 1)
+ self.config = ArchitectureBlockConfig()
+
+
+class CycleCost:
+ def __init__(self):
+ self.op_macs = 0
+ self.op_cycles = 0
+
+ def __mul__(self, scale):
+ out = CycleCost()
+ out.op_macs = self.op_macs * scale
+ out.op_cycles = self.op_cycles * scale
+ return out
+
+ def __iadd__(self, rhs):
+ self.op_macs += rhs.op_macs
+ self.op_cycles += rhs.op_cycles
+ return self
+
+ def __str__(self):
+ return "macs = {}, cycles = {}".format(self.op_macs, self.op_cycles)
+
+
+class ElementAccess:
+ def __init__(self):
+ # List of ONLY element access counts, consumers
+ # need to scale these values by the correct bitwidths
+ # to calculate memory bandwidth
+ self.ifm_read = [0, 0] # ifm1, ifm2
+ self.ofm_write = 0
+ self.weights_refetch = 0
+ self.const_read = [0, 0] # weights, scales
+
+ def __mul__(self, scale):
+ out = ElementAccess()
+ out.ifm_read[0] = self.ifm_read[0] * scale
+ out.ifm_read[1] = self.ifm_read[1] * scale
+ out.ofm_write = self.ofm_write * scale
+ out.weights_refetch = self.weights_refetch * scale
+ out.const_read[0] = self.const_read[0] * scale
+ out.const_read[1] = self.const_read[1] * scale
+ return out
+
+ def __iadd__(self, rhs):
+ self.ifm_read[0] += rhs.ifm_read[0]
+ self.ifm_read[1] += rhs.ifm_read[1]
+ self.ofm_write += rhs.ofm_write
+ self.weights_refetch += rhs.weights_refetch
+ self.const_read[0] += rhs.const_read[0]
+ self.const_read[1] += rhs.const_read[1]
+ return self
+
+ def __str__(self):
+ return "ifm read = {}, ofm write = {}, const read={}".format(self.ifm_read, self.ofm_write, self.const_read)
+
+
+def _strides_for_shape(shape: Shape4D, format: TensorFormat, element_bits):
+ if format == TensorFormat.NHWC:
+ strides = [0, 0, 0, 0]
+ strides[3] = element_bits / 8 # +Z
+ strides[2] = (element_bits * shape.depth) // 8 # +X
+ strides[1] = (element_bits * shape.depth * shape.width) // 8 # +Y
+ strides[0] = (element_bits * shape.depth * shape.width * shape.height) // 8 # +N
+ elif format == TensorFormat.NHCWB16:
+ strides = [0, 0, 0, 0, 0]
+ strides[4] = element_bits / 8 # +Z
+ strides[3] = (element_bits * 16) / 8 # +X
+ strides[2] = (element_bits * 16 * shape.width) / 8 # +C
+ strides[1] = (element_bits * shape.width * shape.depth) / 8 # +Y
+ strides[0] = (element_bits * shape.width * shape.depth) / 8 # +N
+
+ return strides
+
+
+def _estimate_memory_transfer_efficiency(
+ arch, is_read, mem_area, format: TensorFormat, element_bits, block_size, shape4D, to_transfer
+):
+ burst_len = 8
-def get_ifm_block_depth(npu_block_type, ifm_depth, ifm_elemwidth, block_traversal, ofm_blk_depth):
- ifm_blk_depth = ofm_blk_depth
+ strides = _strides_for_shape(shape4D, format, element_bits)
- if npu_block_type in (NpuBlockType.ConvolutionMxN, NpuBlockType.VectorProduct, NpuBlockType.ReduceSum):
- if ifm_elemwidth == 16 or block_traversal == TensorBlockTraversal.PartKernelFirst:
- ifm_blk_depth = 16
- elif ifm_elemwidth == 8:
- ifm_blk_depth = 32
+ if format == TensorFormat.NHCWB16:
+ if strides[2] == block_size.depth: # TODO is this check correct for non 8-bit
+ burst_len = element_bits * block_size.depth * block_size.width
+ elif is_read:
+ burst_len = 16 * element_bits * block_size.width
else:
- ifm_blk_depth = 8
+ burst_len = 16 * element_bits * block_size.width * arch.ncores
+ elif format == TensorFormat.NHWC:
+ if is_read:
+ if strides[3] == block_size.depth:
+ burst_len = element_bits * block_size.depth * block_size.width
+ else:
+ burst_len = element_bits * block_size.depth
+ else:
+ if block_size.depth <= 16 and strides[3] == block_size.depth:
+ burst_len = element_bits * block_size.depth * block_size.width
+ else:
+ burst_len = min(64 * 8, 16 * element_bits * arch.ncores, block_size.depth * element_bits)
- return min(ifm_depth, ifm_blk_depth)
+ burst_len = burst_len // 8 # bits->bytes
+ burst_len = min(arch.memory_burst_length[mem_area], burst_len)
+ return to_transfer * (arch.memory_burst_length[mem_area] / burst_len)
-def get_minimal_cmd_cycles(
- arch, ifm_tensor, ofm_tensor, ifm_blk: Block, ofm_blk: Block, output_cycles, ifm_shape4D, ofm_shape4D, dpu_cycles=0
-):
- ifm_tens_blk = Tensor((1, ifm_blk.height, ifm_blk.width, ifm_blk.depth), ifm_tensor.dtype, "ifm_blk")
- ofm_tens_blk = Tensor((1, ofm_blk.height, ofm_blk.width, ofm_blk.depth), ofm_tensor.dtype, "ofm_blk")
- cycles_ifm_blk = (
- estimate_memory_transfer_efficiency(
- arch, ifm_tensor.mem_area, BandwidthDirection.Read, ifm_tens_blk, ifm_blk, shape4D=ifm_shape4D
+def _estimate_minimum_memory_cycles(arch, query: PerformanceQuery):
+ # Input block HW transfer (only for elements present)
+ ifm_bytes = Shape4D.min(query.ifm_shape, query.config.ifm_block).elements()
+ cycles_ifm_blk = arch.memory_latency[query.ifm_memory_area][BandwidthDirection.Read]
+ cycles_ifm_blk = cycles_ifm_blk + (
+ _estimate_memory_transfer_efficiency(
+ arch,
+ True,
+ query.ifm_memory_area,
+ query.ifm_format,
+ query.ifm_bits,
+ query.config.ifm_block,
+ query.ifm_shape,
+ ifm_bytes,
)
- / arch.memory_bandwidths_per_cycle[ifm_tensor.mem_area]
+ / arch.memory_bandwidths_per_cycle[query.ifm_memory_area]
)
- cycles_ofm_blk = (
- estimate_memory_transfer_efficiency(
- arch, ofm_tensor.mem_area, BandwidthDirection.Write, ofm_tens_blk, ofm_blk, shape4D=ofm_shape4D
+ # Output block HW transfer (only for elements present)
+ ofm_bytes = Shape4D.min(query.ofm_shape, query.config.ofm_block).elements()
+ cycles_ofm_blk = arch.memory_latency[query.ofm_memory_area][BandwidthDirection.Write]
+ cycles_ofm_blk = cycles_ofm_blk + (
+ _estimate_memory_transfer_efficiency(
+ arch,
+ False,
+ query.ofm_memory_area,
+ query.ofm_format,
+ query.ofm_bits,
+ query.config.ofm_block,
+ query.ofm_shape,
+ ofm_bytes,
)
- / arch.memory_bandwidths_per_cycle[ofm_tensor.mem_area]
+ / arch.memory_bandwidths_per_cycle[query.ofm_memory_area]
)
- return (
- arch.memory_latency[ifm_tensor.mem_area][BandwidthDirection.Read]
- + cycles_ifm_blk
- + dpu_cycles
- + output_cycles
- + arch.memory_latency[ofm_tensor.mem_area][BandwidthDirection.Write]
- + cycles_ofm_blk
- ) / 4
-
-
-def estimate_output_cycles(
- arch,
- npu_block_type,
- primary_op,
- num_elems,
- ifm_tensor,
- ofm_tensor,
- use_acc_40bits=False,
- ifm2_tensor=None,
- block_config: Block = None,
-):
- faf = None if primary_op.activation is None else primary_op.activation.op_type
- if npu_block_type == NpuBlockType.ElementWise and ifm_tensor.dtype == DataType.int32:
- if ifm2_tensor is None:
- # Unary op
- output_perf_index = 0
- else:
- # Binary op
- output_perf_index = 1
- elif primary_op.type == Op.Mul and ofm_tensor.dtype == DataType.int32:
+ return cycles_ifm_blk, cycles_ofm_blk
+
+
+def _estimate_output_cycles_per_element(arch, op_type: Op, faf_type: Op, query: PerformanceQuery):
+ if query.npu_block_type == NpuBlockType.ElementWise and query.ifm_bits == 32:
+ # Unary op else Binary op
+ output_perf_index = 0 if query.ifm2_shape is not None else 1
+ elif op_type == Op.Mul and query.ofm_bits == 32:
output_perf_index = 2
- elif primary_op.type == Op.Mul or (
- npu_block_type
+ elif op_type == Op.Mul or (
+ query.npu_block_type
in (
NpuBlockType.ConvolutionMxN,
NpuBlockType.ConvolutionDepthWise,
@@ -174,31 +248,24 @@ def estimate_output_cycles(
NpuBlockType.ReduceSum,
NpuBlockType.VectorProduct,
)
- and use_acc_40bits
+ and query.config.acc_type == SHRAMElements.Acc40
):
output_perf_index = 3
- elif primary_op.type in (Op.Add, Op.Sub):
- input_scale = ifm_tensor.quantization.scale_f32
- input2_scale = ifm2_tensor.quantization.scale_f32
- output_scale = ofm_tensor.quantization.scale_f32
-
- if "resizebilinear" in primary_op.attrs:
- output_scale = input2_scale
-
- if None in (input_scale, input2_scale, output_scale) or input_scale == input2_scale:
+ elif op_type in (Op.Add, Op.Sub):
+ if False:
# Simple Add/Sub
output_perf_index = 4
else:
- # Advanced Add/Sub
+ # Advanced Add/Sub TODO: add perf selection as an operator variant
output_perf_index = 5
- elif primary_op.type.is_maxpool_op():
+ elif op_type.is_maxpool_op():
output_perf_index = 6
else:
output_perf_index = 7
- if faf in (Op.Sigmoid, Op.Tanh, Op.LUT):
+ if faf_type in (Op.Sigmoid, Op.Tanh, Op.LUT):
activation_perf_index = 0
- elif faf in (Op.Relu, Op.Relu6, Op.ReluN1To1):
+ elif faf_type in (Op.Relu, Op.Relu6, Op.ReluN1To1):
activation_perf_index = 1
else:
activation_perf_index = 2
@@ -207,69 +274,48 @@ def estimate_output_cycles(
arch.output_cycles_per_elem[output_perf_index], arch.activation_cycles_per_elem[activation_perf_index]
)
- if primary_op.type.is_elementwise_op() and block_config is not None:
- num_elems_blk = block_config.width * block_config.height * block_config.depth
- cycle_cmd = get_minimal_cmd_cycles(
- arch,
- ifm_tensor,
- ofm_tensor,
- block_config,
- block_config,
- num_elems_blk * cycle_per_elem,
- primary_op.ifm_shapes[0],
- primary_op.ofm_shapes[0],
- )
+ if op_type.is_elementwise_op():
+ num_elems_blk = query.config.ofm_block.elements()
+ ifm_blk_cycles, ofm_blk_cycles = _estimate_minimum_memory_cycles(arch, query)
+ cycle_cmd = ifm_blk_cycles + ofm_blk_cycles
+ cycle_cmd = (cycle_cmd + cycle_per_elem * num_elems_blk) / 4 # per DPU
cycle_per_elem = max(cycle_per_elem, cycle_cmd / num_elems_blk)
- return num_elems * cycle_per_elem
+ return cycle_per_elem
-def estimate_conv_pooling_cycles(
- arch,
- npu_block_type,
- primary_op,
- ifm_block: Block,
- ofm_block: Block,
- block_traversal,
- kernel_dims,
- ifm_tensor,
- ofm_tensor,
- scale_tensor=None,
-):
- ofm_ublock = Block(arch.config.ofm_ublock.width, arch.config.ofm_ublock.height, arch.config.ofm_ublock.depth)
- ifm_tens_shape = primary_op.ifm_shapes[0]
- ofm_tens_shape = primary_op.ofm_shapes[0]
+def _estimate_conv_cycles(arch, op_type: Op, faf_type: Op, query: PerformanceQuery):
+ ifm_block = Shape4D.min(query.ifm_shape, query.config.ifm_block)
+ ofm_block = Shape4D.min(query.ofm_shape, query.config.ofm_block)
if (
arch.config.ofm_ublock.height == 2
- and npu_block_type
+ and query.npu_block_type
in (NpuBlockType.ConvolutionMxN, NpuBlockType.ConvolutionDepthWise, NpuBlockType.VectorProduct)
- and ofm_tens_shape.height == 1
+ and query.ofm_shape.height == 1
# Optimisation only applies for even width tensors
- and ofm_tens_shape.width % 2 == 0
- and kernel_dims[0] == 1
+ and query.ofm_shape.width % 2 == 0
+ and query.kernel.height == 1
):
- ofm_ublock.width = 4
- ofm_ublock.height = 1
- ofm_block.height = 1
+ ofm_ublock = Shape4D(1, 1, 4, arch.config.ofm_ublock.depth)
+ ofm_block = ofm_block.with_height(1)
+ else:
+ ofm_ublock = Shape4D(arch.config.ofm_ublock.to_hwc())
num_ublk_x = numeric_util.round_up_divide(ofm_block.width, ofm_ublock.width)
- num_ublk_y = ofm_block.height // ofm_ublock.height
+ num_ublk_y = numeric_util.round_up_divide(ofm_block.height, ofm_ublock.height)
num_ublk_xy = num_ublk_x * num_ublk_y
- num_ublk_z = ofm_block.depth // ofm_ublock.depth
- num_ofm_blk = 0
- total_cycles = 0
- num_elems_blk = ofm_block.width * ofm_block.height * ofm_block.depth
- use_acc_40bits = is_acc_40bits_used(npu_block_type, ifm_tensor, ofm_tensor)
-
- sub_kernel_limits = arch.sub_kernel_limits[npu_block_type]
- n_sub_kernels_y = numeric_util.round_up_divide(kernel_dims[0], sub_kernel_limits[0])
- n_sub_kernels_x = numeric_util.round_up_divide(kernel_dims[1], sub_kernel_limits[1])
+ num_ublk_z = numeric_util.round_up_divide(ofm_block.depth, ofm_ublock.depth)
+ use_acc_40bits = query.config.acc_type == SHRAMElements.Acc40
+
+ sub_kernel_limits = arch.sub_kernel_limits[query.npu_block_type]
+ n_sub_kernels_y = numeric_util.round_up_divide(query.kernel.height, sub_kernel_limits[0])
+ n_sub_kernels_x = numeric_util.round_up_divide(query.kernel.width, sub_kernel_limits[1])
sub_kernel_x = [
- min((kernel_dims[1] - i * sub_kernel_limits[1]), sub_kernel_limits[1]) for i in range(n_sub_kernels_x)
+ min((query.kernel.width - i * sub_kernel_limits[1]), sub_kernel_limits[1]) for i in range(n_sub_kernels_x)
]
sub_kernel_y = [
- min((kernel_dims[0] - i * sub_kernel_limits[0]), sub_kernel_limits[0]) for i in range(n_sub_kernels_y)
+ min((query.kernel.height - i * sub_kernel_limits[0]), sub_kernel_limits[0]) for i in range(n_sub_kernels_y)
]
sub_kernel_size = (x * y for y in sub_kernel_y for x in sub_kernel_x)
@@ -277,27 +323,27 @@ def estimate_conv_pooling_cycles(
cycles_wb = 32 * ofm_ublock.depth // 8
for num_kernel_elems in sub_kernel_size:
- if npu_block_type == NpuBlockType.Pooling:
+ if query.npu_block_type == NpuBlockType.Pooling:
num_kernel_steps = 1
cycles = max(4, num_kernel_elems) * num_ublk_xy * num_ublk_z
- if ifm_tensor.dtype.size_in_bits() == 16 and arch.accelerator_config != Accelerator.Ethos_U55_32:
+ if query.ifm_bits == 16 and arch.accelerator_config != Accelerator.Ethos_U55_32:
cycles *= 2
- elif npu_block_type == NpuBlockType.ConvolutionDepthWise:
+ elif query.npu_block_type == NpuBlockType.ConvolutionDepthWise:
cycles = 4 * num_ublk_xy
- if ifm_tensor.dtype.size_in_bits() == 16:
+ if query.ifm_bits == 16:
cycles *= 2
num_kernel_steps = numeric_util.round_up_divide(num_kernel_elems, 4)
cycles = max(cycles_wb, cycles) * num_kernel_steps * num_ublk_z
elif (
- (npu_block_type == NpuBlockType.ConvolutionMxN and block_traversal != TensorBlockTraversal.PartKernelFirst)
- or npu_block_type == NpuBlockType.VectorProduct
- or npu_block_type == NpuBlockType.ReduceSum
+ (query.npu_block_type == NpuBlockType.ConvolutionMxN and not query.config.is_partkernel)
+ or query.npu_block_type == NpuBlockType.VectorProduct
+ or query.npu_block_type == NpuBlockType.ReduceSum
):
num_kernel_steps = num_kernel_elems
cycles = max(cycles_wb, 4 * num_ublk_xy) * num_kernel_steps * num_ublk_z
else:
- assert block_traversal == TensorBlockTraversal.PartKernelFirst
- divider = 2 if ifm_tensor.dtype.size_in_bits() == 16 else 4
+ assert query.config.is_partkernel
+ divider = 2 if query.ifm_bits == 16 else 4
num_kernel_steps = numeric_util.round_up_divide(num_kernel_elems, divider)
cycles = max(cycles_wb, 4 * num_ublk_xy) * (
num_kernel_steps * numeric_util.round_up_divide(ifm_block.depth, 8) * num_ublk_z
@@ -314,345 +360,199 @@ def estimate_conv_pooling_cycles(
if (num_ublk_x == 1 or num_ublk_y == 1) and num_ublk_z > 1 and use_acc_40bits:
delay_cycles += delay * num_ublk_z
else:
- delay = (
- 3
- if use_acc_40bits and arch.accelerator_config in (Accelerator.Ethos_U55_64, Accelerator.Ethos_U55_128)
- else 2
- )
+ if use_acc_40bits and arch.accelerator_config in (Accelerator.Ethos_U55_64, Accelerator.Ethos_U55_128):
+ delay = 3
+ else:
+ delay = 2
+
if num_ublk_x == 1 and num_ublk_y == 1:
if num_ublk_z == 1:
delay_cycles = delay * num_kernel_steps
elif num_kernel_steps > 1:
delay_cycles = delay * (num_kernel_steps - 1) * num_ublk_z
- if npu_block_type == NpuBlockType.ConvolutionMxN and block_traversal == TensorBlockTraversal.PartKernelFirst:
+ if query.npu_block_type == NpuBlockType.ConvolutionMxN and query.config.is_partkernel:
delay_cycles *= numeric_util.round_up_divide(ifm_block.depth, 8)
cycles_dpu_blk += cycles
cycles_dpu_blk += delay_cycles
- if npu_block_type in (NpuBlockType.ConvolutionMxN, NpuBlockType.VectorProduct, NpuBlockType.ReduceSum):
- cycles_dpu_blk *= numeric_util.round_up_divide(ifm_tens_shape.depth, ifm_block.depth)
+ if query.npu_block_type in (NpuBlockType.ConvolutionMxN, NpuBlockType.VectorProduct, NpuBlockType.ReduceSum):
+ cycles_dpu_blk *= numeric_util.round_up_divide(query.ifm_shape.depth, ifm_block.depth)
cycles_dpu_blk /= arch.ncores
- num_ofm_blk = (
- numeric_util.round_up_divide(ofm_tens_shape.height, ofm_block.height)
- * numeric_util.round_up_divide(ofm_tens_shape.width, ofm_block.width)
- * numeric_util.round_up_divide(ofm_tens_shape.depth, ofm_block.depth)
- )
-
- cycles_output_blk = estimate_output_cycles(
- arch, npu_block_type, primary_op, num_elems_blk, ifm_tensor, ofm_tensor, use_acc_40bits
- )
+ # Estimate output cycles
+ num_ofm_blks = query.ofm_shape.div_round_up(ofm_block).elements()
+ cycles_output_blk = _estimate_output_cycles_per_element(arch, op_type, faf_type, query) * ofm_block.elements()
- if scale_tensor:
+ # Scale and bias tensor
+ if query.const_shape.depth > 0:
cycles_bias_blk = (
- 10
- * min(ofm_block.depth, ofm_tens_shape.depth)
- * arch.memory_latency[scale_tensor.mem_area][BandwidthDirection.Read]
- / 256
+ 10 * ofm_block.depth * arch.memory_latency[query.const_memory_area][BandwidthDirection.Read] / 256
)
cycles_output_blk = max(cycles_output_blk, cycles_bias_blk)
- cycles_cmd = get_minimal_cmd_cycles(
- arch,
- ifm_tensor,
- ofm_tensor,
- ifm_block,
- ofm_block,
- cycles_dpu_blk,
- ifm_tens_shape,
- ofm_tens_shape,
- cycles_output_blk,
- )
+ ifm_blk_cycles, ofm_blk_cycles = _estimate_minimum_memory_cycles(arch, query)
+ cycles_cmd = ifm_blk_cycles + ofm_blk_cycles
+ cycles_cmd = (cycles_cmd + cycles_output_blk + cycles_dpu_blk) / 4 # per DPU
+
cycles_dpu_blk = max(cycles_dpu_blk, cycles_cmd)
cycles_output_blk = max(cycles_output_blk, cycles_cmd)
if cycles_dpu_blk > cycles_output_blk:
- total_cycles = cycles_dpu_blk * num_ofm_blk + cycles_output_blk
+ total_cycles = cycles_dpu_blk * num_ofm_blks + cycles_output_blk
else:
- total_cycles = cycles_output_blk * num_ofm_blk + cycles_dpu_blk
+ total_cycles = cycles_output_blk * num_ofm_blks + cycles_dpu_blk
return total_cycles
-def estimate_memory_transfer_efficiency(
- arch, mem_area, direction, tensor, block_size: Block, replace_bw=None, shape4D=None
-):
- if tensor.format not in (TensorFormat.NHWC, TensorFormat.NHCWB16):
- return tensor.bandwidth() if replace_bw is None else replace_bw
-
- # Estimate memory transfer efficiency by calculating the burst length
- # this is related to data format, block shape, and tensor shape, etc.
- burst_len = 0
- elem_size = tensor.dtype.size_in_bytes()
- is_ifm = direction == BandwidthDirection.Read
- tens = tensor.clone()
-
- if not tensor.needs_linear_format:
- tens.set_format(TensorFormat.NHCWB16, arch)
- strides = tens.get_strides(shape4D=shape4D)
-
- if tens.format == TensorFormat.NHCWB16:
- if strides[1] == block_size.depth:
- burst_len = elem_size * block_size.depth * block_size.width
- elif is_ifm:
- burst_len = 16 * elem_size * block_size.width
- else:
- burst_len = 16 * elem_size * block_size.width * arch.ncores
- else:
- assert tens.format == TensorFormat.NHWC
- if is_ifm:
- if strides[3] == block_size.depth:
- burst_len = elem_size * block_size.depth * block_size.width
- else:
- burst_len = elem_size * block_size.depth
- else:
- if block_size.depth <= 16 and strides[3] == block_size.depth:
- burst_len = elem_size * block_size.depth * block_size.width
- else:
- burst_len = min(64, 16 * elem_size * arch.ncores, block_size.depth * elem_size)
-
- burst_len = min(arch.memory_burst_length[mem_area], burst_len)
- bw = tens.bandwidth() if replace_bw is None else replace_bw
+def measure_mem2mem_cycles(arch, from_mem_area, to_mem_area, to_transfer):
+ from_cycles = to_transfer // arch.memory_bandwidths_per_cycle[from_mem_area]
+ to_cycles = to_transfer // arch.memory_bandwidths_per_cycle[to_mem_area]
+ return max(from_cycles, to_cycles)
- return bw * (arch.memory_burst_length[mem_area] / burst_len)
+def measure_cycle_cost(arch, op_type: Op, faf_type: Op, query: PerformanceQuery):
+ cycles = CycleCost()
-def performance_metrics_for_pass(arch, ps, block_config=None, rewrite_list=None, force_outputs_to_fast_storage=False):
- if block_config is None:
- block_config = ps.block_config
- bws = make_bandwidth_array()
- scaled_bws = make_bandwidth_array() # scaled bw with memory transfer efficiency
- macs = 0
- cycles = make_cycles_array()
- ifm_read_multiple = 1
- weight_read_multiple = 0
-
- if ps.placement in (PassPlacement.MemoryOnly, PassPlacement.StartupInit):
- return bws, macs, cycles, ifm_read_multiple, weight_read_multiple # nothing real happening in this pass
-
- explicit_padding = (0, 0, 0, 0)
- primary_op = ps.primary_op
- replacement_read_bws = {}
- ofm_block = Block(block_config[1], block_config[0], block_config[3])
- ifm_block = Block(block_config[1], block_config[0], block_config[3])
-
- if ps.placement == PassPlacement.Npu and primary_op:
- explicit_padding = primary_op.attrs.get("explicit_padding", explicit_padding)
- assert primary_op.type.npu_block_type == ps.npu_block_type
- npu_block_type = primary_op.type.npu_block_type
-
- ifm_tensor, _, weight_tensor, ofm_tensor = ps.get_primary_op_ifm_ifm2_weights_ofm()
- ifm_tensor_shape = ps.primary_op.ifm_shapes[0]
- ofm_tensor_shape = ps.primary_op.ofm_shapes[0]
- ofm_block.width = min(ofm_block.width, ofm_tensor_shape.width)
- ofm_block.height = min(ofm_block.height, ofm_tensor_shape.height)
- ofm_block.depth = min(ofm_block.depth, ofm_tensor_shape.depth)
-
- if npu_block_type == NpuBlockType.ReduceSum:
- block_traversal = TensorBlockTraversal.DepthFirst
- elif npu_block_type in (
- NpuBlockType.ConvolutionMxN,
- NpuBlockType.ConvolutionDepthWise,
- NpuBlockType.VectorProduct,
- ):
- block_traversal = weight_tensor.block_traversal
+ # Convolution/Vector product cycle calculation
+ if query.npu_block_type in (
+ NpuBlockType.ConvolutionMxN,
+ NpuBlockType.ConvolutionDepthWise,
+ NpuBlockType.VectorProduct,
+ NpuBlockType.Pooling,
+ NpuBlockType.ReduceSum,
+ ):
+ # cycles.op_macs and cycles.op_cycles should both handle >32-bits
+ if query.npu_block_type in (NpuBlockType.ConvolutionDepthWise, NpuBlockType.Pooling):
+ cycles.op_macs = int(query.kernel.elements_wh()) * 1 * int(query.ofm_shape.elements())
else:
- block_traversal = TensorBlockTraversal.Default
- ifm_block_depth = get_ifm_block_depth(
- npu_block_type, ifm_tensor_shape.depth, ifm_tensor.dtype.size_in_bits(), block_traversal, ofm_block.depth
- )
- ifm_block = arch.get_ifm_block_size(
- ifm_block_depth, ofm_block, primary_op.kernel, ifm_resampling_mode=ifm_tensor.resampling_mode
+ cycles.op_macs = (
+ int(query.kernel.elements_wh()) * int(query.ifm_shape.depth) * int(query.ofm_shape.elements())
+ )
+
+ cycles.op_cycles = int(_estimate_conv_cycles(arch, op_type, faf_type, query))
+ # Elementwise cycle calculation
+ elif query.npu_block_type == NpuBlockType.ElementWise:
+ cycles.op_macs = 0
+ cycles.op_cycles = int(_estimate_output_cycles_per_element(arch, op_type, faf_type, query)) * int(
+ query.ofm_shape.elements()
)
- ifm_block.width = min(ifm_block.width, ifm_tensor_shape.width)
- ifm_block.height = min(ifm_block.height, ifm_tensor_shape.height)
+ else:
+ assert False
- if npu_block_type in (
- NpuBlockType.ConvolutionMxN,
- NpuBlockType.ConvolutionDepthWise,
- NpuBlockType.VectorProduct,
- NpuBlockType.Pooling,
- NpuBlockType.ReduceSum,
- ):
- # extent the ifm to full dimension
+ return cycles
- batch_size = ifm_tensor_shape.batch
- # add in padding, height += top and bottom, width += left and right
- ifm_tensor_shape = ifm_tensor_shape.add(
- 0, explicit_padding[0] + explicit_padding[2], explicit_padding[1] + explicit_padding[3], 0
- )
+def measure_element_access(arch, query: PerformanceQuery):
+ access = ElementAccess()
- if npu_block_type != NpuBlockType.Pooling:
- if npu_block_type == NpuBlockType.ReduceSum:
- weight_tensor_shape = [1, 1, ifm_tensor.shape[3], ofm_tensor.shape[3]]
- weight_tensor_bandwidth_shape = [0] * 4
- weight_tensor_element_size = 0
- weight_tensor_bandwidth_compression_scale = 0.0
- else:
- # For Vector product, weight format of IO is extended to HWIO, with H=W=1
- weight_tensor_shape = numeric_util.full_shape(4, weight_tensor.shape, 1)
- weight_tensor_bandwidth_shape = numeric_util.full_shape(4, weight_tensor.bandwidth_shape, 1)
- weight_tensor_element_size = weight_tensor.element_size()
- weight_tensor_bandwidth_compression_scale = weight_tensor.bandwidth_compression_scale
-
- nn_ops = (
- int(ofm_tensor_shape.batch)
- * int(ofm_tensor_shape.height)
- * int(ofm_tensor_shape.width)
- * int(weight_tensor_shape[0])
- * int(weight_tensor_shape[1])
- * int(weight_tensor_shape[2])
- * int(weight_tensor_shape[3])
- )
- else:
- weight_tensor_shape = [
- *primary_op.get_kernel_size(),
- 1,
- ifm_tensor_shape.depth,
- ]
- weight_tensor_bandwidth_shape = weight_tensor_shape
- weight_tensor_element_size = 0
- weight_tensor_bandwidth_compression_scale = 0.0
- nn_ops = 0 # pooling doesn't count as NN ops
-
- kernel_dims = weight_tensor_shape[:2]
-
- sub_kernel_limits = arch.sub_kernel_limits[npu_block_type]
- # count the sub kernels; the IFM block needs to be refetched for each of them
- n_sub_kernels_y = numeric_util.round_up_divide(kernel_dims[0], sub_kernel_limits[0])
- n_sub_kernels_x = numeric_util.round_up_divide(kernel_dims[1], sub_kernel_limits[1])
- n_sub_kernels = n_sub_kernels_y * n_sub_kernels_x
-
- n_full_depth_stages = numeric_util.round_up_divide(weight_tensor_bandwidth_shape[3], ofm_block.depth)
- if npu_block_type in (NpuBlockType.ConvolutionDepthWise, NpuBlockType.Pooling):
- n_full_depth_stages = 1 # force to no reread
-
- ifm_read_multiple = n_sub_kernels * n_full_depth_stages
- replacement_read_bws[ifm_tensor] = ifm_tensor.bandwidth() * ifm_read_multiple
-
- weight_read_multiple = numeric_util.round_up_divide(
- ofm_tensor_shape.height, ofm_block.height
- ) * numeric_util.round_up_divide(ofm_tensor_shape.width, ofm_block.width)
- replacement_read_bws[weight_tensor] = (
- batch_size
- * shape_num_elements(weight_tensor_bandwidth_shape)
- * weight_tensor_element_size
- * weight_tensor_bandwidth_compression_scale
- * weight_read_multiple
- )
+ ifm_block = Shape4D.min(query.ifm_shape, query.config.ifm_block)
+ ofm_block = Shape4D.min(query.ofm_shape, query.config.ofm_block)
+ ifm_rounding = Shape4D(list(arch.storage_rounding_quantums[query.ifm_format]))
- macs += nn_ops
- cycles[PassCycles.Npu] = estimate_conv_pooling_cycles(
- arch,
- npu_block_type,
- primary_op,
- ifm_block,
- ofm_block,
- block_traversal,
- kernel_dims,
- ifm_tensor,
- ofm_tensor,
- ps.scale_tensor,
- )
- elif npu_block_type == NpuBlockType.ElementWise:
- # Work out how many elements we have and calculate performance.
- cycles[PassCycles.Npu] = estimate_output_cycles(
- arch,
- npu_block_type,
- primary_op,
- ofm_tensor.elements(),
- ps.ifm_tensor,
- ps.ofm_tensor,
- None,
- ps.ifm2_tensor,
- ofm_block,
- )
+ # Number of ofm blocks in the overall output shape
+ ofm_blocks = query.ofm_shape.div_round_up(ofm_block)
+ ofm_block_depth = ofm_block.depth
+ if query.npu_block_type in (NpuBlockType.ConvolutionDepthWise, NpuBlockType.Pooling):
+ ofm_blocks = ofm_blocks.with_depth(1)
+ ofm_block_depth = query.ifm_shape.depth
- prev_npu_pass = next((npu_ps for npu_ps in ps.dag_predecessors if npu_ps.placement is PassPlacement.Npu), None)
- if prev_npu_pass is None:
- # cycles for DMA ops in first pass
- dma_ops = (op for op in ps.ops if op.type == Op.DMA)
- for dma_op in dma_ops:
- mem_area = dma_op.attrs["source"]
- for tens in dma_op.inputs:
- cycles[PassCycles.Npu] += tens.storage_size() / arch.memory_bandwidths_per_cycle[mem_area]
-
- if rewrite_list is not None:
- # apply the desired rewrites
- for rewrite_op, tens, _, _, _, ps_to_rewrite in rewrite_list:
- if ps != ps_to_rewrite:
- continue
- if rewrite_op == SchedulerRewrite.Nop:
- pass # these are fine, no bandwidth changes
- elif rewrite_op in (SchedulerRewrite.ChangeTensorSubPurpose,):
- bws[arch.fast_storage_mem_area][tens.purpose][BandwidthDirection.Read] += replacement_read_bws[tens]
- if tens.purpose == TensorPurpose.FeatureMap:
- scaled_bw = estimate_memory_transfer_efficiency(
- arch,
- arch.fast_storage_mem_area,
- BandwidthDirection.Read,
- tens,
- ifm_block,
- replacement_read_bws[tens],
- )
- else:
- scaled_bw = replacement_read_bws[tens]
- scaled_bws[arch.fast_storage_mem_area][tens.purpose][BandwidthDirection.Read] += scaled_bw
- replacement_read_bws[tens] = 0
-
- for tens in ps.outputs:
- if force_outputs_to_fast_storage:
- bws[arch.fast_storage_mem_area][tens.purpose][BandwidthDirection.Write] += tens.bandwidth()
- scaled_bws[arch.fast_storage_mem_area][tens.purpose][
- BandwidthDirection.Write
- ] += estimate_memory_transfer_efficiency(
- arch, arch.fast_storage_mem_area, BandwidthDirection.Write, tens, ofm_block, shape4D=ps.ofm_shapes[0],
- )
- else:
- bws[tens.mem_area][tens.purpose][BandwidthDirection.Write] += tens.bandwidth()
- scaled_bws[tens.mem_area][tens.purpose][BandwidthDirection.Write] += estimate_memory_transfer_efficiency(
- arch, tens.mem_area, BandwidthDirection.Write, tens, ofm_block, shape4D=ps.ofm_shapes[0]
- )
+ # Convolution & pooling
+ if query.npu_block_type in (
+ NpuBlockType.ConvolutionMxN,
+ NpuBlockType.ConvolutionDepthWise,
+ NpuBlockType.VectorProduct,
+ NpuBlockType.Pooling,
+ NpuBlockType.ReduceSum,
+ ):
+ # Number of sub kernels
+ sub_kernel_limits = arch.sub_kernel_limits[query.npu_block_type]
+ subkernels = numeric_util.round_up_divide(query.kernel.width, sub_kernel_limits[0])
+ subkernels *= numeric_util.round_up_divide(query.kernel.height, sub_kernel_limits[1])
- for tens in ps.intermediates:
- bws[tens.mem_area][tens.purpose][BandwidthDirection.Write] += tens.bandwidth()
- scaled_bws[tens.mem_area][tens.purpose][BandwidthDirection.Write] += tens.bandwidth()
+ ofm_block_count = ofm_blocks.elements()
+
+ ifm_fetch = (
+ Shape4D.round_up(ifm_block, ifm_rounding).elements_wh()
+ * Shape4D.round_up(query.ifm_shape, ifm_rounding).depth
+ )
- if tens in replacement_read_bws:
- bw = replacement_read_bws[tens]
+ if query.npu_block_type in (NpuBlockType.ConvolutionDepthWise, NpuBlockType.Pooling):
+ kernel_read = query.kernel.elements_wh() * 1 # force to no reread
else:
- bw = tens.bandwidth()
+ kernel_read = query.kernel.elements_wh() * query.ifm_shape.depth
+
+ weight_fetch = kernel_read * ofm_block_depth * ofm_block_count
+
+ access.ifm_read[0] = ifm_fetch * subkernels * ofm_block_count
+
+ if query.npu_block_type not in (NpuBlockType.Pooling, NpuBlockType.ReduceSum):
+ access.const_read[0] = weight_fetch
+ access.const_read[1] = query.ofm_shape.depth # Scales & biases
+ access.weights_refetch = ofm_blocks.elements_wh()
+ # Elementwise
+ elif query.npu_block_type == NpuBlockType.ElementWise:
+ if query.ifm_shape.elements() == 1:
+ if query.ifm_bits > 8:
+ # ifm is a non 8-bit scalar
+ access.ifm_read[0] = Shape4D.round_up(query.ifm_shape, ifm_rounding).elements()
+ if query.ifm2_shape:
+ access.ifm_read[1] = Shape4D.round_up(query.ofm_shape, ifm_rounding).elements()
+ else:
+ access.ifm_read[0] = Shape4D.round_up(query.ofm_shape, ifm_rounding).elements()
+ if query.ifm2_shape:
+ if query.ifm2_shape.elements() > 1:
+ access.ifm_read[1] = Shape4D.round_up(query.ofm_shape, ifm_rounding).elements()
+ elif query.ifm2_bits > 8:
+ # ifm2 is a non 8-bit scalar
+ access.ifm_read[1] = Shape4D.round_up(query.ifm2_shape, ifm_rounding).elements()
+ # Unknown
+ else:
+ assert False
- bws[tens.mem_area][tens.purpose][BandwidthDirection.Read] += bw
- scaled_bws[tens.mem_area][tens.purpose][BandwidthDirection.Read] += bw
+ ofm_rounding = Shape4D(list(arch.storage_rounding_quantums[query.ofm_format]))
+ access.ofm_write = Shape4D.round_up(query.ofm_shape, ofm_rounding).elements()
+ return access
- for tens in ps.inputs:
- if tens in replacement_read_bws:
- bw = replacement_read_bws[tens]
- else:
- bw = tens.bandwidth()
- bws[tens.mem_area][tens.purpose][BandwidthDirection.Read] += bw
+def measure_performance_cost(
+ arch, op_type: Op, faf_type: Op, query: PerformanceQuery, offset: Shape4D, sub_shape: Shape4D
+):
+ assert (query.ofm_bits > 0) and (query.ifm_bits > 0)
+ assert query.ofm_shape.elements() != 0
- op_shape = None
- if ps.placement == PassPlacement.Npu and primary_op:
- if tens == ps.ifm_tensor:
- op_shape = ps.ifm_shapes[0]
- elif tens == ps.ifm2_tensor:
- op_shape = ps.ifm_shapes[1]
+ # Default to start if no offset provided
+ if offset is None:
+ offset = Shape4D(0, 0, 0, 0)
- scaled_bws[tens.mem_area][tens.purpose][BandwidthDirection.Read] += estimate_memory_transfer_efficiency(
- arch, tens.mem_area, BandwidthDirection.Read, tens, ifm_block, bw, op_shape
- )
+ # Default to entire area if no sub-shape provided
+ if sub_shape is None:
+ sub_shape = query.ofm_shape
+ else:
+ sub_shape = Shape4D.min(sub_shape, query.ofm_shape)
+
+ sub_query = copy.deepcopy(query)
+ sub_query.ofm_shape = query.ofm_shape.clip(offset, sub_shape)
- # quick build access counts for only current pass, even though these aren't the final numbers
- update_summary_cycles(arch, scaled_bws, cycles)
+ access = ElementAccess()
+ cycles = CycleCost()
- return bws, macs, cycles, ifm_read_multiple, weight_read_multiple
+ cycle_tmp = measure_cycle_cost(arch, op_type, faf_type, sub_query)
+ cycles += cycle_tmp
+ access = measure_element_access(arch, sub_query)
+
+ return access, cycles
+
+
+def make_bandwidth_array():
+ return np.zeros((MemArea.Size, TensorPurpose.Size, BandwidthDirection.Size))
+
+
+def make_cycles_array():
+ return np.zeros(PassCycles.Size)
def update_summary_cycles(arch, bws, cycles):
@@ -669,42 +569,169 @@ def update_summary_cycles(arch, bws, cycles):
return cycles
-def collate_stats_for_cascaded_pass(arch, bws, macs, cycles):
- return bws, macs, cycles
+def estimate_full_op_performance(
+ arch, schedule: Schedule, op: SchedulerOperation, prev_op: SchedulerOperation, block_config
+):
+ cycles_a = make_cycles_array()
+ bws = make_bandwidth_array()
+ scaled_bws = make_bandwidth_array() # scaled bw with memory transfer efficiency
+ macs = 0
+ query = PerformanceQuery(op.op_type.npu_block_type)
+ query.ifm_shape = op.ifm.shape
+ query.ifm_format = op.ifm.format
+ query.ifm_memory_area = op.ifm.mem_area
+ query.ifm_bits = op.ifm.dtype.size_in_bits()
+ query.ifm2_shape = op.ifm2 and op.ifm2.shape
+ query.ifm2_format = op.ifm2 and op.ifm2.format
+ query.ifm2_memory_area = op.ifm2 and op.ifm2.mem_area
+ query.ifm2_bits = op.ifm2 and op.ifm2.dtype.size_in_bits()
+ query.ofm_shape = op.ofm.shape
+ query.ofm_memory_area = op.ofm.mem_area
+ query.ofm_bits = op.ofm.dtype.size_in_bits()
+ query.ofm_format = op.ofm.format
+ query.kernel = op.kernel
+ query.config = block_config
+
+ cost = schedule.cost_map[op]
+ prev_cost = schedule.cost_map[prev_op] if prev_op else None
+ if op.parent_op.bias:
+ query.const_shape = Shape4D(1, 1, 1, op.ofm.shape.depth)
+ if cost.buffered_weight_tensor:
+ query.const_memory_area = cost.buffered_weight_tensor.mem_area
+ else:
+ query.const_memory_area = cost.npu_weights_tensor.mem_area
+
+ cycles = measure_cycle_cost(arch, op.op_type, op.parent_op.activation and op.parent_op.activation.op_type, query)
+ cycles_a[PassCycles.Npu] = cycles.op_cycles
+ macs = cycles.op_macs
+
+ access = measure_element_access(arch, query)
+
+ # How many NPU cycles are available under the previously executing
+ # operator for performing buffered DMA transfers
+ slack_cycles = prev_cost.slack_buffering_cycles if prev_cost else 0
+
+ # LUT Transfer
+ parent_op = op.parent_op
+ lut_transfer_cycles = 0
+ if parent_op.activation_lut:
+ lut_tensor = [tens for tens in parent_op.inputs if tens.purpose == TensorPurpose.LUT][0]
+ src_tensor = lut_tensor.src_tensor
+ if src_tensor and lut_tensor.mem_area != src_tensor.mem_area:
+ bw = src_tensor.storage_size()
+ lut_transfer_cycles = measure_mem2mem_cycles(arch, src_tensor.mem_area, lut_tensor.mem_area, bw)
+
+ bws[src_tensor.mem_area][lut_tensor.purpose][BandwidthDirection.Read] += bw
+ # LUT read from SHRAM TODO remove?
+ scaled_bws[lut_tensor.mem_area][lut_tensor.purpose][
+ BandwidthDirection.Read
+ ] += _estimate_memory_transfer_efficiency(
+ arch,
+ True,
+ lut_tensor.mem_area,
+ lut_tensor.format,
+ lut_tensor.element_size(),
+ query.config.ifm_block,
+ Shape4D(lut_tensor.shape),
+ bw,
+ )
-def performance_for_cascaded_pass(arch, cps):
- total_bws = make_bandwidth_array()
- total_macs = 0
- total_cycles = make_cycles_array()
+ if cost.npu_weights_tensor and cost.buffered_weight_tensor:
+ # DMA Weight Transfer
+ sz = 0
+ # Get the size of the first DMA
+ for core in range(0, arch.ncores):
+ key = WeightKey(core, 0)
+ if key in cost.npu_weights_tensor.encoded_ranges:
+ weight_range = cost.npu_weights_tensor.encoded_ranges[key]
+ sz += round_up(weight_range.total_bytes, 16)
+
+ total_sz = len(cost.npu_weights_tensor.buffer)
+ bws[cost.npu_weights_tensor.mem_area][TensorPurpose.Weights][BandwidthDirection.Read] += total_sz
+ bws[cost.buffered_weight_tensor.mem_area][TensorPurpose.Weights][BandwidthDirection.Write] += total_sz
+
+ ws_first_transfer_cycles = measure_mem2mem_cycles(
+ arch, cost.npu_weights_tensor.mem_area, cost.buffered_weight_tensor.mem_area, sz
+ )
+
+ # Add cycles for Weight + Scale Transfer
+ cycles_a[PassCycles.Npu] = max(
+ cost.full_weight_transfer_cycles - slack_cycles + cost.slack_buffering_cycles,
+ cycles.op_cycles + max(ws_first_transfer_cycles - slack_cycles, 0),
+ )
+
+ # Add cycles for LUT Transfer
+ cycles_a[PassCycles.Npu] += lut_transfer_cycles
+ else:
+ # Add cycles for LUT Transfer
+ cycles_a[PassCycles.Npu] += max(lut_transfer_cycles - slack_cycles, 0)
+
+ # OFM write
+ ofm = op.parent_op.ofm
+ bw = access.ofm_write * ofm.element_size()
+ bws[query.ofm_memory_area][ofm.purpose][BandwidthDirection.Write] += bw
+ scaled_bws[ofm.mem_area][ofm.purpose][BandwidthDirection.Write] += _estimate_memory_transfer_efficiency(
+ arch, False, query.ofm_memory_area, ofm.format, query.ofm_bits, query.config.ofm_block, query.ofm_shape, bw
+ )
+
+ # IFM read
+ ifm = op.parent_op.ifm
+ bw = access.ifm_read[0] * ifm.element_size()
+ bws[ifm.mem_area][ifm.purpose][BandwidthDirection.Read] += bw
+ scaled_bws[ifm.mem_area][ifm.purpose][BandwidthDirection.Read] += _estimate_memory_transfer_efficiency(
+ arch, True, query.ifm_memory_area, ifm.format, query.ifm_bits, query.config.ifm_block, query.ifm_shape, bw
+ )
+ if query.ifm2_shape:
+ ifm2 = op.parent_op.ifm2
+ bw = access.ifm_read[1] * ifm2.element_size()
+ bws[ifm2.mem_area][ifm2.purpose][BandwidthDirection.Read] += bw
+ scaled_bws[ifm2.mem_area][ifm2.purpose][BandwidthDirection.Read] += _estimate_memory_transfer_efficiency(
+ arch,
+ True,
+ query.ifm2_memory_area,
+ ifm2.format,
+ op.ifm2.dtype.size_in_bits(),
+ query.config.ifm_block,
+ query.ifm2_shape,
+ bw,
+ )
+
+ # Weight read
+ if access.const_read[0] > 0:
+ # alignment not accounted for in bandwidth_compression_scale_approx
+ encoded_size_approx = (
+ cost.npu_weights_tensor.elements() - access.const_read[1] * op.parent_op.bias.element_size()
+ )
+ orig_weight_size = parent_op.weights.elements()
+ bandwidth_compression_scale_approx = encoded_size_approx / orig_weight_size
+ bw = access.const_read[0] * bandwidth_compression_scale_approx
+ bws[query.const_memory_area][TensorPurpose.Weights][BandwidthDirection.Read] += bw
+
+ if access.const_read[1] > 0:
+ # Scales & biases
+ bw = access.const_read[1] * op.parent_op.bias.element_size()
+ bws[query.const_memory_area][TensorPurpose.FSBias][BandwidthDirection.Read] += bw
- for ps in cps.passes:
- bws, macs, cycles, _, _ = performance_metrics_for_pass(arch, ps)
- ps.bandwidths = bws
- ps.macs = macs
- ps.cycles = cycles
- total_bws += bws
- total_macs += macs
- total_cycles += cycles
+ update_summary_cycles(arch, scaled_bws, cycles_a)
- bws, macs, cycles = collate_stats_for_cascaded_pass(arch, total_bws, total_macs, total_cycles)
- cps.bandwidths = bws
- cps.macs = macs
- cps.cycles = cycles
- return bws, macs, cycles
+ return bws, macs, cycles_a
-def calc_performance_for_network(nng, arch):
+def calc_new_performance_for_network(nng, arch):
total_bws = make_bandwidth_array()
total_macs = 0
total_cycles = np.zeros(PassCycles.Size)
for sg in nng.subgraphs:
- for cps in sg.cascaded_passes:
- bws, macs, cycles = performance_for_cascaded_pass(arch, cps)
+ prev_op = None
+ for sched_op in sg.sched_ops:
+ op_info = sg.schedule.cost_map[sched_op]
+ bws, macs, cycles = estimate_full_op_performance(arch, sg.schedule, sched_op, prev_op, op_info.block_config)
total_bws += bws
total_macs += macs
total_cycles += cycles
+ prev_op = sched_op
nng.bandwidths = total_bws
nng.macs = total_macs
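
For context (not part of the patch): calc_new_performance_for_network is the new network-level entry point and is meant to run after scheduling has populated sg.schedule and sg.sched_ops for each subgraph. A minimal, hypothetical call-site sketch, assuming nng and arch are the usual graph and ArchitectureFeatures objects:

# Hypothetical call site; only calc_new_performance_for_network and the
# nng.macs / nng.bandwidths fields it sets come from the patch above.
from ethosu.vela import npu_performance


def report_totals(nng, arch):
    npu_performance.calc_new_performance_for_network(nng, arch)
    print("Total MACs:", nng.macs)
    # nng.bandwidths is indexed by (MemArea, TensorPurpose, BandwidthDirection)
    print("Total bytes moved:", nng.bandwidths.sum())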