Diffstat (limited to 'ethosu/vela/register_command_stream_generator.py')
-rw-r--r-- | ethosu/vela/register_command_stream_generator.py | 1970
1 file changed, 1073 insertions, 897 deletions
diff --git a/ethosu/vela/register_command_stream_generator.py b/ethosu/vela/register_command_stream_generator.py index e3fedfcc..30b5e04a 100644 --- a/ethosu/vela/register_command_stream_generator.py +++ b/ethosu/vela/register_command_stream_generator.py @@ -14,47 +14,72 @@ # See the License for the specific language governing permissions and # limitations under the License. # Description: -# Register level (low-level) command stream generation for Ethos-U55. Takes a high-level command stream and generates +# Register level (low-level) command stream generation for Ethos-U55. Takes a list of NPU operations and generates # all the register settings. Calculates dependencies between commands and inserts wait operations. And generates a bit # stream suitable for interpretation by the Ethos-U55 processor. from collections import defaultdict from collections import namedtuple from enum import Enum from enum import IntEnum +from typing import List +from typing import Optional import numpy as np +from . import numeric_util from . import scaling +from .api import NpuActivation +from .api import NpuActivationOp +from .api import NpuAddressRange +from .api import NpuBlockOperation +from .api import NpuBlockTraversal +from .api import NpuConv2DOperation +from .api import NpuDataType +from .api import NpuDmaOperation +from .api import NpuElementWiseOp +from .api import NpuElementWiseOperation +from .api import NpuFeatureMap +from .api import NpuKernel +from .api import NpuLayout +from .api import NpuOperation +from .api import NpuOperationType +from .api import NpuPadding +from .api import NpuPoolingOp +from .api import NpuPoolingOperation +from .api import NpuQuantization +from .api import NpuResamplingMode +from .api import NpuRoundingMode +from .api import NpuShape3D +from .api import NpuTileBox +from .architecture_features import Accelerator from .architecture_features import ArchitectureFeatures from .architecture_features import Block from .architecture_features import Rect from .architecture_features import SharedBufferArea from .architecture_features import SHRAMElements -from .data_type import BaseType -from .data_type import DataType from .debug_database import DebugDatabase from .ethos_u55_regs.ethos_u55_regs import acc_format from .ethos_u55_regs.ethos_u55_regs import activation from .ethos_u55_regs.ethos_u55_regs import cmd0 from .ethos_u55_regs.ethos_u55_regs import cmd1 from .ethos_u55_regs.ethos_u55_regs import elementwise_mode -from .ethos_u55_regs.ethos_u55_regs import ifm_precision from .ethos_u55_regs.ethos_u55_regs import pooling_mode from .ethos_u55_regs.ethos_u55_regs import resampling_mode from .ethos_u55_regs.ethos_u55_regs import rounding from .high_level_command_stream import CommandType -from .numeric_util import clamp_sigmoid -from .numeric_util import clamp_tanh -from .numeric_util import full_shape +from .high_level_command_to_npu_op import convert_command_to_npu_op +from .high_level_command_to_npu_op import to_kernel +from .high_level_command_to_npu_op import unary_elementwise_ops from .numeric_util import quantise_float32 from .numeric_util import round_away_zero from .numeric_util import round_up_to_int from .operation import NpuBlockType -from .operation import Op -from .tensor import MemType -from .tensor import TensorBlockTraversal -from .tensor import TensorFormat -from .tensor import TensorPurpose +from .range_set import AccessDirection +from .range_set import MemoryAccessSet +from .range_set import MemoryRangeSet +from .shared_buffer_allocation import 
find_suitable_block_configs +from .shared_buffer_allocation import shared_buffer_allocation_for_npu_op +from .shared_buffer_allocation import SharedBufferAllocation class RegisterMachine: @@ -80,22 +105,6 @@ class CmdMode(IntEnum): CmdOpMask = 0x03FF -class BasePointerIndex(IntEnum): - WeightTensor = 0 # base address index for the Weight tensor - ScratchTensor = 1 # base address index for the Scratch_tensor in the TensorArena - ScratchFastTensor = 2 # base address for the Scratch_fast_tensor - Mem2Mem = (1 << 8) | (3 << 0) # base address slot for memory 2 memory transfer - - -# TODO: Replace with definitions from ethos_u55_regs -class IFM2Broadcast(IntEnum): - BroadcastHdim = 1 << 0 - BroadcastWdim = 1 << 1 - BroadcastCdim = 1 << 2 - ReverseOperandOrder = 1 << 6 - UseIFM2Scalar = 1 << 7 - - class CommandStreamEmitter: WORD_SIZE = 4 @@ -117,7 +126,7 @@ class CommandStreamEmitter: sz += len(cmd) * CommandStreamEmitter.WORD_SIZE return sz - def to_list(self): + def to_list(self) -> List[int]: return [elem for cmd in self.cmd_stream for elem in cmd] def print_cmds(self): @@ -146,7 +155,7 @@ class CommandStreamEmitter: print(s) - def cmd0_with_param(self, cmd, param): + def cmd0_with_param(self, cmd: cmd0, param): if isinstance(param, Enum): param = int(param.value) else: @@ -160,7 +169,7 @@ class CommandStreamEmitter: self.cmd_stream.append((command,)) self.offset += CommandStreamEmitter.WORD_SIZE - def cmd1_with_offset(self, cmd, offset, param=0x0): + def cmd1_with_offset(self, cmd: cmd1, offset, param=0x0): offset = int(offset) & 0xFFFFFFFFF command = cmd.value | CmdMode.Payload32.value | (param << 16) @@ -171,13 +180,13 @@ class CommandStreamEmitter: self.cmd_stream.append((command, offset)) self.offset += CommandStreamEmitter.WORD_SIZE * 2 - def cmd_wait(self, cmd, channel, outstanding_count): + def cmd_wait(self, cmd: cmd0, channel: int, outstanding_count: int): param = (16 * channel) + outstanding_count command = ((param & 0xFFFF) << 16) | cmd.value self.cmd_stream.append((command,)) self.offset += CommandStreamEmitter.WORD_SIZE - def cmd_do_operation(self, cmd, param=0): + def cmd_do_operation(self, cmd: cmd0, param=0): param = int(param) command = ((param & 0xFFFF) << 16) | cmd.value @@ -186,13 +195,674 @@ class CommandStreamEmitter: self.get_reg_machine(cmd).switch_bank() +# ------------------------------------------------------------------- +# REGISTER GENERATION +# ------------------------------------------------------------------- + + +class BasePointerIndex(IntEnum): + WeightTensor = 0 # base address index for the Weight tensor + ScratchTensor = 1 # base address index for the Scratch_tensor in the TensorArena + ScratchFastTensor = 2 # base address for the Scratch_fast_tensor + Mem2Mem = (1 << 8) | (3 << 0) # base address slot for memory 2 memory transfer + + +# TODO: Replace with definitions from ethos_u55_regs +class IFM2Broadcast(IntEnum): + BroadcastHdim = 1 << 0 + BroadcastWdim = 1 << 1 + BroadcastCdim = 1 << 2 + ReverseOperandOrder = 1 << 6 + UseIFM2Scalar = 1 << 7 + + +pooling_op_map = { + NpuPoolingOp.MAX: pooling_mode.MAX.value, + NpuPoolingOp.AVERAGE: pooling_mode.AVERAGE.value, + NpuPoolingOp.REDUCE_SUM: pooling_mode.REDUCE_SUM.value, +} + +elementwise_op_map = { + NpuElementWiseOp.MUL: elementwise_mode.MUL.value, + NpuElementWiseOp.ADD: elementwise_mode.ADD.value, + NpuElementWiseOp.SUB: elementwise_mode.SUB.value, + NpuElementWiseOp.MIN: elementwise_mode.MIN.value, + NpuElementWiseOp.MAX: elementwise_mode.MAX.value, + NpuElementWiseOp.LRELU: 
elementwise_mode.LRELU.value, + NpuElementWiseOp.ABS: elementwise_mode.ABS.value, + NpuElementWiseOp.CLZ: elementwise_mode.CLZ.value, + NpuElementWiseOp.SHR: elementwise_mode.SHR.value, + NpuElementWiseOp.SHL: elementwise_mode.SHL.value, +} + +activation_op_map = { + NpuActivationOp.NONE_OR_RELU: activation.NONE, + NpuActivationOp.TANH: activation.TANH, + NpuActivationOp.SIGMOID: activation.SIGMOID, +} + +# Maps an AccumulatorType enum to the corresponding acc_format value +acc_format_map = { + SHRAMElements.Acc16: acc_format.FP_S5_10.value, + SHRAMElements.Acc32: acc_format.INT_32BIT.value, + SHRAMElements.Acc40: acc_format.INT_40BIT.value, +} + +resampling_mode_map = { + NpuResamplingMode.NONE: resampling_mode.NONE, + NpuResamplingMode.NEAREST: resampling_mode.NEAREST, + NpuResamplingMode.TRANSPOSE: resampling_mode.TRANSPOSE, +} + +# Maps data type size in bits to activation precision +precision_map = {8: 0, 16: 1, 32: 2} + +# Maps rounding mode to the corresponding value +rounding_mode_map = { + NpuRoundingMode.TFL: rounding.TFL.value, + NpuRoundingMode.TRUNCATE: rounding.TRUNCATE.value, + NpuRoundingMode.NATURAL: rounding.NATURAL.value, +} + + +def quantise(value: float, quant: Optional[NpuQuantization]) -> int: + """Quantizes the given value""" + scale = 1 if quant is None or quant.scale_f32 is None else quant.scale_f32 + zp = 0 if quant is None else quant.zero_point + return quantise_float32(value, scale, zp) + + +def has_ifm2(npu_op: NpuBlockOperation) -> bool: + """Checks if op has non-scalar IFM2""" + return npu_op.ifm2 is not None and npu_op.ifm2_scalar is None + + +def is_dma_op(npu_op: NpuOperation) -> bool: + """Checks if op is a DMA operation""" + return npu_op.op_type == NpuOperationType.Dma + + +def generate_padding(emit: CommandStreamEmitter, padding: NpuPadding): + """Generates IFM_PAD registers""" + emit.cmd0_with_param(cmd0.NPU_SET_IFM_PAD_TOP, padding.top) + emit.cmd0_with_param(cmd0.NPU_SET_IFM_PAD_LEFT, padding.left) + emit.cmd0_with_param(cmd0.NPU_SET_IFM_PAD_BOTTOM, padding.bottom) + emit.cmd0_with_param(cmd0.NPU_SET_IFM_PAD_RIGHT, padding.right) + + +def generate_activation(emit: CommandStreamEmitter, activation: Optional[NpuActivation], ofm: NpuFeatureMap): + """Generates ACTIVATION registers""" + act = activation if activation is not None else NpuActivation(NpuActivationOp.NONE_OR_RELU) + + if act.min is None: + quantized_min = ofm.data_type.min_value() + else: + quantized_min = quantise(act.min, ofm.quantization) + if act.max is None: + quantized_max = ofm.data_type.max_value() + else: + quantized_max = quantise(act.max, ofm.quantization) + quantized_min = max(quantized_min, np.iinfo(np.int16).min, ofm.data_type.min_value()) + quantized_max = min(quantized_max, np.iinfo(np.int16).max, ofm.data_type.max_value()) + if act.op_type == NpuActivationOp.TABLE_LOOKUP: + assert 0 <= act.lookup_table_index < 8 + activation_value = 16 + act.lookup_table_index + if ofm.data_type == NpuDataType.INT32: + activation_value |= 3 << 12 # Force I8 range + quantized_min = max(-128, quantized_min) + quantized_max = min(127, quantized_max) + else: + activation_value = activation_op_map[act.op_type] + emit.cmd0_with_param(cmd0.NPU_SET_ACTIVATION, activation_value) + emit.cmd0_with_param(cmd0.NPU_SET_ACTIVATION_MIN, quantized_min) + emit.cmd0_with_param(cmd0.NPU_SET_ACTIVATION_MAX, quantized_max) + + +def generate_addresses(emit: CommandStreamEmitter, ptr_cmds: List[cmd1], addresses: List[int], layout: NpuLayout): + """Generates xFM_BASE registers""" + if layout == NpuLayout.NHCWB16: 
+ # Check that all BasePointer addresses are aligned to 16 bytes + assert all((int(addr) % 16) == 0 for addr in addresses) + emit.cmd1_with_offset(ptr_cmds[0], addresses[0]) + emit.cmd1_with_offset(ptr_cmds[1], addresses[1]) + emit.cmd1_with_offset(ptr_cmds[2], addresses[2]) + emit.cmd1_with_offset(ptr_cmds[3], addresses[3]) + + +def generate_tiles(emit: CommandStreamEmitter, tile_cmds: List[cmd0], tiles: NpuTileBox): + """Generates xFM_HEIGHT0/HEIGHT1/WIDTH0 registers""" + emit.cmd0_with_param(tile_cmds[0], tiles.height_0 - 1) + emit.cmd0_with_param(tile_cmds[1], tiles.height_1 - 1) + emit.cmd0_with_param(tile_cmds[2], tiles.width_0 - 1) + + +def generate_strides( + emit: CommandStreamEmitter, fm: NpuFeatureMap, stride_c_cmd: cmd1, stride_y_cmd: cmd1, stride_x_cmd: cmd1 +): + """Generates STRIDE_C/Y/X registers""" + strides = get_strides(fm) + emit.cmd1_with_offset(stride_c_cmd, strides.depth) # stride between 16-byte channel blocks (C) + emit.cmd1_with_offset(stride_y_cmd, strides.height) # stride between vertical values (H) + emit.cmd1_with_offset(stride_x_cmd, strides.width) # stride between horisontal values (W) + + +def generate_ifm_precision(emit: CommandStreamEmitter, fm: NpuFeatureMap, op_to_scale: int, precision_cmd: cmd0): + """Generates IFM/IFM2_PRECISION register""" + dtype = fm.data_type + prec = 1 if dtype.is_signed() else 0 + activation_precision = precision_map[dtype.size_in_bits()] + prec += activation_precision << 2 + + if fm.layout == NpuLayout.NHCWB16: + prec |= 1 << 6 + + prec |= op_to_scale << 8 + emit.cmd0_with_param(precision_cmd, prec) + + +def generate_ofm_precision(emit: CommandStreamEmitter, npu_op: NpuBlockOperation, use_global_scale: bool): + """Generates OFM_PRECISION register""" + dtype = npu_op.ofm.data_type + prec = 1 if dtype.is_signed() else 0 + activation_precision = precision_map[dtype.size_in_bits()] + prec += activation_precision << 1 + + if use_global_scale: + # Set global scale bit, as opposed to using per channel scale + prec |= 1 << 8 + if npu_op.ofm.layout == NpuLayout.NHCWB16: + prec |= 1 << 6 + prec |= rounding_mode_map[npu_op.rounding_mode] << 14 + emit.cmd0_with_param(cmd0.NPU_SET_OFM_PRECISION, prec) + + +def generate_ifm2_broadcast(emit: CommandStreamEmitter, npu_op: NpuElementWiseOperation): + """Generates IFM2_BROADCAST register for binary elementwise operations""" + ifm2_broadcast = 0 + ifm = npu_op.ifm + ifm2 = npu_op.ifm2 + if npu_op.reversed_operands: + ifm2_broadcast |= IFM2Broadcast.ReverseOperandOrder + if npu_op.ifm2_scalar is not None: + # IFM2 is a constant, set UseIFM2Scalar bit to IFM2_BROADCAST + ifm2_broadcast |= IFM2Broadcast.UseIFM2Scalar + else: + if ifm.shape.height != ifm2.shape.height: + # Broadcast in 'H' dimension + assert ifm2.shape.height == 1 + ifm2_broadcast |= IFM2Broadcast.BroadcastHdim + + if ifm.shape.width != ifm2.shape.width: + # Broadcast in 'W' dimension + assert ifm2.shape.width == 1 + ifm2_broadcast |= IFM2Broadcast.BroadcastWdim + + if ifm.shape.depth != ifm2.shape.depth: + # Broadcast in 'C' dimension + assert ifm2.shape.depth == 1 + ifm2_broadcast |= IFM2Broadcast.BroadcastCdim + + emit.cmd0_with_param(cmd0.NPU_SET_IFM2_BROADCAST, ifm2_broadcast) + + +def generate_ifm(emit: CommandStreamEmitter, ifm: NpuFeatureMap): + """Generates general IFM registers""" + emit.cmd0_with_param(cmd0.NPU_SET_IFM_REGION, ifm.region) + generate_addresses( + emit, + [cmd1.NPU_SET_IFM_BASE0, cmd1.NPU_SET_IFM_BASE1, cmd1.NPU_SET_IFM_BASE2, cmd1.NPU_SET_IFM_BASE3], + ifm.tiles.addresses, + ifm.layout, + ) + 
generate_tiles( + emit, [cmd0.NPU_SET_IFM_HEIGHT0_M1, cmd0.NPU_SET_IFM_HEIGHT1_M1, cmd0.NPU_SET_IFM_WIDTH0_M1], ifm.tiles + ) + emit.cmd0_with_param(cmd0.NPU_SET_IFM_DEPTH_M1, ifm.shape.depth - 1) + generate_strides(emit, ifm, cmd1.NPU_SET_IFM_STRIDE_C, cmd1.NPU_SET_IFM_STRIDE_Y, cmd1.NPU_SET_IFM_STRIDE_X) + emit.cmd0_with_param(cmd0.NPU_SET_IFM_ZERO_POINT, int(ifm.quantization.zero_point)) + + +def generate_ifm2(emit: CommandStreamEmitter, ifm2: NpuFeatureMap, has_scalar: bool): + """Generates general IFM2 registers""" + if not has_scalar: + emit.cmd0_with_param(cmd0.NPU_SET_IFM2_REGION, ifm2.region) + generate_addresses( + emit, + [cmd1.NPU_SET_IFM2_BASE0, cmd1.NPU_SET_IFM2_BASE1, cmd1.NPU_SET_IFM2_BASE2, cmd1.NPU_SET_IFM2_BASE3], + ifm2.tiles.addresses, + ifm2.layout, + ) + generate_tiles( + emit, [cmd0.NPU_SET_IFM2_HEIGHT0_M1, cmd0.NPU_SET_IFM2_HEIGHT1_M1, cmd0.NPU_SET_IFM2_WIDTH0_M1], ifm2.tiles + ) + generate_strides(emit, ifm2, cmd1.NPU_SET_IFM2_STRIDE_C, cmd1.NPU_SET_IFM2_STRIDE_Y, cmd1.NPU_SET_IFM2_STRIDE_X) + emit.cmd0_with_param(cmd0.NPU_SET_IFM2_ZERO_POINT, int(ifm2.quantization.zero_point)) + + +def generate_ofm(emit: CommandStreamEmitter, ofm: NpuFeatureMap): + """Generates general OFM registers""" + emit.cmd0_with_param(cmd0.NPU_SET_OFM_REGION, ofm.region) + generate_addresses( + emit, + [cmd1.NPU_SET_OFM_BASE0, cmd1.NPU_SET_OFM_BASE1, cmd1.NPU_SET_OFM_BASE2, cmd1.NPU_SET_OFM_BASE3], + ofm.tiles.addresses, + ofm.layout, + ) + generate_tiles( + emit, [cmd0.NPU_SET_OFM_HEIGHT0_M1, cmd0.NPU_SET_OFM_HEIGHT1_M1, cmd0.NPU_SET_OFM_WIDTH0_M1], ofm.tiles + ) + emit.cmd0_with_param(cmd0.NPU_SET_OFM_HEIGHT_M1, ofm.shape.height - 1) + emit.cmd0_with_param(cmd0.NPU_SET_OFM_WIDTH_M1, ofm.shape.width - 1) + emit.cmd0_with_param(cmd0.NPU_SET_OFM_DEPTH_M1, ofm.shape.depth - 1) + generate_strides(emit, ofm, cmd1.NPU_SET_OFM_STRIDE_C, cmd1.NPU_SET_OFM_STRIDE_Y, cmd1.NPU_SET_OFM_STRIDE_X) + emit.cmd0_with_param(cmd0.NPU_SET_OFM_ZERO_POINT, int(ofm.quantization.zero_point)) + + +def generate_kernel(emit: CommandStreamEmitter, kernel: NpuKernel, block_traversal: NpuBlockTraversal): + """Generates KERNEL related registers""" + emit.cmd0_with_param(cmd0.NPU_SET_KERNEL_HEIGHT_M1, kernel.dilation_y * (kernel.height - 1)) + emit.cmd0_with_param(cmd0.NPU_SET_KERNEL_WIDTH_M1, kernel.dilation_x * (kernel.width - 1)) + # set kernel x stride low bit + stride = (kernel.stride_x - 1) & 1 + # set kernel y stride low bit + stride |= (kernel.stride_y - 1 & 1) << 1 + # set kernel x stride extension bits + stride |= (kernel.stride_x - 1 >> 1) << 6 + # set kernel y stride extension bits + stride |= (kernel.stride_y - 1 >> 1) << 9 + stride |= (kernel.dilation_x - 1) << 3 + stride |= (kernel.dilation_y - 1) << 4 + if block_traversal == NpuBlockTraversal.PART_KERNEL_FIRST: + stride |= 1 << 2 + emit.cmd0_with_param(cmd0.NPU_SET_KERNEL_STRIDE, stride) + + +def generate_weights(emit: CommandStreamEmitter, weights: List[NpuAddressRange], arch: ArchitectureFeatures): + """Generates WEIGHT registers""" + if len(weights) == 0: + return + emit.cmd0_with_param(cmd0.NPU_SET_WEIGHT_REGION, weights[0].region) + # Set weights sources for active and present cores + for core, (addr, length) in enumerate( + [ + (cmd1.NPU_SET_WEIGHT_BASE, cmd1.NPU_SET_WEIGHT_LENGTH), + (cmd1.NPU_SET_WEIGHT1_BASE, cmd1.NPU_SET_WEIGHT1_LENGTH), + ] + ): + if core < len(weights): + emit.cmd1_with_offset(addr, weights[core].address) + emit.cmd1_with_offset(length, weights[core].length) + elif core < arch.ncores: + emit.cmd1_with_offset(addr, 
weights[0].address) + emit.cmd1_with_offset(length, 0) + + +def generate_biases(emit: CommandStreamEmitter, biases: List[NpuAddressRange], arch: ArchitectureFeatures): + """Generates SCALE registers""" + if len(biases) == 0: + return + emit.cmd0_with_param(cmd0.NPU_SET_SCALE_REGION, biases[0].region) + # Set weights sources for active and present cores + for core, (addr, length) in enumerate( + [(cmd1.NPU_SET_SCALE_BASE, cmd1.NPU_SET_SCALE_LENGTH), (cmd1.NPU_SET_SCALE1_BASE, cmd1.NPU_SET_SCALE1_LENGTH)] + ): + if core < len(biases): + emit.cmd1_with_offset(addr, biases[core].address) + emit.cmd1_with_offset(length, biases[core].length) + elif core < arch.ncores: + emit.cmd1_with_offset(addr, biases[0].address) + emit.cmd1_with_offset(length, 0) + + +def generate_block_config( + emit: CommandStreamEmitter, + npu_op: NpuBlockOperation, + arch: ArchitectureFeatures, + shared_buffer: SharedBufferAllocation, +) -> NpuShape3D: + """Selects a suitable block config if none has been set, and generates OFM_BLK_HEIGHT/WIDTH/DEPTH registers""" + block_config = npu_op.block_config + if block_config is None or block_config.height < 0: + # Note: this code only used if the public API to generate command streams is used; + # in the "normal" flow, the block config selected by the scheduler is used + if npu_op.weights: + assert block_config is not None, "block_config.depth must be provided for ops with weights" + # Block config has not been provided: find one + blocks = find_suitable_block_configs(arch, shared_buffer) + # Return the block with biggest volume + # TODO: use a better algorithm to find the best block + best_block = None + best_value = 0 + for block in blocks: + if block_config is not None and block[3] != block_config.depth: + continue + value = block[0] * block[1] * block[3] + if value > best_value: + best_value = value + best_block = block + assert best_block is not None, f"No suitable block config was found, {npu_op.op_type}" + block_config = NpuShape3D(height=best_block[0], width=best_block[1], depth=best_block[3]) + alloc = shared_buffer.try_block(Block(block_config.width, block_config.height, block_config.depth)) + assert alloc is not None, f"Block config {block_config} does not fit, op: {npu_op.op_type}" + emit.cmd0_with_param(cmd0.NPU_SET_OFM_BLK_HEIGHT_M1, block_config.height - 1) + emit.cmd0_with_param(cmd0.NPU_SET_OFM_BLK_WIDTH_M1, block_config.width - 1) + emit.cmd0_with_param(cmd0.NPU_SET_OFM_BLK_DEPTH_M1, block_config.depth - 1) + return block_config + + +def generate_shram_registers_elementwise( + emit: CommandStreamEmitter, + npu_op: NpuElementWiseOperation, + arch: ArchitectureFeatures, + shared_buffer: SharedBufferAllocation, +): + """Generates IB_END/IB_START/AB_START registers for elementwise operations""" + # For elementwise set the required SHRAM to be equal to the total size of available SHRAM + uses_lut = npu_op.activation is not None and npu_op.activation.op_type == NpuActivationOp.TABLE_LOOKUP + shram_required = arch.available_shram_banks(uses_lut) + + # Acc buffers not needed so set AB_START to size of SHRAM + emit.cmd0_with_param(cmd0.NPU_SET_IFM_IB_END, shram_required) + emit.cmd0_with_param(cmd0.NPU_SET_AB_START, shram_required) + if has_ifm2(npu_op): + # Set IFM2_IB_START to the latter half of the IB space + ifm_ib_start = shared_buffer.bank_locations[SharedBufferArea.IFM] + emit.cmd0_with_param( + cmd0.NPU_SET_IFM2_IB_START, (shram_required - ifm_ib_start) // shared_buffer.ifm_count + ifm_ib_start, + ) + emit.cmd0_with_param(cmd0.NPU_SET_ACC_FORMAT, 
acc_format_map[shared_buffer.use_accumulator_element]) + + +def generate_shram_registers_non_elementwise(emit: CommandStreamEmitter, shared_buffer: SharedBufferAllocation): + """Generates IB_END/IB_START/AB_START registers for non-elementwise operations""" + emit.cmd0_with_param( + cmd0.NPU_SET_IFM_IB_END, + shared_buffer.bank_locations[SharedBufferArea.IFM] + shared_buffer.banks_required[SharedBufferArea.IFM], + ) + emit.cmd0_with_param(cmd0.NPU_SET_AB_START, shared_buffer.bank_locations[SharedBufferArea.Accumulators]) + emit.cmd0_with_param(cmd0.NPU_SET_ACC_FORMAT, acc_format_map[shared_buffer.use_accumulator_element]) + + +def generate_common( + emit: CommandStreamEmitter, + npu_op: NpuBlockOperation, + block_traversal: NpuBlockTraversal, + arch: ArchitectureFeatures, + use_global_scale: bool = False, + op_to_scale: int = 0, +): + """Generate registers that are common to most operations""" + assert npu_op.ifm is not None and npu_op.ofm is not None + generate_ifm(emit, npu_op.ifm) + generate_ifm_precision(emit, npu_op.ifm, op_to_scale, cmd0.NPU_SET_IFM_PRECISION) + emit.cmd0_with_param(cmd0.NPU_SET_IFM_UPSCALE, resampling_mode_map[npu_op.ifm_upscale]) + if npu_op.padding is not None: + generate_padding(emit, npu_op.padding) + generate_ofm(emit, npu_op.ofm) + generate_ofm_precision(emit, npu_op, use_global_scale) + if npu_op.op_type != NpuOperationType.ElementWise: + assert npu_op.kernel is not None + generate_kernel(emit, npu_op.kernel, block_traversal) + generate_weights(emit, npu_op.weights, arch) + generate_biases(emit, npu_op.biases, arch) + generate_activation(emit, npu_op.activation, npu_op.ofm) + + +# ------------------------------------------------------------------- +# SCALING +# ------------------------------------------------------------------- + + +def generate_ofm_scaling_for_pooling(emit: CommandStreamEmitter, pool_op: NpuPoolingOperation): + """Generates OFM_SCALE register for pooling operations""" + # For valid padding vela has to output scaling values + kernel = pool_op.kernel + ifm_quant = pool_op.ifm.quantization + ofm_quant = pool_op.ofm.quantization + if pool_op.activation is not None and pool_op.activation.op_type in (NpuActivationOp.SIGMOID, NpuActivationOp.TANH): + assert ifm_quant.scale_f32 is not None + rescale = 0x3000 * ifm_quant.scale_f32 + if pool_op.ifm.data_type == NpuDataType.INT16: + # Calculate scale and shift for the output scale of 1/(3*4096) + shift = 0 + max_rescale = np.iinfo(np.int16).max / 2 + while rescale <= max_rescale and shift <= 30: + shift += 1 + rescale *= 2 + scale = int(rescale) + else: + rescale_bits = len(bin(round_up_to_int(rescale))) - 2 + 1 + scale, shift = scaling.quantise_pooling_scale(kernel.height * kernel.width, rescale_bits) + scale = int(round_away_zero(scale * rescale)) + elif pool_op.fused_quantize: + # Quantize op requires different scaling + ifm_scale_f64 = np.double(ifm_quant.scale_f32) + ofm_scale_f64 = np.double(ofm_quant.scale_f32) + scale, shift = scaling.quantise_scale(ifm_scale_f64 / ofm_scale_f64) + elif pool_op.rescale is not None: + # for ResizeBilinear operations with "rescale" in primary_op.attrs + rescale = pool_op.rescale + rescale_bits = len(bin(round_up_to_int(rescale))) - 2 + 1 + scale, shift = scaling.quantise_pooling_scale(kernel.height * kernel.width, rescale_bits) + scale = int(round_away_zero(scale * rescale)) + else: + # In case avg pool fused with concat or other memory operation, rescaling might be needed. 
+ # kernel height == kernel width == 1 is always true in this case + # Normally the scale is maximised, to get maximum precision, which means that + # if rescale != 1, scale need to consider the number of bits needed for rescaling + if ofm_quant.scale_f32 is not None and ifm_quant.scale_f32 is not None: + rescale = ifm_quant.scale_f32 / ofm_quant.scale_f32 + rescale_bits = 0 + if kernel.height == kernel.width == 1: + if rescale > 1: + rescale_bits = len(bin(round_up_to_int(rescale))) - 2 + 1 + elif rescale < 1: + rescale_bits = -(len(bin(round_up_to_int(1 / rescale))) - 2 - 1) + scale, shift = scaling.quantise_pooling_scale(kernel.height * kernel.width, rescale_bits) + scale = int(round_away_zero(scale * rescale)) + else: + scale = 1 + shift = 0 + + emit.cmd1_with_offset(cmd1.NPU_SET_OFM_SCALE, scale, shift) + + +def generate_scaling_for_elementwise(emit: CommandStreamEmitter, npu_op: NpuElementWiseOperation) -> int: + """ + Generates OFM/OPA/OPB_SCALE registers for elementwise operators. + Returns the operator to scale + """ + op_to_scale = 0 + if npu_op.sub_op_type in (NpuElementWiseOp.ADD, NpuElementWiseOp.MUL, NpuElementWiseOp.SUB): + input_scale = npu_op.ifm.quantization.scale_f32 if npu_op.ifm.quantization else None + input2_scale = npu_op.ifm2.quantization.scale_f32 if npu_op.ifm2.quantization else None + output_scale = npu_op.ofm.quantization.scale_f32 if npu_op.ofm.quantization else None + + if npu_op.activation is not None and npu_op.activation.op_type in ( + NpuActivationOp.SIGMOID, + NpuActivationOp.TANH, + ): + output_scale = 1 / 0x3000 + + if npu_op.sub_op_type == NpuElementWiseOp.MUL: + if None in (input_scale, input2_scale, output_scale): + ofm_scale = 1 + shift = 0 + else: + ofm_scale, shift = scaling.elementwise_mul_scale(input_scale, input2_scale, output_scale) + emit.cmd1_with_offset(cmd1.NPU_SET_OFM_SCALE, ofm_scale, shift) + else: # Add/Sub + if None in (input_scale, input2_scale, output_scale): + opa_scale = opb_scale = ofm_scale = 1 + opa_shift = shift = 0 + if npu_op.rescale is not None: + ofm_scale, shift = npu_op.rescale + elif input_scale == input2_scale: + opa_scale, opb_scale, ofm_scale, shift = scaling.simplified_elementwise_add_sub_scale( + input_scale, input2_scale, output_scale + ) + opa_shift = 0 # Unused for this case + else: + # Use advanced implementation only when input scales differ + bitdepth = npu_op.ifm.data_type.size_in_bits() + (opa_scale, opa_shift, ofm_scale, shift, op_to_scale,) = scaling.advanced_elementwise_add_sub_scale( + input_scale, input2_scale, output_scale, bitdepth + ) + opb_scale = 0 # Unused for this case + if npu_op.reversed_operands: + # If the operand order is reversed we also have to swap which operand is scaled + if op_to_scale == scaling.OperandToScale.OPa: + op_to_scale = scaling.OperandToScale.OPb + else: + op_to_scale = scaling.OperandToScale.OPa + emit.cmd1_with_offset(cmd1.NPU_SET_OPA_SCALE, opa_scale, opa_shift) + emit.cmd1_with_offset(cmd1.NPU_SET_OPB_SCALE, opb_scale) + emit.cmd1_with_offset(cmd1.NPU_SET_OFM_SCALE, ofm_scale, shift) + elif npu_op.sub_op_type in (NpuElementWiseOp.LRELU, NpuElementWiseOp.ABS): + output_scale = npu_op.ofm.quantization.scale_f32 + ofm_scale, shift = scaling.quantise_scale(output_scale) + emit.cmd1_with_offset(cmd1.NPU_SET_OFM_SCALE, ofm_scale, shift) + else: + emit.cmd1_with_offset(cmd1.NPU_SET_OFM_SCALE, 1, 0) + return op_to_scale + + +# ------------------------------------------------------------------- +# ADDRESSING/STRIDES (helper functions) +# 
------------------------------------------------------------------- + + +def ranges_overlap(range1: NpuAddressRange, range2: NpuAddressRange) -> bool: + """Checks if the ranges overlap""" + return range1.region == range2.region and numeric_util.overlaps( + range1.address, range1.address + range1.length, range2.address, range2.address + range2.length + ) + + +def get_strides(fm: NpuFeatureMap) -> NpuShape3D: + """Calculates STRIDE_C/Y/X""" + if fm.strides is not None: + return fm.strides + elem_size = fm.data_type.size_in_bytes() + if fm.layout == NpuLayout.NHWC: + stride_c = elem_size + stride_x = fm.shape.depth * stride_c + stride_y = fm.shape.width * stride_x + else: + stride_x = 16 * elem_size + stride_c = stride_x * fm.shape.width + stride_y = elem_size * fm.shape.width * numeric_util.round_up(fm.shape.depth, 16) + return NpuShape3D(depth=stride_c, height=stride_y, width=stride_x) + + +def get_address(fm: NpuFeatureMap, strides: NpuShape3D, y: int, x: int, c: int) -> int: + """Returns address of given coordinate""" + t = 0 + BRICK = 16 + stride_c = BRICK * fm.data_type.size_in_bytes() if fm.layout == NpuLayout.NHWC else strides.depth + stride_x = BRICK * fm.data_type.size_in_bytes() if fm.layout == NpuLayout.NHCWB16 else strides.width + if x >= fm.tiles.width_0: + x -= fm.tiles.width_0 + t = 1 + if y >= fm.tiles.height_1: + y -= fm.tiles.height_1 + t += 2 + elif y >= fm.tiles.height_0: + y -= fm.tiles.height_0 + t += 2 + elem_size = fm.data_type.size_in_bytes() + return ( + fm.tiles.addresses[t] + y * strides.height + x * stride_x + (c // BRICK) * stride_c + int(c % BRICK) * elem_size + ) + + +def get_address_range( + fm: NpuFeatureMap, strides: NpuShape3D, y0: int, x0: int, c0: int, y1: int, x1: int, c1: int +) -> NpuAddressRange: + """Gets address range for (y0, x0, c0) - (y1, x1, c1)""" + addr0 = get_address(fm, strides, y0, x0, c0) + addr1 = get_address(fm, strides, y1, x1, c1) + return NpuAddressRange(region=fm.region, address=addr0, length=addr1 - addr0 + fm.data_type.size_in_bytes()) + + +def get_address_ranges(fm: NpuFeatureMap) -> List[Optional[NpuAddressRange]]: + """Returns 4 adddress ranges, one for every tile, None if the tile is not in use""" + strides = get_strides(fm) + height, width, depth = fm.shape.height, fm.shape.width, fm.shape.depth + height_0, height_1, width_0 = fm.tiles.height_0, fm.tiles.height_1, fm.tiles.width_0 + t0 = get_address_range(fm, strides, 0, 0, 0, min(height, height_0) - 1, min(width, width_0) - 1, depth - 1,) + if width > width_0: + t1 = get_address_range(fm, strides, 0, width_0, 0, min(height, height_1) - 1, width - 1, depth - 1) + else: + t1 = None + if height > height_0: + t2 = get_address_range(fm, strides, height_0, 0, 0, height - 1, min(width, width_0) - 1, depth - 1) + else: + t2 = None + if t1 is not None and t2 is not None: + t3 = get_address_range(fm, strides, height_0, width_0, 0, height - 1, width - 1, depth - 1) + else: + t3 = None + return [t0, t1, t2, t3] + + +# ------------------------------------------------------------------- +# DMA_WAIT/KERNEL_WAIT +# ------------------------------------------------------------------- + + Watermark = namedtuple("Watermark", ["npu", "dma"]) -def get_cmd_wait_dependency(arch, cmd_stream, memory_accesses, cmd_index, watermark: Watermark): - cmd = cmd_stream[cmd_index] - cmd_access = memory_accesses[cmd] - index = cmd_index - 1 +def memory_range_set(range: NpuAddressRange) -> MemoryRangeSet: + return MemoryRangeSet(range.region, range.address, range.address + range.length) + + +def 
get_dma_memory_accesses(dma_op: NpuDmaOperation) -> MemoryAccessSet: + """Returns the address that are read and written by the given DMA operation""" + res = MemoryAccessSet() + res.add(memory_range_set(dma_op.src), AccessDirection.Read) + res.add(memory_range_set(dma_op.dest), AccessDirection.Write) + return res + + +def get_op_memory_accesses(npu_op: NpuBlockOperation, arch: ArchitectureFeatures) -> MemoryAccessSet: + """Returns the addresses that are read and written by the given operation""" + assert npu_op.ifm is not None and npu_op.ofm is not None + # Read addresses + read_ranges = get_address_ranges(npu_op.ifm) + if has_ifm2(npu_op): + assert npu_op.ifm2 is not None + read_ranges.extend(get_address_ranges(npu_op.ifm2)) + read_ranges.extend(npu_op.weights) + read_ranges.extend(npu_op.biases) + if npu_op.activation is not None and npu_op.activation.op_type == NpuActivationOp.TABLE_LOOKUP: + address = arch.available_shram_banks(True) * arch.shram_bank_size + read_ranges.append(NpuAddressRange(region=BasePointerIndex.Mem2Mem, address=address, length=2048)) + # Written addresses + write_ranges = get_address_ranges(npu_op.ofm) + # Add write access to SHRAM, needed when LUTs can overwrite accumulator banks + uses_lut = npu_op.activation is not None and npu_op.activation.op_type == NpuActivationOp.TABLE_LOOKUP + written_shram_size = arch.available_shram_banks(uses_lut) * arch.shram_bank_size + write_ranges.append(NpuAddressRange(region=BasePointerIndex.Mem2Mem, address=0, length=written_shram_size)) + + res = MemoryAccessSet() + for read_range in read_ranges: + if read_range is not None: + res.add(memory_range_set(read_range), AccessDirection.Read) + for write_range in write_ranges: + if write_range is not None: + res.add(memory_range_set(write_range), AccessDirection.Write) + return res + + +def get_wait_dependency( + arch: ArchitectureFeatures, npu_op_list: List[NpuOperation], memory_accesses, op_index: int, watermark: Watermark +): + """Used to calculate whether DMA wait or kernel wait operations are needed""" + npu_op = npu_op_list[op_index] + op_access = memory_accesses[npu_op] + index = op_index - 1 # NPU dependency tracking npu_outstanding = -1 @@ -211,33 +881,32 @@ def get_cmd_wait_dependency(arch, cmd_stream, memory_accesses, cmd_index, waterm # the command that issues the wait. # NPU->NPU dependency is handled via blockdep. 
while (index >= npu_index) or (index >= dma_index): - prev_cmd = cmd_stream[index] - prev_access = memory_accesses[prev_cmd] - - # Check DMA consuming NPU output - if prev_cmd.cmdtype == CommandType.NpuStripe: - if index >= npu_index: - if (cmd.cmdtype == CommandType.DMA) and (npu_outstanding == -1) and prev_access.conflicts(cmd_access): - npu_outstanding = npu_ops - npu_ops = npu_ops + 1 # Count NPU ops in the pipeline - if npu_ops >= arch.max_outstanding_kernels: - npu_index = max(index + 1, npu_index) + prev_op = npu_op_list[index] + prev_access = memory_accesses[prev_op] # Check NPU consuming DMA output - elif prev_cmd.cmdtype == CommandType.DMA: + if is_dma_op(prev_op): if index >= dma_index: - if cmd.cmdtype == CommandType.NpuStripe: - if (dma_outstanding == -1) and prev_access.conflicts(cmd_access): + if not is_dma_op(npu_op): + if (dma_outstanding == -1) and prev_access.conflicts(op_access): dma_outstanding = dma_ops - dma_ops = dma_ops + 1 # Count DMA ops in the pipeline + dma_ops += 1 # Count DMA ops in the pipeline if dma_ops >= arch.max_outstanding_dma: dma_index = max(index + 1, dma_index) + # Check DMA consuming NPU output + else: + if index >= npu_index: + if is_dma_op(npu_op) and npu_outstanding == -1 and prev_access.conflicts(op_access): + npu_outstanding = npu_ops + npu_ops += 1 # Count NPU ops in the pipeline + if npu_ops >= arch.max_outstanding_kernels: + npu_index = max(index + 1, npu_index) - index = index - 1 + index -= 1 # Update DMA watermark if we didn't see any and the NPU pipeline is full if (dma_ops == 0) and (npu_ops >= arch.max_outstanding_kernels): - dma_index = cmd_index + dma_index = op_index # Bring the search watermark forwards as we complete for those dependencies watermark = Watermark(npu_index, dma_index) @@ -246,873 +915,380 @@ def get_cmd_wait_dependency(arch, cmd_stream, memory_accesses, cmd_index, waterm return watermark, outstanding -def has_prev_op_dependency(prev_cmd, cmd): - if prev_cmd is None: - return False - if (prev_cmd.cmdtype == cmd.cmdtype == CommandType.NpuStripe) and (prev_cmd.ps != cmd.ps): - if prev_cmd.ofm_tensor.equivalent(cmd.ifm_tensor): - return True - elif cmd.ifm2_tensor is not None: - return prev_cmd.ofm_tensor.equivalent(cmd.ifm2_tensor) - return False - +def generate_cmd_waits(emit: CommandStreamEmitter, cmd_waits: Watermark): + if cmd_waits.npu >= 0: + emit.cmd_wait(cmd0.NPU_OP_KERNEL_WAIT, 0, cmd_waits.npu) -def get_op_ofm_rect(cmd): - start = full_shape(4, cmd.ofm_box.start_coord, 0) - end = full_shape(4, cmd.ofm_box.end_coord, 1) - return Rect(start[-2], start[-3], start[-1], end[-2] - 1, end[-3] - 1, end[-1] - 1) + if cmd_waits.dma >= 0: + emit.cmd_wait(cmd0.NPU_OP_DMA_WAIT, 0, cmd_waits.dma) -def get_op_ifm_rect(cmd): - start = full_shape(4, cmd.ifm_box.start_coord, 0) - end = full_shape(4, cmd.ifm_box.end_coord, 1) - return Rect(start[-2], start[-3], start[-1], end[-2] - 1, end[-3] - 1, end[-1] - 1) +# ------------------------------------------------------------------- +# BLOCKDEP +# ------------------------------------------------------------------- -def get_op_ifmofm_block_depth(arch, cmd): - # Note: NOT equivalent to the normal ifm block depth calculation since - # it takes into account 'depthless' block operations by returning full - # depth - if cmd.ps.npu_block_type in ( - NpuBlockType.ConvolutionDepthWise, - NpuBlockType.Pooling, - NpuBlockType.ElementWise, - NpuBlockType.ReduceSum, - ): - return cmd.ofm_box.get_size_shape()[-1] +def is_dependent_on_prev_op(prev_op: NpuBlockOperation, npu_op: 
NpuBlockOperation) -> bool: + """Checks if npu_op's input is dependent on prev_op's output""" + assert npu_op.ifm is not None + assert prev_op.ofm is not None + curr_input_ranges = get_address_ranges(npu_op.ifm) - return arch.calc_ifm_block_depth(cmd.ifm_box.get_size_shape()[-1], cmd.ifm_tensor.dtype.bits) - - -def get_op_padding_lt(cmd): - if cmd.ps.npu_block_type not in ( - NpuBlockType.ConvolutionDepthWise, - NpuBlockType.Pooling, - NpuBlockType.ConvolutionMxN, - NpuBlockType.ReduceSum, - ): - return (0, 0) - - explicit_padding = list(cmd.ps.primary_op.attrs["explicit_padding"]) # (top, left, bottom, right) - - # Check if this is for horizontal ifm streaming - if not (cmd.is_first_h_stripe and cmd.is_last_h_stripe): - explicit_padding[0] = cmd.pad_top - explicit_padding[2] = cmd.pad_bottom - - return (explicit_padding[1], explicit_padding[0]) - - -def ifm_ifm2_correct_order(ifm_shape, ifm2_shape): - if ifm_shape == []: - # Scalar needs to be in IFM2 - return False - elif ifm2_shape == []: - return True - - for ifm, ifm2 in zip(ifm_shape, ifm2_shape): - if ifm != ifm2 and ifm == 1: - # Broadcasted FM needs to be in IFM2 - return False + if has_ifm2(npu_op): + assert npu_op.ifm2 is not None + curr_input_ranges.extend(get_address_ranges(npu_op.ifm2)) + for prev_range in get_address_ranges(prev_op.ofm): + if prev_range is None: + continue + for curr_range in curr_input_ranges: + if curr_range is not None and ranges_overlap(prev_range, curr_range): + return True + return False - return True +def shape3d_to_rect(shape: NpuShape3D) -> Rect: + return Rect(0, 0, 0, shape.width - 1, shape.height - 1, shape.depth - 1) -def generate_register_command_stream(nng, sg, arch, verbose=False): - emit = CommandStreamEmitter() - if arch.feature_map_storage_mem_area == arch.fast_storage_mem_area: - base_ptr_idx_map = { - MemType.Permanent_NPU: BasePointerIndex.WeightTensor, - MemType.Permanent_CPU: BasePointerIndex.WeightTensor, - MemType.Scratch: BasePointerIndex.ScratchTensor, - MemType.Scratch_fast: BasePointerIndex.ScratchTensor, - } +def get_ifm_ofm_block_depth(arch: ArchitectureFeatures, npu_op: NpuBlockOperation) -> int: + # Note: NOT equivalent to the normal ifm block depth calculation since + # it takes into account 'depthless' block operations by returning full + # depth + if npu_op.op_type == NpuOperationType.Conv2D: + res = arch.calc_ifm_block_depth(npu_op.ifm.shape.depth, npu_op.ifm.data_type.size_in_bits()) + return res + return npu_op.ofm.shape.depth + + +def calc_blockdep( + arch: ArchitectureFeatures, + prev_op: Optional[NpuBlockOperation], + prev_block_config: Optional[NpuShape3D], + npu_op: NpuBlockOperation, + block_config: NpuShape3D, +) -> int: + """Calculates the value of the BLOCKDEP register""" + if prev_op is None: + return 0 + if not is_dependent_on_prev_op(prev_op, npu_op): + return ArchitectureFeatures.MAX_BLOCKDEP + if prev_op.ofm.shape != npu_op.ifm.shape: + return 0 + prev_ifm_block_depth = get_ifm_ofm_block_depth(arch, prev_op) + prev_ofm_block = Block(prev_block_config.width, prev_block_config.height, prev_block_config.depth) + prev_ofm_rect = shape3d_to_rect(prev_op.ofm.shape) + prev_ifm_rect = shape3d_to_rect(prev_op.ifm.shape) + cur_ifm_block_depth = get_ifm_ofm_block_depth(arch, npu_op) + cur_ofm_block = Block(block_config.width, block_config.height, block_config.depth) + cur_ofm_rect = shape3d_to_rect(npu_op.ofm.shape) + cur_ifm_rect = shape3d_to_rect(npu_op.ifm.shape) + cur_padLT = (0, 0) if npu_op.padding is None else (npu_op.padding.left, npu_op.padding.top) + 
blockdep = arch.calc_block_dep( + prev_ifm_rect, + prev_ofm_rect, + prev_ifm_block_depth, + prev_ofm_block, + to_kernel(prev_op.kernel), + cur_ifm_rect, + cur_ofm_rect, + cur_ifm_block_depth, + cur_ofm_block, + to_kernel(npu_op.kernel), + cur_padLT, + ) + return blockdep + + +# ------------------------------------------------------------------- +# PRINT +# ------------------------------------------------------------------- + + +def print_feature_map(fm: NpuFeatureMap, name: str): + if fm is not None: + q = ( + "no quantization" + if fm.quantization is None + else f"scale: {fm.quantization.scale_f32}, zero: {fm.quantization.zero_point}" + ) + h, w, c = fm.shape + sz = h * w * c * fm.data_type.size_in_bytes() + print(f" {name}: h={h},w={w},c={c}, region={fm.region}, {fm.layout}, {fm.data_type}, size={sz}, {q}") + strides = get_strides(fm) + stride_str = f"Stride y/x/c: {strides.height}/{strides.width}/{strides.depth}" + t = fm.tiles + addresses = [hex(addr) for addr in t.addresses] + print(f" {stride_str}, tiles: w0={t.width_0}, h0={t.height_0}, h1={t.height_1}, base={addresses}") + + +def print_operation(npu_op: NpuOperation, index: int = 0): + pass_info = f", {npu_op.cmd}" if hasattr(npu_op, "cmd") else "" + if is_dma_op(npu_op): + print(f"{index} DMA_START src={npu_op.src}, dest={npu_op.dest}{pass_info}") + return + k = None if npu_op.kernel is None else to_kernel(npu_op.kernel) + if npu_op.op_type in (NpuOperationType.Pooling, NpuOperationType.ElementWise): + print(f"{index} {npu_op.sub_op_type.name} {npu_op.op_type.name}:{pass_info}") else: - base_ptr_idx_map = { - MemType.Permanent_NPU: BasePointerIndex.WeightTensor, - MemType.Permanent_CPU: BasePointerIndex.WeightTensor, - MemType.Scratch: BasePointerIndex.ScratchTensor, - MemType.Scratch_fast: BasePointerIndex.ScratchFastTensor, - } - - # Maps an AccumulatorType enum to the corresponding acc_format value - acc_format_map = { - SHRAMElements.Acc16: acc_format.FP_S5_10.value, - SHRAMElements.Acc32: acc_format.INT_32BIT.value, - SHRAMElements.Acc40: acc_format.INT_40BIT.value, - } - - # Maps an elementwise op type to an elementwise_mode enum value used by NPU_OP_ELEMENTWISE - elementwise_mode_map = { - Op.Mul: elementwise_mode.MUL.value, - Op.Add: elementwise_mode.ADD.value, - Op.Sub: elementwise_mode.SUB.value, - Op.Minimum: elementwise_mode.MIN.value, - Op.Maximum: elementwise_mode.MAX.value, - Op.LeakyRelu: elementwise_mode.LRELU.value, - Op.Abs: elementwise_mode.ABS.value, - Op.CLZ: elementwise_mode.CLZ.value, - Op.SHR: elementwise_mode.SHR.value, - Op.SHL: elementwise_mode.SHL.value, - } - - cmd_stream = [] - memory_accesses = {} - for cmd in sg.high_level_command_stream: - if cmd.cmdtype == CommandType.NpuStripe and cmd.ps.npu_block_type == NpuBlockType.Default: - print("Warning: Skipping register command stream generation for", cmd.ps) + if ( + npu_op.op_type == NpuOperationType.Conv2D + and k.elements_wh() * k.stride.x * k.stride.y * k.dilation.x * k.dilation.y == 1 + ): + fc = "FullyConnected " else: - cmd_stream.append(cmd) - memory_accesses[cmd] = cmd.get_memory_accesses() - - def emit_cmd_waits(cmd_waits): - if cmd_waits.npu >= 0: - emit.cmd_wait(cmd0.NPU_OP_KERNEL_WAIT, 0, cmd_waits.npu) - - if cmd_waits.dma >= 0: - emit.cmd_wait(cmd0.NPU_OP_DMA_WAIT, 0, cmd_waits.dma) + fc = "" + print(f"{index} {fc}{npu_op.op_type.name}{pass_info}") + print_feature_map(npu_op.ifm, "IFM") + if npu_op.ifm2_scalar is not None: + quant_val = quantise(npu_op.ifm2_scalar, npu_op.ifm2.quantization) + print(f" IFM2: Scalar={npu_op.ifm2_scalar} 
(quantized: {quant_val}), {npu_op.ifm2.quantization}") + else: + print_feature_map(npu_op.ifm2, "IFM2") + print_feature_map(npu_op.ofm, "OFM") + if k is not None and npu_op.op_type != NpuOperationType.ElementWise: + print(f" Kernel: {k}") + if npu_op.padding is not None: + print(f" {npu_op.padding}") + for weights in npu_op.weights: + print(f" Weights: {weights}") + for bias in npu_op.biases: + print(f" Scales: {bias}") + if npu_op.activation is not None: + act = npu_op.activation + if act.op_type != NpuActivationOp.NONE_OR_RELU or act.min is not None or act.max is not None: + lut = f", lut index={act.lookup_table_index}" if act.op_type == NpuActivationOp.TABLE_LOOKUP else "" + print(f" Activation: {act.op_type.name}, min={act.min}, max={act.max}{lut}") + if npu_op.op_type == NpuOperationType.Conv2D: + print(f" {npu_op.block_traversal}") + bh, bw, bc = npu_op.block_config + rescale = f", rescale={npu_op.rescale}" if hasattr(npu_op, "rescale") else "" + print(f" Block config: h={bh},w={bw},c={bc}, {npu_op.ifm_upscale}, {npu_op.rounding_mode}{rescale}") + + +def print_operations(npu_op_list: List[NpuOperation]): + for index, npu_op in enumerate(npu_op_list): + print_operation(npu_op, index) + + +# ------------------------------------------------------------------- +# OPERATIONS +# ------------------------------------------------------------------- + + +def generate_operation_code(emit: CommandStreamEmitter, npu_op: NpuOperation): + """Generates NPU_OP_* command""" + op_type = npu_op.op_type + if op_type == NpuOperationType.Dma: + emit.cmd_do_operation(cmd0.NPU_OP_DMA_START, npu_op.channel * 16 + npu_op.mode) + elif op_type == NpuOperationType.Conv2D: + emit.cmd_do_operation(cmd0.NPU_OP_CONV) + elif op_type == NpuOperationType.ConvDepthWise: + emit.cmd_do_operation(cmd0.NPU_OP_DEPTHWISE) + elif op_type == NpuOperationType.Pooling: + emit.cmd_do_operation(cmd0.NPU_OP_POOL, param=pooling_op_map[npu_op.sub_op_type]) + elif op_type == NpuOperationType.ElementWise: + emit.cmd_do_operation(cmd0.NPU_OP_ELEMENTWISE, param=elementwise_op_map[npu_op.sub_op_type]) + else: + assert 0, "Unsupported operation" + + +def generate_conv2d_op( + emit: CommandStreamEmitter, npu_op: NpuConv2DOperation, arch: ArchitectureFeatures +) -> NpuShape3D: + """Generates register commands for Conv2D operations""" + generate_common(emit, npu_op, npu_op.block_traversal, arch) + ifm_resampling_mode = resampling_mode_map[npu_op.ifm_upscale] + shared_buffer = shared_buffer_allocation_for_npu_op(arch, npu_op, NpuBlockType.ConvolutionMxN, ifm_resampling_mode) + block_config = generate_block_config(emit, npu_op, arch, shared_buffer) + generate_shram_registers_non_elementwise(emit, shared_buffer) + return block_config + + +def generate_conv_depthwise_op(emit: CommandStreamEmitter, npu_op: NpuPoolingOperation, arch: ArchitectureFeatures): + """Generates register commands for depthwise convolution operations""" + generate_common(emit, npu_op, NpuBlockTraversal.DEPTH_FIRST, arch) + ifm_resampling_mode = resampling_mode_map[npu_op.ifm_upscale] + shared_buffer = shared_buffer_allocation_for_npu_op( + arch, npu_op, NpuBlockType.ConvolutionDepthWise, ifm_resampling_mode + ) + block_config = generate_block_config(emit, npu_op, arch, shared_buffer) + generate_shram_registers_non_elementwise(emit, shared_buffer) + return block_config + + +def generate_pooling_op(emit: CommandStreamEmitter, npu_op: NpuPoolingOperation, arch: ArchitectureFeatures): + """Generates register commands for pooling operations""" + use_global_scale = ( + 
npu_op.sub_op_type in (NpuPoolingOp.AVERAGE, NpuPoolingOp.REDUCE_SUM) and sum(npu_op.padding) == 0 + ) + generate_common(emit, npu_op, NpuBlockTraversal.DEPTH_FIRST, arch, use_global_scale=use_global_scale) + # Pooling op specific + if use_global_scale: + generate_ofm_scaling_for_pooling(emit, npu_op) + ifm_resampling_mode = resampling_mode_map[npu_op.ifm_upscale] + npu_block_type = NpuBlockType.ReduceSum if npu_op.sub_op_type == NpuPoolingOp.REDUCE_SUM else NpuBlockType.Pooling + shared_buffer = shared_buffer_allocation_for_npu_op(arch, npu_op, npu_block_type, ifm_resampling_mode) + block_config = generate_block_config(emit, npu_op, arch, shared_buffer) + generate_shram_registers_non_elementwise(emit, shared_buffer) + return block_config + + +def generate_elementwise_op(emit: CommandStreamEmitter, npu_op: NpuElementWiseOperation, arch: ArchitectureFeatures): + """Generates register commands for elementwise operations""" + use_global_scale = npu_op.sub_op_type in ( + NpuElementWiseOp.ADD, + NpuElementWiseOp.SUB, + NpuElementWiseOp.MUL, + NpuElementWiseOp.LRELU, + NpuElementWiseOp.ABS, + ) + op_to_scale = generate_scaling_for_elementwise(emit, npu_op) + generate_common( + emit, npu_op, NpuBlockTraversal.DEPTH_FIRST, arch, use_global_scale=use_global_scale, op_to_scale=op_to_scale + ) + # Elementwise op specific + if npu_op.sub_op_type not in unary_elementwise_ops: + # Binary operation; generate IFM2 registers + assert npu_op.ifm2 is not None + has_scalar = npu_op.ifm2_scalar is not None + generate_ifm2(emit, npu_op.ifm2, has_scalar) + generate_ifm_precision(emit, npu_op.ifm2, 0, cmd0.NPU_SET_IFM2_PRECISION) + generate_ifm2_broadcast(emit, npu_op) + if has_scalar: + quantized_scalar = quantise(npu_op.ifm2_scalar, npu_op.ifm2.quantization) + assert npu_op.ifm2.data_type.min_value() <= quantized_scalar <= npu_op.ifm2.data_type.max_value() + emit.cmd0_with_param(cmd0.NPU_SET_IFM2_SCALAR, quantized_scalar) + ifm_resampling_mode = resampling_mode_map[npu_op.ifm_upscale] + shared_buffer = shared_buffer_allocation_for_npu_op(arch, npu_op, NpuBlockType.ElementWise, ifm_resampling_mode) + block_config = generate_block_config(emit, npu_op, arch, shared_buffer) + generate_shram_registers_elementwise(emit, npu_op, arch, shared_buffer) + return block_config + + +def generate_dma_op(emit: CommandStreamEmitter, dma_op: NpuDmaOperation): + """Generates register commands for DMA operations""" + emit.cmd0_with_param(cmd0.NPU_SET_DMA0_SRC_REGION, dma_op.src.region) + emit.cmd1_with_offset(cmd1.NPU_SET_DMA0_SRC, dma_op.src.address) + emit.cmd0_with_param(cmd0.NPU_SET_DMA0_DST_REGION, dma_op.dest.region) + + emit.cmd1_with_offset(cmd1.NPU_SET_DMA0_DST, dma_op.dest.address) + emit.cmd1_with_offset(cmd1.NPU_SET_DMA0_LEN, dma_op.src.length) + + +def generate_registers_for_op( + emit: CommandStreamEmitter, npu_op: NpuOperation, arch: ArchitectureFeatures +) -> Optional[NpuShape3D]: + """ + Generates register commands for the given operation, but not the final NPU_OP_... command. 
+ Returns the selected block config + """ + op_type = npu_op.op_type + block_config = None + if op_type == NpuOperationType.Conv2D: + block_config = generate_conv2d_op(emit, npu_op, arch) + elif op_type == NpuOperationType.ConvDepthWise: + block_config = generate_conv_depthwise_op(emit, npu_op, arch) + elif op_type == NpuOperationType.Pooling: + block_config = generate_pooling_op(emit, npu_op, arch) + elif op_type == NpuOperationType.ElementWise: + block_config = generate_elementwise_op(emit, npu_op, arch) + elif op_type == NpuOperationType.Dma: + generate_dma_op(emit, npu_op) + else: + assert 0, "Unsupported operation" + return block_config - # Initialise operator dependency state - prev_ifm_rect = cur_ifm_rect = None - prev_ifm_block_depth = cur_ifm_block_depth = None - prev_ofm_rect = cur_ofm_rect = None - prev_ofm_block = cur_ofm_block = None - prev_kernel = cur_kernel = None - prev_cmd = None +def generate_command_stream( + emit: CommandStreamEmitter, npu_op_list: List[NpuOperation], arch: ArchitectureFeatures, add_to_debug_db=None +): + """Generates register commands for the given list of NPU operations""" + # Calculate memory accesses for every operation + memory_accesses = {} + for npu_op in npu_op_list: + if is_dma_op(npu_op): + memory_accesses[npu_op] = get_dma_memory_accesses(npu_op) + else: + memory_accesses[npu_op] = get_op_memory_accesses(npu_op, arch) if arch.is_yoda_system: emit.cmd0_with_param(cmd0.NPU_SET_PARALLEL_MODE, arch.ncores - 1) - dep_watermark = Watermark(0, 0) - - stream_id = DebugDatabase.add_stream(sg) - DebugDatabase.set_stream_offset(sg, 0) # Default to zero, can only set during file writing - - for cmd_index, cmd in enumerate(cmd_stream): - dep_watermark, cmd_waits = get_cmd_wait_dependency(arch, cmd_stream, memory_accesses, cmd_index, dep_watermark) - - if cmd.cmdtype == CommandType.DMA: - start_coord = cmd.box.start_coord - - src_addr = cmd.in_tensor.address_for_coordinate(start_coord) - dst_addr = cmd.out_tensor.address_for_coordinate(start_coord) - - if cmd.in_tensor.compressed_values is not None: - if cmd.out_tensor.purpose == TensorPurpose.FSBias: - sz = cmd.in_tensor.storage_size() - else: - stream_index = cmd.in_tensor.compressed_stream_index_from_coord(start_coord) - sz = cmd.in_tensor.size_of_compressed_stream(stream_index) - else: - sz = cmd.in_tensor.address_for_coordinate(cmd.box.end_coord, is_top_box=True) - src_addr - - emit.cmd0_with_param(cmd0.NPU_SET_DMA0_SRC_REGION, base_ptr_idx_map[cmd.in_tensor.mem_type]) - emit.cmd1_with_offset(cmd1.NPU_SET_DMA0_SRC, src_addr) - if cmd.out_tensor.purpose == TensorPurpose.LUT: - emit.cmd0_with_param(cmd0.NPU_SET_DMA0_DST_REGION, BasePointerIndex.Mem2Mem) - else: - emit.cmd0_with_param(cmd0.NPU_SET_DMA0_DST_REGION, base_ptr_idx_map[cmd.out_tensor.mem_type]) - - emit.cmd1_with_offset(cmd1.NPU_SET_DMA0_DST, dst_addr) - emit.cmd1_with_offset(cmd1.NPU_SET_DMA0_LEN, sz) - dma_channel = 0 - mode = 0 # From external to external - - emit_cmd_waits(cmd_waits) - emit.cmd_do_operation(cmd0.NPU_OP_DMA_START, dma_channel * 16 + mode) - - elif cmd.cmdtype == CommandType.NpuStripe: - - ps = cmd.ps - primary_op = ps.primary_op - npu_block_type = ps.npu_block_type - # Specifies if global scale from the NPU_SET_OFM_SCALE register should be used instead of per-channel scale - use_global_scale = False - # Specifies type of rounding to be used. 
- rounding_mode = ( - rounding.NATURAL if primary_op.attrs.get("rounding_mode", "") == b"NATURAL" else rounding.TFL - ) - if primary_op.type == Op.ResizeBilinear: - rounding_mode = rounding.TRUNCATE - fmf = primary_op.memory_function - faf = primary_op.activation - fused_quantize = any(op.type == Op.Quantize for op in ps.ops) - # Force output scale, used in operations with fused LUT - # Note: with current LUT support, forced_ofm_quantization is always equal to cmd.ofm_tensor.quantization - # except when primary_op is AddAct + 0 (no-op) + LUT - forced_ofm_quantization = primary_op.forced_output_quantization - ofm_quant = cmd.ofm_tensor.quantization - if forced_ofm_quantization is not None: - ofm_quant = forced_ofm_quantization - - # Specifies which operand to apply scaling to in bitexact elementwise ADD/SUB - op_to_scale = 0 - - # Update state history - prev_ifm_rect = cur_ifm_rect - prev_ifm_block_depth = cur_ifm_block_depth - prev_ofm_rect = cur_ofm_rect - prev_ofm_block = cur_ofm_block - prev_kernel = cur_kernel - cur_kernel = ps.primary_op.kernel if ps.primary_op else None - - block_config = ps.block_config - emit.cmd0_with_param(cmd0.NPU_SET_OFM_BLK_HEIGHT_M1, block_config[0] - 1) - emit.cmd0_with_param(cmd0.NPU_SET_OFM_BLK_WIDTH_M1, block_config[1] - 1) - emit.cmd0_with_param(cmd0.NPU_SET_OFM_BLK_DEPTH_M1, block_config[3] - 1) - - shared_buffer = ps.shared_buffer - - if npu_block_type == NpuBlockType.ElementWise: - ifm2_broadcast = 0 - - if cmd.ifm2_tensor and not ifm_ifm2_correct_order(cmd.ifm_tensor.shape, cmd.ifm2_tensor.shape): - # The scalar has to be the ifm2 tensor so switch the ifms - cmd.ifm_tensor, cmd.ifm2_tensor = cmd.ifm2_tensor, cmd.ifm_tensor - cmd.ifm_box, cmd.ifm2_box = cmd.ifm2_box, cmd.ifm_box - - # Set ReverseOperandOrder bit to IFM2_BROADCAST - ifm2_broadcast |= IFM2Broadcast.ReverseOperandOrder - - # Calculate scales needed for arithmetic elementwise operators - if primary_op.type in set((Op.Add, Op.Mul, Op.Sub,)): - input_scale = cmd.ifm_tensor.quantization.scale_f32 if cmd.ifm_tensor.quantization else None - input2_scale = cmd.ifm2_tensor.quantization.scale_f32 if cmd.ifm2_tensor.quantization else None - output_scale = ofm_quant.scale_f32 if ofm_quant else None - use_global_scale = True - - if output_scale is not None and faf in (Op.Sigmoid, Op.Tanh): - output_scale = 1 / 0x3000 - - if primary_op.type == Op.Mul: - if None in (input_scale, input2_scale, output_scale): - ofm_scale = 1 - shift = 0 - else: - ofm_scale, shift = scaling.elementwise_mul_scale(input_scale, input2_scale, output_scale) - emit.cmd1_with_offset(cmd1.NPU_SET_OFM_SCALE, ofm_scale, shift) - else: # AddAct/SubAct - # Force output scale same as the input scale for - # resizebilinear 1x1 that is converted to add - if "resizebilinear" in primary_op.attrs: - output_scale = input2_scale - - if None in (input_scale, input2_scale, output_scale): - opa_scale = opb_scale = ofm_scale = 1 - opa_shift = shift = 0 - ofm_scale, shift = primary_op.attrs.get("rescale", [1, 0]) - elif input_scale == input2_scale: - opa_scale, opb_scale, ofm_scale, shift = scaling.simplified_elementwise_add_sub_scale( - input_scale, input2_scale, output_scale - ) - opa_shift = 0 # Unused for this case - else: - # Use advanced implementation only when input scales differ - bitdepth = cmd.ifm_tensor.dtype.bits - ( - opa_scale, - opa_shift, - ofm_scale, - shift, - op_to_scale, - ) = scaling.advanced_elementwise_add_sub_scale( - input_scale, input2_scale, output_scale, bitdepth - ) - opb_scale = 0 # Unused for this case - if 
ifm2_broadcast & IFM2Broadcast.ReverseOperandOrder: - # If the operand order is reversed we also have to swap which operand is scaled - if op_to_scale == scaling.OperandToScale.OPa: - op_to_scale = scaling.OperandToScale.OPb - else: - op_to_scale = scaling.OperandToScale.OPa - - emit.cmd1_with_offset(cmd1.NPU_SET_OPA_SCALE, opa_scale, opa_shift) - emit.cmd1_with_offset(cmd1.NPU_SET_OPB_SCALE, opb_scale) - emit.cmd1_with_offset(cmd1.NPU_SET_OFM_SCALE, ofm_scale, shift) - - elif primary_op.type in set((Op.LeakyRelu, Op.Abs,)): - output_scale = ofm_quant.scale_f32 - use_global_scale = True - - if primary_op.type == Op.LeakyRelu: - output_scale = primary_op.attrs["alpha"] - - ofm_scale, shift = scaling.quantise_scale(output_scale) - emit.cmd1_with_offset(cmd1.NPU_SET_OFM_SCALE, ofm_scale, shift) - else: - emit.cmd1_with_offset(cmd1.NPU_SET_OFM_SCALE, 1, 0) - - # For elementwise set the required SHRAM to be equal to the total size of available SHRAM - uses_lut = primary_op.activation_lut is not None - shram_required = arch.available_shram_banks(uses_lut) - emit.cmd0_with_param(cmd0.NPU_SET_IFM_IB_END, shram_required) - - # Acc buffers not needed so set AB_START to size of SHRAM - emit.cmd0_with_param(cmd0.NPU_SET_AB_START, shram_required) - - # Is not a unary operator - if cmd.ifm2_tensor is not None: - if cmd.ifm2_tensor.shape == []: - # IFM2 is a constant, set UseIFM2Scalar bit to IFM2_BROADCAST - ifm2_broadcast |= IFM2Broadcast.UseIFM2Scalar - else: - ifm_box_shape = cmd.ifm_box.get_size_shape() - ifm2_box_shape = cmd.ifm2_box.get_size_shape() - - if len(cmd.ifm_tensor.shape) > 1 and ifm_box_shape[1] != ifm2_box_shape[1]: - # Broadcast in 'H' dimension - assert cmd.ifm2_tensor.shape[1] == 1 - ifm2_broadcast |= IFM2Broadcast.BroadcastHdim - - if len(cmd.ifm_tensor.shape) > 2 and ifm_box_shape[2] != ifm2_box_shape[2]: - # Broadcast in 'W' dimension - assert cmd.ifm2_tensor.shape[2] == 1 - ifm2_broadcast |= IFM2Broadcast.BroadcastWdim - - if len(cmd.ifm_tensor.shape) > 3 and ifm_box_shape[3] != ifm2_box_shape[3]: - # Broadcast in 'C' dimension - assert cmd.ifm2_tensor.shape[3] == 1 - ifm2_broadcast |= IFM2Broadcast.BroadcastCdim - - # Set IFM2_IB_START to the latter half of the IB space - ifm_ib_start = shared_buffer.bank_locations[SharedBufferArea.IFM] - emit.cmd0_with_param( - cmd0.NPU_SET_IFM2_IB_START, - (shram_required - ifm_ib_start) // shared_buffer.ifm_count + ifm_ib_start, - ) - - emit.cmd0_with_param(cmd0.NPU_SET_IFM2_BROADCAST, ifm2_broadcast) - - else: - emit.cmd0_with_param( - cmd0.NPU_SET_IFM_IB_END, - shared_buffer.bank_locations[SharedBufferArea.IFM] - + shared_buffer.banks_required[SharedBufferArea.IFM], - ) - emit.cmd0_with_param(cmd0.NPU_SET_AB_START, shared_buffer.bank_locations[SharedBufferArea.Accumulators]) - - emit.cmd0_with_param(cmd0.NPU_SET_ACC_FORMAT, acc_format_map[shared_buffer.use_accumulator_element]) - - if primary_op.type == Op.ResizeBilinear: - # perform nearest neighbor upscale - emit.cmd0_with_param(cmd0.NPU_SET_IFM_UPSCALE, resampling_mode.NEAREST) - elif primary_op.type == Op.Conv2DBackpropInputSwitchedBias: - # perform insert zero upscale - emit.cmd0_with_param(cmd0.NPU_SET_IFM_UPSCALE, resampling_mode.TRANSPOSE) - else: - emit.cmd0_with_param(cmd0.NPU_SET_IFM_UPSCALE, resampling_mode.NONE) - - if npu_block_type in set( - ( - NpuBlockType.ConvolutionMxN, - NpuBlockType.ConvolutionDepthWise, - NpuBlockType.Pooling, - NpuBlockType.ReduceSum, - ) - ): - # Set up padding - explicit_padding = list(primary_op.attrs["explicit_padding"]) # (top, left, bottom, 
right)
-
- # Check if this is for horizontal ifm streaming
- if not (cmd.is_first_h_stripe and cmd.is_last_h_stripe):
- explicit_padding[0] = cmd.pad_top
- explicit_padding[2] = cmd.pad_bottom
-
- # Indexing from end since a 1x1 Avgpool might have been added with non 4-dimensional input/output,
- # because an activation function needed to be fused.
- if cmd.ifm_box.start_coord[-2] > 0:
- explicit_padding[1] = 0
- if cmd.ifm_box.end_coord[-2] < cmd.ifm_tensor.shape[-2]:
- explicit_padding[3] = 0
- emit.cmd0_with_param(cmd0.NPU_SET_IFM_PAD_TOP, explicit_padding[0])
- emit.cmd0_with_param(cmd0.NPU_SET_IFM_PAD_LEFT, explicit_padding[1])
- emit.cmd0_with_param(cmd0.NPU_SET_IFM_PAD_BOTTOM, explicit_padding[2])
- emit.cmd0_with_param(cmd0.NPU_SET_IFM_PAD_RIGHT, explicit_padding[3])
-
- # set kernel x stride low bit
- stride = primary_op.attrs["strides"][2] - 1 & 1
- # set kernel y stride low bit
- stride |= (primary_op.attrs["strides"][1] - 1 & 1) << 1
- # set kernel x stride extension bits
- stride |= (primary_op.attrs["strides"][2] - 1 >> 1) << 6
- # set kernel y stride extension bits
- stride |= (primary_op.attrs["strides"][1] - 1 >> 1) << 9
-
- if npu_block_type in set((NpuBlockType.Pooling, NpuBlockType.ReduceSum)):
- k_height, k_width = primary_op.attrs["ksize"][1:3]
- emit.cmd0_with_param(cmd0.NPU_SET_KERNEL_HEIGHT_M1, k_height - 1)
- emit.cmd0_with_param(cmd0.NPU_SET_KERNEL_WIDTH_M1, k_width - 1)
-
- valid_padding = sum(explicit_padding) == 0
-
- if primary_op.type in set((Op.AvgPool, Op.ResizeBilinear, Op.ReduceSum)) and valid_padding:
- # For valid padding vela has to output scaling values
- if faf == Op.Sigmoid or faf == Op.Tanh:
- rescale = 0x3000 * cmd.ifm_tensor.quantization.scale_f32
- if cmd.ifm_tensor.dtype == DataType.int16:
- # Calculate scale and shift for the output scale of 1/(3*4096)
- shift = 0
- max_rescale = np.iinfo(np.int16).max / 2
- while rescale <= max_rescale and shift <= 30:
- shift += 1
- rescale *= 2
- scale = int(rescale)
- else:
- rescale_bits = len(bin(round_up_to_int(rescale))) - 2 + 1
- scale, shift = scaling.quantise_pooling_scale(k_height * k_width, rescale_bits)
- scale = int(round_away_zero(scale * rescale))
- elif fused_quantize:
- # Quantize op requires different scaling
- ifm_scale_f64 = np.double(cmd.ifm_tensor.quantization.scale_f32)
- ofm_scale_f64 = np.double(ofm_quant.scale_f32)
- scale, shift = scaling.quantise_scale(ifm_scale_f64 / ofm_scale_f64)
- elif primary_op.type == Op.ResizeBilinear and "rescale" in primary_op.attrs:
- rescale = primary_op.attrs["rescale"]
- rescale_bits = len(bin(round_up_to_int(rescale))) - 2 + 1
- scale, shift = scaling.quantise_pooling_scale(k_height * k_width, rescale_bits)
- scale = int(round_away_zero(scale * rescale))
- else:
- # In case avg pool is fused with concat or another memory operation, rescaling might be needed.
- # k_height == k_width == 1 is always true in this case
- # Normally the scale is maximised, to get maximum precision, which means that
- # if rescale != 1, scale needs to consider the number of bits needed for rescaling
- if None not in (ofm_quant.scale_f32, cmd.ifm_tensor.quantization.scale_f32,):
- rescale = cmd.ifm_tensor.quantization.scale_f32 / ofm_quant.scale_f32
- rescale_bits = 0
- if k_height == k_width == 1:
- if fmf == Op.ConcatSliceWrite:
- rounding_mode = rounding.NATURAL
- if rescale > 1:
- rescale_bits = len(bin(round_up_to_int(rescale))) - 2 + 1
- elif rescale < 1:
- rescale_bits = -(len(bin(round_up_to_int(1 / rescale))) - 2 - 1)
- scale, shift = scaling.quantise_pooling_scale(k_height * k_width, rescale_bits)
- scale = int(round_away_zero(scale * rescale))
- else:
- scale = 1
- shift = 0
-
- emit.cmd1_with_offset(cmd1.NPU_SET_OFM_SCALE, scale, shift)
- # Valid-padded average pool should use the global scale from
- # NPU_SET_OFM_SCALE register, which is set above.
- use_global_scale = True
-
- else: # Convolution
- assert cmd.weight_tensor.block_traversal != TensorBlockTraversal.Default
- # Reduced precision quantization and natural rounding used for int16
- if cmd.ifm_tensor.dtype == DataType.int16:
- rounding_mode = rounding.NATURAL
- stride |= (cur_kernel.dilation.y - 1) << 4
- stride |= (cur_kernel.dilation.x - 1) << 3
- emit.cmd0_with_param(
- cmd0.NPU_SET_KERNEL_HEIGHT_M1, cur_kernel.dilation.y * (cmd.weight_tensor.shape[0] - 1)
- )
- emit.cmd0_with_param(
- cmd0.NPU_SET_KERNEL_WIDTH_M1, cur_kernel.dilation.x * (cmd.weight_tensor.shape[1] - 1)
- )
- if cmd.weight_tensor.block_traversal == TensorBlockTraversal.PartKernelFirst:
- # Part-kernel-first weight ordering
- assert npu_block_type == NpuBlockType.ConvolutionMxN
- stride |= 1 << 2
-
- emit.cmd0_with_param(cmd0.NPU_SET_KERNEL_STRIDE, stride)
-
- elif npu_block_type in set((NpuBlockType.VectorProduct,)):
- # Vector product is implemented using a 1x1 convolution so we need
- # to set up the appropriate padding and kernel info
- emit.cmd0_with_param(cmd0.NPU_SET_IFM_PAD_TOP, 0)
- emit.cmd0_with_param(cmd0.NPU_SET_IFM_PAD_LEFT, 0)
- emit.cmd0_with_param(cmd0.NPU_SET_IFM_PAD_BOTTOM, 0)
- emit.cmd0_with_param(cmd0.NPU_SET_IFM_PAD_RIGHT, 0)
-
- # kernel stride reg = 0 means stride(1,1) + depth first weight
- # order + dilation(0,0) + kernel_split_size=8
- emit.cmd0_with_param(cmd0.NPU_SET_KERNEL_STRIDE, 0)
-
- emit.cmd0_with_param(cmd0.NPU_SET_KERNEL_HEIGHT_M1, 0)
- emit.cmd0_with_param(cmd0.NPU_SET_KERNEL_WIDTH_M1, 0)
-
- if npu_block_type in set(
- (NpuBlockType.ConvolutionMxN, NpuBlockType.ConvolutionDepthWise, NpuBlockType.VectorProduct)
- ):
- # Emit Weight base address commands, only maps the area required for
- # this command's weights from the larger tensor.
- stream_index = cmd.weight_tensor.compressed_stream_index_from_coord(cmd.weight_box.start_coord) - weight_substream_offsets = cmd.weight_tensor.compressed_values_substream_offsets[stream_index] - substreams = len(weight_substream_offsets) - 1 # Offset list must terminate with full stream length - - # Extract weight substream offsets and calculate their lengths - assert len(weight_substream_offsets) > 1 and (weight_substream_offsets[0] == 0) - weight_addr = cmd.weight_tensor.address_for_coordinate(cmd.weight_box.start_coord) - - # Set weights sources for active and present cores - for core, param in enumerate( - [ - (cmd1.NPU_SET_WEIGHT_BASE, cmd1.NPU_SET_WEIGHT_LENGTH), - (cmd1.NPU_SET_WEIGHT1_BASE, cmd1.NPU_SET_WEIGHT1_LENGTH), - ] - ): - if core < substreams: - emit.cmd1_with_offset(param[0], weight_addr + weight_substream_offsets[core]) - emit.cmd1_with_offset( - param[1], weight_substream_offsets[core + 1] - weight_substream_offsets[core] - ) - elif core < arch.ncores: - emit.cmd1_with_offset(param[0], weight_addr) - emit.cmd1_with_offset(param[1], 0) - - weight_region = base_ptr_idx_map[cmd.weight_tensor.mem_type] - emit.cmd0_with_param(cmd0.NPU_SET_WEIGHT_REGION, weight_region) - - # Emit Scale & Bias base address commands, with length matching the amount required by - # the weight tensors. - if cmd.scale_tensor is not None: - scale_substream_offsets = cmd.scale_tensor.compressed_values_substream_offsets[stream_index] - substreams = len(scale_substream_offsets) - 1 # Offset list must terminate with full stream length - - # Extract scale substream offsets and calculate their lengths - assert len(scale_substream_offsets) > 1 and (scale_substream_offsets[0] == 0) - scale_addr = cmd.scale_tensor.address_for_coordinate(cmd.weight_box.start_coord[-1:]) - - # Set scale sources for active and present cores - for core, param in enumerate( - [ - (cmd1.NPU_SET_SCALE_BASE, cmd1.NPU_SET_SCALE_LENGTH), - (cmd1.NPU_SET_SCALE1_BASE, cmd1.NPU_SET_SCALE1_LENGTH), - ] - ): - if core < substreams: - emit.cmd1_with_offset(param[0], scale_addr + scale_substream_offsets[core]) - emit.cmd1_with_offset( - param[1], scale_substream_offsets[core + 1] - scale_substream_offsets[core] - ) - elif core < arch.ncores: - emit.cmd1_with_offset(param[0], scale_addr) - emit.cmd1_with_offset(param[1], 0) - - # Emit base address for NPU to access scale & bias data - scale_region = base_ptr_idx_map[cmd.scale_tensor.mem_type] - emit.cmd0_with_param(cmd0.NPU_SET_SCALE_REGION, scale_region) - - ofm_quant_qmin = ofm_quant.quant_min if ofm_quant else np.iinfo(np.int16).min - ofm_quant_qmax = ofm_quant.quant_max if ofm_quant else np.iinfo(np.int16).max - ifm_min = cmd.ifm_tensor.quantization.min if cmd.ifm_tensor.quantization else np.iinfo(np.int16).min - ifm_max = cmd.ifm_tensor.quantization.max if cmd.ifm_tensor.quantization else np.iinfo(np.int16).max - - # Emit commands for any fused activation function - if faf is None: - emit.cmd0_with_param(cmd0.NPU_SET_ACTIVATION, activation.NONE) - # Even if no activation function, values need to be set to override previous values - faf_min = ofm_quant_qmin - faf_max = ofm_quant_qmax - elif faf == Op.Relu: - emit.cmd0_with_param(cmd0.NPU_SET_ACTIVATION, activation.NONE) - faf_min = quantise_float32(0.0, ofm_quant.scale_f32, ofm_quant.zero_point) - faf_max = ofm_quant_qmax - elif faf == Op.Relu6: - emit.cmd0_with_param(cmd0.NPU_SET_ACTIVATION, activation.NONE) - faf_min = quantise_float32(0.0, ofm_quant.scale_f32, ofm_quant.zero_point) - faf_max = quantise_float32(6.0, 
ofm_quant.scale_f32, ofm_quant.zero_point) - elif faf == Op.ReluN1To1: - emit.cmd0_with_param(cmd0.NPU_SET_ACTIVATION, activation.NONE) - faf_min = quantise_float32(-1.0, ofm_quant.scale_f32, ofm_quant.zero_point) - faf_max = quantise_float32(1.0, ofm_quant.scale_f32, ofm_quant.zero_point) - elif faf == Op.Tanh: - emit.cmd0_with_param(cmd0.NPU_SET_ACTIVATION, activation.TANH) - if primary_op.type in set((Op.AvgPool, Op.ResizeBilinear)): - faf_min = quantise_float32(-1.0, ofm_quant.scale_f32, ofm_quant.zero_point) - faf_max = quantise_float32(1.0, ofm_quant.scale_f32, ofm_quant.zero_point) - else: - faf_min = quantise_float32(clamp_tanh(ifm_min), ofm_quant.scale_f32, ofm_quant.zero_point) - faf_max = quantise_float32(clamp_tanh(ifm_max), ofm_quant.scale_f32, ofm_quant.zero_point) - elif faf == Op.Sigmoid: - emit.cmd0_with_param(cmd0.NPU_SET_ACTIVATION, activation.SIGMOID) - if primary_op.type in set((Op.AvgPool, Op.ResizeBilinear)): - faf_min = quantise_float32(0, ofm_quant.scale_f32, ofm_quant.zero_point) - faf_max = quantise_float32(1.0, ofm_quant.scale_f32, ofm_quant.zero_point) - else: - faf_min = quantise_float32(clamp_sigmoid(ifm_min), ofm_quant.scale_f32, ofm_quant.zero_point) - faf_max = quantise_float32(clamp_sigmoid(ifm_max), ofm_quant.scale_f32, ofm_quant.zero_point) - elif faf == Op.LUT: - lut_index = int(activation.LUT_START.value) + primary_op.attrs.get("lut_index", -1) - assert activation.LUT_START.value <= lut_index <= activation.LUT_END.value, "LUT index out of range." - if cmd.ofm_tensor.dtype == DataType.int32: - lut_index |= 3 << 12 # Force I8 range - emit.cmd0_with_param(cmd0.NPU_SET_ACTIVATION, lut_index) - faf_min = ofm_quant_qmin - faf_max = ofm_quant_qmax - else: - raise Exception("Unsupported fused_activation_function = " + faf.name) - - # Activation range needs to be set based upon the quantisation range and the fused activation range - emit.cmd0_with_param(cmd0.NPU_SET_ACTIVATION_MIN, max(ofm_quant_qmin, faf_min)) - emit.cmd0_with_param(cmd0.NPU_SET_ACTIVATION_MAX, min(ofm_quant_qmax, faf_max)) - - out_shape = cmd.ofm_box.get_size_shape() - if len(out_shape) >= 4: - emit.cmd0_with_param(cmd0.NPU_SET_OFM_HEIGHT_M1, out_shape[-3] - 1) - else: - emit.cmd0_with_param(cmd0.NPU_SET_OFM_HEIGHT_M1, 0) - if len(out_shape) >= 2: - emit.cmd0_with_param(cmd0.NPU_SET_OFM_WIDTH_M1, out_shape[-2] - 1) - else: - emit.cmd0_with_param(cmd0.NPU_SET_OFM_WIDTH_M1, 0) - emit.cmd0_with_param(cmd0.NPU_SET_OFM_DEPTH_M1, out_shape[-1] - 1) - - if npu_block_type in set((NpuBlockType.ConvolutionMxN, NpuBlockType.VectorProduct, NpuBlockType.ReduceSum)): - in_shape = cmd.ifm_box.get_size_shape() - emit.cmd0_with_param(cmd0.NPU_SET_IFM_DEPTH_M1, in_shape[-1] - 1) - else: - emit.cmd0_with_param(cmd0.NPU_SET_IFM_DEPTH_M1, out_shape[-1] - 1) - - for tens, box, region_op, ptr_ops, stride_ops, zero_point_op in ( - ( - cmd.ifm_tensor, - cmd.ifm_box, - cmd0.NPU_SET_IFM_REGION, - (cmd1.NPU_SET_IFM_BASE0, cmd1.NPU_SET_IFM_BASE1, cmd1.NPU_SET_IFM_BASE2, cmd1.NPU_SET_IFM_BASE3), - (cmd1.NPU_SET_IFM_STRIDE_C, cmd1.NPU_SET_IFM_STRIDE_Y, cmd1.NPU_SET_IFM_STRIDE_X), - cmd0.NPU_SET_IFM_ZERO_POINT, - ), - ( - cmd.ifm2_tensor, - cmd.ifm2_box, - cmd0.NPU_SET_IFM2_REGION, - ( - cmd1.NPU_SET_IFM2_BASE0, - cmd1.NPU_SET_IFM2_BASE1, - cmd1.NPU_SET_IFM2_BASE2, - cmd1.NPU_SET_IFM2_BASE3, - ), - (cmd1.NPU_SET_IFM2_STRIDE_C, cmd1.NPU_SET_IFM2_STRIDE_Y, cmd1.NPU_SET_IFM2_STRIDE_X), - cmd0.NPU_SET_IFM2_ZERO_POINT, - ), - ( - cmd.ofm_tensor, - cmd.ofm_box, - cmd0.NPU_SET_OFM_REGION, - (cmd1.NPU_SET_OFM_BASE0, 
cmd1.NPU_SET_OFM_BASE1, cmd1.NPU_SET_OFM_BASE2, cmd1.NPU_SET_OFM_BASE3),
- (cmd1.NPU_SET_OFM_STRIDE_C, cmd1.NPU_SET_OFM_STRIDE_Y, cmd1.NPU_SET_OFM_STRIDE_X),
- cmd0.NPU_SET_OFM_ZERO_POINT,
- ),
- ):
-
- if tens is None:
- continue
-
- need_zero_point = (
- (faf is not None and forced_ofm_quantization is None)
- or (fmf == Op.ConcatSliceWrite)
- or fused_quantize
- )
- if (
- (primary_op.type in set((Op.AvgPool, Op.ResizeBilinear, Op.CLZ, Op.SHL)) and not need_zero_point)
- or (
- tens.dtype == DataType.int32
- and zero_point_op in (cmd0.NPU_SET_IFM_ZERO_POINT, cmd0.NPU_SET_IFM2_ZERO_POINT)
- )
- or tens.quantization is None
- ):
- # Actual integer operation, just set scale to 1 and zero point to 0
- emit.cmd0_with_param(zero_point_op, 0)
- else:
- assert tens.quantization.zero_point is not None, "need an actual zero point set"
- if cmd0.NPU_SET_OFM_ZERO_POINT == zero_point_op and forced_ofm_quantization is not None:
- zero_point = forced_ofm_quantization.zero_point
- elif (
- "resizebilinear" in primary_op.attrs
- and primary_op.type == Op.Add
- and cmd0.NPU_SET_OFM_ZERO_POINT == zero_point_op
- ):
- # Force output zero point same as the input zero point
- # for resizebilinear 1x1 that is converted to add
- zero_point = cmd.ifm2_tensor.quantization.zero_point
- else:
- zero_point = tens.quantization.zero_point
- emit.cmd0_with_param(zero_point_op, int(zero_point))
-
- if tens.shape == []:
- # Empty shape, elementwise constant
- ifm2_scalar = tens.quant_values
- assert ifm2_scalar.size == 1
- emit.cmd0_with_param(cmd0.NPU_SET_IFM2_SCALAR, int(ifm2_scalar.item(0)))
- continue
-
- height_0, height_1, width_0, addresses = tens.addresses_for_rolling_buffer(
- box.start_coord, box.end_coord
- )
- if npu_block_type != NpuBlockType.VectorProduct:
- if tens == cmd.ifm_tensor:
- emit.cmd0_with_param(cmd0.NPU_SET_IFM_HEIGHT0_M1, height_0 - 1)
- emit.cmd0_with_param(cmd0.NPU_SET_IFM_HEIGHT1_M1, height_1 - 1)
- emit.cmd0_with_param(cmd0.NPU_SET_IFM_WIDTH0_M1, width_0 - 1)
- elif tens == cmd.ofm_tensor:
- emit.cmd0_with_param(cmd0.NPU_SET_OFM_HEIGHT0_M1, height_0 - 1)
- emit.cmd0_with_param(cmd0.NPU_SET_OFM_HEIGHT1_M1, height_1 - 1)
- emit.cmd0_with_param(cmd0.NPU_SET_OFM_WIDTH0_M1, width_0 - 1)
- if tens == cmd.ifm2_tensor:
- emit.cmd0_with_param(cmd0.NPU_SET_IFM2_HEIGHT0_M1, height_0 - 1)
- emit.cmd0_with_param(cmd0.NPU_SET_IFM2_HEIGHT1_M1, height_1 - 1)
- emit.cmd0_with_param(cmd0.NPU_SET_IFM2_WIDTH0_M1, width_0 - 1)
- else:
- if len(out_shape) == 2:
- assert out_shape[0] == 1
- if tens == cmd.ifm_tensor:
- emit.cmd0_with_param(cmd0.NPU_SET_IFM_WIDTH0_M1, 0)
- elif tens == cmd.ofm_tensor:
- emit.cmd0_with_param(cmd0.NPU_SET_OFM_WIDTH0_M1, 0)
- else:
- assert False
-
- emit.cmd0_with_param(region_op, base_ptr_idx_map[tens.mem_type])
-
- for idx, addr in enumerate(addresses):
- if addr is None:
- addresses[idx] = 0
-
- emit.cmd1_with_offset(ptr_ops[0], addresses[0])
- emit.cmd1_with_offset(ptr_ops[1], addresses[1])
- emit.cmd1_with_offset(ptr_ops[2], addresses[2])
- emit.cmd1_with_offset(ptr_ops[3], addresses[3])
-
- strides = tens.get_strides()
- emit.cmd1_with_offset(stride_ops[0], strides[1]) # stride between 16-byte channel blocks (C)
- emit.cmd1_with_offset(stride_ops[2], strides[3]) # stride between horizontal values (W)
- emit.cmd1_with_offset(stride_ops[1], strides[2]) # stride between vertical values (H)
-
- if tens.format == TensorFormat.NHCWB16:
- # Check that all BasePointer addresses are aligned to 16 bytes
- assert (int(addresses[0]) % 16) == 0
- assert (int(addresses[1]) % 16) == 0
- assert (int(addresses[2]) % 16) == 0
- assert (int(addresses[3]) % 16) == 0
-
- ofm_dtype = cmd.ofm_tensor.dtype
- assert ofm_dtype.type & BaseType.Int
- prec = 0
- if ofm_dtype.size_in_bits() == 8:
- prec = 0
- elif ofm_dtype.size_in_bits() == 16:
- prec = 2
- elif ofm_dtype.size_in_bits() == 32:
- prec = 4
- else:
- assert 0
-
- if ofm_dtype.type & BaseType.Signed:
- prec += 1
-
- if use_global_scale:
- # Set global scale bit, as opposed to using per channel scale
- prec |= 1 << 8
-
- if cmd.ofm_tensor.format == TensorFormat.NHCWB16:
- prec |= 1 << 6
-
- prec |= rounding_mode.value << 14
-
- emit.cmd0_with_param(cmd0.NPU_SET_OFM_PRECISION, prec)
-
- prec = None
- weight_bits = 8
- if cmd.weight_tensor is not None:
- weight_bits = cmd.weight_tensor.dtype.size_in_bits()
-
- ifm_dtype = cmd.ifm_tensor.dtype
-
- assert weight_bits == 8, "Unsupported weight bit depth"
- assert (
- ifm_dtype.size_in_bits() in {8, 16}
- or ifm_dtype.size_in_bits() == 32
- and npu_block_type in (NpuBlockType.ElementWise, NpuBlockType.ReduceSum)
- ), "Unsupported ifm bit depth"
-
- if ifm_dtype.size_in_bits() == 8:
- if ifm_dtype.type & BaseType.Signed:
- prec = ifm_precision.S8
- else:
- prec = ifm_precision.U8
- elif ifm_dtype.size_in_bits() == 16:
- if ifm_dtype.type & BaseType.Signed:
- prec = ifm_precision.S16
- else:
- prec = ifm_precision.U16
- elif ifm_dtype == DataType.int32:
- prec = ifm_precision.S32
-
- ifm_prec = prec.value
- ifm2_prec = ifm_prec
-
- if cmd.ifm_tensor.format == TensorFormat.NHCWB16:
- ifm_prec |= 1 << 6
-
- ifm_prec |= op_to_scale << 8
-
- emit.cmd0_with_param(cmd0.NPU_SET_IFM_PRECISION, ifm_prec)
-
- if cmd.ifm2_tensor is not None:
- if cmd.ifm2_tensor.format == TensorFormat.NHCWB16:
- ifm2_prec |= 1 << 6
- emit.cmd0_with_param(cmd0.NPU_SET_IFM2_PRECISION, ifm2_prec)
-
- # Get op parameters
- cur_ifm_block_depth = get_op_ifmofm_block_depth(arch, cmd)
- cur_ofm_block = Block(ps.block_config[1], ps.block_config[0], ps.block_config[3])
- cur_ofm_rect = get_op_ofm_rect(cmd)
- cur_ifm_rect = get_op_ifm_rect(cmd)
- cur_padLT = get_op_padding_lt(cmd)
- if (prev_kernel is not None) and (cur_kernel is not None) and has_prev_op_dependency(prev_cmd, cmd):
- if cmd.ifm_tensor.shape == prev_cmd.ofm_tensor.shape:
- blockdep = arch.calc_block_dep(
- prev_ifm_rect,
- prev_ofm_rect,
- prev_ifm_block_depth,
- prev_ofm_block,
- prev_kernel,
- cur_ifm_rect,
- cur_ofm_rect,
- cur_ifm_block_depth,
- cur_ofm_block,
- cur_kernel,
- cur_padLT,
- )
- else:
- blockdep = 0
- else:
- blockdep = ArchitectureFeatures.MAX_BLOCKDEP
-
- # Set between every op (dependent or not)
+ dep_watermark = Watermark(0, 0)
+ prev_op = None
+ prev_block_config = None
+ # Generate register commands for all operations
+ for op_index, npu_op in enumerate(npu_op_list):
+ dep_watermark, cmd_waits = get_wait_dependency(arch, npu_op_list, memory_accesses, op_index, dep_watermark)
+ block_config = generate_registers_for_op(emit, npu_op, arch)
+ if not is_dma_op(npu_op):
+ # Generate BLOCKDEP
+ assert block_config is not None
+ blockdep = calc_blockdep(arch, prev_op, prev_block_config, npu_op, block_config)
blockdep = min(blockdep, arch.max_blockdep)
emit.cmd0_with_param(cmd0.NPU_SET_BLOCKDEP, blockdep)
- prev_cmd = cmd
-
- emit_cmd_waits(cmd_waits)
- DebugDatabase.add_command(stream_id, emit.offset, primary_op)
-
- if npu_block_type == NpuBlockType.ConvolutionMxN:
- emit.cmd_do_operation(cmd0.NPU_OP_CONV)
- elif npu_block_type == NpuBlockType.ConvolutionDepthWise:
- emit.cmd_do_operation(cmd0.NPU_OP_DEPTHWISE)
- elif npu_block_type == NpuBlockType.VectorProduct:
- # Vector product is implemented using a 1x1 convolution
- emit.cmd_do_operation(cmd0.NPU_OP_CONV)
- elif npu_block_type == NpuBlockType.Pooling:
- param = pooling_mode.MAX.value if primary_op.type.is_maxpool_op() else pooling_mode.AVERAGE.value
- emit.cmd_do_operation(cmd0.NPU_OP_POOL, param=param)
- elif npu_block_type == NpuBlockType.ReduceSum:
- emit.cmd_do_operation(cmd0.NPU_OP_POOL, param=pooling_mode.REDUCE_SUM.value)
- elif npu_block_type == NpuBlockType.ElementWise:
- param = elementwise_mode_map[primary_op.type]
- emit.cmd_do_operation(cmd0.NPU_OP_ELEMENTWISE, param)
- else:
- print("Warning: Skipping register command stream generation for", ps)
-
+ prev_op = npu_op
+ prev_block_config = block_config
+
+ generate_cmd_waits(emit, cmd_waits)
+ # Generate the actual NPU_OP command
+ generate_operation_code(emit, npu_op)
+ if add_to_debug_db is not None:
+ add_to_debug_db(npu_op, emit.offset)
# Fill in final part of command stream:
emit.cmd_do_operation(cmd0.NPU_OP_STOP, param=0xFFFF)
+
+def generate_register_command_stream_for_sg(nng, sg, arch, verbose=False):
+ """Generates command stream for the subgraph, adds it to sg.register_command_stream"""
+ # Convert high level command stream to list of NpuOperation
+ npu_op_list = []
+ npu_op_to_cmd = dict() # map from npu op to high level command
+ for cmd in sg.high_level_command_stream:
+ if cmd.cmdtype == CommandType.NpuStripe and cmd.ps.npu_block_type == NpuBlockType.Default:
+ print("Warning: Skipping register command stream generation for", cmd.ps)
+ else:
+ npu_op = convert_command_to_npu_op(cmd, arch)
+ npu_op_list.append(npu_op)
+ npu_op_to_cmd[npu_op] = cmd
+ if verbose:
+ print_operations(npu_op_list)
+ # Generate register commands
+ stream_id = DebugDatabase.add_stream(sg)
+ DebugDatabase.set_stream_offset(sg, 0) # Default to zero, can only set during file writing
+ emit = CommandStreamEmitter()
+
+ def add_to_debug_db(npu_op: NpuOperation, offset: int):
+ """Adds info to the debug database"""
+ if not is_dma_op(npu_op):
+ cmd = npu_op_to_cmd[npu_op]
+ DebugDatabase.add_command(stream_id, offset, cmd.ps.primary_op)
+
+ generate_command_stream(emit, npu_op_list, arch, add_to_debug_db)
sg.register_command_stream = emit.to_list()
if verbose:
emit.print_cmds()
print("number of commands", len(emit.cmd_stream))
print("command stream length in words", len(sg.register_command_stream))
+
+
+def generate_register_command_stream(npu_op_list: List[NpuOperation], accelerator: Accelerator) -> List[int]:
+ """
+ Public-facing API for generating an Ethos-U register command stream.
+ Calculates dependencies between commands and inserts wait operations if needed.
+
+ :param npu_op_list: List[NpuOperation] list of high-level NPU operations
+ :param accelerator: architecture_features.Accelerator enum to pick the correct Ethos-U accelerator
+ :return: Ethos-U instructions, as a list of 32-bit integers
+ """
+ emit = CommandStreamEmitter()
+ arch = ArchitectureFeatures(
+ vela_config=None,
+ system_config=None,
+ accelerator_config=accelerator.value,
+ override_block_config=None,
+ block_config_limit=None,
+ global_memory_clock_scale=1.0,
+ max_blockdep=ArchitectureFeatures.MAX_BLOCKDEP,
+ weight_estimation_scaling=1.0,
+ )
+ generate_command_stream(emit, npu_op_list, arch)
+ return emit.to_list()
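
A minimal usage sketch for the public API added at the end of this patch, assuming the ethosu.vela package layout implied by the file's imports; NpuDmaOperation and NpuAddressRange come from the api module, Accelerator from architecture_features, and all region/address/length values below are purely illustrative:

# Minimal sketch: drive generate_register_command_stream with one DMA copy.
from ethosu.vela.api import NpuAddressRange
from ethosu.vela.api import NpuDmaOperation
from ethosu.vela.architecture_features import Accelerator
from ethosu.vela.register_command_stream_generator import generate_register_command_stream

# Describe a 64-byte DMA copy from region 0 to region 1 (illustrative values)
src = NpuAddressRange(region=0, address=0x100, length=64)
dest = NpuAddressRange(region=1, address=0x800, length=64)
dma_op = NpuDmaOperation(src, dest)

# Returns the Ethos-U register command stream as a list of 32-bit words
cmds = generate_register_command_stream([dma_op], Accelerator.Ethos_U55_128)
print("command stream length in words:", len(cmds))

Any list of NpuOperation objects built through the api module can be passed the same way; as the function body above shows, it constructs a default ArchitectureFeatures from the accelerator enum and delegates to generate_command_stream, which computes memory accesses and inserts the required wait operations.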