Diffstat (limited to 'ethosu/vela/register_command_stream_generator.py')
-rw-r--r-- | ethosu/vela/register_command_stream_generator.py | 1970
1 file changed, 1073 insertions, 897 deletions
diff --git a/ethosu/vela/register_command_stream_generator.py b/ethosu/vela/register_command_stream_generator.py index e3fedfcc..30b5e04a 100644 --- a/ethosu/vela/register_command_stream_generator.py +++ b/ethosu/vela/register_command_stream_generator.py @@ -14,47 +14,72 @@ # See the License for the specific language governing permissions and # limitations under the License. # Description: -# Register level (low-level) command stream generation for Ethos-U55. Takes a high-level command stream and generates +# Register level (low-level) command stream generation for Ethos-U55. Takes a list of NPU operations and generates # all the register settings. Calculates dependencies between commands and inserts wait operations. And generates a bit # stream suitable for interpretation by the Ethos-U55 processor. from collections import defaultdict from collections import namedtuple from enum import Enum from enum import IntEnum +from typing import List +from typing import Optional import numpy as np +from . import numeric_util from . import scaling +from .api import NpuActivation +from .api import NpuActivationOp +from .api import NpuAddressRange +from .api import NpuBlockOperation +from .api import NpuBlockTraversal +from .api import NpuConv2DOperation +from .api import NpuDataType +from .api import NpuDmaOperation +from .api import NpuElementWiseOp +from .api import NpuElementWiseOperation +from .api import NpuFeatureMap +from .api import NpuKernel +from .api import NpuLayout +from .api import NpuOperation +from .api import NpuOperationType +from .api import NpuPadding +from .api import NpuPoolingOp +from .api import NpuPoolingOperation +from .api import NpuQuantization +from .api import NpuResamplingMode +from .api import NpuRoundingMode +from .api import NpuShape3D +from .api import NpuTileBox +from .architecture_features import Accelerator from .architecture_features import ArchitectureFeatures from .architecture_features import Block from .architecture_features import Rect from .architecture_features import SharedBufferArea from .architecture_features import SHRAMElements -from .data_type import BaseType -from .data_type import DataType from .debug_database import DebugDatabase from .ethos_u55_regs.ethos_u55_regs import acc_format from .ethos_u55_regs.ethos_u55_regs import activation from .ethos_u55_regs.ethos_u55_regs import cmd0 from .ethos_u55_regs.ethos_u55_regs import cmd1 from .ethos_u55_regs.ethos_u55_regs import elementwise_mode -from .ethos_u55_regs.ethos_u55_regs import ifm_precision from .ethos_u55_regs.ethos_u55_regs import pooling_mode from .ethos_u55_regs.ethos_u55_regs import resampling_mode from .ethos_u55_regs.ethos_u55_regs import rounding from .high_level_command_stream import CommandType -from .numeric_util import clamp_sigmoid -from .numeric_util import clamp_tanh -from .numeric_util import full_shape +from .high_level_command_to_npu_op import convert_command_to_npu_op +from .high_level_command_to_npu_op import to_kernel +from .high_level_command_to_npu_op import unary_elementwise_ops from .numeric_util import quantise_float32 from .numeric_util import round_away_zero from .numeric_util import round_up_to_int from .operation import NpuBlockType -from .operation import Op -from .tensor import MemType -from .tensor import TensorBlockTraversal -from .tensor import TensorFormat -from .tensor import TensorPurpose +from .range_set import AccessDirection +from .range_set import MemoryAccessSet +from .range_set import MemoryRangeSet +from .shared_buffer_allocation import 
find_suitable_block_configs +from .shared_buffer_allocation import shared_buffer_allocation_for_npu_op +from .shared_buffer_allocation import SharedBufferAllocation class RegisterMachine: @@ -80,22 +105,6 @@ class CmdMode(IntEnum): CmdOpMask = 0x03FF -class BasePointerIndex(IntEnum): - WeightTensor = 0 # base address index for the Weight tensor - ScratchTensor = 1 # base address index for the Scratch_tensor in the TensorArena - ScratchFastTensor = 2 # base address for the Scratch_fast_tensor - Mem2Mem = (1 << 8) | (3 << 0) # base address slot for memory 2 memory transfer - - -# TODO: Replace with definitions from ethos_u55_regs -class IFM2Broadcast(IntEnum): - BroadcastHdim = 1 << 0 - BroadcastWdim = 1 << 1 - BroadcastCdim = 1 << 2 - ReverseOperandOrder = 1 << 6 - UseIFM2Scalar = 1 << 7 - - class CommandStreamEmitter: WORD_SIZE = 4 @@ -117,7 +126,7 @@ class CommandStreamEmitter: sz += len(cmd) * CommandStreamEmitter.WORD_SIZE return sz - def to_list(self): + def to_list(self) -> List[int]: return [elem for cmd in self.cmd_stream for elem in cmd] def print_cmds(self): @@ -146,7 +155,7 @@ class CommandStreamEmitter: print(s) - def cmd0_with_param(self, cmd, param): + def cmd0_with_param(self, cmd: cmd0, param): if isinstance(param, Enum): param = int(param.value) else: @@ -160,7 +169,7 @@ class CommandStreamEmitter: self.cmd_stream.append((command,)) self.offset += CommandStreamEmitter.WORD_SIZE - def cmd1_with_offset(self, cmd, offset, param=0x0): + def cmd1_with_offset(self, cmd: cmd1, offset, param=0x0): offset = int(offset) & 0xFFFFFFFFF command = cmd.value | CmdMode.Payload32.value | (param << 16) @@ -171,13 +180,13 @@ class CommandStreamEmitter: self.cmd_stream.append((command, offset)) self.offset += CommandStreamEmitter.WORD_SIZE * 2 - def cmd_wait(self, cmd, channel, outstanding_count): + def cmd_wait(self, cmd: cmd0, channel: int, outstanding_count: int): param = (16 * channel) + outstanding_count command = ((param & 0xFFFF) << 16) | cmd.value self.cmd_stream.append((command,)) self.offset += CommandStreamEmitter.WORD_SIZE - def cmd_do_operation(self, cmd, param=0): + def cmd_do_operation(self, cmd: cmd0, param=0): param = int(param) command = ((param & 0xFFFF) << 16) | cmd.value @@ -186,13 +195,674 @@ class CommandStreamEmitter: self.get_reg_machine(cmd).switch_bank() +# ------------------------------------------------------------------- +# REGISTER GENERATION +# ------------------------------------------------------------------- + + +class BasePointerIndex(IntEnum): + WeightTensor = 0 # base address index for the Weight tensor + ScratchTensor = 1 # base address index for the Scratch_tensor in the TensorArena + ScratchFastTensor = 2 # base address for the Scratch_fast_tensor + Mem2Mem = (1 << 8) | (3 << 0) # base address slot for memory 2 memory transfer + + +# TODO: Replace with definitions from ethos_u55_regs +class IFM2Broadcast(IntEnum): + BroadcastHdim = 1 << 0 + BroadcastWdim = 1 << 1 + BroadcastCdim = 1 << 2 + ReverseOperandOrder = 1 << 6 + UseIFM2Scalar = 1 << 7 + + +pooling_op_map = { + NpuPoolingOp.MAX: pooling_mode.MAX.value, + NpuPoolingOp.AVERAGE: pooling_mode.AVERAGE.value, + NpuPoolingOp.REDUCE_SUM: pooling_mode.REDUCE_SUM.value, +} + +elementwise_op_map = { + NpuElementWiseOp.MUL: elementwise_mode.MUL.value, + NpuElementWiseOp.ADD: elementwise_mode.ADD.value, + NpuElementWiseOp.SUB: elementwise_mode.SUB.value, + NpuElementWiseOp.MIN: elementwise_mode.MIN.value, + NpuElementWiseOp.MAX: elementwise_mode.MAX.value, + NpuElementWiseOp.LRELU: 
elementwise_mode.LRELU.value, + NpuElementWiseOp.ABS: elementwise_mode.ABS.value, + NpuElementWiseOp.CLZ: elementwise_mode.CLZ.value, + NpuElementWiseOp.SHR: elementwise_mode.SHR.value, + NpuElementWiseOp.SHL: elementwise_mode.SHL.value, +} + +activation_op_map = { + NpuActivationOp.NONE_OR_RELU: activation.NONE, + NpuActivationOp.TANH: activation.TANH, + NpuActivationOp.SIGMOID: activation.SIGMOID, +} + +# Maps an AccumulatorType enum to the corresponding acc_format value +acc_format_map = { + SHRAMElements.Acc16: acc_format.FP_S5_10.value, + SHRAMElements.Acc32: acc_format.INT_32BIT.value, + SHRAMElements.Acc40: acc_format.INT_40BIT.value, +} + +resampling_mode_map = { + NpuResamplingMode.NONE: resampling_mode.NONE, + NpuResamplingMode.NEAREST: resampling_mode.NEAREST, + NpuResamplingMode.TRANSPOSE: resampling_mode.TRANSPOSE, +} + +# Maps data type size in bits to activation precision +precision_map = {8: 0, 16: 1, 32: 2} + +# Maps rounding mode to the corresponding value +rounding_mode_map = { + NpuRoundingMode.TFL: rounding.TFL.value, + NpuRoundingMode.TRUNCATE: rounding.TRUNCATE.value, + NpuRoundingMode.NATURAL: rounding.NATURAL.value, +} + + +def quantise(value: float, quant: Optional[NpuQuantization]) -> int: + """Quantizes the given value""" + scale = 1 if quant is None or quant.scale_f32 is None else quant.scale_f32 + zp = 0 if quant is None else quant.zero_point + return quantise_float32(value, scale, zp) + + +def has_ifm2(npu_op: NpuBlockOperation) -> bool: + """Checks if op has non-scalar IFM2""" + return npu_op.ifm2 is not None and npu_op.ifm2_scalar is None + + +def is_dma_op(npu_op: NpuOperation) -> bool: + """Checks if op is a DMA operation""" + return npu_op.op_type == NpuOperationType.Dma + + +def generate_padding(emit: CommandStreamEmitter, padding: NpuPadding): + """Generates IFM_PAD registers""" + emit.cmd0_with_param(cmd0.NPU_SET_IFM_PAD_TOP, padding.top) + emit.cmd0_with_param(cmd0.NPU_SET_IFM_PAD_LEFT, padding.left) + emit.cmd0_with_param(cmd0.NPU_SET_IFM_PAD_BOTTOM, padding.bottom) + emit.cmd0_with_param(cmd0.NPU_SET_IFM_PAD_RIGHT, padding.right) + + +def generate_activation(emit: CommandStreamEmitter, activation: Optional[NpuActivation], ofm: NpuFeatureMap): + """Generates ACTIVATION registers""" + act = activation if activation is not None else NpuActivation(NpuActivationOp.NONE_OR_RELU) + + if act.min is None: + quantized_min = ofm.data_type.min_value() + else: + quantized_min = quantise(act.min, ofm.quantization) + if act.max is None: + quantized_max = ofm.data_type.max_value() + else: + quantized_max = quantise(act.max, ofm.quantization) + quantized_min = max(quantized_min, np.iinfo(np.int16).min, ofm.data_type.min_value()) + quantized_max = min(quantized_max, np.iinfo(np.int16).max, ofm.data_type.max_value()) + if act.op_type == NpuActivationOp.TABLE_LOOKUP: + assert 0 <= act.lookup_table_index < 8 + activation_value = 16 + act.lookup_table_index + if ofm.data_type == NpuDataType.INT32: + activation_value |= 3 << 12 # Force I8 range + quantized_min = max(-128, quantized_min) + quantized_max = min(127, quantized_max) + else: + activation_value = activation_op_map[act.op_type] + emit.cmd0_with_param(cmd0.NPU_SET_ACTIVATION, activation_value) + emit.cmd0_with_param(cmd0.NPU_SET_ACTIVATION_MIN, quantized_min) + emit.cmd0_with_param(cmd0.NPU_SET_ACTIVATION_MAX, quantized_max) + + +def generate_addresses(emit: CommandStreamEmitter, ptr_cmds: List[cmd1], addresses: List[int], layout: NpuLayout): + """Generates xFM_BASE registers""" + if layout == NpuLayout.NHCWB16: 
+ # Check that all BasePointer addresses are aligned to 16 bytes + assert all((int(addr) % 16) == 0 for addr in addresses) + emit.cmd1_with_offset(ptr_cmds[0], addresses[0]) + emit.cmd1_with_offset(ptr_cmds[1], addresses[1]) + emit.cmd1_with_offset(ptr_cmds[2], addresses[2]) + emit.cmd1_with_offset(ptr_cmds[3], addresses[3]) + + +def generate_tiles(emit: CommandStreamEmitter, tile_cmds: List[cmd0], tiles: NpuTileBox): + """Generates xFM_HEIGHT0/HEIGHT1/WIDTH0 registers""" + emit.cmd0_with_param(tile_cmds[0], tiles.height_0 - 1) + emit.cmd0_with_param(tile_cmds[1], tiles.height_1 - 1) + emit.cmd0_with_param(tile_cmds[2], tiles.width_0 - 1) + + +def generate_strides( + emit: CommandStreamEmitter, fm: NpuFeatureMap, stride_c_cmd: cmd1, stride_y_cmd: cmd1, stride_x_cmd: cmd1 +): + """Generates STRIDE_C/Y/X registers""" + strides = get_strides(fm) + emit.cmd1_with_offset(stride_c_cmd, strides.depth) # stride between 16-byte channel blocks (C) + emit.cmd1_with_offset(stride_y_cmd, strides.height) # stride between vertical values (H) + emit.cmd1_with_offset(stride_x_cmd, strides.width) # stride between horisontal values (W) + + +def generate_ifm_precision(emit: CommandStreamEmitter, fm: NpuFeatureMap, op_to_scale: int, precision_cmd: cmd0): + """Generates IFM/IFM2_PRECISION register""" + dtype = fm.data_type + prec = 1 if dtype.is_signed() else 0 + activation_precision = precision_map[dtype.size_in_bits()] + prec += activation_precision << 2 + + if fm.layout == NpuLayout.NHCWB16: + prec |= 1 << 6 + + prec |= op_to_scale << 8 + emit.cmd0_with_param(precision_cmd, prec) + + +def generate_ofm_precision(emit: CommandStreamEmitter, npu_op: NpuBlockOperation, use_global_scale: bool): + """Generates OFM_PRECISION register""" + dtype = npu_op.ofm.data_type + prec = 1 if dtype.is_signed() else 0 + activation_precision = precision_map[dtype.size_in_bits()] + prec += activation_precision << 1 + + if use_global_scale: + # Set global scale bit, as opposed to using per channel scale + prec |= 1 << 8 + if npu_op.ofm.layout == NpuLayout.NHCWB16: + prec |= 1 << 6 + prec |= rounding_mode_map[npu_op.rounding_mode] << 14 + emit.cmd0_with_param(cmd0.NPU_SET_OFM_PRECISION, prec) + + +def generate_ifm2_broadcast(emit: CommandStreamEmitter, npu_op: NpuElementWiseOperation): + """Generates IFM2_BROADCAST register for binary elementwise operations""" + ifm2_broadcast = 0 + ifm = npu_op.ifm + ifm2 = npu_op.ifm2 + if npu_op.reversed_operands: + ifm2_broadcast |= IFM2Broadcast.ReverseOperandOrder + if npu_op.ifm2_scalar is not None: + # IFM2 is a constant, set UseIFM2Scalar bit to IFM2_BROADCAST + ifm2_broadcast |= IFM2Broadcast.UseIFM2Scalar + else: + if ifm.shape.height != ifm2.shape.height: + # Broadcast in 'H' dimension + assert ifm2.shape.height == 1 + ifm2_broadcast |= IFM2Broadcast.BroadcastHdim + + if ifm.shape.width != ifm2.shape.width: + # Broadcast in 'W' dimension + assert ifm2.shape.width == 1 + ifm2_broadcast |= IFM2Broadcast.BroadcastWdim + + if ifm.shape.depth != ifm2.shape.depth: + # Broadcast in 'C' dimension + assert ifm2.shape.depth == 1 + ifm2_broadcast |= IFM2Broadcast.BroadcastCdim + + emit.cmd0_with_param(cmd0.NPU_SET_IFM2_BROADCAST, ifm2_broadcast) + + +def generate_ifm(emit: CommandStreamEmitter, ifm: NpuFeatureMap): + """Generates general IFM registers""" + emit.cmd0_with_param(cmd0.NPU_SET_IFM_REGION, ifm.region) + generate_addresses( + emit, + [cmd1.NPU_SET_IFM_BASE0, cmd1.NPU_SET_IFM_BASE1, cmd1.NPU_SET_IFM_BASE2, cmd1.NPU_SET_IFM_BASE3], + ifm.tiles.addresses, + ifm.layout, + ) + 
generate_tiles( + emit, [cmd0.NPU_SET_IFM_HEIGHT0_M1, cmd0.NPU_SET_IFM_HEIGHT1_M1, cmd0.NPU_SET_IFM_WIDTH0_M1], ifm.tiles + ) + emit.cmd0_with_param(cmd0.NPU_SET_IFM_DEPTH_M1, ifm.shape.depth - 1) + generate_strides(emit, ifm, cmd1.NPU_SET_IFM_STRIDE_C, cmd1.NPU_SET_IFM_STRIDE_Y, cmd1.NPU_SET_IFM_STRIDE_X) + emit.cmd0_with_param(cmd0.NPU_SET_IFM_ZERO_POINT, int(ifm.quantization.zero_point)) + + +def generate_ifm2(emit: CommandStreamEmitter, ifm2: NpuFeatureMap, has_scalar: bool): + """Generates general IFM2 registers""" + if not has_scalar: + emit.cmd0_with_param(cmd0.NPU_SET_IFM2_REGION, ifm2.region) + generate_addresses( + emit, + [cmd1.NPU_SET_IFM2_BASE0, cmd1.NPU_SET_IFM2_BASE1, cmd1.NPU_SET_IFM2_BASE2, cmd1.NPU_SET_IFM2_BASE3], + ifm2.tiles.addresses, + ifm2.layout, + ) + generate_tiles( + emit, [cmd0.NPU_SET_IFM2_HEIGHT0_M1, cmd0.NPU_SET_IFM2_HEIGHT1_M1, cmd0.NPU_SET_IFM2_WIDTH0_M1], ifm2.tiles + ) + generate_strides(emit, ifm2, cmd1.NPU_SET_IFM2_STRIDE_C, cmd1.NPU_SET_IFM2_STRIDE_Y, cmd1.NPU_SET_IFM2_STRIDE_X) + emit.cmd0_with_param(cmd0.NPU_SET_IFM2_ZERO_POINT, int(ifm2.quantization.zero_point)) + + +def generate_ofm(emit: CommandStreamEmitter, ofm: NpuFeatureMap): + """Generates general OFM registers""" + emit.cmd0_with_param(cmd0.NPU_SET_OFM_REGION, ofm.region) + generate_addresses( + emit, + [cmd1.NPU_SET_OFM_BASE0, cmd1.NPU_SET_OFM_BASE1, cmd1.NPU_SET_OFM_BASE2, cmd1.NPU_SET_OFM_BASE3], + ofm.tiles.addresses, + ofm.layout, + ) + generate_tiles( + emit, [cmd0.NPU_SET_OFM_HEIGHT0_M1, cmd0.NPU_SET_OFM_HEIGHT1_M1, cmd0.NPU_SET_OFM_WIDTH0_M1], ofm.tiles + ) + emit.cmd0_with_param(cmd0.NPU_SET_OFM_HEIGHT_M1, ofm.shape.height - 1) + emit.cmd0_with_param(cmd0.NPU_SET_OFM_WIDTH_M1, ofm.shape.width - 1) + emit.cmd0_with_param(cmd0.NPU_SET_OFM_DEPTH_M1, ofm.shape.depth - 1) + generate_strides(emit, ofm, cmd1.NPU_SET_OFM_STRIDE_C, cmd1.NPU_SET_OFM_STRIDE_Y, cmd1.NPU_SET_OFM_STRIDE_X) + emit.cmd0_with_param(cmd0.NPU_SET_OFM_ZERO_POINT, int(ofm.quantization.zero_point)) + + +def generate_kernel(emit: CommandStreamEmitter, kernel: NpuKernel, block_traversal: NpuBlockTraversal): + """Generates KERNEL related registers""" + emit.cmd0_with_param(cmd0.NPU_SET_KERNEL_HEIGHT_M1, kernel.dilation_y * (kernel.height - 1)) + emit.cmd0_with_param(cmd0.NPU_SET_KERNEL_WIDTH_M1, kernel.dilation_x * (kernel.width - 1)) + # set kernel x stride low bit + stride = (kernel.stride_x - 1) & 1 + # set kernel y stride low bit + stride |= (kernel.stride_y - 1 & 1) << 1 + # set kernel x stride extension bits + stride |= (kernel.stride_x - 1 >> 1) << 6 + # set kernel y stride extension bits + stride |= (kernel.stride_y - 1 >> 1) << 9 + stride |= (kernel.dilation_x - 1) << 3 + stride |= (kernel.dilation_y - 1) << 4 + if block_traversal == NpuBlockTraversal.PART_KERNEL_FIRST: + stride |= 1 << 2 + emit.cmd0_with_param(cmd0.NPU_SET_KERNEL_STRIDE, stride) + + +def generate_weights(emit: CommandStreamEmitter, weights: List[NpuAddressRange], arch: ArchitectureFeatures): + """Generates WEIGHT registers""" + if len(weights) == 0: + return + emit.cmd0_with_param(cmd0.NPU_SET_WEIGHT_REGION, weights[0].region) + # Set weights sources for active and present cores + for core, (addr, length) in enumerate( + [ + (cmd1.NPU_SET_WEIGHT_BASE, cmd1.NPU_SET_WEIGHT_LENGTH), + (cmd1.NPU_SET_WEIGHT1_BASE, cmd1.NPU_SET_WEIGHT1_LENGTH), + ] + ): + if core < len(weights): + emit.cmd1_with_offset(addr, weights[core].address) + emit.cmd1_with_offset(length, weights[core].length) + elif core < arch.ncores: + emit.cmd1_with_offset(addr, 
weights[0].address) + emit.cmd1_with_offset(length, 0) + + +def generate_biases(emit: CommandStreamEmitter, biases: List[NpuAddressRange], arch: ArchitectureFeatures): + """Generates SCALE registers""" + if len(biases) == 0: + return + emit.cmd0_with_param(cmd0.NPU_SET_SCALE_REGION, biases[0].region) + # Set weights sources for active and present cores + for core, (addr, length) in enumerate( + [(cmd1.NPU_SET_SCALE_BASE, cmd1.NPU_SET_SCALE_LENGTH), (cmd1.NPU_SET_SCALE1_BASE, cmd1.NPU_SET_SCALE1_LENGTH)] + ): + if core < len(biases): + emit.cmd1_with_offset(addr, biases[core].address) + emit.cmd1_with_offset(length, biases[core].length) + elif core < arch.ncores: + emit.cmd1_with_offset(addr, biases[0].address) + emit.cmd1_with_offset(length, 0) + + +def generate_block_config( + emit: CommandStreamEmitter, + npu_op: NpuBlockOperation, + arch: ArchitectureFeatures, + shared_buffer: SharedBufferAllocation, +) -> NpuShape3D: + """Selects a suitable block config if none has been set, and generates OFM_BLK_HEIGHT/WIDTH/DEPTH registers""" + block_config = npu_op.block_config + if block_config is None or block_config.height < 0: + # Note: this code only used if the public API to generate command streams is used; + # in the "normal" flow, the block config selected by the scheduler is used + if npu_op.weights: + assert block_config is not None, "block_config.depth must be provided for ops with weights" + # Block config has not been provided: find one + blocks = find_suitable_block_configs(arch, shared_buffer) + # Return the block with biggest volume + # TODO: use a better algorithm to find the best block + best_block = None + best_value = 0 + for block in blocks: + if block_config is not None and block[3] != block_config.depth: + continue + value = block[0] * block[1] * block[3] + if value > best_value: + best_value = value + best_block = block + assert best_block is not None, f"No suitable block config was found, {npu_op.op_type}" + block_config = NpuShape3D(height=best_block[0], width=best_block[1], depth=best_block[3]) + alloc = shared_buffer.try_block(Block(block_config.width, block_config.height, block_config.depth)) + assert alloc is not None, f"Block config {block_config} does not fit, op: {npu_op.op_type}" + emit.cmd0_with_param(cmd0.NPU_SET_OFM_BLK_HEIGHT_M1, block_config.height - 1) + emit.cmd0_with_param(cmd0.NPU_SET_OFM_BLK_WIDTH_M1, block_config.width - 1) + emit.cmd0_with_param(cmd0.NPU_SET_OFM_BLK_DEPTH_M1, block_config.depth - 1) + return block_config + + +def generate_shram_registers_elementwise( + emit: CommandStreamEmitter, + npu_op: NpuElementWiseOperation, + arch: ArchitectureFeatures, + shared_buffer: SharedBufferAllocation, +): + """Generates IB_END/IB_START/AB_START registers for elementwise operations""" + # For elementwise set the required SHRAM to be equal to the total size of available SHRAM + uses_lut = npu_op.activation is not None and npu_op.activation.op_type == NpuActivationOp.TABLE_LOOKUP + shram_required = arch.available_shram_banks(uses_lut) + + # Acc buffers not needed so set AB_START to size of SHRAM + emit.cmd0_with_param(cmd0.NPU_SET_IFM_IB_END, shram_required) + emit.cmd0_with_param(cmd0.NPU_SET_AB_START, shram_required) + if has_ifm2(npu_op): + # Set IFM2_IB_START to the latter half of the IB space + ifm_ib_start = shared_buffer.bank_locations[SharedBufferArea.IFM] + emit.cmd0_with_param( + cmd0.NPU_SET_IFM2_IB_START, (shram_required - ifm_ib_start) // shared_buffer.ifm_count + ifm_ib_start, + ) + emit.cmd0_with_param(cmd0.NPU_SET_ACC_FORMAT, 
acc_format_map[shared_buffer.use_accumulator_element]) + + +def generate_shram_registers_non_elementwise(emit: CommandStreamEmitter, shared_buffer: SharedBufferAllocation): + """Generates IB_END/IB_START/AB_START registers for non-elementwise operations""" + emit.cmd0_with_param( + cmd0.NPU_SET_IFM_IB_END, + shared_buffer.bank_locations[SharedBufferArea.IFM] + shared_buffer.banks_required[SharedBufferArea.IFM], + ) + emit.cmd0_with_param(cmd0.NPU_SET_AB_START, shared_buffer.bank_locations[SharedBufferArea.Accumulators]) + emit.cmd0_with_param(cmd0.NPU_SET_ACC_FORMAT, acc_format_map[shared_buffer.use_accumulator_element]) + + +def generate_common( + emit: CommandStreamEmitter, + npu_op: NpuBlockOperation, + block_traversal: NpuBlockTraversal, + arch: ArchitectureFeatures, + use_global_scale: bool = False, + op_to_scale: int = 0, +): + """Generate registers that are common to most operations""" + assert npu_op.ifm is not None and npu_op.ofm is not None + generate_ifm(emit, npu_op.ifm) + generate_ifm_precision(emit, npu_op.ifm, op_to_scale, cmd0.NPU_SET_IFM_PRECISION) + emit.cmd0_with_param(cmd0.NPU_SET_IFM_UPSCALE, resampling_mode_map[npu_op.ifm_upscale]) + if npu_op.padding is not None: + generate_padding(emit, npu_op.padding) + generate_ofm(emit, npu_op.ofm) + generate_ofm_precision(emit, npu_op, use_global_scale) + if npu_op.op_type != NpuOperationType.ElementWise: + assert npu_op.kernel is not None + generate_kernel(emit, npu_op.kernel, block_traversal) + generate_weights(emit, npu_op.weights, arch) + generate_biases(emit, npu_op.biases, arch) + generate_activation(emit, npu_op.activation, npu_op.ofm) + + +# ------------------------------------------------------------------- +# SCALING +# ------------------------------------------------------------------- + + +def generate_ofm_scaling_for_pooling(emit: CommandStreamEmitter, pool_op: NpuPoolingOperation): + """Generates OFM_SCALE register for pooling operations""" + # For valid padding vela has to output scaling values + kernel = pool_op.kernel + ifm_quant = pool_op.ifm.quantization + ofm_quant = pool_op.ofm.quantization + if pool_op.activation is not None and pool_op.activation.op_type in (NpuActivationOp.SIGMOID, NpuActivationOp.TANH): + assert ifm_quant.scale_f32 is not None + rescale = 0x3000 * ifm_quant.scale_f32 + if pool_op.ifm.data_type == NpuDataType.INT16: + # Calculate scale and shift for the output scale of 1/(3*4096) + shift = 0 + max_rescale = np.iinfo(np.int16).max / 2 + while rescale <= max_rescale and shift <= 30: + shift += 1 + rescale *= 2 + scale = int(rescale) + else: + rescale_bits = len(bin(round_up_to_int(rescale))) - 2 + 1 + scale, shift = scaling.quantise_pooling_scale(kernel.height * kernel.width, rescale_bits) + scale = int(round_away_zero(scale * rescale)) + elif pool_op.fused_quantize: + # Quantize op requires different scaling + ifm_scale_f64 = np.double(ifm_quant.scale_f32) + ofm_scale_f64 = np.double(ofm_quant.scale_f32) + scale, shift = scaling.quantise_scale(ifm_scale_f64 / ofm_scale_f64) + elif pool_op.rescale is not None: + # for ResizeBilinear operations with "rescale" in primary_op.attrs + rescale = pool_op.rescale + rescale_bits = len(bin(round_up_to_int(rescale))) - 2 + 1 + scale, shift = scaling.quantise_pooling_scale(kernel.height * kernel.width, rescale_bits) + scale = int(round_away_zero(scale * rescale)) + else: + # In case avg pool fused with concat or other memory operation, rescaling might be needed. 
+ # kernel height == kernel width == 1 is always true in this case + # Normally the scale is maximised, to get maximum precision, which means that + # if rescale != 1, scale need to consider the number of bits needed for rescaling + if ofm_quant.scale_f32 is not None and ifm_quant.scale_f32 is not None: + rescale = ifm_quant.scale_f32 / ofm_quant.scale_f32 + rescale_bits = 0 + if kernel.height == kernel.width == 1: + if rescale > 1: + rescale_bits = len(bin(round_up_to_int(rescale))) - 2 + 1 + elif rescale < 1: + rescale_bits = -(len(bin(round_up_to_int(1 / rescale))) - 2 - 1) + scale, shift = scaling.quantise_pooling_scale(kernel.height * kernel.width, rescale_bits) + scale = int(round_away_zero(scale * rescale)) + else: + scale = 1 + shift = 0 + + emit.cmd1_with_offset(cmd1.NPU_SET_OFM_SCALE, scale, shift) + + +def generate_scaling_for_elementwise(emit: CommandStreamEmitter, npu_op: NpuElementWiseOperation) -> int: + """ + Generates OFM/OPA/OPB_SCALE registers for elementwise operators. + Returns the operator to scale + """ + op_to_scale = 0 + if npu_op.sub_op_type in (NpuElementWiseOp.ADD, NpuElementWiseOp.MUL, NpuElementWiseOp.SUB): + input_scale = npu_op.ifm.quantization.scale_f32 if npu_op.ifm.quantization else None + input2_scale = npu_op.ifm2.quantization.scale_f32 if npu_op.ifm2.quantization else None + output_scale = npu_op.ofm.quantization.scale_f32 if npu_op.ofm.quantization else None + + if npu_op.activation is not None and npu_op.activation.op_type in ( + NpuActivationOp.SIGMOID, + NpuActivationOp.TANH, + ): + output_scale = 1 / 0x3000 + + if npu_op.sub_op_type == NpuElementWiseOp.MUL: + if None in (input_scale, input2_scale, output_scale): + ofm_scale = 1 + shift = 0 + else: + ofm_scale, shift = scaling.elementwise_mul_scale(input_scale, input2_scale, output_scale) + emit.cmd1_with_offset(cmd1.NPU_SET_OFM_SCALE, ofm_scale, shift) + else: # Add/Sub + if None in (input_scale, input2_scale, output_scale): + opa_scale = opb_scale = ofm_scale = 1 + opa_shift = shift = 0 + if npu_op.rescale is not None: + ofm_scale, shift = npu_op.rescale + elif input_scale == input2_scale: + opa_scale, opb_scale, ofm_scale, shift = scaling.simplified_elementwise_add_sub_scale( + input_scale, input2_scale, output_scale + ) + opa_shift = 0 # Unused for this case + else: + # Use advanced implementation only when input scales differ + bitdepth = npu_op.ifm.data_type.size_in_bits() + (opa_scale, opa_shift, ofm_scale, shift, op_to_scale,) = scaling.advanced_elementwise_add_sub_scale( + input_scale, input2_scale, output_scale, bitdepth + ) + opb_scale = 0 # Unused for this case + if npu_op.reversed_operands: + # If the operand order is reversed we also have to swap which operand is scaled + if op_to_scale == scaling.OperandToScale.OPa: + op_to_scale = scaling.OperandToScale.OPb + else: + op_to_scale = scaling.OperandToScale.OPa + emit.cmd1_with_offset(cmd1.NPU_SET_OPA_SCALE, opa_scale, opa_shift) + emit.cmd1_with_offset(cmd1.NPU_SET_OPB_SCALE, opb_scale) + emit.cmd1_with_offset(cmd1.NPU_SET_OFM_SCALE, ofm_scale, shift) + elif npu_op.sub_op_type in (NpuElementWiseOp.LRELU, NpuElementWiseOp.ABS): + output_scale = npu_op.ofm.quantization.scale_f32 + ofm_scale, shift = scaling.quantise_scale(output_scale) + emit.cmd1_with_offset(cmd1.NPU_SET_OFM_SCALE, ofm_scale, shift) + else: + emit.cmd1_with_offset(cmd1.NPU_SET_OFM_SCALE, 1, 0) + return op_to_scale + + +# ------------------------------------------------------------------- +# ADDRESSING/STRIDES (helper functions) +# 
------------------------------------------------------------------- + + +def ranges_overlap(range1: NpuAddressRange, range2: NpuAddressRange) -> bool: + """Checks if the ranges overlap""" + return range1.region == range2.region and numeric_util.overlaps( + range1.address, range1.address + range1.length, range2.address, range2.address + range2.length + ) + + +def get_strides(fm: NpuFeatureMap) -> NpuShape3D: + """Calculates STRIDE_C/Y/X""" + if fm.strides is not None: + return fm.strides + elem_size = fm.data_type.size_in_bytes() + if fm.layout == NpuLayout.NHWC: + stride_c = elem_size + stride_x = fm.shape.depth * stride_c + stride_y = fm.shape.width * stride_x + else: + stride_x = 16 * elem_size + stride_c = stride_x * fm.shape.width + stride_y = elem_size * fm.shape.width * numeric_util.round_up(fm.shape.depth, 16) + return NpuShape3D(depth=stride_c, height=stride_y, width=stride_x) + + +def get_address(fm: NpuFeatureMap, strides: NpuShape3D, y: int, x: int, c: int) -> int: + """Returns address of given coordinate""" + t = 0 + BRICK = 16 + stride_c = BRICK * fm.data_type.size_in_bytes() if fm.layout == NpuLayout.NHWC else strides.depth + stride_x = BRICK * fm.data_type.size_in_bytes() if fm.layout == NpuLayout.NHCWB16 else strides.width + if x >= fm.tiles.width_0: + x -= fm.tiles.width_0 + t = 1 + if y >= fm.tiles.height_1: + y -= fm.tiles.height_1 + t += 2 + elif y >= fm.tiles.height_0: + y -= fm.tiles.height_0 + t += 2 + elem_size = fm.data_type.size_in_bytes() + return ( + fm.tiles.addresses[t] + y * strides.height + x * stride_x + (c // BRICK) * stride_c + int(c % BRICK) * elem_size + ) + + +def get_address_range( + fm: NpuFeatureMap, strides: NpuShape3D, y0: int, x0: int, c0: int, y1: int, x1: int, c1: int +) -> NpuAddressRange: + """Gets address range for (y0, x0, c0) - (y1, x1, c1)""" + addr0 = get_address(fm, strides, y0, x0, c0) + addr1 = get_address(fm, strides, y1, x1, c1) + return NpuAddressRange(region=fm.region, address=addr0, length=addr1 - addr0 + fm.data_type.size_in_bytes()) + + +def get_address_ranges(fm: NpuFeatureMap) -> List[Optional[NpuAddressRange]]: + """Returns 4 adddress ranges, one for every tile, None if the tile is not in use""" + strides = get_strides(fm) + height, width, depth = fm.shape.height, fm.shape.width, fm.shape.depth + height_0, height_1, width_0 = fm.tiles.height_0, fm.tiles.height_1, fm.tiles.width_0 + t0 = get_address_range(fm, strides, 0, 0, 0, min(height, height_0) - 1, min(width, width_0) - 1, depth - 1,) + if width > width_0: + t1 = get_address_range(fm, strides, 0, width_0, 0, min(height, height_1) - 1, width - 1, depth - 1) + else: + t1 = None + if height > height_0: + t2 = get_address_range(fm, strides, height_0, 0, 0, height - 1, min(width, width_0) - 1, depth - 1) + else: + t2 = None + if t1 is not None and t2 is not None: + t3 = get_address_range(fm, strides, height_0, width_0, 0, height - 1, width - 1, depth - 1) + else: + t3 = None + return [t0, t1, t2, t3] + + +# ------------------------------------------------------------------- +# DMA_WAIT/KERNEL_WAIT +# ------------------------------------------------------------------- + + Watermark = namedtuple("Watermark", ["npu", "dma"]) -def get_cmd_wait_dependency(arch, cmd_stream, memory_accesses, cmd_index, watermark: Watermark): - cmd = cmd_stream[cmd_index] - cmd_access = memory_accesses[cmd] - index = cmd_index - 1 +def memory_range_set(range: NpuAddressRange) -> MemoryRangeSet: + return MemoryRangeSet(range.region, range.address, range.address + range.length) + + +def 
get_dma_memory_accesses(dma_op: NpuDmaOperation) -> MemoryAccessSet: + """Returns the address that are read and written by the given DMA operation""" + res = MemoryAccessSet() + res.add(memory_range_set(dma_op.src), AccessDirection.Read) + res.add(memory_range_set(dma_op.dest), AccessDirection.Write) + return res + + +def get_op_memory_accesses(npu_op: NpuBlockOperation, arch: ArchitectureFeatures) -> MemoryAccessSet: + """Returns the addresses that are read and written by the given operation""" + assert npu_op.ifm is not None and npu_op.ofm is not None + # Read addresses + read_ranges = get_address_ranges(npu_op.ifm) + if has_ifm2(npu_op): + assert npu_op.ifm2 is not None + read_ranges.extend(get_address_ranges(npu_op.ifm2)) + read_ranges.extend(npu_op.weights) + read_ranges.extend(npu_op.biases) + if npu_op.activation is not None and npu_op.activation.op_type == NpuActivationOp.TABLE_LOOKUP: + address = arch.available_shram_banks(True) * arch.shram_bank_size + read_ranges.append(NpuAddressRange(region=BasePointerIndex.Mem2Mem, address=address, length=2048)) + # Written addresses + write_ranges = get_address_ranges(npu_op.ofm) + # Add write access to SHRAM, needed when LUTs can overwrite accumulator banks + uses_lut = npu_op.activation is not None and npu_op.activation.op_type == NpuActivationOp.TABLE_LOOKUP + written_shram_size = arch.available_shram_banks(uses_lut) * arch.shram_bank_size + write_ranges.append(NpuAddressRange(region=BasePointerIndex.Mem2Mem, address=0, length=written_shram_size)) + + res = MemoryAccessSet() + for read_range in read_ranges: + if read_range is not None: + res.add(memory_range_set(read_range), AccessDirection.Read) + for write_range in write_ranges: + if write_range is not None: + res.add(memory_range_set(write_range), AccessDirection.Write) + return res + + +def get_wait_dependency( + arch: ArchitectureFeatures, npu_op_list: List[NpuOperation], memory_accesses, op_index: int, watermark: Watermark +): + """Used to calculate whether DMA wait or kernel wait operations are needed""" + npu_op = npu_op_list[op_index] + op_access = memory_accesses[npu_op] + index = op_index - 1 # NPU dependency tracking npu_outstanding = -1 @@ -211,33 +881,32 @@ def get_cmd_wait_dependency(arch, cmd_stream, memory_accesses, cmd_index, waterm # the command that issues the wait. # NPU->NPU dependency is handled via blockdep. 
while (index >= npu_index) or (index >= dma_index): - prev_cmd = cmd_stream[index] - prev_access = memory_accesses[prev_cmd] - - # Check DMA consuming NPU output - if prev_cmd.cmdtype == CommandType.NpuStripe: - if index >= npu_index: - if (cmd.cmdtype == CommandType.DMA) and (npu_outstanding == -1) and prev_access.conflicts(cmd_access): - npu_outstanding = npu_ops - npu_ops = npu_ops + 1 # Count NPU ops in the pipeline - if npu_ops >= arch.max_outstanding_kernels: - npu_index = max(index + 1, npu_index) + prev_op = npu_op_list[index] + prev_access = memory_accesses[prev_op] # Check NPU consuming DMA output - elif prev_cmd.cmdtype == CommandType.DMA: + if is_dma_op(prev_op): if index >= dma_index: - if cmd.cmdtype == CommandType.NpuStripe: - if (dma_outstanding == -1) and prev_access.conflicts(cmd_access): + if not is_dma_op(npu_op): + if (dma_outstanding == -1) and prev_access.conflicts(op_access): dma_outstanding = dma_ops - dma_ops = dma_ops + 1 # Count DMA ops in the pipeline + dma_ops += 1 # Count DMA ops in the pipeline if dma_ops >= arch.max_outstanding_dma: dma_index = max(index + 1, dma_index) + # Check DMA consuming NPU output + else: + if index >= npu_index: + if is_dma_op(npu_op) and npu_outstanding == -1 and prev_access.conflicts(op_access): + npu_outstanding = npu_ops + npu_ops += 1 # Count NPU ops in the pipeline + if npu_ops >= arch.max_outstanding_kernels: + npu_index = max(index + 1, npu_index) - index = index - 1 + index -= 1 # Update DMA watermark if we didn't see any and the NPU pipeline is full if (dma_ops == 0) and (npu_ops >= arch.max_outstanding_kernels): - dma_index = cmd_index + dma_index = op_index # Bring the search watermark forwards as we complete for those dependencies watermark = Watermark(npu_index, dma_index) @@ -246,873 +915,380 @@ def get_cmd_wait_dependency(arch, cmd_stream, memory_accesses, cmd_index, waterm return watermark, outstanding -def has_prev_op_dependency(prev_cmd, cmd): - if prev_cmd is None: - return False - if (prev_cmd.cmdtype == cmd.cmdtype == CommandType.NpuStripe) and (prev_cmd.ps != cmd.ps): - if prev_cmd.ofm_tensor.equivalent(cmd.ifm_tensor): - return True - elif cmd.ifm2_tensor is not None: - return prev_cmd.ofm_tensor.equivalent(cmd.ifm2_tensor) - return False - +def generate_cmd_waits(emit: CommandStreamEmitter, cmd_waits: Watermark): + if cmd_waits.npu >= 0: + emit.cmd_wait(cmd0.NPU_OP_KERNEL_WAIT, 0, cmd_waits.npu) -def get_op_ofm_rect(cmd): - start = full_shape(4, cmd.ofm_box.start_coord, 0) - end = full_shape(4, cmd.ofm_box.end_coord, 1) - return Rect(start[-2], start[-3], start[-1], end[-2] - 1, end[-3] - 1, end[-1] - 1) + if cmd_waits.dma >= 0: + emit.cmd_wait(cmd0.NPU_OP_DMA_WAIT, 0, cmd_waits.dma) -def get_op_ifm_rect(cmd): - start = full_shape(4, cmd.ifm_box.start_coord, 0) - end = full_shape(4, cmd.ifm_box.end_coord, 1) - return Rect(start[-2], start[-3], start[-1], end[-2] - 1, end[-3] - 1, end[-1] - 1) +# ------------------------------------------------------------------- +# BLOCKDEP +# ------------------------------------------------------------------- -def get_op_ifmofm_block_depth(arch, cmd): - # Note: NOT equivalent to the normal ifm block depth calculation since - # it takes into account 'depthless' block operations by returning full - # depth - if cmd.ps.npu_block_type in ( - NpuBlockType.ConvolutionDepthWise, - NpuBlockType.Pooling, - NpuBlockType.ElementWise, - NpuBlockType.ReduceSum, - ): - return cmd.ofm_box.get_size_shape()[-1] +def is_dependent_on_prev_op(prev_op: NpuBlockOperation, npu_op: 
NpuBlockOperation) -> bool: + """Checks if npu_op's input is dependent on prev_op's output""" + assert npu_op.ifm is not None + assert prev_op.ofm is not None + curr_input_ranges = get_address_ranges(npu_op.ifm) - return arch.calc_ifm_block_depth(cmd.ifm_box.get_size_shape()[-1], cmd.ifm_tensor.dtype.bits) - - -def get_op_padding_lt(cmd): - if cmd.ps.npu_block_type not in ( - NpuBlockType.ConvolutionDepthWise, - NpuBlockType.Pooling, - NpuBlockType.ConvolutionMxN, - NpuBlockType.ReduceSum, - ): - return (0, 0) - - explicit_padding = list(cmd.ps.primary_op.attrs["explicit_padding"]) # (top, left, bottom, right) - - # Check if this is for horizontal ifm streaming - if not (cmd.is_first_h_stripe and cmd.is_last_h_stripe): - explicit_padding[0] = cmd.pad_top - explicit_padding[2] = cmd.pad_bottom - - return (explicit_padding[1], explicit_padding[0]) - - -def ifm_ifm2_correct_order(ifm_shape, ifm2_shape): - if ifm_shape == []: - # Scalar needs to be in IFM2 - return False - elif ifm2_shape == []: - return True - - for ifm, ifm2 in zip(ifm_shape, ifm2_shape): - if ifm != ifm2 and ifm == 1: - # Broadcasted FM needs to be in IFM2 - return False + if has_ifm2(npu_op): + assert npu_op.ifm2 is not None + curr_input_ranges.extend(get_address_ranges(npu_op.ifm2)) + for prev_range in get_address_ranges(prev_op.ofm): + if prev_range is None: + continue + for curr_range in curr_input_ranges: + if curr_range is not None and ranges_overlap(prev_range, curr_range): + return True + return False - return True +def shape3d_to_rect(shape: NpuShape3D) -> Rect: + return Rect(0, 0, 0, shape.width - 1, shape.height - 1, shape.depth - 1) -def generate_register_command_stream(nng, sg, arch, verbose=False): - emit = CommandStreamEmitter() - if arch.feature_map_storage_mem_area == arch.fast_storage_mem_area: - base_ptr_idx_map = { - MemType.Permanent_NPU: BasePointerIndex.WeightTensor, - MemType.Permanent_CPU: BasePointerIndex.WeightTensor, - MemType.Scratch: BasePointerIndex.ScratchTensor, - MemType.Scratch_fast: BasePointerIndex.ScratchTensor, - } +def get_ifm_ofm_block_depth(arch: ArchitectureFeatures, npu_op: NpuBlockOperation) -> int: + # Note: NOT equivalent to the normal ifm block depth calculation since + # it takes into account 'depthless' block operations by returning full + # depth + if npu_op.op_type == NpuOperationType.Conv2D: + res = arch.calc_ifm_block_depth(npu_op.ifm.shape.depth, npu_op.ifm.data_type.size_in_bits()) + return res + return npu_op.ofm.shape.depth + + +def calc_blockdep( + arch: ArchitectureFeatures, + prev_op: Optional[NpuBlockOperation], + prev_block_config: Optional[NpuShape3D], + npu_op: NpuBlockOperation, + block_config: NpuShape3D, +) -> int: + """Calculates the value of the BLOCKDEP register""" + if prev_op is None: + return 0 + if not is_dependent_on_prev_op(prev_op, npu_op): + return ArchitectureFeatures.MAX_BLOCKDEP + if prev_op.ofm.shape != npu_op.ifm.shape: + return 0 + prev_ifm_block_depth = get_ifm_ofm_block_depth(arch, prev_op) + prev_ofm_block = Block(prev_block_config.width, prev_block_config.height, prev_block_config.depth) + prev_ofm_rect = shape3d_to_rect(prev_op.ofm.shape) + prev_ifm_rect = shape3d_to_rect(prev_op.ifm.shape) + cur_ifm_block_depth = get_ifm_ofm_block_depth(arch, npu_op) + cur_ofm_block = Block(block_config.width, block_config.height, block_config.depth) + cur_ofm_rect = shape3d_to_rect(npu_op.ofm.shape) + cur_ifm_rect = shape3d_to_rect(npu_op.ifm.shape) + cur_padLT = (0, 0) if npu_op.padding is None else (npu_op.padding.left, npu_op.padding.top) + 
blockdep = arch.calc_block_dep( + prev_ifm_rect, + prev_ofm_rect, + prev_ifm_block_depth, + prev_ofm_block, + to_kernel(prev_op.kernel), + cur_ifm_rect, + cur_ofm_rect, + cur_ifm_block_depth, + cur_ofm_block, + to_kernel(npu_op.kernel), + cur_padLT, + ) + return blockdep + + +# ------------------------------------------------------------------- +# PRINT +# ------------------------------------------------------------------- + + +def print_feature_map(fm: NpuFeatureMap, name: str): + if fm is not None: + q = ( + "no quantization" + if fm.quantization is None + else f"scale: {fm.quantization.scale_f32}, zero: {fm.quantization.zero_point}" + ) + h, w, c = fm.shape + sz = h * w * c * fm.data_type.size_in_bytes() + print(f" {name}: h={h},w={w},c={c}, region={fm.region}, {fm.layout}, {fm.data_type}, size={sz}, {q}") + strides = get_strides(fm) + stride_str = f"Stride y/x/c: {strides.height}/{strides.width}/{strides.depth}" + t = fm.tiles + addresses = [hex(addr) for addr in t.addresses] + print(f" {stride_str}, tiles: w0={t.width_0}, h0={t.height_0}, h1={t.height_1}, base={addresses}") + + +def print_operation(npu_op: NpuOperation, index: int = 0): + pass_info = f", {npu_op.cmd}" if hasattr(npu_op, "cmd") else "" + if is_dma_op(npu_op): + print(f"{index} DMA_START src={npu_op.src}, dest={npu_op.dest}{pass_info}") + return + k = None if npu_op.kernel is None else to_kernel(npu_op.kernel) + if npu_op.op_type in (NpuOperationType.Pooling, NpuOperationType.ElementWise): + print(f"{index} {npu_op.sub_op_type.name} {npu_op.op_type.name}:{pass_info}") else: - base_ptr_idx_map = { - MemType.Permanent_NPU: BasePointerIndex.WeightTensor, - MemType.Permanent_CPU: BasePointerIndex.WeightTensor, - MemType.Scratch: BasePointerIndex.ScratchTensor, - MemType.Scratch_fast: BasePointerIndex.ScratchFastTensor, - } - - # Maps an AccumulatorType enum to the corresponding acc_format value - acc_format_map = { - SHRAMElements.Acc16: acc_format.FP_S5_10.value, - SHRAMElements.Acc32: acc_format.INT_32BIT.value, - SHRAMElements.Acc40: acc_format.INT_40BIT.value, - } - - # Maps an elementwise op type to an elementwise_mode enum value used by NPU_OP_ELEMENTWISE - elementwise_mode_map = { - Op.Mul: elementwise_mode.MUL.value, - Op.Add: elementwise_mode.ADD.value, - Op.Sub: elementwise_mode.SUB.value, - Op.Minimum: elementwise_mode.MIN.value, - Op.Maximum: elementwise_mode.MAX.value, - Op.LeakyRelu: elementwise_mode.LRELU.value, - Op.Abs: elementwise_mode.ABS.value, - Op.CLZ: elementwise_mode.CLZ.value, - Op.SHR: elementwise_mode.SHR.value, - Op.SHL: elementwise_mode.SHL.value, - } - - cmd_stream = [] - memory_accesses = {} - for cmd in sg.high_level_command_stream: - if cmd.cmdtype == CommandType.NpuStripe and cmd.ps.npu_block_type == NpuBlockType.Default: - print("Warning: Skipping register command stream generation for", cmd.ps) + if ( + npu_op.op_type == NpuOperationType.Conv2D + and k.elements_wh() * k.stride.x * k.stride.y * k.dilation.x * k.dilation.y == 1 + ): + fc = "FullyConnected " else: - cmd_stream.append(cmd) - memory_accesses[cmd] = cmd.get_memory_accesses() - - def emit_cmd_waits(cmd_waits): - if cmd_waits.npu >= 0: - emit.cmd_wait(cmd0.NPU_OP_KERNEL_WAIT, 0, cmd_waits.npu) - - if cmd_waits.dma >= 0: - emit.cmd_wait(cmd0.NPU_OP_DMA_WAIT, 0, cmd_waits.dma) + fc = "" + print(f"{index} {fc}{npu_op.op_type.name}{pass_info}") + print_feature_map(npu_op.ifm, "IFM") + if npu_op.ifm2_scalar is not None: + quant_val = quantise(npu_op.ifm2_scalar, npu_op.ifm2.quantization) + print(f" IFM2: Scalar={npu_op.ifm2_scalar} 
(quantized: {quant_val}), {npu_op.ifm2.quantization}") + else: + print_feature_map(npu_op.ifm2, "IFM2") + print_feature_map(npu_op.ofm, "OFM") + if k is not None and npu_op.op_type != NpuOperationType.ElementWise: + print(f" Kernel: {k}") + if npu_op.padding is not None: + print(f" {npu_op.padding}") + for weights in npu_op.weights: + print(f" Weights: {weights}") + for bias in npu_op.biases: + print(f" Scales: {bias}") + if npu_op.activation is not None: + act = npu_op.activation + if act.op_type != NpuActivationOp.NONE_OR_RELU or act.min is not None or act.max is not None: + lut = f", lut index={act.lookup_table_index}" if act.op_type == NpuActivationOp.TABLE_LOOKUP else "" + print(f" Activation: {act.op_type.name}, min={act.min}, max={act.max}{lut}") + if npu_op.op_type == NpuOperationType.Conv2D: + print(f" {npu_op.block_traversal}") + bh, bw, bc = npu_op.block_config + rescale = f", rescale={npu_op.rescale}" if hasattr(npu_op, "rescale") else "" + print(f" Block config: h={bh},w={bw},c={bc}, {npu_op.ifm_upscale}, {npu_op.rounding_mode}{rescale}") + + +def print_operations(npu_op_list: List[NpuOperation]): + for index, npu_op in enumerate(npu_op_list): + print_operation(npu_op, index) + + +# ------------------------------------------------------------------- +# OPERATIONS +# ------------------------------------------------------------------- + + +def generate_operation_code(emit: CommandStreamEmitter, npu_op: NpuOperation): + """Generates NPU_OP_* command""" + op_type = npu_op.op_type + if op_type == NpuOperationType.Dma: + emit.cmd_do_operation(cmd0.NPU_OP_DMA_START, npu_op.channel * 16 + npu_op.mode) + elif op_type == NpuOperationType.Conv2D: + emit.cmd_do_operation(cmd0.NPU_OP_CONV) + elif op_type == NpuOperationType.ConvDepthWise: + emit.cmd_do_operation(cmd0.NPU_OP_DEPTHWISE) + elif op_type == NpuOperationType.Pooling: + emit.cmd_do_operation(cmd0.NPU_OP_POOL, param=pooling_op_map[npu_op.sub_op_type]) + elif op_type == NpuOperationType.ElementWise: + emit.cmd_do_operation(cmd0.NPU_OP_ELEMENTWISE, param=elementwise_op_map[npu_op.sub_op_type]) + else: + assert 0, "Unsupported operation" + + +def generate_conv2d_op( + emit: CommandStreamEmitter, npu_op: NpuConv2DOperation, arch: ArchitectureFeatures +) -> NpuShape3D: + """Generates register commands for Conv2D operations""" + generate_common(emit, npu_op, npu_op.block_traversal, arch) + ifm_resampling_mode = resampling_mode_map[npu_op.ifm_upscale] + shared_buffer = shared_buffer_allocation_for_npu_op(arch, npu_op, NpuBlockType.ConvolutionMxN, ifm_resampling_mode) + block_config = generate_block_config(emit, npu_op, arch, shared_buffer) + generate_shram_registers_non_elementwise(emit, shared_buffer) + return block_config + + +def generate_conv_depthwise_op(emit: CommandStreamEmitter, npu_op: NpuPoolingOperation, arch: ArchitectureFeatures): + """Generates register commands for depthwise convolution operations""" + generate_common(emit, npu_op, NpuBlockTraversal.DEPTH_FIRST, arch) + ifm_resampling_mode = resampling_mode_map[npu_op.ifm_upscale] + shared_buffer = shared_buffer_allocation_for_npu_op( + arch, npu_op, NpuBlockType.ConvolutionDepthWise, ifm_resampling_mode + ) + block_config = generate_block_config(emit, npu_op, arch, shared_buffer) + generate_shram_registers_non_elementwise(emit, shared_buffer) + return block_config + + +def generate_pooling_op(emit: CommandStreamEmitter, npu_op: NpuPoolingOperation, arch: ArchitectureFeatures): + """Generates register commands for pooling operations""" + use_global_scale = ( + 
npu_op.sub_op_type in (NpuPoolingOp.AVERAGE, NpuPoolingOp.REDUCE_SUM) and sum(npu_op.padding) == 0 + ) + generate_common(emit, npu_op, NpuBlockTraversal.DEPTH_FIRST, arch, use_global_scale=use_global_scale) + # Pooling op specific + if use_global_scale: + generate_ofm_scaling_for_pooling(emit, npu_op) + ifm_resampling_mode = resampling_mode_map[npu_op.ifm_upscale] + npu_block_type = NpuBlockType.ReduceSum if npu_op.sub_op_type == NpuPoolingOp.REDUCE_SUM else NpuBlockType.Pooling + shared_buffer = shared_buffer_allocation_for_npu_op(arch, npu_op, npu_block_type, ifm_resampling_mode) + block_config = generate_block_config(emit, npu_op, arch, shared_buffer) + generate_shram_registers_non_elementwise(emit, shared_buffer) + return block_config + + +def generate_elementwise_op(emit: CommandStreamEmitter, npu_op: NpuElementWiseOperation, arch: ArchitectureFeatures): + """Generates register commands for elementwise operations""" + use_global_scale = npu_op.sub_op_type in ( + NpuElementWiseOp.ADD, + NpuElementWiseOp.SUB, + NpuElementWiseOp.MUL, + NpuElementWiseOp.LRELU, + NpuElementWiseOp.ABS, + ) + op_to_scale = generate_scaling_for_elementwise(emit, npu_op) + generate_common( + emit, npu_op, NpuBlockTraversal.DEPTH_FIRST, arch, use_global_scale=use_global_scale, op_to_scale=op_to_scale + ) + # Elementwise op specific + if npu_op.sub_op_type not in unary_elementwise_ops: + # Binary operation; generate IFM2 registers + assert npu_op.ifm2 is not None + has_scalar = npu_op.ifm2_scalar is not None + generate_ifm2(emit, npu_op.ifm2, has_scalar) + generate_ifm_precision(emit, npu_op.ifm2, 0, cmd0.NPU_SET_IFM2_PRECISION) + generate_ifm2_broadcast(emit, npu_op) + if has_scalar: + quantized_scalar = quantise(npu_op.ifm2_scalar, npu_op.ifm2.quantization) + assert npu_op.ifm2.data_type.min_value() <= quantized_scalar <= npu_op.ifm2.data_type.max_value() + emit.cmd0_with_param(cmd0.NPU_SET_IFM2_SCALAR, quantized_scalar) + ifm_resampling_mode = resampling_mode_map[npu_op.ifm_upscale] + shared_buffer = shared_buffer_allocation_for_npu_op(arch, npu_op, NpuBlockType.ElementWise, ifm_resampling_mode) + block_config = generate_block_config(emit, npu_op, arch, shared_buffer) + generate_shram_registers_elementwise(emit, npu_op, arch, shared_buffer) + return block_config + + +def generate_dma_op(emit: CommandStreamEmitter, dma_op: NpuDmaOperation): + """Generates register commands for DMA operations""" + emit.cmd0_with_param(cmd0.NPU_SET_DMA0_SRC_REGION, dma_op.src.region) + emit.cmd1_with_offset(cmd1.NPU_SET_DMA0_SRC, dma_op.src.address) + emit.cmd0_with_param(cmd0.NPU_SET_DMA0_DST_REGION, dma_op.dest.region) + + emit.cmd1_with_offset(cmd1.NPU_SET_DMA0_DST, dma_op.dest.address) + emit.cmd1_with_offset(cmd1.NPU_SET_DMA0_LEN, dma_op.src.length) + + +def generate_registers_for_op( + emit: CommandStreamEmitter, npu_op: NpuOperation, arch: ArchitectureFeatures +) -> Optional[NpuShape3D]: + """ + Generates register commands for the given operation, but not the final NPU_OP_... command. 
+ Returns the selected block config + """ + op_type = npu_op.op_type + block_config = None + if op_type == NpuOperationType.Conv2D: + block_config = generate_conv2d_op(emit, npu_op, arch) + elif op_type == NpuOperationType.ConvDepthWise: + block_config = generate_conv_depthwise_op(emit, npu_op, arch) + elif op_type == NpuOperationType.Pooling: + block_config = generate_pooling_op(emit, npu_op, arch) + elif op_type == NpuOperationType.ElementWise: + block_config = generate_elementwise_op(emit, npu_op, arch) + elif op_type == NpuOperationType.Dma: + generate_dma_op(emit, npu_op) + else: + assert 0, "Unsupported operation" + return block_config - # Initialise operator dependency state - prev_ifm_rect = cur_ifm_rect = None - prev_ifm_block_depth = cur_ifm_block_depth = None - prev_ofm_rect = cur_ofm_rect = None - prev_ofm_block = cur_ofm_block = None - prev_kernel = cur_kernel = None - prev_cmd = None +def generate_command_stream( + emit: CommandStreamEmitter, npu_op_list: List[NpuOperation], arch: ArchitectureFeatures, add_to_debug_db=None +): + """Generates register commands for the given list of NPU operations""" + # Calculate memory accesses for every operation + memory_accesses = {} + for npu_op in npu_op_list: + if is_dma_op(npu_op): + memory_accesses[npu_op] = get_dma_memory_accesses(npu_op) + else: + memory_accesses[npu_op] = get_op_memory_accesses(npu_op, arch) if arch.is_yoda_system: emit.cmd0_with_param(cmd0.NPU_SET_PARALLEL_MODE, arch.ncores - 1) - dep_watermark = Watermark(0, 0) - - stream_id = DebugDatabase.add_stream(sg) - DebugDatabase.set_stream_offset(sg, 0) # Default to zero, can only set during file writing - - for cmd_index, cmd in enumerate(cmd_stream): - dep_watermark, cmd_waits = get_cmd_wait_dependency(arch, cmd_stream, memory_accesses, cmd_index, dep_watermark) - - if cmd.cmdtype == CommandType.DMA: - start_coord = cmd.box.start_coord - - src_addr = cmd.in_tensor.address_for_coordinate(start_coord) - dst_addr = cmd.out_tensor.address_for_coordinate(start_coord) - - if cmd.in_tensor.compressed_values is not None: - if cmd.out_tensor.purpose == TensorPurpose.FSBias: - sz = cmd.in_tensor.storage_size() - else: - stream_index = cmd.in_tensor.compressed_stream_index_from_coord(start_coord) - sz = cmd.in_tensor.size_of_compressed_stream(stream_index) - else: - sz = cmd.in_tensor.address_for_coordinate(cmd.box.end_coord, is_top_box=True) - src_addr - - emit.cmd0_with_param(cmd0.NPU_SET_DMA0_SRC_REGION, base_ptr_idx_map[cmd.in_tensor.mem_type]) - emit.cmd1_with_offset(cmd1.NPU_SET_DMA0_SRC, src_addr) - if cmd.out_tensor.purpose == TensorPurpose.LUT: - emit.cmd0_with_param(cmd0.NPU_SET_DMA0_DST_REGION, BasePointerIndex.Mem2Mem) - else: - emit.cmd0_with_param(cmd0.NPU_SET_DMA0_DST_REGION, base_ptr_idx_map[cmd.out_tensor.mem_type]) - - emit.cmd1_with_offset(cmd1.NPU_SET_DMA0_DST, dst_addr) - emit.cmd1_with_offset(cmd1.NPU_SET_DMA0_LEN, sz) - dma_channel = 0 - mode = 0 # From external to external - - emit_cmd_waits(cmd_waits) - emit.cmd_do_operation(cmd0.NPU_OP_DMA_START, dma_channel * 16 + mode) - - elif cmd.cmdtype == CommandType.NpuStripe: - - ps = cmd.ps - primary_op = ps.primary_op - npu_block_type = ps.npu_block_type - # Specifies if global scale from the NPU_SET_OFM_SCALE register should be used instead of per-channel scale - use_global_scale = False - # Specifies type of rounding to be used. 
- rounding_mode = ( - rounding.NATURAL if primary_op.attrs.get("rounding_mode", "") == b"NATURAL" else rounding.TFL - ) - if primary_op.type == Op.ResizeBilinear: - rounding_mode = rounding.TRUNCATE - fmf = primary_op.memory_function - faf = primary_op.activation - fused_quantize = any(op.type == Op.Quantize for op in ps.ops) - # Force output scale, used in operations with fused LUT - # Note: with current LUT support, forced_ofm_quantization is always equal to cmd.ofm_tensor.quantization - # except when primary_op is AddAct + 0 (no-op) + LUT - forced_ofm_quantization = primary_op.forced_output_quantization - ofm_quant = cmd.ofm_tensor.quantization - if forced_ofm_quantization is not None: - ofm_quant = forced_ofm_quantization - - # Specifies which operand to apply scaling to in bitexact elementwise ADD/SUB - op_to_scale = 0 - - # Update state history - prev_ifm_rect = cur_ifm_rect - prev_ifm_block_depth = cur_ifm_block_depth - prev_ofm_rect = cur_ofm_rect - prev_ofm_block = cur_ofm_block - prev_kernel = cur_kernel - cur_kernel = ps.primary_op.kernel if ps.primary_op else None - - block_config = ps.block_config - emit.cmd0_with_param(cmd0.NPU_SET_OFM_BLK_HEIGHT_M1, block_config[0] - 1) - emit.cmd0_with_param(cmd0.NPU_SET_OFM_BLK_WIDTH_M1, block_config[1] - 1) - emit.cmd0_with_param(cmd0.NPU_SET_OFM_BLK_DEPTH_M1, block_config[3] - 1) - - shared_buffer = ps.shared_buffer - - if npu_block_type == NpuBlockType.ElementWise: - ifm2_broadcast = 0 - - if cmd.ifm2_tensor and not ifm_ifm2_correct_order(cmd.ifm_tensor.shape, cmd.ifm2_tensor.shape): - # The scalar has to be the ifm2 tensor so switch the ifms - cmd.ifm_tensor, cmd.ifm2_tensor = cmd.ifm2_tensor, cmd.ifm_tensor - cmd.ifm_box, cmd.ifm2_box = cmd.ifm2_box, cmd.ifm_box - - # Set ReverseOperandOrder bit to IFM2_BROADCAST - ifm2_broadcast |= IFM2Broadcast.ReverseOperandOrder - - # Calculate scales needed for arithmetic elementwise operators - if primary_op.type in set((Op.Add, Op.Mul, Op.Sub,)): - input_scale = cmd.ifm_tensor.quantization.scale_f32 if cmd.ifm_tensor.quantization else None - input2_scale = cmd.ifm2_tensor.quantization.scale_f32 if cmd.ifm2_tensor.quantization else None - output_scale = ofm_quant.scale_f32 if ofm_quant else None - use_global_scale = True - - if output_scale is not None and faf in (Op.Sigmoid, Op.Tanh): - output_scale = 1 / 0x3000 - - if primary_op.type == Op.Mul: - if None in (input_scale, input2_scale, output_scale): - ofm_scale = 1 - shift = 0 - else: - ofm_scale, shift = scaling.elementwise_mul_scale(input_scale, input2_scale, output_scale) - emit.cmd1_with_offset(cmd1.NPU_SET_OFM_SCALE, ofm_scale, shift) - else: # AddAct/SubAct - # Force output scale same as the input scale for - # resizebilinear 1x1 that is converted to add - if "resizebilinear" in primary_op.attrs: - output_scale = input2_scale - - if None in (input_scale, input2_scale, output_scale): - opa_scale = opb_scale = ofm_scale = 1 - opa_shift = shift = 0 - ofm_scale, shift = primary_op.attrs.get("rescale", [1, 0]) - elif input_scale == input2_scale: - opa_scale, opb_scale, ofm_scale, shift = scaling.simplified_elementwise_add_sub_scale( - input_scale, input2_scale, output_scale - ) - opa_shift = 0 # Unused for this case - else: - # Use advanced implementation only when input scales differ - bitdepth = cmd.ifm_tensor.dtype.bits - ( - opa_scale, - opa_shift, - ofm_scale, - shift, - op_to_scale, - ) = scaling.advanced_elementwise_add_sub_scale( - input_scale, input2_scale, output_scale, bitdepth - ) - opb_scale = 0 # Unused for this case - if 
ifm2_broadcast & IFM2Broadcast.ReverseOperandOrder: - # If the operand order is reversed we also have to swap which operand is scaled - if op_to_scale == scaling.OperandToScale.OPa: - op_to_scale = scaling.OperandToScale.OPb - else: - op_to_scale = scaling.OperandToScale.OPa - - emit.cmd1_with_offset(cmd1.NPU_SET_OPA_SCALE, opa_scale, opa_shift) - emit.cmd1_with_offset(cmd1.NPU_SET_OPB_SCALE, opb_scale) - emit.cmd1_with_offset(cmd1.NPU_SET_OFM_SCALE, ofm_scale, shift) - - elif primary_op.type in set((Op.LeakyRelu, Op.Abs,)): - output_scale = ofm_quant.scale_f32 - use_global_scale = True - - if primary_op.type == Op.LeakyRelu: - output_scale = primary_op.attrs["alpha"] - - ofm_scale, shift = scaling.quantise_scale(output_scale) - emit.cmd1_with_offset(cmd1.NPU_SET_OFM_SCALE, ofm_scale, shift) - else: - emit.cmd1_with_offset(cmd1.NPU_SET_OFM_SCALE, 1, 0) - - # For elementwise set the required SHRAM to be equal to the total size of available SHRAM - uses_lut = primary_op.activation_lut is not None - shram_required = arch.available_shram_banks(uses_lut) - emit.cmd0_with_param(cmd0.NPU_SET_IFM_IB_END, shram_required) - - # Acc buffers not needed so set AB_START to size of SHRAM - emit.cmd0_with_param(cmd0.NPU_SET_AB_START, shram_required) - - # Is not a unary operator - if cmd.ifm2_tensor is not None: - if cmd.ifm2_tensor.shape == []: - # IFM2 is a constant, set UseIFM2Scalar bit to IFM2_BROADCAST - ifm2_broadcast |= IFM2Broadcast.UseIFM2Scalar - else: - ifm_box_shape = cmd.ifm_box.get_size_shape() - ifm2_box_shape = cmd.ifm2_box.get_size_shape() - - if len(cmd.ifm_tensor.shape) > 1 and ifm_box_shape[1] != ifm2_box_shape[1]: - # Broadcast in 'H' dimension - assert cmd.ifm2_tensor.shape[1] == 1 - ifm2_broadcast |= IFM2Broadcast.BroadcastHdim - - if len(cmd.ifm_tensor.shape) > 2 and ifm_box_shape[2] != ifm2_box_shape[2]: - # Broadcast in 'W' dimension - assert cmd.ifm2_tensor.shape[2] == 1 - ifm2_broadcast |= IFM2Broadcast.BroadcastWdim - - if len(cmd.ifm_tensor.shape) > 3 and ifm_box_shape[3] != ifm2_box_shape[3]: - # Broadcast in 'C' dimension - assert cmd.ifm2_tensor.shape[3] == 1 - ifm2_broadcast |= IFM2Broadcast.BroadcastCdim - - # Set IFM2_IB_START to the latter half of the IB space - ifm_ib_start = shared_buffer.bank_locations[SharedBufferArea.IFM] - emit.cmd0_with_param( - cmd0.NPU_SET_IFM2_IB_START, - (shram_required - ifm_ib_start) // shared_buffer.ifm_count + ifm_ib_start, - ) - - emit.cmd0_with_param(cmd0.NPU_SET_IFM2_BROADCAST, ifm2_broadcast) - - else: - emit.cmd0_with_param( - cmd0.NPU_SET_IFM_IB_END, - shared_buffer.bank_locations[SharedBufferArea.IFM] - + shared_buffer.banks_required[SharedBufferArea.IFM], - ) - emit.cmd0_with_param(cmd0.NPU_SET_AB_START, shared_buffer.bank_locations[SharedBufferArea.Accumulators]) - - emit.cmd0_with_param(cmd0.NPU_SET_ACC_FORMAT, acc_format_map[shared_buffer.use_accumulator_element]) - - if primary_op.type == Op.ResizeBilinear: - # perform nearest neighbor upscale - emit.cmd0_with_param(cmd0.NPU_SET_IFM_UPSCALE, resampling_mode.NEAREST) - elif primary_op.type == Op.Conv2DBackpropInputSwitchedBias: - # perform insert zero upscale - emit.cmd0_with_param(cmd0.NPU_SET_IFM_UPSCALE, resampling_mode.TRANSPOSE) - else: - emit.cmd0_with_param(cmd0.NPU_SET_IFM_UPSCALE, resampling_mode.NONE) - - if npu_block_type in set( - ( - NpuBlockType.ConvolutionMxN, - NpuBlockType.ConvolutionDepthWise, - NpuBlockType.Pooling, - NpuBlockType.ReduceSum, - ) - ): - # Set up padding - explicit_padding = list(primary_op.attrs["explicit_padding"]) # (top, left, bottom, 
right)
-
- # Check if this is for horizontal ifm streaming
- if not (cmd.is_first_h_stripe and cmd.is_last_h_stripe):
- explicit_padding[0] = cmd.pad_top
- explicit_padding[2] = cmd.pad_bottom
-
- # Indexing from end since a 1x1 Avgpool might have been added with non 4-dimensional input/output,
- # because an activation function needed to be fused.
- if cmd.ifm_box.start_coord[-2] > 0:
- explicit_padding[1] = 0
- if cmd.ifm_box.end_coord[-2] < cmd.ifm_tensor.shape[-2]:
- explicit_padding[3] = 0
- emit.cmd0_with_param(cmd0.NPU_SET_IFM_PAD_TOP, explicit_padding[0])
- emit.cmd0_with_param(cmd0.NPU_SET_IFM_PAD_LEFT, explicit_padding[1])
- emit.cmd0_with_param(cmd0.NPU_SET_IFM_PAD_BOTTOM, explicit_padding[2])
- emit.cmd0_with_param(cmd0.NPU_SET_IFM_PAD_RIGHT, explicit_padding[3])
-
- # set kernel x stride low bit
- stride = primary_op.attrs["strides"][2] - 1 & 1
- # set kernel y stride low bit
- stride |= (primary_op.attrs["strides"][1] - 1 & 1) << 1
- # set kernel x stride extension bits
- stride |= (primary_op.attrs["strides"][2] - 1 >> 1) << 6
- # set kernel y stride extension bits
- stride |= (primary_op.attrs["strides"][1] - 1 >> 1) << 9
-
- if npu_block_type in set((NpuBlockType.Pooling, NpuBlockType.ReduceSum)):
- k_height, k_width = primary_op.attrs["ksize"][1:3]
- emit.cmd0_with_param(cmd0.NPU_SET_KERNEL_HEIGHT_M1, k_height - 1)
- emit.cmd0_with_param(cmd0.NPU_SET_KERNEL_WIDTH_M1, k_width - 1)
-
- valid_padding = sum(explicit_padding) == 0
-
- if primary_op.type in set((Op.AvgPool, Op.ResizeBilinear, Op.ReduceSum)) and valid_padding:
- # For valid padding vela has to output scaling values
- if faf == Op.Sigmoid or faf == Op.Tanh:
- rescale = 0x3000 * cmd.ifm_tensor.quantization.scale_f32
- if cmd.ifm_tensor.dtype == DataType.int16:
- # Calculate scale and shift for the output scale of 1/(3*4096)
- shift = 0
- max_rescale = np.iinfo(np.int16).max / 2
- while rescale <= max_rescale and shift <= 30:
- shift += 1
- rescale *= 2
- scale = int(rescale)
- else:
- rescale_bits = len(bin(round_up_to_int(rescale))) - 2 + 1
- scale, shift = scaling.quantise_pooling_scale(k_height * k_width, rescale_bits)
- scale = int(round_away_zero(scale * rescale))
- elif fused_quantize:
- # Quantize op requires different scaling
- ifm_scale_f64 = np.double(cmd.ifm_tensor.quantization.scale_f32)
- ofm_scale_f64 = np.double(ofm_quant.scale_f32)
- scale, shift = scaling.quantise_scale(ifm_scale_f64 / ofm_scale_f64)
- elif primary_op.type == Op.ResizeBilinear and "rescale" in primary_op.attrs:
- rescale = primary_op.attrs["rescale"]
- rescale_bits = len(bin(round_up_to_int(rescale))) - 2 + 1
- scale, shift = scaling.quantise_pooling_scale(k_height * k_width, rescale_bits)
- scale = int(round_away_zero(scale * rescale))
- else:
- # In case avg pool is fused with concat or another memory operation, rescaling might be needed.
- # k_height == k_width == 1 is always true in this case
- # Normally the scale is maximised, to get maximum precision, which means that
- # if rescale != 1, scale needs to consider the number of bits needed for rescaling
- if None not in (ofm_quant.scale_f32, cmd.ifm_tensor.quantization.scale_f32,):
- rescale = cmd.ifm_tensor.quantization.scale_f32 / ofm_quant.scale_f32
- rescale_bits = 0
- if k_height == k_width == 1:
- if fmf == Op.ConcatSliceWrite:
- rounding_mode = rounding.NATURAL
- if rescale > 1:
- rescale_bits = len(bin(round_up_to_int(rescale))) - 2 + 1
- elif rescale < 1:
- rescale_bits = -(len(bin(round_up_to_int(1 / rescale))) - 2 - 1)
- scale, shift = scaling.quantise_pooling_scale(k_height * k_width, rescale_bits)
- scale = int(round_away_zero(scale * rescale))
- else:
- scale = 1
- shift = 0
-
- emit.cmd1_with_offset(cmd1.NPU_SET_OFM_SCALE, scale, shift)
- # Valid-padded average pool should use the global scale from
- # NPU_SET_OFM_SCALE register, which is set above.
- use_global_scale = True
-
- else: # Convolution
- assert cmd.weight_tensor.block_traversal != TensorBlockTraversal.Default
- # Reduced precision quantization and natural rounding used for int16
- if cmd.ifm_tensor.dtype == DataType.int16:
- rounding_mode = rounding.NATURAL
- stride |= (cur_kernel.dilation.y - 1) << 4
- stride |= (cur_kernel.dilation.x - 1) << 3
- emit.cmd0_with_param(
- cmd0.NPU_SET_KERNEL_HEIGHT_M1, cur_kernel.dilation.y * (cmd.weight_tensor.shape[0] - 1)
- )
- emit.cmd0_with_param(
- cmd0.NPU_SET_KERNEL_WIDTH_M1, cur_kernel.dilation.x * (cmd.weight_tensor.shape[1] - 1)
- )
- if cmd.weight_tensor.block_traversal == TensorBlockTraversal.PartKernelFirst:
- # Part-kernel-first weight ordering
- assert npu_block_type == NpuBlockType.ConvolutionMxN
- stride |= 1 << 2
-
- emit.cmd0_with_param(cmd0.NPU_SET_KERNEL_STRIDE, stride)
-
- elif npu_block_type in set((NpuBlockType.VectorProduct,)):
- # Vector product is implemented using a 1x1 convolution so we need
- # to set up the appropriate padding and kernel info
- emit.cmd0_with_param(cmd0.NPU_SET_IFM_PAD_TOP, 0)
- emit.cmd0_with_param(cmd0.NPU_SET_IFM_PAD_LEFT, 0)
- emit.cmd0_with_param(cmd0.NPU_SET_IFM_PAD_BOTTOM, 0)
- emit.cmd0_with_param(cmd0.NPU_SET_IFM_PAD_RIGHT, 0)
-
- # kernel stride reg = 0 means stride(1,1) + depth first weight
- # order + dilation(0,0) + kernel_split_size=8
- emit.cmd0_with_param(cmd0.NPU_SET_KERNEL_STRIDE, 0)
-
- emit.cmd0_with_param(cmd0.NPU_SET_KERNEL_HEIGHT_M1, 0)
- emit.cmd0_with_param(cmd0.NPU_SET_KERNEL_WIDTH_M1, 0)
-
- if npu_block_type in set(
- (NpuBlockType.ConvolutionMxN, NpuBlockType.ConvolutionDepthWise, NpuBlockType.VectorProduct)
- ):
- # Emit Weight base address commands, only maps the area required for
- # this command's weights from the larger tensor.
- stream_index = cmd.weight_tensor.compressed_stream_index_from_coord(cmd.weight_box.start_coord) - weight_substream_offsets = cmd.weight_tensor.compressed_values_substream_offsets[stream_index] - substreams = len(weight_substream_offsets) - 1 # Offset list must terminate with full stream length - - # Extract weight substream offsets and calculate their lengths - assert len(weight_substream_offsets) > 1 and (weight_substream_offsets[0] == 0) - weight_addr = cmd.weight_tensor.address_for_coordinate(cmd.weight_box.start_coord) - - # Set weights sources for active and present cores - for core, param in enumerate( - [ - (cmd1.NPU_SET_WEIGHT_BASE, cmd1.NPU_SET_WEIGHT_LENGTH), - (cmd1.NPU_SET_WEIGHT1_BASE, cmd1.NPU_SET_WEIGHT1_LENGTH), - ] - ): - if core < substreams: - emit.cmd1_with_offset(param[0], weight_addr + weight_substream_offsets[core]) - emit.cmd1_with_offset( - param[1], weight_substream_offsets[core + 1] - weight_substream_offsets[core] - ) - elif core < arch.ncores: - emit.cmd1_with_offset(param[0], weight_addr) - emit.cmd1_with_offset(param[1], 0) - - weight_region = base_ptr_idx_map[cmd.weight_tensor.mem_type] - emit.cmd0_with_param(cmd0.NPU_SET_WEIGHT_REGION, weight_region) - - # Emit Scale & Bias base address commands, with length matching the amount required by - # the weight tensors. - if cmd.scale_tensor is not None: - scale_substream_offsets = cmd.scale_tensor.compressed_values_substream_offsets[stream_index] - substreams = len(scale_substream_offsets) - 1 # Offset list must terminate with full stream length - - # Extract scale substream offsets and calculate their lengths - assert len(scale_substream_offsets) > 1 and (scale_substream_offsets[0] == 0) - scale_addr = cmd.scale_tensor.address_for_coordinate(cmd.weight_box.start_coord[-1:]) - - # Set scale sources for active and present cores - for core, param in enumerate( - [ - (cmd1.NPU_SET_SCALE_BASE, cmd1.NPU_SET_SCALE_LENGTH), - (cmd1.NPU_SET_SCALE1_BASE, cmd1.NPU_SET_SCALE1_LENGTH), - ] - ): - if core < substreams: - emit.cmd1_with_offset(param[0], scale_addr + scale_substream_offsets[core]) - emit.cmd1_with_offset( - param[1], scale_substream_offsets[core + 1] - scale_substream_offsets[core] - ) - elif core < arch.ncores: - emit.cmd1_with_offset(param[0], scale_addr) - emit.cmd1_with_offset(param[1], 0) - - # Emit base address for NPU to access scale & bias data - scale_region = base_ptr_idx_map[cmd.scale_tensor.mem_type] - emit.cmd0_with_param(cmd0.NPU_SET_SCALE_REGION, scale_region) - - ofm_quant_qmin = ofm_quant.quant_min if ofm_quant else np.iinfo(np.int16).min - ofm_quant_qmax = ofm_quant.quant_max if ofm_quant else np.iinfo(np.int16).max - ifm_min = cmd.ifm_tensor.quantization.min if cmd.ifm_tensor.quantization else np.iinfo(np.int16).min - ifm_max = cmd.ifm_tensor.quantization.max if cmd.ifm_tensor.quantization else np.iinfo(np.int16).max - - # Emit commands for any fused activation function - if faf is None: - emit.cmd0_with_param(cmd0.NPU_SET_ACTIVATION, activation.NONE) - # Even if no activation function, values need to be set to override previous values - faf_min = ofm_quant_qmin - faf_max = ofm_quant_qmax - elif faf == Op.Relu: - emit.cmd0_with_param(cmd0.NPU_SET_ACTIVATION, activation.NONE) - faf_min = quantise_float32(0.0, ofm_quant.scale_f32, ofm_quant.zero_point) - faf_max = ofm_quant_qmax - elif faf == Op.Relu6: - emit.cmd0_with_param(cmd0.NPU_SET_ACTIVATION, activation.NONE) - faf_min = quantise_float32(0.0, ofm_quant.scale_f32, ofm_quant.zero_point) - faf_max = quantise_float32(6.0, 
ofm_quant.scale_f32, ofm_quant.zero_point) - elif faf == Op.ReluN1To1: - emit.cmd0_with_param(cmd0.NPU_SET_ACTIVATION, activation.NONE) - faf_min = quantise_float32(-1.0, ofm_quant.scale_f32, ofm_quant.zero_point) - faf_max = quantise_float32(1.0, ofm_quant.scale_f32, ofm_quant.zero_point) - elif faf == Op.Tanh: - emit.cmd0_with_param(cmd0.NPU_SET_ACTIVATION, activation.TANH) - if primary_op.type in set((Op.AvgPool, Op.ResizeBilinear)): - faf_min = quantise_float32(-1.0, ofm_quant.scale_f32, ofm_quant.zero_point) - faf_max = quantise_float32(1.0, ofm_quant.scale_f32, ofm_quant.zero_point) - else: - faf_min = quantise_float32(clamp_tanh(ifm_min), ofm_quant.scale_f32, ofm_quant.zero_point) - faf_max = quantise_float32(clamp_tanh(ifm_max), ofm_quant.scale_f32, ofm_quant.zero_point) - elif faf == Op.Sigmoid: - emit.cmd0_with_param(cmd0.NPU_SET_ACTIVATION, activation.SIGMOID) - if primary_op.type in set((Op.AvgPool, Op.ResizeBilinear)): - faf_min = quantise_float32(0, ofm_quant.scale_f32, ofm_quant.zero_point) - faf_max = quantise_float32(1.0, ofm_quant.scale_f32, ofm_quant.zero_point) - else: - faf_min = quantise_float32(clamp_sigmoid(ifm_min), ofm_quant.scale_f32, ofm_quant.zero_point) - faf_max = quantise_float32(clamp_sigmoid(ifm_max), ofm_quant.scale_f32, ofm_quant.zero_point) - elif faf == Op.LUT: - lut_index = int(activation.LUT_START.value) + primary_op.attrs.get("lut_index", -1) - assert activation.LUT_START.value <= lut_index <= activation.LUT_END.value, "LUT index out of range." - if cmd.ofm_tensor.dtype == DataType.int32: - lut_index |= 3 << 12 # Force I8 range - emit.cmd0_with_param(cmd0.NPU_SET_ACTIVATION, lut_index) - faf_min = ofm_quant_qmin - faf_max = ofm_quant_qmax - else: - raise Exception("Unsupported fused_activation_function = " + faf.name) - - # Activation range needs to be set based upon the quantisation range and the fused activation range - emit.cmd0_with_param(cmd0.NPU_SET_ACTIVATION_MIN, max(ofm_quant_qmin, faf_min)) - emit.cmd0_with_param(cmd0.NPU_SET_ACTIVATION_MAX, min(ofm_quant_qmax, faf_max)) - - out_shape = cmd.ofm_box.get_size_shape() - if len(out_shape) >= 4: - emit.cmd0_with_param(cmd0.NPU_SET_OFM_HEIGHT_M1, out_shape[-3] - 1) - else: - emit.cmd0_with_param(cmd0.NPU_SET_OFM_HEIGHT_M1, 0) - if len(out_shape) >= 2: - emit.cmd0_with_param(cmd0.NPU_SET_OFM_WIDTH_M1, out_shape[-2] - 1) - else: - emit.cmd0_with_param(cmd0.NPU_SET_OFM_WIDTH_M1, 0) - emit.cmd0_with_param(cmd0.NPU_SET_OFM_DEPTH_M1, out_shape[-1] - 1) - - if npu_block_type in set((NpuBlockType.ConvolutionMxN, NpuBlockType.VectorProduct, NpuBlockType.ReduceSum)): - in_shape = cmd.ifm_box.get_size_shape() - emit.cmd0_with_param(cmd0.NPU_SET_IFM_DEPTH_M1, in_shape[-1] - 1) - else: - emit.cmd0_with_param(cmd0.NPU_SET_IFM_DEPTH_M1, out_shape[-1] - 1) - - for tens, box, region_op, ptr_ops, stride_ops, zero_point_op in ( - ( - cmd.ifm_tensor, - cmd.ifm_box, - cmd0.NPU_SET_IFM_REGION, - (cmd1.NPU_SET_IFM_BASE0, cmd1.NPU_SET_IFM_BASE1, cmd1.NPU_SET_IFM_BASE2, cmd1.NPU_SET_IFM_BASE3), - (cmd1.NPU_SET_IFM_STRIDE_C, cmd1.NPU_SET_IFM_STRIDE_Y, cmd1.NPU_SET_IFM_STRIDE_X), - cmd0.NPU_SET_IFM_ZERO_POINT, - ), - ( - cmd.ifm2_tensor, - cmd.ifm2_box, - cmd0.NPU_SET_IFM2_REGION, - ( - cmd1.NPU_SET_IFM2_BASE0, - cmd1.NPU_SET_IFM2_BASE1, - cmd1.NPU_SET_IFM2_BASE2, - cmd1.NPU_SET_IFM2_BASE3, - ), - (cmd1.NPU_SET_IFM2_STRIDE_C, cmd1.NPU_SET_IFM2_STRIDE_Y, cmd1.NPU_SET_IFM2_STRIDE_X), - cmd0.NPU_SET_IFM2_ZERO_POINT, - ), - ( - cmd.ofm_tensor, - cmd.ofm_box, - cmd0.NPU_SET_OFM_REGION, - (cmd1.NPU_SET_OFM_BASE0, 
cmd1.NPU_SET_OFM_BASE1, cmd1.NPU_SET_OFM_BASE2, cmd1.NPU_SET_OFM_BASE3),
- (cmd1.NPU_SET_OFM_STRIDE_C, cmd1.NPU_SET_OFM_STRIDE_Y, cmd1.NPU_SET_OFM_STRIDE_X),
- cmd0.NPU_SET_OFM_ZERO_POINT,
- ),
- ):
-
- if tens is None:
- continue
-
- need_zero_point = (
- (faf is not None and forced_ofm_quantization is None)
- or (fmf == Op.ConcatSliceWrite)
- or fused_quantize
- )
- if (
- (primary_op.type in set((Op.AvgPool, Op.ResizeBilinear, Op.CLZ, Op.SHL)) and not need_zero_point)
- or (
- tens.dtype == DataType.int32
- and zero_point_op in (cmd0.NPU_SET_IFM_ZERO_POINT, cmd0.NPU_SET_IFM2_ZERO_POINT)
- )
- or tens.quantization is None
- ):
- # Actual integer operation, just set scale to 1 and zero point to 0
- emit.cmd0_with_param(zero_point_op, 0)
- else:
- assert tens.quantization.zero_point is not None, "need an actual zero point set"
- if cmd0.NPU_SET_OFM_ZERO_POINT == zero_point_op and forced_ofm_quantization is not None:
- zero_point = forced_ofm_quantization.zero_point
- elif (
- "resizebilinear" in primary_op.attrs
- and primary_op.type == Op.Add
- and cmd0.NPU_SET_OFM_ZERO_POINT == zero_point_op
- ):
- # Force output zero point same as the input zero point
- # for resizebilinear 1x1 that is converted to add
- zero_point = cmd.ifm2_tensor.quantization.zero_point
- else:
- zero_point = tens.quantization.zero_point
- emit.cmd0_with_param(zero_point_op, int(zero_point))
-
- if tens.shape == []:
- # Empty shape, elementwise constant
- ifm2_scalar = tens.quant_values
- assert ifm2_scalar.size == 1
- emit.cmd0_with_param(cmd0.NPU_SET_IFM2_SCALAR, int(ifm2_scalar.item(0)))
- continue
-
- height_0, height_1, width_0, addresses = tens.addresses_for_rolling_buffer(
- box.start_coord, box.end_coord
- )
- if npu_block_type != NpuBlockType.VectorProduct:
- if tens == cmd.ifm_tensor:
- emit.cmd0_with_param(cmd0.NPU_SET_IFM_HEIGHT0_M1, height_0 - 1)
- emit.cmd0_with_param(cmd0.NPU_SET_IFM_HEIGHT1_M1, height_1 - 1)
- emit.cmd0_with_param(cmd0.NPU_SET_IFM_WIDTH0_M1, width_0 - 1)
- elif tens == cmd.ofm_tensor:
- emit.cmd0_with_param(cmd0.NPU_SET_OFM_HEIGHT0_M1, height_0 - 1)
- emit.cmd0_with_param(cmd0.NPU_SET_OFM_HEIGHT1_M1, height_1 - 1)
- emit.cmd0_with_param(cmd0.NPU_SET_OFM_WIDTH0_M1, width_0 - 1)
- if tens == cmd.ifm2_tensor:
- emit.cmd0_with_param(cmd0.NPU_SET_IFM2_HEIGHT0_M1, height_0 - 1)
- emit.cmd0_with_param(cmd0.NPU_SET_IFM2_HEIGHT1_M1, height_1 - 1)
- emit.cmd0_with_param(cmd0.NPU_SET_IFM2_WIDTH0_M1, width_0 - 1)
- else:
- if len(out_shape) == 2:
- assert out_shape[0] == 1
- if tens == cmd.ifm_tensor:
- emit.cmd0_with_param(cmd0.NPU_SET_IFM_WIDTH0_M1, 0)
- elif tens == cmd.ofm_tensor:
- emit.cmd0_with_param(cmd0.NPU_SET_OFM_WIDTH0_M1, 0)
- else:
- assert False
-
- emit.cmd0_with_param(region_op, base_ptr_idx_map[tens.mem_type])
-
- for idx, addr in enumerate(addresses):
- if addr is None:
- addresses[idx] = 0
-
- emit.cmd1_with_offset(ptr_ops[0], addresses[0])
- emit.cmd1_with_offset(ptr_ops[1], addresses[1])
- emit.cmd1_with_offset(ptr_ops[2], addresses[2])
- emit.cmd1_with_offset(ptr_ops[3], addresses[3])
-
- strides = tens.get_strides()
- emit.cmd1_with_offset(stride_ops[0], strides[1]) # stride between 16-byte channel blocks (C)
- emit.cmd1_with_offset(stride_ops[2], strides[3]) # stride between horizontal values (W)
- emit.cmd1_with_offset(stride_ops[1], strides[2]) # stride between vertical values (H)
-
- if tens.format == TensorFormat.NHCWB16:
- # Check that all BasePointer addresses are aligned to 16 bytes
- assert (int(addresses[0]) % 16) == 0
- assert (int(addresses[1]) % 16) == 0
- assert (int(addresses[2]) % 16) == 0
- assert (int(addresses[3]) % 16) == 0
-
- ofm_dtype = cmd.ofm_tensor.dtype
- assert ofm_dtype.type & BaseType.Int
- prec = 0
- if ofm_dtype.size_in_bits() == 8:
- prec = 0
- elif ofm_dtype.size_in_bits() == 16:
- prec = 2
- elif ofm_dtype.size_in_bits() == 32:
- prec = 4
- else:
- assert 0
-
- if ofm_dtype.type & BaseType.Signed:
- prec += 1
-
- if use_global_scale:
- # Set global scale bit, as opposed to using per channel scale
- prec |= 1 << 8
-
- if cmd.ofm_tensor.format == TensorFormat.NHCWB16:
- prec |= 1 << 6
-
- prec |= rounding_mode.value << 14
-
- emit.cmd0_with_param(cmd0.NPU_SET_OFM_PRECISION, prec)
-
- prec = None
- weight_bits = 8
- if cmd.weight_tensor is not None:
- weight_bits = cmd.weight_tensor.dtype.size_in_bits()
-
- ifm_dtype = cmd.ifm_tensor.dtype
-
- assert weight_bits == 8, "Unsupported weight bit depth"
- assert (
- ifm_dtype.size_in_bits() in {8, 16}
- or ifm_dtype.size_in_bits() == 32
- and npu_block_type in (NpuBlockType.ElementWise, NpuBlockType.ReduceSum)
- ), "Unsupported ifm bit depth"
-
- if ifm_dtype.size_in_bits() == 8:
- if ifm_dtype.type & BaseType.Signed:
- prec = ifm_precision.S8
- else:
- prec = ifm_precision.U8
- elif ifm_dtype.size_in_bits() == 16:
- if ifm_dtype.type & BaseType.Signed:
- prec = ifm_precision.S16
- else:
- prec = ifm_precision.U16
- elif ifm_dtype == DataType.int32:
- prec = ifm_precision.S32
-
- ifm_prec = prec.value
- ifm2_prec = ifm_prec
-
- if cmd.ifm_tensor.format == TensorFormat.NHCWB16:
- ifm_prec |= 1 << 6
-
- ifm_prec |= op_to_scale << 8
-
- emit.cmd0_with_param(cmd0.NPU_SET_IFM_PRECISION, ifm_prec)
-
- if cmd.ifm2_tensor is not None:
- if cmd.ifm2_tensor.format == TensorFormat.NHCWB16:
- ifm2_prec |= 1 << 6
- emit.cmd0_with_param(cmd0.NPU_SET_IFM2_PRECISION, ifm2_prec)
-
- # Get op parameters
- cur_ifm_block_depth = get_op_ifmofm_block_depth(arch, cmd)
- cur_ofm_block = Block(ps.block_config[1], ps.block_config[0], ps.block_config[3])
- cur_ofm_rect = get_op_ofm_rect(cmd)
- cur_ifm_rect = get_op_ifm_rect(cmd)
- cur_padLT = get_op_padding_lt(cmd)
- if (prev_kernel is not None) and (cur_kernel is not None) and has_prev_op_dependency(prev_cmd, cmd):
- if cmd.ifm_tensor.shape == prev_cmd.ofm_tensor.shape:
- blockdep = arch.calc_block_dep(
- prev_ifm_rect,
- prev_ofm_rect,
- prev_ifm_block_depth,
- prev_ofm_block,
- prev_kernel,
- cur_ifm_rect,
- cur_ofm_rect,
- cur_ifm_block_depth,
- cur_ofm_block,
- cur_kernel,
- cur_padLT,
- )
- else:
- blockdep = 0
- else:
- blockdep = ArchitectureFeatures.MAX_BLOCKDEP
-
- # Set between every op (dependent or not)
+ dep_watermark = Watermark(0, 0)
+ prev_op = None
+ prev_block_config = None
+ # Generate register commands for all operations
+ for op_index, npu_op in enumerate(npu_op_list):
+ dep_watermark, cmd_waits = get_wait_dependency(arch, npu_op_list, memory_accesses, op_index, dep_watermark)
+ block_config = generate_registers_for_op(emit, npu_op, arch)
+ if not is_dma_op(npu_op):
+ # Generate BLOCKDEP
+ assert block_config is not None
+ blockdep = calc_blockdep(arch, prev_op, prev_block_config, npu_op, block_config)
blockdep = min(blockdep, arch.max_blockdep)
emit.cmd0_with_param(cmd0.NPU_SET_BLOCKDEP, blockdep)
- prev_cmd = cmd
-
- emit_cmd_waits(cmd_waits)
- DebugDatabase.add_command(stream_id, emit.offset, primary_op)
-
- if npu_block_type == NpuBlockType.ConvolutionMxN:
- emit.cmd_do_operation(cmd0.NPU_OP_CONV)
- elif npu_block_type == NpuBlockType.ConvolutionDepthWise:
- emit.cmd_do_operation(cmd0.NPU_OP_DEPTHWISE)
- elif npu_block_type == NpuBlockType.VectorProduct:
- # Vector product is implemented using a 1x1 convolution
- emit.cmd_do_operation(cmd0.NPU_OP_CONV)
- elif npu_block_type == NpuBlockType.Pooling:
- param = pooling_mode.MAX.value if primary_op.type.is_maxpool_op() else pooling_mode.AVERAGE.value
- emit.cmd_do_operation(cmd0.NPU_OP_POOL, param=param)
- elif npu_block_type == NpuBlockType.ReduceSum:
- emit.cmd_do_operation(cmd0.NPU_OP_POOL, param=pooling_mode.REDUCE_SUM.value)
- elif npu_block_type == NpuBlockType.ElementWise:
- param = elementwise_mode_map[primary_op.type]
- emit.cmd_do_operation(cmd0.NPU_OP_ELEMENTWISE, param)
- else:
- print("Warning: Skipping register command stream generation for", ps)
-
+ prev_op = npu_op
+ prev_block_config = block_config
+
+ generate_cmd_waits(emit, cmd_waits)
+ # Generate the actual NPU_OP command
+ generate_operation_code(emit, npu_op)
+ if add_to_debug_db is not None:
+ add_to_debug_db(npu_op, emit.offset)
# Fill in final part of command stream:
emit.cmd_do_operation(cmd0.NPU_OP_STOP, param=0xFFFF)
+
+def generate_register_command_stream_for_sg(nng, sg, arch, verbose=False):
+ """Generates command stream for the subgraph, adds it to sg.register_command_stream"""
+ # Convert high level command stream to list of NpuOperation
+ npu_op_list = []
+ npu_op_to_cmd = dict() # map from npu op to high level command
+ for cmd in sg.high_level_command_stream:
+ if cmd.cmdtype == CommandType.NpuStripe and cmd.ps.npu_block_type == NpuBlockType.Default:
+ print("Warning: Skipping register command stream generation for", cmd.ps)
+ else:
+ npu_op = convert_command_to_npu_op(cmd, arch)
+ npu_op_list.append(npu_op)
+ npu_op_to_cmd[npu_op] = cmd
+ if verbose:
+ print_operations(npu_op_list)
+ # Generate register commands
+ stream_id = DebugDatabase.add_stream(sg)
+ DebugDatabase.set_stream_offset(sg, 0) # Default to zero, can only set during file writing
+ emit = CommandStreamEmitter()
+
+ def add_to_debug_db(npu_op: NpuOperation, offset: int):
+ """Adds info to the debug database"""
+ if not is_dma_op(npu_op):
+ cmd = npu_op_to_cmd[npu_op]
+ DebugDatabase.add_command(stream_id, offset, cmd.ps.primary_op)
+
+ generate_command_stream(emit, npu_op_list, arch, add_to_debug_db)
sg.register_command_stream = emit.to_list()
if verbose:
emit.print_cmds()
print("number of commands", len(emit.cmd_stream))
print("command stream length in words", len(sg.register_command_stream))
+
+
+def generate_register_command_stream(npu_op_list: List[NpuOperation], accelerator: Accelerator) -> List[int]:
+ """
+ Public-facing API for generating an Ethos-U register command stream.
+ Calculates dependencies between commands and inserts wait operations if needed.
+
+ :param npu_op_list: List[NpuOperation] list of high-level NPU operations
+ :param accelerator: architecture_features.Accelerator enum to pick the correct Ethos-U accelerator
+ :return: Ethos-U instructions, as a list of 32-bit integers
+ """
+ emit = CommandStreamEmitter()
+ arch = ArchitectureFeatures(
+ vela_config=None,
+ system_config=None,
+ accelerator_config=accelerator.value,
+ override_block_config=None,
+ block_config_limit=None,
+ global_memory_clock_scale=1.0,
+ max_blockdep=ArchitectureFeatures.MAX_BLOCKDEP,
+ weight_estimation_scaling=1.0,
+ )
+ generate_command_stream(emit, npu_op_list, arch)
+ return emit.to_list()
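
A minimal usage sketch for the public API added at the end of this patch, assuming the ethosu.vela package layout implied by the file's imports; NpuDmaOperation and NpuAddressRange come from the api module, Accelerator from architecture_features, and all region/address/length values below are purely illustrative:

# Minimal sketch: drive generate_register_command_stream with one DMA copy.
from ethosu.vela.api import NpuAddressRange
from ethosu.vela.api import NpuDmaOperation
from ethosu.vela.architecture_features import Accelerator
from ethosu.vela.register_command_stream_generator import generate_register_command_stream

# Describe a 64-byte DMA copy from region 0 to region 1 (illustrative values)
src = NpuAddressRange(region=0, address=0x100, length=64)
dest = NpuAddressRange(region=1, address=0x800, length=64)
dma_op = NpuDmaOperation(src, dest)

# Returns the Ethos-U register command stream as a list of 32-bit words
cmds = generate_register_command_stream([dma_op], Accelerator.Ethos_U55_128)
print("command stream length in words:", len(cmds))

Any list of NpuOperation objects built through the api module can be passed the same way; as the function body above shows, it constructs a default ArchitectureFeatures from the accelerator enum and delegates to generate_command_stream, which computes memory accesses and inserts the required wait operations.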