From a4f8411f870defaba52175717b40afdd41ae0d40 Mon Sep 17 00:00:00 2001
From: William Isaksson
Date: Mon, 19 Jun 2023 15:31:46 +0000
Subject: MLBEDSW-7718: Add cmd1 payload legality checks

- checks that cmd1 payloads are legal in register_command_stream_generator,
- adds unit tests

Change-Id: I2bc23147f60fe090c71703f08d9cbaa279fac86e
Signed-off-by: William Isaksson
---
 ethosu/vela/errors.py                             |  16 ++-
 ethosu/vela/register_command_stream_generator.py  |  57 +++++++--
 ethosu/vela/register_command_stream_util.py       |  55 +++++++++
 .../test/extapi/test_extapi_generate_commands.py  | 137 +++++++++++++++++++--
 4 files changed, 244 insertions(+), 21 deletions(-)

diff --git a/ethosu/vela/errors.py b/ethosu/vela/errors.py
index bf3bb4d3..22fc17c6 100644
--- a/ethosu/vela/errors.py
+++ b/ethosu/vela/errors.py
@@ -1,4 +1,4 @@
-# SPDX-FileCopyrightText: Copyright 2020-2021 Arm Limited and/or its affiliates
+# SPDX-FileCopyrightText: Copyright 2020-2021, 2023 Arm Limited and/or its affiliates
 #
 # SPDX-License-Identifier: Apache-2.0
 #
@@ -75,3 +75,17 @@ class AllocationError(VelaError):
 
     def __init__(self, msg):
         super().__init__(f"Allocation failed: {msg}")
+
+
+class ByteAlignmentError(VelaError):
+    """Raised when value is unaligned"""
+
+    def __init__(self, msg):
+        super().__init__(f"Unaligned Value: {msg}")
+
+
+class ByteSizeError(VelaError):
+    """Raised when size has illegal value"""
+
+    def __init__(self, msg):
+        super().__init__(f"Illegal Size: {msg}")
diff --git a/ethosu/vela/register_command_stream_generator.py b/ethosu/vela/register_command_stream_generator.py
index 6001a3b8..71fec3be 100644
--- a/ethosu/vela/register_command_stream_generator.py
+++ b/ethosu/vela/register_command_stream_generator.py
@@ -60,6 +60,8 @@ from .architecture_features import Accelerator
 from .architecture_features import ArchitectureFeatures
 from .architecture_features import create_default_arch
 from .architecture_features import SHRAMElements
+from .errors import ByteAlignmentError
+from .errors import ByteSizeError
 from .errors import VelaError
 from .ethos_u55_regs.ethos_u55_regs import acc_format
 from .ethos_u55_regs.ethos_u55_regs import activation
@@ -76,6 +78,11 @@ from .operation import NpuBlockType
 from .range_set import MemoryAccessSet
 from .register_command_stream_util import BASE_PTR_INDEX_MEM2MEM
 from .register_command_stream_util import calc_blockdep
+from .register_command_stream_util import check_addresses
+from .register_command_stream_util import check_alignment
+from .register_command_stream_util import check_dma_op
+from .register_command_stream_util import check_size
+from .register_command_stream_util import check_strides
 from .register_command_stream_util import get_dma_memory_accesses
 from .register_command_stream_util import get_op_memory_accesses
 from .register_command_stream_util import get_strides
@@ -335,11 +342,16 @@ def generate_activation(emit: CommandStreamEmitter, activation: Optional[NpuActi
     emit.cmd0_with_param(cmd0.NPU_SET_ACTIVATION_MAX, quantized_max)
 
 
-def generate_addresses(emit: CommandStreamEmitter, ptr_cmds: List[cmd1], addresses: List[int], layout: NpuLayout):
+def generate_addresses(
+    emit: CommandStreamEmitter,
+    ptr_cmds: List[cmd1],
+    addresses: List[int],
+    layout: NpuLayout,
+    element_size,
+    arch: ArchitectureFeatures,
+):
     """Generates xFM_BASE registers"""
-    if layout == NpuLayout.NHCWB16:
-        # Check that all BasePointer addresses are aligned to 16 bytes
-        assert all((int(addr) % 16) == 0 for addr in addresses)
+    check_addresses(addresses, layout, element_size, arch)
     for i in range(4):
         emit.cmd1_with_address(ptr_cmds[i], addresses[i])
 
@@ -356,6 +368,8 @@ def generate_strides(
 ):
     """Generates STRIDE_C/Y/X registers"""
     strides = get_strides(fm)
+    check_strides(fm, strides)
+
     emit.cmd1_with_address(stride_c_cmd, strides.depth)  # stride between 16-byte channel blocks (C)
     emit.cmd1_with_address(stride_y_cmd, strides.height)  # stride between vertical values (H)
     emit.cmd1_with_address(stride_x_cmd, strides.width)  # stride between horizontal values (W)
@@ -420,7 +434,7 @@ def generate_ifm2_broadcast(emit: CommandStreamEmitter, npu_op: NpuElementWiseOp
     emit.cmd0_with_param(cmd0.NPU_SET_IFM2_BROADCAST, ifm2_broadcast)
 
 
-def generate_ifm(emit: CommandStreamEmitter, ifm: NpuFeatureMap):
+def generate_ifm(emit: CommandStreamEmitter, ifm: NpuFeatureMap, arch: ArchitectureFeatures):
     """Generates general IFM registers"""
     emit.cmd0_with_param(cmd0.NPU_SET_IFM_REGION, ifm.region)
     generate_addresses(
@@ -428,6 +442,8 @@ def generate_ifm(emit: CommandStreamEmitter, ifm: NpuFeatureMap):
         [cmd1.NPU_SET_IFM_BASE0, cmd1.NPU_SET_IFM_BASE1, cmd1.NPU_SET_IFM_BASE2, cmd1.NPU_SET_IFM_BASE3],
         ifm.tiles.addresses,
         ifm.layout,
+        ifm.data_type.size_in_bytes(),
+        arch,
     )
     generate_tiles(
         emit, [cmd0.NPU_SET_IFM_HEIGHT0_M1, cmd0.NPU_SET_IFM_HEIGHT1_M1, cmd0.NPU_SET_IFM_WIDTH0_M1], ifm.tiles
@@ -437,7 +453,7 @@ def generate_ifm(emit: CommandStreamEmitter, ifm: NpuFeatureMap):
     emit.cmd0_with_param(cmd0.NPU_SET_IFM_ZERO_POINT, get_zero_point(ifm))
 
 
-def generate_ifm2(emit: CommandStreamEmitter, ifm2: NpuFeatureMap, has_scalar: bool):
+def generate_ifm2(emit: CommandStreamEmitter, ifm2: NpuFeatureMap, has_scalar: bool, arch: ArchitectureFeatures):
     """Generates general IFM2 registers"""
     if not has_scalar:
         emit.cmd0_with_param(cmd0.NPU_SET_IFM2_REGION, ifm2.region)
@@ -446,6 +462,8 @@ def generate_ifm2(emit: CommandStreamEmitter, ifm2: NpuFeatureMap, has_scalar: b
         [cmd1.NPU_SET_IFM2_BASE0, cmd1.NPU_SET_IFM2_BASE1, cmd1.NPU_SET_IFM2_BASE2, cmd1.NPU_SET_IFM2_BASE3],
         ifm2.tiles.addresses,
         ifm2.layout,
+        ifm2.data_type.size_in_bytes(),
+        arch,
     )
     generate_tiles(
         emit, [cmd0.NPU_SET_IFM2_HEIGHT0_M1, cmd0.NPU_SET_IFM2_HEIGHT1_M1, cmd0.NPU_SET_IFM2_WIDTH0_M1], ifm2.tiles
@@ -454,7 +472,7 @@ def generate_ifm2(emit: CommandStreamEmitter, ifm2: NpuFeatureMap, has_scalar: b
     emit.cmd0_with_param(cmd0.NPU_SET_IFM2_ZERO_POINT, get_zero_point(ifm2))
 
 
-def generate_ofm(emit: CommandStreamEmitter, ofm: NpuFeatureMap):
+def generate_ofm(emit: CommandStreamEmitter, ofm: NpuFeatureMap, arch: ArchitectureFeatures):
     """Generates general OFM registers"""
     emit.cmd0_with_param(cmd0.NPU_SET_OFM_REGION, ofm.region)
     generate_addresses(
@@ -462,6 +480,8 @@ def generate_ofm(emit: CommandStreamEmitter, ofm: NpuFeatureMap):
         [cmd1.NPU_SET_OFM_BASE0, cmd1.NPU_SET_OFM_BASE1, cmd1.NPU_SET_OFM_BASE2, cmd1.NPU_SET_OFM_BASE3],
         ofm.tiles.addresses,
         ofm.layout,
+        ofm.data_type.size_in_bytes(),
+        arch,
     )
     generate_tiles(
         emit, [cmd0.NPU_SET_OFM_HEIGHT0_M1, cmd0.NPU_SET_OFM_HEIGHT1_M1, cmd0.NPU_SET_OFM_WIDTH0_M1], ofm.tiles
@@ -505,9 +525,12 @@ def generate_weights(emit: CommandStreamEmitter, weights: List[NpuAddressRange],
         ]
     ):
         if core < len(weights):
+            check_alignment(weights[core].address, 16)
+            check_size(weights[core].length, 16)
             emit.cmd1_with_address(addr, weights[core].address)
             emit.cmd1_with_offset(length, weights[core].length)
         elif core < arch.ncores:
+            check_alignment(weights[0].address, 16)
             emit.cmd1_with_address(addr, weights[0].address)
             emit.cmd1_with_offset(length, 0)
 
@@ -523,6 +546,7 @@ def generate_biases(emit: CommandStreamEmitter, biases: List[NpuAddressRange], a
     ):
         if core < len(biases):
             emit.cmd1_with_address(addr, biases[core].address)
+            check_size(biases[core].length, 16)
             emit.cmd1_with_offset(length, biases[core].length)
         elif core < arch.ncores:
             emit.cmd1_with_address(addr, biases[0].address)
             emit.cmd1_with_offset(length, 0)
@@ -631,12 +655,12 @@ def generate_common(
 ):
     """Generate registers that are common to most operations"""
     assert npu_op.ifm is not None and npu_op.ofm is not None
-    generate_ifm(emit, npu_op.ifm)
+    generate_ifm(emit, npu_op.ifm, arch)
     generate_ifm_precision(emit, npu_op.ifm, op_to_scale, cmd0.NPU_SET_IFM_PRECISION)
     emit.cmd0_with_param(cmd0.NPU_SET_IFM_UPSCALE, resampling_mode_map[npu_op.ifm_upscale])
     if npu_op.padding is not None:
         generate_padding(emit, npu_op.padding)
-    generate_ofm(emit, npu_op.ofm)
+    generate_ofm(emit, npu_op.ofm, arch)
     generate_ofm_precision(emit, npu_op, use_global_scale)
     if npu_op.op_type != NpuOperationType.ElementWise:
         assert npu_op.kernel is not None
@@ -974,7 +998,7 @@ def generate_elementwise_op(emit: CommandStreamEmitter, npu_op: NpuElementWiseOp
         # Binary operation; generate IFM2 registers
         assert npu_op.ifm2 is not None
         has_scalar = npu_op.ifm2_scalar is not None
-        generate_ifm2(emit, npu_op.ifm2, has_scalar)
+        generate_ifm2(emit, npu_op.ifm2, has_scalar, arch)
         generate_ifm_precision(emit, npu_op.ifm2, 0, cmd0.NPU_SET_IFM2_PRECISION)
         generate_ifm2_broadcast(emit, npu_op)
         if has_scalar:
@@ -983,8 +1007,10 @@ def generate_elementwise_op(emit: CommandStreamEmitter, npu_op: NpuElementWiseOp
             emit.cmd0_with_param(cmd0.NPU_SET_IFM2_SCALAR, quantized_scalar)
 
 
-def generate_dma_op(emit: CommandStreamEmitter, dma_op: NpuDmaOperation):
+def generate_dma_op(emit: CommandStreamEmitter, dma_op: NpuDmaOperation, arch: ArchitectureFeatures):
     """Generates register commands for DMA operations"""
+    check_dma_op(dma_op, arch)
+
     emit.cmd0_with_param(cmd0.NPU_SET_DMA0_SRC_REGION, dma_op.src.region)
     emit.cmd1_with_address(cmd1.NPU_SET_DMA0_SRC, dma_op.src.address)
     emit.cmd0_with_param(cmd0.NPU_SET_DMA0_DST_REGION, dma_op.dest.region)
@@ -1007,7 +1033,7 @@ def generate_registers_for_op(emit: CommandStreamEmitter, npu_op: NpuOperation,
     elif isinstance(npu_op, NpuElementWiseOperation):
         generate_elementwise_op(emit, npu_op, arch)
     elif isinstance(npu_op, NpuDmaOperation):
-        generate_dma_op(emit, npu_op)
+        generate_dma_op(emit, npu_op, arch)
     else:
         assert 0, "Unsupported operation"
 
@@ -1048,8 +1074,13 @@ def generate_command_stream(
             check_mem_limits(memory_accesses[npu_op], mem_limits)
             cmd_waits = get_wait_dependency(arch, npu_op, memory_accesses, outstanding_dma_ops, outstanding_npu_ops)
             generate_registers_for_op(emit, npu_op, arch)
+        except ByteAlignmentError as e:
+            # Enables testing for ByteAlignmentErrors specifically
+            raise ByteAlignmentError(f"{e.error_msg}, in operation {op_index}:{npu_op.op_type.name}") from None
+        except ByteSizeError as e:
+            # Enables testing for ByteSizeErrors specifically
+            raise ByteSizeError(f"{e.error_msg}, in operation {op_index}:{npu_op.op_type.name}") from None
         except VelaError as e:
-            # Add operation info and rethrow
             raise VelaError(f"{e.error_msg}, in operation {op_index}:{npu_op.op_type.name}") from None
         if not isinstance(npu_op, NpuDmaOperation) and isinstance(npu_op, NpuBlockOperation):
             # Generate BLOCKDEP
diff --git a/ethosu/vela/register_command_stream_util.py b/ethosu/vela/register_command_stream_util.py
index b131f647..c7050a38 100644
--- a/ethosu/vela/register_command_stream_util.py
+++ b/ethosu/vela/register_command_stream_util.py
@@ -37,12 +37,16 @@ from .api import NpuShape3D
 from .architecture_features import ArchitectureFeatures
 from .architecture_features import Block
 from .architecture_features import Rect
+from .errors import ByteAlignmentError
+from .errors import ByteSizeError
 from .operation import Kernel
 from .operation import PointXYZ
+from .tensor import TensorFormat
 from ethosu.vela.range_set import AccessDirection
 from ethosu.vela.range_set import MemoryAccessSet
 from ethosu.vela.range_set import MemoryRangeSet
 
+
 # base address slot for memory to memory transfer
 BASE_PTR_INDEX_MEM2MEM = int((1 << 8) | (3 << 0))
 
@@ -50,6 +54,18 @@ BASE_PTR_INDEX_MEM2MEM = int((1 << 8) | (3 << 0))
 UNARY_ELEMWISE_OPS = (NpuElementWiseOp.ABS, NpuElementWiseOp.LRELU, NpuElementWiseOp.CLZ)
 
 
+def check_alignment(payload, required_alignment):
+    # assuming payload is defined in bytes
+    if payload % required_alignment != 0:
+        raise ByteAlignmentError(f"Cmd1 payload of size: {payload} Bytes is not {required_alignment}-byte aligned")
+
+
+def check_size(payload, required_multiple):
+    # assuming payload is defined in bytes
+    if payload % required_multiple != 0:
+        raise ByteSizeError(f"Cmd1 payload of size: {payload} Bytes is not a multiple of {required_multiple}")
+
+
 def to_npu_kernel(kernel: Kernel) -> NpuKernel:
     """Converts the given internally used kernel object to NpuKernel (of public API)"""
     return NpuKernel(
@@ -241,6 +257,29 @@ def get_address_ranges(fm: NpuFeatureMap) -> List[Optional[NpuAddressRange]]:
     return [t0, t1, t2, t3]
 
 
+def check_strides(fm: NpuFeatureMap, strides: NpuShape3D):
+
+    element_size_in_bytes = fm.data_type.size_in_bytes()
+
+    if fm.layout == NpuLayout.NHCWB16:
+        strides_to_check = [strides.depth, strides.height]
+        required_multiple = 16 * element_size_in_bytes
+    else:
+        strides_to_check = [strides.height, strides.width]
+        required_multiple = element_size_in_bytes
+    for stride in strides_to_check:
+        check_size(stride, required_multiple)
+
+
+def check_addresses(addresses: List[int], layout: NpuLayout, element_size, arch: ArchitectureFeatures):
+    if layout == NpuLayout.NHCWB16:
+        required_alignment = arch.storage_rounding_quantums[TensorFormat.NHCWB16][-1]
+    else:
+        required_alignment = element_size
+    for addr in addresses:
+        check_alignment(addr, required_alignment)
+
+
 # -------------------------------------------------------------------
 # DMA_WAIT/KERNEL_WAIT
 # -------------------------------------------------------------------
@@ -336,6 +375,22 @@ def get_wait_dependency(
     return cmd_waits
 
 
+def check_dma_op(dma_op: NpuDmaOperation, arch: ArchitectureFeatures):
+
+    # For Ethos-U65 only internal addresses have to be aligned, and if the internal address is the destination
+    # then the length also has to be a multiple of 16.
+    if arch.is_ethos_u65_system:
+        if dma_op.src.region == BASE_PTR_INDEX_MEM2MEM:
+            check_alignment(dma_op.src.address, 16)
+        if dma_op.dest.region == BASE_PTR_INDEX_MEM2MEM:
+            check_alignment(dma_op.dest.address, 16)
+            check_size(dma_op.src.length, 16)
+    else:
+        check_alignment(dma_op.src.address, 16)
+        check_alignment(dma_op.dest.address, 16)
+        check_size(dma_op.src.length, 16)
+
+
 # -------------------------------------------------------------------
 # BLOCKDEP
 # -------------------------------------------------------------------
diff --git a/ethosu/vela/test/extapi/test_extapi_generate_commands.py b/ethosu/vela/test/extapi/test_extapi_generate_commands.py
index 441c4a4f..6284faa3 100644
--- a/ethosu/vela/test/extapi/test_extapi_generate_commands.py
+++ b/ethosu/vela/test/extapi/test_extapi_generate_commands.py
@@ -1,4 +1,4 @@
-# SPDX-FileCopyrightText: Copyright 2020-2023 Arm Limited and/or its affiliates
+# SPDX-FileCopyrightText: Copyright 2020-2021, 2023 Arm Limited and/or its affiliates
 #
 # SPDX-License-Identifier: Apache-2.0
 #
@@ -42,6 +42,8 @@ from ethosu.vela.api import NpuShape3D
 from ethosu.vela.api import NpuTileBox
 from ethosu.vela.architecture_features import Accelerator
 from ethosu.vela.architecture_features import create_default_arch
+from ethosu.vela.errors import ByteAlignmentError
+from ethosu.vela.errors import ByteSizeError
 from ethosu.vela.errors import VelaError
 from ethosu.vela.ethos_u55_regs.ethos_u55_regs import cmd0
 from ethosu.vela.ethos_u55_regs.ethos_u55_regs import cmd1
@@ -49,6 +51,7 @@ from ethosu.vela.high_level_command_to_npu_op import BasePointerIndex
 from ethosu.vela.high_level_command_to_npu_op import get_mem_limits_for_regions
 from ethosu.vela.register_command_stream_generator import CmdMode
 from ethosu.vela.register_command_stream_generator import generate_command_stream
+from ethosu.vela.register_command_stream_util import BASE_PTR_INDEX_MEM2MEM
 from ethosu.vela.register_command_stream_util import get_address_ranges
 
 
@@ -380,10 +383,10 @@ def test_mul_with_broadcast_and_relu():
 def create_avg_pool_op() -> NpuPoolingOperation:
     op = NpuPoolingOperation(NpuPoolingOp.AVERAGE)
     op.ifm = create_feature_map(
-        NpuShape3D(height=29, width=30, depth=27), 2, 0, quant=NpuQuantization(scale_f32=0.007843138, zero_point=128)
+        NpuShape3D(height=32, width=30, depth=28), 2, 0, quant=NpuQuantization(scale_f32=0.007843138, zero_point=128)
     )
     op.ofm = create_feature_map(
-        NpuShape3D(height=10, width=10, depth=27),
+        NpuShape3D(height=10, width=10, depth=28),
         2,
         0x5BD0,
         quant=NpuQuantization(scale_f32=0.20392157, zero_point=128),
@@ -778,25 +781,25 @@ def test_check_mem_limits():
     # Tests that no code is generated with addresses out of bounds
     conv_op = create_fully_connected_op()
     # bias with end address out of range
-    conv_op.biases = [NpuAddressRange(region=0, address=(1 << 32) - 16, length=1000)]
+    conv_op.biases = [NpuAddressRange(region=0, address=(1 << 32) - 16, length=1024)]
     with pytest.raises(VelaError):
         npu_generate_register_command_stream([conv_op], NpuAccelerator.Ethos_U55_64)
     # same test should pass with Ethos_U65_512
     npu_generate_register_command_stream([conv_op], NpuAccelerator.Ethos_U65_512)
     # weights with end address out of range
     conv_op = create_fully_connected_op()
-    conv_op.weights = [NpuAddressRange(region=0, address=(1 << 40) - 960, length=1000)]
+    conv_op.weights = [NpuAddressRange(region=0, address=(1 << 40) - 960, length=1024)]
     with pytest.raises(VelaError):
         npu_generate_register_command_stream([conv_op], NpuAccelerator.Ethos_U65_256)
     # bias with high end address, but still within range
     addr = (1 << 40) - 1024
     conv_op = create_fully_connected_op()
-    conv_op.biases = [NpuAddressRange(region=0, address=addr, length=1000)]
+    conv_op.biases = [NpuAddressRange(region=0, address=addr, length=1024)]
     cmds = npu_generate_register_command_stream([conv_op], NpuAccelerator.Ethos_U65_512)
     check_cmd1(cmds, cmd1.NPU_SET_SCALE_BASE, addr & ((1 << 32) - 1), (addr >> 32) & ((1 << 16) - 1))
     conv_op = create_fully_connected_op()
     # weights with negative address
-    conv_op.weights = [NpuAddressRange(region=0, address=-16, length=1000)]
+    conv_op.weights = [NpuAddressRange(region=0, address=-16, length=1024)]
     with pytest.raises(VelaError):
         npu_generate_register_command_stream([conv_op], NpuAccelerator.Ethos_U55_32)
     op = create_avg_pool_op()
@@ -811,6 +814,126 @@ def test_check_mem_limits():
     npu_generate_register_command_stream([op], NpuAccelerator.Ethos_U55_64)
 
+
+def test_cmd1_payload_legality():
+    # Tests payload legality
+
+    # Test bias and weight payload legality
+    # Illegal bias length fails
+    conv_op = create_fully_connected_op()
+    conv_op.biases = [NpuAddressRange(region=0, address=111, length=24)]
+    with pytest.raises(ByteSizeError):
+        npu_generate_register_command_stream([conv_op], NpuAccelerator.Ethos_U55_64)
+    # Legal bias length passes
+    conv_op.biases = [NpuAddressRange(region=0, address=111, length=32)]
+    npu_generate_register_command_stream([conv_op], NpuAccelerator.Ethos_U55_64)
+
+    # Illegal weight length fails
+    conv_op = create_fully_connected_op()
+    conv_op.weights = [NpuAddressRange(region=0, address=128, length=24)]
+    with pytest.raises(ByteSizeError):
+        npu_generate_register_command_stream([conv_op], NpuAccelerator.Ethos_U55_64)
+    # Legal weight length passes
+    conv_op.weights = [NpuAddressRange(region=0, address=128, length=32)]
+    npu_generate_register_command_stream([conv_op], NpuAccelerator.Ethos_U55_64)
+
+    # Unaligned weight address fails
+    conv_op = create_fully_connected_op()
+    conv_op.weights = [NpuAddressRange(region=0, address=120, length=32)]
+    with pytest.raises(ByteAlignmentError):
+        npu_generate_register_command_stream([conv_op], NpuAccelerator.Ethos_U55_64)
+    # Aligned weight length already tested
+
+    # Test DMA payload legality
+    # Illegal dma length Ethos-U55 fails
+    dest = NpuAddressRange(BASE_PTR_INDEX_MEM2MEM, 256, 120)
+    src = NpuAddressRange(0, 512, 120)
+    dma_op = NpuDmaOperation(src, dest)
+    with pytest.raises(ByteSizeError):
+        npu_generate_register_command_stream([dma_op], NpuAccelerator.Ethos_U55_64)
+
+    # Legal dma length Ethos-U55 passes
+    dest = NpuAddressRange(BASE_PTR_INDEX_MEM2MEM, 256, 128)
+    src = NpuAddressRange(0, 512, 128)
+    dma_op = NpuDmaOperation(src, dest)
+    npu_generate_register_command_stream([dma_op], NpuAccelerator.Ethos_U55_64)
+
+    # Length not a multiple of 16, Ethos-U65, internal dma destination, fails
+    dest = NpuAddressRange(BASE_PTR_INDEX_MEM2MEM, 256, 120)
+    src = NpuAddressRange(0, 512, 120)
+    dma_op = NpuDmaOperation(src, dest)
+    with pytest.raises(ByteSizeError):
+        npu_generate_register_command_stream([dma_op], NpuAccelerator.Ethos_U65_256)
+    # Length not a multiple of 16, Ethos-U65, external dma destination passes
+    dest = NpuAddressRange(2, 256, 120)
+    src = NpuAddressRange(0, 512, 120)
+    dma_op = NpuDmaOperation(src, dest)
+    npu_generate_register_command_stream([dma_op], NpuAccelerator.Ethos_U65_256)
+
+    # Test fm stride payload legality
+    ifm_shape = NpuShape3D(height=30, width=62, depth=46)
+    address = 512
+    op = NpuConv2DOperation()
+    op.ifm = create_feature_map(
+        ifm_shape,
+        1,
+        address,
+        quant=NpuQuantization(scale_f32=0.007843138, zero_point=128),
+        dtype=NpuDataType.INT16,
+    )
+    op.ofm = create_feature_map(
+        NpuShape3D(height=30, width=31, depth=46),
+        1,
+        0x14E40,
+        quant=NpuQuantization(scale_f32=0.20392157, zero_point=128),
+        dtype=NpuDataType.INT16,
+    )
+    op.kernel = NpuKernel(3, 2, 2, 1)
+    op.weights = [NpuAddressRange(region=0, address=0, length=7696)]
+    op.biases = [NpuAddressRange(region=0, address=32000, length=464)]
+    op.padding = NpuPadding(top=0, left=0, right=1, bottom=1)
+    op.block_traversal = NpuBlockTraversal.PART_KERNEL_FIRST
+    op.block_config = NpuShape3D(height=16, width=4, depth=16)
+
+    # NHWC depth stride not a multiple of 32 passes
+    op.ifm.strides = NpuShape3D(depth=16, height=2, width=16)
+    npu_generate_register_command_stream([op], NpuAccelerator.Ethos_U65_256)
+
+    # Same depth stride fails for NHCWB16
+    op.ifm = create_feature_map(
+        ifm_shape,
+        1,
+        address,
+        quant=NpuQuantization(scale_f32=0.007843138, zero_point=128),
+        layout=NpuLayout.NHCWB16,
+        dtype=NpuDataType.INT16,
+    )
+    op.ifm.strides = NpuShape3D(depth=16, height=2, width=16)
+    with pytest.raises(ByteSizeError):
+        npu_generate_register_command_stream([op], NpuAccelerator.Ethos_U65_256)
+
+    # Test fm address payload alignment
+
+    # Unaligned address fails
+    op.ifm = create_feature_map(
+        ifm_shape,
+        1,
+        address,
+        quant=NpuQuantization(scale_f32=0.007843138, zero_point=128),
+        layout=NpuLayout.NHCWB16,
+        dtype=NpuDataType.INT16,
+    )
+    op.ifm.tiles = NpuTileBox(
+        width_0=ifm_shape.width, height_0=ifm_shape.height, height_1=ifm_shape.height, addresses=[address, 16, 16, 24]
+    )
+    with pytest.raises(ByteAlignmentError):
+        npu_generate_register_command_stream([op], NpuAccelerator.Ethos_U65_256)
+    # Aligned address passes
+    op.ifm.tiles = NpuTileBox(
+        width_0=ifm_shape.width, height_0=ifm_shape.height, height_1=ifm_shape.height, addresses=[address, 16, 16, 16]
+    )
+    npu_generate_register_command_stream([op], NpuAccelerator.Ethos_U65_256)
+
+
 
 def test_check_sram_limit_spilling():
     # Tests that no code is generated with addresses outside available sram spilling range
     arch = create_default_arch(Accelerator.Ethos_U65_512)
-- 
cgit v1.2.1
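
Both new helpers reduce to a modulo test on a byte quantity: one raises ByteAlignmentError for misaligned addresses, the other raises ByteSizeError for lengths that are not a whole multiple of the required size. Below is a minimal standalone sketch of that behaviour (plain Python, no vela imports; the helper names mirror register_command_stream_util.py, the exception classes merely stand in for vela's, and the sample values are borrowed from test_cmd1_payload_legality):

# Minimal sketch of the two checks introduced by this patch. The exception
# classes stand in for vela's ByteAlignmentError/ByteSizeError.


class ByteAlignmentError(Exception):
    pass


class ByteSizeError(Exception):
    pass


def check_alignment(payload, required_alignment):
    # payload is a byte address: it must sit on a required_alignment boundary
    if payload % required_alignment != 0:
        raise ByteAlignmentError(f"{payload} is not {required_alignment}-byte aligned")


def check_size(payload, required_multiple):
    # payload is a byte length: it must be a whole number of required_multiple-byte chunks
    if payload % required_multiple != 0:
        raise ByteSizeError(f"{payload} is not a multiple of {required_multiple}")


check_alignment(128, 16)  # weight base 128 passes: 128 == 8 * 16
check_size(32, 16)        # weight/bias length 32 passes: 32 == 2 * 16
try:
    check_size(24, 16)    # length 24 fails, as in the new unit test
except ByteSizeError as exc:
    print(exc)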
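
The DMA check is the only one that branches on the accelerator: on Ethos-U65 only internal (mem2mem) addresses must be 16-byte aligned, and the length is only constrained when the destination is internal, whereas on Ethos-U55 every DMA address and length is a 16-byte quantity. A sketch with the accelerator reduced to a flag and the region == BASE_PTR_INDEX_MEM2MEM comparison reduced to a boolean (dma_is_legal is an illustrative helper, not part of the patch):

# Illustrative restatement of check_dma_op's branching.


def dma_is_legal(is_u65, src_internal, dest_internal, src_addr, dest_addr, length):
    if is_u65:
        # Ethos-U65: only internal addresses must be 16-byte aligned, and the
        # length is only constrained when the destination is internal.
        if src_internal and src_addr % 16 != 0:
            return False
        if dest_internal and (dest_addr % 16 != 0 or length % 16 != 0):
            return False
        return True
    # Ethos-U55: every DMA address and length must be a 16-byte quantity.
    return src_addr % 16 == 0 and dest_addr % 16 == 0 and length % 16 == 0


# The DMA cases exercised by test_cmd1_payload_legality:
print(dma_is_legal(False, False, True, 512, 256, 120))  # U55, length 120 -> False
print(dma_is_legal(False, False, True, 512, 256, 128))  # U55, length 128 -> True
print(dma_is_legal(True, False, True, 512, 256, 120))   # U65, internal dest -> False
print(dma_is_legal(True, False, False, 512, 256, 120))  # U65, external dest -> True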
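
check_strides applies a layout-dependent multiple: for NHCWB16 the depth and height strides must be multiples of 16 * element_size bytes (one full 16-channel brick), while for NHWC the height and width strides only need element alignment. A sketch with the layout reduced to a string (required_stride_checks is an illustrative helper; the sample strides reproduce the pass/fail pair in the new test, with element size 2 for INT16):

# Illustrative reduction of check_strides; "NHCWB16"/"NHWC" stand in for
# vela's NpuLayout values. element_size is the data type width in bytes.


def required_stride_checks(layout, element_size, strides):
    # Return (stride_value, required_multiple) pairs that must hold.
    if layout == "NHCWB16":
        # depth/height strides step over whole 16-channel bricks
        brick = 16 * element_size
        return [(strides["depth"], brick), (strides["height"], brick)]
    # NHWC: height/width strides only need element alignment
    return [(strides["height"], element_size), (strides["width"], element_size)]


# The pass/fail pair from test_cmd1_payload_legality (INT16, element_size=2):
strides = {"depth": 16, "height": 2, "width": 16}
for layout in ("NHWC", "NHCWB16"):
    ok = all(value % multiple == 0 for value, multiple in required_stride_checks(layout, 2, strides))
    print(layout, "passes" if ok else "fails")  # NHWC passes; NHCWB16 fails (16 % 32 != 0)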