From ca9cc420984eba39b85885bf0d2d7b48bb920da9 Mon Sep 17 00:00:00 2001 From: Alexander Hansson Date: Thu, 22 Jun 2023 16:01:27 +0000 Subject: MLBEDSW-7728: Fix DMA_WAITs in register_command_stream_generator * Fix bug in register_command_stream_generator where certain high-level command streams resulted in missing DMA_WAIT commands * Add unit-tests for DMA_WAIT and KERNEL_WAIT commands Signed-off-by: Alexander Hansson Change-Id: Iabb3ea3e95fa1ef933c50356d047b6b3f5aeafe3 --- ethosu/vela/api.py | 4 +- ethosu/vela/register_command_stream_generator.py | 7 +- ethosu/vela/register_command_stream_util.py | 94 ++--- .../test/extapi/test_extapi_generate_commands.py | 460 +++++++++++++++++++-- 4 files changed, 470 insertions(+), 95 deletions(-) diff --git a/ethosu/vela/api.py b/ethosu/vela/api.py index cacadf61..589a2834 100644 --- a/ethosu/vela/api.py +++ b/ethosu/vela/api.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright 2020-2022 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2023 Arm Limited and/or its affiliates # # SPDX-License-Identifier: Apache-2.0 # @@ -27,7 +27,7 @@ import numpy API_VERSION_MAJOR = 1 -API_VERSION_MINOR = 3 +API_VERSION_MINOR = 4 API_VERSION = f"{API_VERSION_MAJOR}.{API_VERSION_MINOR}" diff --git a/ethosu/vela/register_command_stream_generator.py b/ethosu/vela/register_command_stream_generator.py index f5f530d3..6001a3b8 100644 --- a/ethosu/vela/register_command_stream_generator.py +++ b/ethosu/vela/register_command_stream_generator.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright 2020-2022 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2023 Arm Limited and/or its affiliates # # SPDX-License-Identifier: Apache-2.0 # @@ -1039,13 +1039,14 @@ def generate_command_stream( if arch.is_ethos_u65_system: emit.cmd0_with_param(cmd0.NPU_SET_PARALLEL_MODE, arch.ncores - 1) - dep_watermark = Watermark(0, 0) prev_op = None # Generate register commands for all operations + outstanding_dma_ops: List[NpuOperation] = list() + outstanding_npu_ops: List[NpuOperation] = list() for op_index, npu_op in enumerate(npu_op_list): try: check_mem_limits(memory_accesses[npu_op], mem_limits) - dep_watermark, cmd_waits = get_wait_dependency(arch, npu_op_list, memory_accesses, op_index, dep_watermark) + cmd_waits = get_wait_dependency(arch, npu_op, memory_accesses, outstanding_dma_ops, outstanding_npu_ops) generate_registers_for_op(emit, npu_op, arch) except VelaError as e: # Add operation info and rethrow diff --git a/ethosu/vela/register_command_stream_util.py b/ethosu/vela/register_command_stream_util.py index 6f57f54a..b131f647 100644 --- a/ethosu/vela/register_command_stream_util.py +++ b/ethosu/vela/register_command_stream_util.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright 2020-2022 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2023 Arm Limited and/or its affiliates # # SPDX-License-Identifier: Apache-2.0 # @@ -294,62 +294,46 @@ def get_op_memory_accesses(npu_op: NpuBlockOperation, arch: ArchitectureFeatures def get_wait_dependency( - arch: ArchitectureFeatures, npu_op_list: List[NpuOperation], memory_accesses, op_index: int, watermark: Watermark + arch: ArchitectureFeatures, + npu_op: NpuOperation, + memory_accesses, + outstanding_dma_ops: List[NpuOperation], + outstanding_npu_ops: List[NpuOperation], ): """Used to calculate whether DMA wait or kernel wait operations are needed""" - npu_op = npu_op_list[op_index] - op_access = memory_accesses[npu_op] - index = op_index - 1 - - # NPU dependency tracking - npu_outstanding = -1 - npu_ops = 0 - npu_index = watermark.npu - - # DMA dependency tracking - dma_outstanding = -1 - dma_ops = 0 - dma_index = watermark.dma - - # Seek back in the command stream looking for NPU or DMA dependencies - # but only as far as the first dependency or the watermarks (dependencies - # before this point have been satisfied already). - # The watermark moves to after the latest element we must wait for, not - # the command that issues the wait. - # NPU->NPU dependency is handled via blockdep. - while (index >= npu_index) or (index >= dma_index): - prev_op = npu_op_list[index] - prev_access = memory_accesses[prev_op] - - # Check NPU consuming DMA output - if isinstance(prev_op, NpuDmaOperation): - if index >= dma_index: - if not isinstance(npu_op, NpuDmaOperation): - if (dma_outstanding == -1) and prev_access.conflicts(op_access): - dma_outstanding = dma_ops - dma_ops += 1 # Count DMA ops in the pipeline - if dma_ops >= arch.max_outstanding_dma: - dma_index = max(index + 1, dma_index) - # Check DMA consuming NPU output - else: - if index >= npu_index: - if isinstance(npu_op, NpuDmaOperation) and npu_outstanding == -1 and prev_access.conflicts(op_access): - npu_outstanding = npu_ops - npu_ops += 1 # Count NPU ops in the pipeline - if npu_ops >= arch.max_outstanding_kernels: - npu_index = max(index + 1, npu_index) - - index -= 1 - - # Update DMA watermark if we didn't see any and the NPU pipeline is full - if (dma_ops == 0) and (npu_ops >= arch.max_outstanding_kernels): - dma_index = op_index - - # Bring the search watermark forwards as we complete for those dependencies - watermark = Watermark(npu_index, dma_index) - outstanding = Watermark(npu_outstanding, dma_outstanding) - - return watermark, outstanding + kern_wait = -1 + dma_wait = -1 + op_accesses = memory_accesses[npu_op] + + if isinstance(npu_op, NpuDmaOperation): + outstanding_ops = outstanding_npu_ops + outstanding_dma_ops.append(npu_op) + if len(outstanding_dma_ops) > arch.max_outstanding_dma: + outstanding_dma_ops.pop(0) + else: + outstanding_ops = outstanding_dma_ops + outstanding_npu_ops.append(npu_op) + if len(outstanding_npu_ops) > arch.max_outstanding_kernels: + outstanding_npu_ops.pop(0) + + waits = -1 + for idx in range(len(outstanding_ops) - 1, -1, -1): + waits += 1 + other_op = outstanding_ops[idx] + other_accesses = memory_accesses[other_op] + if other_accesses.conflicts(op_accesses): + if isinstance(npu_op, NpuDmaOperation): + kern_wait = waits + else: + dma_wait = waits + # Current op needs to wait, and after it has waited, + # outstanding_ops[0..idx] are not outstanding any longer + for i in range(idx + 1): + outstanding_ops.pop(0) + break + + cmd_waits = Watermark(kern_wait, dma_wait) + return cmd_waits # ------------------------------------------------------------------- diff --git a/ethosu/vela/test/extapi/test_extapi_generate_commands.py b/ethosu/vela/test/extapi/test_extapi_generate_commands.py index b1d31413..441c4a4f 100644 --- a/ethosu/vela/test/extapi/test_extapi_generate_commands.py +++ b/ethosu/vela/test/extapi/test_extapi_generate_commands.py @@ -52,33 +52,30 @@ from ethosu.vela.register_command_stream_generator import generate_command_strea from ethosu.vela.register_command_stream_util import get_address_ranges -def check_cmd0(cmd_stream, cmd, param, idx=0): +def find_cmd0(cmd_stream, cmd, param, idx=0): """ - Checks that command + parameter exists in the command stream after position idx. - Returns the position in the command stream (if found) otherwise asserts. + Searches the command stream from position idx + Returns the position of cmd + param (if found) otherwise -1. """ param = int(param) & 0xFFFF command = cmd.value | (param << 16) for i in range(idx, len(cmd_stream)): if cmd_stream[i] == command: return i - assert False, f"{cmd} {param} not found in the command stream (after position {idx})" + return -1 -def check_cmd1(cmd_stream, cmd, offset, param=0x0, idx=0): +def check_cmd0(cmd_stream, cmd, param, idx=0): """ Checks that command + parameter exists in the command stream after position idx. - Returns the position in the command stream (if found) otherwise asserts. + Returns the position (if found) otherwise asserts. """ - offset = int(offset) & 0xFFFFFFFF - command = cmd.value | CmdMode.Payload32.value | (param << 16) - for i in range(idx, len(cmd_stream) - 1): - if cmd_stream[i] == command and cmd_stream[i + 1] == offset: - return i - assert False, f"{cmd} {offset} {param} not found in the command stream (after position {idx})" + pos = find_cmd0(cmd_stream, cmd, param, idx) + assert pos >= 0, f"{cmd} {param} not found in the command stream (after position {idx})" + return pos -def find_cmd0(cmd_stream, cmd) -> int: +def find_param_cmd0(cmd_stream, cmd) -> int: """Returns parameter of the first command in the stream that matches the given command""" for command in cmd_stream: if (command & 0xFFFF) == cmd.value: @@ -86,6 +83,29 @@ def find_cmd0(cmd_stream, cmd) -> int: assert False, f"Not in command stream: {cmd}" +def find_cmd1(cmd_stream, cmd, offset, param=0x0, idx=0): + """ + Searches the command stream from position idx + Returns the position of the command (if found) otherwise -1. + """ + offset = int(offset) & 0xFFFFFFFF + command = cmd.value | CmdMode.Payload32.value | (param << 16) + for i in range(idx, len(cmd_stream) - 1): + if cmd_stream[i] == command and cmd_stream[i + 1] == offset: + return i + return -1 + + +def check_cmd1(cmd_stream, cmd, offset, param=0x0, idx=0): + """ + Checks that command + parameter exists in the command stream after position idx. + Returns the position of the command (if found) otherwise asserts. + """ + pos = find_cmd1(cmd_stream, cmd, offset, param, idx) + assert pos >= 0, f"{cmd} {offset} {param} not found in the command stream (after position {idx})" + return pos + + def create_feature_map( shape: NpuShape3D, region: int, @@ -107,24 +127,51 @@ def create_feature_map( return fm +def create_conv2d( + ifm: NpuFeatureMap, + ofm: NpuFeatureMap, + kernel: NpuKernel, + weights: NpuAddressRange, + bias: NpuAddressRange, + padding: NpuPadding, + block_config: NpuShape3D, +): + """Creates a Conv2D operation""" + op = NpuConv2DOperation() + op.ifm = ifm + op.ofm = ofm + op.kernel = kernel + op.weights = [weights] + if bias: + op.biases = [bias] + op.padding = padding + op.block_traversal = NpuBlockTraversal.PART_KERNEL_FIRST + op.block_config = block_config + return op + + def test_conv2d(): """Tests command stream generation for a conv2d operation""" - op = NpuConv2DOperation() - op.ifm = create_feature_map( - NpuShape3D(height=30, width=62, depth=46), 1, 512, quant=NpuQuantization(scale_f32=0.007843138, zero_point=128) + op = create_conv2d( + ifm=create_feature_map( + NpuShape3D(height=30, width=62, depth=46), + 1, + 512, + quant=NpuQuantization(scale_f32=0.007843138, zero_point=128), + ), + ofm=create_feature_map( + NpuShape3D(height=30, width=31, depth=46), + 1, + 0x14E40, + quant=NpuQuantization(scale_f32=0.20392157, zero_point=128), + ), + kernel=NpuKernel(3, 2, 2, 1), + weights=NpuAddressRange(region=0, address=0, length=7696), + bias=NpuAddressRange(region=0, address=32000, length=464), + padding=NpuPadding(top=0, left=0, right=1, bottom=1), + block_config=NpuShape3D(height=16, width=4, depth=16), ) - op.ofm = create_feature_map( - NpuShape3D(height=30, width=31, depth=46), - 1, - 0x14E40, - quant=NpuQuantization(scale_f32=0.20392157, zero_point=128), - ) - op.kernel = NpuKernel(3, 2, 2, 1) - op.weights = [NpuAddressRange(region=0, address=0, length=7696)] - op.biases = [NpuAddressRange(region=0, address=32000, length=464)] - op.padding = NpuPadding(top=0, left=0, right=1, bottom=1) - op.block_traversal = NpuBlockTraversal.PART_KERNEL_FIRST - op.block_config = NpuShape3D(height=16, width=4, depth=16) + cmds = npu_generate_register_command_stream([op], NpuAccelerator.Ethos_U55_128) set_cmds = list() set_cmds.append(check_cmd0(cmds, cmd0.NPU_SET_IFM_REGION, 1)) @@ -181,8 +228,8 @@ def test_conv2d(): set_cmds.append(check_cmd0(cmds, cmd0.NPU_SET_BLOCKDEP, 0)) conv_idx = check_cmd0(cmds, cmd0.NPU_OP_CONV, 0) assert all([conv_idx > x for x in set_cmds]), "NPU_OP_CONV occured before the last SET operation." - ib_end = find_cmd0(cmds, cmd0.NPU_SET_IFM_IB_END) - ab_start = find_cmd0(cmds, cmd0.NPU_SET_AB_START) + ib_end = find_param_cmd0(cmds, cmd0.NPU_SET_IFM_IB_END) + ab_start = find_param_cmd0(cmds, cmd0.NPU_SET_AB_START) assert ib_end > 0 assert ib_end <= ab_start @@ -316,14 +363,14 @@ def test_mul_with_broadcast_and_relu(): set_cmds.append(check_cmd0(cmds, cmd0.NPU_SET_BLOCKDEP, 0)) elementwise_idx = check_cmd0(cmds, cmd0.NPU_OP_ELEMENTWISE, 0) assert all([elementwise_idx > x for x in set_cmds]), "NPU_OP_ELEMENTWISE occured before the last SET cmd" - ab_start = find_cmd0(cmds, cmd0.NPU_SET_AB_START) + ab_start = find_param_cmd0(cmds, cmd0.NPU_SET_AB_START) assert ab_start > 0 - ifm2_ib_start = find_cmd0(cmds, cmd0.NPU_SET_IFM2_IB_START) + ifm2_ib_start = find_param_cmd0(cmds, cmd0.NPU_SET_IFM2_IB_START) assert 0 < ifm2_ib_start < ab_start # Check that block width/height were generated that fit - blk_height = find_cmd0(cmds, cmd0.NPU_SET_OFM_BLK_HEIGHT_M1) - blk_width = find_cmd0(cmds, cmd0.NPU_SET_OFM_BLK_WIDTH_M1) - blk_depth = find_cmd0(cmds, cmd0.NPU_SET_OFM_BLK_DEPTH_M1) + blk_height = find_param_cmd0(cmds, cmd0.NPU_SET_OFM_BLK_HEIGHT_M1) + blk_width = find_param_cmd0(cmds, cmd0.NPU_SET_OFM_BLK_WIDTH_M1) + blk_depth = find_param_cmd0(cmds, cmd0.NPU_SET_OFM_BLK_DEPTH_M1) assert blk_height >= 0 assert blk_width >= 0 assert blk_depth >= 0 @@ -384,6 +431,349 @@ def test_dma_op(): check_cmd0(cmds, cmd0.NPU_OP_POOL, 1, dma_wait_idx) +def setup_memory_barrier_tests(): + """ + Sets up 4 CONV operations and 4 DMA operations. + Where dma_ops[i] provides the weights for conv[i] + """ + ifm_addresses = [0x27100, 0x0, 0x27100, 0x0] + ofm_addresses = [0x0, 0x27100, 0x0, 0x27100] + weight_addr_r0 = [0x80, 0x150, 0x220, 0x2F0] + weight_addr_r1 = [0x2E650, 0x4E220, 0x4E2F0, 0x4E3C0] + weight_len = 208 + conv_ops = list() + dma_ops = list() + + for i in range(4): + weights_flash = NpuAddressRange(region=0, address=weight_addr_r0[i], length=weight_len) + weights_sram = NpuAddressRange(region=1, address=weight_addr_r1[i], length=weight_len) + dma_op = NpuDmaOperation(weights_flash, weights_sram) + conv = create_conv2d( + ifm=create_feature_map( + shape=NpuShape3D(height=100, width=100, depth=3), + region=1, + address=ifm_addresses[i], + quant=NpuQuantization(scale_f32=1.0, zero_point=0), + ), + ofm=create_feature_map( + shape=NpuShape3D(height=100, width=100, depth=3), + region=1, + address=ofm_addresses[i], + quant=NpuQuantization(scale_f32=1.0, zero_point=0), + ), + kernel=NpuKernel(3, 3, 1, 1), + weights=weights_sram, + bias=None, + padding=NpuPadding(top=1, left=1, right=1, bottom=1), + block_config=NpuShape3D(height=20, width=20, depth=8), + ) + conv_ops.append(conv) + dma_ops.append(dma_op) + return conv_ops, dma_ops + + +def test_dma_wait_1(): + """ + Tests that DMA_WAIT barriers are properly inserted + by the register command stream generator. + high-level command stream: + dma[0] + conv[0] + dma[1] + conv[1] + dma[2] + conv[2] + Where dma[i] provides the weights for conv[i] + """ + conv_ops, dma_ops = setup_memory_barrier_tests() + + hlvl_cmds = [dma_ops[0], conv_ops[0], dma_ops[1], conv_ops[1], dma_ops[2], conv_ops[2]] + + # Ethos-U55 + cmds = npu_generate_register_command_stream(hlvl_cmds, NpuAccelerator.Ethos_U55_256) + pos = 0 + pos = check_cmd0(cmds, cmd0.NPU_OP_DMA_START, 0, pos) + pos = check_cmd0(cmds, cmd0.NPU_OP_DMA_WAIT, 0, pos) + pos = check_cmd0(cmds, cmd0.NPU_OP_CONV, 0, pos) + pos = check_cmd0(cmds, cmd0.NPU_OP_DMA_START, 0, pos) + pos = check_cmd0(cmds, cmd0.NPU_OP_DMA_WAIT, 0, pos) + pos = check_cmd0(cmds, cmd0.NPU_OP_CONV, 0, pos) + pos = check_cmd0(cmds, cmd0.NPU_OP_DMA_START, 0, pos) + pos = check_cmd0(cmds, cmd0.NPU_OP_DMA_WAIT, 0, pos) + pos = check_cmd0(cmds, cmd0.NPU_OP_CONV, 0, pos) + + # Ethos-U65 + cmds = npu_generate_register_command_stream(hlvl_cmds, NpuAccelerator.Ethos_U65_256) + pos = 0 + pos = check_cmd0(cmds, cmd0.NPU_OP_DMA_START, 0, pos) + pos = check_cmd0(cmds, cmd0.NPU_OP_DMA_WAIT, 0, pos) + pos = check_cmd0(cmds, cmd0.NPU_OP_CONV, 0, pos) + pos = check_cmd0(cmds, cmd0.NPU_OP_DMA_START, 0, pos) + pos = check_cmd0(cmds, cmd0.NPU_OP_DMA_WAIT, 0, pos) + pos = check_cmd0(cmds, cmd0.NPU_OP_CONV, 0, pos) + pos = check_cmd0(cmds, cmd0.NPU_OP_DMA_START, 0, pos) + pos = check_cmd0(cmds, cmd0.NPU_OP_DMA_WAIT, 0, pos) + pos = check_cmd0(cmds, cmd0.NPU_OP_CONV, 0, pos) + + +def test_dma_wait_2(): + """ + Tests that DMA_WAIT barriers are properly inserted + by the register command stream generator. + high-level command stream: + dma[0] + dma[1] + conv[0] + dma[2] + conv[1] + conv[2] + Where dma[i] provides the weights for conv[i] + """ + conv_ops, dma_ops = setup_memory_barrier_tests() + + hlvl_cmds = [dma_ops[0], dma_ops[1], conv_ops[0], dma_ops[2], conv_ops[1], conv_ops[2]] + + # Ethos-U55 + cmds = npu_generate_register_command_stream(hlvl_cmds, NpuAccelerator.Ethos_U55_256) + pos = 0 + pos = check_cmd0(cmds, cmd0.NPU_OP_DMA_START, 0, pos) + pos = check_cmd0(cmds, cmd0.NPU_OP_DMA_START, 0, pos + 1) + pos = check_cmd0(cmds, cmd0.NPU_OP_CONV, 0, pos) + pos = check_cmd0(cmds, cmd0.NPU_OP_DMA_START, 0, pos) + pos = check_cmd0(cmds, cmd0.NPU_OP_CONV, 0, pos) + pos = check_cmd0(cmds, cmd0.NPU_OP_DMA_WAIT, 0, pos) + pos = check_cmd0(cmds, cmd0.NPU_OP_CONV, 0, pos) + + # Ethos-U65 + cmds = npu_generate_register_command_stream(hlvl_cmds, NpuAccelerator.Ethos_U65_256) + pos = 0 + pos = check_cmd0(cmds, cmd0.NPU_OP_DMA_START, 0, pos) + pos = check_cmd0(cmds, cmd0.NPU_OP_DMA_START, 0, pos + 1) + pos = check_cmd0(cmds, cmd0.NPU_OP_DMA_WAIT, 1, pos) + pos = check_cmd0(cmds, cmd0.NPU_OP_CONV, 0, pos) + pos = check_cmd0(cmds, cmd0.NPU_OP_DMA_START, 0, pos) + pos = check_cmd0(cmds, cmd0.NPU_OP_DMA_WAIT, 1, pos) + pos = check_cmd0(cmds, cmd0.NPU_OP_CONV, 0, pos) + pos = check_cmd0(cmds, cmd0.NPU_OP_DMA_WAIT, 0, pos) + pos = check_cmd0(cmds, cmd0.NPU_OP_CONV, 0, pos) + + +def test_dma_wait_3(): + """ + Tests that DMA_WAIT barriers are properly inserted + by the register command stream generator. + high-level command stream: + dma[0] + dma[1] + dma[2] + conv[0] + dma[3] + conv[1] + conv[2] + conv[3] + Where dma[i] provides the weights for conv[i] + """ + conv_ops, dma_ops = setup_memory_barrier_tests() + + hlvl_cmds = [dma_ops[0], dma_ops[1], dma_ops[2], conv_ops[0], dma_ops[3], conv_ops[1], conv_ops[2], conv_ops[3]] + + # Ethos-U55 + cmds = npu_generate_register_command_stream(hlvl_cmds, NpuAccelerator.Ethos_U55_256) + pos = 0 + pos = check_cmd0(cmds, cmd0.NPU_OP_DMA_START, 0, pos) + pos = check_cmd0(cmds, cmd0.NPU_OP_DMA_START, 0, pos + 1) + pos = check_cmd0(cmds, cmd0.NPU_OP_DMA_START, 0, pos + 1) + pos = check_cmd0(cmds, cmd0.NPU_OP_CONV, 0, pos) + pos = check_cmd0(cmds, cmd0.NPU_OP_DMA_START, 0, pos) + pos = check_cmd0(cmds, cmd0.NPU_OP_CONV, 0, pos) + pos = check_cmd0(cmds, cmd0.NPU_OP_CONV, 0, pos + 1) + pos = check_cmd0(cmds, cmd0.NPU_OP_DMA_WAIT, 0, pos) + pos = check_cmd0(cmds, cmd0.NPU_OP_CONV, 0, pos) + + # Ethos-U65 + cmds = npu_generate_register_command_stream(hlvl_cmds, NpuAccelerator.Ethos_U65_256) + pos = 0 + pos = check_cmd0(cmds, cmd0.NPU_OP_DMA_START, 0, pos) + pos = check_cmd0(cmds, cmd0.NPU_OP_DMA_START, 0, pos + 1) + pos = check_cmd0(cmds, cmd0.NPU_OP_DMA_START, 0, pos + 1) + pos = check_cmd0(cmds, cmd0.NPU_OP_CONV, 0, pos) + pos = check_cmd0(cmds, cmd0.NPU_OP_DMA_START, 0, pos) + pos = check_cmd0(cmds, cmd0.NPU_OP_CONV, 0, pos) + pos = check_cmd0(cmds, cmd0.NPU_OP_DMA_WAIT, 1, pos) + pos = check_cmd0(cmds, cmd0.NPU_OP_CONV, 0, pos) + pos = check_cmd0(cmds, cmd0.NPU_OP_DMA_WAIT, 0, pos) + pos = check_cmd0(cmds, cmd0.NPU_OP_CONV, 0, pos) + + +def test_dma_wait_4(): + """ + Tests that DMA_WAIT barriers are properly inserted + by the register command stream generator. + high-level command stream: + dma[0] + dma[1] + dma[2] + conv[0] + conv[1] + dma[3] + conv[2] + conv[3] + Where dma[i] provides the weights for conv[i] + """ + conv_ops, dma_ops = setup_memory_barrier_tests() + hlvl_cmds = [dma_ops[0], dma_ops[1], dma_ops[2], conv_ops[0], conv_ops[1], dma_ops[3], conv_ops[2], conv_ops[3]] + + # Ethos-U55 + cmds = npu_generate_register_command_stream(hlvl_cmds, NpuAccelerator.Ethos_U55_256) + pos = 0 + pos = check_cmd0(cmds, cmd0.NPU_OP_DMA_START, 0, pos) + pos = check_cmd0(cmds, cmd0.NPU_OP_DMA_START, 0, pos + 1) + pos = check_cmd0(cmds, cmd0.NPU_OP_DMA_START, 0, pos + 1) + pos = check_cmd0(cmds, cmd0.NPU_OP_CONV, 0, pos) + pos = check_cmd0(cmds, cmd0.NPU_OP_CONV, 0, pos + 1) + pos = check_cmd0(cmds, cmd0.NPU_OP_DMA_START, 0, pos) + pos = check_cmd0(cmds, cmd0.NPU_OP_CONV, 0, pos) + pos = check_cmd0(cmds, cmd0.NPU_OP_DMA_WAIT, 0, pos) + pos = check_cmd0(cmds, cmd0.NPU_OP_CONV, 0, pos) + + # Ethos-U65 + cmds = npu_generate_register_command_stream(hlvl_cmds, NpuAccelerator.Ethos_U65_256) + pos = 0 + pos = check_cmd0(cmds, cmd0.NPU_OP_DMA_START, 0, pos) + pos = check_cmd0(cmds, cmd0.NPU_OP_DMA_START, 0, pos + 1) + pos = check_cmd0(cmds, cmd0.NPU_OP_DMA_START, 0, pos + 1) + pos = check_cmd0(cmds, cmd0.NPU_OP_CONV, 0, pos) + pos = check_cmd0(cmds, cmd0.NPU_OP_DMA_WAIT, 1, pos) + pos = check_cmd0(cmds, cmd0.NPU_OP_CONV, 0, pos) + pos = check_cmd0(cmds, cmd0.NPU_OP_DMA_START, 0, pos) + pos = check_cmd0(cmds, cmd0.NPU_OP_DMA_WAIT, 1, pos) + pos = check_cmd0(cmds, cmd0.NPU_OP_CONV, 0, pos) + pos = check_cmd0(cmds, cmd0.NPU_OP_DMA_WAIT, 0, pos) + pos = check_cmd0(cmds, cmd0.NPU_OP_CONV, 0, pos) + + +def test_dma_wait_5(): + """ + Tests that DMA_WAIT barriers are properly inserted + by the register command stream generator. + high-level command stream: + dma[0] + dma[1] + dma[2] + conv[0] + conv[1] + conv[2] + dma[3] + conv[3] + Where dma[i] provides the weights for conv[i] + """ + conv_ops, dma_ops = setup_memory_barrier_tests() + hlvl_cmds = [dma_ops[0], dma_ops[1], dma_ops[2], conv_ops[0], conv_ops[1], conv_ops[2], dma_ops[3], conv_ops[3]] + + # Ethos-U55 + cmds = npu_generate_register_command_stream(hlvl_cmds, NpuAccelerator.Ethos_U55_256) + pos = 0 + pos = check_cmd0(cmds, cmd0.NPU_OP_DMA_START, 0, pos) + pos = check_cmd0(cmds, cmd0.NPU_OP_DMA_START, 0, pos + 1) + pos = check_cmd0(cmds, cmd0.NPU_OP_DMA_START, 0, pos + 1) + pos = check_cmd0(cmds, cmd0.NPU_OP_CONV, 0, pos) + pos = check_cmd0(cmds, cmd0.NPU_OP_CONV, 0, pos + 1) + pos = check_cmd0(cmds, cmd0.NPU_OP_DMA_WAIT, 0, pos) + pos = check_cmd0(cmds, cmd0.NPU_OP_CONV, 0, pos) + pos = check_cmd0(cmds, cmd0.NPU_OP_DMA_START, 0, pos) + pos = check_cmd0(cmds, cmd0.NPU_OP_DMA_WAIT, 0, pos) + pos = check_cmd0(cmds, cmd0.NPU_OP_CONV, 0, pos) + + # Ethos-U65 + cmds = npu_generate_register_command_stream(hlvl_cmds, NpuAccelerator.Ethos_U65_256) + pos = 0 + pos = check_cmd0(cmds, cmd0.NPU_OP_DMA_START, 0, pos) + pos = check_cmd0(cmds, cmd0.NPU_OP_DMA_START, 0, pos + 1) + pos = check_cmd0(cmds, cmd0.NPU_OP_DMA_START, 0, pos + 1) + pos = check_cmd0(cmds, cmd0.NPU_OP_CONV, 0, pos) + pos = check_cmd0(cmds, cmd0.NPU_OP_DMA_WAIT, 1, pos) + pos = check_cmd0(cmds, cmd0.NPU_OP_CONV, 0, pos) + pos = check_cmd0(cmds, cmd0.NPU_OP_DMA_WAIT, 0, pos) + pos = check_cmd0(cmds, cmd0.NPU_OP_CONV, 0, pos) + pos = check_cmd0(cmds, cmd0.NPU_OP_DMA_START, 0, pos) + pos = check_cmd0(cmds, cmd0.NPU_OP_DMA_WAIT, 0, pos) + pos = check_cmd0(cmds, cmd0.NPU_OP_CONV, 0, pos) + + +def test_dma_wait_6(): + """ + Verify that DMA waits are not unnecessarily inserted + between unrelated DMA and KERNEL commands + """ + conv_ops, dma_ops = setup_memory_barrier_tests() + cmds = npu_generate_register_command_stream([dma_ops[0], conv_ops[1]], NpuAccelerator.Ethos_U65_256) + start_pos = check_cmd0(cmds, cmd0.NPU_OP_DMA_START, 0) + check_cmd0(cmds, cmd0.NPU_OP_CONV, 0, start_pos) + + wait_pos = find_cmd0(cmds, cmd0.NPU_OP_DMA_WAIT, 0) + assert ( + wait_pos == -1 + ), f"A DMA_WAIT command was unnecessarily inserted (pos {wait_pos}) between unrelated DMA and KERNEL commands" + + +def test_kernel_wait_0(): + """ + Verify that KERNEL_WAIT 0 is generated. + dma_op[0] writes to the weight-address for conv[0] + """ + conv_ops, dma_ops = setup_memory_barrier_tests() + cmds = npu_generate_register_command_stream([conv_ops[0], dma_ops[0]], NpuAccelerator.Ethos_U65_256) + pos = check_cmd0(cmds, cmd0.NPU_OP_CONV, 0) + pos = check_cmd0(cmds, cmd0.NPU_OP_KERNEL_WAIT, 0, pos) + pos = check_cmd0(cmds, cmd0.NPU_OP_DMA_START, 0, pos) + + +def test_kernel_wait_1(): + """ + Verify that KERNEL_WAIT 1 is generated. + dma_op[0] writes to the weight-address for conv[0] + """ + conv_ops, dma_ops = setup_memory_barrier_tests() + cmds = npu_generate_register_command_stream([conv_ops[0], conv_ops[1], dma_ops[0]], NpuAccelerator.Ethos_U65_256) + pos = check_cmd0(cmds, cmd0.NPU_OP_CONV, 0) + pos = check_cmd0(cmds, cmd0.NPU_OP_CONV, 0, pos + 1) + pos = check_cmd0(cmds, cmd0.NPU_OP_KERNEL_WAIT, 1, pos) + pos = check_cmd0(cmds, cmd0.NPU_OP_DMA_START, 0, pos) + + +def test_kernel_wait_2(): + """ + Verify that KERNEL_WAIT 2 is generated. + dma_ops[i] writes to the weight-address for conv_ops[i] + """ + conv_ops, dma_ops = setup_memory_barrier_tests() + cmds = npu_generate_register_command_stream( + [conv_ops[0], conv_ops[1], conv_ops[2], dma_ops[0]], NpuAccelerator.Ethos_U65_256 + ) + pos = check_cmd0(cmds, cmd0.NPU_OP_CONV, 0) + pos = check_cmd0(cmds, cmd0.NPU_OP_CONV, 0, pos + 1) + pos = check_cmd0(cmds, cmd0.NPU_OP_CONV, 0, pos + 1) + pos = check_cmd0(cmds, cmd0.NPU_OP_KERNEL_WAIT, 2, pos) + pos = check_cmd0(cmds, cmd0.NPU_OP_DMA_START, 0, pos) + + +def test_kernel_wait_3(): + """ + Verify that KERNEL_WAIT 2 is generated. + dma_ops[i] writes to the weight-address for conv_ops[i] + """ + conv_ops, dma_ops = setup_memory_barrier_tests() + cmds = npu_generate_register_command_stream( + [conv_ops[0], conv_ops[1], conv_ops[2], dma_ops[3], dma_ops[0]], NpuAccelerator.Ethos_U65_256 + ) + pos = check_cmd0(cmds, cmd0.NPU_OP_CONV, 0) + pos = check_cmd0(cmds, cmd0.NPU_OP_CONV, 0, pos + 1) + pos = check_cmd0(cmds, cmd0.NPU_OP_CONV, 0, pos + 1) + pos = check_cmd0(cmds, cmd0.NPU_OP_DMA_START, 0, pos) + pos = check_cmd0(cmds, cmd0.NPU_OP_KERNEL_WAIT, 2, pos) + pos = check_cmd0(cmds, cmd0.NPU_OP_DMA_START, 0, pos) + + def test_check_mem_limits(): # Tests that no code is generated with addresses out of bounds conv_op = create_fully_connected_op() -- cgit v1.2.1