about summary refs log tree commit diff
diff options
context:
space:
mode:
author Tim Hall <tim.hall@arm.com> 2020-08-04 21:40:14 +0100
committer tim.hall <tim.hall@arm.com> 2020-08-18 18:52:18 +0000
commit 289a41dd26913e4d88a38c2d72057aa52d2fab09 (patch)
tree aade435f3fbb156cf0c8e9465e01a7dd82cb86ad
parent f767b937c12935be3cb1f9ee406fbb796176a40c (diff)
download ethos-u-vela-289a41dd26913e4d88a38c2d72057aa52d2fab09.tar.gz
Vela: Rework NPU/DMA dependency insertion (for MLBEDSW-2620)
- This commit removes unnecessary dependency checks and implements
  on-demand calculation of the NPU/DMA dependencies.

Signed-off-by: <tim.hall@arm.com>
Change-Id: I85e681d1ab133bd88f64296dc00500f3c188e777
-rw-r--r-- ethosu/vela/architecture_features.py 3
-rw-r--r-- ethosu/vela/register_command_stream_generator.py 166
2 files changed, 76 insertions, 93 deletions
diff --git a/ethosu/vela/architecture_features.py b/ethosu/vela/architecture_features.py
index 265af426..2e53a695 100644
--- a/ethosu/vela/architecture_features.py
+++ b/ethosu/vela/architecture_features.py
@@ -196,6 +196,9 @@ Note the difference between ArchitectureFeatures and CompilerOptions
self.system_config = system_config
self.is_yoda_system = self.accelerator_config in (Accelerator.Yoda_256, Accelerator.Yoda_512)
+ self.max_outstanding_dma = 2 if self.is_yoda_system else 1
+ self.max_outstanding_kernels = 3
+
self.ncores = accel_config.cores
self.ofm_ublock = accel_config.ofm_ublock
self.ifm_ublock = accel_config.ifm_ublock
diff --git a/ethosu/vela/register_command_stream_generator.py b/ethosu/vela/register_command_stream_generator.py
index 4a9b0719..bdc37223 100644
--- a/ethosu/vela/register_command_stream_generator.py
+++ b/ethosu/vela/register_command_stream_generator.py
@@ -18,6 +18,7 @@
# all the register settings. Calculates dependencies between commands and inserts wait operations. And generates a bit
# stream suitable for interpretation by the Ethos-U55 processor.
from collections import defaultdict
+from collections import namedtuple
from enum import Enum
from enum import IntEnum
@@ -165,12 +166,8 @@ class CommandStreamEmitter:
# This is not a redundant command, actually write it
self.cmd_stream.append((command, offset))
- def cmd_wait(self, cmd, param, absolute_wait_time):
- if absolute_wait_time <= self.last_absolute_wait[cmd]:
- return
-
- self.last_absolute_wait[cmd] = absolute_wait_time
- param = int(param)
+ def cmd_wait(self, cmd, channel, outstanding_count):
+ param = (16 * channel) + outstanding_count
command = ((param & 0xFFFF) << 16) | cmd.value
self.cmd_stream.append((command,))
@@ -182,75 +179,64 @@ class CommandStreamEmitter:
self.get_reg_machine(cmd).switch_bank()
-def calc_command_dependencies(cmd_stream, arch):
- cmd_starts = {}
- cmd_ends = {}
- memory_accesses = {}
+Watermark = namedtuple("Watermark", ["npu", "dma"])
- # Keep track of accumulated number of commands in command stream.
- # First element kernel ops: (# of blocks, # of commands)
- # Second element DMA ops: (# of commands)
- pos = np.array((np.array((0, 0)), np.array([0])), dtype=object)
-
- dependencies = {}
-
- for cmd in cmd_stream:
- cmd_starts[cmd] = pos
- op_count = cmd.get_operation_count()
- # Keep track of both num blocks and commands
- cmd_add = 0 if (op_count[0] == 0) else 1
- pos = np.array((pos[0] + np.array((op_count[0], cmd_add)), pos[1] + np.array([op_count[1]])), dtype=object)
- cmd_ends[cmd] = np.array((pos[0], pos[1]), dtype=object)
- memory_accesses[cmd] = cmd.get_memory_accesses()
-
- for idx, cmd in enumerate(cmd_stream):
- curr_accesses = memory_accesses[cmd]
- # Keep track of command dependency.
- # First element kernel ops: (# of blocks, # of commands)
- # Second element DMA ops: (# of commands)
- dep_offsets = np.array((np.array((-1, -1)), np.array([-1])), dtype=object)
- dep_cmds = [None] * CommandType.Size.value
- if idx > 0:
- # Look at the previous commands in backwards order
- for prev_cmd in cmd_stream[idx - 1 :: -1]:
- assert prev_cmd is not cmd
- if dep_cmds[prev_cmd.cmdtype] is None:
- is_dependency = False
- if cmd.cmdtype == CommandType.NpuStripe and prev_cmd.cmdtype == CommandType.NpuStripe:
- # Special handling here, as dpu -> dpu operations require additional care
- if not SharedBufferAllocation.is_compatible(prev_cmd.ps.shared_buffer, cmd.ps.shared_buffer):
- is_dependency = True
- elif memory_accesses[prev_cmd].conflicts(curr_accesses):
- is_dependency = True
- else:
- if memory_accesses[prev_cmd].conflicts(curr_accesses) or (
- prev_cmd.cmdtype == CommandType.DMA and prev_cmd.in_tensor.purpose == TensorPurpose.LUT
- ):
- is_dependency = True
-
- if is_dependency:
- new_offset = cmd_ends[prev_cmd][prev_cmd.cmdtype]
- if new_offset[0] > dep_offsets[prev_cmd.cmdtype][0]:
- dep_cmds[prev_cmd.cmdtype] = prev_cmd
- dep_offsets[prev_cmd.cmdtype] = new_offset
-
- # Check if we've got dependencies for all commands, in which case we can early out
- for dep in dep_cmds:
- if dep is None:
- break
- else:
- break # all handled
- # Convert absolute to relative dependencies, using None to signal the special case of no
- # dependency of this kind
- res = [None] * CommandType.Size.value
- for i in range(CommandType.Size.value):
- if dep_cmds[i] is not None:
- res[i] = cmd_starts[cmd][i] - dep_offsets[i]
+def get_cmd_wait_dependency(arch, cmd_stream, memory_accesses, cmd_index, watermark: Watermark):
+ cmd = cmd_stream[cmd_index]
+ cmd_access = memory_accesses[cmd]
+ index = cmd_index - 1
+
+ # NPU dependency tracking
+ npu_outstanding = -1
+ npu_ops = 0
+ npu_index = watermark.npu
+
+ # DMA dependency tracking
+ dma_outstanding = -1
+ dma_ops = 0
+ dma_index = watermark.dma
+
+ # Seek back in the command stream looking for NPU or DMA dependencies
+ # but only as far as the first dependency or the watermarks (dependencies
+ # before this point have been satisfied already).
+ # The watermark moves to after the latest element we must wait for, not
+ # the command that issues the wait.
+ # NPU->NPU dependency is handled via blockdep.
+ while (index >= npu_index) or (index >= dma_index):
+ prev_cmd = cmd_stream[index]
+ prev_access = memory_accesses[prev_cmd]
+
+ # Check DMA consuming NPU output
+ if prev_cmd.cmdtype == CommandType.NpuStripe:
+ if index >= npu_index:
+ if (cmd.cmdtype == CommandType.DMA) and (npu_outstanding == -1) and prev_access.conflicts(cmd_access):
+ npu_outstanding = npu_ops
+ npu_ops = npu_ops + 1 # Count NPU ops in the pipeline
+ if npu_ops >= arch.max_outstanding_kernels:
+ npu_index = max(index + 1, npu_index)
+
+ # Check NPU consuming DMA output
+ elif prev_cmd.cmdtype == CommandType.DMA:
+ if index >= dma_index:
+ if cmd.cmdtype == CommandType.NpuStripe:
+ if (dma_outstanding == -1) and prev_access.conflicts(cmd_access):
+ dma_outstanding = dma_ops
+ dma_ops = dma_ops + 1 # Count DMA ops in the pipeline
+ if dma_ops >= arch.max_outstanding_dma:
+ dma_index = max(index + 1, dma_index)
- dependencies[cmd] = cmd_starts[cmd], res
+ index = index - 1
- return dependencies
+ # Update DMA watermark if we didn't see any and the NPU pipeline is full
+ if (dma_ops == 0) and (npu_ops >= arch.max_outstanding_kernels):
+ dma_index = cmd_index
+
+ # Bring the search watermark forwards as we complete for those dependencies
+ watermark = Watermark(npu_index, dma_index)
+ outstanding = Watermark(npu_outstanding, dma_outstanding)
+
+ return watermark, outstanding
def get_op_kernel(ps):
@@ -385,13 +371,20 @@ def generate_register_command_stream(nng, sg, arch, verbose=False):
}
cmd_stream = []
+ memory_accesses = {}
for cmd in sg.high_level_command_stream:
if cmd.cmdtype == CommandType.NpuStripe and cmd.ps.npu_block_type == NpuBlockType.Default:
print("Warning: Skipping register command stream generation for", cmd.ps)
else:
cmd_stream.append(cmd)
+ memory_accesses[cmd] = cmd.get_memory_accesses()
- dependencies = calc_command_dependencies(cmd_stream, arch)
+ def emit_cmd_waits(cmd_waits):
+ if cmd_waits.npu >= 0:
+ emit.cmd_wait(cmd0.NPU_OP_KERNEL_WAIT, 0, cmd_waits.npu)
+
+ if cmd_waits.dma >= 0:
+ emit.cmd_wait(cmd0.NPU_OP_DMA_WAIT, 0, cmd_waits.dma)
# Initialise operator dependency state
prev_ifm_rect = cur_ifm_rect = None
@@ -401,27 +394,14 @@ def generate_register_command_stream(nng, sg, arch, verbose=False):
prev_kernel = cur_kernel = None
prev_cmd = None
- def emit_wait_commands(cmd):
- # The command is fully set up, emit whatever wait commands we need
- absolute_dep, relative_dep = dependencies[cmd]
- if relative_dep[CommandType.NpuStripe] is not None:
- if cmd.cmdtype == CommandType.DMA:
- param = relative_dep[CommandType.NpuStripe][1]
- if param <= 3:
- emit.cmd_wait(cmd0.NPU_OP_KERNEL_WAIT, param, absolute_dep[CommandType.NpuStripe][1])
- else:
- param = relative_dep[CommandType.NpuStripe][0]
- param = min(param, 0xFFFF) # Clamp to allowable wait amount
-
- if relative_dep[CommandType.DMA] is not None:
- # TODO This can be optimized for yoda
- param = 0
- emit.cmd_wait(cmd0.NPU_OP_DMA_WAIT, param, absolute_dep[CommandType.DMA][0])
-
if arch.is_yoda_system:
emit.cmd0_with_param(cmd0.NPU_SET_PARALLEL_MODE, arch.ncores - 1)
- for cmd in cmd_stream:
+ dep_watermark = Watermark(0, 0)
+
+ for cmd_index, cmd in enumerate(cmd_stream):
+ dep_watermark, cmd_waits = get_cmd_wait_dependency(arch, cmd_stream, memory_accesses, cmd_index, dep_watermark)
+
if cmd.cmdtype == CommandType.DMA:
start_coord = cmd.box.start_coord
@@ -446,7 +426,7 @@ def generate_register_command_stream(nng, sg, arch, verbose=False):
dma_channel = 0
mode = 0 # From external to external
- emit_wait_commands(cmd)
+ emit_cmd_waits(cmd_waits)
emit.cmd_do_operation(cmd0.NPU_OP_DMA_START, dma_channel * 16 + mode)
elif cmd.cmdtype == CommandType.NpuStripe:
@@ -1063,8 +1043,6 @@ def generate_register_command_stream(nng, sg, arch, verbose=False):
ifm2_prec |= 1 << 6
emit.cmd0_with_param(cmd0.NPU_SET_IFM2_PRECISION, ifm2_prec)
- emit_wait_commands(cmd)
-
# Get op parameters
cur_ifm_block_depth = get_op_ifmofm_block_depth(arch, cmd)
cur_ofm_block = Block(ps.block_config[1], ps.block_config[0], ps.block_config[3])
@@ -1096,6 +1074,8 @@ def generate_register_command_stream(nng, sg, arch, verbose=False):
emit.cmd0_with_param(cmd0.NPU_SET_BLOCKDEP, blockdep)
prev_cmd = cmd
+ emit_cmd_waits(cmd_waits)
+
if npu_block_type == NpuBlockType.ConvolutionMxN:
emit.cmd_do_operation(cmd0.NPU_OP_CONV)
elif npu_block_type == NpuBlockType.ConvolutionDepthWise: