about summary refs log tree commit diff
path: root/ethosu/vela/register_command_stream_generator.py
diff options
context:
space:
mode:
Diffstat (limited to 'ethosu/vela/register_command_stream_generator.py')
-rw-r--r-- ethosu/vela/register_command_stream_generator.py 51
1 file changed, 26 insertions(+), 25 deletions(-)
diff --git a/ethosu/vela/register_command_stream_generator.py b/ethosu/vela/register_command_stream_generator.py
index c46016d7..9dd290a9 100644
--- a/ethosu/vela/register_command_stream_generator.py
+++ b/ethosu/vela/register_command_stream_generator.py
@@ -50,7 +50,7 @@ from .numeric_util import round_up
from .numeric_util import round_up_to_int
from .operation import NpuBlockType
from .shared_buffer_allocation import SharedBufferAllocation
-from .tensor import MemArea
+from .tensor import MemType
from .tensor import TensorBlockTraversal
from .tensor import TensorFormat
@@ -79,8 +79,9 @@ class CmdMode(IntEnum):
class BasePointerIndex(IntEnum):
- ReadOnly = 0 # base address slot index for weights and scaling
- Scratch = 1 # base address slot index for scratch memory area
+ WeightTensor = 0 # base address index for the Weight tensor
+ ScratchTensor = 1 # base address index for the Scratch_tensor in the TensorArena
+ ScratchFastTensor = 2 # base address for the Scratch_fast_tensor
# TODO: Replace with definitions from ethos_u55_regs
@@ -322,12 +323,20 @@ def get_op_padding_lt(cmd):
def generate_register_command_stream(nng, sg, arch, verbose=False):
emit = CommandStreamEmitter()
- base_ptr_idx_map = {
- MemArea.Sram: BasePointerIndex.Scratch,
- MemArea.OnChipFlash: BasePointerIndex.ReadOnly,
- MemArea.OffChipFlash: BasePointerIndex.ReadOnly,
- MemArea.Dram: BasePointerIndex.ReadOnly,
- }
+ if arch.feature_map_storage_mem_area == arch.fast_storage_mem_area:
+ base_ptr_idx_map = {
+ MemType.Permanent_NPU: BasePointerIndex.WeightTensor,
+ MemType.Permanent_CPU: BasePointerIndex.WeightTensor,
+ MemType.Scratch: BasePointerIndex.ScratchTensor,
+ MemType.Scratch_fast: BasePointerIndex.ScratchTensor,
+ }
+ else:
+ base_ptr_idx_map = {
+ MemType.Permanent_NPU: BasePointerIndex.WeightTensor,
+ MemType.Permanent_CPU: BasePointerIndex.WeightTensor,
+ MemType.Scratch: BasePointerIndex.ScratchTensor,
+ MemType.Scratch_fast: BasePointerIndex.ScratchFastTensor,
+ }
# Maps an AccumulatorType enum to the corresponding acc_format value
acc_format_map = {
@@ -377,8 +386,8 @@ def generate_register_command_stream(nng, sg, arch, verbose=False):
param = min(param, 0xFFFF) # Clamp to allowable wait amount
if relative_dep[CommandType.DMA] is not None:
- param = relative_dep[CommandType.DMA][0]
- param = min(param, 0xF) # Clamp to allowable wait amount
+ # TODO This can be optimized for yoda
+ param = 0
emit.cmd_wait(cmd0.NPU_OP_DMA_WAIT, param, absolute_dep[CommandType.DMA][0])
for cmd in cmd_stream:
@@ -394,10 +403,10 @@ def generate_register_command_stream(nng, sg, arch, verbose=False):
else:
sz = cmd.in_tensor.address_for_coordinate(cmd.box.end_coord, is_top_box=True) - src_addr
- # TODO: Yoda support needs to use feature_maps_not_in_fast_storage and force_outputs_to_fast_storage
- emit.cmd0_with_param(cmd0.NPU_SET_DMA0_SRC_REGION, base_ptr_idx_map[cmd.in_tensor.mem_area])
+ emit.cmd0_with_param(cmd0.NPU_SET_DMA0_SRC_REGION, base_ptr_idx_map[cmd.in_tensor.mem_type])
emit.cmd1_with_offset(cmd1.NPU_SET_DMA0_SRC, src_addr)
- emit.cmd0_with_param(cmd0.NPU_SET_DMA0_DST_REGION, base_ptr_idx_map[cmd.out_tensor.mem_area])
+ emit.cmd0_with_param(cmd0.NPU_SET_DMA0_DST_REGION, base_ptr_idx_map[cmd.out_tensor.mem_type])
+
emit.cmd1_with_offset(cmd1.NPU_SET_DMA0_DST, dst_addr)
emit.cmd1_with_offset(cmd1.NPU_SET_DMA0_LEN, sz)
dma_channel = 0
@@ -682,10 +691,7 @@ def generate_register_command_stream(nng, sg, arch, verbose=False):
stream_index = cmd.weight_tensor.compressed_stream_index_from_coord(cmd.weight_box.start_coord)
weight_addr = cmd.weight_tensor.address_for_coordinate(cmd.weight_box.start_coord)
weight_len = cmd.weight_tensor.size_of_compressed_stream(stream_index)
- # Select weight/scale region depending on where permanent storage was defined
- weight_region = base_ptr_idx_map[cmd.weight_tensor.mem_area]
- if arch.permanent_storage_mem_area == MemArea.Sram:
- weight_region = BasePointerIndex.ReadOnly
+ weight_region = base_ptr_idx_map[cmd.weight_tensor.mem_type]
emit.cmd0_with_param(cmd0.NPU_SET_WEIGHT_REGION, weight_region)
emit.cmd1_with_offset(cmd1.NPU_SET_WEIGHT_BASE, weight_addr)
emit.cmd1_with_offset(cmd1.NPU_SET_WEIGHT_LENGTH, weight_len)
@@ -699,9 +705,7 @@ def generate_register_command_stream(nng, sg, arch, verbose=False):
cmd.scale_tensor.address_for_coordinate(cmd.weight_box.end_coord[-1:], True) - scale_addr
)
# Emit base address for NPU to access scale & bias data
- scale_region = base_ptr_idx_map[cmd.scale_tensor.mem_area]
- if arch.permanent_storage_mem_area == MemArea.Sram:
- scale_region = BasePointerIndex.ReadOnly
+ scale_region = base_ptr_idx_map[cmd.scale_tensor.mem_type]
emit.cmd0_with_param(cmd0.NPU_SET_SCALE_REGION, scale_region)
emit.cmd1_with_offset(cmd1.NPU_SET_SCALE_BASE, scale_addr)
emit.cmd1_with_offset(cmd1.NPU_SET_SCALE_LENGTH, round_up(scale_len, 16))
@@ -850,10 +854,7 @@ def generate_register_command_stream(nng, sg, arch, verbose=False):
else:
assert False
- if tens.mem_area == MemArea.Sram:
- emit.cmd0_with_param(region_op, BasePointerIndex.Scratch)
- else:
- emit.cmd0_with_param(region_op, BasePointerIndex.ReadOnly)
+ emit.cmd0_with_param(region_op, base_ptr_idx_map[tens.mem_type])
for idx, addr in enumerate(addresses):
if addr is None: