aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Louis Verhaard <louis.verhaard@arm.com> 2020-08-21 14:06:25 +0200
committer Louis Verhaard <louis.verhaard@arm.com> 2020-08-26 08:18:27 +0200
commit 814cfbb8124ba0b3828db2bb12d9342ae9c39f19 (patch)
tree 519f7f41091efa944f6c4e3eb732892c56da40e1
parent 7579c75d870c25ee075e46a110b6b89cf266db64 (diff)
download ethos-u-vela-814cfbb8124ba0b3828db2bb12d9342ae9c39f19.tar.gz
MLBEDSW-2688: LUT DMA may require kernel wait
LUT related updates specific for 16K SHRAM:
- prevent LUT DMA transfer from overwriting accumulator SHRAM of an ongoing operation
- do not use the last 2K of SHRAM as accumulator during LUT operations

Change-Id: I17066e0410c6f07b125ed245002d7b19269a7a8a
Signed-off-by: Louis Verhaard <louis.verhaard@arm.com>
-rw-r--r-- ethosu/vela/high_level_command_stream.py 4
-rw-r--r-- ethosu/vela/shared_buffer_allocation.py 13
2 files changed, 16 insertions, 1 deletions
diff --git a/ethosu/vela/high_level_command_stream.py b/ethosu/vela/high_level_command_stream.py
index 95af1ccb..b8a19f50 100644
--- a/ethosu/vela/high_level_command_stream.py
+++ b/ethosu/vela/high_level_command_stream.py
@@ -243,6 +243,10 @@ class NpuStripe(Command):
MemoryRangeSet(tens.mem_area, tens.address, tens.address + tens.storage_size()),
AccessDirection.Read,
)
+ # Add write access to SHRAM, needed when LUTs can overwrite accumulator banks
+ res.add(
+ self.ps.shared_buffer.get_shram_memory_access_range(), AccessDirection.Write,
+ )
return res
def is_npu_pass_command(self):
diff --git a/ethosu/vela/shared_buffer_allocation.py b/ethosu/vela/shared_buffer_allocation.py
index 053377c4..fdcbe94a 100644
--- a/ethosu/vela/shared_buffer_allocation.py
+++ b/ethosu/vela/shared_buffer_allocation.py
@@ -25,6 +25,8 @@ from .architecture_features import SHRAMElements
from .errors import VelaError
from .ethos_u55_regs.ethos_u55_regs import resampling_mode
from .operation import NpuBlockType
+from .range_set import MemoryRangeSet
+from .tensor import MemArea
class SharedBufferAllocation:
@@ -40,6 +42,7 @@ class SharedBufferAllocation:
dilation = (1, 1, 1, 1)
self.kernel = Kernel(1, 1)
is_elementwise = ps.npu_block_type == NpuBlockType.ElementWise
+ self.uses_lut = False
if ps.primary_op:
strides = ps.primary_op.attrs.get("strides", strides)
@@ -55,6 +58,7 @@ class SharedBufferAllocation:
k_w = ps.primary_op.attrs.get("filter_width", 1)
self.kernel = Kernel(k_w, k_h, strides[2], strides[1], dilation[2], dilation[1])
+ self.uses_lut = ps.primary_op.activation_lut is not None
self.is_equal_depth_op = is_elementwise or ps.npu_block_type in (
NpuBlockType.ConvolutionDepthWise,
@@ -102,7 +106,7 @@ class SharedBufferAllocation:
# Accumulator area is measured from the end of the buffer
self.bank_locations[SharedBufferArea.Accumulators] = (
- self.arch.shram_total_banks - self.banks_required[SharedBufferArea.Accumulators]
+ self.arch.available_shram_banks(self.uses_lut) - self.banks_required[SharedBufferArea.Accumulators]
)
ifm_end = self.bank_locations[SharedBufferArea.IFM] + self.banks_required[SharedBufferArea.IFM]
return ifm_end <= self.bank_locations[SharedBufferArea.Accumulators]
@@ -156,6 +160,13 @@ class SharedBufferAllocation:
return True
+ def get_shram_memory_access_range(self):
+ # Returns the SHRAM memory access range used by this shared buffer,
+ # excluding access to LUT
+ return MemoryRangeSet(
+ MemArea.Shram, 0, self.arch.available_shram_banks(self.uses_lut) * self.arch.shram_bank_size
+ )
+
def shared_buffer_allocation_for_pass_and_block_config(arch, ps, block_config):
alloc = SharedBufferAllocation(arch, ps)