about | summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
-rw-r--r-- OPTIONS.md 23
-rw-r--r-- ethosu/vela/architecture_features.py 40
-rw-r--r-- ethosu/vela/npu_performance.py 41
-rw-r--r-- ethosu/vela/tensor.py 17
-rw-r--r-- vela.ini 38
5 files changed, 120 insertions, 39 deletions
diff --git a/OPTIONS.md b/OPTIONS.md
index 22538a6b..e191d4ee 100644
--- a/OPTIONS.md
+++ b/OPTIONS.md
@@ -473,13 +473,22 @@ The following is an in-line explanation of the Vela configuration file format:
; My_Sys_Cfg
[System_Config.My_Sys_Cfg]
-core_clock=??? ---> Clock frequency of the Ethos-U. ??? = {float in Hz}
-axi0_port=??? ---> Memory type connected to AXI0. ??? = {Sram, Dram, OnChipFlash or OffChipFlash}
-axi1_port=??? ---> Memory type connected to AXI1. ??? = {Sram, Dram, OnChipFlash or OffChipFlash}
-Sram_clock_scale=??? ---> Scaling of core_clock to specify the Sram bandwidth. Only required if selected by an AXI port. ??? = {float 0.0 to 1.0}
-Dram_clock_scale=??? ---> Scaling of core_clock to specify the Dram bandwidth. Only required if selected by an AXI port. ??? = {float 0.0 to 1.0}
-OnChipFlash_clock_scale=??? ---> Scaling of core_clock to specify the OnChipFlash bandwidth. Only required if selected by an AXI port. ??? = {float 0.0 to 1.0}
-OffChipFlash_clock_scale=??? ---> Scaling of core_clock to specify the OffChipFlash bandwidth. Only required if selected by an AXI port. ??? = {float 0.0 to 1.0}
+core_clock=??? ---> Clock frequency of the Ethos-U. ??? = {float in Hz}
+axi0_port=??? ---> Memory type connected to AXI0. ??? = {Sram, Dram, OnChipFlash or OffChipFlash}
+axi1_port=??? ---> Memory type connected to AXI1. ??? = {Sram, Dram, OnChipFlash or OffChipFlash}
+Sram_clock_scale=??? ---> Scaling of core_clock to specify the Sram bandwidth. Only required if selected by an AXI port. ??? = {float 0.0 to 1.0}
+Sram_burst_length=??? ---> Minimum efficient burst length in Sram. Only required if selected by an AXI port. ??? = {int in Bytes}
+Sram_read_latency=??? ---> Read latency in Sram. Only required if selected by an AXI port. ??? = {int in Cycles}
+Sram_write_latency=??? ---> Write latency in Sram. Only required if selected by an AXI port. ??? = {int in Cycles}
+Dram_clock_scale=??? ---> Scaling of core_clock to specify the Dram bandwidth. Only required if selected by an AXI port. ??? = {float 0.0 to 1.0}
+Dram_burst_length=??? ---> Minimum efficient burst length in Dram. Only required if selected by an AXI port. ??? = {int in Bytes}
+Dram_read_latency=??? ---> Read latency in Dram. Only required if selected by an AXI port. ??? = {int in Cycles}
+Dram_write_latency=??? ---> Write latency in Dram. Only required if selected by an AXI port. ??? = {int in Cycles}
+OnChipFlash_clock_scale=??? ---> Scaling of core_clock to specify the OnChipFlash bandwidth. Only required if selected by an AXI port. ??? = {float 0.0 to 1.0}
+OffChipFlash_clock_scale=??? ---> Scaling of core_clock to specify the OffChipFlash bandwidth. Only required if selected by an AXI port. ??? = {float 0.0 to 1.0}
+OffChipFlash_burst_length=??? ---> Minimum efficient burst length in OffChipFlash. Only required if selected by an AXI port. ??? = {int in Bytes}
+OffChipFlash_read_latency=??? ---> Read latency in OffChipFlash. Only required if selected by an AXI port. ??? = {int in Cycles}
+OffChipFlash_write_latency=??? ---> Write latency in OffChipFlash. Only required if selected by an AXI port. ??? = {int in Cycles}
; -----------------------------------------------------------------------------
; Memory Mode
diff --git a/ethosu/vela/architecture_features.py b/ethosu/vela/architecture_features.py
index 576f793a..9edc87e2 100644
--- a/ethosu/vela/architecture_features.py
+++ b/ethosu/vela/architecture_features.py
@@ -32,6 +32,7 @@ from .operation import Kernel
from .operation import NpuBlockType
from .operation import PointXYZ
from .supported_operators import SupportedOperators
+from .tensor import BandwidthDirection
from .tensor import MemArea
from .tensor import MemType
from .tensor import TensorFormat
@@ -465,6 +466,12 @@ class ArchitectureFeatures:
self.axi1_port = MemArea.Dram
self.memory_clock_scales[MemArea.Sram] = 1.0
self.memory_clock_scales[MemArea.Dram] = 0.75 # 3 / 4
+ self.memory_burst_length[MemArea.Sram] = 32
+ self.memory_burst_length[MemArea.Dram] = 128
+ self.memory_latency[MemArea.Sram][BandwidthDirection.Read] = 32
+ self.memory_latency[MemArea.Sram][BandwidthDirection.Write] = 32
+ self.memory_latency[MemArea.Dram][BandwidthDirection.Read] = 500
+ self.memory_latency[MemArea.Dram][BandwidthDirection.Write] = 250
else:
# Default Ethos-U55 system configuration
# Ethos-U55 High-End Embedded: SRAM (4 GB/s) and Flash (0.5 GB/s)
@@ -473,6 +480,12 @@ class ArchitectureFeatures:
self.axi1_port = MemArea.OffChipFlash
self.memory_clock_scales[MemArea.Sram] = 1.0
self.memory_clock_scales[MemArea.OffChipFlash] = 0.125 # 1 / 8
+ self.memory_burst_length[MemArea.Sram] = 32
+ self.memory_burst_length[MemArea.OffChipFlash] = 128
+ self.memory_latency[MemArea.Sram][BandwidthDirection.Read] = 32
+ self.memory_latency[MemArea.Sram][BandwidthDirection.Write] = 32
+ self.memory_latency[MemArea.OffChipFlash][BandwidthDirection.Read] = 64
+ self.memory_latency[MemArea.OffChipFlash][BandwidthDirection.Write] = 64
def _set_default_mem_mode(self):
# ArchitectureFeatures.DEFAULT_CONFIG values
@@ -500,6 +513,8 @@ class ArchitectureFeatures:
self.axi0_port = MemArea(1)
self.axi1_port = MemArea(1)
self.memory_clock_scales = np.ones(MemArea.Size)
+ self.memory_burst_length = np.ones(MemArea.Size)
+ self.memory_latency = np.zeros((MemArea.Size, BandwidthDirection.Size))
self.const_mem_area = MemPort(1)
self.arena_mem_area = MemPort(1)
self.cache_mem_area = MemPort(1)
@@ -526,7 +541,25 @@ class ArchitectureFeatures:
sys_cfg_section, mem_area.name + "_clock_scale", self.memory_clock_scales[mem_area]
)
)
-
+ self.memory_burst_length[mem_area] = int(
+ self._read_config(
+ sys_cfg_section, mem_area.name + "_burst_length", self.memory_burst_length[mem_area]
+ )
+ )
+ self.memory_latency[mem_area][BandwidthDirection.Read] = int(
+ self._read_config(
+ sys_cfg_section,
+ mem_area.name + "_read_latency",
+ self.memory_latency[mem_area][BandwidthDirection.Read],
+ )
+ )
+ self.memory_latency[mem_area][BandwidthDirection.Write] = int(
+ self._read_config(
+ sys_cfg_section,
+ mem_area.name + "_write_latency",
+ self.memory_latency[mem_area][BandwidthDirection.Write],
+ )
+ )
elif self.system_config == ArchitectureFeatures.DEFAULT_CONFIG:
self._set_default_sys_config()
@@ -578,6 +611,8 @@ class ArchitectureFeatures:
self.const_mem_area = MemPort.Axi0
self.axi0_port = MemArea.OnChipFlash
self.memory_clock_scales[MemArea.OnChipFlash] = self.memory_clock_scales[MemArea.Sram]
+ self.memory_burst_length[MemArea.OnChipFlash] = self.memory_burst_length[MemArea.Sram]
+ self.memory_latency[MemArea.OnChipFlash] = self.memory_latency[MemArea.Sram]
# check configuration
if self._mem_port_mapping(self.cache_mem_area) != MemArea.Sram:
@@ -623,6 +658,9 @@ class ArchitectureFeatures:
print(f" axi1_port = {self.axi1_port.name}")
for mem in (MemArea.Sram, MemArea.Dram, MemArea.OnChipFlash, MemArea.OffChipFlash):
print(f" {mem.name}_clock_scales = {self.memory_clock_scales[mem]}")
+ print(f" {mem.name}_burst_length = {self.memory_burst_length[mem]}")
+ print(f" {mem.name}_read_latency = {self.memory_latency[mem][BandwidthDirection.Read]}")
+ print(f" {mem.name}_write_latency = {self.memory_latency[mem][BandwidthDirection.Write]}")
print(f"Memory Mode ({self.memory_mode}):")
print(f" const_mem_area = {self.const_mem_area.name}")
diff --git a/ethosu/vela/npu_performance.py b/ethosu/vela/npu_performance.py
index 8ada1e23..9d83f6fb 100644
--- a/ethosu/vela/npu_performance.py
+++ b/ethosu/vela/npu_performance.py
@@ -33,6 +33,7 @@ from .nn_graph import SchedulerRewrite
from .operation import NpuBlockType
from .operation import Op
from .shared_buffer_allocation import is_acc_40bits_used
+from .tensor import BandwidthDirection
from .tensor import MemArea
from .tensor import shape_num_elements
from .tensor import Tensor
@@ -90,22 +91,6 @@ class PassCycles(IntEnum):
)
-class BandwidthDirection(IntEnum):
- Read = 0
- Write = auto()
- Size = auto()
-
- def display_name(self):
- return self.name
-
- def identifier_name(self):
- return self.name.lower()
-
- @staticmethod
- def all():
- return (BandwidthDirection.Read, BandwidthDirection.Write)
-
-
def make_bandwidth_array():
return np.zeros((MemArea.Size, TensorPurpose.Size, BandwidthDirection.Size))
@@ -133,8 +118,6 @@ def get_ifm_block_depth(npu_block_type, ifm_depth, ifm_elemwidth, block_traversa
def get_minimal_cmd_cycles(arch, ifm_tensor, ofm_tensor, ifm_blk: Block, ofm_blk: Block, output_cycles, dpu_cycles=0):
- latencies_rd = {MemArea.Sram: 32, MemArea.Dram: 500, MemArea.OnChipFlash: 64, MemArea.OffChipFlash: 64}
- latencies_wr = {MemArea.Sram: 32, MemArea.Dram: 250, MemArea.OnChipFlash: 64, MemArea.OffChipFlash: 64}
ifm_tens_blk = Tensor((1, ifm_blk.height, ifm_blk.width, ifm_blk.depth), ifm_tensor.dtype, "ifm_blk")
ofm_tens_blk = Tensor((1, ofm_blk.height, ofm_blk.width, ofm_blk.depth), ofm_tensor.dtype, "ofm_blk")
cycles_ifm_blk = (
@@ -146,11 +129,11 @@ def get_minimal_cmd_cycles(arch, ifm_tensor, ofm_tensor, ifm_blk: Block, ofm_blk
/ arch.memory_bandwidths_per_cycle[ofm_tensor.mem_area]
)
return (
- latencies_rd[ifm_tensor.mem_area]
+ arch.memory_latency[ifm_tensor.mem_area][BandwidthDirection.Read]
+ cycles_ifm_blk
+ dpu_cycles
+ output_cycles
- + latencies_wr[ofm_tensor.mem_area]
+ + arch.memory_latency[ofm_tensor.mem_area][BandwidthDirection.Write]
+ cycles_ofm_blk
) / 4
@@ -351,13 +334,12 @@ def estimate_conv_pooling_cycles(
)
if scale_tensor:
- if scale_tensor.mem_area is MemArea.Sram:
- latency = 32
- elif scale_tensor.mem_area is MemArea.Dram:
- latency = 500
- else:
- latency = 64
- cycles_bias_blk = 10 * min(ofm_block.depth, ofm_tens_shape[3]) * latency / 256
+ cycles_bias_blk = (
+ 10
+ * min(ofm_block.depth, ofm_tens_shape[3])
+ * arch.memory_latency[scale_tensor.mem_area][BandwidthDirection.Read]
+ / 256
+ )
cycles_output_blk = max(cycles_output_blk, cycles_bias_blk)
cycles_cmd = get_minimal_cmd_cycles(
@@ -380,7 +362,6 @@ def estimate_memory_transfer_efficiency(arch, mem_area, direction, tensor, block
# Estimate memory transfer efficiency by calculating the burst length
# this is related to data format, block shape, and tensor shape, etc.
- max_burst_len = 32 if mem_area == MemArea.Sram else 128
burst_len = 0
elem_size = tensor.dtype.size_in_bytes()
is_ifm = direction == BandwidthDirection.Read
@@ -408,10 +389,10 @@ def estimate_memory_transfer_efficiency(arch, mem_area, direction, tensor, block
else:
burst_len = min(64, 16 * elem_size * arch.ncores, block_size.depth * elem_size)
- burst_len = min(max_burst_len, burst_len)
+ burst_len = min(arch.memory_burst_length[mem_area], burst_len)
bw = tens.bandwidth() if replace_bw is None else replace_bw
- return bw * (max_burst_len / burst_len)
+ return bw * (arch.memory_burst_length[mem_area] / burst_len)
def performance_metrics_for_pass(arch, ps, block_config=None, rewrite_list=None, force_outputs_to_fast_storage=False):
diff --git a/ethosu/vela/tensor.py b/ethosu/vela/tensor.py
index de97710a..257cb5ff 100644
--- a/ethosu/vela/tensor.py
+++ b/ethosu/vela/tensor.py
@@ -19,6 +19,7 @@ import copy
import enum
import uuid
from collections import defaultdict
+from enum import auto
from functools import lru_cache
from typing import Dict
from typing import List
@@ -62,6 +63,22 @@ class MemType(enum.IntFlag):
return self.name
+class BandwidthDirection(enum.IntEnum):
+ Read = 0
+ Write = auto()
+ Size = auto()
+
+ def display_name(self):
+ return self.name
+
+ def identifier_name(self):
+ return self.name.lower()
+
+ @staticmethod
+ def all():
+ return (BandwidthDirection.Read, BandwidthDirection.Write)
+
+
class MemArea(enum.IntFlag):
Unknown = 0
Sram = 1
diff --git a/vela.ini b/vela.ini
index 94ab4fae..47fa6696 100644
--- a/vela.ini
+++ b/vela.ini
@@ -26,7 +26,13 @@ core_clock=200e6
axi0_port=Sram
axi1_port=OffChipFlash
Sram_clock_scale=1.0
+Sram_burst_length=32
+Sram_read_latency=32
+Sram_write_latency=32
OffChipFlash_clock_scale=0.0625
+OffChipFlash_burst_length=128
+OffChipFlash_read_latency=64
+OffChipFlash_write_latency=64
; Ethos-U55 High-End Embedded: SRAM (4 GB/s) and Flash (0.5 GB/s)
[System_Config.Ethos_U55_High_End_Embedded]
@@ -34,7 +40,13 @@ core_clock=500e6
axi0_port=Sram
axi1_port=OffChipFlash
Sram_clock_scale=1.0
+Sram_burst_length=32
+Sram_read_latency=32
+Sram_write_latency=32
OffChipFlash_clock_scale=0.125
+OffChipFlash_burst_length=128
+OffChipFlash_read_latency=64
+OffChipFlash_write_latency=64
; Ethos-U65 Embedded: SRAM (8 GB/s) and Flash (0.5 GB/s)
[System_Config.Ethos_U65_Embedded]
@@ -42,7 +54,13 @@ core_clock=500e6
axi0_port=Sram
axi1_port=OffChipFlash
Sram_clock_scale=1.0
+Sram_burst_length=32
+Sram_read_latency=32
+Sram_write_latency=32
OffChipFlash_clock_scale=0.0625
+OffChipFlash_burst_length=128
+OffChipFlash_read_latency=64
+OffChipFlash_write_latency=64
; Ethos-U65 Mid-End: SRAM (8 GB/s) and DRAM (3.75 GB/s)
[System_Config.Ethos_U65_Mid_End]
@@ -50,7 +68,13 @@ core_clock=500e6
axi0_port=Sram
axi1_port=Dram
Sram_clock_scale=1.0
+Sram_burst_length=32
+Sram_read_latency=32
+Sram_write_latency=32
Dram_clock_scale=0.46875
+Dram_burst_length=128
+Dram_read_latency=500
+Dram_write_latency=250
; Ethos-U65 High-End: SRAM (16 GB/s) and DRAM (3.75 GB/s)
[System_Config.Ethos_U65_High_End]
@@ -58,7 +82,13 @@ core_clock=1e9
axi0_port=Sram
axi1_port=Dram
Sram_clock_scale=1.0
+Sram_burst_length=32
+Sram_read_latency=32
+Sram_write_latency=32
Dram_clock_scale=0.234375
+Dram_burst_length=128
+Dram_read_latency=500
+Dram_write_latency=250
; Ethos-U65 Client-Server: SRAM (16 GB/s) and DRAM (12 GB/s)
[System_Config.Ethos_U65_Client_Server]
@@ -66,7 +96,13 @@ core_clock=1e9
axi0_port=Sram
axi1_port=Dram
Sram_clock_scale=1.0
+Sram_burst_length=32
+Sram_read_latency=32
+Sram_write_latency=32
Dram_clock_scale=0.75
+Dram_burst_length=128
+Dram_read_latency=500
+Dram_write_latency=250
; -----------------------------------------------------------------------------
; Memory Mode
@@ -96,4 +132,4 @@ cache_sram_size=393216
; The non-SRAM memory is assumed to be read-writeable
[Memory_Mode.Dedicated_Sram_512KB]
inherit=Memory_Mode.Dedicated_Sram
-cache_sram_size=524288
\ No newline at end of file
+cache_sram_size=524288