diff options
author | Diqing Zhong <diqing.zhong@arm.com> | 2020-12-11 13:07:37 +0100 |
---|---|---|
committer | patrik.gustavsson <patrik.gustavsson@arm.com> | 2020-12-16 16:46:31 +0000 |
commit | f842b69d007e70d70fc5cef3b6f1f50b4cabbd90 (patch) | |
tree | 0757948e7b4eeb8f3f9da70b05ef205b5ac5c255 | |
parent | 7a6f8438aaf750380a9fff799ca81ff5c7e2ae43 (diff) | |
download | ethos-u-vela-f842b69d007e70d70fc5cef3b6f1f50b4cabbd90.tar.gz |
MLBEDSW-3465: Add memory settings into sys config
Signed-off-by: Diqing Zhong <diqing.zhong@arm.com>
Change-Id: I4a5c53d0c5957595fc639b174b2b227ea043d409
-rw-r--r-- | OPTIONS.md | 23 | ||||
-rw-r--r-- | ethosu/vela/architecture_features.py | 40 | ||||
-rw-r--r-- | ethosu/vela/npu_performance.py | 41 | ||||
-rw-r--r-- | ethosu/vela/tensor.py | 17 | ||||
-rw-r--r-- | vela.ini | 38 |
5 files changed, 120 insertions, 39 deletions
@@ -473,13 +473,22 @@ The following is an in-line explanation of the Vela configuration file format: ; My_Sys_Cfg [System_Config.My_Sys_Cfg] -core_clock=??? ---> Clock frequency of the Ethos-U. ??? = {float in Hz} -axi0_port=??? ---> Memory type connected to AXI0. ??? = {Sram, Dram, OnChipFlash or OffChipFlash} -axi1_port=??? ---> Memory type connected to AXI1. ??? = {Sram, Dram, OnChipFlash or OffChipFlash} -Sram_clock_scale=??? ---> Scaling of core_clock to specify the Sram bandwidth. Only required if selected by an AXI port. ??? = {float 0.0 to 1.0} -Dram_clock_scale=??? ---> Scaling of core_clock to specify the Dram bandwidth. Only required if selected by an AXI port. ??? = {float 0.0 to 1.0} -OnChipFlash_clock_scale=??? ---> Scaling of core_clock to specify the OnChipFlash bandwidth. Only required if selected by an AXI port. ??? = {float 0.0 to 1.0} -OffChipFlash_clock_scale=??? ---> Scaling of core_clock to specify the OffChipFlash bandwidth. Only required if selected by an AXI port. ??? = {float 0.0 to 1.0} +core_clock=??? ---> Clock frequency of the Ethos-U. ??? = {float in Hz} +axi0_port=??? ---> Memory type connected to AXI0. ??? = {Sram, Dram, OnChipFlash or OffChipFlash} +axi1_port=??? ---> Memory type connected to AXI1. ??? = {Sram, Dram, OnChipFlash or OffChipFlash} +Sram_clock_scale=??? ---> Scaling of core_clock to specify the Sram bandwidth. Only required if selected by an AXI port. ??? = {float 0.0 to 1.0} +Sram_burst_length=??? ---> Minimum efficient burst length in Sram. Only required if selected by an AXI port. ??? = {int in Bytes} +Sram_read_latency=??? ---> Read latency in Sram. Only required if selected by an AXI port. ??? = {int in Cycles} +Sram_write_latency=??? ---> Write latency in Sram. Only required if selected by an AXI port. ??? = {int in Cycles} +Dram_clock_scale=??? ---> Scaling of core_clock to specify the Dram bandwidth. Only required if selected by an AXI port. ??? = {float 0.0 to 1.0} +Dram_burst_length=??? ---> Minimum efficient burst length in Dram. Only required if selected by an AXI port. ??? = {int in Bytes} +Dram_read_latency=??? ---> Read latency in Dram. Only required if selected by an AXI port. ??? = {int in Cycles} +Dram_write_latency=??? ---> Write latency in Dram. Only required if selected by an AXI port. ??? = {int in Cycles} +OnChipFlash_clock_scale=??? ---> Scaling of core_clock to specify the OnChipFlash bandwidth. Only required if selected by an AXI port. ??? = {float 0.0 to 1.0} +OffChipFlash_clock_scale=??? ---> Scaling of core_clock to specify the OffChipFlash bandwidth. Only required if selected by an AXI port. ??? = {float 0.0 to 1.0} +OffChipFlash_burst_length=??? ---> Minimum efficient burst length in OffChipFlash. Only required if selected by an AXI port. ??? = {int in Bytes} +OffChipFlash_read_latency=??? ---> Read latency in OffChipFlash. Only required if selected by an AXI port. ??? = {int in Cycles} +OffChipFlash_write_latency=??? ---> Write latency in OffChipFlash. Only required if selected by an AXI port. ??? = {int in Cycles} ; ----------------------------------------------------------------------------- ; Memory Mode diff --git a/ethosu/vela/architecture_features.py b/ethosu/vela/architecture_features.py index 576f793a..9edc87e2 100644 --- a/ethosu/vela/architecture_features.py +++ b/ethosu/vela/architecture_features.py @@ -32,6 +32,7 @@ from .operation import Kernel from .operation import NpuBlockType from .operation import PointXYZ from .supported_operators import SupportedOperators +from .tensor import BandwidthDirection from .tensor import MemArea from .tensor import MemType from .tensor import TensorFormat @@ -465,6 +466,12 @@ class ArchitectureFeatures: self.axi1_port = MemArea.Dram self.memory_clock_scales[MemArea.Sram] = 1.0 self.memory_clock_scales[MemArea.Dram] = 0.75 # 3 / 4 + self.memory_burst_length[MemArea.Sram] = 32 + self.memory_burst_length[MemArea.Dram] = 128 + self.memory_latency[MemArea.Sram][BandwidthDirection.Read] = 32 + self.memory_latency[MemArea.Sram][BandwidthDirection.Write] = 32 + self.memory_latency[MemArea.Dram][BandwidthDirection.Read] = 500 + self.memory_latency[MemArea.Dram][BandwidthDirection.Write] = 250 else: # Default Ethos-U55 system configuration # Ethos-U55 High-End Embedded: SRAM (4 GB/s) and Flash (0.5 GB/s) @@ -473,6 +480,12 @@ class ArchitectureFeatures: self.axi1_port = MemArea.OffChipFlash self.memory_clock_scales[MemArea.Sram] = 1.0 self.memory_clock_scales[MemArea.OffChipFlash] = 0.125 # 1 / 8 + self.memory_burst_length[MemArea.Sram] = 32 + self.memory_burst_length[MemArea.OffChipFlash] = 128 + self.memory_latency[MemArea.Sram][BandwidthDirection.Read] = 32 + self.memory_latency[MemArea.Sram][BandwidthDirection.Write] = 32 + self.memory_latency[MemArea.OffChipFlash][BandwidthDirection.Read] = 64 + self.memory_latency[MemArea.OffChipFlash][BandwidthDirection.Write] = 64 def _set_default_mem_mode(self): # ArchitectureFeatures.DEFAULT_CONFIG values @@ -500,6 +513,8 @@ class ArchitectureFeatures: self.axi0_port = MemArea(1) self.axi1_port = MemArea(1) self.memory_clock_scales = np.ones(MemArea.Size) + self.memory_burst_length = np.ones(MemArea.Size) + self.memory_latency = np.zeros((MemArea.Size, BandwidthDirection.Size)) self.const_mem_area = MemPort(1) self.arena_mem_area = MemPort(1) self.cache_mem_area = MemPort(1) @@ -526,7 +541,25 @@ class ArchitectureFeatures: sys_cfg_section, mem_area.name + "_clock_scale", self.memory_clock_scales[mem_area] ) ) - + self.memory_burst_length[mem_area] = int( + self._read_config( + sys_cfg_section, mem_area.name + "_burst_length", self.memory_burst_length[mem_area] + ) + ) + self.memory_latency[mem_area][BandwidthDirection.Read] = int( + self._read_config( + sys_cfg_section, + mem_area.name + "_read_latency", + self.memory_latency[mem_area][BandwidthDirection.Read], + ) + ) + self.memory_latency[mem_area][BandwidthDirection.Write] = int( + self._read_config( + sys_cfg_section, + mem_area.name + "_write_latency", + self.memory_latency[mem_area][BandwidthDirection.Write], + ) + ) elif self.system_config == ArchitectureFeatures.DEFAULT_CONFIG: self._set_default_sys_config() @@ -578,6 +611,8 @@ class ArchitectureFeatures: self.const_mem_area = MemPort.Axi0 self.axi0_port = MemArea.OnChipFlash self.memory_clock_scales[MemArea.OnChipFlash] = self.memory_clock_scales[MemArea.Sram] + self.memory_burst_length[MemArea.OnChipFlash] = self.memory_burst_length[MemArea.Sram] + self.memory_latency[MemArea.OnChipFlash] = self.memory_latency[MemArea.Sram] # check configuration if self._mem_port_mapping(self.cache_mem_area) != MemArea.Sram: @@ -623,6 +658,9 @@ class ArchitectureFeatures: print(f" axi1_port = {self.axi1_port.name}") for mem in (MemArea.Sram, MemArea.Dram, MemArea.OnChipFlash, MemArea.OffChipFlash): print(f" {mem.name}_clock_scales = {self.memory_clock_scales[mem]}") + print(f" {mem.name}_burst_length = {self.memory_burst_length[mem]}") + print(f" {mem.name}_read_latency = {self.memory_latency[mem][BandwidthDirection.Read]}") + print(f" {mem.name}_write_latency = {self.memory_latency[mem][BandwidthDirection.Write]}") print(f"Memory Mode ({self.memory_mode}):") print(f" const_mem_area = {self.const_mem_area.name}") diff --git a/ethosu/vela/npu_performance.py b/ethosu/vela/npu_performance.py index 8ada1e23..9d83f6fb 100644 --- a/ethosu/vela/npu_performance.py +++ b/ethosu/vela/npu_performance.py @@ -33,6 +33,7 @@ from .nn_graph import SchedulerRewrite from .operation import NpuBlockType from .operation import Op from .shared_buffer_allocation import is_acc_40bits_used +from .tensor import BandwidthDirection from .tensor import MemArea from .tensor import shape_num_elements from .tensor import Tensor @@ -90,22 +91,6 @@ class PassCycles(IntEnum): ) -class BandwidthDirection(IntEnum): - Read = 0 - Write = auto() - Size = auto() - - def display_name(self): - return self.name - - def identifier_name(self): - return self.name.lower() - - @staticmethod - def all(): - return (BandwidthDirection.Read, BandwidthDirection.Write) - - def make_bandwidth_array(): return np.zeros((MemArea.Size, TensorPurpose.Size, BandwidthDirection.Size)) @@ -133,8 +118,6 @@ def get_ifm_block_depth(npu_block_type, ifm_depth, ifm_elemwidth, block_traversa def get_minimal_cmd_cycles(arch, ifm_tensor, ofm_tensor, ifm_blk: Block, ofm_blk: Block, output_cycles, dpu_cycles=0): - latencies_rd = {MemArea.Sram: 32, MemArea.Dram: 500, MemArea.OnChipFlash: 64, MemArea.OffChipFlash: 64} - latencies_wr = {MemArea.Sram: 32, MemArea.Dram: 250, MemArea.OnChipFlash: 64, MemArea.OffChipFlash: 64} ifm_tens_blk = Tensor((1, ifm_blk.height, ifm_blk.width, ifm_blk.depth), ifm_tensor.dtype, "ifm_blk") ofm_tens_blk = Tensor((1, ofm_blk.height, ofm_blk.width, ofm_blk.depth), ofm_tensor.dtype, "ofm_blk") cycles_ifm_blk = ( @@ -146,11 +129,11 @@ def get_minimal_cmd_cycles(arch, ifm_tensor, ofm_tensor, ifm_blk: Block, ofm_blk / arch.memory_bandwidths_per_cycle[ofm_tensor.mem_area] ) return ( - latencies_rd[ifm_tensor.mem_area] + arch.memory_latency[ifm_tensor.mem_area][BandwidthDirection.Read] + cycles_ifm_blk + dpu_cycles + output_cycles - + latencies_wr[ofm_tensor.mem_area] + + arch.memory_latency[ofm_tensor.mem_area][BandwidthDirection.Write] + cycles_ofm_blk ) / 4 @@ -351,13 +334,12 @@ def estimate_conv_pooling_cycles( ) if scale_tensor: - if scale_tensor.mem_area is MemArea.Sram: - latency = 32 - elif scale_tensor.mem_area is MemArea.Dram: - latency = 500 - else: - latency = 64 - cycles_bias_blk = 10 * min(ofm_block.depth, ofm_tens_shape[3]) * latency / 256 + cycles_bias_blk = ( + 10 + * min(ofm_block.depth, ofm_tens_shape[3]) + * arch.memory_latency[scale_tensor.mem_area][BandwidthDirection.Read] + / 256 + ) cycles_output_blk = max(cycles_output_blk, cycles_bias_blk) cycles_cmd = get_minimal_cmd_cycles( @@ -380,7 +362,6 @@ def estimate_memory_transfer_efficiency(arch, mem_area, direction, tensor, block # Estimate memory transfer efficiency by calculating the burst length # this is related to data format, block shape, and tensor shape, etc. - max_burst_len = 32 if mem_area == MemArea.Sram else 128 burst_len = 0 elem_size = tensor.dtype.size_in_bytes() is_ifm = direction == BandwidthDirection.Read @@ -408,10 +389,10 @@ def estimate_memory_transfer_efficiency(arch, mem_area, direction, tensor, block else: burst_len = min(64, 16 * elem_size * arch.ncores, block_size.depth * elem_size) - burst_len = min(max_burst_len, burst_len) + burst_len = min(arch.memory_burst_length[mem_area], burst_len) bw = tens.bandwidth() if replace_bw is None else replace_bw - return bw * (max_burst_len / burst_len) + return bw * (arch.memory_burst_length[mem_area] / burst_len) def performance_metrics_for_pass(arch, ps, block_config=None, rewrite_list=None, force_outputs_to_fast_storage=False): diff --git a/ethosu/vela/tensor.py b/ethosu/vela/tensor.py index de97710a..257cb5ff 100644 --- a/ethosu/vela/tensor.py +++ b/ethosu/vela/tensor.py @@ -19,6 +19,7 @@ import copy import enum import uuid from collections import defaultdict +from enum import auto from functools import lru_cache from typing import Dict from typing import List @@ -62,6 +63,22 @@ class MemType(enum.IntFlag): return self.name +class BandwidthDirection(enum.IntEnum): + Read = 0 + Write = auto() + Size = auto() + + def display_name(self): + return self.name + + def identifier_name(self): + return self.name.lower() + + @staticmethod + def all(): + return (BandwidthDirection.Read, BandwidthDirection.Write) + + class MemArea(enum.IntFlag): Unknown = 0 Sram = 1 @@ -26,7 +26,13 @@ core_clock=200e6 axi0_port=Sram axi1_port=OffChipFlash Sram_clock_scale=1.0 +Sram_burst_length=32 +Sram_read_latency=32 +Sram_write_latency=32 OffChipFlash_clock_scale=0.0625 +OffChipFlash_burst_length=128 +OffChipFlash_read_latency=64 +OffChipFlash_write_latency=64 ; Ethos-U55 High-End Embedded: SRAM (4 GB/s) and Flash (0.5 GB/s) [System_Config.Ethos_U55_High_End_Embedded] @@ -34,7 +40,13 @@ core_clock=500e6 axi0_port=Sram axi1_port=OffChipFlash Sram_clock_scale=1.0 +Sram_burst_length=32 +Sram_read_latency=32 +Sram_write_latency=32 OffChipFlash_clock_scale=0.125 +OffChipFlash_burst_length=128 +OffChipFlash_read_latency=64 +OffChipFlash_write_latency=64 ; Ethos-U65 Embedded: SRAM (8 GB/s) and Flash (0.5 GB/s) [System_Config.Ethos_U65_Embedded] @@ -42,7 +54,13 @@ core_clock=500e6 axi0_port=Sram axi1_port=OffChipFlash Sram_clock_scale=1.0 +Sram_burst_length=32 +Sram_read_latency=32 +Sram_write_latency=32 OffChipFlash_clock_scale=0.0625 +OffChipFlash_burst_length=128 +OffChipFlash_read_latency=64 +OffChipFlash_write_latency=64 ; Ethos-U65 Mid-End: SRAM (8 GB/s) and DRAM (3.75 GB/s) [System_Config.Ethos_U65_Mid_End] @@ -50,7 +68,13 @@ core_clock=500e6 axi0_port=Sram axi1_port=Dram Sram_clock_scale=1.0 +Sram_burst_length=32 +Sram_read_latency=32 +Sram_write_latency=32 Dram_clock_scale=0.46875 +Dram_burst_length=128 +Dram_read_latency=500 +Dram_write_latency=250 ; Ethos-U65 High-End: SRAM (16 GB/s) and DRAM (3.75 GB/s) [System_Config.Ethos_U65_High_End] @@ -58,7 +82,13 @@ core_clock=1e9 axi0_port=Sram axi1_port=Dram Sram_clock_scale=1.0 +Sram_burst_length=32 +Sram_read_latency=32 +Sram_write_latency=32 Dram_clock_scale=0.234375 +Dram_burst_length=128 +Dram_read_latency=500 +Dram_write_latency=250 ; Ethos-U65 Client-Server: SRAM (16 GB/s) and DRAM (12 GB/s) [System_Config.Ethos_U65_Client_Server] @@ -66,7 +96,13 @@ core_clock=1e9 axi0_port=Sram axi1_port=Dram Sram_clock_scale=1.0 +Sram_burst_length=32 +Sram_read_latency=32 +Sram_write_latency=32 Dram_clock_scale=0.75 +Dram_burst_length=128 +Dram_read_latency=500 +Dram_write_latency=250 ; ----------------------------------------------------------------------------- ; Memory Mode @@ -96,4 +132,4 @@ cache_sram_size=393216 ; The non-SRAM memory is assumed to be read-writeable [Memory_Mode.Dedicated_Sram_512KB] inherit=Memory_Mode.Dedicated_Sram -cache_sram_size=524288
\ No newline at end of file +cache_sram_size=524288 |