From 1bd531dec0b4eb745fb8856d14c1aba2b8a73026 Mon Sep 17 00:00:00 2001 From: Tim Hall Date: Sun, 1 Nov 2020 20:59:36 +0000 Subject: MLBEDSW-3249: Vela config file examples - Added sample vela.ini config file - Changed vela config format, split into system config and memory mode - Removed unused CPU cycle performance estimation - Added new CLI options for --memory-mode and --verbose-config - Changed CLI option --config to take multiple files - Removed CLI option --global-memory-clock-scales - Changed error helper functions to raise a VelaError exception - Refactored to create a new is_spilling_enabled function Signed-off-by: Tim Hall Change-Id: I27c41577e37a3859edb9524cd99784be10ef0a0d --- ethosu/vela/architecture_features.py | 331 ++++++++++++++++------- ethosu/vela/compiler_driver.py | 35 ++- ethosu/vela/errors.py | 59 ++-- ethosu/vela/high_level_command_to_npu_op.py | 23 +- ethosu/vela/npu_performance.py | 32 +-- ethosu/vela/register_command_stream_generator.py | 7 +- ethosu/vela/scheduler.py | 8 +- ethosu/vela/stats_writer.py | 22 +- ethosu/vela/test/testutil.py | 7 +- ethosu/vela/vela.py | 52 ++-- 10 files changed, 351 insertions(+), 225 deletions(-) (limited to 'ethosu') diff --git a/ethosu/vela/architecture_features.py b/ethosu/vela/architecture_features.py index 9ca4304c..7b6c3bed 100644 --- a/ethosu/vela/architecture_features.py +++ b/ethosu/vela/architecture_features.py @@ -21,7 +21,8 @@ from configparser import ConfigParser import numpy as np -from .errors import OptionError +from .errors import CliOptionError +from .errors import ConfigOptionError from .ethos_u55_regs.ethos_u55_regs import resampling_mode from .numeric_util import full_shape from .numeric_util import round_up @@ -131,6 +132,12 @@ class Accelerator(enum.Enum): return [e.value for e in cls] +@enum.unique +class MemPort(enum.Enum): + Axi0 = enum.auto() + Axi1 = enum.auto() + + class ArchitectureFeatures: """This class is a container for various parameters of the Ethos-U core and system configuration that can be tuned, either by command line @@ -169,26 +176,29 @@ class ArchitectureFeatures: OFMSplitDepth = 16 SubKernelMax = Block(8, 8, 65536) + DEFAULT_CONFIG = "internal-default" + def __init__( self, - vela_config: ConfigParser, + vela_config_files, accelerator_config, system_config, + memory_mode, override_block_config, block_config_limit, - global_memory_clock_scale, max_blockdep, weight_estimation_scaling, + verbose_config, ): accelerator_config = accelerator_config.lower() - self.vela_config = vela_config if accelerator_config not in Accelerator.member_list(): - raise OptionError("--accelerator-config", self.accelerator_config, "Unknown accelerator configuration") + raise CliOptionError("--accelerator-config", self.accelerator_config, "Unknown accelerator configuration") self.accelerator_config = Accelerator(accelerator_config) accel_config = ArchitectureFeatures.accelerator_configs[self.accelerator_config] self.config = accel_config self.system_config = system_config + self.memory_mode = memory_mode self.is_ethos_u65_system = self.accelerator_config in (Accelerator.Ethos_U65_256, Accelerator.Ethos_U65_512) self.max_outstanding_dma = 2 if self.is_ethos_u65_system else 1 @@ -201,14 +211,6 @@ class ArchitectureFeatures: self.override_block_config = override_block_config self.block_config_limit = block_config_limit - self.global_memory_clock_scale = global_memory_clock_scale - if self.global_memory_clock_scale <= 0.0 or self.global_memory_clock_scale > 1.0: - raise Exception( - "Invalid global_memory_clock_scale = " - + str(self.global_memory_clock_scale) - + " (must be > 0.0 and <= 1.0)" - ) - self.max_blockdep = max_blockdep self.weight_estimation_scaling = weight_estimation_scaling @@ -220,20 +222,13 @@ class ArchitectureFeatures: self.num_elem_wise_units = accel_config.elem_units self.num_macs_per_cycle = dpu_min_height * dpu_min_width * dpu_dot_product_width * dpu_min_ofm_channels - self.memory_clock_scales = np.zeros(MemArea.Size) - self.memory_port_widths = np.zeros(MemArea.Size) - - # Get system configuration - self.__read_sys_config(self.is_ethos_u65_system) + # Get system configuration and memory mode + self._get_vela_config(vela_config_files, verbose_config) - # apply the global memory clock scales to the individual ones from the system config - for mem in MemArea.all(): - self.memory_clock_scales[mem] *= self.global_memory_clock_scale + self.axi_port_width = 128 if self.is_ethos_u65_system else 64 + self.memory_bandwidths_per_cycle = self.axi_port_width * self.memory_clock_scales / 8 - self.memory_clocks = self.memory_clock_scales * self.npu_clock - self.memory_bandwidths_per_cycle = self.memory_port_widths * self.memory_clock_scales / 8 - - self.memory_bandwidths_per_second = self.memory_bandwidths_per_cycle * self.npu_clock + self.memory_bandwidths_per_second = self.memory_bandwidths_per_cycle * self.core_clock # Get output/activation performance numbers self._generate_output_perf_tables(self.accelerator_config) @@ -303,7 +298,7 @@ class ArchitectureFeatures: self.cycles_weight = 40 self.max_sram_used_weight = 1000 - if self.is_ethos_u65_system and (self.fast_storage_mem_area != self.feature_map_storage_mem_area): + if self.is_spilling_enabled(): self.max_sram_used_weight = 0 # Shared Buffer Block allocations @@ -582,100 +577,226 @@ class ArchitectureFeatures: return blockdep - def cpu_cycle_estimate(self, op): + def is_spilling_enabled(self): """ - Gets estimated performance of a CPU operation, based on a linear model of intercept, slope, - specified in the vela config file, in ConfigParser file format (.ini file). - Example configuration snippet: - [CpuPerformance.MyOperationType] - Cortex-Mx.intercept= - Cortex-Mx.slope= + Spilling is a feature that allows the Ethos-U to use a dedicated SRAM as a cache for various types of data """ - section = "CpuPerformance." + op.type.name - if self.vela_config is not None and section in self.vela_config: - op_config = self.vela_config[section] - try: - intercept = float(op_config.get(self.cpu_config + ".intercept", op_config["default.intercept"])) - slope = float(op_config.get(self.cpu_config + ".slope", op_config["default.slope"])) - n_elements = op.inputs[0].elements() - cycles = intercept + n_elements * slope - return cycles - except Exception: - print("Error: Reading CPU cycle estimate in vela configuration file, section {}".format(section)) - raise - - print("Warning: No configured CPU performance estimate for", op.type) - return 0 - - def __read_sys_config(self, is_ethos_u65_system): + return ( + self._mem_port_mapping(self.cache_mem_area) == MemArea.Sram and self.cache_mem_area != self.arena_mem_area + ) + + def _mem_port_mapping(self, mem_port): + mem_port_mapping = {MemPort.Axi0: self.axi0_port, MemPort.Axi1: self.axi1_port} + return mem_port_mapping[mem_port] + + def _set_default_sys_config(self): + print(f"Warning: Using {ArchitectureFeatures.DEFAULT_CONFIG} values for system configuration") + # ArchitectureFeatures.DEFAULT_CONFIG values + if self.is_ethos_u65_system: + # Default Ethos-U65 system configuration + # Ethos-U65 Client-Server: SRAM (16 GB/s) and DRAM (12 GB/s) + self.core_clock = 1e9 + self.axi0_port = MemArea.Sram + self.axi1_port = MemArea.Dram + self.memory_clock_scales[MemArea.Sram] = 1.0 + self.memory_clock_scales[MemArea.Dram] = 0.75 # 3 / 4 + else: + # Default Ethos-U55 system configuration + # Ethos-U55 High-End Embedded: SRAM (4 GB/s) and Flash (0.5 GB/s) + self.core_clock = 500e6 + self.axi0_port = MemArea.Sram + self.axi1_port = MemArea.OffChipFlash + self.memory_clock_scales[MemArea.Sram] = 1.0 + self.memory_clock_scales[MemArea.OffChipFlash] = 0.125 # 1 / 8 + + def _set_default_mem_mode(self): + print(f"Warning: Using {ArchitectureFeatures.DEFAULT_CONFIG} values for memory mode") + # ArchitectureFeatures.DEFAULT_CONFIG values + if self.is_ethos_u65_system: + # Default Ethos-U65 memory mode + # Dedicated SRAM: SRAM is only used by the Ethos-U + self.const_mem_area = MemPort.Axi1 + self.arena_mem_area = MemPort.Axi1 + self.cache_mem_area = MemPort.Axi0 + self.cache_sram_size = 384 * 1024 + else: + # Default Ethos-U65 memory mode + self.const_mem_area = MemPort.Axi1 + self.arena_mem_area = MemPort.Axi0 + self.cache_mem_area = MemPort.Axi0 + + def _get_vela_config(self, vela_config_files, verbose_config): """ - Gets the system configuration with the given name from the vela configuration file - Example configuration snippet: - [SysConfig.MyConfigName] - npu_freq= - cpu=Cortex-Mx - ... + Gets the system configuration and memory modes from one or more Vela configuration file(s) or uses some + defaults. """ - # Get system configuration from the vela configuration file - if self.vela_config is None: - print("Warning: Using default values for system configuration") - else: - section_key = "SysConfig." + self.system_config - if section_key not in self.vela_config: - raise OptionError("--system-config", self.system_config, "Unknown system configuration") - try: - self.npu_clock = float(self.__sys_config("npu_freq", "500e6")) - self.cpu_config = self.__sys_config("cpu", "Cortex-M7") + # all properties are optional and are initialised to a value of 1 (or the equivalent) + self.core_clock = 1 + self.axi0_port = MemArea(1) + self.axi1_port = MemArea(1) + self.memory_clock_scales = np.ones(MemArea.Size) + self.const_mem_area = MemPort(1) + self.arena_mem_area = MemPort(1) + self.cache_mem_area = MemPort(1) + self.cache_sram_size = 1 + + # read configuration file(s) + self.vela_config = None + + if vela_config_files is not None: + self.vela_config = ConfigParser() + self.vela_config.read(vela_config_files) + + # read system configuration + sys_cfg_section = "System_Config." + self.system_config + + if self.vela_config is not None and self.vela_config.has_section(sys_cfg_section): + self.core_clock = float(self._read_config(sys_cfg_section, "core_clock", self.core_clock)) + self.axi0_port = MemArea[self._read_config(sys_cfg_section, "axi0_port", self.axi0_port)] + self.axi1_port = MemArea[self._read_config(sys_cfg_section, "axi1_port", self.axi1_port)] + + for mem_area in (self.axi0_port, self.axi1_port): + self.memory_clock_scales[mem_area] = float( + self._read_config( + sys_cfg_section, mem_area.name + "_clock_scale", self.memory_clock_scales[mem_area] + ) + ) - self.memory_clock_scales[MemArea.Sram] = float(self.__sys_config("Sram_clock_scale", "1")) - self.memory_port_widths[MemArea.Sram] = int(self.__sys_config("Sram_port_width", "64")) + elif self.system_config == ArchitectureFeatures.DEFAULT_CONFIG: + self._set_default_sys_config() - self.memory_clock_scales[MemArea.OnChipFlash] = float(self.__sys_config("OnChipFlash_clock_scale", "1")) - self.memory_port_widths[MemArea.OnChipFlash] = int(self.__sys_config("OnChipFlash_port_width", "64")) + elif vela_config_files is None: + raise CliOptionError("--config", vela_config_files, "CLI Option not specified") - self.memory_clock_scales[MemArea.OffChipFlash] = float( - self.__sys_config("OffChipFlash_clock_scale", "0.25") + else: + raise CliOptionError( + "--system-config", + self.system_config, + "Section {} not found in Vela config file".format(sys_cfg_section), ) - self.memory_port_widths[MemArea.OffChipFlash] = int(self.__sys_config("OffChipFlash_port_width", "32")) - self.memory_clock_scales[MemArea.Dram] = float(self.__sys_config("Dram_clock_scale", "1")) - self.memory_port_widths[MemArea.Dram] = int(self.__sys_config("Dram_port_width", "32")) + # read the memory mode + mem_mode_section = "Memory_Mode." + self.memory_mode - self.fast_storage_mem_area = MemArea[self.__sys_config("fast_storage_mem_area", "Sram")] - self.feature_map_storage_mem_area = MemArea[self.__sys_config("feature_map_storage_mem_area", "Sram")] + if self.vela_config is not None and self.vela_config.has_section(mem_mode_section): + self.const_mem_area = MemPort[ + self._read_config(mem_mode_section, "const_mem_area", self.const_mem_area.name) + ] + self.arena_mem_area = MemPort[ + self._read_config(mem_mode_section, "arena_mem_area", self.arena_mem_area.name) + ] + self.cache_mem_area = MemPort[ + self._read_config(mem_mode_section, "cache_mem_area", self.cache_mem_area.name) + ] + self.cache_sram_size = int(self._read_config(mem_mode_section, "cache_sram_size", self.cache_sram_size)) - self.permanent_storage_mem_area = MemArea[self.__sys_config("permanent_storage_mem_area", "OffChipFlash")] - if is_ethos_u65_system: - if self.permanent_storage_mem_area is not MemArea.Dram: - raise Exception( - "Invalid permanent_storage_mem_area = " - + str(self.permanent_storage_mem_area) - + " (must be 'DRAM' for Ethos-U65)." - ) - else: - if self.permanent_storage_mem_area not in set((MemArea.OnChipFlash, MemArea.OffChipFlash)): - raise Exception( - "Invalid permanent_storage_mem_area = " - + str(self.permanent_storage_mem_area) - + " (must be 'OnChipFlash' or 'OffChipFlash' for Ethos-U55)." - " To store the weights and other constant data in SRAM on Ethos-U55 select 'OnChipFlash'" - ) + elif self.memory_mode == ArchitectureFeatures.DEFAULT_CONFIG: + self._set_default_mem_mode() + + elif vela_config_files is None: + raise CliOptionError("--config", vela_config_files, "CLI Option not specified") + + else: + raise CliOptionError( + "--memory-mode", self.memory_mode, "Section {} not found in Vela config file".format(mem_mode_section), + ) - self.sram_size = 1024 * int(self.__sys_config("sram_size_kb", "204800")) + # override sram to onchipflash + if self._mem_port_mapping(self.const_mem_area) == MemArea.Sram: + if self.const_mem_area == self.arena_mem_area == self.cache_mem_area: + print( + "Info: Changing const_mem_area from Sram to OnChipFlash. This will use the same characteristics as" + " Sram." + ) + if self.const_mem_area == MemPort.Axi0: + self.const_mem_area = MemPort.Axi1 + self.axi1_port = MemArea.OnChipFlash + else: + self.const_mem_area = MemPort.Axi0 + self.axi0_port = MemArea.OnChipFlash + self.memory_clock_scales[MemArea.OnChipFlash] = self.memory_clock_scales[MemArea.Sram] + + # check configuration + if self._mem_port_mapping(self.cache_mem_area) != MemArea.Sram: + raise ConfigOptionError("cache_mem_area", self._mem_port_mapping(self.cache_mem_area).name, "Sram") + + if self.is_ethos_u65_system: + if self._mem_port_mapping(self.const_mem_area) not in ( + MemArea.Dram, + MemArea.OnChipFlash, + MemArea.OffChipFlash, + ): + raise ConfigOptionError( + "const_mem_area", + self._mem_port_mapping(self.const_mem_area).name, + "Dram or OnChipFlash or OffChipFlash", + ) - except Exception: - print("Error: Reading System Configuration in vela configuration file, section {}".format(section_key)) - raise + if self._mem_port_mapping(self.arena_mem_area) not in (MemArea.Sram, MemArea.Dram): + raise ConfigOptionError( + "arena_mem_area", self._mem_port_mapping(self.arena_mem_area).name, "Sram or Dram" + ) + else: + if self._mem_port_mapping(self.const_mem_area) not in (MemArea.OnChipFlash, MemArea.OffChipFlash): + raise ConfigOptionError( + "const_mem_area", self._mem_port_mapping(self.const_mem_area).name, "OnChipFlash or OffChipFlash" + ) - def __sys_config(self, key, default_value): + if self._mem_port_mapping(self.arena_mem_area) != MemArea.Sram: + raise ConfigOptionError("arena_mem_area", self._mem_port_mapping(self.arena_mem_area).name, "Sram") + + # assign existing memory areas + self.permanent_storage_mem_area = self._mem_port_mapping(self.const_mem_area) + self.feature_map_storage_mem_area = self._mem_port_mapping(self.arena_mem_area) + self.fast_storage_mem_area = self._mem_port_mapping(self.cache_mem_area) + + self.sram_size = self.cache_sram_size if self.is_spilling_enabled() else 9999 * 1024 * 1024 + + # display the system configuration and memory mode + if verbose_config: + print(f"System Configuration ({self.system_config}):") + print(f" core_clock = {self.core_clock}") + print(f" axi0_port = {self.axi0_port.name}") + print(f" axi1_port = {self.axi1_port.name}") + for mem in (MemArea.Sram, MemArea.Dram, MemArea.OnChipFlash, MemArea.OffChipFlash): + print(f" {mem.name}_clock_scales = {self.memory_clock_scales[mem]}") + + print(f"Memory Mode ({self.memory_mode}):") + print(f" const_mem_area = {self.const_mem_area.name}") + print(f" arena_mem_area = {self.arena_mem_area.name}") + print(f" cache_mem_area = {self.cache_mem_area.name}") + print(f" cache_sram_size = {self.cache_sram_size}") + + print("Architecture Settings:") + print(f" permanent_storage_mem_area = {self.permanent_storage_mem_area.name}") + print(f" feature_map_storage_mem_area = {self.feature_map_storage_mem_area.name}") + print(f" fast_storage_mem_area = {self.fast_storage_mem_area.name}") + print(f" sram_size = {self.sram_size}") + + def _read_config(self, section, key, current_value): """ - Gets the system configuration value with the given key from the vela config file. + Reads a given key from a particular section in the Vela config file. If the section contains the 'inherit' + option then we recurse into the section specified. If inherited sections result in multiple keys for a + particular option then the key from the parent section is used, regardless of the parsing order """ - if self.vela_config is None: - return default_value - section = "SysConfig." + self.system_config - result = self.vela_config[section].get(key, None) - if result is None: - raise Exception("Error: System Configuration Missing key {} in section [{}] ".format(key, section)) + if not self.vela_config.has_section(section): + raise ConfigOptionError( + "section", "{}. The section was not found in the Vela config file(s)".format(section) + ) + + result = str(current_value) + if self.vela_config.has_option(section, "inherit"): + inheritance_section = self.vela_config.get(section, "inherit") + # check for recursion loop + if inheritance_section == section: + raise ConfigOptionError( + "inherit", + "{}. This references its own section and recursion is not allowed".format(inheritance_section), + ) + result = self._read_config(inheritance_section, key, result) + + if self.vela_config.has_option(section, key): + result = self.vela_config.get(section, key) + return result diff --git a/ethosu/vela/compiler_driver.py b/ethosu/vela/compiler_driver.py index 9e1cb3ab..0739133b 100644 --- a/ethosu/vela/compiler_driver.py +++ b/ethosu/vela/compiler_driver.py @@ -225,31 +225,18 @@ def compiler_driver(nng, arch, options, scheduler_options): root_sg = nng.get_root_subgraph() alloc_list = [] - feature_maps_in_fast_storage = arch.feature_map_storage_mem_area == arch.fast_storage_mem_area - if feature_maps_in_fast_storage: - mem_alloc_scratch = (arch.feature_map_storage_mem_area, set((MemType.Scratch, MemType.Scratch_fast))) - alloc_list.append(mem_alloc_scratch) - else: + if arch.is_spilling_enabled(): mem_alloc_scratch_fast = (arch.fast_storage_mem_area, set((MemType.Scratch_fast,))) mem_alloc_scratch = (arch.feature_map_storage_mem_area, set((MemType.Scratch,))) # Order is important alloc_list.append(mem_alloc_scratch_fast) alloc_list.append(mem_alloc_scratch) + else: + mem_alloc_scratch = (arch.feature_map_storage_mem_area, set((MemType.Scratch, MemType.Scratch_fast))) + alloc_list.append(mem_alloc_scratch) for mem_area, mem_type_set in alloc_list: - if feature_maps_in_fast_storage or mem_area != arch.fast_storage_mem_area: - tensor_allocation.allocate_tensors( - nng, - root_sg, - arch, - mem_area, - mem_type_set, - tensor_allocator=options.tensor_allocator, - verbose_allocation=options.verbose_allocation, - show_minimum_possible_allocation=options.show_minimum_possible_allocation, - allocation_alignment=options.allocation_alignment, - ) - else: + if arch.is_spilling_enabled() and mem_area == arch.fast_storage_mem_area: # For the case where scratch_fast != scratch: attempt to place feature maps used between # cascaded passes in fast storage. Bisection is used to find the max possible usage of SRAM. alloc_results = [] @@ -285,6 +272,18 @@ def compiler_driver(nng, arch, options, scheduler_options): "Increasing the value of --weight-estimation-scaling may help to resolve the issue. " "See OPTIONS.md for more information.".format(arch.sram_size) ) + else: + tensor_allocation.allocate_tensors( + nng, + root_sg, + arch, + mem_area, + mem_type_set, + tensor_allocator=options.tensor_allocator, + verbose_allocation=options.verbose_allocation, + show_minimum_possible_allocation=options.show_minimum_possible_allocation, + allocation_alignment=options.allocation_alignment, + ) # Generate command streams and serialise Npu-ops into tensors for sg in nng.subgraphs: diff --git a/ethosu/vela/errors.py b/ethosu/vela/errors.py index 1a30d546..2a635d0e 100644 --- a/ethosu/vela/errors.py +++ b/ethosu/vela/errors.py @@ -15,8 +15,6 @@ # limitations under the License. # Description: # Defines custom exceptions. -import sys - from .operation import Operation from .tensor import Tensor @@ -25,31 +23,52 @@ class VelaError(Exception): """Base class for vela exceptions""" def __init__(self, data): - self.data = data + self.data = "Error: " + data def __str__(self): return repr(self.data) class InputFileError(VelaError): - """Raised when reading the input file results in errors""" + """Raised when reading an input file results in errors""" def __init__(self, file_name, msg): - self.data = "Error reading input file {}: {}".format(file_name, msg) + self.data = "Reading input file {}: {}".format(file_name, msg) class UnsupportedFeatureError(VelaError): - """Raised when the input file uses non-supported features that cannot be handled""" + """Raised when the input network uses non-supported features that cannot be handled""" def __init__(self, data): - self.data = "The input file uses a feature that is currently not supported: {}".format(data) + self.data = "Input network uses a feature that is currently not supported: {}".format(data) + +class CliOptionError(VelaError): + """Raised for errors encountered with a command line option -class OptionError(VelaError): - """Raised when an incorrect command line option is used""" + :param option: str object that contains the name of the command line option + :param option_value: the command line option that resulted in the error + :param msg: str object that contains a description of the specific error encountered + """ def __init__(self, option, option_value, msg): - self.data = "Incorrect argument to CLI option: {} {}: {}".format(option, option_value, msg) + self.data = "Incorrect argument to CLI option: {} = {}: {}".format(option, option_value, msg) + + +class ConfigOptionError(VelaError): + """Raised for errors encountered with a configuration option + + :param option: str object that contains the name of the configuration option + :param option_value: the configuration option that resulted in the error + :param option_valid_values (optional): str object that contains the valid configuration option values + """ + + def __init__(self, option, option_value, option_valid_values=None): + self.data = "Invalid configuration of {} = {}".format(option, option_value) + if option_valid_values is not None: + self.data += " (must be {}).".format(option_valid_values) + else: + self.data += "." class AllocationError(VelaError): @@ -60,7 +79,12 @@ class AllocationError(VelaError): def OperatorError(op, msg): - """Called when parsing an operator results in errors""" + """ + Raises a VelaError exception for errors encountered when parsing an Operation + + :param op: Operation object that resulted in the error + :param msg: str object that contains a description of the specific error encountered + """ assert isinstance(op, Operation) @@ -91,12 +115,16 @@ def OperatorError(op, msg): data = data[:-1] # remove last newline - print("Error: {}".format(data)) - sys.exit(1) + raise VelaError(data) def TensorError(tens, msg): - """Called when parsing a tensor results in errors""" + """ + Raises a VelaError exception for errors encountered when parsing a Tensor + + :param tens: Tensor object that resulted in the error + :param msg: str object that contains a description of the specific error encountered + """ assert isinstance(tens, Tensor) @@ -126,5 +154,4 @@ def TensorError(tens, msg): data = data[:-1] # remove last newline - print("Error: {}".format(data)) - sys.exit(1) + raise VelaError(data) diff --git a/ethosu/vela/high_level_command_to_npu_op.py b/ethosu/vela/high_level_command_to_npu_op.py index f7864442..efd8a03d 100644 --- a/ethosu/vela/high_level_command_to_npu_op.py +++ b/ethosu/vela/high_level_command_to_npu_op.py @@ -171,20 +171,17 @@ def create_padding(cmd: NpuStripe, primary_op: Operation) -> NpuPadding: def get_region(tens: Tensor, arch: ArchitectureFeatures) -> int: - if arch.feature_map_storage_mem_area == arch.fast_storage_mem_area: - base_ptr_idx_map = { - MemType.Permanent_NPU: BasePointerIndex.WeightTensor, - MemType.Permanent_CPU: BasePointerIndex.WeightTensor, - MemType.Scratch: BasePointerIndex.ScratchTensor, - MemType.Scratch_fast: BasePointerIndex.ScratchTensor, - } + base_ptr_idx_map = { + MemType.Permanent_NPU: BasePointerIndex.WeightTensor, + MemType.Permanent_CPU: BasePointerIndex.WeightTensor, + MemType.Scratch: BasePointerIndex.ScratchTensor, + } + + if arch.is_spilling_enabled(): + base_ptr_idx_map[MemType.Scratch_fast] = BasePointerIndex.ScratchFastTensor else: - base_ptr_idx_map = { - MemType.Permanent_NPU: BasePointerIndex.WeightTensor, - MemType.Permanent_CPU: BasePointerIndex.WeightTensor, - MemType.Scratch: BasePointerIndex.ScratchTensor, - MemType.Scratch_fast: BasePointerIndex.ScratchFastTensor, - } + base_ptr_idx_map[MemType.Scratch_fast] = BasePointerIndex.ScratchTensor + return int(base_ptr_idx_map[tens.mem_type]) diff --git a/ethosu/vela/npu_performance.py b/ethosu/vela/npu_performance.py index 29e0df9a..d1be5a50 100644 --- a/ethosu/vela/npu_performance.py +++ b/ethosu/vela/npu_performance.py @@ -60,7 +60,6 @@ def rolling_buffer_dims_from_passes(arch, ps1, block_config_ps1, ps2, block_conf class PassCycles(IntEnum): Npu = 0 - Cpu = auto() SramAccess = auto() DramAccess = auto() OnChipFlashAccess = auto() @@ -69,34 +68,19 @@ class PassCycles(IntEnum): Size = auto() def display_name(self): - return ( - "NPU", - "CPU", - "SRAM Access", - "DRAM Access", - "On-chip Flash Access", - "Off-chip Flash Access", - "Total", - "Size", - )[self.value] + return ("NPU", "SRAM Access", "DRAM Access", "On-chip Flash Access", "Off-chip Flash Access", "Total", "Size",)[ + self.value + ] def identifier_name(self): - return ( - "npu", - "cpu", - "sram_access", - "dram_access", - "on_chip_flash_access", - "off_chip_flash_access", - "total", - "size", - )[self.value] + return ("npu", "sram_access", "dram_access", "on_chip_flash_access", "off_chip_flash_access", "total", "size",)[ + self.value + ] @staticmethod def all(): return ( PassCycles.Npu, - PassCycles.Cpu, PassCycles.SramAccess, PassCycles.DramAccess, PassCycles.OnChipFlashAccess, @@ -460,9 +444,7 @@ def performance_metrics_for_pass(arch, ps, block_config=None, rewrite_list=[], f ofm_block = Block(block_config[1], block_config[0], block_config[3]) ifm_block = Block(block_config[1], block_config[0], block_config[3]) - if ps.placement == PassPlacement.Cpu: - cycles[PassCycles.Cpu] = arch.cpu_cycle_estimate(ps.ops[0]) - elif primary_op: + if ps.placement == PassPlacement.Npu and primary_op: skirt = primary_op.attrs.get("skirt", skirt) explicit_padding = primary_op.attrs.get("explicit_padding", explicit_padding) assert primary_op.type.npu_block_type == ps.npu_block_type diff --git a/ethosu/vela/register_command_stream_generator.py b/ethosu/vela/register_command_stream_generator.py index dd63d2ef..e612c301 100644 --- a/ethosu/vela/register_command_stream_generator.py +++ b/ethosu/vela/register_command_stream_generator.py @@ -1281,14 +1281,15 @@ def generate_register_command_stream(npu_op_list: List[NpuOperation], accelerato """ emit = CommandStreamEmitter() arch = ArchitectureFeatures( - vela_config=None, - system_config=None, + vela_config_files=None, accelerator_config=accelerator.value, + system_config=ArchitectureFeatures.DEFAULT_CONFIG, + memory_mode=ArchitectureFeatures.DEFAULT_CONFIG, override_block_config=None, block_config_limit=None, - global_memory_clock_scale=1.0, max_blockdep=ArchitectureFeatures.MAX_BLOCKDEP, weight_estimation_scaling=1.0, + verbose_config=False, ) generate_command_stream(emit, npu_op_list, arch) return emit.to_list() diff --git a/ethosu/vela/scheduler.py b/ethosu/vela/scheduler.py index 4af83a10..889bd06b 100644 --- a/ethosu/vela/scheduler.py +++ b/ethosu/vela/scheduler.py @@ -249,10 +249,6 @@ class DynamicProgrammingScheduler: self.n_combinations_searched = 0 - self.feature_maps_not_in_fast_storage = ( - arch.tensor_storage_mem_area[TensorPurpose.FeatureMap] != arch.fast_storage_mem_area - ) - self.pareto_max_candidates = 16 self.ifm_stream_npu_blocks = set( @@ -694,7 +690,7 @@ class DynamicProgrammingScheduler: all_candidates = [] for pred_pass in pred_pass_list: # recurse into the next pass - ifm_strat_data = self.search_ifm_streaming_body(pred_pass, self.feature_maps_not_in_fast_storage) + ifm_strat_data = self.search_ifm_streaming_body(pred_pass, self.arch.is_spilling_enabled()) strat_data = self.search_all_but_one_predecessor(ps, pred_pass, ifm_strat_data) for strat_opt in strat_data: @@ -1020,7 +1016,7 @@ class DynamicProgrammingScheduler: output.set_format(TensorFormat.NHCWB16, arch) for rewrite_op in rewrites: rewrite_op.outputs[0].set_format(TensorFormat.NHCWB16, arch) - if self.feature_maps_not_in_fast_storage: + if arch.is_spilling_enabled(): # Remember feature maps that can be moved to fast storage for later use # in use_fast_storage_for_feature_maps self.sg.scheduling_info["feature_map_rewrites"] = fast_storage_tensor_rewrites diff --git a/ethosu/vela/stats_writer.py b/ethosu/vela/stats_writer.py index 3cd769f0..e4b81561 100644 --- a/ethosu/vela/stats_writer.py +++ b/ethosu/vela/stats_writer.py @@ -46,7 +46,7 @@ def write_summary_metrics_csv(nng, summary_filename, arch): ] labels += ( - ["accelerator_configuration", "system_config", "npu_clock", "sram_size"] + ["accelerator_configuration", "system_config", "memory_mode", "core_clock", "sram_size"] + [area.identifier_name() + "_bandwidth" for area in mem_areas] + ["weights_storage_area", "feature_map_storage_area"] ) @@ -83,7 +83,13 @@ def write_summary_metrics_csv(nng, summary_filename, arch): if arch: data_items += ( - [arch.accelerator_config, arch.system_config, arch.npu_clock, arch.sram_size / 1024] + [ + arch.accelerator_config.name, + arch.system_config, + arch.memory_mode, + arch.core_clock, + arch.sram_size / 1024, + ] + [arch.memory_bandwidths_per_second[mem_area] / 1000.0 / 1000 / 1000 for mem_area in mem_areas] + [ arch.tensor_storage_mem_area[TensorPurpose.Weights].display_name(), @@ -91,7 +97,7 @@ def write_summary_metrics_csv(nng, summary_filename, arch): ] ) - midpoint_inference_time = nng.cycles[PassCycles.Total] / arch.npu_clock + midpoint_inference_time = nng.cycles[PassCycles.Total] / arch.core_clock if midpoint_inference_time > 0: midpoint_fps = 1 / midpoint_inference_time else: @@ -162,7 +168,6 @@ def write_pass_metrics_csv(nng, pass_filename): all_cycles = ( PassCycles.Total, PassCycles.Npu, - PassCycles.Cpu, PassCycles.SramAccess, PassCycles.DramAccess, PassCycles.OnChipFlashAccess, @@ -239,7 +244,7 @@ def print_performance_metrics_for_strat( orig_mem_areas_labels = [(v, v.display_name()) for v in mem_areas_to_report()] - midpoint_inference_time = cycles[PassCycles.Total] / arch.npu_clock + midpoint_inference_time = cycles[PassCycles.Total] / arch.core_clock if midpoint_inference_time > 0: midpoint_fps = 1 / midpoint_inference_time else: @@ -252,9 +257,10 @@ def print_performance_metrics_for_strat( if name: print("", file=f) print("Network summary for", name, file=f) - print("Accelerator configuration {:20}".format(arch.accelerator_config), file=f) - print("System configuration {:20}".format(arch.system_config), file=f) - print("Accelerator clock {:12d} MHz".format(int(arch.npu_clock / 1e6)), file=f) + print("Accelerator configuration {:>20}".format(arch.accelerator_config.name), file=f) + print("System configuration {:>20}".format(arch.system_config), file=f) + print("Memory mode {:>20}".format(arch.memory_mode), file=f) + print("Accelerator clock {:12d} MHz".format(int(arch.core_clock / 1e6)), file=f) for mem_area, label in mem_area_labels: print( "Design peak {:25} {:12.2f} GB/s".format( diff --git a/ethosu/vela/test/testutil.py b/ethosu/vela/test/testutil.py index 82588278..7cdd4f5e 100644 --- a/ethosu/vela/test/testutil.py +++ b/ethosu/vela/test/testutil.py @@ -28,14 +28,15 @@ from ethosu.vela.tensor import Tensor def create_arch(): return architecture_features.ArchitectureFeatures( - vela_config=None, - system_config=None, + vela_config_files=None, accelerator_config=architecture_features.Accelerator.Ethos_U55_128.value, + system_config=architecture_features.ArchitectureFeatures.DEFAULT_CONFIG, + memory_mode=architecture_features.ArchitectureFeatures.DEFAULT_CONFIG, override_block_config=None, block_config_limit=None, - global_memory_clock_scale=1.0, max_blockdep=0, weight_estimation_scaling=1.0, + verbose_config=False, ) diff --git a/ethosu/vela/vela.py b/ethosu/vela/vela.py index 6835607a..4f632d56 100644 --- a/ethosu/vela/vela.py +++ b/ethosu/vela/vela.py @@ -19,8 +19,7 @@ # Provides command line interface, options parsing, and network loading. Before calling the compiler driver. import argparse import ast -import configparser -import os.path +import os import sys import time @@ -196,13 +195,13 @@ def main(args=None): parser.add_argument( "--supported-ops-report", action="store_true", - help="Generate the SUPPORTED_OPS.md file in the current working directory and exits.", + help="Generate the SUPPORTED_OPS.md file in the current working directory and exit", ) + # set network nargs to be optional to allow the support-ops-report CLI option to be used standalone parser.add_argument( "network", metavar="NETWORK", type=str, default=None, nargs="?", help="Filename of network to process" ) - parser.add_argument( "--output-dir", type=str, default="output", help="Output directory to write files to (default: %(default)s)" ) @@ -212,9 +211,10 @@ def main(args=None): default=None, help="Enables the calculation and writing of a network debug database to output directory", ) - - parser.add_argument("--config", type=str, help="Location of vela configuration file") - + parser.add_argument( + "--config", type=str, action="append", help="Vela configuration file(s) in Python ConfigParser .ini file format" + ) + parser.add_argument("--verbose-config", action="store_true", help="Verbose system configuration and memory mode") parser.add_argument("--verbose-graph", action="store_true", help="Verbose graph rewriter") parser.add_argument("--verbose-quantization", action="store_true", help="Verbose quantization") parser.add_argument("--verbose-packing", action="store_true", help="Verbose pass packing") @@ -263,8 +263,14 @@ def main(args=None): parser.add_argument( "--system-config", type=str, - default="internal-default", - help="System configuration to use (default: %(default)s)", + default=architecture_features.ArchitectureFeatures.DEFAULT_CONFIG, + help="System configuration to select from the Vela configuration file (default: %(default)s)", + ) + parser.add_argument( + "--memory-mode", + type=str, + default=architecture_features.ArchitectureFeatures.DEFAULT_CONFIG, + help="Memory mode to select from the Vela configuration file (default: %(default)s)", ) parser.add_argument( "--tensor-allocator", @@ -291,15 +297,6 @@ def main(args=None): default=16, help="Limit block config search space, use zero for unlimited (default: %(default)s)", ) - parser.add_argument( - "--global-memory-clock-scale", - type=float, - default=1.0, - help=( - "Performs an additional scaling of the individual memory clock scales specified by the system config " - "(default: %(default)s)" - ), - ) parser.add_argument( "--pareto-metric", default=ParetoMetric.BwCycMem, @@ -344,14 +341,6 @@ def main(args=None): ) args = parser.parse_args(args=args) - # Read configuration file - config_file = args.config - config = None - if config_file is not None: - with open(config_file) as f: - config = configparser.ConfigParser() - config.read_file(f) - # Generate the supported ops report and exit if args.supported_ops_report: generate_supported_ops() @@ -360,6 +349,12 @@ def main(args=None): if args.network is None: parser.error("the following argument is required: NETWORK") + # check all config files exist because they will be read as a group + if args.config is not None: + for filename in args.config: + if not os.access(filename, os.R_OK): + raise InputFileError(filename, "File not found or is not readable.") + sys.setrecursionlimit(args.recursion_limit) if args.force_block_config: @@ -374,14 +369,15 @@ def main(args=None): parser.error("the following argument needs to be a power of 2: ALLOCATION_ALIGNMENT") arch = architecture_features.ArchitectureFeatures( - vela_config=config, + vela_config_files=args.config, system_config=args.system_config, + memory_mode=args.memory_mode, accelerator_config=args.accelerator_config, override_block_config=force_block_config, block_config_limit=args.block_config_limit, - global_memory_clock_scale=args.global_memory_clock_scale, max_blockdep=args.max_block_dependency, weight_estimation_scaling=args.weight_estimation_scaling, + verbose_config=args.verbose_config, ) compiler_options = compiler_driver.CompilerOptions( -- cgit v1.2.1