# Provenance: ethosu/vela/vela.py as added by commit
# 79d07d2cbf1c5013ab40bb46a6ccd4c569966536 ("Add Vela codebase"),
# From: Tim Hall, Date: Mon, 27 Apr 2020 18:20:16 +0100,
# Change-Id: I3690f8c8f5966306ecddaeb2793c30ca9c6e2eee
# (recovered from a cgit v1.2.1 patch view).
#
# Copyright (C) 2020 Arm Limited or its affiliates. All rights reserved.
#
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the License); you may
# not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an AS IS BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


# Description:
# Main entry point for the Vela compiler.
#
# Provides the command line interface, options parsing, and network loading
# that take place before the compiler driver is called.

import sys
import os.path
import os
import time
import subprocess
import configparser
import argparse
import ast

from . import architecture_features
from . import stats_writer
from . import tflite_writer
from . import model_reader
from . import compiler_driver
from . import scheduler
from ._version import __version__
from .scheduler import ParetoMetric
from .nn_graph import MemArea, TensorFormat, TensorAllocator, PassPlacement


def process(fname, arch, model_reader_options, compiler_options, scheduler_options):
    """Read a network, run the compiler driver on it and write the results.

    The network in `fname` is loaded with `model_reader.read_model`, passed to
    `compiler_driver.compiler_driver`, and then the pass-breakdown CSV, the
    summary CSV and the performance metrics are produced. If `fname` ends with
    ".tflite", a "<name>_vela.tflite" file is also written to the output
    directory. When `compiler_options.timing` is set, the time taken by each
    stage is printed.

    Args:
        fname: Filename of the network to process.
        arch: Architecture description; passed to the compiler driver and the
            stats writer, and its `system_config` is used in output filenames.
        model_reader_options: Options forwarded to `model_reader.read_model`.
        compiler_options: Compiler options; `timing`, `verbose_operators`,
            `output_dir` and `show_cpu_operations` are read here.
        scheduler_options: Options forwarded to the compiler driver.

    Returns:
        The network graph returned by `model_reader.read_model`.

    Raises:
        AssertionError: If the model reader returns a falsy result.
    """
    if compiler_options.timing:
        start = time.time()

    nng = model_reader.read_model(fname, model_reader_options)

    if not nng:
        print("reading of", fname, "failed")
        # Raise explicitly instead of using `assert False`: assert statements
        # are stripped under `python -O`, which would let a failed read fall
        # through and crash later on a falsy `nng`. AssertionError is kept so
        # that existing callers observe the same exception type.
        raise AssertionError("reading of %s failed" % fname)

    if compiler_options.verbose_operators:
        nng.print_operators()

    if compiler_options.timing:
        stop = time.time()
        print("Model reading took %f s" % (stop - start))
        start = time.time()

    compiler_driver.compiler_driver(nng, arch, compiler_options, scheduler_options)

    passes_csv_file = "%s/%s_pass-breakdown_%s.csv" % (compiler_options.output_dir, nng.name, arch.system_config)
    stats_writer.write_pass_metrics_csv(nng, passes_csv_file)

    summary_csv_file = "%s/%s_summary_%s.csv" % (compiler_options.output_dir, nng.name, arch.system_config)
    stats_writer.write_summary_metrics_csv(nng, summary_csv_file, arch)

    stats_writer.print_performance_metrics(nng, show_cpu_operations=compiler_options.show_cpu_operations, arch=arch)

    if fname.endswith(".tflite"):
        tflite_writer.write_tflite(nng, "%s/%s_vela.tflite" % (compiler_options.output_dir, nng.name))

    if compiler_options.timing:
        stop = time.time()
        # This interval covers the compiler driver call and the writing of the
        # statistics and output files that follows it.
        print("Compiler driver took %f s" % (stop - start))

    return nng


def print_subgraph_io_summary(nng):
    """Print a summary of the input and output tensor sizes of all subgraphs.

    For every subgraph placed on the NPU, each input, scratch and output tensor
    is listed with its size in KiB, followed by the subgraph's total tensor
    size and the amount of SRAM it uses. The largest subgraph total is printed
    at the end.

    Args:
        nng: The network graph whose subgraphs are summarised.
    """

    print("Subgraph IO Summary")
    print("-------------------")
    print("NNG: {0}".format(nng.name))
    max_sg_size = 0
    # Subgraphs are visited in the reverse of the order they are stored in.
    for sg in reversed(nng.subgraphs):
        print(" Subgraph: {0} = {1}".format(sg.name, sg.placement))
        sg_size = 0

        if sg.placement == PassPlacement.Npu:
            for tens in sg.input_tensors + [sg.scratch_tensor] + sg.output_tensors:
                # Anything that is neither an input nor an output (i.e. the
                # scratch tensor) is reported as "In/Out".
                if tens in sg.input_tensors:
                    tens_dir = "In"
                elif tens in sg.output_tensors:
                    tens_dir = "Out"
                else:
                    tens_dir = "In/Out"

                size = tens.elements() * tens.element_size() / 1024.0
                sg_size = sg_size + size
                print(" Tensor [{0}]: {1} = {2} KiB".format(tens_dir, tens.name, size))

            # NOTE(review): the totals are assumed to belong to the Npu branch;
            # confirm the nesting against the upstream file.
            print(" Total Size = {0} KiB".format(sg_size))
            print(" SRAM Memory Used = {0} KiB".format(sg.memory_used.get(MemArea.Sram, 0) / 1024.0))
            max_sg_size = max(sg_size, max_sg_size)

    print(" Maximum Subgraph Size = {0} KiB".format(max_sg_size))


def main(args=None):
    """Command line entry point of the Vela compiler.

    Parses the command line, optionally loads the Vela configuration file,
    builds the architecture, compiler, scheduler and model reader options,
    creates the output directory and processes the requested network.

    Args:
        args: List of command line argument strings. Defaults to
            `sys.argv[1:]` when None.

    Returns:
        0 once the network has been processed.
    """
    if args is None:
        args = sys.argv[1:]

    parser = argparse.ArgumentParser(prog="vela", description="Neural network model compiler for Ethos-U55")

    parser.add_argument(
        "network", metavar="NETWORK", type=str, default=None, nargs=None, help="Filename of network to process"
    )

    parser.add_argument("--version", action="version", version=__version__)
    parser.add_argument(
        "--output-dir", type=str, default="output", help="Output directory to write files to (default: %(default)s)"
    )
    parser.add_argument("--config", type=str, help="Location of vela configuration file")
    parser.add_argument("--batch-size", type=int, default=1, help="Batch size (default: %(default)s)")

    parser.add_argument("--verbose-graph", action="store_true", help="Verbose graph rewriter")
    parser.add_argument("--verbose-quantization", action="store_true", help="Verbose quantization")
    parser.add_argument("--verbose-packing", action="store_true", help="Verbose pass packing")
    parser.add_argument("--verbose-tensor-purpose", action="store_true", help="Verbose tensor purpose")
    parser.add_argument("--verbose-tensor-format", action="store_true", help="Verbose tensor format")
    parser.add_argument("--verbose-schedule", action="store_true", help="Verbose schedule")
    parser.add_argument(
        "--verbose-pareto-frontier-schedules",
        action="store_true",
        help="Show all schedules along the pareto frontier of optimisation criteria",
    )
    parser.add_argument("--verbose-allocation", action="store_true", help="Verbose tensor allocation")
    parser.add_argument(
        "--verbose-high-level-command-stream", action="store_true", help="Verbose high level command stream"
    )
    parser.add_argument(
        "--verbose-register-command-stream", action="store_true", help="Verbose register command stream"
    )
    parser.add_argument("--verbose-operators", action="store_true", help="Verbose operator list")

    parser.add_argument(
        "--show-minimum-possible-allocation", action="store_true", help="Show the minimum possible allocation"
    )
    parser.add_argument(
        "--show-cpu-operations", action="store_true", help="Show the operations that fall back to the CPU"
    )
    # Boolean options are given as the Python literals True/False on the
    # command line and converted with ast.literal_eval.
    parser.add_argument(
        "--cascading",
        type=ast.literal_eval,
        default=True,
        choices=[True, False],
        help="Controls the packing of multiple passes into a cascade (default: %(default)s)",
    )
    parser.add_argument(
        "--ifm-ofm-overlap",
        type=ast.literal_eval,
        default=True,
        choices=[True, False],
        help="Controls the overlapping of IFM and OFM buffers (default: %(default)s)",
    )
    parser.add_argument("--force-block-config", type=str, default="", help="Force a specific block configuration HxWxC")
    parser.add_argument(
        "--inter-pass-cycle-delay",
        type=int,
        default=0,
        help="Artificial delay between passes, measured in NPU cycles (default: %(default)s)",
    )
    parser.add_argument("--timing", action="store_true", help="Time the compiler doing operations")
    parser.add_argument(
        "--accelerator-config",
        type=str,
        default="ethos-u55-256",
        choices=list(architecture_features.ArchitectureFeatures.accelerator_configs.keys()),
        help="Accelerator configuration to use (default: %(default)s)",
    )
    parser.add_argument(
        "--system-config",
        type=str,
        default="internal-default",
        help="System configuration to use (default: %(default)s)",
    )
    parser.add_argument(
        "--dram-bandwidth",
        type=float,
        default=0.0,
        help="DRAM memory bandwidth in GB/s, use zero to select the value from system config (default: %(default)s)",
    )
    # Enum-valued options are looked up by member name.
    parser.add_argument(
        "--permanent-storage",
        default=MemArea.OffChipFlash,
        type=lambda s: MemArea[s],
        choices=list(MemArea)[3:-1],
        help=(
            "Memory area for permanent storage. To store the weights and other constant data in SRAM select "
            "'OnChipFlash' (default: %(default)s)"
        ),
    )
    parser.add_argument(
        "--tensor-allocator",
        default=TensorAllocator.Greedy,
        type=lambda s: TensorAllocator[s],
        choices=list(TensorAllocator),
        help="Tensor Allocator algorithm (default: %(default)s)",
    )
    parser.add_argument(
        "--show-subgraph-io-summary",
        action="store_true",
        help="Shows a summary of all the subgraphs and their inputs and outputs",
    )
    parser.add_argument(
        "--ifm-streaming",
        type=ast.literal_eval,
        default=True,
        choices=[True, False],
        help="Controls scheduler IFM streaming search (default: %(default)s)",
    )
    parser.add_argument(
        "--block-config-limit",
        type=int,
        default=16,
        help="Limit block config search space, use zero for unlimited (default: %(default)s)",
    )
    parser.add_argument(
        "--global-memory-clock-scale",
        type=float,
        default=1.0,
        help=(
            "Performs an additional scaling of the individual memory clock scales specified by the system config "
            "(default: %(default)s)"
        ),
    )
    parser.add_argument(
        "--pareto-metric",
        default=ParetoMetric.BwCycMem,
        type=lambda s: ParetoMetric[s],
        choices=list(ParetoMetric),
        help="Controls the calculation of the pareto metric (default: %(default)s)",
    )
    parser.add_argument(
        "--recursion-limit",
        type=int,
        default=10000,
        help="Set the recursion depth limit, may result in RecursionError if too low (default: %(default)s)",
    )
    parser.add_argument(
        "--max-block-dependency",
        type=int,
        default=architecture_features.ArchitectureFeatures.MAX_BLOCKDEP,
        choices=range(0, architecture_features.ArchitectureFeatures.MAX_BLOCKDEP + 1),
        help=(
            "Set the maximum value that can be used for the block dependency between npu kernel operations "
            "(default: %(default)s)"
        ),
    )

    args = parser.parse_args(args=args)

    # Read configuration file
    config_file = args.config
    config = None
    if config_file is not None:
        with open(config_file) as f:
            config = configparser.ConfigParser()
            config.read_file(f)

    if args.network is None:
        parser.error("the following argument is required: NETWORK")

    sys.setrecursionlimit(args.recursion_limit)

    if args.force_block_config:
        force_block_config = architecture_features.Block.from_string(args.force_block_config)
    else:
        force_block_config = None

    arch = architecture_features.ArchitectureFeatures(
        vela_config=config,
        system_config=args.system_config,
        accelerator_config=args.accelerator_config,
        permanent_storage=args.permanent_storage,
        inter_pass_cycle_delay=args.inter_pass_cycle_delay,
        dram_bandwidth=args.dram_bandwidth,
        override_block_config=force_block_config,
        block_config_limit=args.block_config_limit,
        global_memory_clock_scale=args.global_memory_clock_scale,
        max_blockdep=args.max_block_dependency,
    )

    compiler_options = compiler_driver.CompilerOptions(
        verbose_graph=args.verbose_graph,
        verbose_quantization=args.verbose_quantization,
        verbose_packing=args.verbose_packing,
        verbose_tensor_purpose=args.verbose_tensor_purpose,
        verbose_tensor_format=args.verbose_tensor_format,
        verbose_allocation=args.verbose_allocation,
        verbose_high_level_command_stream=args.verbose_high_level_command_stream,
        verbose_register_command_stream=args.verbose_register_command_stream,
        verbose_operators=args.verbose_operators,
        show_minimum_possible_allocation=args.show_minimum_possible_allocation,
        show_cpu_operations=args.show_cpu_operations,
        tensor_allocator=args.tensor_allocator,
        timing=args.timing,
        output_dir=args.output_dir,
    )

    scheduler_options = scheduler.SchedulerOptions(
        use_cascading=args.cascading,
        use_ifm_ofm_overlap=args.ifm_ofm_overlap,
        verbose_schedule=args.verbose_schedule,
        verbose_pareto_frontier_schedules=args.verbose_pareto_frontier_schedules,
        use_ifm_streaming=args.ifm_streaming,
        pareto_metric=args.pareto_metric,
    )

    model_reader_options = model_reader.ModelReaderOptions(batch_size=args.batch_size)

    # The output directory must exist before process() writes its files.
    os.makedirs(args.output_dir, exist_ok=True)

    nng = process(args.network, arch, model_reader_options, compiler_options, scheduler_options)

    if args.show_subgraph_io_summary:
        print_subgraph_io_summary(nng)

    return 0