From 79d07d2cbf1c5013ab40bb46a6ccd4c569966536 Mon Sep 17 00:00:00 2001
From: Tim Hall <tim.hall@arm.com>
Date: Mon, 27 Apr 2020 18:20:16 +0100
Subject: Add Vela codebase

 - Added modules ethosu.vela and ethosu.mlw_codec.
 - Added README and various configuration files.

Change-Id: I3690f8c8f5966306ecddaeb2793c30ca9c6e2eee
---
 ethosu/vela/vela.py | 334 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 334 insertions(+)
 create mode 100644 ethosu/vela/vela.py

(limited to 'ethosu/vela/vela.py')

diff --git a/ethosu/vela/vela.py b/ethosu/vela/vela.py
new file mode 100644
index 00000000..f07aec89
--- /dev/null
+++ b/ethosu/vela/vela.py
@@ -0,0 +1,334 @@
+# Copyright (C) 2020 Arm Limited or its affiliates. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the License); you may
+# not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an AS IS BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+# Description:
+# Main entry point for the Vela compiler.
+#
+# Provides command line interface, options parsing, and network loading. Before calling the compiler driver.
+
+import sys
+import os.path
+import os
+import time
+import subprocess
+import configparser
+import argparse
+import ast
+
+from . import architecture_features
+from . import stats_writer
+from . import tflite_writer
+from . import model_reader
+from . import compiler_driver
+from . import scheduler
+from ._version import __version__
+from .scheduler import ParetoMetric
+from .nn_graph import MemArea, TensorFormat, TensorAllocator, PassPlacement
+
+
+def process(fname, arch, model_reader_options, compiler_options, scheduler_options):
+    if compiler_options.timing:
+        start = time.time()
+
+    nng = model_reader.read_model(fname, model_reader_options)
+
+    if not nng:
+        print("reading of", fname, "failed")
+        assert False
+
+    if compiler_options.verbose_operators:
+        nng.print_operators()
+
+    if compiler_options.timing:
+        stop = time.time()
+        print("Model reading took %f s" % (stop - start))
+        start = time.time()
+
+    compiler_driver.compiler_driver(nng, arch, compiler_options, scheduler_options)
+
+    passes_csv_file = "%s/%s_pass-breakdown_%s.csv" % (compiler_options.output_dir, nng.name, arch.system_config)
+    stats_writer.write_pass_metrics_csv(nng, passes_csv_file)
+
+    summary_csv_file = "%s/%s_summary_%s.csv" % (compiler_options.output_dir, nng.name, arch.system_config)
+    stats_writer.write_summary_metrics_csv(nng, summary_csv_file, arch)
+
+    stats_writer.print_performance_metrics(nng, show_cpu_operations=compiler_options.show_cpu_operations, arch=arch)
+
+    if fname.endswith(".tflite"):
+        tflite_writer.write_tflite(nng, "%s/%s_vela.tflite" % (compiler_options.output_dir, nng.name))
+
+    if compiler_options.timing:
+        stop = time.time()
+        print("Compiler driver took %f s" % (stop - start))
+
+    return nng
+
+
+def print_subgraph_io_summary(nng):
+    """Print a summary of all the input and output tensor sizes for all subgraphs.
+    Also displays the total tensor size and the memory used area for sram.
+    """
+
+    print("Subgraph IO Summary")
+    print("-------------------")
+    print("NNG: {0}".format(nng.name))
+    max_sg_size = 0
+    for sg in reversed(nng.subgraphs):
+        print("   Subgraph: {0} = {1}".format(sg.name, sg.placement))
+        sg_size = 0
+
+        if sg.placement == PassPlacement.Npu:
+            for tens in sg.input_tensors + [sg.scratch_tensor] + sg.output_tensors:
+                if tens in sg.input_tensors:
+                    tens_dir = "In"
+                elif tens in sg.output_tensors:
+                    tens_dir = "Out"
+                else:
+                    tens_dir = "In/Out"
+
+                size = tens.elements() * tens.element_size() / 1024.0
+                sg_size = sg_size + size
+                print("         Tensor [{0}]: {1} = {2} KiB".format(tens_dir, tens.name, size))
+
+        print("      Total Size = {0} KiB".format(sg_size))
+        print("      SRAM Memory Used = {0} KiB".format(sg.memory_used.get(MemArea.Sram, 0) / 1024.0))
+        max_sg_size = max(sg_size, max_sg_size)
+
+    print("   Maximum Subgraph Size = {0} KiB".format(max_sg_size))
+
+
+def main(args=None):
+    if args is None:
+        args = sys.argv[1:]
+
+    parser = argparse.ArgumentParser(prog="vela", description="Neural network model compiler for Ethos-U55")
+
+    parser.add_argument(
+        "network", metavar="NETWORK", type=str, default=None, nargs=None, help="Filename of network to process"
+    )
+
+    parser.add_argument("--version", action="version", version=__version__)
+    parser.add_argument(
+        "--output-dir", type=str, default="output", help="Output directory to write files to (default: %(default)s)"
+    )
+    parser.add_argument("--config", type=str, help="Location of vela configuration file")
+    parser.add_argument("--batch-size", type=int, default=1, help="Batch size (default: %(default)s)")
+
+    parser.add_argument("--verbose-graph", action="store_true", help="Verbose graph rewriter")
+    parser.add_argument("--verbose-quantization", action="store_true", help="Verbose quantization")
+    parser.add_argument("--verbose-packing", action="store_true", help="Verbose pass packing")
+    parser.add_argument("--verbose-tensor-purpose", action="store_true", help="Verbose tensor purpose")
+    parser.add_argument("--verbose-tensor-format", action="store_true", help="Verbose tensor format")
+    parser.add_argument("--verbose-schedule", action="store_true", help="Verbose schedule")
+    parser.add_argument(
+        "--verbose-pareto-frontier-schedules",
+        action="store_true",
+        help="Show all schedules along the pareto frontier of optimisation criteria",
+    )
+    parser.add_argument("--verbose-allocation", action="store_true", help="Verbose tensor allocation")
+    parser.add_argument(
+        "--verbose-high-level-command-stream", action="store_true", help="Verbose high level command stream"
+    )
+    parser.add_argument(
+        "--verbose-register-command-stream", action="store_true", help="Verbose register command stream"
+    )
+    parser.add_argument("--verbose-operators", action="store_true", help="Verbose operator list")
+
+    parser.add_argument(
+        "--show-minimum-possible-allocation", action="store_true", help="Show the minimum possible allocation"
+    )
+    parser.add_argument(
+        "--show-cpu-operations", action="store_true", help="Show the operations that fall back to the CPU"
+    )
+    parser.add_argument(
+        "--cascading",
+        type=ast.literal_eval,
+        default=True,
+        choices=[True, False],
+        help="Controls the packing of multiple passes into a cascade (default: %(default)s)",
+    )
+    parser.add_argument(
+        "--ifm-ofm-overlap",
+        type=ast.literal_eval,
+        default=True,
+        choices=[True, False],
+        help="Controls the overlapping of IFM and OFM buffers (default: %(default)s)",
+    )
+    parser.add_argument("--force-block-config", type=str, default="", help="Force a specific block configuration HxWxC")
+    parser.add_argument(
+        "--inter-pass-cycle-delay",
+        type=int,
+        default=0,
+        help="Artificial delay between passes, measured in NPU cycles (default: %(default)s)",
+    )
+    parser.add_argument("--timing", action="store_true", help="Time the compiler doing operations")
+    parser.add_argument(
+        "--accelerator-config",
+        type=str,
+        default="ethos-u55-256",
+        choices=list(architecture_features.ArchitectureFeatures.accelerator_configs.keys()),
+        help="Accelerator configuration to use (default: %(default)s)",
+    )
+    parser.add_argument(
+        "--system-config",
+        type=str,
+        default="internal-default",
+        help="System configuration to use (default: %(default)s)",
+    )
+    parser.add_argument(
+        "--dram-bandwidth",
+        type=float,
+        default=0.0,
+        help="DRAM memory bandwidth in GB/s, use zero to select the value from system config (default: %(default)s)",
+    )
+    parser.add_argument(
+        "--permanent-storage",
+        default=MemArea.OffChipFlash,
+        type=lambda s: MemArea[s],
+        choices=list(MemArea)[3:-1],
+        help=(
+            "Memory area for permanent storage. To store the weights and other constant data in SRAM select "
+            "'OnChipFlash' (default: %(default)s)"
+        ),
+    )
+    parser.add_argument(
+        "--tensor-allocator",
+        default=TensorAllocator.Greedy,
+        type=lambda s: TensorAllocator[s],
+        choices=list(TensorAllocator),
+        help="Tensor Allocator algorithm (default: %(default)s)",
+    )
+    parser.add_argument(
+        "--show-subgraph-io-summary",
+        action="store_true",
+        help="Shows a summary of all the subgraphs and their inputs and outputs",
+    )
+    parser.add_argument(
+        "--ifm-streaming",
+        type=ast.literal_eval,
+        default=True,
+        choices=[True, False],
+        help="Controls scheduler IFM streaming search (default: %(default)s)",
+    )
+    parser.add_argument(
+        "--block-config-limit",
+        type=int,
+        default=16,
+        help="Limit block config search space, use zero for unlimited (default: %(default)s)",
+    )
+    parser.add_argument(
+        "--global-memory-clock-scale",
+        type=float,
+        default=1.0,
+        help=(
+            "Performs an additional scaling of the individual memory clock scales specified by the system config "
+            "(default: %(default)s)"
+        ),
+    )
+    parser.add_argument(
+        "--pareto-metric",
+        default=ParetoMetric.BwCycMem,
+        type=lambda s: ParetoMetric[s],
+        choices=list(ParetoMetric),
+        help="Controls the calculation of the pareto metric (default: %(default)s)",
+    )
+    parser.add_argument(
+        "--recursion-limit",
+        type=int,
+        default=10000,
+        help="Set the recursion depth limit, may result in RecursionError if too low (default: %(default)s)",
+    )
+    parser.add_argument(
+        "--max-block-dependency",
+        type=int,
+        default=architecture_features.ArchitectureFeatures.MAX_BLOCKDEP,
+        choices=range(0, architecture_features.ArchitectureFeatures.MAX_BLOCKDEP + 1),
+        help=(
+            "Set the maximum value that can be used for the block dependency between npu kernel operations "
+            "(default: %(default)s)"
+        ),
+    )
+
+    args = parser.parse_args(args=args)
+
+    # Read configuration file
+    config_file = args.config
+    config = None
+    if config_file is not None:
+        with open(config_file) as f:
+            config = configparser.ConfigParser()
+            config.read_file(f)
+
+    if args.network is None:
+        parser.error("the following argument is required: NETWORK")
+
+    sys.setrecursionlimit(args.recursion_limit)
+
+    if args.force_block_config:
+        force_block_config = architecture_features.Block.from_string(args.force_block_config)
+    else:
+        force_block_config = None
+
+    arch = architecture_features.ArchitectureFeatures(
+        vela_config=config,
+        system_config=args.system_config,
+        accelerator_config=args.accelerator_config,
+        permanent_storage=args.permanent_storage,
+        inter_pass_cycle_delay=args.inter_pass_cycle_delay,
+        dram_bandwidth=args.dram_bandwidth,
+        override_block_config=force_block_config,
+        block_config_limit=args.block_config_limit,
+        global_memory_clock_scale=args.global_memory_clock_scale,
+        max_blockdep=args.max_block_dependency,
+    )
+
+    compiler_options = compiler_driver.CompilerOptions(
+        verbose_graph=args.verbose_graph,
+        verbose_quantization=args.verbose_quantization,
+        verbose_packing=args.verbose_packing,
+        verbose_tensor_purpose=args.verbose_tensor_purpose,
+        verbose_tensor_format=args.verbose_tensor_format,
+        verbose_allocation=args.verbose_allocation,
+        verbose_high_level_command_stream=args.verbose_high_level_command_stream,
+        verbose_register_command_stream=args.verbose_register_command_stream,
+        verbose_operators=args.verbose_operators,
+        show_minimum_possible_allocation=args.show_minimum_possible_allocation,
+        show_cpu_operations=args.show_cpu_operations,
+        tensor_allocator=args.tensor_allocator,
+        timing=args.timing,
+        output_dir=args.output_dir,
+    )
+
+    scheduler_options = scheduler.SchedulerOptions(
+        use_cascading=args.cascading,
+        use_ifm_ofm_overlap=args.ifm_ofm_overlap,
+        verbose_schedule=args.verbose_schedule,
+        verbose_pareto_frontier_schedules=args.verbose_pareto_frontier_schedules,
+        use_ifm_streaming=args.ifm_streaming,
+        pareto_metric=args.pareto_metric,
+    )
+
+    model_reader_options = model_reader.ModelReaderOptions(batch_size=args.batch_size)
+
+    os.makedirs(args.output_dir, exist_ok=True)
+
+    nng = process(args.network, arch, model_reader_options, compiler_options, scheduler_options)
+
+    if args.show_subgraph_io_summary:
+        print_subgraph_io_summary(nng)
+
+    return 0
-- 
cgit v1.2.1