authorTim Hall <tim.hall@arm.com>2020-04-27 18:20:16 +0100
committerTim Hall <tim.hall@arm.com>2020-04-29 13:00:51 +0100
commit79d07d2cbf1c5013ab40bb46a6ccd4c569966536 (patch)
tree410d17239b417be5593b3e6800001b797f8d3f98
parent47bca71566d4d10e48f5a4d66e1130b8bf60700d (diff)
downloadethos-u-vela-79d07d2cbf1c5013ab40bb46a6ccd4c569966536.tar.gz
Add Vela codebase (tag: 0.1.0)

- Added modules ethosu.vela and ethosu.mlw_codec.
- Added README and various configuration files.

Change-Id: I3690f8c8f5966306ecddaeb2793c30ca9c6e2eee
-rw-r--r--Pipfile9
-rw-r--r--Pipfile.lock56
-rw-r--r--README.md112
-rw-r--r--ethosu/mlw_codec/makefile49
-rw-r--r--ethosu/mlw_codec/mlw_codecmodule.c174
-rw-r--r--ethosu/mlw_codec/mlw_common.h29
-rw-r--r--ethosu/mlw_codec/mlw_decode.c300
-rw-r--r--ethosu/mlw_codec/mlw_decode.h42
-rw-r--r--ethosu/mlw_codec/mlw_encode.c874
-rw-r--r--ethosu/mlw_codec/mlw_encode.h45
-rw-r--r--ethosu/mlw_codec/mlw_main.c177
-rw-r--r--ethosu/mlw_codec/test_mlw_codec.py43
-rw-r--r--ethosu/vela/__init__.py20
-rw-r--r--ethosu/vela/__main__.py22
-rw-r--r--ethosu/vela/_version.py19
-rw-r--r--ethosu/vela/architecture_features.py618
-rw-r--r--ethosu/vela/compiler_driver.py204
-rw-r--r--ethosu/vela/data_type.py116
-rw-r--r--ethosu/vela/driver_actions.py107
-rw-r--r--ethosu/vela/ethos_u55_regs/ethos_u55_regs.py3138
-rw-r--r--ethosu/vela/extract_npu_subgraphs.py253
-rw-r--r--ethosu/vela/graph_optimiser.py485
-rw-r--r--ethosu/vela/greedy_allocation.py95
-rw-r--r--ethosu/vela/high_level_command_stream.py365
-rw-r--r--ethosu/vela/high_level_command_stream_generator.py315
-rw-r--r--ethosu/vela/insert_dma.py60
-rw-r--r--ethosu/vela/live_range.py324
-rw-r--r--ethosu/vela/mark_tensors.py363
-rw-r--r--ethosu/vela/model_reader.py45
-rw-r--r--ethosu/vela/nn_graph.py548
-rw-r--r--ethosu/vela/npu_performance.py516
-rw-r--r--ethosu/vela/npu_serialisation.py145
-rw-r--r--ethosu/vela/numeric_util.py89
-rw-r--r--ethosu/vela/operation.py285
-rw-r--r--ethosu/vela/pass_packing.py489
-rw-r--r--ethosu/vela/range_set.py154
-rw-r--r--ethosu/vela/register_command_stream_generator.py945
-rw-r--r--ethosu/vela/rewrite_graph.py171
-rw-r--r--ethosu/vela/scaling.py91
-rw-r--r--ethosu/vela/scheduler.py949
-rw-r--r--ethosu/vela/shared_buffer_allocation.py199
-rw-r--r--ethosu/vela/stats_writer.py367
-rw-r--r--ethosu/vela/supported_operators.py243
-rw-r--r--ethosu/vela/tensor.py629
-rw-r--r--ethosu/vela/tensor_allocation.py139
-rw-r--r--ethosu/vela/tflite/AbsOptions.py22
-rw-r--r--ethosu/vela/tflite/ActivationFunctionType.py11
-rw-r--r--ethosu/vela/tflite/AddNOptions.py22
-rw-r--r--ethosu/vela/tflite/AddOptions.py30
-rw-r--r--ethosu/vela/tflite/ArgMaxOptions.py30
-rw-r--r--ethosu/vela/tflite/ArgMinOptions.py30
-rw-r--r--ethosu/vela/tflite/BatchToSpaceNDOptions.py22
-rw-r--r--ethosu/vela/tflite/BidirectionalSequenceLSTMOptions.py62
-rw-r--r--ethosu/vela/tflite/BidirectionalSequenceRNNOptions.py46
-rw-r--r--ethosu/vela/tflite/Buffer.py46
-rw-r--r--ethosu/vela/tflite/BuiltinOperator.py131
-rw-r--r--ethosu/vela/tflite/BuiltinOptions.py106
-rw-r--r--ethosu/vela/tflite/CallOptions.py30
-rw-r--r--ethosu/vela/tflite/CastOptions.py38
-rw-r--r--ethosu/vela/tflite/CombinerType.py8
-rw-r--r--ethosu/vela/tflite/ConcatEmbeddingsOptions.py78
-rw-r--r--ethosu/vela/tflite/ConcatenationOptions.py38
-rw-r--r--ethosu/vela/tflite/Conv2DOptions.py70
-rw-r--r--ethosu/vela/tflite/CosOptions.py22
-rw-r--r--ethosu/vela/tflite/CustomOptionsFormat.py6
-rw-r--r--ethosu/vela/tflite/CustomQuantization.py46
-rw-r--r--ethosu/vela/tflite/DensifyOptions.py22
-rw-r--r--ethosu/vela/tflite/DepthToSpaceOptions.py30
-rw-r--r--ethosu/vela/tflite/DepthwiseConv2DOptions.py78
-rw-r--r--ethosu/vela/tflite/DequantizeOptions.py22
-rw-r--r--ethosu/vela/tflite/DimensionMetadata.py76
-rw-r--r--ethosu/vela/tflite/DimensionType.py7
-rw-r--r--ethosu/vela/tflite/DivOptions.py30
-rw-r--r--ethosu/vela/tflite/EmbeddingLookupSparseOptions.py30
-rw-r--r--ethosu/vela/tflite/EqualOptions.py22
-rw-r--r--ethosu/vela/tflite/ExpOptions.py22
-rw-r--r--ethosu/vela/tflite/ExpandDimsOptions.py22
-rw-r--r--ethosu/vela/tflite/FakeQuantOptions.py54
-rw-r--r--ethosu/vela/tflite/FillOptions.py22
-rw-r--r--ethosu/vela/tflite/FloorDivOptions.py22
-rw-r--r--ethosu/vela/tflite/FloorModOptions.py22
-rw-r--r--ethosu/vela/tflite/FullyConnectedOptions.py46
-rw-r--r--ethosu/vela/tflite/FullyConnectedOptionsWeightsFormat.py7
-rw-r--r--ethosu/vela/tflite/GatherNdOptions.py22
-rw-r--r--ethosu/vela/tflite/GatherOptions.py30
-rw-r--r--ethosu/vela/tflite/GreaterEqualOptions.py22
-rw-r--r--ethosu/vela/tflite/GreaterOptions.py22
-rw-r--r--ethosu/vela/tflite/HardSwishOptions.py22
-rw-r--r--ethosu/vela/tflite/IfOptions.py38
-rw-r--r--ethosu/vela/tflite/Int32Vector.py46
-rw-r--r--ethosu/vela/tflite/L2NormOptions.py30
-rw-r--r--ethosu/vela/tflite/LSHProjectionOptions.py30
-rw-r--r--ethosu/vela/tflite/LSHProjectionType.py8
-rw-r--r--ethosu/vela/tflite/LSTMKernelType.py7
-rw-r--r--ethosu/vela/tflite/LSTMOptions.py54
-rw-r--r--ethosu/vela/tflite/LeakyReluOptions.py30
-rw-r--r--ethosu/vela/tflite/LessEqualOptions.py22
-rw-r--r--ethosu/vela/tflite/LessOptions.py22
-rw-r--r--ethosu/vela/tflite/LocalResponseNormalizationOptions.py54
-rw-r--r--ethosu/vela/tflite/LogSoftmaxOptions.py22
-rw-r--r--ethosu/vela/tflite/LogicalAndOptions.py22
-rw-r--r--ethosu/vela/tflite/LogicalNotOptions.py22
-rw-r--r--ethosu/vela/tflite/LogicalOrOptions.py22
-rw-r--r--ethosu/vela/tflite/MatrixDiagOptions.py22
-rw-r--r--ethosu/vela/tflite/MatrixSetDiagOptions.py22
-rw-r--r--ethosu/vela/tflite/MaximumMinimumOptions.py22
-rw-r--r--ethosu/vela/tflite/Metadata.py38
-rw-r--r--ethosu/vela/tflite/MirrorPadMode.py7
-rw-r--r--ethosu/vela/tflite/MirrorPadOptions.py30
-rw-r--r--ethosu/vela/tflite/Model.py150
-rw-r--r--ethosu/vela/tflite/MulOptions.py30
-rw-r--r--ethosu/vela/tflite/NegOptions.py22
-rw-r--r--ethosu/vela/tflite/NonMaxSuppressionV4Options.py22
-rw-r--r--ethosu/vela/tflite/NonMaxSuppressionV5Options.py22
-rw-r--r--ethosu/vela/tflite/NotEqualOptions.py22
-rw-r--r--ethosu/vela/tflite/OneHotOptions.py30
-rw-r--r--ethosu/vela/tflite/Operator.py177
-rw-r--r--ethosu/vela/tflite/OperatorCode.py46
-rw-r--r--ethosu/vela/tflite/PackOptions.py38
-rw-r--r--ethosu/vela/tflite/PadOptions.py22
-rw-r--r--ethosu/vela/tflite/PadV2Options.py22
-rw-r--r--ethosu/vela/tflite/Padding.py7
-rw-r--r--ethosu/vela/tflite/Pool2DOptions.py70
-rw-r--r--ethosu/vela/tflite/PowOptions.py22
-rw-r--r--ethosu/vela/tflite/QuantizationDetails.py7
-rw-r--r--ethosu/vela/tflite/QuantizationParameters.py145
-rw-r--r--ethosu/vela/tflite/QuantizeOptions.py22
-rw-r--r--ethosu/vela/tflite/RNNOptions.py30
-rw-r--r--ethosu/vela/tflite/RangeOptions.py22
-rw-r--r--ethosu/vela/tflite/RankOptions.py22
-rw-r--r--ethosu/vela/tflite/ReducerOptions.py30
-rw-r--r--ethosu/vela/tflite/ReshapeOptions.py46
-rw-r--r--ethosu/vela/tflite/ResizeBilinearOptions.py38
-rw-r--r--ethosu/vela/tflite/ResizeNearestNeighborOptions.py30
-rw-r--r--ethosu/vela/tflite/ReverseSequenceOptions.py38
-rw-r--r--ethosu/vela/tflite/ReverseV2Options.py22
-rw-r--r--ethosu/vela/tflite/SVDFOptions.py38
-rw-r--r--ethosu/vela/tflite/ScatterNdOptions.py22
-rw-r--r--ethosu/vela/tflite/SegmentSumOptions.py22
-rw-r--r--ethosu/vela/tflite/SelectOptions.py22
-rw-r--r--ethosu/vela/tflite/SelectV2Options.py22
-rw-r--r--ethosu/vela/tflite/SequenceRNNOptions.py38
-rw-r--r--ethosu/vela/tflite/ShapeOptions.py30
-rw-r--r--ethosu/vela/tflite/SkipGramOptions.py46
-rw-r--r--ethosu/vela/tflite/SliceOptions.py22
-rw-r--r--ethosu/vela/tflite/SoftmaxOptions.py30
-rw-r--r--ethosu/vela/tflite/SpaceToBatchNDOptions.py22
-rw-r--r--ethosu/vela/tflite/SpaceToDepthOptions.py30
-rw-r--r--ethosu/vela/tflite/SparseIndexVector.py9
-rw-r--r--ethosu/vela/tflite/SparseToDenseOptions.py30
-rw-r--r--ethosu/vela/tflite/SparsityParameters.py92
-rw-r--r--ethosu/vela/tflite/SplitOptions.py30
-rw-r--r--ethosu/vela/tflite/SplitVOptions.py30
-rw-r--r--ethosu/vela/tflite/SquareOptions.py22
-rw-r--r--ethosu/vela/tflite/SquaredDifferenceOptions.py22
-rw-r--r--ethosu/vela/tflite/SqueezeOptions.py46
-rw-r--r--ethosu/vela/tflite/StridedSliceOptions.py62
-rw-r--r--ethosu/vela/tflite/SubGraph.py122
-rw-r--r--ethosu/vela/tflite/SubOptions.py30
-rw-r--r--ethosu/vela/tflite/Tensor.py126
-rw-r--r--ethosu/vela/tflite/TensorType.py15
-rw-r--r--ethosu/vela/tflite/TileOptions.py22
-rw-r--r--ethosu/vela/tflite/TopKV2Options.py22
-rw-r--r--ethosu/vela/tflite/TransposeConvOptions.py46
-rw-r--r--ethosu/vela/tflite/TransposeOptions.py22
-rw-r--r--ethosu/vela/tflite/Uint16Vector.py46
-rw-r--r--ethosu/vela/tflite/Uint8Vector.py46
-rw-r--r--ethosu/vela/tflite/UnidirectionalSequenceLSTMOptions.py54
-rw-r--r--ethosu/vela/tflite/UniqueOptions.py30
-rw-r--r--ethosu/vela/tflite/UnpackOptions.py38
-rw-r--r--ethosu/vela/tflite/WhereOptions.py22
-rw-r--r--ethosu/vela/tflite/WhileOptions.py38
-rw-r--r--ethosu/vela/tflite/ZerosLikeOptions.py22
-rw-r--r--ethosu/vela/tflite/__init__.py0
-rw-r--r--ethosu/vela/tflite_mapping.py644
-rw-r--r--ethosu/vela/tflite_reader.py252
-rw-r--r--ethosu/vela/tflite_writer.py424
-rw-r--r--ethosu/vela/vela.py334
-rw-r--r--ethosu/vela/weight_compressor.py387
-rw-r--r--setup.py63
180 files changed, 21180 insertions, 0 deletions
diff --git a/Pipfile b/Pipfile
new file mode 100644
index 00000000..300bef65
--- /dev/null
+++ b/Pipfile
@@ -0,0 +1,9 @@
+[[source]]
+name = "pypi"
+url = "https://pypi.org/simple"
+verify_ssl = true
+
+[dev-packages]
+
+[packages]
+ethos-u-vela = {editable = true,path = "."}
diff --git a/Pipfile.lock b/Pipfile.lock
new file mode 100644
index 00000000..6fa01549
--- /dev/null
+++ b/Pipfile.lock
@@ -0,0 +1,56 @@
+{
+ "_meta": {
+ "hash": {
+ "sha256": "2d930644f3f81f11dae3317cae890fe083479342c80da44161b46ac83d6972d5"
+ },
+ "pipfile-spec": 6,
+ "requires": {},
+ "sources": [
+ {
+ "name": "pypi",
+ "url": "https://pypi.org/simple",
+ "verify_ssl": true
+ }
+ ]
+ },
+ "default": {
+ "ethos-u-vela": {
+ "editable": true,
+ "path": "."
+ },
+ "flatbuffers": {
+ "hashes": [
+ "sha256:776a959c5f70b41819fa75de44ed14fd984fa1a79b378f27e6f4fff338cbdca2",
+ "sha256:f24185db54193540e3d684dc98aa7c2d89882341641548ceb36fd2589fef6c4e"
+ ],
+ "version": "==1.11.0"
+ },
+ "numpy": {
+ "hashes": [
+ "sha256:1598a6de323508cfeed6b7cd6c4efb43324f4692e20d1f76e1feec7f59013448",
+ "sha256:1b0ece94018ae21163d1f651b527156e1f03943b986188dd81bc7e066eae9d1c",
+ "sha256:2e40be731ad618cb4974d5ba60d373cdf4f1b8dcbf1dcf4d9dff5e212baf69c5",
+ "sha256:4ba59db1fcc27ea31368af524dcf874d9277f21fd2e1f7f1e2e0c75ee61419ed",
+ "sha256:59ca9c6592da581a03d42cc4e270732552243dc45e87248aa8d636d53812f6a5",
+ "sha256:5e0feb76849ca3e83dd396254e47c7dba65b3fa9ed3df67c2556293ae3e16de3",
+ "sha256:6d205249a0293e62bbb3898c4c2e1ff8a22f98375a34775a259a0523111a8f6c",
+ "sha256:6fcc5a3990e269f86d388f165a089259893851437b904f422d301cdce4ff25c8",
+ "sha256:82847f2765835c8e5308f136bc34018d09b49037ec23ecc42b246424c767056b",
+ "sha256:87902e5c03355335fc5992a74ba0247a70d937f326d852fc613b7f53516c0963",
+ "sha256:9ab21d1cb156a620d3999dd92f7d1c86824c622873841d6b080ca5495fa10fef",
+ "sha256:a1baa1dc8ecd88fb2d2a651671a84b9938461e8a8eed13e2f0a812a94084d1fa",
+ "sha256:a244f7af80dacf21054386539699ce29bcc64796ed9850c99a34b41305630286",
+ "sha256:a35af656a7ba1d3decdd4fae5322b87277de8ac98b7d9da657d9e212ece76a61",
+ "sha256:b1fe1a6f3a6f355f6c29789b5927f8bd4f134a4bd9a781099a7c4f66af8850f5",
+ "sha256:b5ad0adb51b2dee7d0ee75a69e9871e2ddfb061c73ea8bc439376298141f77f5",
+ "sha256:ba3c7a2814ec8a176bb71f91478293d633c08582119e713a0c5351c0f77698da",
+ "sha256:cd77d58fb2acf57c1d1ee2835567cd70e6f1835e32090538f17f8a3a99e5e34b",
+ "sha256:cdb3a70285e8220875e4d2bc394e49b4988bdb1298ffa4e0bd81b2f613be397c",
+ "sha256:deb529c40c3f1e38d53d5ae6cd077c21f1d49e13afc7936f7f868455e16b64a0",
+ "sha256:e7894793e6e8540dbeac77c87b489e331947813511108ae097f1715c018b8f3d"
+ ],
+ "version": "==1.18.2"
+ }
+ },
+ "develop": {}
+}
diff --git a/README.md b/README.md
new file mode 100644
index 00000000..03ad7fec
--- /dev/null
+++ b/README.md
@@ -0,0 +1,112 @@
+# Vela
+This tool is used to compile a [TensorFlow Lite for Microcontrollers](https://www.tensorflow.org/lite/microcontrollers) neural network model into an optimised version that can run on an embedded system containing an [Ethos-U55 NPU](https://www.arm.com/products/silicon-ip-cpu/ethos/ethos-u55).
+
+The optimised model will contain TensorFlow Lite Custom operators for those parts of the model that can be accelerated by the Ethos-U55. Parts of the model that cannot be accelerated are left unchanged and will instead run on the Cortex-M series CPU using an appropriate kernel (such as the [Arm](https://www.arm.com) optimised [CMSIS-NN](https://github.com/ARM-software/CMSIS_5/tree/develop/CMSIS/NN) kernels).
+
+After compilation, the optimised model can only be run on an Ethos-U55 NPU embedded system.
+
+The tool will also generate performance estimates (EXPERIMENTAL) for the compiled model.
+
+## Environment
+Vela runs on the Linux operating system.
+
+## Prerequisites
+The following should be installed prior to the installation of Vela:
+ - Python >= 3.6
+ - GNU toolchain (GCC, Binutils and libraries) or alternative C compiler/linker toolchain
+
+## Installation
+Before running, the Vela package must be installed along with all its dependencies. To do this, first change to the directory that contains this README.md file. Then use the following commands:
+```
+pip3 install -U "setuptools>=40.1.0"
+pip3 install .
+```
+
+Or, if you use the `pipenv` virtual environment tool:
+```
+pipenv install .
+```
+
+## Running
+Vela is run with an input `.tflite` file passed on the command line. This file contains the neural network to be compiled. The tool then outputs an optimised version with a `_vela.tflite` file suffix, along with the performance estimate (EXPERIMENTAL) CSV files, all to the output directory.
+
+If you use the `pipenv` virtual environment tool, first spawn a shell in the virtual environment:
+```
+pipenv shell
+```
+After that, running Vela is the same regardless of whether you are in a virtual environment or not.
+
+Example usage:
+1) Compile the network `my_model.tflite`. The optimised version will be output to `./output/my_model_vela.tflite`.
+```
+vela my_model.tflite
+```
+2) Compile the network `/path/to/my_model.tflite` and specify the output to go in the directory `./results_dir/`.
+```
+vela --output-dir ./results_dir /path/to/my_model.tflite
+```
+3) To get a list of all available options:
+```
+vela --help
+```
+4) To specify information about the embedded system's configuration, use Vela's system configuration file. The following command selects the `MySysConfig` settings that are described in the `sys_cfg_vela.ini` system configuration file. More details can be found in the next section.
+```
+vela --config sys_cfg_vela.ini --system-config MySysConfig my_model.tflite
+```
+
+### Vela's System Configuration file
+This is used to describe various properties of the embedded system that the network will run in.
+
+Example of a Vela system configuration file.
+```
+; File: sys_cfg_vela.ini
+; The file contains two parts: a system config part and a CPU operator
+; performance part.
+
+; System config
+; Specifies properties such as the core clock speed, the size and speed of the
+; four potential memory areas, and for various types of data which memory area
+; is used to store them. The cpu property is used to link with the CPU operator
+; performance.
+; The four potential memory areas are: Sram, Dram, OnChipFlash, OffChipFlash.
+
+[SysConfig.MySysConfig]
+npu_freq=500e6
+cpu=MyCpu
+Sram_clock_scale=1
+Sram_port_width=64
+Dram_clock_scale=1
+Dram_port_width=64
+OnChipFlash_clock_scale=1
+OnChipFlash_port_width=64
+OffChipFlash_clock_scale=0.25
+OffChipFlash_port_width=32
+permanent_storage_mem_area=OffChipFlash
+feature_map_storage_mem_area=Sram
+fast_storage_mem_area=Sram
+
+; CPU operator performance
+; Specifies properties that are used by a linear model to estimate the
+; performance for any operations that will be run on the CPU (such as those not
+; supported by the NPU). Setting the intercept and slope to 0 will result in
+; the operator being excluded from the performance estimation. This is the same
+; as not specifying the operator. If an explicit cpu is specified rather than
+; using the default then the cpu name must match the cpu specified in the
+; SysConfig.<system config name> section.
+
+[CpuPerformance.MyCpuOperator]
+default.intercept=0.0
+default.slope=1.0
+
+MyCpu.intercept=0.0
+MyCpu.slope=1.0
+```
+
+## Contribution Guidelines and Pull Requests
+Contributions are accepted under [Apache License 2.0](LICENSE.txt). Only submit contributions where you have authored all of the code.
+
+## Resources
+* [Ethos-U55](https://www.arm.com/products/silicon-ip-cpu/ethos/ethos-u55)
+
+## License
+Vela is licensed under [Apache License 2.0](LICENSE.txt).
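The `SysConfig.<name>` and `CpuPerformance.<operator>` sections shown in the configuration example above are ordinary INI sections. As a minimal illustration (not part of Vela's own code, and not necessarily how Vela itself parses the file), they can be read with Python's standard `configparser`:

```python
# Minimal sketch: read the example sys_cfg_vela.ini shown in the README.
# Illustrative only; Vela's own parsing may differ.
import configparser

cfg = configparser.ConfigParser()
cfg.read("sys_cfg_vela.ini")

sys_cfg = cfg["SysConfig.MySysConfig"]
npu_freq = float(sys_cfg["npu_freq"])      # 500e6 -> 500000000.0
cpu_name = sys_cfg["cpu"]                  # "MyCpu"

# CPU operator performance entries use a "<cpu>.intercept" / "<cpu>.slope"
# key naming scheme inside each CpuPerformance.<operator> section.
perf = cfg["CpuPerformance.MyCpuOperator"]
intercept = float(perf[cpu_name + ".intercept"])
slope = float(perf[cpu_name + ".slope"])

print(npu_freq, cpu_name, intercept, slope)
```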
diff --git a/ethosu/mlw_codec/makefile b/ethosu/mlw_codec/makefile
new file mode 100644
index 00000000..6eb418dd
--- /dev/null
+++ b/ethosu/mlw_codec/makefile
@@ -0,0 +1,49 @@
+# Copyright (C) 2020 Arm Limited or its affiliates. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the License); you may
+# not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an AS IS BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+# Description:
+# Makefile to build mlw_codec
+
+UNAME=$(shell uname -o)
+
+CFLAGS=-Wall -Wno-unused-function -Wno-unused-variable
+
+ifeq ($(DEBUG),1)
+ CFLAGS+=-g -O0 -DDEBUG
+else
+ CFLAGS+=-O3
+endif
+
+LIBSRCS=mlw_encode.c mlw_decode.c
+LIBHDRS=mlw_encode.h mlw_decode.h mlw_common.h
+
+ifeq ($(UNAME),Cygwin)
+ MLWEXE=mlw_codec.exe
+else
+ MLWEXE=mlw_codec
+endif
+
+all: mlwexe
+
+.PHONY: mlwexe
+mlwexe: $(MLWEXE)
+
+clean:
+ rm -f $(MLWEXE)
+
+$(MLWEXE): mlw_main.c $(LIBSRCS) $(LIBHDRS) makefile
+ gcc $(CFLAGS) mlw_main.c $(LIBSRCS) -o $(MLWEXE) -lm
diff --git a/ethosu/mlw_codec/mlw_codecmodule.c b/ethosu/mlw_codec/mlw_codecmodule.c
new file mode 100644
index 00000000..de945ab3
--- /dev/null
+++ b/ethosu/mlw_codec/mlw_codecmodule.c
@@ -0,0 +1,174 @@
+/*
+ * Copyright (c) 2020 Arm Limited. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#define PY_SSIZE_T_CLEAN
+#include <Python.h>
+
+#include "mlw_decode.h"
+#include "mlw_encode.h"
+
+/* C extension wrapper for mlw_encode
+ *
+ * This method is exposed directly in Python with a prototype of the
+ * form:
+ *
+ * output = mlw_codec.encode(input, verbose=0)
+ *
+ * input: [int]
+ * verbose: int
+ * output: bytearray
+ */
+
+static PyObject *
+method_encode (PyObject *self, PyObject *args)
+{
+ /* Object to hold the input integer list. */
+ PyObject *input_list_object;
+
+ /* Object to hold the input verbosity integer, the verbose argument
+ * is optional so defaulted to 0.
+ */
+ int verbose = 0;
+
+ /* Arguments to the method are delivered as a tuple, unpack the
+ * tuple to get the individual arguments, note the second is
+ * optional.
+ */
+ if (!PyArg_ParseTuple(args, "O|i", &input_list_object, &verbose))
+ return NULL;
+
+ /* Unpack the length of the input integer list. */
+ int input_length = PyObject_Length (input_list_object);
+ if (input_length < 0)
+ input_length = 0;
+
+ /* We need to marshall the integer list into an input buffer
+ * suitable for mlw_encode, use a temporary heap allocated buffer
+ * for that purpose.
+ */
+ int16_t *input_buffer = (int16_t *) malloc(sizeof(int16_t) * input_length);
+ if (input_buffer == NULL)
+ return PyErr_NoMemory();
+
+ /* Unpack the input integer list into the temporary buffer.
+ */
+ for (int i = 0; i < input_length; i++)
+ {
+ PyObject *item;
+ item = PyList_GetItem(input_list_object, i);
+ if (!PyLong_Check(item))
+ input_buffer[i] = 0;
+ else
+ input_buffer[i] = PyLong_AsLong(item);
+ }
+
+ /* The mlw_encode call allocates (and sizes) the output buffer
+ * itself, so pass in a NULL pointer for it to fill in.
+ */
+ uint8_t *output_buffer = NULL;
+
+ int output_length = mlw_encode(input_buffer, input_length, &output_buffer, verbose);
+
+ PyObject *output_byte_array = PyByteArray_FromStringAndSize ((char *) output_buffer, output_length);
+
+ /* Discard the temporary input and output buffers. */
+ free (input_buffer);
+ free (output_buffer);
+
+ return output_byte_array;
+}
+
+/* C extension wrapper for mlw_decode
+ *
+ * This method is exposed directly in Python with a prototype of the
+ * form:
+ *
+ * output = mlw_codec.decode(input, verbose=0)
+ *
+ * input: bytearray
+ * verbose: int
+ * output: [int]
+ */
+
+static PyObject *
+method_decode(PyObject *self, PyObject *args)
+{
+ /* Object to hold the input bytearray. */
+ PyObject *input_bytearray_object;
+
+ /* Object to hold the input verbosity integer, the verbose argument
+ * is optional so defaulted to 0.
+ */
+ int verbose = 0;
+
+ /* Arguments to the method are delivered as a tuple, unpack the
+ * tuple to get the individual arguments, note the second is
+ * optional.
+ */
+ if (!PyArg_ParseTuple(args, "Y|i", &input_bytearray_object, &verbose))
+ return NULL;
+
+ /* Unpack the input buffer and length from the bytearray object. */
+ uint8_t *input_buffer = (uint8_t *) PyByteArray_AsString(input_bytearray_object);
+ int input_length = PyByteArray_Size(input_bytearray_object);
+
+ /* The mlw_decode call allocates (and grows) the output buffer
+ * itself, so pass in a NULL pointer for it to fill in.
+ */
+ int16_t *output_buffer = NULL;
+
+ int output_length = mlw_decode (input_buffer, input_length, &output_buffer, verbose);
+
+ /* Construct a new integer list and marshall the output buffer
+ * contents into the list. */
+ PyObject *output_list = PyList_New(output_length);
+ for (int i = 0; i <output_length; i++)
+ PyList_SetItem (output_list, i, PyLong_FromLong (output_buffer[i]));
+
+ free (output_buffer);
+
+ return output_list;
+}
+
+/* mlw_codec method descriptors.
+ */
+
+static PyMethodDef mlw_methods[] = {
+ {"decode", method_decode, METH_VARARGS, "Python interface for decode"},
+ {"encode", method_encode, METH_VARARGS, "Python interface for encode"},
+ {NULL, NULL, 0, NULL}
+};
+
+/* mlw_codec module descriptor.
+ */
+
+static struct PyModuleDef mlw_codecmodule = {
+ PyModuleDef_HEAD_INIT,
+ "mlw_codec",
+ "Python interface for the mlw encoder",
+ -1,
+ mlw_methods
+};
+
+PyMODINIT_FUNC PyInit_mlw_codec(void) {
+ return PyModule_Create(&mlw_codecmodule);
+}
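For reference, a minimal round-trip sketch of the wrapper defined above, following the documented `encode(input, verbose=0)` / `decode(input, verbose=0)` prototypes. The import path is an assumption (it depends on how the extension module is built and installed) and is not taken from this commit:

```python
# Minimal round-trip sketch for the mlw_codec extension module.
# The import path is an assumption; adjust to match your installation.
from ethosu import mlw_codec

# 9-bit signed weights; mlw_encode accepts values in the range -255..255.
weights = [0, 0, 3, -7, 0, 0, 0, 12, 255, -255]

compressed = mlw_codec.encode(weights)       # bytearray
decompressed = mlw_codec.decode(compressed)  # list of ints

assert decompressed == weights               # the codec is lossless
print(len(weights), "weights ->", len(compressed), "bytes")
```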
diff --git a/ethosu/mlw_codec/mlw_common.h b/ethosu/mlw_codec/mlw_common.h
new file mode 100644
index 00000000..008473a5
--- /dev/null
+++ b/ethosu/mlw_codec/mlw_common.h
@@ -0,0 +1,29 @@
+/*
+ * Copyright (c) 2020 Arm Limited. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <stdint.h>
+
+#ifndef __MLW_COMMON_H__
+#define __MLW_COMMON_H__
+
+#define ZDIV_DISABLE 6 // not alternating mode
+#define ZDIV_EOS 7 // indicates end of stream
+
+#define WDIV_UNCOMPRESSED 7 // indicates uncompressed weights
+
+#endif
diff --git a/ethosu/mlw_codec/mlw_decode.c b/ethosu/mlw_codec/mlw_decode.c
new file mode 100644
index 00000000..92aaea67
--- /dev/null
+++ b/ethosu/mlw_codec/mlw_decode.c
@@ -0,0 +1,300 @@
+/*
+ * Copyright (c) 2020 Arm Limited. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <stdbool.h>
+#include <string.h>
+#include <assert.h>
+#include <math.h>
+#include <stdarg.h>
+#include <math.h>
+#include "mlw_common.h"
+#include "mlw_decode.h"
+
+
+/////////////////////////////// Read from bitstream
+
+typedef struct bitbuf {
+ uint8_t *buf;
+ int buf_size; // in bytes
+ int pos; // bit pos of next bit
+ int log_symbols;
+} bitbuf_t;
+
+
+// size in bytes
+static void bitbuf_init( bitbuf_t *bb, uint8_t *buf, int size, int log_symbols) {
+ bb->buf = buf;
+ bb->pos = 0;
+ bb->buf_size = size;
+ bb->log_symbols = log_symbols;
+}
+
+static int bitbuf_getbit( bitbuf_t *bb) {
+ int byte_pos = bb->pos>>3;
+ int bit_pos = bb->pos&7;
+ if ( byte_pos < 0 || byte_pos >= bb->buf_size ) {
+ printf("bitbuf_getbit: underrun, bit_pos %3d byte_pos %3d buf_size %3d\n", bit_pos, byte_pos, bb->buf_size);
+ exit(1);
+ }
+ int bit = bb->buf[ byte_pos ] & (1<<bit_pos) ? 1 : 0;
+ bb->pos++;
+ return bit;
+}
+
+static int bitbuf_get( bitbuf_t *bb, const char *name, int len) {
+ int i, data=0, save_pos=bb->pos;
+ if (len>0) {
+ for(i=0; i<len; i++) {
+ data |= bitbuf_getbit(bb)<<i;
+ }
+ if (bb->log_symbols)
+ printf("bitbuf: pos %3d %7s len %d data %x\n", save_pos, name, len, data);
+ }
+ return data;
+}
+
+// Decode the given weight stream
+// inbuf compressed bitstream
+// inbuf_size size of compressed bitstream in bytes
+// outbuf uncompressed 9bit signed weights, buffer malloced
+// verbose if non-zero, printf log
+// Return value is the number of uncompressed weights
+int mlw_decode( uint8_t *inbuf, int inbuf_size, int16_t **outbuf, int verbose) {
+ int nvalues;
+ int w_grc_div;
+ int w_grc_trunc;
+ int w_uncompressed;
+ int z_grc_div, z_prev_grc_div=0;
+ int new_palette;
+ int palsize=0, palbits=0;
+ int direct_offset=0;
+ int16_t palette[512];
+ int first=1;
+ int use_zero_run, i, j;
+ int outbuf_size=0;
+ int nchunks=0;
+
+ *outbuf=0;
+
+ bitbuf_t bitbuf_s, *bb=&bitbuf_s;
+ bitbuf_init( bb, inbuf, inbuf_size, (verbose&2)?1:0 );
+
+ // Loop over all slices
+ while(1) {
+ // Decode slice header
+ z_grc_div = bitbuf_get( bb, "ZDIV", 3 );
+ while(z_grc_div==ZDIV_EOS) { // TODO: change to ZDIV_PAD
+ // End of stream
+ // Byte align
+ bitbuf_get( bb, "BYTEALIGN", (8-(bb->pos&7))&7 );
+ first=1;
+ if ( (bb->pos/8) == inbuf_size) {
+ // Quit if we actually reached end of input stream
+ break;
+ }
+ z_grc_div = bitbuf_get( bb, "ZDIV", 3 );
+ }
+ if ( (bb->pos/8) == inbuf_size) {
+ break; // reached end of input stream
+ }
+ assert(z_grc_div<4 || z_grc_div==ZDIV_DISABLE);
+ use_zero_run = z_grc_div!=ZDIV_DISABLE; // alternating grc
+ nvalues = bitbuf_get( bb, "SLICELEN", 15 )+1;
+ w_grc_div = bitbuf_get( bb, "WDIV", 3 );
+ w_grc_trunc = bitbuf_get( bb, "WTRUNC", 1 );
+ new_palette = bitbuf_get( bb, "NEWPAL", 1 );
+ if (first) {
+ // the first slice must have a palette/direct mode setup
+ assert(new_palette);
+ first=0;
+ }
+ if (!new_palette) {
+ // At the moment it is not supported to change between alternating
+ // and non-alternating without redefining the palette (this is because
+ // the zero is not included in the palette in case of alternating)
+ int prev_use_zero_run = z_prev_grc_div!=ZDIV_DISABLE;
+ (void)(prev_use_zero_run);
+ assert( use_zero_run == prev_use_zero_run);
+ }
+ z_prev_grc_div = z_grc_div;
+ if (new_palette) {
+ direct_offset = bitbuf_get( bb, "DIROFS", 5 );
+ palsize = bitbuf_get( bb, "PALSIZE", 5 );
+ if (palsize>0)
+ palsize++;
+ palbits = bitbuf_get( bb, "PALBITS", 3 )+2;
+ for(i=0; i<palsize; i++) {
+ palette[i] = bitbuf_get( bb, "PALETTE", palbits );
+ }
+ }
+
+ if (w_grc_div==WDIV_UNCOMPRESSED) {
+ // Uncompressed mode
+ w_uncompressed = 1;
+ int uncompressed_bits;
+ if (palsize>0) {
+ // Uncompressed bits is given by palette size.
+ uncompressed_bits=0;
+ while( (1<<uncompressed_bits) < palsize )
+ uncompressed_bits++;
+ } else {
+ // No palette. PALBITS is used to specify uncompressed bits.
+ uncompressed_bits=palbits;
+ }
+ // In uncompressed mode there's only a remainder part (no unary)
+ // This is achieved by setting w_grc_div to index bit width
+ w_grc_div = uncompressed_bits;
+ } else {
+ w_uncompressed = 0;
+ assert(w_grc_div<6);
+ }
+
+ // Decode the slice
+ int z_nvalues = nvalues + (new_palette?1:0);
+ int *w_value = malloc( nvalues*sizeof(int) );
+ int *z_value = malloc( z_nvalues*sizeof(int) );
+ int w_pos=0, z_pos=0;
+ int w_prev_pos=0, z_prev_pos=0;
+ int w_unary0=0, w_unary1=0, w_unary1_len=0, w_q[12]={0}, w_carry=0;
+ int z_unary=0, z_q[12]={0}, z_carry=0;
+ int w_nsymbols=0;
+ int w_prev_enable=0, w_prev_nsymbols=0, w_prev_q[12]={0};
+ int z_nsymbols=0;
+ int z_prev_enable=0, z_prev_nsymbols=0, z_prev_q[12]={0};
+ int total_zcnt=0;
+ int z_unary_len = z_grc_div<3 ? 12 : 8;
+
+ // Loop over all chunks in the slice
+ do {
+ // Flow control to possibly throttle either the weights or zero-runs
+ int balance = use_zero_run ? w_pos - z_pos : 0;
+ int w_enable = (balance<8 || !use_zero_run) && w_pos<nvalues;
+ int z_enable = balance>=0 && use_zero_run && z_pos<z_nvalues;
+ if (w_enable) {
+ if (!w_uncompressed)
+ w_unary0 = bitbuf_get( bb, "WUNARY0", 12 );
+ else
+ w_unary0 = 0;
+ }
+ if (z_enable) {
+ z_unary = bitbuf_get( bb, "ZUNARY", z_unary_len );
+ z_nsymbols=0;
+ int cnt = z_carry;
+ for(i=0; i<z_unary_len; i++) {
+ if (z_unary & (1<<i)) {
+ cnt++;
+ } else {
+ z_q[z_nsymbols++] = cnt;
+ cnt=0;
+ }
+ }
+ z_carry = cnt;
+ z_pos += z_nsymbols;
+ }
+ if (w_enable) {
+ w_unary1_len=0;
+ int max_symbols = w_uncompressed && w_grc_div>5 ? 8 : 12;
+ for(i=0; i<max_symbols; i++) {
+ if (w_unary0&(1<<i))
+ w_unary1_len++;
+ }
+ w_unary1 = bitbuf_get( bb, "WUNARY1", w_unary1_len );
+ w_nsymbols=0;
+ int cnt = w_carry;
+ for(i=0; i<max_symbols; i++) {
+ int code=0;
+ if (w_unary0 & (1<<i)) {
+ code++;
+ if (w_unary1&1) {
+ code++;
+ }
+ w_unary1 = w_unary1>>1;
+ }
+ cnt += code;
+ if (code<2 || w_grc_trunc) {
+ w_q[w_nsymbols++] = cnt;
+ cnt=0;
+ }
+ }
+ w_carry = cnt;
+ w_pos += w_nsymbols;
+ }
+ if (w_prev_enable) {
+ for(i=0; i<w_prev_nsymbols && w_prev_pos<nvalues; i++, w_prev_pos++) {
+ int remain = bitbuf_get( bb, "WREMAIN", w_grc_div );
+ w_value[w_prev_pos] = (w_prev_q[i]<<w_grc_div) + remain;
+ }
+ }
+ if (z_prev_enable) {
+ for(i=0; i<z_prev_nsymbols && z_prev_pos<z_nvalues; i++, z_prev_pos++) {
+ int remain = bitbuf_get( bb, "ZREMAIN", z_grc_div );
+ z_value[z_prev_pos] = (z_prev_q[i]<<z_grc_div) + remain;
+ total_zcnt += z_value[z_prev_pos];
+ }
+ }
+ w_prev_enable = w_enable;
+ w_prev_nsymbols = w_nsymbols;
+ memcpy( w_prev_q, w_q, sizeof(w_prev_q));
+ z_prev_enable = z_enable;
+ z_prev_nsymbols = z_nsymbols;
+ memcpy( z_prev_q, z_q, sizeof(z_prev_q));
+ nchunks++;
+ } while( w_prev_enable || z_prev_enable );
+
+ // Interleave non-zero values and zeros into the output buffer
+ // Increase the outbuffer to fit the new slice
+ *outbuf = realloc( *outbuf, (outbuf_size + nvalues + total_zcnt)*sizeof(int16_t));
+
+ int k=outbuf_size;
+
+ // Insert initial zeros
+ // (slices redefining the palette may start with zeros)
+ if (new_palette && use_zero_run) {
+ for(j=0; j<z_value[0]; j++) {
+ (*outbuf)[k++] = 0;
+ }
+ }
+
+ // Loop over all weights and insert zeros in-between
+ for(i=0; i<nvalues; i++) {
+ int val;
+ assert(w_value[i]<512); // HW supports 9bit
+ if (w_value[i]<palsize) {
+ val = palette[w_value[i]];
+ } else {
+ val = w_value[i]-palsize+direct_offset;
+ }
+ int sign = val&1;
+ int mag = val>>1;
+ (*outbuf)[k++] = sign ? -mag : mag;
+ if (use_zero_run) {
+ for(j=0; j<z_value[i+(new_palette?1:0)]; j++) {
+ (*outbuf)[k++] = 0;
+ }
+ }
+ }
+
+ outbuf_size = k;
+ free(w_value);
+ free(z_value);
+ }
+ return outbuf_size;
+}
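Both the decoder above and the encoder later in this commit map weights to unsigned values using an interleaved sign/magnitude scheme (`val = (mag << 1) | sign`, decoded back as `-mag` when the sign bit is set). A small illustrative sketch of just that mapping, independent of the C implementation:

```python
# Interleaved sign/magnitude mapping used by mlw_encode.c / mlw_decode.c:
# weights 0, -1, +1, -2, +2, ... map to values 0, 3, 2, 5, 4, ...
def weight_to_val(weight: int) -> int:
    sign = 1 if weight < 0 else 0
    return (abs(weight) << 1) | sign

def val_to_weight(val: int) -> int:
    sign, mag = val & 1, val >> 1
    return -mag if sign else mag

assert all(val_to_weight(weight_to_val(w)) == w for w in range(-255, 256))
print([weight_to_val(w) for w in (0, -1, 1, -2, 2)])  # [0, 3, 2, 5, 4]
```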
diff --git a/ethosu/mlw_codec/mlw_decode.h b/ethosu/mlw_codec/mlw_decode.h
new file mode 100644
index 00000000..a15261ad
--- /dev/null
+++ b/ethosu/mlw_codec/mlw_decode.h
@@ -0,0 +1,42 @@
+/*
+ * Copyright (c) 2020 Arm Limited. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <stdint.h>
+
+#ifndef __MLW_DECODE_H__
+#define __MLW_DECODE_H__
+
+#ifdef _MSC_VER
+ #define EXPORTED __declspec(dllexport)
+#else
+ #define EXPORTED __attribute__((visibility("default")))
+#endif
+
+#if __cplusplus
+extern "C"
+{
+#endif
+
+EXPORTED
+int mlw_decode(uint8_t *inbuf, int inbuf_size, int16_t **outbuf, int verbose);
+
+#if __cplusplus
+}
+#endif
+
+#endif
diff --git a/ethosu/mlw_codec/mlw_encode.c b/ethosu/mlw_codec/mlw_encode.c
new file mode 100644
index 00000000..ac25fc52
--- /dev/null
+++ b/ethosu/mlw_codec/mlw_encode.c
@@ -0,0 +1,874 @@
+/*
+ * Copyright (c) 2020 Arm Limited. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <stdbool.h>
+#include <string.h>
+#include <assert.h>
+#include <math.h>
+#include <stdarg.h>
+#include <math.h>
+#include "mlw_common.h"
+#include "mlw_encode.h"
+
+#define DPRINTF(...)
+//#define DPRINTF(...) printf(__VA_ARGS__)
+
+#define ZERO_RUN_THRES 4
+
+#define min(a,b) ((a)<(b)?(a):(b))
+#define max(a,b) ((a)>(b)?(a):(b))
+
+typedef struct palette {
+ int16_t lut[32];
+ int16_t inv_lut[512];
+ int palsize; // number of palette entries
+ int palbits; // bit width of palette entries
+ int use_zero_runs; // zeros are coded separately
+ int only_palette; // no values outside the palette
+ int direct_offset; // added to the decoded weight index before direct conversion to sign/mag
+ int only_zeros; // special case that the section is all zeros
+} palette_t;
+
+static int is_power_of_two( int x ) {
+ return ((x-1) & x)==0;
+}
+
+static int get_palette_index_bits( int size ) {
+ int i;
+ for(i=7; i>=0; i--)
+ if (size > (1<<i) )
+ return i+1;
+ return 0;
+}
+
+// Search the stream for suitable palette restart positions
+// Return the number of restarts
+static int search_palette_sections( int16_t *buf, int size, int **palette_restart_positions ) {
+ int i,j,got_palette,restart_i,palette_size=0, last_restart_idx, zero_cnt;
+ int prev_idx[512]; // For each value, keep track of the index of the previous occurrence
+ int *restart_pos;
+ int max_palettes = size/64;
+
+ // Preliminary allocation of sufficient size
+ restart_pos = (int*)malloc( max_palettes*sizeof(int) );
+ last_restart_idx=0;
+ got_palette=0;
+ restart_i=1;
+ restart_pos[0] = 0;
+ zero_cnt=0;
+ memset( prev_idx, -1, sizeof(prev_idx));
+ for(i=0; i<size; i++) {
+ // Guess if zeros should be excluded from the palette
+ int exclude_zero = zero_cnt > (i-last_restart_idx)/4;
+
+ if (got_palette) {
+ // Check if the next value is not covered by the current palette
+ if ( prev_idx[ buf[i]+256 ] < last_restart_idx ) {
+ // New value: increase the palette size
+ palette_size++;
+ DPRINTF("Note: at pos %d extend palette to size %d\n", i, palette_size);
+ if ( is_power_of_two(palette_size-1-exclude_zero) ) {
+ if ( (i - last_restart_idx - zero_cnt) > 512 || (palette_size-exclude_zero)>32 ) {
+ // create a new palette because we extend a long lasting palette to require one more index bit
+ DPRINTF("Note: at pos %d create new palette because previous has to increase one more index bit. last_restart_idx %d n %d zero_cnt %d\n", i, last_restart_idx, i - last_restart_idx, zero_cnt );
+ assert( restart_i < max_palettes );
+ DPRINTF("restart %d pos %d\n", restart_i, i);
+ restart_pos[restart_i++] = i;
+ last_restart_idx = i;
+ got_palette=0;
+ zero_cnt=0;
+ }
+ }
+ }
+ }
+
+ prev_idx[ buf[i]+256 ] = i;
+ if (buf[i]==0)
+ zero_cnt++;
+
+ static const int window_sizes[5][2] = {{32,1}, {64,1}, {128,1}, {256,1}, {512,1}};
+ int k;
+ // loop over window sizes
+ for(k=0; k<5; k++) {
+ // Every Nth non-zero value, count what would be the size of a palette covering the last N NZ.
+ int N = window_sizes[k][0] * (got_palette?2:1);
+ if ( (i - last_restart_idx - zero_cnt) > 0 && ((i - last_restart_idx - zero_cnt) % N)==0 ) {
+ // Search backward to the position N nonzero values earlier
+ int nzcnt=0;
+ for( j=i; j>last_restart_idx; j--) {
+ if ( buf[j]!=0 ) {
+ if (nzcnt==N+1)
+ break;
+ nzcnt++;
+ }
+ }
+ int restart_idx = j;
+
+ // Calculate the size of a new palette (starting at restart_idx)
+ int new_palette_size=0;
+ for(j=0; j<512; j++) {
+ if ( prev_idx[j] >= restart_idx ) {
+ new_palette_size++;
+ }
+ }
+
+ int create_new_palette=0;
+ if (got_palette) {
+ int new_size_bits = get_palette_index_bits( new_palette_size - exclude_zero );
+ int old_size_bits = get_palette_index_bits( palette_size - exclude_zero );
+ int savings = N*(old_size_bits*15-new_size_bits*15)/16 - new_palette_size*8 - 20;
+ if ( savings>0 ) {
+ // Create new palette because it can be smaller than the existing palette
+ create_new_palette=1;
+ DPRINTF("Note: at pos %d restart smaller palette\n", restart_idx);
+ }
+ } else {
+ if ( (new_palette_size-exclude_zero) <= 32) {
+ int new_size_bits = get_palette_index_bits( new_palette_size - exclude_zero );
+ // estimate if we will make savings by using palette mode
+ int savings = N*(90-new_size_bits*15)/16 - new_palette_size*8 - 20;
+ create_new_palette = savings>0;
+ }
+ }
+ if (create_new_palette) {
+ palette_size=new_palette_size;
+ got_palette=1;
+ last_restart_idx = restart_idx;
+ DPRINTF("Note: at pos %d create palette of size %d\n", last_restart_idx, new_palette_size);
+ if ( restart_pos[restart_i-1] != last_restart_idx) {
+ assert( restart_i < max_palettes );
+ restart_pos[restart_i++] = last_restart_idx;
+ }
+ zero_cnt=0;
+ for( j=last_restart_idx; j<=i; j++)
+ if (buf[j]==0)
+ zero_cnt++;
+ }
+ }
+ }
+ }
+ // Reallocate to actual size
+ *palette_restart_positions = (int*)realloc( restart_pos, restart_i*sizeof(int) );
+ return restart_i;
+}
+
+// Calculate frequency table
+static void calc_freq( const int16_t *buf, int size, int freq[512] ) {
+ int i;
+ memset(freq, 0, 512*sizeof(int));
+ for(i=0; i<size; i++) {
+ freq[buf[i]+256]++;
+ }
+}
+
+static int cmp_uint64(const void * a, const void * b) {
+ uint64_t aa = *(uint64_t*)a;
+ uint64_t bb = *(uint64_t*)b;
+ return aa>bb ? -1 : aa<bb ? 1 : 0;
+}
+
+// Create palette from the given frequencies
+// Freq index 0-511 correspond to weights -256..255
+static void create_palette( int freq[512],
+ int use_zero_runs,
+ palette_t *p ) {
+ uint64_t freq64[512];
+ int i,all_cnt,all_max_val;
+
+ // Pair the frequency with the value so that
+ // the array can be sorted on frequency while keeping
+ // track of the corresponding palette value
+ memset(freq64, 0, sizeof(freq64));
+ all_cnt=0;
+ all_max_val=0;
+ for(i=-255; i<256; i++) {
+ if (i==0 && use_zero_runs)
+ continue;
+ int sign = i<0;
+ int mag = abs(i);
+ int palval = (mag<<1) | sign;
+
+ // Store palette value in 16 LSB bits, which will not affect the sorting
+ freq64[palval] = (((uint64_t)freq[i+256])<<16) | palval;
+ all_cnt+=freq[i+256];
+
+ if (freq[i+256]>0) {
+ all_max_val = max(all_max_val, palval);
+ }
+ }
+
+ // Count number of non-used weight values around zero (0, -1, +1, -2, +2 etc)
+ for(i=0; i<31; i++) {
+ if ((freq64[i]>>16)!=0)
+ break;
+ }
+ p->direct_offset = i;
+
+ // Sort in descending frequency order
+ qsort(freq64, 512, sizeof(uint64_t), cmp_uint64);
+
+ // Identify special case that there are no weights to code
+ // in the weight index stream (i.e. all weights are zeros)
+ p->only_zeros = (freq64[0]>>16)==0;
+ if (p->only_zeros) {
+ p->direct_offset=0;
+ }
+
+ // Check if all weights fit into the palette (and the palette is not empty)
+ p->only_palette = (freq64[0]>>16)>0 && (freq64[32]>>16)==0;
+
+ int max_palette_size;
+ if (p->only_palette) {
+ max_palette_size = 32;
+ } else {
+ // For direct-lut we must make sure that the encoded weight
+ // index is not > 511. We do that by limiting the palette size
+ // such that the greatest value can be reached after subtracting
+ // the palette size.
+ max_palette_size = min(32, 511-all_max_val);
+ if (max_palette_size==1) {
+ max_palette_size=0; // because palette of size 1 is not supported
+ }
+ }
+
+ // Setup the 32 entry palette
+ int palette_max_val = 0, val, cnt, pal_cnt=0;
+ for(i=0; i<max_palette_size; i++) {
+ cnt = freq64[i]>>16;
+ val = freq64[i]&0xffff;
+ if ( cnt==0 )
+ break;
+ p->lut[i] = val;
+ palette_max_val = max(palette_max_val, val);
+ pal_cnt+=cnt;
+ }
+ if (i==1)
+ i++; // palette size of 1 is not supported, make it 2
+
+ // Heuristic for when to use the palette. If more than half of the
+ // weights are in the palette then we use it. This ensures we don't
+ // use palette for e.g. rectangular distributions.
+ int palbits_val;
+ if (pal_cnt > all_cnt/2) {
+ p->palsize = i;
+ palbits_val = palette_max_val;
+ } else {
+ // No palette
+ p->palsize = 0;
+ // If no palette, then palbits is used to specify the
+ // number of bits required for uncompressed mode, i.e.
+ // the number of bits for the greatest weight value
+ palbits_val = all_max_val;
+ }
+
+ // the palette entry bit width
+ // minimum 2bits (because PALBITS is in range 2..9)
+ int palbits=2;
+ while( (1<<palbits) <= palbits_val )
+ palbits++;
+ assert(palbits<=9);
+ p->palbits = palbits;
+ p->use_zero_runs = use_zero_runs;
+}
+
+// Return 1 if zero runs should be used
+// If palette_size is 512, then palette is not used (in that case the palette is setup
+// with the standard alternating unsigned to signed mapping)
+static int find_palette( const int16_t *inbuf, int inbuf_size, palette_t *p) {
+ int freq[512], i;
+
+ // Calculate frequencies of the given weight stream
+ calc_freq( inbuf, inbuf_size, freq);
+
+ // Find two most common values
+ int most_common_freq[2]={0}, most_common_val[2]={0};
+ for(i=0; i<512; i++) {
+ if ( freq[i] > most_common_freq[0] ) {
+ most_common_freq[1] = most_common_freq[0];
+ most_common_val[1] = most_common_val[0];
+ most_common_freq[0] = freq[i];
+ most_common_val[0] = i-256;
+ } else if ( freq[i] > most_common_freq[1] ) {
+ most_common_freq[1] = freq[i];
+ most_common_val[1] = i-256;
+ }
+ }
+
+ // Decide if zero-runs (alternating mode) should be used:
+ // * zero should be the most common symbol
+ // * zero should be sufficiently more common than the second most common symbol
+ int use_zero_runs = most_common_val[0]==0 && most_common_freq[0] > ZERO_RUN_THRES*most_common_freq[1];
+
+ // Create the palette
+ create_palette( freq, use_zero_runs, p);
+
+ return use_zero_runs;
+}
+
+static void create_inverse_palette( palette_t *p) {
+ int i;
+ memset( p->inv_lut, 0, sizeof(p->inv_lut));
+ for(i=0; i<512; i++) {
+ int val = i;
+ int sign = val&1;
+ int mag = val>>1;
+ int weight = sign ? -mag : mag;
+ if (weight+256 < 512)
+ p->inv_lut[ weight+256 ] = i + p->palsize - p->direct_offset;
+ }
+ for(i=0; i<p->palsize; i++) {
+ int val = p->lut[i];
+ int sign = val&1;
+ int mag = val>>1;
+ int weight = sign ? -mag : mag;
+ if (weight+256 < 512)
+ p->inv_lut[ weight+256 ] = i;
+ }
+}
+
+#define NWCFG 13
+#define NZCFG 4 // restrict search to ZDIV=0..3
+#define MAX_ZWCFG (max(NWCFG,NZCFG))
+
+// search state
+typedef struct search_state {
+ int bitcnt; // number of bits to reach this state
+ uint8_t prev_cfg; // previous grc parameter config
+} search_state_t;
+
+// (trunc<<4) | div, 0x20 means uncompressed
+static const char w_grc_params[] = { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x20 };
+static const char z_grc_params[] = { 0x00, 0x01, 0x02, 0x03, 0x04 };
+
+
+
+// An algorithm similar to the Viterbi algorithm is used to search for a
+// good GRC parameter sequence for the given input value sequence.
+// The inval buffer can contain weights, weight indices or runs.
+// The return value is the resulting number of bitstream sections.
+static int search_grc_params( const int *inval_buf,
+ int n_inval,
+ int zrun_mode,
+ int uncompressed_bits,
+ uint8_t *grc_param_cfg,
+ int *grc_param_pos,
+ int max_grc_param_cfg,
+ int *existing_grc_param_pos,
+ int n_existing_grc_param_pos,
+ int *bitcnt )
+{
+ int n_cfg = zrun_mode ? NZCFG : NWCFG;
+ const char *grc_params = zrun_mode ? z_grc_params : w_grc_params;
+ int i,j;
+
+ search_state_t *state[MAX_ZWCFG];
+ for(i=0; i<n_cfg; i++) {
+ state[i] = malloc( sizeof(search_state_t) * (n_inval+1) );
+ state[i][0].bitcnt=0;
+ state[i][0].prev_cfg=i;
+ }
+
+ // Loop over inval_buf
+ int existing_idx=0;
+ for(i=0; i<n_inval; i++) {
+ int value = inval_buf[i];
+
+ // Best GRC parameter so far
+ int best_bitcnt=0x7fffffff, best_cfg=0;
+ for(j=0; j<n_cfg; j++) {
+ if (state[j][i].bitcnt < best_bitcnt) {
+ best_bitcnt = state[j][i].bitcnt;
+ best_cfg = j;
+ }
+ }
+
+ int cmd_cost = 40;
+ if (existing_idx < n_existing_grc_param_pos && existing_grc_param_pos[existing_idx] == (i+1)) {
+ // free transition, because the weight stream already inserted a command at this position
+ cmd_cost = 0;
+ existing_idx++;
+ }
+
+ // Loop over GRC parameters, calculate bits to code value, and then update the search state
+ for(j=0; j<n_cfg; j++) {
+ int div = grc_params[j]&15;
+ int trunc = grc_params[j]>>4;
+ int q = value>>div;
+ int bits = trunc ? min(q+1,2) + div : q+1+div;
+ if (!zrun_mode && ((trunc && q>2) || q>31))
+ bits=10000; // it's not possible to code the current value; give it a high cost
+ if (trunc==2)
+ bits=uncompressed_bits;
+
+ if ( best_bitcnt + cmd_cost < state[j][i].bitcnt ) {
+ // Change GRC parameters
+ state[j][i+1].prev_cfg = best_cfg;
+ state[j][i+1].bitcnt = best_bitcnt + cmd_cost + bits;
+ } else {
+ // Keep same GRC parameters
+ state[j][i+1].prev_cfg = j;
+ state[j][i+1].bitcnt = state[j][i].bitcnt + bits;
+ }
+ }
+ }
+
+
+ // Best GRC parameter
+ int best_bitcnt=0x7fffffff, best_cfg=0;
+ for(j=0; j<n_cfg; j++) {
+ if (state[j][n_inval].bitcnt < best_bitcnt) {
+ best_bitcnt = state[j][n_inval].bitcnt;
+ best_cfg = j;
+ }
+ }
+
+ int cfg = best_cfg;
+ int n_cmds=0;
+ for(i=n_inval; i>=0; i--) {
+ if (state[cfg][i].prev_cfg != cfg || i==0) {
+ n_cmds++;
+ cfg = state[cfg][i].prev_cfg;
+ }
+ }
+
+ (void)(max_grc_param_cfg);
+ assert(n_cmds<=max_grc_param_cfg);
+
+ cfg = best_cfg;
+ j=n_cmds-1;
+ int endpos=n_inval;
+ for(i=n_inval; i>=0; i--) {
+ if (state[cfg][i].prev_cfg != cfg || i==0) {
+ grc_param_cfg[j] = cfg;
+ grc_param_pos[j] = endpos;
+ j--;
+ cfg = state[cfg][i].prev_cfg;
+ endpos = i-1;
+ }
+ }
+ assert(j==-1);
+
+ for(i=0; i<n_cfg; i++) {
+ free(state[i]);
+ }
+
+ *bitcnt = best_bitcnt;
+
+ return n_cmds;
+}
+
+
+/////////////////////////////// Write to bitstream
+
+typedef struct bitbuf {
+ uint8_t *buf;
+ int buf_size; // in bytes
+ int pos; // bit pos of next bit
+ int log_symbols;
+} bitbuf_t;
+
+// size in bytes
+static void bitbuf_init( bitbuf_t *bb, uint8_t *buf, int size, int log_symbols ) {
+ bb->buf = buf;
+ bb->pos = 0;
+ bb->buf_size = size;
+ bb->log_symbols = log_symbols;
+}
+
+static void bitbuf_putbit( bitbuf_t *bb, int bit) {
+ int byte_pos = bb->pos>>3;
+ int bit_pos = bb->pos&7;
+ assert( byte_pos >= 0 );
+ assert( byte_pos < bb->buf_size );
+ bb->buf[ byte_pos ] = (bb->buf[ byte_pos ] & ~(1<<bit_pos)) | (bit<<bit_pos);
+ bb->pos += 1;
+}
+
+static void bitbuf_put( bitbuf_t *bb, const char *name, int len, int data) {
+ int i;
+ if (len>0) {
+ if (bb->log_symbols)
+ printf("bitbuf: pos %3d %7s len %d data %x\n", bb->pos, name, len, data);
+ for(i=0; i<len; i++) {
+ bitbuf_putbit(bb, (data>>i)&1);
+ }
+ }
+}
+
+// Return new bitpos
+static int encode_slice( const int *w_value,
+ const int *z_value,
+ int nvalues,
+ palette_t *p,
+ int new_palette,
+ int uncompressed_bits,
+ int w_cfg,
+ int z_cfg,
+ uint8_t *bitbuf,
+ int bitbuf_size,
+ int bitpos,
+ int verbose )
+{
+ int i,j;
+ bitbuf_t bitbuf_s, *bb=&bitbuf_s;
+ bitbuf_init( bb, bitbuf, bitbuf_size, verbose&2?1:0 );
+ bb->pos = bitpos;
+
+ assert(nvalues<32768);
+ // GRC parameters for this slice
+ int w_grc_div = w_grc_params[w_cfg] & 15;
+ int w_grc_trunc = (w_grc_params[w_cfg] >> 4)==1;
+ int w_uncompressed = (w_grc_params[w_cfg] >> 4)==2;
+ int z_grc_div = z_grc_params[z_cfg] & 15;
+
+ if (w_uncompressed) {
+ w_grc_div = uncompressed_bits;
+ }
+
+ int zdiv = p->use_zero_runs ? z_grc_div : ZDIV_DISABLE;
+ int wdiv = !w_uncompressed ? w_grc_div : WDIV_UNCOMPRESSED;
+
+ if (verbose&1) {
+ printf("slice: bitoffset %7d slicelen %5d zdiv %d wdiv %d wtrunc %d newpal %d palbits %d palsize %2d\n",
+ bb->pos, nvalues, zdiv, wdiv, w_grc_trunc, new_palette, p->palbits, p->palsize);
+ }
+
+ // Write slice header
+ bitbuf_put( bb, "ZDIV", 3, zdiv);
+ bitbuf_put( bb, "SLICELEN", 15, nvalues-1 );
+ bitbuf_put( bb, "WDIV", 3, wdiv);
+ bitbuf_put( bb, "WTRUNC", 1, w_grc_trunc );
+ bitbuf_put( bb, "NEWPAL", 1, new_palette );
+ if (new_palette) {
+ bitbuf_put( bb, "DIROFS", 5, p->direct_offset );
+ bitbuf_put( bb, "PALSIZE", 5, max(0, p->palsize-1));
+ bitbuf_put( bb, "PALBITS", 3, p->palbits-2 );
+ for(i=0; i<p->palsize; i++) {
+ bitbuf_put( bb, "PALETTE", p->palbits, p->lut[i] );
+ }
+ }
+
+ int z_nvalues = nvalues + (new_palette?1:0);
+ int w_pos=0, z_pos=0;
+ int w_unary0=0, w_unary1=0, w_unary1_len=0, w_q=-1, w_r=0;
+ int z_unary=0, z_q=-1, z_r=0;
+ int w_nsymbols=0, w_remain[12]={0};
+ int w_prev_enable=0, w_prev_nsymbols=0, w_prev_remain[12]={0};
+ int z_nsymbols=0, z_remain[12]={0};
+ int z_prev_enable=0, z_prev_nsymbols=0, z_prev_remain[12]={0};
+ int z_unary_len = z_grc_div<3 ? 12 : 8;
+ do {
+ int balance = p->use_zero_runs ? w_pos - z_pos : 0;
+ int w_enable = balance<8 && w_pos<nvalues;
+ int z_enable = balance>=0 && p->use_zero_runs && z_pos<z_nvalues;
+ if (w_enable) {
+ // Encode chunk (weights)
+ j=0;
+ w_nsymbols=0;
+ w_unary0=0;
+ w_unary1=0;
+ w_unary1_len=0;
+ int max_symbols = w_uncompressed && w_grc_div>5 ? 8 : 12;
+ while(j<max_symbols) {
+ if (w_q<0) {
+ if (w_pos<nvalues) {
+ int value = w_value[w_pos];
+ assert(value<512);
+ w_q = value>>w_grc_div;
+ w_r = value&((1<<w_grc_div)-1);
+ assert( w_q<=31 && (!w_grc_trunc || w_q<=2));
+ } else {
+ w_q = 0;
+ w_r = -1; // don't send remainder
+ }
+ }
+ while( w_q>=0 && j<max_symbols) {
+ w_unary0 |= w_q>0 ? (1<<j) : 0;
+ if (w_q>0) {
+ w_unary1 |= w_q>1 ? (1<<w_unary1_len) : 0;
+ w_unary1_len++;
+ }
+ j++;
+ w_q-=2;
+ if (w_grc_trunc)
+ w_q--;
+ }
+ if (w_q<0 && w_r>=0) {
+ w_remain[w_nsymbols] = w_r;
+ w_nsymbols++;
+ w_pos++;
+ }
+ }
+ }
+
+ if (z_enable) {
+ // Encode chunk (zrun)
+ j=0;
+ z_nsymbols=0;
+ z_unary=0;
+ while(j<z_unary_len) {
+ if (z_q<0) {
+ if (z_pos<z_nvalues) {
+ int value = z_value[z_pos];
+ z_q = value>>z_grc_div;
+ z_r = value&((1<<z_grc_div)-1);
+ } else {
+ z_q = 0;
+ z_r = -1;
+ }
+ }
+ while( z_q>=0 && j<z_unary_len) {
+ z_unary |= z_q>0 ? (1<<j) : 0;
+ j++;
+ z_q--;
+ }
+ if (z_q<0 && z_r>=0) {
+ z_remain[z_nsymbols] = z_r;
+ z_nsymbols++;
+ z_pos++;
+ }
+ }
+ }
+
+ // Write chunk to bitstream
+ if (w_enable && !w_uncompressed) {
+ bitbuf_put( bb, "WUNARY0", 12, w_unary0);
+ }
+ if (z_enable) {
+ bitbuf_put( bb, "ZUNARY", z_unary_len, z_unary);
+ }
+ if (w_enable && !w_uncompressed) {
+ bitbuf_put( bb, "WUNARY1", w_unary1_len, w_unary1);
+ }
+ if (w_prev_enable) {
+ for(i=0; i<w_prev_nsymbols; i++) {
+ bitbuf_put( bb, "WREMAIN", w_grc_div, w_prev_remain[i]);
+ }
+ }
+ if (z_prev_enable) {
+ for(i=0; i<z_prev_nsymbols; i++) {
+ bitbuf_put( bb, "ZREMAIN", z_grc_div, z_prev_remain[i]);
+ }
+ }
+ w_prev_enable = w_enable;
+ w_prev_nsymbols = w_nsymbols;
+ memcpy( w_prev_remain, w_remain, sizeof(w_prev_remain));
+ z_prev_enable = z_enable;
+ z_prev_nsymbols = z_nsymbols;
+ memcpy( z_prev_remain, z_remain, sizeof(z_prev_remain));
+ } while( w_prev_enable || z_prev_enable );
+
+ return bb->pos;
+}
+
+
+// return new bitpos
+static int encode_section( const int16_t *inbuf,
+ int size,
+ palette_t *p,
+ uint8_t *bitbuf,
+ int bitbuf_size,
+ int bitpos,
+ int verbose )
+{
+ int uncompressed_bits;
+
+ // Uncompressed mode can only be used if either all weights
+ // are in the palette OR if the palette is not used.
+ if (p->only_palette) {
+ // Uncompressed bits derived from palette size
+ uncompressed_bits=0;
+ while( (1<<uncompressed_bits) < p->palsize )
+ uncompressed_bits++;
+ } else if (p->palsize==0) {
+ // Uncompressed bits is palbits (which is the bitdepth of the greatest weight)
+ uncompressed_bits = p->palbits;
+ } else {
+ // Don't use uncompressed
+ uncompressed_bits = 100;
+ }
+
+ int *weight_values = malloc( size*sizeof(int) );
+ int *zrun_values = malloc( size*sizeof(int) );
+
+ // Get weights (or weight indices) AND zero-runs from the input weight stream.
+ int i=0, n_weights = 0, zcnt;
+ while(1) {
+ if (p->use_zero_runs) {
+ zcnt=0;
+ // Count zero run
+ // Special case: if all weights in the section are zero, we must
+ // still ensure we have one coded weight so that the slice length
+ // doesn't become 0. Therefore we skip the first zero run and code
+ // the zero explicitly as a weight value instead
+ if (!p->only_zeros || i>0) {
+ while( i<size && inbuf[i]==0) {
+ zcnt++;
+ i++;
+ }
+ }
+ zrun_values[n_weights] = zcnt;
+ }
+ if (i==size)
+ break;
+ int value = p->inv_lut[inbuf[i]+256];
+ weight_values[n_weights] = value;
+ n_weights++;
+ i++;
+ }
+
+ // Search for good GRC parameters for the weight stream
+ int n_w_slice, w_bitcnt;
+ uint8_t *w_slice_cfg;
+ int *w_slice_pos;
+ w_slice_cfg = malloc( size );
+ w_slice_pos = malloc( size*sizeof(int) );
+ n_w_slice = search_grc_params( weight_values, n_weights, 0, uncompressed_bits, w_slice_cfg, w_slice_pos, size, 0, 0, &w_bitcnt);
+ if (n_weights==0)
+ n_w_slice = 0;
+
+ // Search for good GRC parameters for the zrun stream
+ int n_z_slice=0, z_bitcnt=0;
+ uint8_t *z_slice_cfg=0;
+ int *z_slice_pos=0;
+ if (p->use_zero_runs) {
+ z_slice_cfg = malloc( size );
+ z_slice_pos = malloc( size*sizeof(int) );
+ n_z_slice = search_grc_params( zrun_values, n_weights+1, 1, 0, z_slice_cfg, z_slice_pos, size, w_slice_pos, n_w_slice, &z_bitcnt);
+ }
+
+ // Encode bitstream slice
+ int pos=0, i_w_slice=0, i_z_slice=0, new_palette=1;
+ while(pos<n_weights || new_palette) {
+ int endpos=pos+32767; // max slice length
+
+ if (i_w_slice<n_w_slice && w_slice_pos[i_w_slice]<endpos) {
+ endpos = w_slice_pos[i_w_slice];
+ }
+
+ if (i_z_slice<n_z_slice && z_slice_pos[i_z_slice]<endpos) {
+ endpos = z_slice_pos[i_z_slice];
+ }
+
+ if (n_weights < endpos) {
+ endpos = n_weights;
+ }
+
+ // The first slice (when new_palette is 1) encodes zero runs both at the
+ // beginning and end (i.e. number of zero runs are len+1).
+ // The following slices only encode zero runs at the end (there cannot be
+ // any zeros in the beginning since they are encoded by the previous slice)
+ int len = endpos - pos;
+ int *zrun_buf = p->use_zero_runs ? zrun_values+pos+(!new_palette) : 0;
+ bitpos = encode_slice( weight_values+pos, zrun_buf, len,
+ p, new_palette, uncompressed_bits,
+ w_slice_cfg[i_w_slice], p->use_zero_runs ? z_slice_cfg[i_z_slice] : 0,
+ bitbuf, bitbuf_size, bitpos, verbose );
+ new_palette = 0;
+
+ if (i_w_slice<n_w_slice && w_slice_pos[i_w_slice]==endpos) {
+ i_w_slice++;
+ }
+ if (i_z_slice<n_z_slice && z_slice_pos[i_z_slice]==endpos) {
+ i_z_slice++;
+ }
+ pos = endpos;
+ }
+
+ // Free temporary buffers
+ free(w_slice_cfg);
+ free(w_slice_pos);
+ if (p->use_zero_runs) {
+ free(z_slice_cfg);
+ free(z_slice_pos);
+ }
+ free(weight_values);
+ free(zrun_values);
+
+ return bitpos;
+}
+
+// Encode the given weight stream
+// inbuf uncompressed 9bit signed weights
+// inbuf_size number of weights
+// outbuf compressed bitstream, buffer is malloced
+// verbose if non-zero, printf log
+// Return value is the size in bytes of the compressed output
+// Return -1 if error
+int mlw_encode( int16_t *inbuf, int inbuf_size, uint8_t **outbuf, int verbose) {
+ int i;
+ // Range check
+ for(i=0; i<inbuf_size; i++) {
+ if (inbuf[i]<-255 || inbuf[i]>255) {
+ printf("ERROR: weight out of range at index %d, weight value is %d (valid range is -255..255)\n", i, inbuf[i]);
+ return -1;
+ }
+ }
+
+ int bitbuf_size = inbuf_size*2+1024;
+ *outbuf = malloc( bitbuf_size );
+
+ // Analyse input data to find palette re-programming points
+ int n_restarts;
+ int *palette_restart_pos;
+ n_restarts = search_palette_sections( inbuf, inbuf_size, &palette_restart_pos);
+
+ // Compress each section (using a single palette) separately
+ int bitpos=0;
+ for(i=0; i<n_restarts; i++) {
+ palette_t palette;
+ int pos, size;
+ pos = palette_restart_pos[i];
+ size = (i<n_restarts-1 ? palette_restart_pos[i+1] : inbuf_size) - pos;
+ find_palette( inbuf+pos, size, &palette);
+ create_inverse_palette( &palette);
+ bitpos = encode_section( inbuf+pos, size, &palette,
+ *outbuf, bitbuf_size, bitpos, verbose );
+ }
+
+
+ // Add end of stream marker and align to 128bit
+ {
+ bitbuf_t bitbuf_s, *bb=&bitbuf_s;
+ bitbuf_init( bb, *outbuf, bitbuf_size, verbose&2?1:0 );
+ bb->pos = bitpos;
+ bitbuf_put( bb, "ZDIV", 3, ZDIV_EOS);
+ bitbuf_put( bb, "BYTEALIGN", (8-(bb->pos&7))&7, 0xff );
+
+        // Pad with 0xff until 128-bit aligned
+ while( bb->pos & 127 ) {
+ bitbuf_put( bb, "PAD", 8, 0xff );
+ }
+ bitpos = bb->pos;
+ }
+ assert((bitpos&127)==0);
+ int outbuf_size = bitpos/8;
+ *outbuf = realloc( *outbuf, outbuf_size);
+
+ free(palette_restart_pos);
+
+ return outbuf_size;
+}
+
+void mlw_free_outbuf( uint8_t *outbuf ) {
+ if (outbuf)
+ free(outbuf);
+}
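Editor's note: a minimal Python sketch of the zero-run/weight split performed in encode_section above (palette lookup and the all-zeros special case are omitted), showing why a section with N coded weights carries N+1 zero runs.

```python
# Minimal sketch of the zero-run extraction in encode_section above.
def split_zero_runs(weights):
    zruns, values = [], []
    i = 0
    while True:
        zcnt = 0
        while i < len(weights) and weights[i] == 0:
            zcnt += 1
            i += 1
        zruns.append(zcnt)            # zero run preceding the next coded weight
        if i == len(weights):
            break                     # the final zero run has no weight after it
        values.append(weights[i])     # the non-zero weight itself
        i += 1
    return zruns, values

print(split_zero_runs([0, 0, 3, 0, -1]))   # ([2, 1, 0], [3, -1])
```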
diff --git a/ethosu/mlw_codec/mlw_encode.h b/ethosu/mlw_codec/mlw_encode.h
new file mode 100644
index 00000000..a995ac6e
--- /dev/null
+++ b/ethosu/mlw_codec/mlw_encode.h
@@ -0,0 +1,45 @@
+/*
+ * Copyright (c) 2020 Arm Limited. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <stdint.h>
+
+#ifndef __MLW_ENCODE_H__
+#define __MLW_ENCODE_H__
+
+#ifdef _MSC_VER
+ #define EXPORTED __declspec(dllexport)
+#else
+ #define EXPORTED __attribute__((visibility("default")))
+#endif
+
+#if __cplusplus
+extern "C"
+{
+#endif
+
+EXPORTED
+int mlw_encode(int16_t *inbuf, int inbuf_size, uint8_t **outbuf, int verbose);
+
+EXPORTED
+void mlw_free_outbuf(uint8_t *outbuf);
+
+#if __cplusplus
+}
+#endif
+
+#endif
diff --git a/ethosu/mlw_codec/mlw_main.c b/ethosu/mlw_codec/mlw_main.c
new file mode 100644
index 00000000..9f720495
--- /dev/null
+++ b/ethosu/mlw_codec/mlw_main.c
@@ -0,0 +1,177 @@
+/*
+ * Copyright (c) 2020 Arm Limited. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <stdbool.h>
+#include <string.h>
+#include <assert.h>
+#include <math.h>
+#include <getopt.h>
+#include <stdarg.h>
+#include "mlw_encode.h"
+#include "mlw_decode.h"
+
+static void fatal_error(const char *format, ...) {
+ va_list ap;
+ va_start (ap, format);
+ vfprintf(stderr, format, ap);
+ va_end(ap);
+ exit(1);
+}
+
+static void print_usage(void) {
+ printf("Usage:\n");
+ printf(" Encode: ./mlw_codec [<options>] [-o <outfile.mlw>] infiles.bin\n");
+ printf(" Decode: ./mlw_codec [<options>] -d [-o <outfile.bin>] infiles.mlw\n");
+ printf("\n");
+ printf("Options:\n");
+ printf(" -w The uncompressed weight file is an int16_t (word) stream.\n");
+    printf("            This is to support 9-bit signed weights. Little endian is assumed.\n");
+ printf(" The default format is int8_t (byte) stream (if -w is not specified)\n");
+ printf("\n");
+}
+
+// Read file into allocated buffer. Return length in bytes.
+static int read_file( FILE *f, uint8_t **buf) {
+
+ fseek(f, 0, SEEK_END);
+ int size = ftell(f);
+ fseek(f, 0, SEEK_SET);
+ *buf = malloc(size);
+ assert(*buf);
+ int rsize = fread(*buf, 1, size, f);
+ assert(rsize==size);
+ fclose(f);
+ return size;
+}
+
+
+#define MAX_INFILES 1000
+
+int main(int argc, char *argv[])
+{
+ int c, decode=0, inbuf_size, outbuf_size;
+ char *infile_name[MAX_INFILES], *outfile_name=0;
+ uint8_t *inbuf=0, *outbuf=0;
+ FILE *infile, *outfile=0;
+ int verbose=0, infile_idx=0;
+ int int16_format=0;
+
+ if (argc==1) {
+ print_usage();
+ exit(1);
+ }
+
+ // Parse command line options
+ while( optind < argc) {
+ // Parse options
+ while ((c = getopt (argc, argv, "di:o:v:w?")) != -1) {
+ switch (c) {
+ case 'd':
+ decode=1;
+ break;
+ case 'i':
+ assert(infile_idx<MAX_INFILES);
+ infile_name[infile_idx++]=optarg;
+ break;
+ case 'o':
+ outfile_name=optarg;
+ break;
+ case 'v':
+ verbose=atoi(optarg);
+ break;
+ case 'w':
+ int16_format=1;
+ break;
+ case '?':
+ print_usage();
+ exit(0);
+ }
+ }
+
+ if (optind<argc) {
+ assert(infile_idx<MAX_INFILES);
+ infile_name[infile_idx++]=argv[optind];
+ optind++;
+
+ }
+ }
+
+ if (outfile_name) {
+ outfile=fopen(outfile_name, "wb");
+ if (!outfile)
+ fatal_error("ERROR: cannot open outfile %s\n", outfile_name);
+ }
+
+ // Loop over input files
+ int nbr_of_infiles=infile_idx;
+ for(infile_idx=0; infile_idx<nbr_of_infiles; infile_idx++) {
+ infile=fopen(infile_name[infile_idx], "rb");
+ if (!infile)
+ fatal_error("ERROR: cannot open infile %s\n", infile_name[infile_idx]);
+
+ // Read infile to buffer
+ inbuf_size = read_file(infile, &inbuf);
+
+ if (!decode) {
+ // Encode
+ int i, n = int16_format ? inbuf_size/sizeof(int16_t) : inbuf_size;
+ int16_t *weights = malloc( n * sizeof(int16_t) );
+ for(i=0; i<n; i++) {
+ weights[i] = int16_format ? ((int16_t*)inbuf)[i] : ((int8_t*)inbuf)[i];
+ }
+ outbuf_size = mlw_encode( weights, n, &outbuf, verbose);
+ free(weights);
+ printf("Input size %d output size %d bpw %4.2f\n", n, outbuf_size, outbuf_size*8.0/n);
+ } else {
+ // Decode
+ int i, n;
+ int16_t *weights;
+ n = mlw_decode( inbuf, inbuf_size, &weights, verbose);
+ outbuf_size = int16_format ? n*sizeof(int16_t) : n;
+ outbuf = malloc( outbuf_size );
+ assert(outbuf);
+ for(i=0; i<n; i++) {
+ if (int16_format)
+ ((int16_t*)outbuf)[i] = weights[i];
+ else
+ outbuf[i] = weights[i];
+ }
+ free(weights);
+ printf("Input size %d output size %d bpw %4.2f\n", inbuf_size, n, inbuf_size*8.0/n);
+
+ }
+
+ if (outfile) {
+ fwrite(outbuf, 1, outbuf_size, outfile);
+ }
+
+ if (inbuf)
+ free(inbuf);
+ if (outbuf)
+ free(outbuf);
+ }
+
+ if (outfile) {
+ fclose(outfile);
+ }
+
+ return 0;
+}
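Editor's note: the -w option above expects the uncompressed weights as a little-endian int16_t stream. A small sketch (assuming numpy is available; file names are only placeholders) of producing such an input file:

```python
# Write signed weights as the little-endian int16 file that "mlw_codec -w" expects.
import numpy as np

weights = [0, 2, 3, 0, -1, -250, 240]        # mlw_encode accepts -255..255
np.array(weights, dtype="<i2").tofile("weights.bin")
# then: ./mlw_codec -w -o weights.mlw weights.bin
```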
diff --git a/ethosu/mlw_codec/test_mlw_codec.py b/ethosu/mlw_codec/test_mlw_codec.py
new file mode 100644
index 00000000..b8687210
--- /dev/null
+++ b/ethosu/mlw_codec/test_mlw_codec.py
@@ -0,0 +1,43 @@
+#!/usr/bin/env python3
+
+# Copyright (C) 2020 Arm Limited or its affiliates. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the License); you may
+# not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an AS IS BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+# Simple example of the usage of mlw_codec.
+
+import sys
+
+from ethosu import mlw_codec
+
+
+# Simple example
+if __name__ == "__main__":
+ weights = [0, 2, 3, 0, -1, -2, -3, 0, 0, 0, 1, -250, 240] * 3
+ print("Original weights :", weights)
+
+ compressed_weights = mlw_codec.encode(weights)
+ print("Compressed weights :", len(compressed_weights), compressed_weights)
+
+ uncompressed_weights = mlw_codec.decode(compressed_weights)
+ print("Uncompressed weights:", uncompressed_weights)
+
+ if weights != uncompressed_weights:
+ print("TEST FAILED")
+ sys.exit(1)
+ else:
+ print("TEST PASSED")
+ sys.exit(0)
diff --git a/ethosu/vela/__init__.py b/ethosu/vela/__init__.py
new file mode 100644
index 00000000..07d8d792
--- /dev/null
+++ b/ethosu/vela/__init__.py
@@ -0,0 +1,20 @@
+# Copyright (C) 2020 Arm Limited or its affiliates. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the License); you may
+# not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an AS IS BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from ._version import __version__
+from .vela import main
+
+__all__ = ["main", "__version__"]
diff --git a/ethosu/vela/__main__.py b/ethosu/vela/__main__.py
new file mode 100644
index 00000000..9bf74c73
--- /dev/null
+++ b/ethosu/vela/__main__.py
@@ -0,0 +1,22 @@
+# Copyright (C) 2020 Arm Limited or its affiliates. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the License); you may
+# not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an AS IS BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import sys
+
+from .vela import main
+
+if __name__ == "__main__":
+ sys.exit(main())
diff --git a/ethosu/vela/_version.py b/ethosu/vela/_version.py
new file mode 100644
index 00000000..f3888c31
--- /dev/null
+++ b/ethosu/vela/_version.py
@@ -0,0 +1,19 @@
+# Copyright (C) 2020 Arm Limited or its affiliates. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the License); you may
+# not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an AS IS BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import pkg_resources
+
+__version__ = pkg_resources.get_distribution("ethos-u-vela").version
\ No newline at end of file
diff --git a/ethosu/vela/architecture_features.py b/ethosu/vela/architecture_features.py
new file mode 100644
index 00000000..4a03d0ef
--- /dev/null
+++ b/ethosu/vela/architecture_features.py
@@ -0,0 +1,618 @@
+# Copyright (C) 2020 Arm Limited or its affiliates. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the License); you may
+# not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an AS IS BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+# Description:
+# Holds a container for Ethos-U55/System architecture parameters.
+
+from .nn_graph import MemArea, TensorPurpose, NpuBlockType, TensorFormat
+from .numeric_util import round_up, round_up_divide
+from collections import namedtuple
+from configparser import ConfigParser
+from .supported_operators import SupportedOperators
+import numpy as np
+import enum
+
+PointXY = namedtuple("PointXY", "x y")
+PointXYZ = namedtuple("PointXYZ", "x y z")
+
+
+class Block:
+ def __init__(self, w, h, d):
+ self.width = w
+ self.height = h
+ self.depth = d
+
+ def __eq__(self, other):
+ if self.width == other.width and self.height == other.height and self.depth == other.depth:
+ return True
+ else:
+ return False
+
+ def __repr__(self):
+ return "<Block: {0},{1},{2}>".format(self.width, self.height, self.depth)
+
+ @classmethod
+ def from_string(cls, s):
+ w, h, c = (int(v) for v in s.split("x"))
+ return cls(w, h, c)
+
+
+class Rect:
+ def __init__(self, x, y, z, x2, y2, z2):
+ self.x = x
+ self.y = y
+ self.z = z
+ self.x2 = x2
+ self.y2 = y2
+ self.z2 = z2
+
+ def start(self):
+ return PointXYZ(self.x, self.y, self.z)
+
+ def end(self):
+ return PointXYZ(self.x2, self.y2, self.z2)
+
+ def size(self):
+ return Block(self.x2 - self.x + 1, self.y2 - self.y + 1, self.z2 - self.z + 1)
+
+ def __repr__(self):
+ return "<Rect: ({0},{1},{2}) ({3},{4},{5})>".format(self.x, self.y, self.z, self.x2, self.y2, self.z2)
+
+
+class Kernel:
+ def __init__(self, w, h, sx=1, sy=1, dx=1, dy=1):
+ assert sx > 0 and sy > 0
+ assert dx > 0 and dy > 0
+ self.width = w
+ self.height = h
+ self.stride = PointXY(sx, sy)
+ self.dilation = PointXY(dx, dy)
+
+
+class SHRAMElements:
+ IFM8 = 0
+ IFM16 = 1
+ IFM8_Elementwise = 2
+ IFM16_Elementwise = 3
+ Acc16 = 4
+ Acc32 = 5
+ Acc40 = 6
+ Last = Acc40
+ BitSizes = np.array([8, 16, 8, 16, 16, 32, 40], np.int32)
+
+
+class SHRAMBlockConfig:
+ def __init__(self, sizes, banks):
+ assert len(banks) == SHRAMElements.Last + 1
+ self.sizes = sizes
+ self.banks = banks
+
+
+# Area indices must match Ethos-U55 SHRAM layout spec
+class SharedBufferArea(enum.IntEnum):
+ OFM = 0
+ Weights = 1
+ IFM = 2
+ Accumulators = 3
+ Size = Accumulators + 1
+
+
+class ArchitectureFeatures:
+ """This class is a container for various parameters of the Ethos-U55 core
+and system configuration that can be tuned, either by command line
+parameters or by the Ethos-U55 architects. The class is often passed
+around to passes that need to do architecture-dependent actions.
+
+Note the difference between ArchitectureFeatures and CompilerOptions
+- ArchitectureFeatures is for changing the Ethos-U55 and system architecture
+- CompilerOptions is for changing the behaviour of the compiler
+
+"""
+
+ ArchitectureConfig = namedtuple(
+ "ArchitectureConfig", "macs cores ofm_ublock ifm_ublock shram_banks shram_granules elem_units"
+ )
+ accelerator_configs = {
+ "ethos-u55-256": ArchitectureConfig(256, 1, Block(2, 2, 8), Block(2, 2, 8), 48, [8, 8, 8, 8, 8, 16, 20], 8),
+ "ethos-u55-128": ArchitectureConfig(128, 1, Block(2, 1, 8), Block(2, 2, 8), 24, [4, 4, 4, 4, 4, 8, 12], 4),
+ "ethos-u55-64": ArchitectureConfig(64, 1, Block(1, 1, 8), Block(1, 1, 8), 16, [2, 2, 2, 2, 4, 4, 8], 2),
+ "ethos-u55-32": ArchitectureConfig(32, 1, Block(1, 1, 4), Block(1, 1, 8), 16, [2, 2, 2, 2, 4, 4, 4], 1),
+ }
+
+ OFMSplitDepth = 16
+
+ def __init__(
+ self,
+ vela_config: ConfigParser,
+ accelerator_config,
+ system_config,
+ permanent_storage,
+ inter_pass_cycle_delay,
+ dram_bandwidth,
+ override_block_config,
+ block_config_limit,
+ global_memory_clock_scale,
+ max_blockdep,
+ ):
+ accelerator_config = accelerator_config.lower()
+ self.vela_config = vela_config
+ self.accelerator_config = accelerator_config
+        if self.accelerator_config not in ArchitectureFeatures.accelerator_configs:
+ raise Exception("Unknown accelerator configuration " + self.accelerator_config)
+ accel_config = ArchitectureFeatures.accelerator_configs[self.accelerator_config]
+ self.config = accel_config
+
+ self.system_config = system_config
+
+ is_yoda_system = "yoda-" in self.accelerator_config
+
+ if is_yoda_system:
+ self.sram_size = 256 * 1024
+ else:
+ self.sram_size = 200 * 1024 * 1024
+
+ self.ncores = accel_config.cores
+ self.ofm_ublock = accel_config.ofm_ublock
+ self.ifm_ublock = accel_config.ifm_ublock
+ self.subkernel_max = Block(8, 8, 65536)
+ self.ofm_block_max = Block(64, 32, 128)
+ self.override_block_config = override_block_config
+ self.block_config_limit = block_config_limit
+
+ self.global_memory_clock_scale = global_memory_clock_scale
+ if self.global_memory_clock_scale <= 0.0 or self.global_memory_clock_scale > 1.0:
+ raise Exception(
+ "Invalid global_memory_clock_scale = "
+ + str(self.global_memory_clock_scale)
+ + " (must be > 0.0 and <= 1.0)"
+ )
+
+ self.max_blockdep = max_blockdep
+
+ dpu_min_height = accel_config.ofm_ublock.height
+ dpu_min_width = accel_config.ofm_ublock.width
+ dpu_dot_product_width = 8
+ dpu_min_ofm_channels = accel_config.ofm_ublock.depth
+
+ self.num_elem_wise_units = accel_config.elem_units
+ self.num_macs_per_cycle = dpu_min_height * dpu_min_width * dpu_dot_product_width * dpu_min_ofm_channels
+
+ self.memory_clock_scales = np.zeros(MemArea.Size)
+ self.memory_port_widths = np.zeros(MemArea.Size)
+
+ # Get system configuration
+ self.__read_sys_config()
+
+ # apply the global memory clock scales to the individual ones from the system config
+ for mem in MemArea.all():
+ self.memory_clock_scales[mem] *= self.global_memory_clock_scale
+
+ self.memory_clocks = self.memory_clock_scales * self.npu_clock
+ self.memory_bandwidths_per_cycle = self.memory_port_widths * self.memory_clock_scales / 8
+
+ if dram_bandwidth != 0:
+ self.memory_bandwidths_per_cycle[MemArea.Dram] = dram_bandwidth * 1e9 / self.npu_clock
+
+ self.memory_bandwidths_per_second = self.memory_bandwidths_per_cycle * self.npu_clock
+
+        # sizes as N x H x W x C. We need to round up to these when allocating storage
+ self.storage_rounding_quantums = {
+ TensorFormat.Unknown: (1, 1, 1, 1),
+ TensorFormat.WeightsCompressed: (1, 1, 1, 1),
+ TensorFormat.NHWC: (1, 1, 1, 1),
+ TensorFormat.NHCWB16: (1, 1, 1, 16),
+ }
+
+ # brick sizes as N x H x W x C. We have to fetch whole bricks at a time
+ self.brick_sizes = {
+ TensorFormat.Unknown: (1, 1, 1, 1),
+ TensorFormat.WeightsCompressed: (1, 1, 1, 1),
+ TensorFormat.NHWC: (1, 1, 1, 1),
+ TensorFormat.NHCWB16: (1, 1, 1, 16),
+ }
+
+ self.inter_pass_cycle_delay = inter_pass_cycle_delay
+
+ self.default_weight_format = TensorFormat.WeightsCompressed
+ self.default_feature_map_format = TensorFormat.NHWC
+
+ if permanent_storage != MemArea.OffChipFlash:
+ self.permanent_storage_mem_area = permanent_storage
+
+ self.tensor_storage_mem_area = {
+ # permanent mem_area
+ TensorPurpose.Weights: self.permanent_storage_mem_area,
+ TensorPurpose.FeatureMap: self.feature_map_storage_mem_area,
+ }
+
+ self.tensor_load_mem_area = dict(self.tensor_storage_mem_area)
+
+ if self.tensor_storage_mem_area[TensorPurpose.Weights] in (MemArea.OffChipFlash,):
+ self.tensor_load_mem_area[TensorPurpose.Weights] = MemArea.Sram
+
+ self.min_block_sizes = {
+ NpuBlockType.Default: (dpu_min_height, dpu_min_width),
+ NpuBlockType.VectorProduct: (1, 1),
+ NpuBlockType.ConvolutionMxN: (dpu_min_height, dpu_min_width),
+ NpuBlockType.Pooling: (dpu_min_height, dpu_min_width),
+ NpuBlockType.ConvolutionDepthWise: (dpu_min_height, dpu_min_width),
+ NpuBlockType.ElementWise: (1, 1),
+ }
+
+ self.sub_kernel_limits = {
+ NpuBlockType.Default: (8, 8),
+ NpuBlockType.VectorProduct: (1, 1),
+ NpuBlockType.ConvolutionMxN: (8, 8),
+ NpuBlockType.Pooling: (8, 8),
+ NpuBlockType.ConvolutionDepthWise: (8, 8),
+ NpuBlockType.ElementWise: (1, 1),
+ }
+
+ # weights for scheduler search
+ from .npu_performance import make_bandwidth_array
+
+ self.bandwidth_weights = make_bandwidth_array()
+ self.bandwidth_weights[MemArea.Sram] = 1.0
+ self.bandwidth_weights[MemArea.Dram] = 10.0
+ self.bandwidth_weights[MemArea.OnChipFlash] = 2.0
+ self.bandwidth_weights[MemArea.OffChipFlash] = 20.0
+ self.cycles_weight = 40
+ self.max_sram_used_weight = 1000
+
+ if is_yoda_system:
+ self.max_sram_used_weight = 0
+
+ # Shared Buffer Block allocations
+ self.shram_bank_size = 1024 # bytes
+ self.shram_size_bytes = accel_config.shram_banks * self.shram_bank_size
+ self.shram_reserved_output_banks = 2
+ self.shram_reserved_weight_banks = 0
+ self.shram_reserved_unused_banks = 2 if accel_config.shram_banks > 16 else 0
+ self.shram_total_banks = accel_config.shram_banks - self.shram_reserved_unused_banks
+ self.shram_bank_granules = np.array(accel_config.shram_granules, np.int32)
+
+ # Build a map of acceptable IFM/OFM block configurations up to the maximum
+ # IFM/OFM block size.
+ ifm_block_max = self.get_ifm_block_size(32, self.ofm_block_max, Kernel(8, 8))
+ self.block_config_map = dict()
+ self.generate_block_config_map(Block(ifm_block_max.width, ifm_block_max.height, 128))
+
+ # Setup supported operators and restriction checkers class
+ self.supported_operators = SupportedOperators()
+
+ # Calculate block configuration for ALL known IFM operations and
+ # accumulator sizes. Consumers will need to select their preferred
+ # operation and bit-width at read-time.
+ def generate_block_config(self, width, height, depth):
+        # Number of bytes required for any SHRAM element for a FM of given dimensions
+ size_bytes = (SHRAMElements.BitSizes * (height * width * depth)) // 8
+ # Convert byte size (rounded) to size in banks
+ size_banks = round_up_divide(size_bytes, self.shram_bank_size)
+ size_banks *= 2 # Double buffer the IFM/Acc (need twice as many banks)
+ # Round bank requirement to bank granularity
+ required_banks = round_up(size_banks, self.shram_bank_granules)
+ return SHRAMBlockConfig(size_bytes, required_banks)
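Editor's note: a self-contained sketch of the bank arithmetic above, plugging in the ethos-u55-256 numbers from accelerator_configs (1 KB banks, SHRAM granules [8, 8, 8, 8, 8, 16, 20]); the 16x16x16 block size is only an illustration.

```python
# Standalone version of the generate_block_config bank computation above.
import numpy as np

bit_sizes = np.array([8, 16, 8, 16, 16, 32, 40], np.int32)   # SHRAMElements.BitSizes
bank_size = 1024
granules = np.array([8, 8, 8, 8, 8, 16, 20], np.int32)

def banks_for_block(width, height, depth):
    size_bytes = (bit_sizes * (height * width * depth)) // 8
    size_banks = -(-size_bytes // bank_size)                      # round_up_divide
    size_banks *= 2                                               # double buffer IFM/Acc
    return ((size_banks + granules - 1) // granules) * granules   # round_up to granule

print(banks_for_block(16, 16, 16))   # [ 8 16  8 16 16 32 40]
```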
+
+ @staticmethod
+ def make_block_config_key(width, height, depth):
+ return (int(height), int(width), int(depth))
+
+ def get_block_config(self, width, height, depth):
+ assert depth <= self.ofm_block_max.depth
+ key = ArchitectureFeatures.make_block_config_key(width, height, depth)
+ config = self.block_config_map.get(key, None)
+ return config
+
+ # Generate a key:value map of possible block configurations, where the
+ # key is compounded from the block dimensions: 0x00HHWWCC
+ def generate_block_config_map(self, block: Block):
+ for h in range(1, block.height + 1):
+ for w in range(1, block.width + 1):
+ # All possible IFM/OFM depth values
+ for c in [4, 8, 12, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128]:
+ key = ArchitectureFeatures.make_block_config_key(w, h, c)
+ self.block_config_map[key] = self.generate_block_config(w, h, c)
+
+ def calc_ifm_block_depth(self, ifm_depth, ifm_bits):
+ assert ifm_bits == 8 or ifm_bits == 16
+ assert ifm_depth > 0
+ ifm_depth = round_up(ifm_depth, self.ifm_ublock.depth)
+ max_block_depth = 32 if ifm_bits == 8 else 16
+ return min(max_block_depth, ifm_depth)
+
+ # Calculate the size of the IFM block given a depth, target OFM block and a kernel
+ def get_ifm_block_size(
+ self, ifm_block_depth, ofm_block: Block, kernel: Kernel, subkernel: Block = Block(8, 8, 65536)
+ ):
+ upscaling = 1
+ # Height
+ ifm_odd_2x_height_enable = 0
+ dilated_kernel_height = ((kernel.height - 1) * kernel.dilation.y) + 1
+ ifm_block_height = (
+ (ofm_block.height - 1) * kernel.stride.y
+ + min(subkernel.height, dilated_kernel_height)
+ + ifm_odd_2x_height_enable
+ ) // upscaling
+
+ if kernel.stride.y == 1:
+ ifm_block_height = round_up(ifm_block_height, self.ofm_ublock.height)
+ elif kernel.stride.y == 2:
+ if (self.ofm_ublock.height == 2) and (ifm_block_height % 4 == 2):
+ ifm_block_height = ifm_block_height + 2
+ else:
+ ifm_block_height = round_up(ifm_block_height, self.ofm_ublock.height)
+ else:
+ assert False
+
+ # Width
+ ifm_odd_2x_width_enable = 0
+ dilated_kernel_width = ((kernel.width - 1) * kernel.dilation.x) + 1
+ ifm_block_width = (
+ (ofm_block.width - 1) * kernel.stride.x
+ + min(subkernel.width, dilated_kernel_width)
+ + ifm_odd_2x_width_enable
+ ) // upscaling
+
+ if kernel.stride.x == 1:
+ ifm_block_width = round_up(ifm_block_width, self.ofm_ublock.width)
+ elif kernel.stride.x == 2:
+ if (self.ofm_ublock.width == 2) and (ifm_block_width % 4 == 2):
+ ifm_block_width = ifm_block_width + 2
+ else:
+ ifm_block_width = round_up(ifm_block_width, self.ofm_ublock.width)
+ else:
+ assert False
+
+ return Block(ifm_block_width, ifm_block_height, ifm_block_depth)
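Editor's note: a small worked instance of the height/width formula above, assuming a 3x3 kernel, stride 1, no dilation, a 16x16 OFM block and the 2x2 OFM micro-block of ethos-u55-256.

```python
# Worked instance of the IFM block height/width formula above.
def round_up(x, to):
    return ((x + to - 1) // to) * to

ofm_block_h = ofm_block_w = 16
kernel_h = kernel_w = 3
stride = dilation = 1
subkernel_limit = 8
ublock = 2

dilated_k = (kernel_h - 1) * dilation + 1                                         # 3
ifm_h = round_up((ofm_block_h - 1) * stride + min(subkernel_limit, dilated_k), ublock)
ifm_w = round_up((ofm_block_w - 1) * stride + min(subkernel_limit, dilated_k), ublock)
print(ifm_h, ifm_w)   # 18 18
```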
+
+ @staticmethod
+ def intersects(start_a, end_a, start_b, end_b):
+ start_x = max(start_a[0], start_b[0])
+ end_x = min(end_a[0], end_b[0])
+ start_y = max(start_a[1], start_b[1])
+ end_y = min(end_a[1], end_b[1])
+ start_z = max(start_a[2], start_b[2])
+ end_z = min(end_a[2], end_b[2])
+ return ((end_x - start_x) > 0) and ((end_y - start_y) > 0) and ((end_z - start_z) > 0)
+
+ # Block job dependency:
+    # Does the VOLUME of IFMs for block job B(0) overlap with the VOLUME of OFMs for block jobs A(8,9,10)?
+ #
+ # A | B
+ # ----------------------+------------------
+ # .... 3,4,5,6,7,8,9,10 | 0,1,2,3,4,5,6,8 10 < JOB NUMBER
+ # |<------->| dependency offset
+ #
+ MAX_BLOCKDEP = 3
+
+ # Get the coordinates of a block offset from either the end (negative)
+ # or the start (zero or positive) of the given 3d area
+ def get_offset_block_coords(self, area: Rect, block: Block, offset):
+ size = area.size()
+ # Dimensions of the region, in blocks
+ width_blocks = round_up_divide(size.width, block.width)
+ height_blocks = round_up_divide(size.height, block.height)
+ depth_blocks = round_up_divide(size.depth, block.depth)
+ total_blocks = width_blocks * height_blocks * depth_blocks
+ if offset < 0:
+ index = total_blocks + offset
+ else:
+ index = offset
+
+ if index >= total_blocks:
+ return None
+
+ # Coordinates of the indexed block
+ coord_z = block.depth * (index % depth_blocks)
+ coord_y = block.height * (index // (depth_blocks * width_blocks))
+ coord_x = block.width * ((index // depth_blocks) % width_blocks)
+
+ return (coord_x + area.x, coord_y + area.y, coord_z + area.z)
+
+ def get_first_job_input_volume(
+ self, ifm: Rect, ofm: Rect, ifm_block_depth, ofm_block: Block, kernel: Kernel, padLT, block_offset
+ ):
+ # Get ifm block size (jobs are invisibly decomposed into subkernels)
+ ifm_block = self.get_ifm_block_size(ifm_block_depth, ofm_block, kernel, self.ofm_block_max)
+ ifm_depth_blocks = round_up_divide(ifm.size().depth, ifm_block_depth)
+
+ # Which OFM block are we calculating
+ ofm_coord = self.get_offset_block_coords(ofm, ofm_block, block_offset // ifm_depth_blocks)
+ if ofm_coord is None:
+ return None
+
+ # Coordinate of the source IFM block
+ ifm_coord_x = max(0, ofm_coord[0] * kernel.stride.x - padLT[0])
+ ifm_coord_y = max(0, ofm_coord[1] * kernel.stride.y - padLT[1])
+ ifm_coord_z = ifm.z + (block_offset % ifm_depth_blocks) * ifm_block.depth
+
+ # IFM block that will be sampled for the FIRST+block_offset job in the next operator's OFM
+ start_coord = (ifm_coord_x, ifm_coord_y, ifm_coord_z)
+ end_coord = (
+ start_coord[0] + ifm_block.width,
+ start_coord[1] + ifm_block.height,
+ start_coord[2] + ifm_block.depth,
+ )
+
+ return (start_coord, end_coord, 1) # start, end, total jobs
+
+ def get_prev_job_output_volume(
+ self, ifm: Block, ofm: Rect, ifm_block_depth, ofm_block: Block, kernel: Kernel, block_offset
+ ):
+ assert block_offset >= 0
+
+ # Get OFM block's volume coordinates
+ start_coord = self.get_offset_block_coords(ofm, ofm_block, -1 - block_offset)
+ if start_coord is None:
+ return None
+ end_coord = (
+ start_coord[0] + ofm_block.width,
+ start_coord[1] + ofm_block.height,
+ start_coord[2] + ofm_block.depth,
+ )
+
+        # Calculate how many IFM blocks this OFM block requires (i.e. how many jobs)
+ ifm_block = self.get_ifm_block_size(ifm_block_depth, ofm_block, kernel, self.ofm_block_max)
+ ifm_depth_blocks = round_up_divide(ifm.size().depth, ifm_block_depth)
+ ifm_depth_blocks = 1 # Overwrite with 1 to force OFM block dependency, not IFM
+
+ return (start_coord, end_coord, ifm_depth_blocks) # start, end, total jobs for this OFM block
+
+ def calc_block_dep(
+ self,
+ prev_ifm: Block,
+ prev_ofm: Block,
+ prev_ifm_block_depth,
+ prev_ofm_block: Block,
+ prev_kernel: Kernel,
+ ifm: Block,
+ ofm: Block,
+ ifm_block_depth,
+ ofm_block: Block,
+ kernel: Kernel,
+ padLT,
+ ):
+
+ blockdep = ArchitectureFeatures.MAX_BLOCKDEP
+
+ # Iterate over the next BLOCKDEP inputs, checking to see if a sliding window
+ # of IFM area overlaps with any previous OFM block generation.
+ elapsed_jobs = 0
+ ifm_depth = ifm.size().depth
+ for forward_offset in range(ArchitectureFeatures.MAX_BLOCKDEP):
+ # This is the IFM block we want to sample from
+ in_area = self.get_first_job_input_volume(
+ ifm, ofm, ifm_block_depth, ofm_block, kernel, padLT, forward_offset
+ )
+ if in_area is None:
+ break
+
+ # Try several previous-OFM blocks in the past (they still might comprise multiple IFM jobs)
+ outstanding_jobs = 0
+ for block_offset in range(ArchitectureFeatures.MAX_BLOCKDEP):
+ # This is the OFM block being generated by the previous op
+ out_area = self.get_prev_job_output_volume(
+ prev_ifm, prev_ofm, prev_ifm_block_depth, prev_ofm_block, prev_kernel, block_offset
+ )
+ if out_area is None:
+ break
+
+ # Block dependency is the max number of allowed outstanding jobs
+ # in the pipeline. Selected by determining how many jobs occur
+ # in between two operators' overlapping OFM->IFM block volumes
+ if ArchitectureFeatures.intersects(in_area[0], in_area[1], out_area[0], out_area[1]):
+ break
+ # Early exit if no intersections and we've seen enough jobs in the pipeline
+ elif outstanding_jobs > ArchitectureFeatures.MAX_BLOCKDEP:
+ break
+
+ # This OFM had this many jobs (accumulate over multiple OFM blocks)
+ outstanding_jobs += out_area[2]
+
+ blockdep = min(blockdep, elapsed_jobs + outstanding_jobs)
+ elapsed_jobs += in_area[2]
+ # Early exit if no intersections and we've seen enough jobs in the pipeline
+ if elapsed_jobs > ArchitectureFeatures.MAX_BLOCKDEP:
+ break
+
+ return blockdep
+
+ def cpu_cycle_estimate(self, op):
+ """
+        Gets the estimated performance of a CPU operation, based on a linear model (intercept and
+        slope) specified in the vela config file, in ConfigParser (.ini) file format.
+ Example configuration snippet:
+ [CpuPerformance.MyOperationType]
+ Cortex-Mx.intercept=<some float value>
+ Cortex-Mx.slope=<some float value>
+ """
+ section = "CpuPerformance." + op.type
+ if self.vela_config is not None and section in self.vela_config:
+ op_config = self.vela_config[section]
+ try:
+ intercept = float(op_config.get(self.cpu_config + ".intercept", op_config["default.intercept"]))
+ slope = float(op_config.get(self.cpu_config + ".slope", op_config["default.slope"]))
+ n_elements = op.inputs[0].elements()
+ cycles = intercept + n_elements * slope
+ return cycles
+ except:
+ print("Error: Reading CPU cycle estimate in vela configuration file, section {}".format(section))
+ raise
+
+ print("Warning: No configured CPU performance estimate for", op.type)
+ return 0
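Editor's note: to make the linear model above concrete, a tiny sketch with made-up intercept/slope values.

```python
# Hypothetical numbers only: cycles = intercept + n_elements * slope, as described above.
intercept, slope = 10000.0, 2.5      # e.g. Cortex-Mx.intercept / Cortex-Mx.slope
n_elements = 1000                    # op.inputs[0].elements()
cycles = intercept + n_elements * slope
print(cycles)                        # 12500.0
```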
+
+ def __read_sys_config(self):
+ """
+ Gets the system configuration with the given name from the vela configuration file
+ Example configuration snippet:
+ [SysConfig.MyConfigName]
+ npu_freq=<some float value>
+ cpu=Cortex-Mx
+ ...
+ """
+ # Get system configuration from the vela configuration file
+ if self.vela_config is None:
+ print("Warning: Using default values for system configuration")
+ else:
+ section_key = "SysConfig." + self.system_config
+            if section_key not in self.vela_config:
+ raise Exception("Unknown system configuration " + self.system_config)
+
+ try:
+ self.npu_clock = float(self.__sys_config("npu_freq", "500e6"))
+ self.cpu_config = self.__sys_config("cpu", "Cortex-M7")
+
+ self.memory_clock_scales[MemArea.Sram] = float(self.__sys_config("Sram_clock_scale", "1"))
+ self.memory_port_widths[MemArea.Sram] = int(self.__sys_config("Sram_port_width", "64"))
+
+ self.memory_clock_scales[MemArea.OnChipFlash] = float(self.__sys_config("OnChipFlash_clock_scale", "1"))
+ self.memory_port_widths[MemArea.OnChipFlash] = int(self.__sys_config("OnChipFlash_port_width", "64"))
+
+ self.memory_clock_scales[MemArea.OffChipFlash] = float(
+ self.__sys_config("OffChipFlash_clock_scale", "0.25")
+ )
+ self.memory_port_widths[MemArea.OffChipFlash] = int(self.__sys_config("OffChipFlash_port_width", "32"))
+
+ self.memory_clock_scales[MemArea.Dram] = float(self.__sys_config("Dram_clock_scale", "1"))
+ self.memory_port_widths[MemArea.Dram] = int(self.__sys_config("Dram_port_width", "32"))
+
+ self.fast_storage_mem_area = MemArea[self.__sys_config("fast_storage_mem_area", "Sram")]
+ self.feature_map_storage_mem_area = MemArea[self.__sys_config("feature_map_storage_mem_area", "Sram")]
+ self.permanent_storage_mem_area = MemArea[self.__sys_config("permanent_storage_mem_area", "OffChipFlash")]
+ if self.permanent_storage_mem_area not in set((MemArea.OnChipFlash, MemArea.OffChipFlash)):
+ raise Exception(
+ "Invalid permanent_storage_mem_area = "
+ + str(self.permanent_storage_mem_area)
+ + " (must be 'OnChipFlash' or 'OffChipFlash'). To store the weights and other constant data in SRAM"
+ " select 'OnChipFlash'"
+ )
+ except:
+ print("Error: Reading System Configuration in vela configuration file, section {}".format(section_key))
+ raise
+
+ def __sys_config(self, key, default_value):
+ """
+ Gets the system configuration value with the given key from the vela config file.
+ """
+ if self.vela_config is None:
+ return default_value
+ section = "SysConfig." + self.system_config
+ result = self.vela_config[section].get(key, None)
+ if result is None:
+ raise Exception("Error: System Configuration Missing key {} in section [{}] ".format(key, section))
+ return result
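Editor's note: a hedged illustration of the system configuration format read above; the key names come from __read_sys_config, while the numeric values are made up for the example.

```python
# Minimal sketch of a vela SysConfig section, consumed the same way as
# __read_sys_config/__sys_config above (values are illustrative only).
from configparser import ConfigParser

cfg = ConfigParser()
cfg.read_string("""
[SysConfig.MyConfigName]
npu_freq=500e6
cpu=Cortex-M7
Sram_clock_scale=1
Sram_port_width=64
Dram_clock_scale=1
Dram_port_width=32
permanent_storage_mem_area=OffChipFlash
""")
section = cfg["SysConfig.MyConfigName"]
print(float(section.get("npu_freq", "500e6")))   # 500000000.0
```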
diff --git a/ethosu/vela/compiler_driver.py b/ethosu/vela/compiler_driver.py
new file mode 100644
index 00000000..7f8c4ca4
--- /dev/null
+++ b/ethosu/vela/compiler_driver.py
@@ -0,0 +1,204 @@
+# Copyright (C) 2020 Arm Limited or its affiliates. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the License); you may
+# not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an AS IS BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+# Description:
+# Contains the main sequencing of the compiler.
+
+from . import graph_optimiser
+from . import mark_tensors
+from . import insert_dma
+from . import pass_packing
+from . import scheduler
+from . import tensor_allocation
+from . import npu_performance
+import time
+
+from . import high_level_command_stream
+from . import high_level_command_stream_generator
+from . import register_command_stream_generator
+from . import extract_npu_subgraphs
+from . import npu_serialisation
+from . import weight_compressor
+from . import live_range
+from .tensor import MemArea
+from .nn_graph import TensorAllocator, PassPlacement
+from .rewrite_graph import verify_graph_health, verify_subgraph_health
+
+
+class CompilerOptions:
+ """Set of options to change compiler behaviour - verbosity, targets, turning off passes.
+
+Note the difference between ArchitectureFeatures and CompilerOptions
+- ArchitectureFeatures is for changing the Ethos-U55 and system architecture
+- CompilerOptions is for changing the behaviour of the compiler
+"""
+
+ def __init__(
+ self,
+ verbose_graph=False,
+ verbose_quantization=False,
+ verbose_packing=False,
+ verbose_tensor_purpose=False,
+ verbose_tensor_format=False,
+ verbose_allocation=False,
+ verbose_high_level_command_stream=False,
+ verbose_register_command_stream=False,
+ verbose_operators=False,
+ show_minimum_possible_allocation=False,
+ show_cpu_operations=False,
+ tensor_allocator=TensorAllocator.Greedy,
+ timing=False,
+ output_dir="outputs",
+ ):
+
+ self.verbose_graph = verbose_graph
+ self.verbose_quantization = verbose_quantization
+ self.verbose_packing = verbose_packing
+ self.verbose_tensor_purpose = verbose_tensor_purpose
+ self.verbose_tensor_format = verbose_tensor_format
+ self.verbose_allocation = verbose_allocation
+ self.verbose_high_level_command_stream = verbose_high_level_command_stream
+ self.verbose_register_command_stream = verbose_register_command_stream
+ self.verbose_operators = verbose_operators
+ self.show_minimum_possible_allocation = show_minimum_possible_allocation
+ self.show_cpu_operations = show_cpu_operations
+ self.tensor_allocator = tensor_allocator
+ self.timing = timing
+ self.output_dir = output_dir
+
+ def __str__(self):
+ return type(self).__name__ + ": " + str(self.__dict__)
+
+ __repr__ = __str__
+
+
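Editor's note: a brief usage sketch of CompilerOptions, assuming the package layout added in this commit is importable as ethosu.

```python
# Construct compiler options with mostly default values; only the import
# paths are assumed from this commit's package layout.
from ethosu.vela.compiler_driver import CompilerOptions
from ethosu.vela.nn_graph import TensorAllocator

opts = CompilerOptions(tensor_allocator=TensorAllocator.Greedy, timing=True, output_dir="outputs")
print(opts)   # CompilerOptions: {'verbose_graph': False, ...}
```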
+def compiler_driver(nng, arch, options, scheduler_options):
+ assert verify_graph_health(nng)
+ nng = graph_optimiser.optimise_graph_a(nng, arch, options.verbose_graph)
+ assert verify_graph_health(nng)
+
+ if options.verbose_quantization:
+ nng.print_graph_with_tensor_quantization()
+
+ nng = graph_optimiser.optimise_graph_b(nng, arch, options.verbose_graph)
+ assert verify_graph_health(nng)
+
+ nng = mark_tensors.mark_tensor_purpose(nng, arch, options.verbose_tensor_purpose)
+ assert verify_graph_health(nng)
+ nng = insert_dma.insert_dma_commands(nng, arch, options.verbose_graph)
+ assert verify_graph_health(nng)
+ pass_packing.pack_into_passes(nng, arch, options.verbose_packing)
+ assert verify_graph_health(nng)
+
+ extract_npu_subgraphs.extract_npu_subgraphs(nng, arch)
+
+ mark_tensors.mark_tensor_format(nng, arch, options.verbose_tensor_format)
+ assert verify_graph_health(nng)
+ if options.timing:
+ start = time.time()
+
+ # Run the scheduler
+ scheduler.schedule_passes(nng, arch, scheduler_options)
+
+ if options.timing:
+ stop = time.time()
+ print("Scheduling took %f s" % (stop - start))
+ start = time.time()
+
+ # Update the compressed weights now that we have determined the
+ # block config, and calc and pack the scales and biases
+ weight_compressor.update_pass_weight_and_scale_tensors(nng, arch)
+
+ # Memory area for all non-constant tensors (Cpu and Npu)
+ non_const_mem_area = MemArea.Sram
+
+ # LiveRanges for constant tensors for all Npu subgraphs
+ permanent_storage = arch.permanent_storage_mem_area
+ lr_graph_flash = live_range.LiveRangeGraph()
+
+ # Placeholders for scratch and flash tensors that are common for all Npu subgraphs
+ scratch_tens = None
+ flash_tens = None
+
+ # Calculate live ranges for all constant Npu tensors, in permanent storage
+ for sg in nng.subgraphs:
+ if sg.placement == PassPlacement.Npu:
+ lr_graph_flash = live_range.extract_live_ranges_from_cascaded_passes(
+ sg, permanent_storage, ignore_subgraph_input_output_tensors=True, lr_graph=lr_graph_flash
+ )
+
+ # Allocate all Npu constant tensors to the first Npu subgraph since it is
+ # processed first during serialization into tensors
+ first_npu_sg = nng.subgraphs[1]
+ assert first_npu_sg.placement == PassPlacement.Npu
+ tensor_allocation.allocate_tensors(
+ nng,
+ first_npu_sg,
+ arch,
+ permanent_storage,
+ scheduler_options.use_ifm_ofm_overlap,
+ options.tensor_allocator,
+ options.verbose_allocation,
+ options.show_minimum_possible_allocation,
+ lr_graph_flash,
+ )
+
+ # Allocate all non-constant tensors to the root, i.e. Cpu, subgraph. This step
+ # will start at the root subgraph's input and traverse from top to bottom. When
+    # it comes across an Npu-op, it will extract live ranges for its corresponding
+ # Npu subgraph and add them to the root's live range graph. Finally, all of the
+ # non-constant tensors are allocated together
+ root_sg = nng.get_root_subgraph()
+ tensor_allocation.allocate_tensors(
+ nng,
+ root_sg,
+ arch,
+ non_const_mem_area,
+ scheduler_options.use_ifm_ofm_overlap,
+ options.tensor_allocator,
+ options.verbose_allocation,
+ options.show_minimum_possible_allocation,
+ )
+
+ # Generate command streams and serialise Npu-ops into tensors
+ for sg in nng.subgraphs:
+ high_level_command_stream_generator.generate_high_level_command_stream(
+ nng, sg, arch, options.verbose_high_level_command_stream
+ )
+ register_command_stream_generator.generate_register_command_stream(
+ nng, sg, arch, options.verbose_register_command_stream
+ )
+ scratch_tens, flash_tens = npu_serialisation.serialise_npu_subgraph_into_tensors(
+ nng, sg, arch, scratch_tens, flash_tens
+ )
+
+ npu_serialisation.rewrite_npu_call_ops(nng, root_sg, arch)
+
+ # Allocate all Cpu constant tensors, this is done last because the Npu-ops
+ # have to be serialized into flash and scratch tensors first
+ tensor_allocation.allocate_tensors(
+ nng,
+ root_sg,
+ arch,
+ permanent_storage,
+ scheduler_options.use_ifm_ofm_overlap,
+ options.tensor_allocator,
+ options.verbose_allocation,
+ options.show_minimum_possible_allocation,
+ )
+
+ npu_performance.calc_performance_for_network(nng, arch)
diff --git a/ethosu/vela/data_type.py b/ethosu/vela/data_type.py
new file mode 100644
index 00000000..1d3e94ed
--- /dev/null
+++ b/ethosu/vela/data_type.py
@@ -0,0 +1,116 @@
+# Copyright (C) 2020 Arm Limited or its affiliates. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the License); you may
+# not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an AS IS BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+# Description:
+# Defines the basic numeric type classes for tensors.
+
+from .numeric_util import round_up_divide
+import enum
+
+
+class BaseType(enum.Flag):
+ Signed = 1
+ Unsigned = 2
+ Asymmetric = 4
+ Int = 8
+ SignedInt = Int | Signed
+ UnsignedInt = Int | Unsigned
+ AsymmSInt = Int | Asymmetric | Signed
+ AsymmUInt = Int | Asymmetric | Unsigned
+ Float = 16
+ BFloat = 32
+ Bool = 64
+ String = 128
+ Resource = 256
+ Variant = 512
+
+
+class DataType:
+ """Defines a data type. Consists of a base type, and the number of bits used for this type"""
+
+ __slots__ = "type", "bits"
+
+ def __init__(self, type_, bits):
+ self.type = type_
+ self.bits = bits
+
+ def __eq__(self, other):
+ return self.type == other.type and self.bits == other.bits
+
+ def __hash__(self):
+ return hash((self.type, self.bits))
+
+ def size_in_bytes(self):
+ return round_up_divide(self.bits, 8)
+
+ def size_in_bits(self):
+ return self.bits
+
+ def __str__(self):
+ stem, needs_format = DataType.stem_name[self.type]
+ if not needs_format:
+ return stem
+ else:
+ return stem % (self.bits,)
+
+ __repr__ = __str__
+
+ stem_name = {
+ BaseType.UnsignedInt: ("uint%s", True),
+ BaseType.SignedInt: ("int%s", True),
+ BaseType.AsymmUInt: ("quint%s", True),
+ BaseType.AsymmSInt: ("qint%s", True),
+ BaseType.Float: ("float%s", True),
+ BaseType.BFloat: ("bfloat%s", True),
+ BaseType.Bool: ("bool", False),
+ BaseType.String: ("string", False),
+ BaseType.Resource: ("resource", False),
+ BaseType.Variant: ("variant", False),
+ }
+
+
+# generate the standard set of data types
+DataType.int8 = DataType(BaseType.SignedInt, 8)
+DataType.int16 = DataType(BaseType.SignedInt, 16)
+DataType.int32 = DataType(BaseType.SignedInt, 32)
+DataType.int64 = DataType(BaseType.SignedInt, 64)
+
+DataType.uint8 = DataType(BaseType.UnsignedInt, 8)
+DataType.uint16 = DataType(BaseType.UnsignedInt, 16)
+DataType.uint32 = DataType(BaseType.UnsignedInt, 32)
+DataType.uint64 = DataType(BaseType.UnsignedInt, 64)
+
+DataType.quint4 = DataType(BaseType.AsymmUInt, 4)
+DataType.quint8 = DataType(BaseType.AsymmUInt, 8)
+DataType.quint12 = DataType(BaseType.AsymmUInt, 12)
+DataType.quint16 = DataType(BaseType.AsymmUInt, 16)
+DataType.quint32 = DataType(BaseType.AsymmUInt, 32)
+
+DataType.qint4 = DataType(BaseType.AsymmSInt, 4)
+DataType.qint8 = DataType(BaseType.AsymmSInt, 8)
+DataType.qint12 = DataType(BaseType.AsymmSInt, 12)
+DataType.qint16 = DataType(BaseType.AsymmSInt, 16)
+DataType.qint32 = DataType(BaseType.AsymmSInt, 32)
+
+DataType.float16 = DataType(BaseType.Float, 16)
+DataType.float32 = DataType(BaseType.Float, 32)
+DataType.float64 = DataType(BaseType.Float, 64)
+
+DataType.string = DataType(BaseType.String, 64)
+DataType.bool = DataType(BaseType.Bool, 8)
+DataType.resource = DataType(BaseType.Resource, 8)
+DataType.variant = DataType(BaseType.Variant, 8)
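Editor's note: a quick check of the naming and size rules defined above, assuming the package is importable as ethosu.vela.

```python
# Usage sketch for the DataType definitions above.
from ethosu.vela.data_type import DataType

print(DataType.int16)                    # int16
print(DataType.int16.size_in_bytes())    # 2
print(DataType.quint4.size_in_bytes())   # 1 -- 4 bits round up to a whole byte
```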
diff --git a/ethosu/vela/driver_actions.py b/ethosu/vela/driver_actions.py
new file mode 100644
index 00000000..86c4a369
--- /dev/null
+++ b/ethosu/vela/driver_actions.py
@@ -0,0 +1,107 @@
+# Copyright (C) 2020 Arm Limited or its affiliates. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the License); you may
+# not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an AS IS BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+# Description:
+# Creates driver actions that are embedded in the custom operator payload.
+
+import numpy as np
+from typing import List
+from .ethos_u55_regs.ethos_u55_regs import *
+
+
+class DACommands:
+ Reserved = 0x00
+ Config = 0x01
+ Config_PatchShift = 4
+ CmdStream = 0x02
+ ReadAPB = 0x03
+ ReadAPB_CountShift = 12
+ ReadAPB_IndexMask = (1 << ReadAPB_CountShift) - 1
+ DumpSHRAM = 0x04
+ NOP = 0x05
+
+
+def make_da_tag(id: int, reserved: int, param: int) -> int:
+ tag: int = id
+ tag |= reserved << 8
+ tag |= param << 16
+ return tag
+
+
+def emit_fourcc(data: List[int], fourcc: str):
+ assert data != None
+ assert fourcc != None
+ assert len(fourcc) == 4
+ value: int = 0
+ value = fourcc[0].encode()[0]
+ value |= fourcc[1].encode()[0] << 8
+ value |= fourcc[2].encode()[0] << 16
+ value |= fourcc[3].encode()[0] << 24
+ data.append(value)
+
+
+def build_id_word():
+ arch_major_rev, arch_minor_rev, arch_patch_rev = (int(x) for x in ARCH_VER.split("."))
+ n = id_r()
+ n.set_arch_major_rev(arch_major_rev)
+ n.set_arch_minor_rev(arch_minor_rev)
+ n.set_arch_patch_rev(arch_patch_rev)
+ return n.word
+
+
+def build_config_word(arch):
+ macs_cc = arch.config.macs
+ log2_macs_cc = int(np.log2(macs_cc) + 0.5)
+ shram_size = int(arch.shram_size_bytes / 1024)
+ n = config_r()
+ n.set_shram_size(shram_size)
+ n.set_cmd_stream_version(0) # may be incremented in the future
+ n.set_macs_per_cc(log2_macs_cc)
+ return n.word
+
+
+def emit_config(data: List[int], rel: int, patch: int, arch):
+ assert data != None
+ data.append(make_da_tag(DACommands.Config, 0, (patch << DACommands.Config_PatchShift) | rel))
+ data.append(build_config_word(arch))
+ data.append(build_id_word())
+
+
+def emit_cmd_stream_header(data: List[int], length: int):
+ assert data != None
+ # Insert NOPs to align start of command stream to 16 bytes
+ num_nops = 4 - ((len(data) + 1) % 4)
+ for _ in range(num_nops):
+ data.append(make_da_tag(DACommands.NOP, 0, 0))
+
+    # Use the reserved 8 bits as the high part of the length
+ length_high = (length & 0x00FF0000) >> 16
+ length_low = length & 0x0000FFFF
+ data.append(make_da_tag(DACommands.CmdStream, length_high, length_low))
+
+
+def emit_reg_read(data: List[int], reg_index: int, reg_count: int = 1):
+ assert data != None
+ assert reg_index >= 0
+ assert reg_count >= 1
+ payload: int = (reg_index & DACommands.ReadAPB_IndexMask) | ((reg_count << DACommands.ReadAPB_CountShift) - 1)
+ data.append(make_da_tag(DACommands.ReadAPB, 0, payload))
+
+
+def emit_dump_shram(data: List[int]):
+ assert data != None
+ data.append(make_da_tag(DACommands.DumpSHRAM, 0, 0))
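Editor's note: a standalone sketch of the 8/8/16-bit driver-action tag packing and the FourCC word layout used above; the "COP1" string is only an example code, not necessarily what vela emits.

```python
# Mirror of make_da_tag and emit_fourcc packing above, for illustration.
def make_da_tag(cmd_id, reserved, param):
    return cmd_id | (reserved << 8) | (param << 16)

def fourcc_word(fourcc):
    assert len(fourcc) == 4
    return sum(fourcc[i].encode()[0] << (8 * i) for i in range(4))

print(hex(make_da_tag(0x02, 0, 16)))   # 0x100002 -> CmdStream tag, length 16
print(hex(fourcc_word("COP1")))        # 0x31504f43
```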
diff --git a/ethosu/vela/ethos_u55_regs/ethos_u55_regs.py b/ethosu/vela/ethos_u55_regs/ethos_u55_regs.py
new file mode 100644
index 00000000..37f7a67a
--- /dev/null
+++ b/ethosu/vela/ethos_u55_regs/ethos_u55_regs.py
@@ -0,0 +1,3138 @@
+# Copyright (C) 2020 Arm Limited or its affiliates. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the License); you may
+# not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an AS IS BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from ctypes import *
+from enum import Enum
+
+ARCH_VER = '0.154.0'
+
+
+class DEBUG_INTERNAL(Enum):
+ SHARED_BUFFER0 = 0x0400
+ SHARED_BUFFER1 = 0x0404
+ SHARED_BUFFER2 = 0x0408
+ SHARED_BUFFER3 = 0x040C
+ SHARED_BUFFER4 = 0x0410
+ SHARED_BUFFER5 = 0x0414
+ SHARED_BUFFER6 = 0x0418
+ SHARED_BUFFER7 = 0x041C
+ SHARED_BUFFER8 = 0x0420
+ SHARED_BUFFER9 = 0x0424
+ SHARED_BUFFER10 = 0x0428
+ SHARED_BUFFER11 = 0x042C
+ SHARED_BUFFER12 = 0x0430
+ SHARED_BUFFER13 = 0x0434
+ SHARED_BUFFER14 = 0x0438
+ SHARED_BUFFER15 = 0x043C
+ SHARED_BUFFER16 = 0x0440
+ SHARED_BUFFER17 = 0x0444
+ SHARED_BUFFER18 = 0x0448
+ SHARED_BUFFER19 = 0x044C
+ SHARED_BUFFER20 = 0x0450
+ SHARED_BUFFER21 = 0x0454
+ SHARED_BUFFER22 = 0x0458
+ SHARED_BUFFER23 = 0x045C
+ SHARED_BUFFER24 = 0x0460
+ SHARED_BUFFER25 = 0x0464
+ SHARED_BUFFER26 = 0x0468
+ SHARED_BUFFER27 = 0x046C
+ SHARED_BUFFER28 = 0x0470
+ SHARED_BUFFER29 = 0x0474
+ SHARED_BUFFER30 = 0x0478
+ SHARED_BUFFER31 = 0x047C
+ SHARED_BUFFER32 = 0x0480
+ SHARED_BUFFER33 = 0x0484
+ SHARED_BUFFER34 = 0x0488
+ SHARED_BUFFER35 = 0x048C
+ SHARED_BUFFER36 = 0x0490
+ SHARED_BUFFER37 = 0x0494
+ SHARED_BUFFER38 = 0x0498
+ SHARED_BUFFER39 = 0x049C
+ SHARED_BUFFER40 = 0x04A0
+ SHARED_BUFFER41 = 0x04A4
+ SHARED_BUFFER42 = 0x04A8
+ SHARED_BUFFER43 = 0x04AC
+ SHARED_BUFFER44 = 0x04B0
+ SHARED_BUFFER45 = 0x04B4
+ SHARED_BUFFER46 = 0x04B8
+ SHARED_BUFFER47 = 0x04BC
+ SHARED_BUFFER48 = 0x04C0
+ SHARED_BUFFER49 = 0x04C4
+ SHARED_BUFFER50 = 0x04C8
+ SHARED_BUFFER51 = 0x04CC
+ SHARED_BUFFER52 = 0x04D0
+ SHARED_BUFFER53 = 0x04D4
+ SHARED_BUFFER54 = 0x04D8
+ SHARED_BUFFER55 = 0x04DC
+ SHARED_BUFFER56 = 0x04E0
+ SHARED_BUFFER57 = 0x04E4
+ SHARED_BUFFER58 = 0x04E8
+ SHARED_BUFFER59 = 0x04EC
+ SHARED_BUFFER60 = 0x04F0
+ SHARED_BUFFER61 = 0x04F4
+ SHARED_BUFFER62 = 0x04F8
+ SHARED_BUFFER63 = 0x04FC
+ SHARED_BUFFER64 = 0x0500
+ SHARED_BUFFER65 = 0x0504
+ SHARED_BUFFER66 = 0x0508
+ SHARED_BUFFER67 = 0x050C
+ SHARED_BUFFER68 = 0x0510
+ SHARED_BUFFER69 = 0x0514
+ SHARED_BUFFER70 = 0x0518
+ SHARED_BUFFER71 = 0x051C
+ SHARED_BUFFER72 = 0x0520
+ SHARED_BUFFER73 = 0x0524
+ SHARED_BUFFER74 = 0x0528
+ SHARED_BUFFER75 = 0x052C
+ SHARED_BUFFER76 = 0x0530
+ SHARED_BUFFER77 = 0x0534
+ SHARED_BUFFER78 = 0x0538
+ SHARED_BUFFER79 = 0x053C
+ SHARED_BUFFER80 = 0x0540
+ SHARED_BUFFER81 = 0x0544
+ SHARED_BUFFER82 = 0x0548
+ SHARED_BUFFER83 = 0x054C
+ SHARED_BUFFER84 = 0x0550
+ SHARED_BUFFER85 = 0x0554
+ SHARED_BUFFER86 = 0x0558
+ SHARED_BUFFER87 = 0x055C
+ SHARED_BUFFER88 = 0x0560
+ SHARED_BUFFER89 = 0x0564
+ SHARED_BUFFER90 = 0x0568
+ SHARED_BUFFER91 = 0x056C
+ SHARED_BUFFER92 = 0x0570
+ SHARED_BUFFER93 = 0x0574
+ SHARED_BUFFER94 = 0x0578
+ SHARED_BUFFER95 = 0x057C
+ SHARED_BUFFER96 = 0x0580
+ SHARED_BUFFER97 = 0x0584
+ SHARED_BUFFER98 = 0x0588
+ SHARED_BUFFER99 = 0x058C
+ SHARED_BUFFER100 = 0x0590
+ SHARED_BUFFER101 = 0x0594
+ SHARED_BUFFER102 = 0x0598
+ SHARED_BUFFER103 = 0x059C
+ SHARED_BUFFER104 = 0x05A0
+ SHARED_BUFFER105 = 0x05A4
+ SHARED_BUFFER106 = 0x05A8
+ SHARED_BUFFER107 = 0x05AC
+ SHARED_BUFFER108 = 0x05B0
+ SHARED_BUFFER109 = 0x05B4
+ SHARED_BUFFER110 = 0x05B8
+ SHARED_BUFFER111 = 0x05BC
+ SHARED_BUFFER112 = 0x05C0
+ SHARED_BUFFER113 = 0x05C4
+ SHARED_BUFFER114 = 0x05C8
+ SHARED_BUFFER115 = 0x05CC
+ SHARED_BUFFER116 = 0x05D0
+ SHARED_BUFFER117 = 0x05D4
+ SHARED_BUFFER118 = 0x05D8
+ SHARED_BUFFER119 = 0x05DC
+ SHARED_BUFFER120 = 0x05E0
+ SHARED_BUFFER121 = 0x05E4
+ SHARED_BUFFER122 = 0x05E8
+ SHARED_BUFFER123 = 0x05EC
+ SHARED_BUFFER124 = 0x05F0
+ SHARED_BUFFER125 = 0x05F4
+ SHARED_BUFFER126 = 0x05F8
+ SHARED_BUFFER127 = 0x05FC
+ SHARED_BUFFER128 = 0x0600
+ SHARED_BUFFER129 = 0x0604
+ SHARED_BUFFER130 = 0x0608
+ SHARED_BUFFER131 = 0x060C
+ SHARED_BUFFER132 = 0x0610
+ SHARED_BUFFER133 = 0x0614
+ SHARED_BUFFER134 = 0x0618
+ SHARED_BUFFER135 = 0x061C
+ SHARED_BUFFER136 = 0x0620
+ SHARED_BUFFER137 = 0x0624
+ SHARED_BUFFER138 = 0x0628
+ SHARED_BUFFER139 = 0x062C
+ SHARED_BUFFER140 = 0x0630
+ SHARED_BUFFER141 = 0x0634
+ SHARED_BUFFER142 = 0x0638
+ SHARED_BUFFER143 = 0x063C
+ SHARED_BUFFER144 = 0x0640
+ SHARED_BUFFER145 = 0x0644
+ SHARED_BUFFER146 = 0x0648
+ SHARED_BUFFER147 = 0x064C
+ SHARED_BUFFER148 = 0x0650
+ SHARED_BUFFER149 = 0x0654
+ SHARED_BUFFER150 = 0x0658
+ SHARED_BUFFER151 = 0x065C
+ SHARED_BUFFER152 = 0x0660
+ SHARED_BUFFER153 = 0x0664
+ SHARED_BUFFER154 = 0x0668
+ SHARED_BUFFER155 = 0x066C
+ SHARED_BUFFER156 = 0x0670
+ SHARED_BUFFER157 = 0x0674
+ SHARED_BUFFER158 = 0x0678
+ SHARED_BUFFER159 = 0x067C
+ SHARED_BUFFER160 = 0x0680
+ SHARED_BUFFER161 = 0x0684
+ SHARED_BUFFER162 = 0x0688
+ SHARED_BUFFER163 = 0x068C
+ SHARED_BUFFER164 = 0x0690
+ SHARED_BUFFER165 = 0x0694
+ SHARED_BUFFER166 = 0x0698
+ SHARED_BUFFER167 = 0x069C
+ SHARED_BUFFER168 = 0x06A0
+ SHARED_BUFFER169 = 0x06A4
+ SHARED_BUFFER170 = 0x06A8
+ SHARED_BUFFER171 = 0x06AC
+ SHARED_BUFFER172 = 0x06B0
+ SHARED_BUFFER173 = 0x06B4
+ SHARED_BUFFER174 = 0x06B8
+ SHARED_BUFFER175 = 0x06BC
+ SHARED_BUFFER176 = 0x06C0
+ SHARED_BUFFER177 = 0x06C4
+ SHARED_BUFFER178 = 0x06C8
+ SHARED_BUFFER179 = 0x06CC
+ SHARED_BUFFER180 = 0x06D0
+ SHARED_BUFFER181 = 0x06D4
+ SHARED_BUFFER182 = 0x06D8
+ SHARED_BUFFER183 = 0x06DC
+ SHARED_BUFFER184 = 0x06E0
+ SHARED_BUFFER185 = 0x06E4
+ SHARED_BUFFER186 = 0x06E8
+ SHARED_BUFFER187 = 0x06EC
+ SHARED_BUFFER188 = 0x06F0
+ SHARED_BUFFER189 = 0x06F4
+ SHARED_BUFFER190 = 0x06F8
+ SHARED_BUFFER191 = 0x06FC
+ SHARED_BUFFER192 = 0x0700
+ SHARED_BUFFER193 = 0x0704
+ SHARED_BUFFER194 = 0x0708
+ SHARED_BUFFER195 = 0x070C
+ SHARED_BUFFER196 = 0x0710
+ SHARED_BUFFER197 = 0x0714
+ SHARED_BUFFER198 = 0x0718
+ SHARED_BUFFER199 = 0x071C
+ SHARED_BUFFER200 = 0x0720
+ SHARED_BUFFER201 = 0x0724
+ SHARED_BUFFER202 = 0x0728
+ SHARED_BUFFER203 = 0x072C
+ SHARED_BUFFER204 = 0x0730
+ SHARED_BUFFER205 = 0x0734
+ SHARED_BUFFER206 = 0x0738
+ SHARED_BUFFER207 = 0x073C
+ SHARED_BUFFER208 = 0x0740
+ SHARED_BUFFER209 = 0x0744
+ SHARED_BUFFER210 = 0x0748
+ SHARED_BUFFER211 = 0x074C
+ SHARED_BUFFER212 = 0x0750
+ SHARED_BUFFER213 = 0x0754
+ SHARED_BUFFER214 = 0x0758
+ SHARED_BUFFER215 = 0x075C
+ SHARED_BUFFER216 = 0x0760
+ SHARED_BUFFER217 = 0x0764
+ SHARED_BUFFER218 = 0x0768
+ SHARED_BUFFER219 = 0x076C
+ SHARED_BUFFER220 = 0x0770
+ SHARED_BUFFER221 = 0x0774
+ SHARED_BUFFER222 = 0x0778
+ SHARED_BUFFER223 = 0x077C
+ SHARED_BUFFER224 = 0x0780
+ SHARED_BUFFER225 = 0x0784
+ SHARED_BUFFER226 = 0x0788
+ SHARED_BUFFER227 = 0x078C
+ SHARED_BUFFER228 = 0x0790
+ SHARED_BUFFER229 = 0x0794
+ SHARED_BUFFER230 = 0x0798
+ SHARED_BUFFER231 = 0x079C
+ SHARED_BUFFER232 = 0x07A0
+ SHARED_BUFFER233 = 0x07A4
+ SHARED_BUFFER234 = 0x07A8
+ SHARED_BUFFER235 = 0x07AC
+ SHARED_BUFFER236 = 0x07B0
+ SHARED_BUFFER237 = 0x07B4
+ SHARED_BUFFER238 = 0x07B8
+ SHARED_BUFFER239 = 0x07BC
+ SHARED_BUFFER240 = 0x07C0
+ SHARED_BUFFER241 = 0x07C4
+ SHARED_BUFFER242 = 0x07C8
+ SHARED_BUFFER243 = 0x07CC
+ SHARED_BUFFER244 = 0x07D0
+ SHARED_BUFFER245 = 0x07D4
+ SHARED_BUFFER246 = 0x07D8
+ SHARED_BUFFER247 = 0x07DC
+ SHARED_BUFFER248 = 0x07E0
+ SHARED_BUFFER249 = 0x07E4
+ SHARED_BUFFER250 = 0x07E8
+ SHARED_BUFFER251 = 0x07EC
+ SHARED_BUFFER252 = 0x07F0
+ SHARED_BUFFER253 = 0x07F4
+ SHARED_BUFFER254 = 0x07F8
+ SHARED_BUFFER255 = 0x07FC
+ SIZE = 0x0800
+
+class HW_DEBUG_INTERNAL(Enum):
+ CLKFORCE = 0x0140
+ DEBUG = 0x0144
+ DEBUG2 = 0x0148
+ DEBUGCORE = 0x014C
+ SIZE = 0x0150
+
+class NPU_BP(Enum):
+ BASEP0 = 0x0080
+ BASEP1 = 0x0084
+ BASEP2 = 0x0088
+ BASEP3 = 0x008C
+ BASEP4 = 0x0090
+ BASEP5 = 0x0094
+ BASEP6 = 0x0098
+ BASEP7 = 0x009C
+ BASEP8 = 0x00A0
+ BASEP9 = 0x00A4
+ BASEP10 = 0x00A8
+ BASEP11 = 0x00AC
+ BASEP12 = 0x00B0
+ BASEP13 = 0x00B4
+ BASEP14 = 0x00B8
+ BASEP15 = 0x00BC
+ SIZE = 0x00C0
+
+class NPU_IDS(Enum):
+ REVISION = 0x0FC0
+ PID4 = 0x0FD0
+ PID5 = 0x0FD4
+ PID6 = 0x0FD8
+ PID7 = 0x0FDC
+ PID0 = 0x0FE0
+ PID1 = 0x0FE4
+ PID2 = 0x0FE8
+ PID3 = 0x0FEC
+ CID0 = 0x0FF0
+ CID1 = 0x0FF4
+ CID2 = 0x0FF8
+ CID3 = 0x0FFC
+ SIZE = 0x1000
+
+class NPU_REG(Enum):
+ ID = 0x0000
+ STATUS = 0x0004
+ CMD = 0x0008
+ RESET = 0x000C
+ QBASE0 = 0x0010
+ QBASE1 = 0x0014
+ QREAD = 0x0018
+ QCONFIG = 0x001C
+ QSIZE = 0x0020
+ PROT = 0x0024
+ CONFIG = 0x0028
+ LOCK = 0x002C
+ REGIONCFG = 0x003C
+ AXI_LIMIT0 = 0x0040
+ AXI_LIMIT1 = 0x0044
+ AXI_LIMIT2 = 0x0048
+ AXI_LIMIT3 = 0x004C
+ SIZE = 0x0050
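+
+# Usage sketch (illustrative, not part of the generated register spec): the enum values
+# here are presumably byte offsets from the NPU register base, so a driver-side status
+# read would look roughly like:
+#     status_word = read_u32(npu_base + NPU_REG.STATUS.value)
+# where npu_base and read_u32 are hypothetical platform helpers.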
+
+class PMU_INTERNAL(Enum):
+ PMCR = 0x0180
+ PMCNTENSET = 0x0184
+ PMCNTENCLR = 0x0188
+ PMOVSSET = 0x018C
+ PMOVSCLR = 0x0190
+ PMINTSET = 0x0194
+ PMINTCLR = 0x0198
+ PMCCNTR_LO = 0x01A0
+ PMCCNTR_HI = 0x01A4
+ PMCCNTR_CFG = 0x01A8
+ PMCAXI_CHAN = 0x01AC
+ PMEVCNTR0 = 0x0300
+ PMEVCNTR1 = 0x0304
+ PMEVCNTR2 = 0x0308
+ PMEVCNTR3 = 0x030C
+ PMEVTYPER0 = 0x0380
+ PMEVTYPER1 = 0x0384
+ PMEVTYPER2 = 0x0388
+ PMEVTYPER3 = 0x038C
+ SIZE = 0x0390
+
+class TSU_DEBUG_INTERNAL(Enum):
+ IFM_PAD_TOP = 0x0800
+ IFM_PAD_LEFT = 0x0804
+ IFM_PAD_RIGHT = 0x0808
+ IFM_PAD_BOTTOM = 0x080C
+ IFM_DEPTH_M1 = 0x0810
+ IFM_PRECISION = 0x0814
+ IFM_UPSCALE = 0x081C
+ IFM_ZERO_POINT = 0x0824
+ IFM_WIDTH0_M1 = 0x0828
+ IFM_HEIGHT0_M1 = 0x082C
+ IFM_HEIGHT1_M1 = 0x0830
+ IFM_IB_END = 0x0834
+ IFM_REGION = 0x083C
+ OFM_WIDTH_M1 = 0x0844
+ OFM_HEIGHT_M1 = 0x0848
+ OFM_DEPTH_M1 = 0x084C
+ OFM_PRECISION = 0x0850
+ OFM_BLK_WIDTH_M1 = 0x0854
+ OFM_BLK_HEIGHT_M1 = 0x0858
+ OFM_BLK_DEPTH_M1 = 0x085C
+ OFM_ZERO_POINT = 0x0860
+ OFM_WIDTH0_M1 = 0x0868
+ OFM_HEIGHT0_M1 = 0x086C
+ OFM_HEIGHT1_M1 = 0x0870
+ OFM_REGION = 0x087C
+ KERNEL_WIDTH_M1 = 0x0880
+ KERNEL_HEIGHT_M1 = 0x0884
+ KERNEL_STRIDE = 0x0888
+ PARALLEL_MODE = 0x088C
+ ACC_FORMAT = 0x0890
+ ACTIVATION = 0x0894
+ ACTIVATION_MIN = 0x0898
+ ACTIVATION_MAX = 0x089C
+ WEIGHT_REGION = 0x08A0
+ SCALE_REGION = 0x08A4
+ AB_START = 0x08B4
+ BLOCKDEP = 0x08BC
+ DMA0_SRC_REGION = 0x08C0
+ DMA0_DST_REGION = 0x08C4
+ DMA0_SIZE0 = 0x08C8
+ DMA0_SIZE1 = 0x08CC
+ IFM2_BROADCAST = 0x0900
+ IFM2_SCALAR = 0x0904
+ IFM2_PRECISION = 0x0914
+ IFM2_ZERO_POINT = 0x0924
+ IFM2_WIDTH0_M1 = 0x0928
+ IFM2_HEIGHT0_M1 = 0x092C
+ IFM2_HEIGHT1_M1 = 0x0930
+ IFM2_IB_START = 0x0934
+ IFM2_REGION = 0x093C
+ IFM_BASE0 = 0x0A00
+ IFM_BASE0_HI = 0x0A04
+ IFM_BASE1 = 0x0A08
+ IFM_BASE1_HI = 0x0A0C
+ IFM_BASE2 = 0x0A10
+ IFM_BASE2_HI = 0x0A14
+ IFM_BASE3 = 0x0A18
+ IFM_BASE3_HI = 0x0A1C
+ IFM_STRIDE_X = 0x0A20
+ IFM_STRIDE_X_HI = 0x0A24
+ IFM_STRIDE_Y = 0x0A28
+ IFM_STRIDE_Y_HI = 0x0A2C
+ IFM_STRIDE_C = 0x0A30
+ IFM_STRIDE_C_HI = 0x0A34
+ OFM_BASE0 = 0x0A40
+ OFM_BASE0_HI = 0x0A44
+ OFM_BASE1 = 0x0A48
+ OFM_BASE1_HI = 0x0A4C
+ OFM_BASE2 = 0x0A50
+ OFM_BASE2_HI = 0x0A54
+ OFM_BASE3 = 0x0A58
+ OFM_BASE3_HI = 0x0A5C
+ OFM_STRIDE_X = 0x0A60
+ OFM_STRIDE_X_HI = 0x0A64
+ OFM_STRIDE_Y = 0x0A68
+ OFM_STRIDE_Y_HI = 0x0A6C
+ OFM_STRIDE_C = 0x0A70
+ OFM_STRIDE_C_HI = 0x0A74
+ WEIGHT_BASE = 0x0A80
+ WEIGHT_BASE_HI = 0x0A84
+ WEIGHT_LENGTH = 0x0A88
+ WEIGHT_LENGTH_HI = 0x0A8C
+ SCALE_BASE = 0x0A90
+ SCALE_BASE_HI = 0x0A94
+ SCALE_LENGTH = 0x0A98
+ OFM_SCALE = 0x0AA0
+ OFM_SCALE_SHIFT = 0x0AA4
+ OPA_SCALE = 0x0AA8
+ OPA_SCALE_SHIFT = 0x0AAC
+ OPB_SCALE = 0x0AB0
+ DMA0_SRC = 0x0AC0
+ DMA0_SRC_HI = 0x0AC4
+ DMA0_DST = 0x0AC8
+ DMA0_DST_HI = 0x0ACC
+ DMA0_LEN = 0x0AD0
+ DMA0_LEN_HI = 0x0AD4
+ DMA0_SKIP0 = 0x0AD8
+ DMA0_SKIP0_HI = 0x0ADC
+ DMA0_SKIP1 = 0x0AE0
+ DMA0_SKIP1_HI = 0x0AE4
+ IFM2_BASE0 = 0x0B00
+ IFM2_BASE0_HI = 0x0B04
+ IFM2_BASE1 = 0x0B08
+ IFM2_BASE1_HI = 0x0B0C
+ IFM2_BASE2 = 0x0B10
+ IFM2_BASE2_HI = 0x0B14
+ IFM2_BASE3 = 0x0B18
+ IFM2_BASE3_HI = 0x0B1C
+ IFM2_STRIDE_X = 0x0B20
+ IFM2_STRIDE_X_HI = 0x0B24
+ IFM2_STRIDE_Y = 0x0B28
+ IFM2_STRIDE_Y_HI = 0x0B2C
+ IFM2_STRIDE_C = 0x0B30
+ IFM2_STRIDE_C_HI = 0x0B34
+ WEIGHT1_BASE = 0x0B40
+ WEIGHT1_BASE_HI = 0x0B44
+ WEIGHT1_LENGTH = 0x0B48
+ WEIGHT1_LENGTH_HI = 0x0B4C
+ SCALE1_BASE = 0x0B50
+ SCALE1_BASE_HI = 0x0B54
+ SCALE1_LENGTH = 0x0B58
+ SIZE = 0x0B5C
+
+class TSU_DEBUG_RO_INTERNAL(Enum):
+ KERNEL_X = 0x0200
+ KERNEL_Y = 0x0204
+ KERNEL_W_M1 = 0x0208
+ KERNEL_H_M1 = 0x020C
+ OFM_CBLK_WIDTH_M1 = 0x0210
+ OFM_CBLK_HEIGHT_M1 = 0x0214
+ OFM_CBLK_DEPTH_M1 = 0x0218
+ IFM_CBLK_DEPTH_M1 = 0x021C
+ OFM_X = 0x0220
+ OFM_Y = 0x0224
+ OFM_Z = 0x0228
+ IFM_Z = 0x022C
+ PAD_TOP = 0x0230
+ PAD_LEFT = 0x0234
+ IFM_CBLK_WIDTH = 0x0238
+ IFM_CBLK_HEIGHT = 0x023C
+ DMA_IFM_SRC = 0x0240
+ DMA_IFM_SRC_HI = 0x0244
+ DMA_IFM_DST = 0x0248
+ DMA_OFM_SRC = 0x024C
+ DMA_OFM_DST = 0x0250
+ DMA_OFM_DST_HI = 0x0254
+ DMA_WEIGHT_SRC = 0x0258
+ DMA_WEIGHT_SRC_HI = 0x025C
+ DMA_CMD_SRC = 0x0260
+ DMA_CMD_SRC_HI = 0x0264
+ DMA_CMD_SIZE = 0x0268
+ DMA_M2M_SRC = 0x026C
+ DMA_M2M_SRC_HI = 0x0270
+ DMA_M2M_DST = 0x0274
+ DMA_M2M_DST_HI = 0x0278
+ CURRENT_QREAD = 0x027C
+ DMA_SCALE_SRC = 0x0280
+ DMA_SCALE_SRC_HI = 0x0284
+ CURRENT_CMD = 0x02BC
+ SIZE = 0x02C0
+
+
+
+class acc_format(Enum):
+ INT_32BIT = 0
+ INT_40BIT = 1
+ FP_S5_10 = 2
+
+class activation(Enum):
+ NONE = 0
+ TANH = 3
+ SIGMOID = 4
+ LUT_START = 16
+ LUT_END = 23
+
+class clip_range(Enum):
+ OFM_PRECISION = 0
+ FORCE_UINT8 = 2
+ FORCE_INT8 = 3
+ FORCE_INT16 = 5
+
+class cmd0(Enum):
+ NPU_OP_STOP = 0x000
+ NPU_OP_IRQ = 0x001
+ NPU_OP_CONV = 0x002
+ NPU_OP_DEPTHWISE = 0x003
+ NPU_OP_POOL = 0x005
+ NPU_OP_ELEMENTWISE = 0x006
+ NPU_OP_DMA_START = 0x010
+ NPU_OP_DMA_WAIT = 0x011
+ NPU_OP_KERNEL_WAIT = 0x012
+ NPU_OP_PMU_MASK = 0x013
+ NPU_SET_IFM_PAD_TOP = 0x100
+ NPU_SET_IFM_PAD_LEFT = 0x101
+ NPU_SET_IFM_PAD_RIGHT = 0x102
+ NPU_SET_IFM_PAD_BOTTOM = 0x103
+ NPU_SET_IFM_DEPTH_M1 = 0x104
+ NPU_SET_IFM_PRECISION = 0x105
+ NPU_SET_IFM_UPSCALE = 0x107
+ NPU_SET_IFM_ZERO_POINT = 0x109
+ NPU_SET_IFM_WIDTH0_M1 = 0x10A
+ NPU_SET_IFM_HEIGHT0_M1 = 0x10B
+ NPU_SET_IFM_HEIGHT1_M1 = 0x10C
+ NPU_SET_IFM_IB_END = 0x10D
+ NPU_SET_IFM_REGION = 0x10F
+ NPU_SET_OFM_WIDTH_M1 = 0x111
+ NPU_SET_OFM_HEIGHT_M1 = 0x112
+ NPU_SET_OFM_DEPTH_M1 = 0x113
+ NPU_SET_OFM_PRECISION = 0x114
+ NPU_SET_OFM_BLK_WIDTH_M1 = 0x115
+ NPU_SET_OFM_BLK_HEIGHT_M1 = 0x116
+ NPU_SET_OFM_BLK_DEPTH_M1 = 0x117
+ NPU_SET_OFM_ZERO_POINT = 0x118
+ NPU_SET_OFM_WIDTH0_M1 = 0x11A
+ NPU_SET_OFM_HEIGHT0_M1 = 0x11B
+ NPU_SET_OFM_HEIGHT1_M1 = 0x11C
+ NPU_SET_OFM_REGION = 0x11F
+ NPU_SET_KERNEL_WIDTH_M1 = 0x120
+ NPU_SET_KERNEL_HEIGHT_M1 = 0x121
+ NPU_SET_KERNEL_STRIDE = 0x122
+ NPU_SET_PARALLEL_MODE = 0x123
+ NPU_SET_ACC_FORMAT = 0x124
+ NPU_SET_ACTIVATION = 0x125
+ NPU_SET_ACTIVATION_MIN = 0x126
+ NPU_SET_ACTIVATION_MAX = 0x127
+ NPU_SET_WEIGHT_REGION = 0x128
+ NPU_SET_SCALE_REGION = 0x129
+ NPU_SET_AB_START = 0x12D
+ NPU_SET_BLOCKDEP = 0x12F
+ NPU_SET_DMA0_SRC_REGION = 0x130
+ NPU_SET_DMA0_DST_REGION = 0x131
+ NPU_SET_DMA0_SIZE0 = 0x132
+ NPU_SET_DMA0_SIZE1 = 0x133
+ NPU_SET_IFM2_BROADCAST = 0x180
+ NPU_SET_IFM2_SCALAR = 0x181
+ NPU_SET_IFM2_PRECISION = 0x185
+ NPU_SET_IFM2_ZERO_POINT = 0x189
+ NPU_SET_IFM2_WIDTH0_M1 = 0x18A
+ NPU_SET_IFM2_HEIGHT0_M1 = 0x18B
+ NPU_SET_IFM2_HEIGHT1_M1 = 0x18C
+ NPU_SET_IFM2_IB_START = 0x18D
+ NPU_SET_IFM2_REGION = 0x18F
+
+class cmd1(Enum):
+ NPU_SET_IFM_BASE0 = 0x000
+ NPU_SET_IFM_BASE1 = 0x001
+ NPU_SET_IFM_BASE2 = 0x002
+ NPU_SET_IFM_BASE3 = 0x003
+ NPU_SET_IFM_STRIDE_X = 0x004
+ NPU_SET_IFM_STRIDE_Y = 0x005
+ NPU_SET_IFM_STRIDE_C = 0x006
+ NPU_SET_OFM_BASE0 = 0x010
+ NPU_SET_OFM_BASE1 = 0x011
+ NPU_SET_OFM_BASE2 = 0x012
+ NPU_SET_OFM_BASE3 = 0x013
+ NPU_SET_OFM_STRIDE_X = 0x014
+ NPU_SET_OFM_STRIDE_Y = 0x015
+ NPU_SET_OFM_STRIDE_C = 0x016
+ NPU_SET_WEIGHT_BASE = 0x020
+ NPU_SET_WEIGHT_LENGTH = 0x021
+ NPU_SET_SCALE_BASE = 0x022
+ NPU_SET_SCALE_LENGTH = 0x023
+ NPU_SET_OFM_SCALE = 0x024
+ NPU_SET_OPA_SCALE = 0x025
+ NPU_SET_OPB_SCALE = 0x026
+ NPU_SET_DMA0_SRC = 0x030
+ NPU_SET_DMA0_DST = 0x031
+ NPU_SET_DMA0_LEN = 0x032
+ NPU_SET_DMA0_SKIP0 = 0x033
+ NPU_SET_DMA0_SKIP1 = 0x034
+ NPU_SET_IFM2_BASE0 = 0x080
+ NPU_SET_IFM2_BASE1 = 0x081
+ NPU_SET_IFM2_BASE2 = 0x082
+ NPU_SET_IFM2_BASE3 = 0x083
+ NPU_SET_IFM2_STRIDE_X = 0x084
+ NPU_SET_IFM2_STRIDE_Y = 0x085
+ NPU_SET_IFM2_STRIDE_C = 0x086
+ NPU_SET_WEIGHT1_BASE = 0x090
+ NPU_SET_WEIGHT1_LENGTH = 0x091
+ NPU_SET_SCALE1_BASE = 0x092
+ NPU_SET_SCALE1_LENGTH = 0x093
+
+class data_format(Enum):
+ NHWC = 0
+ NHCWB16 = 1
+
+class elementwise_mode(Enum):
+ MUL = 0
+ ADD = 1
+ SUB = 2
+ MIN = 3
+ MAX = 4
+ LRELU = 5
+ ABS = 6
+ CLZ = 7
+ SHR = 8
+ SHL = 9
+
+class ifm_precision(Enum):
+ W8_U8 = 0
+ W8_S8 = 1
+ W8_U16 = 4
+ W8_S16 = 5
+ W8_S32 = 9
+
+class ifm_scale_mode(Enum):
+ SCALE_16BIT = 0
+ SCALE_OPA_32BIT = 1
+ SCALE_OPB_32BIT = 2
+
+class memory_type(Enum):
+ AXI0_OUTSTANDING_COUNTER0 = 0
+ AXI0_OUTSTANDING_COUNTER1 = 1
+ AXI1_OUTSTANDING_COUNTER2 = 2
+ AXI1_OUTSTANDING_COUNTER3 = 3
+
+class ofm_precision(Enum):
+ U8 = 0
+ S8 = 1
+ U16 = 2
+ S16 = 3
+ S32 = 5
+
+class pmu_event_type(Enum):
+ CYCLE = 0x11
+ NPU_IDLE = 0x20
+ MAC_ACTIVE = 0x30
+ MAC_ACTIVE_8BIT = 0x31
+ MAC_ACTIVE_16BIT = 0x32
+ MAC_DPU_ACTIVE = 0x33
+ MAC_STALLED_BY_WD_ACC = 0x34
+ MAC_STALLED_BY_WD = 0x35
+ MAC_STALLED_BY_ACC = 0x36
+ MAC_STALLED_BY_IB = 0x37
+ AO_ACTIVE = 0x40
+ AO_ACTIVE_8BIT = 0x41
+ AO_ACTIVE_16BIT = 0x42
+ AO_STALLED_BY_OFMP_OB = 0x43
+ AO_STALLED_BY_OFMP = 0x44
+ AO_STALLED_BY_OB = 0x45
+ AO_STALLED_BY_ACC_IB = 0x46
+ AO_STALLED_BY_ACC = 0x47
+ AO_STALLED_BY_IB = 0x48
+ WD_ACTIVE = 0x50
+ WD_STALLED = 0x51
+ WD_STALLED_BY_WS = 0x52
+ WD_STALLED_BY_WD_BUF = 0x53
+ WD_PARSE_ACTIVE = 0x54
+ WD_PARSE_STALLED = 0x55
+ WD_PARSE_STALLED_IN = 0x56
+ WD_PARSE_STALLED_OUT = 0x57
+ AXI0_RD_TRANS_ACCEPTED = 0x80
+ AXI0_RD_TRANS_COMPLETED = 0x81
+ AXI0_RD_DATA_BEAT_RECEIVED = 0x82
+ AXI0_RD_TRAN_REQ_STALLED = 0x83
+ AXI0_WR_TRANS_ACCEPTED = 0x84
+ AXI0_WR_TRANS_COMPLETED_M = 0x85
+ AXI0_WR_TRANS_COMPLETED_S = 0x86
+ AXI0_WR_DATA_BEAT_WRITTEN = 0x87
+ AXI0_WR_TRAN_REQ_STALLED = 0x88
+ AXI0_WR_DATA_BEAT_STALLED = 0x89
+ AXI0_ENABLED_CYCLES = 0x8c
+ AXI0_RD_STALL_LIMIT = 0x8e
+ AXI0_WR_STALL_LIMIT = 0x8f
+ AXI1_RD_TRANS_ACCEPTED = 0x180
+ AXI1_RD_TRANS_COMPLETED = 0x181
+ AXI1_RD_DATA_BEAT_RECEIVED = 0x182
+ AXI1_RD_TRAN_REQ_STALLED = 0x183
+ AXI1_WR_TRANS_ACCEPTED = 0x184
+ AXI1_WR_TRANS_COMPLETED_M = 0x185
+ AXI1_WR_TRANS_COMPLETED_S = 0x186
+ AXI1_WR_DATA_BEAT_WRITTEN = 0x187
+ AXI1_WR_TRAN_REQ_STALLED = 0x188
+ AXI1_WR_DATA_BEAT_STALLED = 0x189
+ AXI1_ENABLED_CYCLES = 0x18c
+ AXI1_RD_STALL_LIMIT = 0x18e
+ AXI1_WR_STALL_LIMIT = 0x18f
+ AXI_LATENCY_ANY = 0xa0
+ AXI_LATENCY_32 = 0xa1
+ AXI_LATENCY_64 = 0xa2
+ AXI_LATENCY_128 = 0xa3
+ AXI_LATENCY_256 = 0xa4
+ AXI_LATENCY_512 = 0xa5
+ AXI_LATENCY_1024 = 0xa6
+
+class pooling_mode(Enum):
+ MAX = 0
+ AVERAGE = 1
+ REDUCE_SUM = 2
+
+class privilege_level(Enum):
+ USER = 0
+ PRIVILEGED = 1
+
+class product(Enum):
+ ETHOS_U55 = 0
+
+class resampling_mode(Enum):
+ NONE = 0
+ NEAREST = 1
+ TRANSPOSE = 2
+
+class rounding(Enum):
+ TFL = 0
+ TRUNCATE = 1
+ NATURAL = 2
+
+class security_level(Enum):
+ SECURE = 0
+ NON_SECURE = 1
+
+class state(Enum):
+ STOPPED = 0
+ RUNNING = 1
+
+class stride_mode(Enum):
+ STRIDE_MODE_1D = 0
+ STRIDE_MODE_2D = 1
+ STRIDE_MODE_3D = 2
+
+
+class clkforce_r(Union):
+ class _bitfield(Structure):
+ _fields_ = [
+ ("top_level_clk", c_uint32, 1),
+ ("cc_clk", c_uint32, 1),
+ ("dma_clk", c_uint32, 1),
+ ("mac_clk", c_uint32, 1),
+ ("ao_clk", c_uint32, 1),
+ ("wd_clk", c_uint32, 1),
+ ("reserved0", c_uint32, 26),
+ ]
+ _fields_ = [("bits", _bitfield),
+ ("word", c_uint32)]
+ def set_top_level_clk(self, value): self.bits.top_level_clk = value
+ def get_top_level_clk(self): value = self.bits.top_level_clk; return value
+ def set_cc_clk(self, value): self.bits.cc_clk = value
+ def get_cc_clk(self): value = self.bits.cc_clk; return value
+ def set_dma_clk(self, value): self.bits.dma_clk = value
+ def get_dma_clk(self): value = self.bits.dma_clk; return value
+ def set_mac_clk(self, value): self.bits.mac_clk = value
+ def get_mac_clk(self): value = self.bits.mac_clk; return value
+ def set_ao_clk(self, value): self.bits.ao_clk = value
+ def get_ao_clk(self): value = self.bits.ao_clk; return value
+ def set_wd_clk(self, value): self.bits.wd_clk = value
+ def get_wd_clk(self): value = self.bits.wd_clk; return value
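+
+# Usage sketch (illustrative): every *_r class that follows repeats this bits/word Union
+# pattern, so a register value can be assembled field by field and read back as one
+# 32-bit word, e.g.:
+#     force = clkforce_r()
+#     force.set_dma_clk(1)
+#     force.set_mac_clk(1)
+#     raw = force.word    # packed value for the CLKFORCE register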
+
+
+class basep0_r(Union):
+ class _bitfield(Structure):
+ _fields_ = [
+ ("addr_word", c_uint32, 32),
+ ]
+ _fields_ = [("bits", _bitfield),
+ ("word", c_uint32)]
+ def set_addr_word(self, value): self.bits.addr_word = value
+ def get_addr_word(self): value = self.bits.addr_word; return value
+
+
+class basep1_r(Union):
+ class _bitfield(Structure):
+ _fields_ = [
+ ("addr_word", c_uint32, 32),
+ ]
+ _fields_ = [("bits", _bitfield),
+ ("word", c_uint32)]
+ def set_addr_word(self, value): self.bits.addr_word = value
+ def get_addr_word(self): value = self.bits.addr_word; return value
+
+
+class basep2_r(Union):
+ class _bitfield(Structure):
+ _fields_ = [
+ ("addr_word", c_uint32, 32),
+ ]
+ _fields_ = [("bits", _bitfield),
+ ("word", c_uint32)]
+ def set_addr_word(self, value): self.bits.addr_word = value
+ def get_addr_word(self): value = self.bits.addr_word; return value
+
+
+class basep3_r(Union):
+ class _bitfield(Structure):
+ _fields_ = [
+ ("addr_word", c_uint32, 32),
+ ]
+ _fields_ = [("bits", _bitfield),
+ ("word", c_uint32)]
+ def set_addr_word(self, value): self.bits.addr_word = value
+ def get_addr_word(self): value = self.bits.addr_word; return value
+
+
+class basep4_r(Union):
+ class _bitfield(Structure):
+ _fields_ = [
+ ("addr_word", c_uint32, 32),
+ ]
+ _fields_ = [("bits", _bitfield),
+ ("word", c_uint32)]
+ def set_addr_word(self, value): self.bits.addr_word = value
+ def get_addr_word(self): value = self.bits.addr_word; return value
+
+
+class basep5_r(Union):
+ class _bitfield(Structure):
+ _fields_ = [
+ ("addr_word", c_uint32, 32),
+ ]
+ _fields_ = [("bits", _bitfield),
+ ("word", c_uint32)]
+ def set_addr_word(self, value): self.bits.addr_word = value
+ def get_addr_word(self): value = self.bits.addr_word; return value
+
+
+class basep6_r(Union):
+ class _bitfield(Structure):
+ _fields_ = [
+ ("addr_word", c_uint32, 32),
+ ]
+ _fields_ = [("bits", _bitfield),
+ ("word", c_uint32)]
+ def set_addr_word(self, value): self.bits.addr_word = value
+ def get_addr_word(self): value = self.bits.addr_word; return value
+
+
+class basep7_r(Union):
+ class _bitfield(Structure):
+ _fields_ = [
+ ("addr_word", c_uint32, 32),
+ ]
+ _fields_ = [("bits", _bitfield),
+ ("word", c_uint32)]
+ def set_addr_word(self, value): self.bits.addr_word = value
+ def get_addr_word(self): value = self.bits.addr_word; return value
+
+
+class basep8_r(Union):
+ class _bitfield(Structure):
+ _fields_ = [
+ ("addr_word", c_uint32, 32),
+ ]
+ _fields_ = [("bits", _bitfield),
+ ("word", c_uint32)]
+ def set_addr_word(self, value): self.bits.addr_word = value
+ def get_addr_word(self): value = self.bits.addr_word; return value
+
+
+class basep9_r(Union):
+ class _bitfield(Structure):
+ _fields_ = [
+ ("addr_word", c_uint32, 32),
+ ]
+ _fields_ = [("bits", _bitfield),
+ ("word", c_uint32)]
+ def set_addr_word(self, value): self.bits.addr_word = value
+ def get_addr_word(self): value = self.bits.addr_word; return value
+
+
+class basep10_r(Union):
+ class _bitfield(Structure):
+ _fields_ = [
+ ("addr_word", c_uint32, 32),
+ ]
+ _fields_ = [("bits", _bitfield),
+ ("word", c_uint32)]
+ def set_addr_word(self, value): self.bits.addr_word = value
+ def get_addr_word(self): value = self.bits.addr_word; return value
+
+
+class basep11_r(Union):
+ class _bitfield(Structure):
+ _fields_ = [
+ ("addr_word", c_uint32, 32),
+ ]
+ _fields_ = [("bits", _bitfield),
+ ("word", c_uint32)]
+ def set_addr_word(self, value): self.bits.addr_word = value
+ def get_addr_word(self): value = self.bits.addr_word; return value
+
+
+class basep12_r(Union):
+ class _bitfield(Structure):
+ _fields_ = [
+ ("addr_word", c_uint32, 32),
+ ]
+ _fields_ = [("bits", _bitfield),
+ ("word", c_uint32)]
+ def set_addr_word(self, value): self.bits.addr_word = value
+ def get_addr_word(self): value = self.bits.addr_word; return value
+
+
+class basep13_r(Union):
+ class _bitfield(Structure):
+ _fields_ = [
+ ("addr_word", c_uint32, 32),
+ ]
+ _fields_ = [("bits", _bitfield),
+ ("word", c_uint32)]
+ def set_addr_word(self, value): self.bits.addr_word = value
+ def get_addr_word(self): value = self.bits.addr_word; return value
+
+
+class basep14_r(Union):
+ class _bitfield(Structure):
+ _fields_ = [
+ ("addr_word", c_uint32, 32),
+ ]
+ _fields_ = [("bits", _bitfield),
+ ("word", c_uint32)]
+ def set_addr_word(self, value): self.bits.addr_word = value
+ def get_addr_word(self): value = self.bits.addr_word; return value
+
+
+class basep15_r(Union):
+ class _bitfield(Structure):
+ _fields_ = [
+ ("addr_word", c_uint32, 32),
+ ]
+ _fields_ = [("bits", _bitfield),
+ ("word", c_uint32)]
+ def set_addr_word(self, value): self.bits.addr_word = value
+ def get_addr_word(self): value = self.bits.addr_word; return value
+
+
+class pid4_r(Union):
+ class _bitfield(Structure):
+ _fields_ = [
+ ("pid4", c_uint32, 32),
+ ]
+ _fields_ = [("bits", _bitfield),
+ ("word", c_uint32)]
+ def set_pid4(self, value): self.bits.pid4 = value
+ def get_pid4(self): value = self.bits.pid4; return value
+
+
+class pid5_r(Union):
+ class _bitfield(Structure):
+ _fields_ = [
+ ("pid5", c_uint32, 32),
+ ]
+ _fields_ = [("bits", _bitfield),
+ ("word", c_uint32)]
+ def set_pid5(self, value): self.bits.pid5 = value
+ def get_pid5(self): value = self.bits.pid5; return value
+
+
+class pid6_r(Union):
+ class _bitfield(Structure):
+ _fields_ = [
+ ("pid6", c_uint32, 32),
+ ]
+ _fields_ = [("bits", _bitfield),
+ ("word", c_uint32)]
+ def set_pid6(self, value): self.bits.pid6 = value
+ def get_pid6(self): value = self.bits.pid6; return value
+
+
+class pid7_r(Union):
+ class _bitfield(Structure):
+ _fields_ = [
+ ("pid7", c_uint32, 32),
+ ]
+ _fields_ = [("bits", _bitfield),
+ ("word", c_uint32)]
+ def set_pid7(self, value): self.bits.pid7 = value
+ def get_pid7(self): value = self.bits.pid7; return value
+
+
+class pid0_r(Union):
+ class _bitfield(Structure):
+ _fields_ = [
+ ("pid0", c_uint32, 32),
+ ]
+ _fields_ = [("bits", _bitfield),
+ ("word", c_uint32)]
+ def set_pid0(self, value): self.bits.pid0 = value
+ def get_pid0(self): value = self.bits.pid0; return value
+
+
+class pid1_r(Union):
+ class _bitfield(Structure):
+ _fields_ = [
+ ("pid1", c_uint32, 32),
+ ]
+ _fields_ = [("bits", _bitfield),
+ ("word", c_uint32)]
+ def set_pid1(self, value): self.bits.pid1 = value
+ def get_pid1(self): value = self.bits.pid1; return value
+
+
+class pid2_r(Union):
+ class _bitfield(Structure):
+ _fields_ = [
+ ("pid2", c_uint32, 32),
+ ]
+ _fields_ = [("bits", _bitfield),
+ ("word", c_uint32)]
+ def set_pid2(self, value): self.bits.pid2 = value
+ def get_pid2(self): value = self.bits.pid2; return value
+
+
+class pid3_r(Union):
+ class _bitfield(Structure):
+ _fields_ = [
+ ("pid3", c_uint32, 32),
+ ]
+ _fields_ = [("bits", _bitfield),
+ ("word", c_uint32)]
+ def set_pid3(self, value): self.bits.pid3 = value
+ def get_pid3(self): value = self.bits.pid3; return value
+
+
+class cid0_r(Union):
+ class _bitfield(Structure):
+ _fields_ = [
+ ("cid0", c_uint32, 32),
+ ]
+ _fields_ = [("bits", _bitfield),
+ ("word", c_uint32)]
+ def set_cid0(self, value): self.bits.cid0 = value
+ def get_cid0(self): value = self.bits.cid0; return value
+
+
+class cid1_r(Union):
+ class _bitfield(Structure):
+ _fields_ = [
+ ("cid1", c_uint32, 32),
+ ]
+ _fields_ = [("bits", _bitfield),
+ ("word", c_uint32)]
+ def set_cid1(self, value): self.bits.cid1 = value
+ def get_cid1(self): value = self.bits.cid1; return value
+
+
+class cid2_r(Union):
+ class _bitfield(Structure):
+ _fields_ = [
+ ("cid2", c_uint32, 32),
+ ]
+ _fields_ = [("bits", _bitfield),
+ ("word", c_uint32)]
+ def set_cid2(self, value): self.bits.cid2 = value
+ def get_cid2(self): value = self.bits.cid2; return value
+
+
+class cid3_r(Union):
+ class _bitfield(Structure):
+ _fields_ = [
+ ("cid3", c_uint32, 32),
+ ]
+ _fields_ = [("bits", _bitfield),
+ ("word", c_uint32)]
+ def set_cid3(self, value): self.bits.cid3 = value
+ def get_cid3(self): value = self.bits.cid3; return value
+
+
+class id_r(Union):
+ class _bitfield(Structure):
+ _fields_ = [
+ ("version_status", c_uint32, 4),
+ ("version_minor", c_uint32, 4),
+ ("version_major", c_uint32, 4),
+ ("product_major", c_uint32, 4),
+ ("arch_patch_rev", c_uint32, 4),
+ ("arch_minor_rev", c_uint32, 8),
+ ("arch_major_rev", c_uint32, 4),
+ ]
+ _fields_ = [("bits", _bitfield),
+ ("word", c_uint32)]
+ def set_version_status(self, value): self.bits.version_status = value
+ def get_version_status(self): value = self.bits.version_status; return value
+ def set_version_minor(self, value): self.bits.version_minor = value
+ def get_version_minor(self): value = self.bits.version_minor; return value
+ def set_version_major(self, value): self.bits.version_major = value
+ def get_version_major(self): value = self.bits.version_major; return value
+ def set_product_major(self, value): self.bits.product_major = value
+ def get_product_major(self): value = self.bits.product_major; return value
+ def set_arch_patch_rev(self, value): self.bits.arch_patch_rev = value
+ def get_arch_patch_rev(self): value = self.bits.arch_patch_rev; return value
+ def set_arch_minor_rev(self, value): self.bits.arch_minor_rev = value
+ def get_arch_minor_rev(self): value = self.bits.arch_minor_rev; return value
+ def set_arch_major_rev(self, value): self.bits.arch_major_rev = value
+ def get_arch_major_rev(self): value = self.bits.arch_major_rev; return value
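+
+# Usage sketch (illustrative): decoding a raw ID register word into its version and
+# architecture fields:
+#     id_reg = id_r()
+#     id_reg.word = raw_id    # raw_id = 32-bit value read from NPU_REG.ID (assumed)
+#     arch_rev = (id_reg.get_arch_major_rev(), id_reg.get_arch_minor_rev(), id_reg.get_arch_patch_rev())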
+
+
+class status_r(Union):
+ class _bitfield(Structure):
+ _fields_ = [
+ ("state", c_uint32, 1),
+ ("irq_raised", c_uint32, 1),
+ ("bus_status", c_uint32, 1),
+ ("reset_status", c_uint32, 1),
+ ("cmd_parse_error", c_uint32, 1),
+ ("cmd_end_reached", c_uint32, 1),
+ ("pmu_irq_raised", c_uint32, 1),
+ ("wd_fault", c_uint32, 1),
+ ("reserved0", c_uint32, 3),
+ ("faulting_interface", c_uint32, 1),
+ ("faulting_channel", c_uint32, 4),
+ ("irq_history_mask", c_uint32, 16),
+ ]
+ _fields_ = [("bits", _bitfield),
+ ("word", c_uint32)]
+ def set_state(self, value): self.bits.state = value
+ def get_state(self): value = self.bits.state; return value
+ def set_irq_raised(self, value): self.bits.irq_raised = value
+ def get_irq_raised(self): value = self.bits.irq_raised; return value
+ def set_bus_status(self, value): self.bits.bus_status = value
+ def get_bus_status(self): value = self.bits.bus_status; return value
+ def set_reset_status(self, value): self.bits.reset_status = value
+ def get_reset_status(self): value = self.bits.reset_status; return value
+ def set_cmd_parse_error(self, value): self.bits.cmd_parse_error = value
+ def get_cmd_parse_error(self): value = self.bits.cmd_parse_error; return value
+ def set_cmd_end_reached(self, value): self.bits.cmd_end_reached = value
+ def get_cmd_end_reached(self): value = self.bits.cmd_end_reached; return value
+ def set_pmu_irq_raised(self, value): self.bits.pmu_irq_raised = value
+ def get_pmu_irq_raised(self): value = self.bits.pmu_irq_raised; return value
+ def set_wd_fault(self, value): self.bits.wd_fault = value
+ def get_wd_fault(self): value = self.bits.wd_fault; return value
+ def set_faulting_interface(self, value): self.bits.faulting_interface = value
+ def get_faulting_interface(self): value = self.bits.faulting_interface; return value
+ def set_faulting_channel(self, value): self.bits.faulting_channel = value
+ def get_faulting_channel(self): value = self.bits.faulting_channel; return value
+ def set_irq_history_mask(self, value): self.bits.irq_history_mask = value
+ def get_irq_history_mask(self): value = self.bits.irq_history_mask; return value
+
+
+class cmd_r(Union):
+ class _bitfield(Structure):
+ _fields_ = [
+ ("transition_to_running_state", c_uint32, 1),
+ ("clear_irq", c_uint32, 1),
+ ("clock_q_enable", c_uint32, 1),
+ ("power_q_enable", c_uint32, 1),
+ ("stop_request", c_uint32, 1),
+ ("reserved0", c_uint32, 11),
+ ("clear_irq_history", c_uint32, 16),
+ ]
+ _fields_ = [("bits", _bitfield),
+ ("word", c_uint32)]
+ def set_transition_to_running_state(self, value): self.bits.transition_to_running_state = value
+ def get_transition_to_running_state(self): value = self.bits.transition_to_running_state; return value
+ def set_clear_irq(self, value): self.bits.clear_irq = value
+ def get_clear_irq(self): value = self.bits.clear_irq; return value
+ def set_clock_q_enable(self, value): self.bits.clock_q_enable = value
+ def get_clock_q_enable(self): value = self.bits.clock_q_enable; return value
+ def set_power_q_enable(self, value): self.bits.power_q_enable = value
+ def get_power_q_enable(self): value = self.bits.power_q_enable; return value
+ def set_stop_request(self, value): self.bits.stop_request = value
+ def get_stop_request(self): value = self.bits.stop_request; return value
+ def set_clear_irq_history(self, value): self.bits.clear_irq_history = value
+ def get_clear_irq_history(self): value = self.bits.clear_irq_history; return value
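+
+# Usage sketch (illustrative, not a documented start sequence): composing a CMD register
+# value that requests the running state and clears a pending IRQ:
+#     cmd = cmd_r()
+#     cmd.set_transition_to_running_state(1)
+#     cmd.set_clear_irq(1)
+#     raw = cmd.word    # value to write at offset NPU_REG.CMD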
+
+
+class reset_r(Union):
+ class _bitfield(Structure):
+ _fields_ = [
+ ("pending_cpl", c_uint32, 1),
+ ("pending_csl", c_uint32, 1),
+ ("reserved0", c_uint32, 30),
+ ]
+ _fields_ = [("bits", _bitfield),
+ ("word", c_uint32)]
+ def set_pending_cpl(self, value): self.bits.pending_cpl = value
+ def get_pending_cpl(self): value = self.bits.pending_cpl; return value
+ def set_pending_csl(self, value): self.bits.pending_csl = value
+ def get_pending_csl(self): value = self.bits.pending_csl; return value
+
+
+class qbase0_r(Union):
+ class _bitfield(Structure):
+ _fields_ = [
+ ("qbase0", c_uint32, 32),
+ ]
+ _fields_ = [("bits", _bitfield),
+ ("word", c_uint32)]
+ def set_qbase0(self, value): self.bits.qbase0 = value
+ def get_qbase0(self): value = self.bits.qbase0; return value
+
+
+class qbase1_r(Union):
+ class _bitfield(Structure):
+ _fields_ = [
+ ("qbase1", c_uint32, 32),
+ ]
+ _fields_ = [("bits", _bitfield),
+ ("word", c_uint32)]
+ def set_qbase1(self, value): self.bits.qbase1 = value
+ def get_qbase1(self): value = self.bits.qbase1; return value
+
+
+class qread_r(Union):
+ class _bitfield(Structure):
+ _fields_ = [
+ ("qread", c_uint32, 32),
+ ]
+ _fields_ = [("bits", _bitfield),
+ ("word", c_uint32)]
+ def set_qread(self, value): self.bits.qread = value
+ def get_qread(self): value = self.bits.qread; return value
+
+
+class qconfig_r(Union):
+ class _bitfield(Structure):
+ _fields_ = [
+ ("qconfig", c_uint32, 32),
+ ]
+ _fields_ = [("bits", _bitfield),
+ ("word", c_uint32)]
+ def set_qconfig(self, value): self.bits.qconfig = value
+ def get_qconfig(self): value = self.bits.qconfig; return value
+
+
+class qsize_r(Union):
+ class _bitfield(Structure):
+ _fields_ = [
+ ("qsize", c_uint32, 32),
+ ]
+ _fields_ = [("bits", _bitfield),
+ ("word", c_uint32)]
+ def set_qsize(self, value): self.bits.qsize = value
+ def get_qsize(self): value = self.bits.qsize; return value
+
+
+class prot_r(Union):
+ class _bitfield(Structure):
+ _fields_ = [
+ ("active_cpl", c_uint32, 1),
+ ("active_csl", c_uint32, 1),
+ ("reserved0", c_uint32, 30),
+ ]
+ _fields_ = [("bits", _bitfield),
+ ("word", c_uint32)]
+ def set_active_cpl(self, value): self.bits.active_cpl = value
+ def get_active_cpl(self): value = self.bits.active_cpl; return value
+ def set_active_csl(self, value): self.bits.active_csl = value
+ def get_active_csl(self): value = self.bits.active_csl; return value
+
+
+class config_r(Union):
+ class _bitfield(Structure):
+ _fields_ = [
+ ("macs_per_cc", c_uint32, 4),
+ ("cmd_stream_version", c_uint32, 4),
+ ("shram_size", c_uint32, 8),
+ ("reserved0", c_uint32, 12),
+ ("product", c_uint32, 4),
+ ]
+ _fields_ = [("bits", _bitfield),
+ ("word", c_uint32)]
+ def set_macs_per_cc(self, value): self.bits.macs_per_cc = value
+ def get_macs_per_cc(self): value = self.bits.macs_per_cc; return value
+ def set_cmd_stream_version(self, value): self.bits.cmd_stream_version = value
+ def get_cmd_stream_version(self): value = self.bits.cmd_stream_version; return value
+ def set_shram_size(self, value): self.bits.shram_size = value
+ def get_shram_size(self): value = self.bits.shram_size; return value
+ def set_product(self, value): self.bits.product = value
+ def get_product(self): value = self.bits.product; return value
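+
+# Usage sketch (illustrative): decoding a raw CONFIG register word to inspect the
+# reported hardware configuration:
+#     cfg = config_r()
+#     cfg.word = raw_config    # raw_config = value read from NPU_REG.CONFIG (assumed)
+#     macs_per_cc, shram_size = cfg.get_macs_per_cc(), cfg.get_shram_size()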
+
+
+class lock_r(Union):
+ class _bitfield(Structure):
+ _fields_ = [
+ ("lock", c_uint32, 32),
+ ]
+ _fields_ = [("bits", _bitfield),
+ ("word", c_uint32)]
+ def set_lock(self, value): self.bits.lock = value
+ def get_lock(self): value = self.bits.lock; return value
+
+
+class regioncfg_r(Union):
+ class _bitfield(Structure):
+ _fields_ = [
+ ("region0", c_uint32, 2),
+ ("region1", c_uint32, 2),
+ ("region2", c_uint32, 2),
+ ("region3", c_uint32, 2),
+ ("region4", c_uint32, 2),
+ ("region5", c_uint32, 2),
+ ("region6", c_uint32, 2),
+ ("region7", c_uint32, 2),
+ ("reserved0", c_uint32, 16),
+ ]
+ _fields_ = [("bits", _bitfield),
+ ("word", c_uint32)]
+ def set_region0(self, value): self.bits.region0 = value
+ def get_region0(self): value = self.bits.region0; return value
+ def set_region1(self, value): self.bits.region1 = value
+ def get_region1(self): value = self.bits.region1; return value
+ def set_region2(self, value): self.bits.region2 = value
+ def get_region2(self): value = self.bits.region2; return value
+ def set_region3(self, value): self.bits.region3 = value
+ def get_region3(self): value = self.bits.region3; return value
+ def set_region4(self, value): self.bits.region4 = value
+ def get_region4(self): value = self.bits.region4; return value
+ def set_region5(self, value): self.bits.region5 = value
+ def get_region5(self): value = self.bits.region5; return value
+ def set_region6(self, value): self.bits.region6 = value
+ def get_region6(self): value = self.bits.region6; return value
+ def set_region7(self, value): self.bits.region7 = value
+ def get_region7(self): value = self.bits.region7; return value
+
+
+class axi_limit0_r(Union):
+ class _bitfield(Structure):
+ _fields_ = [
+ ("max_beats", c_uint32, 2),
+ ("reserved0", c_uint32, 2),
+ ("memtype", c_uint32, 4),
+ ("reserved1", c_uint32, 8),
+ ("max_outstanding_read_m1", c_uint32, 8),
+ ("max_outstanding_write_m1", c_uint32, 8),
+ ]
+ _fields_ = [("bits", _bitfield),
+ ("word", c_uint32)]
+ def set_max_beats(self, value): self.bits.max_beats = value
+ def get_max_beats(self): value = self.bits.max_beats; return value
+ def set_memtype(self, value): self.bits.memtype = value
+ def get_memtype(self): value = self.bits.memtype; return value
+ def set_max_outstanding_read_m1(self, value): self.bits.max_outstanding_read_m1 = value
+ def get_max_outstanding_read_m1(self): value = self.bits.max_outstanding_read_m1; return value
+ def set_max_outstanding_write_m1(self, value): self.bits.max_outstanding_write_m1 = value
+ def get_max_outstanding_write_m1(self): value = self.bits.max_outstanding_write_m1; return value
+
+
+class axi_limit1_r(Union):
+ class _bitfield(Structure):
+ _fields_ = [
+ ("max_beats", c_uint32, 2),
+ ("reserved0", c_uint32, 2),
+ ("memtype", c_uint32, 4),
+ ("reserved1", c_uint32, 8),
+ ("max_outstanding_read_m1", c_uint32, 8),
+ ("max_outstanding_write_m1", c_uint32, 8),
+ ]
+ _fields_ = [("bits", _bitfield),
+ ("word", c_uint32)]
+ def set_max_beats(self, value): self.bits.max_beats = value
+ def get_max_beats(self): value = self.bits.max_beats; return value
+ def set_memtype(self, value): self.bits.memtype = value
+ def get_memtype(self): value = self.bits.memtype; return value
+ def set_max_outstanding_read_m1(self, value): self.bits.max_outstanding_read_m1 = value
+ def get_max_outstanding_read_m1(self): value = self.bits.max_outstanding_read_m1; return value
+ def set_max_outstanding_write_m1(self, value): self.bits.max_outstanding_write_m1 = value
+ def get_max_outstanding_write_m1(self): value = self.bits.max_outstanding_write_m1; return value
+
+
+class axi_limit2_r(Union):
+ class _bitfield(Structure):
+ _fields_ = [
+ ("max_beats", c_uint32, 2),
+ ("reserved0", c_uint32, 2),
+ ("memtype", c_uint32, 4),
+ ("reserved1", c_uint32, 8),
+ ("max_outstanding_read_m1", c_uint32, 8),
+ ("max_outstanding_write_m1", c_uint32, 8),
+ ]
+ _fields_ = [("bits", _bitfield),
+ ("word", c_uint32)]
+ def set_max_beats(self, value): self.bits.max_beats = value
+ def get_max_beats(self): value = self.bits.max_beats; return value
+ def set_memtype(self, value): self.bits.memtype = value
+ def get_memtype(self): value = self.bits.memtype; return value
+ def set_max_outstanding_read_m1(self, value): self.bits.max_outstanding_read_m1 = value
+ def get_max_outstanding_read_m1(self): value = self.bits.max_outstanding_read_m1; return value
+ def set_max_outstanding_write_m1(self, value): self.bits.max_outstanding_write_m1 = value
+ def get_max_outstanding_write_m1(self): value = self.bits.max_outstanding_write_m1; return value
+
+
+class axi_limit3_r(Union):
+ class _bitfield(Structure):
+ _fields_ = [
+ ("max_beats", c_uint32, 2),
+ ("reserved0", c_uint32, 2),
+ ("memtype", c_uint32, 4),
+ ("reserved1", c_uint32, 8),
+ ("max_outstanding_read_m1", c_uint32, 8),
+ ("max_outstanding_write_m1", c_uint32, 8),
+ ]
+ _fields_ = [("bits", _bitfield),
+ ("word", c_uint32)]
+ def set_max_beats(self, value): self.bits.max_beats = value
+ def get_max_beats(self): value = self.bits.max_beats; return value
+ def set_memtype(self, value): self.bits.memtype = value
+ def get_memtype(self): value = self.bits.memtype; return value
+ def set_max_outstanding_read_m1(self, value): self.bits.max_outstanding_read_m1 = value
+ def get_max_outstanding_read_m1(self): value = self.bits.max_outstanding_read_m1; return value
+ def set_max_outstanding_write_m1(self, value): self.bits.max_outstanding_write_m1 = value
+ def get_max_outstanding_write_m1(self): value = self.bits.max_outstanding_write_m1; return value
+
+
+class pmcr_r(Union):
+ class _bitfield(Structure):
+ _fields_ = [
+ ("cnt_en", c_uint32, 1),
+ ("event_cnt_rst", c_uint32, 1),
+ ("cycle_cnt_rst", c_uint32, 1),
+ ("mask_en", c_uint32, 1),
+ ("reserved0", c_uint32, 7),
+ ("num_event_cnt", c_uint32, 5),
+ ("reserved1", c_uint32, 16),
+ ]
+ _fields_ = [("bits", _bitfield),
+ ("word", c_uint32)]
+ def set_cnt_en(self, value): self.bits.cnt_en = value
+ def get_cnt_en(self): value = self.bits.cnt_en; return value
+ def set_event_cnt_rst(self, value): self.bits.event_cnt_rst = value
+ def get_event_cnt_rst(self): value = self.bits.event_cnt_rst; return value
+ def set_cycle_cnt_rst(self, value): self.bits.cycle_cnt_rst = value
+ def get_cycle_cnt_rst(self): value = self.bits.cycle_cnt_rst; return value
+ def set_mask_en(self, value): self.bits.mask_en = value
+ def get_mask_en(self): value = self.bits.mask_en; return value
+ def set_num_event_cnt(self, value): self.bits.num_event_cnt = value
+ def get_num_event_cnt(self): value = self.bits.num_event_cnt; return value
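+
+# Usage sketch (illustrative; the exact programming sequence is not specified here):
+# enabling the PMU counters via PMCR while resetting the cycle counter:
+#     pmcr = pmcr_r()
+#     pmcr.set_cnt_en(1)
+#     pmcr.set_cycle_cnt_rst(1)
+#     raw = pmcr.word    # value to write at offset PMU_INTERNAL.PMCR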
+
+
+class pmcntenset_r(Union):
+ class _bitfield(Structure):
+ _fields_ = [
+ ("event_cnt_0", c_uint32, 1),
+ ("event_cnt_1", c_uint32, 1),
+ ("event_cnt_2", c_uint32, 1),
+ ("event_cnt_3", c_uint32, 1),
+ ("reserved0", c_uint32, 27),
+ ("cycle_cnt", c_uint32, 1),
+ ]
+ _fields_ = [("bits", _bitfield),
+ ("word", c_uint32)]
+ def set_event_cnt_0(self, value): self.bits.event_cnt_0 = value
+ def get_event_cnt_0(self): value = self.bits.event_cnt_0; return value
+ def set_event_cnt_1(self, value): self.bits.event_cnt_1 = value
+ def get_event_cnt_1(self): value = self.bits.event_cnt_1; return value
+ def set_event_cnt_2(self, value): self.bits.event_cnt_2 = value
+ def get_event_cnt_2(self): value = self.bits.event_cnt_2; return value
+ def set_event_cnt_3(self, value): self.bits.event_cnt_3 = value
+ def get_event_cnt_3(self): value = self.bits.event_cnt_3; return value
+ def set_cycle_cnt(self, value): self.bits.cycle_cnt = value
+ def get_cycle_cnt(self): value = self.bits.cycle_cnt; return value
+
+
+class pmcntenclr_r(Union):
+ class _bitfield(Structure):
+ _fields_ = [
+ ("event_cnt_0", c_uint32, 1),
+ ("event_cnt_1", c_uint32, 1),
+ ("event_cnt_2", c_uint32, 1),
+ ("event_cnt_3", c_uint32, 1),
+ ("reserved0", c_uint32, 27),
+ ("cycle_cnt", c_uint32, 1),
+ ]
+ _fields_ = [("bits", _bitfield),
+ ("word", c_uint32)]
+ def set_event_cnt_0(self, value): self.bits.event_cnt_0 = value
+ def get_event_cnt_0(self): value = self.bits.event_cnt_0; return value
+ def set_event_cnt_1(self, value): self.bits.event_cnt_1 = value
+ def get_event_cnt_1(self): value = self.bits.event_cnt_1; return value
+ def set_event_cnt_2(self, value): self.bits.event_cnt_2 = value
+ def get_event_cnt_2(self): value = self.bits.event_cnt_2; return value
+ def set_event_cnt_3(self, value): self.bits.event_cnt_3 = value
+ def get_event_cnt_3(self): value = self.bits.event_cnt_3; return value
+ def set_cycle_cnt(self, value): self.bits.cycle_cnt = value
+ def get_cycle_cnt(self): value = self.bits.cycle_cnt; return value
+
+
+class pmovsset_r(Union):
+ class _bitfield(Structure):
+ _fields_ = [
+ ("event_cnt_0_ovf", c_uint32, 1),
+ ("event_cnt_1_ovf", c_uint32, 1),
+ ("event_cnt_2_ovf", c_uint32, 1),
+ ("event_cnt_3_ovf", c_uint32, 1),
+ ("reserved0", c_uint32, 27),
+ ("cycle_cnt_ovf", c_uint32, 1),
+ ]
+ _fields_ = [("bits", _bitfield),
+ ("word", c_uint32)]
+ def set_event_cnt_0_ovf(self, value): self.bits.event_cnt_0_ovf = value
+ def get_event_cnt_0_ovf(self): value = self.bits.event_cnt_0_ovf; return value
+ def set_event_cnt_1_ovf(self, value): self.bits.event_cnt_1_ovf = value
+ def get_event_cnt_1_ovf(self): value = self.bits.event_cnt_1_ovf; return value
+ def set_event_cnt_2_ovf(self, value): self.bits.event_cnt_2_ovf = value
+ def get_event_cnt_2_ovf(self): value = self.bits.event_cnt_2_ovf; return value
+ def set_event_cnt_3_ovf(self, value): self.bits.event_cnt_3_ovf = value
+ def get_event_cnt_3_ovf(self): value = self.bits.event_cnt_3_ovf; return value
+ def set_cycle_cnt_ovf(self, value): self.bits.cycle_cnt_ovf = value
+ def get_cycle_cnt_ovf(self): value = self.bits.cycle_cnt_ovf; return value
+
+
+class pmovsclr_r(Union):
+ class _bitfield(Structure):
+ _fields_ = [
+ ("event_cnt_0_ovf", c_uint32, 1),
+ ("event_cnt_1_ovf", c_uint32, 1),
+ ("event_cnt_2_ovf", c_uint32, 1),
+ ("event_cnt_3_ovf", c_uint32, 1),
+ ("reserved0", c_uint32, 27),
+ ("cycle_cnt_ovf", c_uint32, 1),
+ ]
+ _fields_ = [("bits", _bitfield),
+ ("word", c_uint32)]
+ def set_event_cnt_0_ovf(self, value): self.bits.event_cnt_0_ovf = value
+ def get_event_cnt_0_ovf(self): value = self.bits.event_cnt_0_ovf; return value
+ def set_event_cnt_1_ovf(self, value): self.bits.event_cnt_1_ovf = value
+ def get_event_cnt_1_ovf(self): value = self.bits.event_cnt_1_ovf; return value
+ def set_event_cnt_2_ovf(self, value): self.bits.event_cnt_2_ovf = value
+ def get_event_cnt_2_ovf(self): value = self.bits.event_cnt_2_ovf; return value
+ def set_event_cnt_3_ovf(self, value): self.bits.event_cnt_3_ovf = value
+ def get_event_cnt_3_ovf(self): value = self.bits.event_cnt_3_ovf; return value
+ def set_cycle_cnt_ovf(self, value): self.bits.cycle_cnt_ovf = value
+ def get_cycle_cnt_ovf(self): value = self.bits.cycle_cnt_ovf; return value
+
+
+class pmintset_r(Union):
+ class _bitfield(Structure):
+ _fields_ = [
+ ("event_cnt_0_int", c_uint32, 1),
+ ("event_cnt_1_int", c_uint32, 1),
+ ("event_cnt_2_int", c_uint32, 1),
+ ("event_cnt_3_int", c_uint32, 1),
+ ("reserved0", c_uint32, 27),
+ ("cycle_cnt_int", c_uint32, 1),
+ ]
+ _fields_ = [("bits", _bitfield),
+ ("word", c_uint32)]
+ def set_event_cnt_0_int(self, value): self.bits.event_cnt_0_int = value
+ def get_event_cnt_0_int(self): value = self.bits.event_cnt_0_int; return value
+ def set_event_cnt_1_int(self, value): self.bits.event_cnt_1_int = value
+ def get_event_cnt_1_int(self): value = self.bits.event_cnt_1_int; return value
+ def set_event_cnt_2_int(self, value): self.bits.event_cnt_2_int = value
+ def get_event_cnt_2_int(self): value = self.bits.event_cnt_2_int; return value
+ def set_event_cnt_3_int(self, value): self.bits.event_cnt_3_int = value
+ def get_event_cnt_3_int(self): value = self.bits.event_cnt_3_int; return value
+ def set_cycle_cnt_int(self, value): self.bits.cycle_cnt_int = value
+ def get_cycle_cnt_int(self): value = self.bits.cycle_cnt_int; return value
+
+
+class pmintclr_r(Union):
+ class _bitfield(Structure):
+ _fields_ = [
+ ("event_cnt_0_int", c_uint32, 1),
+ ("event_cnt_1_int", c_uint32, 1),
+ ("event_cnt_2_int", c_uint32, 1),
+ ("event_cnt_3_int", c_uint32, 1),
+ ("reserved0", c_uint32, 27),
+ ("cycle_cnt_int", c_uint32, 1),
+ ]
+ _fields_ = [("bits", _bitfield),
+ ("word", c_uint32)]
+ def set_event_cnt_0_int(self, value): self.bits.event_cnt_0_int = value
+ def get_event_cnt_0_int(self): value = self.bits.event_cnt_0_int; return value
+ def set_event_cnt_1_int(self, value): self.bits.event_cnt_1_int = value
+ def get_event_cnt_1_int(self): value = self.bits.event_cnt_1_int; return value
+ def set_event_cnt_2_int(self, value): self.bits.event_cnt_2_int = value
+ def get_event_cnt_2_int(self): value = self.bits.event_cnt_2_int; return value
+ def set_event_cnt_3_int(self, value): self.bits.event_cnt_3_int = value
+ def get_event_cnt_3_int(self): value = self.bits.event_cnt_3_int; return value
+ def set_cycle_cnt_int(self, value): self.bits.cycle_cnt_int = value
+ def get_cycle_cnt_int(self): value = self.bits.cycle_cnt_int; return value
+
+
+class pmccntr_lo_r(Union):
+ class _bitfield(Structure):
+ _fields_ = [
+ ("cycle_cnt_lo", c_uint32, 32),
+ ]
+ _fields_ = [("bits", _bitfield),
+ ("word", c_uint32)]
+ def set_cycle_cnt_lo(self, value): self.bits.cycle_cnt_lo = value
+ def get_cycle_cnt_lo(self): value = self.bits.cycle_cnt_lo; return value
+
+
+class pmccntr_hi_r(Union):
+ class _bitfield(Structure):
+ _fields_ = [
+ ("cycle_cnt_hi", c_uint32, 16),
+ ("reserved0", c_uint32, 16),
+ ]
+ _fields_ = [("bits", _bitfield),
+ ("word", c_uint32)]
+ def set_cycle_cnt_hi(self, value): self.bits.cycle_cnt_hi = value
+ def get_cycle_cnt_hi(self): value = self.bits.cycle_cnt_hi; return value
+
+
+class pmccntr_cfg_r(Union):
+ class _bitfield(Structure):
+ _fields_ = [
+ ("cycle_cnt_cfg_start", c_uint32, 10),
+ ("reserved0", c_uint32, 6),
+ ("cycle_cnt_cfg_stop", c_uint32, 10),
+ ("reserved1", c_uint32, 6),
+ ]
+ _fields_ = [("bits", _bitfield),
+ ("word", c_uint32)]
+ def set_cycle_cnt_cfg_start(self, value): self.bits.cycle_cnt_cfg_start = value
+ def get_cycle_cnt_cfg_start(self): value = self.bits.cycle_cnt_cfg_start; return value
+ def set_cycle_cnt_cfg_stop(self, value): self.bits.cycle_cnt_cfg_stop = value
+ def get_cycle_cnt_cfg_stop(self): value = self.bits.cycle_cnt_cfg_stop; return value
+
+
+class pmcaxi_chan_r(Union):
+ class _bitfield(Structure):
+ _fields_ = [
+ ("axi_chan", c_uint32, 4),
+ ("reserved0", c_uint32, 3),
+ ("rw", c_uint32, 1),
+ ("axi_cnt", c_uint32, 2),
+ ("reserved1", c_uint32, 22),
+ ]
+ _fields_ = [("bits", _bitfield),
+ ("word", c_uint32)]
+ def set_axi_chan(self, value): self.bits.axi_chan = value
+ def get_axi_chan(self): value = self.bits.axi_chan; return value
+ def set_rw(self, value): self.bits.rw = value
+ def get_rw(self): value = self.bits.rw; return value
+ def set_axi_cnt(self, value): self.bits.axi_cnt = value
+ def get_axi_cnt(self): value = self.bits.axi_cnt; return value
+
+
+class pmevtyper0_r(Union):
+ class _bitfield(Structure):
+ _fields_ = [
+ ("ev_type", c_uint32, 10),
+ ("reserved0", c_uint32, 22),
+ ]
+ _fields_ = [("bits", _bitfield),
+ ("word", c_uint32)]
+ def set_ev_type(self, value): self.bits.ev_type = value
+ def get_ev_type(self): value = self.bits.ev_type; return value
+
+
+class pmevtyper1_r(Union):
+ class _bitfield(Structure):
+ _fields_ = [
+ ("ev_type", c_uint32, 10),
+ ("reserved0", c_uint32, 22),
+ ]
+ _fields_ = [("bits", _bitfield),
+ ("word", c_uint32)]
+ def set_ev_type(self, value): self.bits.ev_type = value
+ def get_ev_type(self): value = self.bits.ev_type; return value
+
+
+class pmevtyper2_r(Union):
+ class _bitfield(Structure):
+ _fields_ = [
+ ("ev_type", c_uint32, 10),
+ ("reserved0", c_uint32, 22),
+ ]
+ _fields_ = [("bits", _bitfield),
+ ("word", c_uint32)]
+ def set_ev_type(self, value): self.bits.ev_type = value
+ def get_ev_type(self): value = self.bits.ev_type; return value
+
+
+class pmevtyper3_r(Union):
+ class _bitfield(Structure):
+ _fields_ = [
+ ("ev_type", c_uint32, 10),
+ ("reserved0", c_uint32, 22),
+ ]
+ _fields_ = [("bits", _bitfield),
+ ("word", c_uint32)]
+ def set_ev_type(self, value): self.bits.ev_type = value
+ def get_ev_type(self): value = self.bits.ev_type; return value
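+
+# Usage sketch (illustrative): selecting the event counted by event counter 0, using one
+# of the pmu_event_type values defined earlier in this module:
+#     evt = pmevtyper0_r()
+#     evt.set_ev_type(pmu_event_type.MAC_ACTIVE.value)
+#     raw = evt.word    # value to write at offset PMU_INTERNAL.PMEVTYPER0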
+
+class command_no_payload_t(Structure):
+ _fields_ = [
+ ("cmd_code", c_uint32, 10),
+ ("must_be_zero0", c_uint32, 6),
+ ("param", c_uint32, 16),
+ ]
+ def valid(self): return self.must_be_zero0 == 0
+ def get_cmd_code(self): return self.cmd_code
+ def set_cmd_code(self, value): self.cmd_code = value
+ def get_param(self): return self.param
+ def set_param(self, value): self.param = value
+
+class command_with_payload_t(Structure):
+ _fields_ = [
+ ("cmd_code", c_uint32, 10),
+ ("must_be_zero", c_uint32, 4),
+ ("payload_size", c_uint32, 2),
+ ("param", c_uint32, 16),
+ ("data", c_uint32, 32),
+ ]
+ def valid(self): return self.must_be_zero == 0 and 1 <= self.payload_size <= 2
+ def get_cmd_code(self): return self.cmd_code
+ def set_cmd_code(self, value): self.cmd_code = value
+ def get_data(self): return self.data
+ def set_data(self, value): self.data = value
+ def get_param(self): return self.param
+ def set_param(self, value): self.param = value
+ def get_payload_size(self): return self.payload_size
+ def set_payload_size(self, value): self.payload_size = value
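+
+# Usage sketch (one reading of the two layouts above, not a definitive statement of the
+# command-stream format): cmd0.* opcodes appear to use the single-word, no-payload form
+# (10-bit cmd_code plus 16-bit param), while cmd1.* opcodes appear to use the payload
+# form, in which a 32-bit data word follows the command word. For example:
+#     cmd = command_no_payload_t()
+#     cmd.set_cmd_code(cmd0.NPU_SET_IFM_PAD_TOP.value)
+#     cmd.set_param(2)    # illustrative padding value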
+
+class npu_op_stop_t(Structure):
+ _fields_ = [
+ ("cmd_code", c_uint32, 10),
+ ("must_be_zero0", c_uint32, 6),
+ ("mask", c_uint32, 16),
+ ]
+ def valid(self): return self.cmd_code == cmd0.NPU_OP_STOP.value and self.must_be_zero0 == 0
+ def get_cmd_code(self): return self.cmd_code
+ def set_cmd_code(self, value): self.cmd_code = value
+ def get_mask(self): return self.mask
+ def set_mask(self, value): self.mask = value
+
+class npu_op_irq_t(Structure):
+ _fields_ = [
+ ("cmd_code", c_uint32, 10),
+ ("must_be_zero0", c_uint32, 6),
+ ("mask", c_uint32, 16),
+ ]
+ def valid(self): return self.cmd_code == cmd0.NPU_OP_IRQ.value and self.must_be_zero0 == 0
+ def get_cmd_code(self): return self.cmd_code
+ def set_cmd_code(self, value): self.cmd_code = value
+ def get_mask(self): return self.mask
+ def set_mask(self, value): self.mask = value
+
+class npu_op_conv_t(Structure):
+ _fields_ = [
+ ("cmd_code", c_uint32, 10),
+ ("must_be_zero0", c_uint32, 6),
+ ("reserved0", c_uint32, 16),
+ ]
+ def valid(self): return self.cmd_code == cmd0.NPU_OP_CONV.value and self.must_be_zero0 == 0
+ def get_cmd_code(self): return self.cmd_code
+ def set_cmd_code(self, value): self.cmd_code = value
+
+class npu_op_depthwise_t(Structure):
+ _fields_ = [
+ ("cmd_code", c_uint32, 10),
+ ("must_be_zero0", c_uint32, 6),
+ ("reserved0", c_uint32, 16),
+ ]
+ def valid(self): return self.cmd_code == cmd0.NPU_OP_DEPTHWISE.value and self.must_be_zero0 == 0
+ def get_cmd_code(self): return self.cmd_code
+ def set_cmd_code(self, value): self.cmd_code = value
+
+class npu_op_pool_t(Structure):
+ _fields_ = [
+ ("cmd_code", c_uint32, 10),
+ ("must_be_zero0", c_uint32, 6),
+ ("mode", c_uint32, 16),
+ ]
+ def valid(self): return self.cmd_code == cmd0.NPU_OP_POOL.value and self.must_be_zero0 == 0
+ def get_cmd_code(self): return self.cmd_code
+ def set_cmd_code(self, value): self.cmd_code = value
+ def get_mode(self): return self.mode
+ def set_mode(self, value): self.mode = value
+
+class npu_op_elementwise_t(Structure):
+ _fields_ = [
+ ("cmd_code", c_uint32, 10),
+ ("must_be_zero0", c_uint32, 6),
+ ("mode", c_uint32, 16),
+ ]
+ def valid(self): return self.cmd_code == cmd0.NPU_OP_ELEMENTWISE.value and self.must_be_zero0 == 0
+ def get_cmd_code(self): return self.cmd_code
+ def set_cmd_code(self, value): self.cmd_code = value
+ def get_mode(self): return self.mode
+ def set_mode(self, value): self.mode = value
+
+class npu_op_dma_start_t(Structure):
+ _fields_ = [
+ ("cmd_code", c_uint32, 10),
+ ("must_be_zero0", c_uint32, 6),
+ ("channel_mode", c_uint32, 16),
+ ]
+ def valid(self): return self.cmd_code == cmd0.NPU_OP_DMA_START.value and self.must_be_zero0 == 0
+ def get_channel_mode(self): return self.channel_mode
+ def set_channel_mode(self, value): self.channel_mode = value
+ def get_cmd_code(self): return self.cmd_code
+ def set_cmd_code(self, value): self.cmd_code = value
+
+class npu_op_dma_wait_t(Structure):
+ _fields_ = [
+ ("cmd_code", c_uint32, 10),
+ ("must_be_zero0", c_uint32, 6),
+ ("reserved0", c_uint32, 16),
+ ]
+ def valid(self): return self.cmd_code == cmd0.NPU_OP_DMA_WAIT.value and self.must_be_zero0 == 0
+ def get_cmd_code(self): return self.cmd_code
+ def set_cmd_code(self, value): self.cmd_code = value
+
+class npu_op_kernel_wait_t(Structure):
+ _fields_ = [
+ ("cmd_code", c_uint32, 10),
+ ("must_be_zero0", c_uint32, 6),
+ ("param", c_uint32, 16),
+ ]
+ def valid(self): return self.cmd_code == cmd0.NPU_OP_KERNEL_WAIT.value and self.must_be_zero0 == 0
+ def get_cmd_code(self): return self.cmd_code
+ def set_cmd_code(self, value): self.cmd_code = value
+ def get_param(self): return self.param
+ def set_param(self, value): self.param = value
+
+class npu_op_pmu_mask_t(Structure):
+ _fields_ = [
+ ("cmd_code", c_uint32, 10),
+ ("must_be_zero0", c_uint32, 6),
+ ("param", c_uint32, 16),
+ ]
+ def valid(self): return self.cmd_code == cmd0.NPU_OP_PMU_MASK.value and self.must_be_zero0 == 0
+ def get_cmd_code(self): return self.cmd_code
+ def set_cmd_code(self, value): self.cmd_code = value
+ def get_param(self): return self.param
+ def set_param(self, value): self.param = value
+
+class npu_set_ifm_pad_top_t(Structure):
+ _fields_ = [
+ ("cmd_code", c_uint32, 10),
+ ("must_be_zero0", c_uint32, 6),
+ ("param", c_uint32, 16),
+ ]
+ def valid(self): return self.cmd_code == cmd0.NPU_SET_IFM_PAD_TOP.value and self.must_be_zero0 == 0
+ def get_cmd_code(self): return self.cmd_code
+ def set_cmd_code(self, value): self.cmd_code = value
+ def get_param(self): return self.param
+ def set_param(self, value): self.param = value
+
+class npu_set_ifm_pad_left_t(Structure):
+ _fields_ = [
+ ("cmd_code", c_uint32, 10),
+ ("must_be_zero0", c_uint32, 6),
+ ("param", c_uint32, 16),
+ ]
+ def valid(self): return self.cmd_code == cmd0.NPU_SET_IFM_PAD_LEFT.value and self.must_be_zero0 == 0
+ def get_cmd_code(self): return self.cmd_code
+ def set_cmd_code(self, value): self.cmd_code = value
+ def get_param(self): return self.param
+ def set_param(self, value): self.param = value
+
+class npu_set_ifm_pad_right_t(Structure):
+ _fields_ = [
+ ("cmd_code", c_uint32, 10),
+ ("must_be_zero0", c_uint32, 6),
+ ("param", c_uint32, 16),
+ ]
+ def valid(self): return self.cmd_code == cmd0.NPU_SET_IFM_PAD_RIGHT.value and self.must_be_zero0 == 0
+ def get_cmd_code(self): return self.cmd_code
+ def set_cmd_code(self, value): self.cmd_code = value
+ def get_param(self): return self.param
+ def set_param(self, value): self.param = value
+
+class npu_set_ifm_pad_bottom_t(Structure):
+ _fields_ = [
+ ("cmd_code", c_uint32, 10),
+ ("must_be_zero0", c_uint32, 6),
+ ("param", c_uint32, 16),
+ ]
+ def valid(self): return self.cmd_code == cmd0.NPU_SET_IFM_PAD_BOTTOM.value and self.must_be_zero0 == 0
+ def get_cmd_code(self): return self.cmd_code
+ def set_cmd_code(self, value): self.cmd_code = value
+ def get_param(self): return self.param
+ def set_param(self, value): self.param = value
+
+class npu_set_ifm_depth_m1_t(Structure):
+ _fields_ = [
+ ("cmd_code", c_uint32, 10),
+ ("must_be_zero0", c_uint32, 6),
+ ("param", c_uint32, 16),
+ ]
+ def valid(self): return self.cmd_code == cmd0.NPU_SET_IFM_DEPTH_M1.value and self.must_be_zero0 == 0
+ def get_cmd_code(self): return self.cmd_code
+ def set_cmd_code(self, value): self.cmd_code = value
+ def get_param(self): return self.param
+ def set_param(self, value): self.param = value
+
+class npu_set_ifm_precision_t(Structure):
+ _fields_ = [
+ ("cmd_code", c_uint32, 10),
+ ("must_be_zero0", c_uint32, 6),
+ ("param", c_uint32, 4),
+ ("reserved0", c_uint32, 2),
+ ("format", c_uint32, 2),
+ ("scale_mode", c_uint32, 2),
+ ("reserved1", c_uint32, 4),
+ ("round_mode", c_uint32, 2),
+ ]
+ def valid(self): return self.cmd_code == cmd0.NPU_SET_IFM_PRECISION.value and self.must_be_zero0 == 0
+ def get_cmd_code(self): return self.cmd_code
+ def set_cmd_code(self, value): self.cmd_code = value
+ def get_format(self): return self.format
+ def set_format(self, value): self.format = value
+ def get_param(self): return self.param
+ def set_param(self, value): self.param = value
+ def get_round_mode(self): return self.round_mode
+ def set_round_mode(self, value): self.round_mode = value
+ def get_scale_mode(self): return self.scale_mode
+ def set_scale_mode(self, value): self.scale_mode = value
+
+class npu_set_ifm_upscale_t(Structure):
+ _fields_ = [
+ ("cmd_code", c_uint32, 10),
+ ("must_be_zero0", c_uint32, 6),
+ ("mode", c_uint32, 2),
+ ("reserved0", c_uint32, 14),
+ ]
+ def valid(self): return self.cmd_code == cmd0.NPU_SET_IFM_UPSCALE.value and self.must_be_zero0 == 0
+ def get_cmd_code(self): return self.cmd_code
+ def set_cmd_code(self, value): self.cmd_code = value
+ def get_mode(self): return self.mode
+ def set_mode(self, value): self.mode = value
+
+class npu_set_ifm_zero_point_t(Structure):
+ _fields_ = [
+ ("cmd_code", c_uint32, 10),
+ ("must_be_zero0", c_uint32, 6),
+ ("param", c_uint32, 16),
+ ]
+ def valid(self): return self.cmd_code == cmd0.NPU_SET_IFM_ZERO_POINT.value and self.must_be_zero0 == 0
+ def get_cmd_code(self): return self.cmd_code
+ def set_cmd_code(self, value): self.cmd_code = value
+ def get_param(self): return self.param
+ def set_param(self, value): self.param = value
+
+class npu_set_ifm_width0_m1_t(Structure):
+ _fields_ = [
+ ("cmd_code", c_uint32, 10),
+ ("must_be_zero0", c_uint32, 6),
+ ("param", c_uint32, 16),
+ ]
+ def valid(self): return self.cmd_code == cmd0.NPU_SET_IFM_WIDTH0_M1.value and self.must_be_zero0 == 0
+ def get_cmd_code(self): return self.cmd_code
+ def set_cmd_code(self, value): self.cmd_code = value
+ def get_param(self): return self.param
+ def set_param(self, value): self.param = value
+
+class npu_set_ifm_height0_m1_t(Structure):
+ _fields_ = [
+ ("cmd_code", c_uint32, 10),
+ ("must_be_zero0", c_uint32, 6),
+ ("param", c_uint32, 16),
+ ]
+ def valid(self): return self.cmd_code == cmd0.NPU_SET_IFM_HEIGHT0_M1.value and self.must_be_zero0 == 0
+ def get_cmd_code(self): return self.cmd_code
+ def set_cmd_code(self, value): self.cmd_code = value
+ def get_param(self): return self.param
+ def set_param(self, value): self.param = value
+
+class npu_set_ifm_height1_m1_t(Structure):
+ _fields_ = [
+ ("cmd_code", c_uint32, 10),
+ ("must_be_zero0", c_uint32, 6),
+ ("param", c_uint32, 16),
+ ]
+ def valid(self): return self.cmd_code == cmd0.NPU_SET_IFM_HEIGHT1_M1.value and self.must_be_zero0 == 0
+ def get_cmd_code(self): return self.cmd_code
+ def set_cmd_code(self, value): self.cmd_code = value
+ def get_param(self): return self.param
+ def set_param(self, value): self.param = value
+
+class npu_set_ifm_ib_end_t(Structure):
+ _fields_ = [
+ ("cmd_code", c_uint32, 10),
+ ("must_be_zero0", c_uint32, 6),
+ ("param", c_uint32, 16),
+ ]
+ def valid(self): return self.cmd_code == cmd0.NPU_SET_IFM_IB_END.value and self.must_be_zero0 == 0
+ def get_cmd_code(self): return self.cmd_code
+ def set_cmd_code(self, value): self.cmd_code = value
+ def get_param(self): return self.param
+ def set_param(self, value): self.param = value
+
+class npu_set_ifm_region_t(Structure):
+ _fields_ = [
+ ("cmd_code", c_uint32, 10),
+ ("must_be_zero0", c_uint32, 6),
+ ("param", c_uint32, 16),
+ ]
+ def valid(self): return self.cmd_code == cmd0.NPU_SET_IFM_REGION.value and self.must_be_zero0 == 0
+ def get_cmd_code(self): return self.cmd_code
+ def set_cmd_code(self, value): self.cmd_code = value
+ def get_param(self): return self.param
+ def set_param(self, value): self.param = value
+
+class npu_set_ofm_width_m1_t(Structure):
+ _fields_ = [
+ ("cmd_code", c_uint32, 10),
+ ("must_be_zero0", c_uint32, 6),
+ ("param", c_uint32, 16),
+ ]
+ def valid(self): return self.cmd_code == cmd0.NPU_SET_OFM_WIDTH_M1.value and self.must_be_zero0 == 0
+ def get_cmd_code(self): return self.cmd_code
+ def set_cmd_code(self, value): self.cmd_code = value
+ def get_param(self): return self.param
+ def set_param(self, value): self.param = value
+
+class npu_set_ofm_height_m1_t(Structure):
+ _fields_ = [
+ ("cmd_code", c_uint32, 10),
+ ("must_be_zero0", c_uint32, 6),
+ ("param", c_uint32, 16),
+ ]
+ def valid(self): return cmd_code==cmd0.NPU_SET_OFM_HEIGHT_M1 and must_be_zero0==0;
+ def get_cmd_code(self): return cmd_code
+ def set_cmd_code(self, value): cmd_code = value
+ def get_param(self): return param
+ def set_param(self, value): param = value
+
+class npu_set_ofm_depth_m1_t(Structure):
+ _fields_ = [
+ ("cmd_code", c_uint32, 10),
+ ("must_be_zero0", c_uint32, 6),
+ ("param", c_uint32, 16),
+ ]
+ def valid(self): return cmd_code==cmd0.NPU_SET_OFM_DEPTH_M1 and must_be_zero0==0;
+ def get_cmd_code(self): return cmd_code
+ def set_cmd_code(self, value): cmd_code = value
+ def get_param(self): return param
+ def set_param(self, value): param = value
+
+class npu_set_ofm_precision_t(Structure):
+ _fields_ = [
+ ("cmd_code", c_uint32, 10),
+ ("must_be_zero0", c_uint32, 6),
+ ("precision", c_uint32, 3),
+ ("reserved0", c_uint32, 3),
+ ("format", c_uint32, 2),
+ ("scaling", c_uint32, 1),
+ ("reserved1", c_uint32, 5),
+ ("rounding", c_uint32, 2),
+ ]
+ def valid(self): return cmd_code==cmd0.NPU_SET_OFM_PRECISION and must_be_zero0==0;
+ def get_cmd_code(self): return cmd_code
+ def set_cmd_code(self, value): cmd_code = value
+ def get_format(self): return format
+ def set_format(self, value): format = value
+ def get_precision(self): return precision
+ def set_precision(self, value): precision = value
+ def get_rounding(self): return rounding
+ def set_rounding(self, value): rounding = value
+ def get_scaling(self): return scaling
+ def set_scaling(self, value): scaling = value
+
+class npu_set_ofm_blk_width_m1_t(Structure):
+ _fields_ = [
+ ("cmd_code", c_uint32, 10),
+ ("must_be_zero0", c_uint32, 6),
+ ("param", c_uint32, 16),
+ ]
+ def valid(self): return cmd_code==cmd0.NPU_SET_OFM_BLK_WIDTH_M1 and must_be_zero0==0;
+ def get_cmd_code(self): return cmd_code
+ def set_cmd_code(self, value): cmd_code = value
+ def get_param(self): return param
+ def set_param(self, value): param = value
+
+class npu_set_ofm_blk_height_m1_t(Structure):
+ _fields_ = [
+ ("cmd_code", c_uint32, 10),
+ ("must_be_zero0", c_uint32, 6),
+ ("param", c_uint32, 16),
+ ]
+ def valid(self): return cmd_code==cmd0.NPU_SET_OFM_BLK_HEIGHT_M1 and must_be_zero0==0;
+ def get_cmd_code(self): return cmd_code
+ def set_cmd_code(self, value): cmd_code = value
+ def get_param(self): return param
+ def set_param(self, value): param = value
+
+class npu_set_ofm_blk_depth_m1_t(Structure):
+ _fields_ = [
+ ("cmd_code", c_uint32, 10),
+ ("must_be_zero0", c_uint32, 6),
+ ("param", c_uint32, 16),
+ ]
+ def valid(self): return cmd_code==cmd0.NPU_SET_OFM_BLK_DEPTH_M1 and must_be_zero0==0;
+ def get_cmd_code(self): return cmd_code
+ def set_cmd_code(self, value): cmd_code = value
+ def get_param(self): return param
+ def set_param(self, value): param = value
+
+class npu_set_ofm_zero_point_t(Structure):
+ _fields_ = [
+ ("cmd_code", c_uint32, 10),
+ ("must_be_zero0", c_uint32, 6),
+ ("param", c_uint32, 16),
+ ]
+ def valid(self): return cmd_code==cmd0.NPU_SET_OFM_ZERO_POINT and must_be_zero0==0;
+ def get_cmd_code(self): return cmd_code
+ def set_cmd_code(self, value): cmd_code = value
+ def get_param(self): return param
+ def set_param(self, value): param = value
+
+class npu_set_ofm_width0_m1_t(Structure):
+ _fields_ = [
+ ("cmd_code", c_uint32, 10),
+ ("must_be_zero0", c_uint32, 6),
+ ("param", c_uint32, 16),
+ ]
+ def valid(self): return cmd_code==cmd0.NPU_SET_OFM_WIDTH0_M1 and must_be_zero0==0;
+ def get_cmd_code(self): return cmd_code
+ def set_cmd_code(self, value): cmd_code = value
+ def get_param(self): return param
+ def set_param(self, value): param = value
+
+class npu_set_ofm_height0_m1_t(Structure):
+ _fields_ = [
+ ("cmd_code", c_uint32, 10),
+ ("must_be_zero0", c_uint32, 6),
+ ("param", c_uint32, 16),
+ ]
+ def valid(self): return cmd_code==cmd0.NPU_SET_OFM_HEIGHT0_M1 and must_be_zero0==0;
+ def get_cmd_code(self): return cmd_code
+ def set_cmd_code(self, value): cmd_code = value
+ def get_param(self): return param
+ def set_param(self, value): param = value
+
+class npu_set_ofm_height1_m1_t(Structure):
+ _fields_ = [
+ ("cmd_code", c_uint32, 10),
+ ("must_be_zero0", c_uint32, 6),
+ ("param", c_uint32, 16),
+ ]
+ def valid(self): return cmd_code==cmd0.NPU_SET_OFM_HEIGHT1_M1 and must_be_zero0==0;
+ def get_cmd_code(self): return cmd_code
+ def set_cmd_code(self, value): cmd_code = value
+ def get_param(self): return param
+ def set_param(self, value): param = value
+
+class npu_set_ofm_region_t(Structure):
+ _fields_ = [
+ ("cmd_code", c_uint32, 10),
+ ("must_be_zero0", c_uint32, 6),
+ ("param", c_uint32, 16),
+ ]
+ def valid(self): return cmd_code==cmd0.NPU_SET_OFM_REGION and must_be_zero0==0;
+ def get_cmd_code(self): return cmd_code
+ def set_cmd_code(self, value): cmd_code = value
+ def get_param(self): return param
+ def set_param(self, value): param = value
+
+class npu_set_kernel_width_m1_t(Structure):
+ _fields_ = [
+ ("cmd_code", c_uint32, 10),
+ ("must_be_zero0", c_uint32, 6),
+ ("param", c_uint32, 16),
+ ]
+ def valid(self): return cmd_code==cmd0.NPU_SET_KERNEL_WIDTH_M1 and must_be_zero0==0;
+ def get_cmd_code(self): return cmd_code
+ def set_cmd_code(self, value): cmd_code = value
+ def get_param(self): return param
+ def set_param(self, value): param = value
+
+class npu_set_kernel_height_m1_t(Structure):
+ _fields_ = [
+ ("cmd_code", c_uint32, 10),
+ ("must_be_zero0", c_uint32, 6),
+ ("param", c_uint32, 16),
+ ]
+ def valid(self): return cmd_code==cmd0.NPU_SET_KERNEL_HEIGHT_M1 and must_be_zero0==0;
+ def get_cmd_code(self): return cmd_code
+ def set_cmd_code(self, value): cmd_code = value
+ def get_param(self): return param
+ def set_param(self, value): param = value
+
+class npu_set_kernel_stride_t(Structure):
+ _fields_ = [
+ ("cmd_code", c_uint32, 10),
+ ("must_be_zero0", c_uint32, 6),
+ ("param", c_uint32, 16),
+ ]
+ def valid(self): return cmd_code==cmd0.NPU_SET_KERNEL_STRIDE and must_be_zero0==0;
+ def get_cmd_code(self): return cmd_code
+ def set_cmd_code(self, value): cmd_code = value
+ def get_param(self): return param
+ def set_param(self, value): param = value
+
+class npu_set_parallel_mode_t(Structure):
+ _fields_ = [
+ ("cmd_code", c_uint32, 10),
+ ("must_be_zero0", c_uint32, 6),
+ ("param", c_uint32, 16),
+ ]
+ def valid(self): return cmd_code==cmd0.NPU_SET_PARALLEL_MODE and must_be_zero0==0;
+ def get_cmd_code(self): return cmd_code
+ def set_cmd_code(self, value): cmd_code = value
+ def get_param(self): return param
+ def set_param(self, value): param = value
+
+class npu_set_acc_format_t(Structure):
+ _fields_ = [
+ ("cmd_code", c_uint32, 10),
+ ("must_be_zero0", c_uint32, 6),
+ ("param", c_uint32, 16),
+ ]
+ def valid(self): return cmd_code==cmd0.NPU_SET_ACC_FORMAT and must_be_zero0==0;
+ def get_cmd_code(self): return cmd_code
+ def set_cmd_code(self, value): cmd_code = value
+ def get_param(self): return param
+ def set_param(self, value): param = value
+
+class npu_set_activation_t(Structure):
+ _fields_ = [
+ ("cmd_code", c_uint32, 10),
+ ("must_be_zero0", c_uint32, 6),
+ ("type", c_uint32, 12),
+ ("act_clip_range", c_uint32, 4),
+ ]
+ def valid(self): return cmd_code==cmd0.NPU_SET_ACTIVATION and must_be_zero0==0;
+ def get_act_clip_range(self): return act_clip_range
+ def set_act_clip_range(self, value): act_clip_range = value
+ def get_cmd_code(self): return cmd_code
+ def set_cmd_code(self, value): cmd_code = value
+ def get_type(self): return type
+ def set_type(self, value): type = value
+
+class npu_set_activation_min_t(Structure):
+ _fields_ = [
+ ("cmd_code", c_uint32, 10),
+ ("must_be_zero0", c_uint32, 6),
+ ("param", c_uint32, 16),
+ ]
+ def valid(self): return cmd_code==cmd0.NPU_SET_ACTIVATION_MIN and must_be_zero0==0;
+ def get_cmd_code(self): return cmd_code
+ def set_cmd_code(self, value): cmd_code = value
+ def get_param(self): return param
+ def set_param(self, value): param = value
+
+class npu_set_activation_max_t(Structure):
+ _fields_ = [
+ ("cmd_code", c_uint32, 10),
+ ("must_be_zero0", c_uint32, 6),
+ ("param", c_uint32, 16),
+ ]
+ def valid(self): return cmd_code==cmd0.NPU_SET_ACTIVATION_MAX and must_be_zero0==0;
+ def get_cmd_code(self): return cmd_code
+ def set_cmd_code(self, value): cmd_code = value
+ def get_param(self): return param
+ def set_param(self, value): param = value
+
+class npu_set_weight_region_t(Structure):
+ _fields_ = [
+ ("cmd_code", c_uint32, 10),
+ ("must_be_zero0", c_uint32, 6),
+ ("param", c_uint32, 16),
+ ]
+ def valid(self): return cmd_code==cmd0.NPU_SET_WEIGHT_REGION and must_be_zero0==0;
+ def get_cmd_code(self): return cmd_code
+ def set_cmd_code(self, value): cmd_code = value
+ def get_param(self): return param
+ def set_param(self, value): param = value
+
+class npu_set_scale_region_t(Structure):
+ _fields_ = [
+ ("cmd_code", c_uint32, 10),
+ ("must_be_zero0", c_uint32, 6),
+ ("param", c_uint32, 16),
+ ]
+ def valid(self): return cmd_code==cmd0.NPU_SET_SCALE_REGION and must_be_zero0==0;
+ def get_cmd_code(self): return cmd_code
+ def set_cmd_code(self, value): cmd_code = value
+ def get_param(self): return param
+ def set_param(self, value): param = value
+
+class npu_set_ab_start_t(Structure):
+ _fields_ = [
+ ("cmd_code", c_uint32, 10),
+ ("must_be_zero0", c_uint32, 6),
+ ("param", c_uint32, 16),
+ ]
+ def valid(self): return cmd_code==cmd0.NPU_SET_AB_START and must_be_zero0==0;
+ def get_cmd_code(self): return cmd_code
+ def set_cmd_code(self, value): cmd_code = value
+ def get_param(self): return param
+ def set_param(self, value): param = value
+
+class npu_set_blockdep_t(Structure):
+ _fields_ = [
+ ("cmd_code", c_uint32, 10),
+ ("must_be_zero0", c_uint32, 6),
+ ("param", c_uint32, 16),
+ ]
+ def valid(self): return cmd_code==cmd0.NPU_SET_BLOCKDEP and must_be_zero0==0;
+ def get_cmd_code(self): return cmd_code
+ def set_cmd_code(self, value): cmd_code = value
+ def get_param(self): return param
+ def set_param(self, value): param = value
+
+class npu_set_dma0_src_region_t(Structure):
+ _fields_ = [
+ ("cmd_code", c_uint32, 10),
+ ("must_be_zero0", c_uint32, 6),
+ ("region", c_uint32, 8),
+ ("internal", c_uint32, 1),
+ ("stride_mode", c_uint32, 2),
+ ("reserved0", c_uint32, 5),
+ ]
+ def valid(self): return cmd_code==cmd0.NPU_SET_DMA0_SRC_REGION and must_be_zero0==0;
+ def get_cmd_code(self): return cmd_code
+ def set_cmd_code(self, value): cmd_code = value
+ def get_internal(self): return internal
+ def set_internal(self, value): internal = value
+ def get_region(self): return region
+ def set_region(self, value): region = value
+ def get_stride_mode(self): return stride_mode
+ def set_stride_mode(self, value): stride_mode = value
+
+class npu_set_dma0_dst_region_t(Structure):
+ _fields_ = [
+ ("cmd_code", c_uint32, 10),
+ ("must_be_zero0", c_uint32, 6),
+ ("region", c_uint32, 8),
+ ("internal", c_uint32, 1),
+ ("stride_mode", c_uint32, 2),
+ ("reserved0", c_uint32, 5),
+ ]
+ def valid(self): return cmd_code==cmd0.NPU_SET_DMA0_DST_REGION and must_be_zero0==0;
+ def get_cmd_code(self): return cmd_code
+ def set_cmd_code(self, value): cmd_code = value
+ def get_internal(self): return internal
+ def set_internal(self, value): internal = value
+ def get_region(self): return region
+ def set_region(self, value): region = value
+ def get_stride_mode(self): return stride_mode
+ def set_stride_mode(self, value): stride_mode = value
+
+class npu_set_dma0_size0_t(Structure):
+ _fields_ = [
+ ("cmd_code", c_uint32, 10),
+ ("must_be_zero0", c_uint32, 6),
+ ("param", c_uint32, 16),
+ ]
+ def valid(self): return cmd_code==cmd0.NPU_SET_DMA0_SIZE0 and must_be_zero0==0;
+ def get_cmd_code(self): return cmd_code
+ def set_cmd_code(self, value): cmd_code = value
+ def get_param(self): return param
+ def set_param(self, value): param = value
+
+class npu_set_dma0_size1_t(Structure):
+ _fields_ = [
+ ("cmd_code", c_uint32, 10),
+ ("must_be_zero0", c_uint32, 6),
+ ("param", c_uint32, 16),
+ ]
+ def valid(self): return cmd_code==cmd0.NPU_SET_DMA0_SIZE1 and must_be_zero0==0;
+ def get_cmd_code(self): return cmd_code
+ def set_cmd_code(self, value): cmd_code = value
+ def get_param(self): return param
+ def set_param(self, value): param = value
+
+class npu_set_ifm2_broadcast_t(Structure):
+ _fields_ = [
+ ("cmd_code", c_uint32, 10),
+ ("must_be_zero0", c_uint32, 6),
+ ("broadcast_height", c_uint32, 1),
+ ("broadcast_width", c_uint32, 1),
+ ("broadcast_depth", c_uint32, 1),
+ ("reserved0", c_uint32, 3),
+ ("operand_order", c_uint32, 1),
+ ("broadcast_scalar", c_uint32, 1),
+ ("reserved1", c_uint32, 8),
+ ]
+ def valid(self): return cmd_code==cmd0.NPU_SET_IFM2_BROADCAST and must_be_zero0==0;
+ def get_broadcast_depth(self): return broadcast_depth
+ def set_broadcast_depth(self, value): broadcast_depth = value
+ def get_broadcast_height(self): return broadcast_height
+ def set_broadcast_height(self, value): broadcast_height = value
+ def get_broadcast_scalar(self): return broadcast_scalar
+ def set_broadcast_scalar(self, value): broadcast_scalar = value
+ def get_broadcast_width(self): return broadcast_width
+ def set_broadcast_width(self, value): broadcast_width = value
+ def get_cmd_code(self): return cmd_code
+ def set_cmd_code(self, value): cmd_code = value
+ def get_operand_order(self): return operand_order
+ def set_operand_order(self, value): operand_order = value
+
+class npu_set_ifm2_scalar_t(Structure):
+ _fields_ = [
+ ("cmd_code", c_uint32, 10),
+ ("must_be_zero0", c_uint32, 6),
+ ("param", c_uint32, 16),
+ ]
+ def valid(self): return cmd_code==cmd0.NPU_SET_IFM2_SCALAR and must_be_zero0==0;
+ def get_cmd_code(self): return cmd_code
+ def set_cmd_code(self, value): cmd_code = value
+ def get_param(self): return param
+ def set_param(self, value): param = value
+
+class npu_set_ifm2_precision_t(Structure):
+ _fields_ = [
+ ("cmd_code", c_uint32, 10),
+ ("must_be_zero0", c_uint32, 6),
+ ("param", c_uint32, 4),
+ ("reserved0", c_uint32, 2),
+ ("format", c_uint32, 2),
+ ("reserved1", c_uint32, 8),
+ ]
+ def valid(self): return cmd_code==cmd0.NPU_SET_IFM2_PRECISION and must_be_zero0==0;
+ def get_cmd_code(self): return cmd_code
+ def set_cmd_code(self, value): cmd_code = value
+ def get_format(self): return format
+ def set_format(self, value): format = value
+ def get_param(self): return param
+ def set_param(self, value): param = value
+
+class npu_set_ifm2_zero_point_t(Structure):
+ _fields_ = [
+ ("cmd_code", c_uint32, 10),
+ ("must_be_zero0", c_uint32, 6),
+ ("param", c_uint32, 16),
+ ]
+ def valid(self): return cmd_code==cmd0.NPU_SET_IFM2_ZERO_POINT and must_be_zero0==0;
+ def get_cmd_code(self): return cmd_code
+ def set_cmd_code(self, value): cmd_code = value
+ def get_param(self): return param
+ def set_param(self, value): param = value
+
+class npu_set_ifm2_width0_m1_t(Structure):
+ _fields_ = [
+ ("cmd_code", c_uint32, 10),
+ ("must_be_zero0", c_uint32, 6),
+ ("param", c_uint32, 16),
+ ]
+ def valid(self): return cmd_code==cmd0.NPU_SET_IFM2_WIDTH0_M1 and must_be_zero0==0;
+ def get_cmd_code(self): return cmd_code
+ def set_cmd_code(self, value): cmd_code = value
+ def get_param(self): return param
+ def set_param(self, value): param = value
+
+class npu_set_ifm2_height0_m1_t(Structure):
+ _fields_ = [
+ ("cmd_code", c_uint32, 10),
+ ("must_be_zero0", c_uint32, 6),
+ ("param", c_uint32, 16),
+ ]
+ def valid(self): return cmd_code==cmd0.NPU_SET_IFM2_HEIGHT0_M1 and must_be_zero0==0;
+ def get_cmd_code(self): return cmd_code
+ def set_cmd_code(self, value): cmd_code = value
+ def get_param(self): return param
+ def set_param(self, value): param = value
+
+class npu_set_ifm2_height1_m1_t(Structure):
+ _fields_ = [
+ ("cmd_code", c_uint32, 10),
+ ("must_be_zero0", c_uint32, 6),
+ ("param", c_uint32, 16),
+ ]
+ def valid(self): return cmd_code==cmd0.NPU_SET_IFM2_HEIGHT1_M1 and must_be_zero0==0;
+ def get_cmd_code(self): return cmd_code
+ def set_cmd_code(self, value): cmd_code = value
+ def get_param(self): return param
+ def set_param(self, value): param = value
+
+class npu_set_ifm2_ib_start_t(Structure):
+ _fields_ = [
+ ("cmd_code", c_uint32, 10),
+ ("must_be_zero0", c_uint32, 6),
+ ("param", c_uint32, 16),
+ ]
+ def valid(self): return cmd_code==cmd0.NPU_SET_IFM2_IB_START and must_be_zero0==0;
+ def get_cmd_code(self): return cmd_code
+ def set_cmd_code(self, value): cmd_code = value
+ def get_param(self): return param
+ def set_param(self, value): param = value
+
+class npu_set_ifm2_region_t(Structure):
+ _fields_ = [
+ ("cmd_code", c_uint32, 10),
+ ("must_be_zero0", c_uint32, 6),
+ ("param", c_uint32, 16),
+ ]
+ def valid(self): return cmd_code==cmd0.NPU_SET_IFM2_REGION and must_be_zero0==0;
+ def get_cmd_code(self): return cmd_code
+ def set_cmd_code(self, value): cmd_code = value
+ def get_param(self): return param
+ def set_param(self, value): param = value
+
+class npu_set_ifm_base0_t(Structure):
+ _fields_ = [
+ ("cmd_code", c_uint32, 10),
+ ("must_be_zero", c_uint32, 4),
+ ("payload_size", c_uint32, 2),
+ ("reserved0", c_uint32, 16),
+ ("data", c_uint32, 32),
+ ]
+ def valid(self): return cmd_code==cmd1.NPU_SET_IFM_BASE0 and must_be_zero==0 and payload_size>=1 and payload_size<=2;
+ def get_cmd_code(self): return cmd_code
+ def set_cmd_code(self, value): cmd_code = value
+ def get_data(self): return data
+ def set_data(self, value): data = value
+ def get_payload_size(self): return payload_size
+ def set_payload_size(self, value): payload_size = value
+
+class npu_set_ifm_base1_t(Structure):
+ _fields_ = [
+ ("cmd_code", c_uint32, 10),
+ ("must_be_zero", c_uint32, 4),
+ ("payload_size", c_uint32, 2),
+ ("reserved0", c_uint32, 16),
+ ("data", c_uint32, 32),
+ ]
+ def valid(self): return cmd_code==cmd1.NPU_SET_IFM_BASE1 and must_be_zero==0 and payload_size>=1 and payload_size<=2;
+ def get_cmd_code(self): return cmd_code
+ def set_cmd_code(self, value): cmd_code = value
+ def get_data(self): return data
+ def set_data(self, value): data = value
+ def get_payload_size(self): return payload_size
+ def set_payload_size(self, value): payload_size = value
+
+class npu_set_ifm_base2_t(Structure):
+ _fields_ = [
+ ("cmd_code", c_uint32, 10),
+ ("must_be_zero", c_uint32, 4),
+ ("payload_size", c_uint32, 2),
+ ("reserved0", c_uint32, 16),
+ ("data", c_uint32, 32),
+ ]
+ def valid(self): return cmd_code==cmd1.NPU_SET_IFM_BASE2 and must_be_zero==0 and payload_size>=1 and payload_size<=2;
+ def get_cmd_code(self): return cmd_code
+ def set_cmd_code(self, value): cmd_code = value
+ def get_data(self): return data
+ def set_data(self, value): data = value
+ def get_payload_size(self): return payload_size
+ def set_payload_size(self, value): payload_size = value
+
+class npu_set_ifm_base3_t(Structure):
+ _fields_ = [
+ ("cmd_code", c_uint32, 10),
+ ("must_be_zero", c_uint32, 4),
+ ("payload_size", c_uint32, 2),
+ ("reserved0", c_uint32, 16),
+ ("data", c_uint32, 32),
+ ]
+ def valid(self): return cmd_code==cmd1.NPU_SET_IFM_BASE3 and must_be_zero==0 and payload_size>=1 and payload_size<=2;
+ def get_cmd_code(self): return cmd_code
+ def set_cmd_code(self, value): cmd_code = value
+ def get_data(self): return data
+ def set_data(self, value): data = value
+ def get_payload_size(self): return payload_size
+ def set_payload_size(self, value): payload_size = value
+
+class npu_set_ifm_stride_x_t(Structure):
+ _fields_ = [
+ ("cmd_code", c_uint32, 10),
+ ("must_be_zero", c_uint32, 4),
+ ("payload_size", c_uint32, 2),
+ ("reserved0", c_uint32, 16),
+ ("data", c_uint32, 32),
+ ]
+ def valid(self): return cmd_code==cmd1.NPU_SET_IFM_STRIDE_X and must_be_zero==0 and payload_size>=1 and payload_size<=2;
+ def get_cmd_code(self): return cmd_code
+ def set_cmd_code(self, value): cmd_code = value
+ def get_data(self): return data
+ def set_data(self, value): data = value
+ def get_payload_size(self): return payload_size
+ def set_payload_size(self, value): payload_size = value
+
+class npu_set_ifm_stride_y_t(Structure):
+ _fields_ = [
+ ("cmd_code", c_uint32, 10),
+ ("must_be_zero", c_uint32, 4),
+ ("payload_size", c_uint32, 2),
+ ("reserved0", c_uint32, 16),
+ ("data", c_uint32, 32),
+ ]
+ def valid(self): return cmd_code==cmd1.NPU_SET_IFM_STRIDE_Y and must_be_zero==0 and payload_size>=1 and payload_size<=2;
+ def get_cmd_code(self): return cmd_code
+ def set_cmd_code(self, value): cmd_code = value
+ def get_data(self): return data
+ def set_data(self, value): data = value
+ def get_payload_size(self): return payload_size
+ def set_payload_size(self, value): payload_size = value
+
+class npu_set_ifm_stride_c_t(Structure):
+ _fields_ = [
+ ("cmd_code", c_uint32, 10),
+ ("must_be_zero", c_uint32, 4),
+ ("payload_size", c_uint32, 2),
+ ("reserved0", c_uint32, 16),
+ ("data", c_uint32, 32),
+ ]
+ def valid(self): return cmd_code==cmd1.NPU_SET_IFM_STRIDE_C and must_be_zero==0 and payload_size>=1 and payload_size<=2;
+ def get_cmd_code(self): return cmd_code
+ def set_cmd_code(self, value): cmd_code = value
+ def get_data(self): return data
+ def set_data(self, value): data = value
+ def get_payload_size(self): return payload_size
+ def set_payload_size(self, value): payload_size = value
+
+class npu_set_ofm_base0_t(Structure):
+ _fields_ = [
+ ("cmd_code", c_uint32, 10),
+ ("must_be_zero", c_uint32, 4),
+ ("payload_size", c_uint32, 2),
+ ("reserved0", c_uint32, 16),
+ ("data", c_uint32, 32),
+ ]
+ def valid(self): return cmd_code==cmd1.NPU_SET_OFM_BASE0 and must_be_zero==0 and payload_size>=1 and payload_size<=2;
+ def get_cmd_code(self): return cmd_code
+ def set_cmd_code(self, value): cmd_code = value
+ def get_data(self): return data
+ def set_data(self, value): data = value
+ def get_payload_size(self): return payload_size
+ def set_payload_size(self, value): payload_size = value
+
+class npu_set_ofm_base1_t(Structure):
+ _fields_ = [
+ ("cmd_code", c_uint32, 10),
+ ("must_be_zero", c_uint32, 4),
+ ("payload_size", c_uint32, 2),
+ ("reserved0", c_uint32, 16),
+ ("data", c_uint32, 32),
+ ]
+ def valid(self): return cmd_code==cmd1.NPU_SET_OFM_BASE1 and must_be_zero==0 and payload_size>=1 and payload_size<=2;
+ def get_cmd_code(self): return cmd_code
+ def set_cmd_code(self, value): cmd_code = value
+ def get_data(self): return data
+ def set_data(self, value): data = value
+ def get_payload_size(self): return payload_size
+ def set_payload_size(self, value): payload_size = value
+
+class npu_set_ofm_base2_t(Structure):
+ _fields_ = [
+ ("cmd_code", c_uint32, 10),
+ ("must_be_zero", c_uint32, 4),
+ ("payload_size", c_uint32, 2),
+ ("reserved0", c_uint32, 16),
+ ("data", c_uint32, 32),
+ ]
+ def valid(self): return cmd_code==cmd1.NPU_SET_OFM_BASE2 and must_be_zero==0 and payload_size>=1 and payload_size<=2;
+ def get_cmd_code(self): return cmd_code
+ def set_cmd_code(self, value): cmd_code = value
+ def get_data(self): return data
+ def set_data(self, value): data = value
+ def get_payload_size(self): return payload_size
+ def set_payload_size(self, value): payload_size = value
+
+class npu_set_ofm_base3_t(Structure):
+ _fields_ = [
+ ("cmd_code", c_uint32, 10),
+ ("must_be_zero", c_uint32, 4),
+ ("payload_size", c_uint32, 2),
+ ("reserved0", c_uint32, 16),
+ ("data", c_uint32, 32),
+ ]
+ def valid(self): return cmd_code==cmd1.NPU_SET_OFM_BASE3 and must_be_zero==0 and payload_size>=1 and payload_size<=2;
+ def get_cmd_code(self): return cmd_code
+ def set_cmd_code(self, value): cmd_code = value
+ def get_data(self): return data
+ def set_data(self, value): data = value
+ def get_payload_size(self): return payload_size
+ def set_payload_size(self, value): payload_size = value
+
+class npu_set_ofm_stride_x_t(Structure):
+ _fields_ = [
+ ("cmd_code", c_uint32, 10),
+ ("must_be_zero", c_uint32, 4),
+ ("payload_size", c_uint32, 2),
+ ("reserved0", c_uint32, 16),
+ ("data", c_uint32, 32),
+ ]
+ def valid(self): return cmd_code==cmd1.NPU_SET_OFM_STRIDE_X and must_be_zero==0 and payload_size>=1 and payload_size<=2;
+ def get_cmd_code(self): return cmd_code
+ def set_cmd_code(self, value): cmd_code = value
+ def get_data(self): return data
+ def set_data(self, value): data = value
+ def get_payload_size(self): return payload_size
+ def set_payload_size(self, value): payload_size = value
+
+class npu_set_ofm_stride_y_t(Structure):
+ _fields_ = [
+ ("cmd_code", c_uint32, 10),
+ ("must_be_zero", c_uint32, 4),
+ ("payload_size", c_uint32, 2),
+ ("reserved0", c_uint32, 16),
+ ("data", c_uint32, 32),
+ ]
+ def valid(self): return cmd_code==cmd1.NPU_SET_OFM_STRIDE_Y and must_be_zero==0 and payload_size>=1 and payload_size<=2;
+ def get_cmd_code(self): return cmd_code
+ def set_cmd_code(self, value): cmd_code = value
+ def get_data(self): return data
+ def set_data(self, value): data = value
+ def get_payload_size(self): return payload_size
+ def set_payload_size(self, value): payload_size = value
+
+class npu_set_ofm_stride_c_t(Structure):
+ _fields_ = [
+ ("cmd_code", c_uint32, 10),
+ ("must_be_zero", c_uint32, 4),
+ ("payload_size", c_uint32, 2),
+ ("reserved0", c_uint32, 16),
+ ("data", c_uint32, 32),
+ ]
+ def valid(self): return cmd_code==cmd1.NPU_SET_OFM_STRIDE_C and must_be_zero==0 and payload_size>=1 and payload_size<=2;
+ def get_cmd_code(self): return cmd_code
+ def set_cmd_code(self, value): cmd_code = value
+ def get_data(self): return data
+ def set_data(self, value): data = value
+ def get_payload_size(self): return payload_size
+ def set_payload_size(self, value): payload_size = value
+
+class npu_set_weight_base_t(Structure):
+ _fields_ = [
+ ("cmd_code", c_uint32, 10),
+ ("must_be_zero", c_uint32, 4),
+ ("payload_size", c_uint32, 2),
+ ("reserved0", c_uint32, 16),
+ ("data", c_uint32, 32),
+ ]
+ def valid(self): return cmd_code==cmd1.NPU_SET_WEIGHT_BASE and must_be_zero==0 and payload_size>=1 and payload_size<=2;
+ def get_cmd_code(self): return cmd_code
+ def set_cmd_code(self, value): cmd_code = value
+ def get_data(self): return data
+ def set_data(self, value): data = value
+ def get_payload_size(self): return payload_size
+ def set_payload_size(self, value): payload_size = value
+
+class npu_set_weight_length_t(Structure):
+ _fields_ = [
+ ("cmd_code", c_uint32, 10),
+ ("must_be_zero", c_uint32, 4),
+ ("payload_size", c_uint32, 2),
+ ("reserved0", c_uint32, 16),
+ ("data", c_uint32, 32),
+ ]
+ def valid(self): return cmd_code==cmd1.NPU_SET_WEIGHT_LENGTH and must_be_zero==0 and payload_size>=1 and payload_size<=2;
+ def get_cmd_code(self): return cmd_code
+ def set_cmd_code(self, value): cmd_code = value
+ def get_data(self): return data
+ def set_data(self, value): data = value
+ def get_payload_size(self): return payload_size
+ def set_payload_size(self, value): payload_size = value
+
+class npu_set_scale_base_t(Structure):
+ _fields_ = [
+ ("cmd_code", c_uint32, 10),
+ ("must_be_zero", c_uint32, 4),
+ ("payload_size", c_uint32, 2),
+ ("reserved0", c_uint32, 16),
+ ("data", c_uint32, 32),
+ ]
+ def valid(self): return cmd_code==cmd1.NPU_SET_SCALE_BASE and must_be_zero==0 and payload_size>=1 and payload_size<=2;
+ def get_cmd_code(self): return cmd_code
+ def set_cmd_code(self, value): cmd_code = value
+ def get_data(self): return data
+ def set_data(self, value): data = value
+ def get_payload_size(self): return payload_size
+ def set_payload_size(self, value): payload_size = value
+
+class npu_set_scale_length_t(Structure):
+ _fields_ = [
+ ("cmd_code", c_uint32, 10),
+ ("must_be_zero", c_uint32, 4),
+ ("payload_size", c_uint32, 2),
+ ("reserved0", c_uint32, 16),
+ ("data", c_uint32, 32),
+ ]
+ def valid(self): return cmd_code==cmd1.NPU_SET_SCALE_LENGTH and must_be_zero==0 and payload_size>=1 and payload_size<=2;
+ def get_cmd_code(self): return cmd_code
+ def set_cmd_code(self, value): cmd_code = value
+ def get_data(self): return data
+ def set_data(self, value): data = value
+ def get_payload_size(self): return payload_size
+ def set_payload_size(self, value): payload_size = value
+
+class npu_set_ofm_scale_t(Structure):
+ _fields_ = [
+ ("cmd_code", c_uint32, 10),
+ ("must_be_zero", c_uint32, 4),
+ ("payload_size", c_uint32, 2),
+ ("shift", c_uint32, 16),
+ ("data", c_uint32, 32),
+ ]
+ def valid(self): return cmd_code==cmd1.NPU_SET_OFM_SCALE and must_be_zero==0 and payload_size>=1 and payload_size<=2;
+ def get_cmd_code(self): return cmd_code
+ def set_cmd_code(self, value): cmd_code = value
+ def get_data(self): return data
+ def set_data(self, value): data = value
+ def get_payload_size(self): return payload_size
+ def set_payload_size(self, value): payload_size = value
+ def get_shift(self): return shift
+ def set_shift(self, value): shift = value
+
+class npu_set_opa_scale_t(Structure):
+ _fields_ = [
+ ("cmd_code", c_uint32, 10),
+ ("must_be_zero", c_uint32, 4),
+ ("payload_size", c_uint32, 2),
+ ("shift", c_uint32, 16),
+ ("data", c_uint32, 32),
+ ]
+ def valid(self): return cmd_code==cmd1.NPU_SET_OPA_SCALE and must_be_zero==0 and payload_size>=1 and payload_size<=2;
+ def get_cmd_code(self): return cmd_code
+ def set_cmd_code(self, value): cmd_code = value
+ def get_data(self): return data
+ def set_data(self, value): data = value
+ def get_payload_size(self): return payload_size
+ def set_payload_size(self, value): payload_size = value
+ def get_shift(self): return shift
+ def set_shift(self, value): shift = value
+
+class npu_set_opb_scale_t(Structure):
+ _fields_ = [
+ ("cmd_code", c_uint32, 10),
+ ("must_be_zero", c_uint32, 4),
+ ("payload_size", c_uint32, 2),
+ ("reserved0", c_uint32, 16),
+ ("data", c_uint32, 32),
+ ]
+ def valid(self): return cmd_code==cmd1.NPU_SET_OPB_SCALE and must_be_zero==0 and payload_size>=1 and payload_size<=2;
+ def get_cmd_code(self): return cmd_code
+ def set_cmd_code(self, value): cmd_code = value
+ def get_data(self): return data
+ def set_data(self, value): data = value
+ def get_payload_size(self): return payload_size
+ def set_payload_size(self, value): payload_size = value
+
+class npu_set_dma0_src_t(Structure):
+ _fields_ = [
+ ("cmd_code", c_uint32, 10),
+ ("must_be_zero", c_uint32, 4),
+ ("payload_size", c_uint32, 2),
+ ("reserved0", c_uint32, 16),
+ ("data", c_uint32, 32),
+ ]
+ def valid(self): return cmd_code==cmd1.NPU_SET_DMA0_SRC and must_be_zero==0 and payload_size>=1 and payload_size<=2;
+ def get_cmd_code(self): return cmd_code
+ def set_cmd_code(self, value): cmd_code = value
+ def get_data(self): return data
+ def set_data(self, value): data = value
+ def get_payload_size(self): return payload_size
+ def set_payload_size(self, value): payload_size = value
+
+class npu_set_dma0_dst_t(Structure):
+ _fields_ = [
+ ("cmd_code", c_uint32, 10),
+ ("must_be_zero", c_uint32, 4),
+ ("payload_size", c_uint32, 2),
+ ("reserved0", c_uint32, 16),
+ ("data", c_uint32, 32),
+ ]
+ def valid(self): return cmd_code==cmd1.NPU_SET_DMA0_DST and must_be_zero==0 and payload_size>=1 and payload_size<=2;
+ def get_cmd_code(self): return cmd_code
+ def set_cmd_code(self, value): cmd_code = value
+ def get_data(self): return data
+ def set_data(self, value): data = value
+ def get_payload_size(self): return payload_size
+ def set_payload_size(self, value): payload_size = value
+
+class npu_set_dma0_len_t(Structure):
+ _fields_ = [
+ ("cmd_code", c_uint32, 10),
+ ("must_be_zero", c_uint32, 4),
+ ("payload_size", c_uint32, 2),
+ ("reserved0", c_uint32, 16),
+ ("data", c_uint32, 32),
+ ]
+ def valid(self): return cmd_code==cmd1.NPU_SET_DMA0_LEN and must_be_zero==0 and payload_size>=1 and payload_size<=2;
+ def get_cmd_code(self): return cmd_code
+ def set_cmd_code(self, value): cmd_code = value
+ def get_data(self): return data
+ def set_data(self, value): data = value
+ def get_payload_size(self): return payload_size
+ def set_payload_size(self, value): payload_size = value
+
+class npu_set_dma0_skip0_t(Structure):
+ _fields_ = [
+ ("cmd_code", c_uint32, 10),
+ ("must_be_zero", c_uint32, 4),
+ ("payload_size", c_uint32, 2),
+ ("param", c_uint32, 16),
+ ("data", c_uint32, 32),
+ ]
+ def valid(self): return cmd_code==cmd1.NPU_SET_DMA0_SKIP0 and must_be_zero==0 and payload_size>=1 and payload_size<=2;
+ def get_cmd_code(self): return cmd_code
+ def set_cmd_code(self, value): cmd_code = value
+ def get_data(self): return data
+ def set_data(self, value): data = value
+ def get_param(self): return param
+ def set_param(self, value): param = value
+ def get_payload_size(self): return payload_size
+ def set_payload_size(self, value): payload_size = value
+
+class npu_set_dma0_skip1_t(Structure):
+ _fields_ = [
+ ("cmd_code", c_uint32, 10),
+ ("must_be_zero", c_uint32, 4),
+ ("payload_size", c_uint32, 2),
+ ("param", c_uint32, 16),
+ ("data", c_uint32, 32),
+ ]
+ def valid(self): return cmd_code==cmd1.NPU_SET_DMA0_SKIP1 and must_be_zero==0 and payload_size>=1 and payload_size<=2;
+ def get_cmd_code(self): return cmd_code
+ def set_cmd_code(self, value): cmd_code = value
+ def get_data(self): return data
+ def set_data(self, value): data = value
+ def get_param(self): return param
+ def set_param(self, value): param = value
+ def get_payload_size(self): return payload_size
+ def set_payload_size(self, value): payload_size = value
+
+class npu_set_ifm2_base0_t(Structure):
+ _fields_ = [
+ ("cmd_code", c_uint32, 10),
+ ("must_be_zero", c_uint32, 4),
+ ("payload_size", c_uint32, 2),
+ ("reserved0", c_uint32, 16),
+ ("data", c_uint32, 32),
+ ]
+ def valid(self): return cmd_code==cmd1.NPU_SET_IFM2_BASE0 and must_be_zero==0 and payload_size>=1 and payload_size<=2;
+ def get_cmd_code(self): return cmd_code
+ def set_cmd_code(self, value): cmd_code = value
+ def get_data(self): return data
+ def set_data(self, value): data = value
+ def get_payload_size(self): return payload_size
+ def set_payload_size(self, value): payload_size = value
+
+class npu_set_ifm2_base1_t(Structure):
+ _fields_ = [
+ ("cmd_code", c_uint32, 10),
+ ("must_be_zero", c_uint32, 4),
+ ("payload_size", c_uint32, 2),
+ ("reserved0", c_uint32, 16),
+ ("data", c_uint32, 32),
+ ]
+ def valid(self): return cmd_code==cmd1.NPU_SET_IFM2_BASE1 and must_be_zero==0 and payload_size>=1 and payload_size<=2;
+ def get_cmd_code(self): return cmd_code
+ def set_cmd_code(self, value): cmd_code = value
+ def get_data(self): return data
+ def set_data(self, value): data = value
+ def get_payload_size(self): return payload_size
+ def set_payload_size(self, value): payload_size = value
+
+class npu_set_ifm2_base2_t(Structure):
+ _fields_ = [
+ ("cmd_code", c_uint32, 10),
+ ("must_be_zero", c_uint32, 4),
+ ("payload_size", c_uint32, 2),
+ ("reserved0", c_uint32, 16),
+ ("data", c_uint32, 32),
+ ]
+ def valid(self): return cmd_code==cmd1.NPU_SET_IFM2_BASE2 and must_be_zero==0 and payload_size>=1 and payload_size<=2;
+ def get_cmd_code(self): return cmd_code
+ def set_cmd_code(self, value): cmd_code = value
+ def get_data(self): return data
+ def set_data(self, value): data = value
+ def get_payload_size(self): return payload_size
+ def set_payload_size(self, value): payload_size = value
+
+class npu_set_ifm2_base3_t(Structure):
+ _fields_ = [
+ ("cmd_code", c_uint32, 10),
+ ("must_be_zero", c_uint32, 4),
+ ("payload_size", c_uint32, 2),
+ ("reserved0", c_uint32, 16),
+ ("data", c_uint32, 32),
+ ]
+ def valid(self): return cmd_code==cmd1.NPU_SET_IFM2_BASE3 and must_be_zero==0 and payload_size>=1 and payload_size<=2;
+ def get_cmd_code(self): return cmd_code
+ def set_cmd_code(self, value): cmd_code = value
+ def get_data(self): return data
+ def set_data(self, value): data = value
+ def get_payload_size(self): return payload_size
+ def set_payload_size(self, value): payload_size = value
+
+class npu_set_ifm2_stride_x_t(Structure):
+ _fields_ = [
+ ("cmd_code", c_uint32, 10),
+ ("must_be_zero", c_uint32, 4),
+ ("payload_size", c_uint32, 2),
+ ("reserved0", c_uint32, 16),
+ ("data", c_uint32, 32),
+ ]
+ def valid(self): return cmd_code==cmd1.NPU_SET_IFM2_STRIDE_X and must_be_zero==0 and payload_size>=1 and payload_size<=2;
+ def get_cmd_code(self): return cmd_code
+ def set_cmd_code(self, value): cmd_code = value
+ def get_data(self): return data
+ def set_data(self, value): data = value
+ def get_payload_size(self): return payload_size
+ def set_payload_size(self, value): payload_size = value
+
+class npu_set_ifm2_stride_y_t(Structure):
+ _fields_ = [
+ ("cmd_code", c_uint32, 10),
+ ("must_be_zero", c_uint32, 4),
+ ("payload_size", c_uint32, 2),
+ ("reserved0", c_uint32, 16),
+ ("data", c_uint32, 32),
+ ]
+ def valid(self): return cmd_code==cmd1.NPU_SET_IFM2_STRIDE_Y and must_be_zero==0 and payload_size>=1 and payload_size<=2;
+ def get_cmd_code(self): return cmd_code
+ def set_cmd_code(self, value): cmd_code = value
+ def get_data(self): return data
+ def set_data(self, value): data = value
+ def get_payload_size(self): return payload_size
+ def set_payload_size(self, value): payload_size = value
+
+class npu_set_ifm2_stride_c_t(Structure):
+ _fields_ = [
+ ("cmd_code", c_uint32, 10),
+ ("must_be_zero", c_uint32, 4),
+ ("payload_size", c_uint32, 2),
+ ("reserved0", c_uint32, 16),
+ ("data", c_uint32, 32),
+ ]
+ def valid(self): return cmd_code==cmd1.NPU_SET_IFM2_STRIDE_C and must_be_zero==0 and payload_size>=1 and payload_size<=2;
+ def get_cmd_code(self): return cmd_code
+ def set_cmd_code(self, value): cmd_code = value
+ def get_data(self): return data
+ def set_data(self, value): data = value
+ def get_payload_size(self): return payload_size
+ def set_payload_size(self, value): payload_size = value
+
+class npu_set_weight1_base_t(Structure):
+ _fields_ = [
+ ("cmd_code", c_uint32, 10),
+ ("must_be_zero", c_uint32, 4),
+ ("payload_size", c_uint32, 2),
+ ("param", c_uint32, 16),
+ ("data", c_uint32, 32),
+ ]
+ def valid(self): return cmd_code==cmd1.NPU_SET_WEIGHT1_BASE and must_be_zero==0 and payload_size>=1 and payload_size<=2;
+ def get_cmd_code(self): return cmd_code
+ def set_cmd_code(self, value): cmd_code = value
+ def get_data(self): return data
+ def set_data(self, value): data = value
+ def get_param(self): return param
+ def set_param(self, value): param = value
+ def get_payload_size(self): return payload_size
+ def set_payload_size(self, value): payload_size = value
+
+class npu_set_weight1_length_t(Structure):
+ _fields_ = [
+ ("cmd_code", c_uint32, 10),
+ ("must_be_zero", c_uint32, 4),
+ ("payload_size", c_uint32, 2),
+ ("reserved0", c_uint32, 16),
+ ("data", c_uint32, 32),
+ ]
+ def valid(self): return cmd_code==cmd1.NPU_SET_WEIGHT1_LENGTH and must_be_zero==0 and payload_size>=1 and payload_size<=2;
+ def get_cmd_code(self): return cmd_code
+ def set_cmd_code(self, value): cmd_code = value
+ def get_data(self): return data
+ def set_data(self, value): data = value
+ def get_payload_size(self): return payload_size
+ def set_payload_size(self, value): payload_size = value
+
+class npu_set_scale1_base_t(Structure):
+ _fields_ = [
+ ("cmd_code", c_uint32, 10),
+ ("must_be_zero", c_uint32, 4),
+ ("payload_size", c_uint32, 2),
+ ("param", c_uint32, 16),
+ ("data", c_uint32, 32),
+ ]
+ def valid(self): return cmd_code==cmd1.NPU_SET_SCALE1_BASE and must_be_zero==0 and payload_size>=1 and payload_size<=2;
+ def get_cmd_code(self): return cmd_code
+ def set_cmd_code(self, value): cmd_code = value
+ def get_data(self): return data
+ def set_data(self, value): data = value
+ def get_param(self): return param
+ def set_param(self, value): param = value
+ def get_payload_size(self): return payload_size
+ def set_payload_size(self, value): payload_size = value
+
+class npu_set_scale1_length_t(Structure):
+ _fields_ = [
+ ("cmd_code", c_uint32, 10),
+ ("must_be_zero", c_uint32, 4),
+ ("payload_size", c_uint32, 2),
+ ("reserved0", c_uint32, 16),
+ ("data", c_uint32, 32),
+ ]
+ def valid(self): return cmd_code==cmd1.NPU_SET_SCALE1_LENGTH and must_be_zero==0 and payload_size>=1 and payload_size<=2;
+ def get_cmd_code(self): return cmd_code
+ def set_cmd_code(self, value): cmd_code = value
+ def get_data(self): return data
+ def set_data(self, value): data = value
+ def get_payload_size(self): return payload_size
+ def set_payload_size(self, value): payload_size = value
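+
+# Illustrative sketch (not part of the generated register definitions): decoding a raw
+# 32-bit cmd0 command word into one of the bit-field structures above. It assumes a
+# little-endian host, matching the byte order of the NPU command stream, and uses
+# npu_set_ifm_zero_point_t purely as a representative structure; fields are read and
+# written through normal attribute access on the ctypes Structure instance.
+def _decode_ifm_zero_point_example(raw_word):
+    cmd = npu_set_ifm_zero_point_t.from_buffer_copy(raw_word.to_bytes(4, "little"))
+    if cmd.cmd_code == cmd0.NPU_SET_IFM_ZERO_POINT and cmd.must_be_zero0 == 0:
+        return cmd.param
+    return None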
diff --git a/ethosu/vela/extract_npu_subgraphs.py b/ethosu/vela/extract_npu_subgraphs.py
new file mode 100644
index 00000000..5b9ba8b0
--- /dev/null
+++ b/ethosu/vela/extract_npu_subgraphs.py
@@ -0,0 +1,253 @@
+# Copyright (C) 2020 Arm Limited or its affiliates. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the License); you may
+# not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an AS IS BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+# Description:
+# Vela separates CPU operations and NPU operations into separate internal subgraphs. The CPU operations are left
+# untouched in the final output.
+#
+# Vela does this by identifying NPU passes and pulling them out from the main CPU graph into separate subgraphs, invoked
+# by NpuOp operations. Later, Vela generates command streams and compressed weight streams for the NPU subgraphs and
+# attaches them to the NpuOp. This encapsulates everything the NPU subgraph is supposed to do.
+
+from .nn_graph import Pass, PassPlacement, NpuBlockType, Subgraph
+from .operation import Operation
+import numpy as np
+
+
+def make_npu_call_op_pass(npu_subgraph):
+ op = Operation("NpuOp", "call_" + npu_subgraph.name)
+ op.attrs["subgraph"] = npu_subgraph
+ ps = Pass(op.name, PassPlacement.MemoryOnly, False, NpuBlockType.Default)
+ ps.ops = [op]
+ ps.primary_op = op
+ op.attrs["npu_block_type"] = ps.npu_block_type
+ op.scheduled_pass = ps
+
+ # Inputs and outputs filled in later as we cut the graphs
+ return ps
+
+
+def switch_tensor_for_op(op, orig_tens, new_tens):
+
+ op.inputs = [new_tens if tens == orig_tens else tens for tens in op.inputs]
+ op.outputs = [new_tens if tens == orig_tens else tens for tens in op.outputs]
+
+ ps = op.scheduled_pass
+ if ps is None:
+ return
+
+ ps.inputs = [new_tens if tens == orig_tens else tens for tens in ps.inputs]
+ ps.outputs = [new_tens if tens == orig_tens else tens for tens in ps.outputs]
+
+ if ps.ifm_tensor == orig_tens:
+ ps.ifm_tensor = new_tens
+ if ps.ifm2_tensor == orig_tens:
+ ps.ifm2_tensor = new_tens
+ if ps.ofm_tensor == orig_tens:
+ ps.ofm_tensor = new_tens
+ if ps.weight_tensor == orig_tens:
+ ps.weight_tensor = new_tens
+ if ps.scale_tensor == orig_tens:
+ ps.scale_tensor = new_tens
+
+
+def rewrite_tensor_cpu_producer_npu_consumers(
+ orig_tens, call_ps, startup_init_ps, npu_subgraph, cpu_subgraph, subgraph_for_pass
+):
+ is_const = orig_tens.ops[0].type == "Const"
+
+ new_tens = orig_tens.clone("_npu")
+ orig_tens.npu_tensor = new_tens
+ new_tens.cpu_tensor = orig_tens
+
+ op_type = "SubgraphInput"
+ if is_const:
+ op_type = "Const"
+ op = Operation(op_type, orig_tens.name + "_input")
+ op.attrs["npu_block_type"] = NpuBlockType.Default
+ op.outputs = [new_tens]
+ op.scheduled_pass = startup_init_ps
+ new_tens.ops = [op]
+ startup_init_ps.ops.append(op)
+ startup_init_ps.outputs.append(new_tens)
+
+ if not is_const:
+ call_ps.inputs.append(orig_tens)
+ call_ps.primary_op.inputs.append(orig_tens)
+
+ for op in list(orig_tens.consumers()):
+ if op is None:
+ continue # Subgraph consumers handled separately.
+ ps = op.scheduled_pass
+ if subgraph_for_pass[ps] == npu_subgraph:
+ switch_tensor_for_op(op, orig_tens, new_tens)
+ orig_tens.consumer_list.remove(op)
+ new_tens.consumer_list.append(op)
+
+ # Deal with output tensors for the NPU graph. These are special: the NPU subgraph's output list must now refer to the NPU-side clone.
+ npu_subgraph.output_tensors = [new_tens if tens == orig_tens else tens for tens in npu_subgraph.output_tensors]
+
+
+def rewrite_tensor_npu_producer_cpu_consumers(
+ orig_tens, call_ps, startup_init_ps, npu_subgraph, cpu_subgraph, subgraph_for_pass
+):
+
+ new_tens = orig_tens.clone("_cpu")
+ new_tens.npu_tensor = orig_tens
+ orig_tens.cpu_tensor = new_tens
+
+ npu_subgraph.output_tensors.append(orig_tens)
+
+ call_ps.outputs.append(new_tens)
+ call_ps.primary_op.outputs.append(new_tens)
+ new_tens.ops = [call_ps.primary_op]
+
+ for op in list(orig_tens.consumers()):
+ if op is None:
+ continue # Subgraph consumers handled separately.
+ ps = op.scheduled_pass
+ if subgraph_for_pass[ps] != npu_subgraph:
+ switch_tensor_for_op(op, orig_tens, new_tens)
+ orig_tens.consumer_list.remove(op)
+ new_tens.consumer_list.append(op)
+
+ # Deal with output tensors for the CPU graph. These are special: the CPU graph's output list must now refer to the CPU-side clone.
+ cpu_subgraph.output_tensors = [new_tens if tens == orig_tens else tens for tens in cpu_subgraph.output_tensors]
+
+
+def extract_subgraph(nng, orig_sg, arch):
+ assert orig_sg.placement == PassPlacement.Cpu
+
+ passes = list(orig_sg.passes)
+ place_vec = np.array([ps.placement for ps in passes])
+ place_vec[
+ place_vec == PassPlacement.StartupInit
+ ] = PassPlacement.Cpu # Keep the startup init pass on the CPU; new startup init passes are created below for the NPU subgraphs.
+
+ # MemoryOnly passes that are either squeezed between NPU passes or on the boundary of NPU and CPU
+ # passes should be assigned to the NPU. A standalone sketch of this two-way sweep follows this function.
+
+ # Forward, then backwards
+ for is_reversed in range(2):
+ last_place = PassPlacement.Cpu
+ seq = enumerate(place_vec)
+ if is_reversed:
+ seq = reversed(list(seq))
+ for idx, place in seq:
+ if place == PassPlacement.MemoryOnly:
+ if last_place == PassPlacement.Npu:
+ place = PassPlacement.Npu
+ place_vec[idx] = place
+
+ if place != PassPlacement.MemoryOnly:
+ last_place = place
+
+ # Anything left, assign to the CPU.
+ place_vec[place_vec == PassPlacement.MemoryOnly] = PassPlacement.Cpu
+
+ if np.all(place_vec == PassPlacement.Cpu):
+ return [] # Nothing to do
+
+ # Create the subgraphs and split passes between them
+
+ new_subgraphs = []
+ split_count = 0
+ subgraph_for_pass = {}
+ orig_sg.passes = []
+ call_pass = {}
+ startup_init_passes = {}
+
+ last_place = PassPlacement.Cpu
+ curr_sg = orig_sg
+
+ for idx, place in enumerate(place_vec):
+ if place != last_place:
+ if place == PassPlacement.Npu:
+ split_count += 1
+ curr_sg = Subgraph("%s_split_%d" % (orig_sg.name, split_count), PassPlacement.Npu)
+ new_subgraphs.append(curr_sg)
+ call_ps = make_npu_call_op_pass(curr_sg)
+ subgraph_for_pass[call_ps] = orig_sg
+ orig_sg.passes.append(call_ps)
+ call_pass[curr_sg] = call_ps
+
+ startup_init_ps = Pass(
+ curr_sg.name + "_startup_init", PassPlacement.StartupInit, False, NpuBlockType.Default
+ )
+ curr_sg.passes.append(startup_init_ps)
+ startup_init_passes[curr_sg] = startup_init_ps
+ subgraph_for_pass[startup_init_ps] = curr_sg
+
+ else:
+ curr_sg = orig_sg
+ last_place = place
+ ps = passes[idx]
+ subgraph_for_pass[ps] = curr_sg
+ curr_sg.passes.append(ps)
+
+ # Rewrite tensors to fix up graphs.
+
+ for curr_sg in new_subgraphs:
+ for ps in curr_sg.passes:
+ for tens in ps.inputs:
+ source_sgs = [subgraph_for_pass[op.scheduled_pass] for op in tens.ops]
+ assert len(source_sgs) > 0 # every input tensor of an NPU pass must have at least one producer
+ producer_sg = source_sgs[0]
+ for sg in source_sgs:
+ assert sg == producer_sg # All need to be the same.
+
+ if producer_sg != curr_sg:
+ assert (
+ producer_sg == orig_sg
+ ) # Because passes are processed in order, the producing subgraph must be the original CPU graph.
+ rewrite_tensor_cpu_producer_npu_consumers(
+ tens, call_pass[curr_sg], startup_init_passes[curr_sg], curr_sg, orig_sg, subgraph_for_pass
+ )
+
+ for tens in ps.outputs:
+
+ dest_sgs = [subgraph_for_pass[op.scheduled_pass] for op in tens.consumers() if op is not None]
+ need_rewrite = False
+ for sg in dest_sgs:
+ if sg != curr_sg:
+ need_rewrite = True
+ break
+ if tens in orig_sg.output_tensors:
+ need_rewrite = True
+
+ if need_rewrite:
+ rewrite_tensor_npu_producer_cpu_consumers(
+ tens, call_pass[curr_sg], startup_init_passes[curr_sg], curr_sg, orig_sg, subgraph_for_pass
+ )
+
+ return new_subgraphs
+
+
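+# Illustrative sketch (not part of the original module): the forward/backward sweep used in
+# extract_subgraph() above, shown on a plain list of placements so the reassignment of
+# MemoryOnly passes is easier to follow in isolation.
+# e.g. [Cpu, Npu, MemoryOnly, Cpu] becomes [Cpu, Npu, Npu, Cpu]
+def _sweep_memory_only_example(placements):
+    result = list(placements)
+    for is_reversed in range(2):
+        last_place = PassPlacement.Cpu
+        seq = enumerate(result)
+        if is_reversed:
+            seq = reversed(list(seq))
+        for idx, place in seq:
+            # A MemoryOnly pass adjacent to an NPU pass is pulled onto the NPU
+            if place == PassPlacement.MemoryOnly and last_place == PassPlacement.Npu:
+                result[idx] = PassPlacement.Npu
+            if result[idx] != PassPlacement.MemoryOnly:
+                last_place = result[idx]
+    # Any MemoryOnly passes that are left fall back to the CPU
+    return [PassPlacement.Cpu if p == PassPlacement.MemoryOnly else p for p in result]
+
+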
+def extract_npu_subgraphs(nng, arch):
+
+ nng.refresh_after_modification()
+
+ for sg in list(nng.subgraphs):
+ if sg.placement == PassPlacement.Cpu:
+ new_subgraphs = extract_subgraph(nng, sg, arch)
+ nng.subgraphs += new_subgraphs
+
+ nng.refresh_after_modification()
+ nng.prune_startup_init_pass()
+
+ for sg in nng.subgraphs:
+ sg.build_pass_links()
diff --git a/ethosu/vela/graph_optimiser.py b/ethosu/vela/graph_optimiser.py
new file mode 100644
index 00000000..f0afcf8f
--- /dev/null
+++ b/ethosu/vela/graph_optimiser.py
@@ -0,0 +1,485 @@
+# Copyright (C) 2020 Arm Limited or its affiliates. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the License); you may
+# not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an AS IS BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+# Description:
+# Early optimisation of the network graph, using the rewrite_graph module to traverse the graph. The optimisations are
+# split into two parts, optimise_graph_a and optimise_graph_b.
+
+from .nn_graph import Operation, NpuBlockType, Tensor
+from . import rewrite_graph
+from .data_type import BaseType, DataType
+import numpy as np
+import math
+from .numeric_util import round_up_divide
+
+passthrough_nodes = set(("Identity",))
+
+
+def remove_passthrough_tensor(tens, arch):
+ if len(tens.ops) == 1 and tens.ops[0].type in passthrough_nodes:
+ assert len(tens.ops[0].inputs) == 1
+ tens = tens.ops[0].inputs[0]
+ return tens
+
+
+def rewrite_concat(tens, arch):
+ if len(tens.ops) == 1 and tens.ops[0].is_concat_op():
+ concat_op = tens.ops[0]
+ if tens != concat_op.outputs[0]:
+ return tens # don't attempt to rewrite the min/max outputs of QuantizedConcat
+
+ # Not supported so leave it and run on CPU
+ if not concat_op.run_on_npu:
+ return tens
+
+ inputs, axis = concat_op.get_concat_inputs_axis()
+
+ tens.ops = []
+ offset = 0
+ for idx, inp in enumerate(inputs):
+ new_op = Operation("ConcatSliceWrite", concat_op.name + str(idx))
+ new_op.inputs = [inp]
+ new_op.outputs = [tens]
+ new_op.attrs["concat_axis"] = axis
+ new_op.attrs["concat_start"] = offset
+ offset += inp.shape[axis]
+ new_op.attrs["concat_end"] = offset
+ new_op.run_on_npu = True
+ tens.ops.append(new_op)
+ assert tens.shape[axis] == offset
+
+ return tens
+
+
+def rewrite_split(tens, arch):
+
+ if len(tens.ops) == 1 and tens.ops[0].is_split_op():
+ split_op = tens.ops[0]
+
+ # Not supported so leave it and run on CPU
+ if not split_op.run_on_npu:
+ return tens
+
+ inp, outputs, axis, offset_start, offset_end = split_op.get_split_inputs_axis()
+
+ tens.ops = []
+ new_op = Operation("SplitSliceRead", split_op.name)
+ new_op.inputs = [inp]
+ new_op.outputs = [tens]
+
+ # For Split the offset cannot be extracted from the tensor, so it has to
+ # be calculated from the index of the output tensor.
+ if axis is not None:
+ # Get the start and end of the split
+ offset_start = [0] * len(tens.shape)
+ offset_end = [0] * len(tens.shape)
+ for out in outputs:
+ if out == tens:
+ break
+ offset_start[axis] += out.shape[axis]
+
+ offset_end[axis] = offset_start[axis] + tens.shape[axis]
+
+ new_op.attrs["split_start"] = offset_start
+ new_op.attrs["split_end"] = offset_end
+ new_op.run_on_npu = True
+ tens.ops.append(new_op)
+
+ return tens
+
+
+def needed_total_padding(input_size, stride, filter_size):
+ out_size = (input_size + stride - 1) // stride
+ needed_input = (out_size - 1) * stride + filter_size
+ total_padding = max(0, needed_input - input_size)
+ return total_padding
+
+
+def calc_padding_and_skirt(padding_type, kernel_size, stride, input_dims):
+ ypad = needed_total_padding(int(input_dims[1]), int(stride[1]), int(kernel_size[0]))
+ xpad = needed_total_padding(int(input_dims[2]), int(stride[2]), int(kernel_size[1]))
+ if padding_type == b"SAME":
+ left_pad = (xpad + 0) // 2
+ right_pad = (xpad + 1) // 2
+ top_pad = (ypad + 0) // 2
+ bottom_pad = (ypad + 1) // 2
+ elif padding_type == b"VALID":
+ left_pad = 0
+ right_pad = 0
+ top_pad = 0
+ bottom_pad = 0
+ else:
+ assert 0, "Unknown padding"
+ padding = (top_pad, left_pad, bottom_pad, right_pad)
+ skirt = (top_pad, left_pad, ypad - top_pad, xpad - left_pad)
+ return padding, skirt
+
+
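+# Illustrative sketch (not part of the optimiser passes, hypothetical shapes): a worked
+# example of the padding/skirt calculation above. With SAME padding, a 3x3 kernel and
+# stride 1 on a 1x8x8x16 NHWC input, both ypad and xpad come out as 2 and are split
+# evenly around the feature map.
+def _example_same_padding():
+    padding, skirt = calc_padding_and_skirt(b"SAME", (3, 3), (1, 1, 1, 1), (1, 8, 8, 16))
+    # padding == (1, 1, 1, 1)  -- (top, left, bottom, right)
+    # skirt == (1, 1, 1, 1)    -- (top, left, ypad - top, xpad - left)
+    return padding, skirt
+
+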
+def fixup_conv2d_backprop(op, arch):
+ if op.type == "Conv2DBackpropInput":
+ # flip the inputs
+ op.inputs[0], op.inputs[2] = op.inputs[2], op.inputs[0]
+ op.type = "Conv2DBackpropInputSwitched"
+
+ return op
+
+
+def fixup_fully_connected_input(op, arch):
+ if op.type == "FullyConnectedAct":
+ inp = op.inputs[0]
+ weights = op.inputs[1]
+
+ n_in_elems = weights.shape[-2]
+ elms = inp.elements()
+ batch_size = elms // n_in_elems
+ assert batch_size * n_in_elems == elms
+
+ desired_shape = [batch_size, n_in_elems]
+ if inp.shape != desired_shape:
+ # Shape mismatch: insert a reshape to fix this (see the sketch after this function).
+ reshape_name = op.name + "_reshape"
+ new_shape_tens = Tensor([1], DataType.int32, reshape_name + "_shape")
+ new_shape_tens.values = np.array(desired_shape)
+ new_shape_tens_const = Operation("Const", new_shape_tens.name + "_const")
+ new_shape_tens.ops = [new_shape_tens_const]
+ new_shape_tens_const.outputs = [new_shape_tens]
+
+ reshape_op = Operation("Reshape", reshape_name)
+ reshape_op.inputs = [inp, new_shape_tens]
+ reshape_op.attrs["new_shape"] = desired_shape
+ reshape_out = inp.clone("_reshaped")
+ reshape_out.shape = reshape_out.storage_shape = reshape_out.bandwidth_shape = desired_shape
+ reshape_out.ops = [reshape_op]
+ reshape_op.outputs = [reshape_out]
+
+ op.inputs[0] = reshape_out
+
+ return op
+
+
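+# Illustrative sketch (hypothetical shapes, not part of the optimiser passes): the batch
+# size inference used in fixup_fully_connected_input() above. An IFM holding 128 elements
+# feeding weights whose second-to-last dimension is 64 implies a batch size of 2, so the
+# IFM is reshaped to [2, 64] before the fully connected operation.
+def _example_fc_input_shape():
+    n_in_elems = 64   # weights.shape[-2]
+    elms = 128        # inp.elements()
+    batch_size = elms // n_in_elems
+    assert batch_size * n_in_elems == elms
+    return [batch_size, n_in_elems]   # [2, 64]
+
+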
+def fixup_pack_input(op, arch):
+ if op.type == "Pack":
+ # Pack is also referred to as Stack
+ # Requires the rewrite_concat function to be called on the op afterwards
+ axis = int(op.attrs["axis"])
+ desired_shape = op.inputs[0].shape[:axis] + [1] + op.inputs[0].shape[axis:]
+
+ # Construct 1 shape tensor to be used by all inserted reshape ops
+ new_shape_name = op.name + "_reshape_shape"
+ new_shape_tens = Tensor([1], DataType.int32, new_shape_name)
+ new_shape_tens.values = np.array(desired_shape)
+ new_shape_tens_const = Operation("Const", new_shape_tens.name + "_const")
+ new_shape_tens.ops = [new_shape_tens_const]
+ new_shape_tens_const.outputs = [new_shape_tens]
+
+ for idx, inp in enumerate(op.inputs):
+ reshape_name = op.name + str(idx) + "_reshape"
+ reshape_op = Operation("Reshape", reshape_name)
+ reshape_op.inputs = [inp, new_shape_tens]
+ reshape_op.attrs["new_shape"] = desired_shape
+ reshape_out = inp.clone("_reshaped")
+ reshape_out.shape = reshape_out.storage_shape = reshape_out.bandwidth_shape = desired_shape
+ reshape_out.ops = [reshape_op]
+ reshape_op.outputs = [reshape_out]
+
+ op.inputs[idx] = reshape_out
+
+ op.type = "PackReshaped"
+
+ return op
+
+
+def fixup_unpack_output(tens, arch):
+ op = tens.ops[0]
+ if op.type in set(("Unpack", "StridedSlice")):
+ # Unpack is also referred to as Unstack
+ # Requires the rewrite_split function to be called on the op afterwards
+ if op.type == "StridedSlice":
+ shrink_axis_mask = op.attrs["shrink_axis_mask"]
+ if shrink_axis_mask == 0:
+ # Equal Rank StridedSlice, no need to insert reshape
+ return tens
+
+ # Only allow shrinking one axis for now (the mask-to-axis mapping is sketched after this function)
+ assert shrink_axis_mask & (shrink_axis_mask - 1) == 0
+ assert len(tens.shape) == (len(op.inputs[0].shape) - 1)
+
+ axis = int(math.log2(shrink_axis_mask))
+ op.attrs["shrink_axis_mask"] = 0
+ else:
+ axis = int(op.attrs["axis"])
+ op.type = "UnpackReshaped"
+
+ desired_shape = tens.shape[:axis] + [1] + tens.shape[axis:]
+
+ # Construct 1 shape tensor to be used by all inserted reshape ops
+ new_shape_name = op.name + "_reshape_shape"
+ new_shape_tens = Tensor([1], DataType.int32, new_shape_name)
+ new_shape_tens.values = np.array(tens.shape)
+ new_shape_tens_const = Operation("Const", new_shape_tens.name + "_const")
+ new_shape_tens.ops = [new_shape_tens_const]
+ new_shape_tens_const.outputs = [new_shape_tens]
+
+ for idx, out_tens in enumerate(op.outputs):
+ reshape_name = op.name + str(idx) + "_reshape"
+ reshape_op = Operation("Reshape", reshape_name)
+ reshape_op.outputs = [out_tens]
+ reshape_in = out_tens.clone("_reshaped")
+ reshape_in.shape = reshape_in.storage_shape = reshape_in.bandwidth_shape = desired_shape
+ reshape_in.ops = [op]
+ out_tens.ops = [reshape_op]
+ reshape_op.inputs = [reshape_in, new_shape_tens]
+
+ op.outputs[idx] = reshape_in
+
+ return tens
+
+
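+# Illustrative sketch (hypothetical mask value): the StridedSlice handling above requires a
+# shrink_axis_mask with exactly one bit set, so the shrunk axis is simply the bit position,
+# recovered with int(math.log2(mask)).
+def _example_shrink_axis_mask():
+    shrink_axis_mask = 0b0100   # only axis 2 is shrunk
+    assert shrink_axis_mask & (shrink_axis_mask - 1) == 0
+    return int(math.log2(shrink_axis_mask))   # 2
+
+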
+def add_padding_fields(op, arch):
+ if "padding" in op.attrs:
+ if "Conv" in op.type:
+ kernel_size = op.inputs[1].shape[:2]
+ input_shape = op.inputs[0].shape
+ elif "Pool" in op.type:
+ kernel_size = op.attrs["ksize"][1:3]
+ input_shape = op.inputs[0].shape
+ elif op.type == "ExtractImagePatches":
+ kernel_size = op.attrs["ksizes"][1:3]
+ input_shape = op.inputs[0].shape
+ else:
+ assert 0, "Unknown operation that uses padding"
+
+ padding, skirt = calc_padding_and_skirt(op.attrs["padding"], kernel_size, op.attrs["strides"], input_shape)
+ op.attrs["explicit_padding"] = padding
+ op.attrs["skirt"] = skirt
+ return op
+
+
+conv_op = set(("Conv2D", "QuantizedConv2D", "Conv2DBackpropInputSwitched", "Conv2DBiasAct"))
+fc_op = set(
+ (
+ "MatMul",
+ "QuantizedMatMul",
+ "BlockLSTM",
+ "RnnAct",
+ "UnidirectionalSequenceRnnAct",
+ "BidirectionalSequenceRnnAct",
+ "LstmAct",
+ "UnidirectionalSequenceLstmAct",
+ "BidirectionalSequenceLstmAct",
+ "FullyConnectedAct",
+ )
+)
+depthwise_op = set(("DepthwiseConv2dNative", "DepthwiseConv2dBiasAct",))
+pool_op = set(("AvgPool", "MaxPool", "QuantizedAvgPool", "QuantizedMaxPool", "AvgPoolAct", "MaxPoolAct"))
+elementwise_op = set(("AddAct", "MulAct", "SubAct", "Maximum", "Minimum", "LeakyRelu", "Abs"))
+activation_ops = set(("Relu", "Relu6", "ReluN1To1", "Sigmoid", "Tanh"))
+memory_only_ops = set(("Reshape",))
+
+# Check if the op can be reordered: return the earliest preceding memory-only op it can be moved in front of, or None
+def get_prepend_op(op):
+ inp = op.inputs[0]
+ # The op should be reordered between prev_op and prep_op
+ prev_op = inp.ops[-1]
+ prep_op = None
+ while prev_op.type in memory_only_ops and len(prev_op.outputs) == 1 and len(prev_op.outputs[0].consumers()) == 1:
+ prep_op = prev_op
+ inp = prev_op.inputs[0]
+ prev_op = inp.ops[-1]
+ if prev_op is not None and len(prev_op.outputs) == 1 and len(prev_op.outputs[0].consumers()) == 1:
+ return prep_op
+
+ return None
+
+
+def mark_npu_block_type(op, arch):
+ npu_block_type = NpuBlockType.Default
+ if op.type in conv_op:
+ npu_block_type = NpuBlockType.ConvolutionMxN
+ elif op.type in fc_op:
+ npu_block_type = NpuBlockType.VectorProduct
+ elif op.type in depthwise_op:
+ npu_block_type = NpuBlockType.ConvolutionDepthWise
+ elif op.type in pool_op:
+ npu_block_type = NpuBlockType.Pooling
+ elif op.type in elementwise_op:
+ npu_block_type = NpuBlockType.ElementWise
+
+ op.attrs["npu_block_type"] = npu_block_type
+ return op
+
+
+def convert_depthwise_to_conv(op, arch):
+ # Depthwise is equivalent to a single conv2d if the ifm depth is 1 and
+ # the ofm depth equals the depth multiplier.
+ # If those conditions are true, then we can perform a simple
+ # switch of the operator type (and weight order)
+
+ if ("DepthwiseConv2d" in op.type) and (op.attrs["depth_multiplier"] != 1):
+ ifm_tensor = op.inputs[0]
+ weight_tensor = op.inputs[1]
+ ofm_tensor = op.outputs[0]
+ if (ifm_tensor.shape[3] == 1) and (ofm_tensor.shape[3] == op.attrs["depth_multiplier"]):
+ # Change op type to Conv2d
+ op.type = op.type.replace("DepthwiseConv2d", "Conv2D")
+ del op.attrs["channel_multiplier"]
+ del op.attrs["depth_multiplier"]
+
+ weight_tensor.quant_values = np.transpose(weight_tensor.quant_values, (0, 1, 3, 2))
+ weight_tensor.shape = weight_tensor.storage_shape = weight_tensor.bandwidth_shape = list(
+ weight_tensor.quant_values.shape
+ )
+ else:
+ print(
+ "Error: Unsupported DepthwiseConv2d with depth_multiplier = {0}, "
+ "ifm channels = {1}, ofm channels = {2}".format(
+ op.attrs["depth_multiplier"], ifm_tensor.shape[3], ofm_tensor.shape[3]
+ )
+ )
+ assert False
+ return op
+
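+
+# Minimal sketch (illustration only, assuming the kh x kw x C x M weight layout implied by the
+# transpose above; the helper name is hypothetical): with ifm depth C == 1 the depthwise filter
+# holds exactly M independent kernels, so swapping the last two axes only relabels the same values.
+def _example_depthwise_weight_reorder():
+    dw_weights = np.arange(3 * 3 * 1 * 4).reshape(3, 3, 1, 4)  # kh, kw, C=1, M=4
+    conv_weights = np.transpose(dw_weights, (0, 1, 3, 2))      # kh, kw, 4, 1
+    assert conv_weights.shape == (3, 3, 4, 1)
+    assert np.array_equal(conv_weights[..., 0], dw_weights[..., 0, :])
+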
+
+# Reorder the activation op if it comes after memory-only operations (e.g. Reshape)
+def fixup_act_reorder(op, arch):
+ if op.type in activation_ops:
+ prep_op = get_prepend_op(op)
+ if prep_op is not None:
+ act_op = op.clone("_reordered")
+ act_op.inputs = [prep_op.inputs[0]]
+ act_op_out = act_op.inputs[0].clone("_acted")
+ act_op_out.quantization = op.outputs[0].quantization.clone()
+ act_op_out.ops = [act_op]
+ act_op.outputs = [act_op_out]
+ prep_op.inputs[0] = act_op_out
+ prep_op.outputs[0].quantization = act_op_out.quantization.clone()
+
+ # Mark the op so that it will be removed as passthrough later on
+ op.type = "Identity"
+ return op
+
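+
+# Minimal sketch (illustration only; the helper name is hypothetical): the reorder above is safe
+# because an activation commutes with memory-only ops such as Reshape: applying it before or
+# after the reshape yields the same values.
+def _example_act_commutes_with_reshape():
+    x = np.array([-2.0, -1.0, 0.5, 3.0]).reshape(2, 2)
+    assert np.array_equal(np.maximum(x, 0.0).reshape(4), np.maximum(x.reshape(4), 0.0))
+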
+
+def convert_mul_max_to_abs_or_lrelu(op, arch):
+ """Whenever there is a subgraph with this topology:
+
+ Input X For X = -1 or X > 0
+ | \ / This subgraph can be replaced with either
+ | Mul an Abs (if X = -1) or a LeakyReLU (if X > 0)
+ | /
+ Max
+ """
+
+ if op.type == "Maximum":
+ # finds the Mul input(s) to the Max
+ muls = [i for i in op.inputs if i.ops[0].type == "MulAct"]
+ if len(muls) == 1:
+ mul = muls[0].ops[0]
+ elif len(muls) == 2:
+ # In the case both inputs are Muls, find the one with the same input as the Max
+ mul = [m for m in muls if len(set(op.inputs + m.ops[0].inputs)) == 1][0].ops[0]
+ else:
+ # No Mul inputs
+ return op
+
+ # make sure the Mul doesn't have any other consumers
+ if len(mul.outputs[0].consumers()) != 1:
+ return op
+ # make sure the Mul doesn't have a fused activation function
+ if mul.attrs["fused_activation_function"]:
+ return op
+
+ # finds the branched input that goes to both the Max and the Mul
+ shared = set(op.inputs) & set(mul.inputs)
+ if len(shared) == 1:
+ shared_in = shared.pop()
+ # find the constant scalar input to the Mul
+ const_tens = (set(mul.inputs) - {shared_in}).pop()
+ # check that it is a scalar
+ if const_tens.shape != []:
+ return op
+ const = const_tens.ops[0]
+ # check that it is a constant
+ if const.type != "Const":
+ return op
+ else:
+ return op
+
+ val = const.outputs[0].values
+ if val >= 0:
+ new_op = "LeakyRelu"
+ op.attrs["alpha"] = val
+ elif val == -1:
+ new_op = "Abs"
+ else:
+ return op
+
+ op.type = op.type.replace("Maximum", new_op)
+ op.name = op.name.replace("Maximum", new_op)
+ op.outputs[0].name = op.outputs[0].name.replace("Maximum", new_op)
+ op.inputs = [shared_in]
+ return op
+
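+
+# Minimal numeric sketch (illustration only; the helper name is hypothetical) of why the rewrite
+# above is sound: max(x, alpha * x) equals LeakyReLU(x) for 0 <= alpha < 1, and max(x, -x) equals abs(x).
+def _example_mul_max_rewrite(alpha=0.1):
+    x = np.array([-4.0, -0.5, 0.0, 2.0])
+    assert np.array_equal(np.maximum(x, alpha * x), np.where(x >= 0, x, alpha * x))
+    assert np.array_equal(np.maximum(x, -1.0 * x), np.abs(x))
+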
+
+def supported_operator_check(op, arch):
+ op.run_on_npu = arch.supported_operators.is_operator_supported(op)
+ return op
+
+
+def optimise_graph_a(nng, arch, verbose_graph=False):
+ if verbose_graph:
+ nng.print_graph()
+
+ op_rewrite_list = [
+ # mark block type and check if the operations are supported
+ mark_npu_block_type,
+ supported_operator_check,
+ # then do any rewrites of supported operators
+ convert_depthwise_to_conv,
+ fixup_fully_connected_input,
+ fixup_pack_input,
+ fixup_conv2d_backprop,
+ fixup_act_reorder,
+ add_padding_fields,
+ mark_npu_block_type,
+ # convert_mul_max_to_abs_or_lrelu # TODO: enable optimisation once quantisation issues are resolved
+ ]
+
+ for idx, sg in enumerate(nng.subgraphs):
+ # rewrite graph pass
+ nng.subgraphs[idx] = rewrite_graph.rewrite_graph_pre_order(
+ sg, arch, [fixup_unpack_output,], op_rewrite_list, rewrite_unsupported=False
+ )
+
+ for idx, sg in enumerate(nng.subgraphs):
+ # remove passthrough tensors
+ nng.subgraphs[idx] = rewrite_graph.rewrite_graph_pre_order(sg, arch, [remove_passthrough_tensor,], [])
+
+ if verbose_graph:
+ nng.print_graph()
+ return nng
+
+
+def optimise_graph_b(nng, arch, verbose_graph=False):
+ if verbose_graph:
+ nng.print_graph()
+
+ for idx, sg in enumerate(nng.subgraphs):
+ # combined rewrite graph pass
+ nng.subgraphs[idx] = rewrite_graph.rewrite_graph_pre_order(sg, arch, [rewrite_concat, rewrite_split,], [])
+
+ if verbose_graph:
+ nng.print_graph()
+ return nng
diff --git a/ethosu/vela/greedy_allocation.py b/ethosu/vela/greedy_allocation.py
new file mode 100644
index 00000000..6b3d2c1e
--- /dev/null
+++ b/ethosu/vela/greedy_allocation.py
@@ -0,0 +1,95 @@
+# Copyright (C) 2020 Arm Limited or its affiliates. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the License); you may
+# not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an AS IS BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+# Description:
+# Allocate tensor addresses using a greedy algorithm.
+
+from . import numeric_util
+
+
+class GreedyAllocator:
+ def __init__(self, nng, arch, live_ranges, mem_area):
+ self.nng = nng
+ self.arch = arch
+ self.mem_area = mem_area
+
+ self.live_ranges = live_ranges
+ self.memory_required = 0
+
+ self.current_allocs = []
+
+ def alloc(self, new_lr):
+ size = new_lr.size
+ current_top = 0
+ if self.current_allocs:
+ current_top = max(start_addr + lr.size for start_addr, lr in self.current_allocs)
+ best_offset = numeric_util.round_up(current_top, new_lr.get_alignment())
+ best_offset_fit = (1 << 64) - 1
+
+ current_offset = 0
+ for start_addr, lr in self.current_allocs:
+ aligned_current_offset = numeric_util.round_up(current_offset, new_lr.get_alignment())
+ if aligned_current_offset + size <= start_addr and start_addr - current_offset < best_offset_fit:
+ best_offset = current_offset
+ best_offset_fit = start_addr - current_offset
+
+ current_offset = start_addr + lr.size
+
+ self.memory_required = max(self.memory_required, best_offset + size)
+ new_lr.set_address(best_offset)
+ self.current_allocs.append((best_offset, new_lr))
+ self.current_allocs = list(sorted(self.current_allocs))
+
+ def dealloc(self, lr_to_dealloc):
+ self.current_allocs = [(start_addr, lr) for start_addr, lr in self.current_allocs if lr != lr_to_dealloc]
+
+ def allocate_live_ranges(self, verbose_allocation):
+ lrs = set()
+ for lr in self.live_ranges.ranges.values():
+ lrs.add((lr.start_time, lr.end_time, lr))
+
+ lrs = sorted(lrs)
+
+ for curr_time, _, new_lr in lrs:
+ for _, lr in list(self.current_allocs):
+ if lr.end_time < curr_time:
+ self.dealloc(lr)
+
+ self.alloc(new_lr)
+
+ assert self.verify_allocation()
+ return self.memory_required
+
+ def verify_allocation(self):
+ lrs = list(self.live_ranges.ranges.values())
+ for n in lrs:
+ for m in lrs:
+ if n != m and n.overlaps_ranges(m):
+ overlap, tens_n, tens_m = n.overlaps_address(m)
+ if overlap:
+ print("Solution failed, overlapping buffer!")
+ print(tens_n.address, tens_n.address + n.size, n.name)
+ print(tens_m.address, tens_m.address + m.size, m.name)
+ print()
+ return False
+
+ return True
+
+
+def allocate_live_ranges(nng, arch, live_ranges, mem_area, verbose_allocation=False):
+ g = GreedyAllocator(nng, arch, live_ranges, mem_area)
+ return g.allocate_live_ranges(verbose_allocation)
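+
+
+# Minimal sketch (illustration only, simplified from GreedyAllocator.alloc above; the helper and
+# its arguments are hypothetical): a new live range is placed into the smallest existing gap that
+# can hold it at its alignment, falling back to the end of the currently allocated region.
+def _example_best_fit_offset(current_allocs, size, alignment=16):
+    # current_allocs: address-sorted list of (start_address, size) tuples
+    def round_up(value, quantum):
+        return ((value + quantum - 1) // quantum) * quantum
+
+    current_top = max((start + sz for start, sz in current_allocs), default=0)
+    best_offset = round_up(current_top, alignment)  # fall back to the end of the region
+    best_gap = None
+    offset = 0
+    for start, sz in current_allocs:
+        aligned = round_up(offset, alignment)
+        gap = start - offset
+        if aligned + size <= start and (best_gap is None or gap < best_gap):
+            best_offset, best_gap = aligned, gap
+        offset = start + sz
+    return best_offset
+
+
+# e.g. _example_best_fit_offset([(0, 64), (128, 32)], 48) returns 64: the 64-byte gap fits the
+# request, so the fallback at the end of the region (160) is not needed.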
diff --git a/ethosu/vela/high_level_command_stream.py b/ethosu/vela/high_level_command_stream.py
new file mode 100644
index 00000000..952e2033
--- /dev/null
+++ b/ethosu/vela/high_level_command_stream.py
@@ -0,0 +1,365 @@
+# Copyright (C) 2020 Arm Limited or its affiliates. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the License); you may
+# not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an AS IS BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+# Description:
+# Contains classes that hold commands for the high-level command stream (one command per DMA or NPU stripe).
+
+from enum import Enum, IntEnum
+import numpy as np
+from .operation import NpuBlockType
+from .numeric_util import round_up_divide
+from .range_set import MemoryAccessSet, AccessDirection
+
+
+class Box:
+ def __init__(self, start_coord, end_coord):
+ self.start_coord = list(start_coord)
+ self.end_coord = list(end_coord)
+ assert len(self.start_coord) == len(end_coord)
+ for i in range(len(self.start_coord)):
+ assert self.start_coord[i] <= self.end_coord[i]
+
+ def transform_with_strides_and_skirt(
+ self, strides, skirt, ifm_shape, npu_block_type, concat_axis=0, concat_offset=0, split_offset=None, k_height=1
+ ):
+ new_start_coord = list(self.start_coord)
+ new_end_coord = list(self.end_coord)
+
+ new_start_coord[concat_axis] -= concat_offset
+ new_end_coord[concat_axis] -= concat_offset
+
+ if split_offset is not None:
+ for idx in range(len(split_offset)):
+ new_start_coord[idx] += split_offset[idx]
+ new_end_coord[idx] += split_offset[idx]
+
+ if split_offset is None and npu_block_type in set((NpuBlockType.ConvolutionMxN, NpuBlockType.VectorProduct)):
+ # these types of operations do a "dot product" over the entire IFM
+ new_start_coord[-1] = 0
+ new_end_coord[-1] = ifm_shape[-1]
+
+ if min(len(new_end_coord), len(ifm_shape)) >= 2:
+ new_end_coord[-2] = min(new_end_coord[-2], ifm_shape[-2])
+ if min(len(new_end_coord), len(ifm_shape)) >= 3:
+ new_end_coord[-3] = min(new_end_coord[-3], ifm_shape[-3])
+
+ pad_top = 0
+ pad_bottom = 0
+ if strides is not None and skirt is not None:
+ if len(new_start_coord) >= 2:
+ stride = strides[2]
+ new_start_coord[-2] = max(new_start_coord[-2] * stride - skirt[1], 0)
+ new_end_coord[-2] = min(new_end_coord[-2] * stride + skirt[3], ifm_shape[-2])
+
+ if len(new_start_coord) >= 3:
+ stride = strides[1]
+
+ total_stride = stride * (new_end_coord[-3] - new_start_coord[-3] - 1)
+ new_start_coord[-3] = new_start_coord[-3] * stride - skirt[0]
+
+ pad_top = max(0, 0 - new_start_coord[-3])
+ new_start_coord[-3] = max(new_start_coord[-3], 0)
+
+ while len(ifm_shape) < 3:
+ ifm_shape = [1] + ifm_shape
+ if (new_end_coord[-3] * stride + skirt[2]) > ifm_shape[-3]:
+ # pad_bottom is calculated based on the diff between the end position of the weight kernel
+ # after the last stride and the ifm height.
+ k_start = new_start_coord[-3] - pad_top
+ pad_bottom = max(0, k_start + total_stride + k_height - ifm_shape[-3])
+
+ new_end_coord[-3] = min(new_end_coord[-3] * stride + skirt[2], ifm_shape[-3])
+
+ return Box(new_start_coord, new_end_coord), pad_top, pad_bottom
+
+ @staticmethod
+ def make_weight_box(weight_shape, npu_block_type, oc_range_start=None, oc_range_end=None, weights_transposed=False):
+ start = [0] * len(weight_shape)
+ end = list(weight_shape)
+ if oc_range_start is not None and oc_range_end is not None:
+ if npu_block_type == NpuBlockType.ConvolutionDepthWise:
+ # input range is output range divided by channel multiplier
+ if weights_transposed:
+ start[-1] = oc_range_start // weight_shape[-2]
+ end[-1] = oc_range_end // weight_shape[-2]
+ else:
+ start[-2] = oc_range_start // weight_shape[-1]
+ end[-2] = oc_range_end // weight_shape[-1]
+ else:
+ start[-1] = oc_range_start
+ end[-1] = oc_range_end
+ for i in range(len(end)):
+ assert 0 <= start[i] < weight_shape[i]
+ assert 0 < end[i] <= weight_shape[i]
+
+ return Box(start, end)
+
+ def get_size_shape(self):
+ return [int(self.end_coord[i] - self.start_coord[i]) for i in range(len(self.end_coord))]
+
+ def get_size(self):
+ return int(np.prod(self.get_size_shape()))
+
+ def __str__(self):
+ return "<Box %s - %s>" % (self.start_coord, self.end_coord)
+
+ __repr__ = __str__
+
+
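+# Minimal sketch (illustration only; the helper and its arguments are hypothetical) of the row
+# arithmetic performed by Box.transform_with_strides_and_skirt above: an OFM row interval is
+# scaled by the stride, widened by the skirt, and clamped to the IFM height.
+def _example_ofm_rows_to_ifm_rows(ofm_start, ofm_end, stride, skirt_top, skirt_bottom, ifm_height):
+    ifm_start = max(ofm_start * stride - skirt_top, 0)
+    ifm_end = min(ofm_end * stride + skirt_bottom, ifm_height)
+    return ifm_start, ifm_end
+
+
+# e.g. with stride 2 and one skirt row on each side, OFM rows [4, 8) of a 32-row IFM map to
+# IFM rows [7, 17): _example_ofm_rows_to_ifm_rows(4, 8, 2, 1, 1, 32) == (7, 17)
+
+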
+class CommandType(IntEnum):
+ NpuStripe = 0
+ DMA = 1
+ Size = 2
+
+
+class Command:
+ def get_ofm_y_range_for_pass(self, ps_requested):
+ return None
+
+ def is_npu_pass_command(self):
+ return False
+
+ def get_memory_accesses(self):
+ return None
+
+ def get_operation_count(self):
+ # returns numpy array of (DPU blocks, dma_ops). Should line up with the CommandType enum
+ return np.array((0, 0))
+
+
+class NpuStripe(Command):
+ def __init__(
+ self,
+ ps,
+ block_config,
+ is_first,
+ is_last,
+ is_first_h_stripe,
+ is_last_h_stripe,
+ ifm_tensor,
+ ifm_box,
+ ofm_tensor,
+ ofm_box,
+ weight_tensor=None,
+ weight_box=None,
+ scale_tensor=None,
+ concat_axis=0,
+ concat_offset=0,
+ ifm2_tensor=None,
+ ifm2_box=None,
+ pad_top=0,
+ pad_bottom=0,
+ ):
+ self.cmdtype = CommandType.NpuStripe
+ self.ps = ps
+ self.block_config = block_config
+ self.is_first = is_first
+ self.is_last = is_last
+ self.is_first_h_stripe = is_first_h_stripe
+ self.is_last_h_stripe = is_last_h_stripe
+ self.ifm_tensor = ifm_tensor
+ self.ifm_box = ifm_box
+ self.ifm2_tensor = ifm2_tensor
+ self.ifm2_box = ifm2_box
+ self.ofm_tensor = ofm_tensor
+ self.ofm_box = ofm_box
+ self.weight_tensor = weight_tensor
+ self.scale_tensor = scale_tensor
+ self.weight_box = weight_box
+ self.concat_axis = concat_axis
+ self.concat_offset = concat_offset
+ self.pad_top = pad_top
+ self.pad_bottom = pad_bottom
+ for i in range(len(self.ofm_box.end_coord)):
+ assert self.ofm_box.end_coord[i] <= self.ofm_tensor.shape[i]
+
+ def get_memory_accesses(self):
+ res = MemoryAccessSet()
+ if self.ifm_tensor is not None and self.ifm_tensor.shape != []:
+ res.add(
+ self.ifm_tensor.get_address_ranges_for_coordinates(self.ifm_box.start_coord, self.ifm_box.end_coord),
+ AccessDirection.Read,
+ )
+ if self.ifm2_tensor is not None and self.ifm2_tensor.shape != []:
+ res.add(
+ self.ifm2_tensor.get_address_ranges_for_coordinates(self.ifm2_box.start_coord, self.ifm2_box.end_coord),
+ AccessDirection.Read,
+ )
+ if self.ofm_tensor is not None:
+ res.add(
+ self.ofm_tensor.get_address_ranges_for_coordinates(self.ofm_box.start_coord, self.ofm_box.end_coord),
+ AccessDirection.Write,
+ )
+ if self.weight_tensor is not None:
+ res.add(
+ self.weight_tensor.get_address_ranges_for_coordinates(
+ self.weight_box.start_coord, self.weight_box.end_coord
+ ),
+ AccessDirection.Read,
+ )
+ return res
+
+ def is_npu_pass_command(self):
+ return True
+
+ def __str__(self):
+ return "<NPUStripe: ps=%s, ifm_box=%s, ifm2_box=%s, ofm_box=%s, weight_box=%s, block_config=%s>" % (
+ self.ps.name,
+ self.ifm_box,
+ self.ifm2_box,
+ self.ofm_box,
+ self.weight_box,
+ self.block_config,
+ )
+
+ __repr__ = __str__
+
+ def get_ofm_y_range_for_pass(self, ps_requested):
+ if ps_requested != self.ps:
+ return None
+ if len(self.ofm_box.start_coord) >= 3:
+ return (self.ofm_box.start_coord[-3], self.ofm_box.end_coord[-3])
+ return None
+
+ def get_block_dimensions(self):
+ ofm_box = self.ofm_box
+ block_config = self.block_config
+
+ out_height = 1
+ out_width = 1
+ out_depth = ofm_box.end_coord[-1] - ofm_box.start_coord[-1]
+ if len(ofm_box.end_coord) >= 4:
+ out_width = ofm_box.end_coord[-2] - ofm_box.start_coord[-2]
+ out_height = ofm_box.end_coord[-3] - ofm_box.start_coord[-3]
+
+ assert out_height >= 0
+ assert out_width >= 0
+ assert out_depth >= 0
+ return (
+ round_up_divide(out_height, block_config[0]),
+ round_up_divide(out_width, block_config[1]),
+ round_up_divide(out_depth, block_config[3]),
+ )
+
+ def get_operation_count(self):
+ # returns numpy array of (DPU blocks, dma_ops)
+ return np.array((self.get_n_blocks(), 0))
+
+ def get_n_blocks(self):
+ h, w, d = self.get_block_dimensions()
+ res = h * w * d
+ assert res >= 0
+ return res
+
+ def get_single_block_command(self, block_idx):
+ block_cfg = (self.block_config[0], self.block_config[1], self.block_config[3])
+ dims = self.get_block_dimensions()
+ strides = dims[1] * dims[2], dims[2], 1
+ coord = []
+ idx_left = block_idx
+ for s in strides:
+ c = idx_left // s
+ idx_left -= c * s
+ coord.append(c)
+
+ assert idx_left == 0
+
+ # put in dummy height/widths in case we're dealing with FC layers
+ ofm_start = list(self.ofm_box.start_coord)
+ ofm_end = list(self.ofm_box.end_coord)
+
+ # cut out a nice block shape
+ for idx in (-1, -2, -3):
+ if len(ofm_start) >= -idx:
+ ofm_start[idx] += block_cfg[idx] * coord[idx]
+ ofm_end[idx] = min(ofm_end[idx], ofm_start[idx] + block_cfg[idx])
+
+ ps = self.ps
+ strides = None
+ skirt = None
+ if ps.primary_op is not None:
+ strides = ps.primary_op.attrs.get("strides", None)
+ skirt = ps.primary_op.attrs.get("skirt", None)
+ npu_block_type = ps.npu_block_type
+
+ ofm_box = Box(ofm_start, ofm_end)
+ ifm_box, _, _ = ofm_box.transform_with_strides_and_skirt(
+ strides, skirt, self.ifm_tensor.shape, npu_block_type, self.concat_axis, self.concat_offset
+ )
+
+ weight_box = None
+ if self.weight_tensor is not None:
+ weight_oc_start = ofm_start[-1]
+ weight_oc_end = ofm_end[-1]
+ if self.concat_axis - len(self.weight_tensor.shape) == -1:
+ weight_oc_start -= self.concat_offset
+ weight_oc_end -= self.concat_offset
+
+ weight_box = Box.make_weight_box(
+ self.weight_tensor.shape,
+ npu_block_type,
+ weight_oc_start,
+ weight_oc_end,
+ self.weight_tensor.weight_transpose_depthwise,
+ )
+
+ return NpuStripe(
+ self.ps,
+ self.block_config,
+ self.is_first,
+ self.is_last,
+ self.is_first_h_stripe,
+ self.is_last_h_stripe,
+ self.ifm_tensor,
+ ifm_box,
+ self.ofm_tensor,
+ ofm_box,
+ self.weight_tensor,
+ weight_box,
+ self.scale_tensor,
+ self.concat_axis,
+ self.concat_offset,
+ )
+
+
+class DMA(Command):
+ def __init__(self, in_tensor, out_tensor, box):
+ self.cmdtype = CommandType.DMA
+ self.in_tensor = in_tensor
+ self.out_tensor = out_tensor
+ self.box = box
+
+ def __str__(self):
+ return "<DMA: in=%s, out=%s, box=%s>" % (self.in_tensor.name, self.out_tensor.name, self.box)
+
+ __repr__ = __str__
+
+ def get_memory_accesses(self):
+ res = MemoryAccessSet()
+
+ res.add(
+ self.in_tensor.get_address_ranges_for_coordinates(self.box.start_coord, self.box.end_coord),
+ AccessDirection.Read,
+ )
+ res.add(
+ self.out_tensor.get_address_ranges_for_coordinates(self.box.start_coord, self.box.end_coord),
+ AccessDirection.Write,
+ )
+ return res
+
+ def get_operation_count(self):
+ # returns numpy array of (DPU blocks, dma_ops)
+ return np.array((0, 1))
diff --git a/ethosu/vela/high_level_command_stream_generator.py b/ethosu/vela/high_level_command_stream_generator.py
new file mode 100644
index 00000000..364df6f8
--- /dev/null
+++ b/ethosu/vela/high_level_command_stream_generator.py
@@ -0,0 +1,315 @@
+# Copyright (C) 2020 Arm Limited or its affiliates. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the License); you may
+# not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an AS IS BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+# Description:
+# Generate a high-level command stream from a scheduled subgraph with CascadedPasses.
+#
+# Also used during scheduling to work out allowable IFM/OFM overlap; this functionality can be accessed using
+# calc_allowed_ofm_ifm_overlap_for_cascaded_pass().
+
+from .nn_graph import SchedulingStrategy, PassPlacement
+import numpy as np
+from .operation import NpuBlockType
+from .high_level_command_stream import Box, CommandType, Command, NpuStripe, DMA
+
+
+def need_dma(tens):
+ return len(tens.ops) == 1 and tens.ops[0].type == "DMA"
+
+
+def dma_weights_if_necessary(ps, box, weight_tensor):
+ if need_dma(weight_tensor):
+ dma_op = weight_tensor.ops[0]
+ in_tensor = dma_op.inputs[0]
+ yield DMA(in_tensor, weight_tensor, box)
+
+
+def generate_high_level_command_stream_for_pass(strat, passes, block_configs, idx):
+ is_first = idx == 0
+ is_last = idx == len(passes) - 1
+ ps = passes[idx]
+ block_config = block_configs[idx]
+
+ ifm_tensor = ps.ifm_tensor
+ ifm2_tensor = ps.ifm2_tensor
+ ofm_tensor = ps.ofm_tensor
+ weight_tensor = ps.weight_tensor
+ scale_tensor = ps.scale_tensor
+
+ ofm_start = [0] * len(ofm_tensor.shape)
+ ofm_end = list(ofm_tensor.shape)
+
+ strides = None
+ skirt = None
+ if ps.primary_op is not None:
+ strides = ps.primary_op.attrs.get("strides", None)
+ skirt = ps.primary_op.attrs.get("skirt", None)
+
+ npu_block_type = ps.npu_block_type
+
+ concat_axis = 0
+ concat_offset = 0
+
+ split_offsets = [None, None] # offset for [ifm, ifm2]
+
+ # Fusable activation functions
+ activation_ops = set(("Sigmoid", "Tanh", "Relu", "Relu6", "ReluN1To1"))
+
+ for op in ps.ops:
+ if op.type == "ConcatSliceWrite":
+ concat_axis = op.attrs["concat_axis"]
+ concat_start = op.attrs["concat_start"]
+ concat_end = op.attrs["concat_end"]
+
+ ofm_start[concat_axis] = concat_start
+ ofm_end[concat_axis] = concat_end
+ concat_offset = concat_start
+ ps.primary_op.attrs["fused_memory_function"] = op.type
+ elif op.type in activation_ops:
+ ps.primary_op.attrs["fused_activation_function"] = op.type
+
+ # The ops list has to be reversed here since the Pass Packing is done in reverse
+ ifm_idx = 0
+ for op in reversed(ps.ops):
+ if op.type == "SplitSliceRead":
+ split_offsets[ifm_idx] = op.attrs["split_start"]
+ ps.primary_op.attrs["fused_memory_function"] = op.type
+ ifm_idx += 1
+
+ if strat == SchedulingStrategy.WeightStream:
+ ofm_step = block_config[-1]
+ ofm_stop = ofm_end[-1]
+ if weight_tensor is None or not need_dma(weight_tensor):
+ ofm_step = ofm_stop
+ for start in range(ofm_start[-1], ofm_stop, ofm_step):
+ end = min(start + ofm_step, ofm_stop)
+ ofm_start[-1] = start
+ ofm_end[-1] = end
+ ofm_box = Box(ofm_start, ofm_end)
+ ifm_box = None
+ ifm2_box = None
+
+ if ifm_tensor.shape != []:
+ ifm_box, _, _ = ofm_box.transform_with_strides_and_skirt(
+ strides, skirt, ifm_tensor.shape, npu_block_type, concat_axis, concat_offset, split_offsets[0]
+ )
+ else:
+ ifm_box = Box([], [])
+ if ifm2_tensor is not None and ifm2_tensor.shape != []:
+ ifm2_box, _, _ = ofm_box.transform_with_strides_and_skirt(
+ strides, skirt, ifm2_tensor.shape, npu_block_type, concat_axis, concat_offset, split_offsets[1]
+ )
+ else:
+ ifm2_box = Box([], [])
+
+ weight_box = None
+ if weight_tensor is not None:
+ weight_oc_start = start
+ weight_oc_end = end
+ if concat_axis - len(weight_tensor.shape) == -1:
+ weight_oc_start -= concat_offset
+ weight_oc_end -= concat_offset
+
+ weight_box = Box.make_weight_box(
+ weight_tensor.shape,
+ npu_block_type,
+ weight_oc_start,
+ weight_oc_end,
+ weight_tensor.weight_transpose_depthwise,
+ )
+ yield from dma_weights_if_necessary(ps, weight_box, weight_tensor)
+
+ yield NpuStripe(
+ ps,
+ block_config,
+ is_first,
+ is_last,
+ True,
+ True,
+ ifm_tensor,
+ ifm_box,
+ ofm_tensor,
+ ofm_box,
+ weight_tensor,
+ weight_box,
+ scale_tensor,
+ concat_axis,
+ concat_offset,
+ ifm2_tensor=ifm2_tensor,
+ ifm2_box=ifm2_box,
+ )
+
+ elif strat == SchedulingStrategy.IfmStream:
+ y_step = block_config[0]
+ y_start = 0
+ y_dim = 1
+ if len(ofm_tensor.shape) >= 3:
+ y_start = ofm_start[-3]
+ y_dim = ofm_end[-3]
+ if idx > 0:
+ ifm_y_present = 0
+ prev_pass = passes[idx - 1]
+ prev_pass_gen = generate_high_level_command_stream_for_pass(strat, passes, block_configs, idx - 1)
+ else:
+ ifm_y_present = 1
+ if len(ifm_tensor.shape) >= 3:
+ ifm_y_present = ifm_tensor.shape[-3]
+ prev_pass_gen = []
+ prev_pass = None
+
+ if len(passes) == 1:
+ # no cascading, can just issue one big stripe
+ # but only if we've done allocation and OFM does not overlap IFM
+ if ifm_tensor.address != -1 and ofm_tensor.address != -1:
+ if (
+ ifm_tensor.address + ifm_tensor.storage_size() <= ofm_tensor.address
+ or ofm_tensor.address + ofm_tensor.storage_size() <= ifm_tensor.address
+ ):
+ y_step = y_dim
+
+ weight_box = None
+
+ for start in range(y_start, y_dim, y_step):
+ end = min(start + y_step, y_dim)
+ if len(ofm_tensor.shape) >= 3:
+ ofm_start[-3] = start
+ ofm_end[-3] = end
+ ofm_box = Box(ofm_start, ofm_end)
+
+ k_height = 1
+ if npu_block_type == NpuBlockType.Pooling:
+ if ps.primary_op is not None:
+ k_height = ps.primary_op.attrs["ksize"][1]
+ else:
+ if weight_tensor is not None:
+ k_height = weight_tensor.shape[0]
+
+ ifm_box, pad_top, pad_bottom = ofm_box.transform_with_strides_and_skirt(
+ strides, skirt, ifm_tensor.shape, npu_block_type, concat_axis, concat_offset, split_offsets[0], k_height
+ )
+
+ ifm_y_needed = 1
+ if len(ifm_box.end_coord) >= 3:
+ ifm_y_needed = ifm_box.end_coord[-3]
+ if ifm_y_present < ifm_y_needed:
+ for prev_cmd in prev_pass_gen:
+ yield prev_cmd
+ rng = prev_cmd.get_ofm_y_range_for_pass(prev_pass)
+ if rng is not None:
+ ifm_y_present = max(ifm_y_present, rng[1])
+ if ifm_y_present >= ifm_y_needed:
+ break
+
+ if weight_tensor is not None and weight_box is None:
+ weight_box = Box.make_weight_box(
+ weight_tensor.shape, npu_block_type, weights_transposed=weight_tensor.weight_transpose_depthwise
+ )
+ yield from dma_weights_if_necessary(ps, weight_box, weight_tensor)
+
+ # Check if first/last stripe in pass
+ is_first_h_stripe = start == y_start
+ is_last_h_stripe = (start + y_step) >= y_dim
+
+ stripe = NpuStripe(
+ ps,
+ block_config,
+ is_first,
+ is_last,
+ is_first_h_stripe,
+ is_last_h_stripe,
+ ifm_tensor,
+ ifm_box,
+ ofm_tensor,
+ ofm_box,
+ weight_tensor,
+ weight_box,
+ scale_tensor,
+ concat_axis,
+ concat_offset,
+ None,
+ None,
+ pad_top,
+ pad_bottom,
+ )
+ yield stripe
+ else:
+ assert 0, "unknown scheduling strategy"
+
+
+def generate_high_level_command_stream_for_pass_list(strat, passes, block_configs):
+ if strat == SchedulingStrategy.WeightStream:
+ for idx in range(len(passes)):
+ yield from generate_high_level_command_stream_for_pass(strat, passes, block_configs, idx)
+ elif strat == SchedulingStrategy.IfmStream:
+ yield from generate_high_level_command_stream_for_pass(strat, passes, block_configs, len(passes) - 1)
+ else:
+ assert 0, "Unknown streaming strategy"
+
+
+def generate_high_level_command_stream_for_cascaded_pass(cps):
+ yield from generate_high_level_command_stream_for_pass_list(
+ cps.strategy, cps.passes, [ps.block_config for ps in cps.passes]
+ )
+
+
+def generate_high_level_command_stream(nng, sg, arch, verbose_high_level_command_stream):
+ res = []
+ for cps in sg.cascaded_passes:
+ if cps.placement == PassPlacement.Npu:
+ res += list(generate_high_level_command_stream_for_cascaded_pass(cps))
+
+ sg.high_level_command_stream = res
+ if verbose_high_level_command_stream:
+ sg.print_high_level_command_stream()
+
+
+def calc_allowed_ofm_ifm_overlap_for_pass_list(strat, passes, block_configs):
+ highest_ofm_write = 0
+ if not passes[0].ifm_tensor or not passes[-1].ofm_tensor:
+ return 0
+
+ ifm_read = passes[0].ifm_tensor.storage_size()
+ min_overlap = 999999999999999999999
+ ofm_size = passes[-1].ofm_tensor.storage_size()
+ if strat == SchedulingStrategy.WeightStream:
+ return 0
+ for cmd in generate_high_level_command_stream_for_pass_list(strat, passes, block_configs):
+ if cmd.is_npu_pass_command():
+ if cmd.is_first:
+ ifm_read = cmd.ifm_tensor.address_offset_for_coordinate(cmd.ifm_box.start_coord, is_top_box=False)
+ if ifm_read is None:
+ return 0
+ if cmd.is_last:
+ write_offset = cmd.ofm_tensor.address_offset_for_coordinate(cmd.ofm_box.end_coord, is_top_box=True)
+ if write_offset is None:
+ return 0
+ highest_ofm_write = max(write_offset, highest_ofm_write)
+
+ if cmd.is_first or cmd.is_last:
+ overlap_required = max(highest_ofm_write - min(ifm_read, ofm_size), 0)
+ can_overwrite = ofm_size - overlap_required
+ min_overlap = min(min_overlap, can_overwrite)
+
+ if cmd.is_first:
+ ifm_read = cmd.ifm_tensor.address_offset_for_coordinate(cmd.ifm_box.end_coord, is_top_box=True)
+
+ min_overlap = max(min_overlap, 0)
+ return min_overlap
+
+
+def calc_allowed_ofm_ifm_overlap_for_cascaded_pass(cps):
+ return calc_allowed_ofm_ifm_overlap_for_pass_list(cps.strategy, cps.passes, [ps.block_config for ps in cps.passes])
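+
+
+# Minimal sketch (illustration only; the helper name is hypothetical) of how the strategies above
+# carve up the OFM: WeightStream steps through the output depth in block-depth chunks (when the
+# weights are streamed in via DMA), while IfmStream steps through the output height in
+# block-height chunks, pulling extra IFM rows from the previous pass as needed.
+def _example_stripe_ranges(ofm_extent, step):
+    return [(start, min(start + step, ofm_extent)) for start in range(0, ofm_extent, step)]
+
+
+# e.g. an OFM depth of 40 with block depth 16 gives depth stripes [(0, 16), (16, 32), (32, 40)],
+# and an OFM height of 10 with block height 4 gives row stripes [(0, 4), (4, 8), (8, 10)].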
diff --git a/ethosu/vela/insert_dma.py b/ethosu/vela/insert_dma.py
new file mode 100644
index 00000000..b63c1ea1
--- /dev/null
+++ b/ethosu/vela/insert_dma.py
@@ -0,0 +1,60 @@
+# Copyright (C) 2020 Arm Limited or its affiliates. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the License); you may
+# not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an AS IS BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+# Description:
+# Insert DMA operations into the graph for transferring weights.
+
+from .nn_graph import Operation, MemArea, TensorPurpose, NpuBlockType
+from . import rewrite_graph
+
+
+def insert_dma_cmd(op, arch):
+ if op.type == "DMA":
+ return op # Already rewritten
+ for idx, tens in enumerate(op.inputs):
+
+ if tens.mem_area in (MemArea.Dram, MemArea.OffChipFlash) and tens.mem_area != arch.fast_storage_mem_area:
+ if tens.purpose == TensorPurpose.Weights:
+ only_vector_product_consumers = True
+ for oper in tens.consumers():
+ if oper is None or oper.attrs.get("npu_block_type") != NpuBlockType.VectorProduct:
+ only_vector_product_consumers = False
+ break
+
+ # Vector products have no need for DMA: their weights are only read once and can stay in flash.
+ # Other operations re-read their weights, which is better done from SRAM.
+ if not only_vector_product_consumers:
+ # Insert a DMA command here, as well as a new tensor situated in SRAM of the same size.
+ new_tens = tens.clone_into_fast_storage(arch)
+ dma_cmd = Operation("DMA", tens.ops[0].name + "_dma")
+ dma_cmd.inputs = [tens]
+ dma_cmd.outputs = [new_tens]
+ dma_cmd.attrs["source"] = tens.mem_area
+ dma_cmd.attrs["destination"] = new_tens.mem_area
+ dma_cmd.run_on_npu = True
+ new_tens.ops = [dma_cmd]
+ op.inputs[idx] = new_tens
+ return op
+
+
+def insert_dma_commands(nng, arch, verbose_graph=False):
+
+ for idx, sg in enumerate(nng.subgraphs):
+ nng.subgraphs[idx] = rewrite_graph.rewrite_graph_pre_order(sg, arch, [], [insert_dma_cmd])
+ if verbose_graph:
+ nng.print_graph()
+ return nng
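+
+
+# Minimal sketch (illustration only; the helper and its arguments are hypothetical) of the
+# decision taken above: weights held in DRAM or off-chip flash are DMA'd into fast storage
+# unless every consumer is a vector product, which reads its weights only once.
+def _example_weights_need_dma(weight_mem_area, fast_storage_mem_area, consumer_block_types):
+    if weight_mem_area not in (MemArea.Dram, MemArea.OffChipFlash):
+        return False
+    if weight_mem_area == fast_storage_mem_area:
+        return False
+    return any(block_type != NpuBlockType.VectorProduct for block_type in consumer_block_types)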
diff --git a/ethosu/vela/live_range.py b/ethosu/vela/live_range.py
new file mode 100644
index 00000000..24f1f64c
--- /dev/null
+++ b/ethosu/vela/live_range.py
@@ -0,0 +1,324 @@
+# Copyright (C) 2020 Arm Limited or its affiliates. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the License); you may
+# not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an AS IS BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+# Description:
+# Build a live range graph for tensors in one or more subgraphs. Used for tensor allocation as well as in the scheduler.
+# Can work with either a pass packed subgraph or a scheduled subgraph.
+
+from .tensor import Tensor, MemArea
+from .nn_graph import TensorPurpose, PassPlacement
+from .high_level_command_stream_generator import calc_allowed_ofm_ifm_overlap_for_cascaded_pass
+
+
+class LiveRange:
+ def __init__(self, tens):
+ self.tensors = [] # Tensors that are assigned to the same LiveRange will be allocated to the same address
+ self.start_time = 99999999999
+ self.end_time = -1
+ self.size = 0
+ self.name = ""
+
+ if tens:
+ self.add_tensor(tens)
+
+ def __str__(self):
+ return "<live_range.LiveRange: '%s' start_time=%s, end_time=%s>" % (self.name, self.start_time, self.end_time)
+
+ __repr__ = __str__
+
+ def add_tensor(self, tens):
+ if self.size == 0:
+ self.size = tens.storage_size()
+ self.name = tens.name # LiveRange will be named after the first tensor added
+ else:
+ assert (
+ self.size >= tens.storage_size()
+ ), "Tensors assigned to the same LiveRange need to fit the size of the LiveRange."
+
+ self.tensors.append(tens)
+
+ def mark_usage(self, op_time):
+ if op_time == -1:
+ return
+ op_time_start = op_time
+ op_time_end = op_time + 1
+
+ self.start_time = min(self.start_time, op_time_start)
+ self.end_time = max(self.end_time, op_time_end)
+
+ def overlaps_ranges(self, other):
+ return max(self.start_time, other.start_time) < min(self.end_time, other.end_time)
+
+ def overlaps_address(self, other):
+ # Returns the first pair of tensors in this LiveRange and 'other' which have
+ # overlapping addresses
+ for tens in self.tensors:
+ for other_tens in other.tensors:
+ if max(tens.address, other_tens.address) < min(
+ tens.address + self.size, other_tens.address + other.size
+ ):
+ return True, tens, other_tens
+
+ return False, None, None
+
+ def __lt__(self, other):
+ if self.start_time != other.start_time:
+ return self.start_time < other.start_time
+ if self.end_time != other.end_time:
+ return self.end_time < other.end_time
+ if self.size != other.size:
+ return self.size < other.size
+ return self.name < other.name
+
+ def set_address(self, address):
+ # Set address of all unaddressed tensors in LiveRange
+ for tens in self.tensors:
+ if tens.address == 0:
+ tens.address = address
+ # Also need to set the address to the tensor's cpu/npu clones
+ if tens.cpu_tensor is not None:
+ tens.cpu_tensor.address = address
+ if tens.npu_tensor is not None:
+ tens.npu_tensor.address = address
+
+ def get_alignment(self):
+ # Get max alignment of LiveRange's tensors
+ if self.tensors:
+ alignment = 0
+ for tens in self.tensors:
+ alignment = max(alignment, tens.alignment)
+
+ return alignment
+
+ return Tensor.AllocationQuantum
+
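+
+# Minimal usage sketch (illustration only; the helper name is hypothetical): live ranges are
+# half-open [start_time, end_time) intervals, so two ranges only conflict when their time
+# windows genuinely intersect.
+def _example_live_range_overlap():
+    lr_a = LiveRange(None)
+    lr_a.mark_usage(0)
+    lr_a.mark_usage(2)   # lr_a now covers [0, 3)
+    lr_b = LiveRange(None)
+    lr_b.mark_usage(3)   # lr_b covers [3, 4)
+    assert not lr_a.overlaps_ranges(lr_b)
+    lr_b.mark_usage(2)   # extend lr_b to [2, 4)
+    assert lr_a.overlaps_ranges(lr_b)
+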
+
+def merge_memory_op_ranges(sg, lr_graph, tensor_should_be_ignored, target_mem_area):
+ for ps in sg.passes:
+ if ps.placement == PassPlacement.MemoryOnly:
+ # For memory-only passes, e.g. Reshape, add the input and output tensor to the same LiveRange
+ input_tensor = ps.inputs[0]
+ output_tensor = ps.outputs[0]
+ # If the input or output tensor is tied to a Cpu tensor, i.e. a subgraph input
+ # or output, fuse the live-range with the Cpu tensors' live-range instead.
+ input_tensor = input_tensor.cpu_tensor if input_tensor.cpu_tensor is not None else input_tensor
+ output_tensor = output_tensor.cpu_tensor if output_tensor.cpu_tensor is not None else output_tensor
+ if not tensor_should_be_ignored(input_tensor, target_mem_area) and not tensor_should_be_ignored(
+ output_tensor, target_mem_area
+ ):
+ lr_graph.fuse_ranges(input_tensor, output_tensor)
+
+
+class LiveRangeGraph:
+ def __init__(self):
+ self.ranges = {} # tens -> range
+ self.allowed_overlaps = {} # (tens,tens) -> overlap_int
+ self.ignore_tensors = set()
+ self.processed_subgraphs = set()
+ self.current_time = 0
+
+ def get_or_create_range(self, tens):
+ for rng in self.ranges.values():
+ # Return the live range of the tensor (or its cpu/npu clone)
+ if any(tensor in rng.tensors for tensor in [tens, tens.npu_tensor, tens.cpu_tensor]):
+ return rng
+
+ # No live range found for the tensor, create a new one
+ rng = LiveRange(tens)
+ self.ranges[tens] = rng
+ return rng
+
+ def fuse_ranges(self, in_tens, out_tens):
+ live_range = self.get_or_create_range(in_tens)
+ assert out_tens not in self.ranges, out_tens
+ live_range.add_tensor(out_tens)
+ self.ranges[out_tens] = live_range
+ return live_range
+
+
+def extract_live_ranges_from_passes(
+ sg,
+ target_mem_area,
+ mark_output_tensors_overlapping_with_input_tensors=False,
+ ignore_subgraph_input_output_tensors=False,
+):
+ lr_graph = LiveRangeGraph()
+
+ if ignore_subgraph_input_output_tensors:
+ lr_graph.ignore_tensors.update(sg.input_tensors)
+ lr_graph.ignore_tensors.update(sg.output_tensors)
+
+ def tensor_should_be_ignored(tens, target_mem_area):
+ if tens.mem_area != target_mem_area:
+ return True
+ if tens in lr_graph.ignore_tensors:
+ return True
+ if tens.name.endswith("reshape_shape_npu"):
+ # Reshape tensor, no need to allocate
+ lr_graph.ignore_tensors.add(tens)
+ return True
+ return False
+
+ # Merge only memory operations in the NPU subgraphs
+ if sg.placement == PassPlacement.Npu:
+ merge_memory_op_ranges(sg, lr_graph, tensor_should_be_ignored, target_mem_area)
+
+ for idx, ps in enumerate(sg.passes):
+ ps.time = 2 * idx
+
+ time_for_pass = ps.time
+
+ for tens in ps.inputs:
+ if tensor_should_be_ignored(tens, target_mem_area):
+ continue
+ rng = lr_graph.get_or_create_range(tens)
+ rng.mark_usage(time_for_pass)
+
+ for tens in ps.intermediates:
+ if tensor_should_be_ignored(tens, target_mem_area):
+ continue
+ rng = lr_graph.get_or_create_range(tens)
+ rng.mark_usage(time_for_pass)
+
+ for tens in ps.outputs:
+ if tensor_should_be_ignored(tens, target_mem_area):
+ continue
+ rng = lr_graph.get_or_create_range(tens)
+ output_time = time_for_pass
+ if not mark_output_tensors_overlapping_with_input_tensors and ps.is_element_wise:
+ output_time += 1
+ rng.mark_usage(output_time)
+
+ end_time = len(sg.passes) * 2
+ for tens in sg.output_tensors:
+ if tensor_should_be_ignored(tens, target_mem_area):
+ continue
+ rng = lr_graph.get_or_create_range(tens)
+ rng.mark_usage(end_time)
+
+ return lr_graph
+
+
+def extract_live_ranges_from_cascaded_passes(
+ sg,
+ target_mem_area,
+ mark_output_tensors_overlapping_with_input_tensors=False,
+ use_ifm_ofm_overlap=True,
+ ignore_subgraph_input_output_tensors=False,
+ lr_graph=None,
+):
+ if lr_graph is None:
+ lr_graph = LiveRangeGraph()
+
+ if sg in lr_graph.processed_subgraphs:
+ # if subgraph has been processed already, return the lr_graph as is
+ return lr_graph
+
+ if ignore_subgraph_input_output_tensors:
+ lr_graph.ignore_tensors.update(sg.input_tensors)
+ lr_graph.ignore_tensors.update(sg.output_tensors)
+
+ def tensor_should_be_ignored(tens, target_mem_area):
+ if tens.mem_area != target_mem_area:
+ return True
+ if tens in lr_graph.ignore_tensors:
+ return True
+ if tens.name.endswith("reshape_shape_npu"):
+ # Reshape tensor, no need to allocate
+ lr_graph.ignore_tensors.add(tens)
+ return True
+ return False
+
+ # Merge only memory operations in the NPU subgraphs
+ if sg.placement == PassPlacement.Npu:
+ merge_memory_op_ranges(sg, lr_graph, tensor_should_be_ignored, target_mem_area)
+
+ for cps in sg.cascaded_passes:
+ cps.time = lr_graph.current_time
+
+ time_for_pass = cps.time
+
+ is_element_wise = cps.is_element_wise
+
+ for tens in cps.inputs:
+ if tensor_should_be_ignored(tens, target_mem_area):
+ continue
+ rng = lr_graph.get_or_create_range(tens)
+ rng.mark_usage(time_for_pass)
+
+ cps_primary_op = cps.passes[0].primary_op
+ if cps_primary_op and cps_primary_op.type == "NpuOp" and target_mem_area in set((MemArea.Sram, MemArea.Dram)):
+ # If the primary-op is an NpuOp that means this is where an Npu subgraph
+ # is called. Go into said subgraph and extract live ranges before continuing.
+ npu_sg = cps_primary_op.attrs["subgraph"]
+ lr_graph = extract_live_ranges_from_cascaded_passes(
+ npu_sg,
+ target_mem_area,
+ mark_output_tensors_overlapping_with_input_tensors,
+ use_ifm_ofm_overlap,
+ False,
+ lr_graph,
+ )
+ # Set the new time after handling the Npu subgraph
+ time_for_pass = lr_graph.current_time
+ cps.time = time_for_pass
+
+ for tens in cps.intermediates:
+ if tensor_should_be_ignored(tens, target_mem_area):
+ continue
+ rng = lr_graph.get_or_create_range(tens)
+ rng.mark_usage(time_for_pass)
+
+ for tens in cps.outputs:
+ if tensor_should_be_ignored(tens, target_mem_area):
+ continue
+ rng = lr_graph.get_or_create_range(tens)
+ output_time = time_for_pass
+ if not mark_output_tensors_overlapping_with_input_tensors and is_element_wise:
+ output_time += 1
+ rng.mark_usage(output_time)
+
+ if use_ifm_ofm_overlap:
+ # fill allowed overlap for ifm and ofm tensor
+ ifm_tensor = cps.passes[0].ifm_tensor
+ ofm_tensor = cps.passes[-1].ofm_tensor
+ if (
+ ifm_tensor is not None
+ and ofm_tensor is not None
+ and not tensor_should_be_ignored(ifm_tensor, target_mem_area)
+ and not tensor_should_be_ignored(ofm_tensor, target_mem_area)
+ ):
+ lr_graph.allowed_overlaps[(ifm_tensor, ofm_tensor)] = calc_allowed_ofm_ifm_overlap_for_cascaded_pass(
+ cps
+ )
+
+ lr_graph.current_time += 2
+
+ end_time = 0
+ for rng in lr_graph.ranges.values():
+ # Find the maximum end time of all live-ranges in the graph
+ end_time = max(end_time, rng.end_time)
+
+ for tens in sg.output_tensors:
+ if tensor_should_be_ignored(tens, target_mem_area):
+ continue
+ rng = lr_graph.get_or_create_range(tens)
+ rng.mark_usage(end_time)
+
+ # Add subgraph to set of processed subgraphs
+ lr_graph.processed_subgraphs.add(sg)
+ return lr_graph
diff --git a/ethosu/vela/mark_tensors.py b/ethosu/vela/mark_tensors.py
new file mode 100644
index 00000000..9b1824b5
--- /dev/null
+++ b/ethosu/vela/mark_tensors.py
@@ -0,0 +1,363 @@
+# Copyright (C) 2020 Arm Limited or its affiliates. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the License); you may
+# not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an AS IS BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+# Description:
+# Mark purpose and select formats for Tensors. Also compress the weights.
+
+from . import rewrite_graph
+from . import weight_compressor
+from .architecture_features import Block
+from .nn_graph import TensorPurpose, TensorFormat, PassPlacement
+from .operation import NpuBlockType
+
+
+def purpose_from_list(lst):
+ def purpose(op, idx):
+ return lst[idx]
+
+ return purpose
+
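+
+# Minimal usage sketch (illustration only; the helper name is hypothetical): purpose_from_list
+# builds a lookup-by-input-index callback; the op argument is accepted for interface
+# compatibility but never inspected.
+def _example_purpose_lookup():
+    weights_second = purpose_from_list([TensorPurpose.FeatureMap, TensorPurpose.Weights])
+    assert weights_second(None, 0) == TensorPurpose.FeatureMap
+    assert weights_second(None, 1) == TensorPurpose.Weights
+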
+
+def all_fm(op, idx):
+ return TensorPurpose.FeatureMap
+
+
+def all_parameter(op, idx):
+ return TensorPurpose.FeatureMap
+
+
+def input0_from_output_rest_parameter(op, idx):
+ if idx == 0:
+ res = op.outputs[0].purpose
+ if res == TensorPurpose.Unknown:
+ print("Warning: Propagating unknown tensor purpose", op)
+ return res
+ return TensorPurpose.FeatureMap
+
+
+def inputs_from_output(op, idx):
+ res = op.outputs[0].purpose
+ if res == TensorPurpose.Unknown:
+ print("Warning: Propagating unknown tensor purpose", op)
+ return res
+
+tensor_purposes = [ # ops, input_purpose
+ (
+ set(
+ (
+ "Relu",
+ "Relu6",
+ "Mul",
+ "Add",
+ "Sub",
+ "Rsqrt",
+ "Abs",
+ "Cast",
+ "Exp",
+ "Floor",
+ "FloorDiv",
+ "FloorMod",
+ "SquaredDifference",
+ "AddN",
+ "BiasAdd",
+ "RealDiv",
+ "Maximum",
+ "Minimum",
+ "Sigmoid",
+ "Tanh",
+ "FusedBatchNorm",
+ "AvgPool",
+ "MaxPool",
+ "Squeeze",
+ "Softmax",
+ "LRN",
+ "Assign",
+ "BatchMatMul",
+ "ZerosLike",
+ "ExtractImagePatches",
+ "MulAct",
+ "AddAct",
+ "SubAct",
+ "DivAct",
+ "AvgPoolAct",
+ "MaxPoolAct",
+ "LeakyRelu",
+ )
+ ),
+ all_fm,
+ ),
+ (
+ set(
+ (
+ "Conv2D",
+ "DepthwiseConv2dNative",
+ "MatMul",
+ "Conv2DBiasAct",
+ "DepthwiseConv2dBiasAct",
+ "FullyConnectedAct",
+ )
+ ),
+ purpose_from_list([TensorPurpose.FeatureMap, TensorPurpose.Weights, TensorPurpose.FeatureMap]),
+ ),
+ (
+ set(("Conv2DBackpropInputSwitched",)),
+ purpose_from_list([TensorPurpose.FeatureMap, TensorPurpose.Weights, TensorPurpose.FeatureMap]),
+ ),
+ (
+ set(("QuantizedConv2D", "QuantizedMatMul")),
+ purpose_from_list(
+ [
+ TensorPurpose.FeatureMap,
+ TensorPurpose.Weights,
+ TensorPurpose.FeatureMap,
+ TensorPurpose.FeatureMap,
+ TensorPurpose.FeatureMap,
+ TensorPurpose.FeatureMap,
+ ]
+ ),
+ ),
+ (
+ set(
+ (
+ "Reshape",
+ "Min",
+ "Max",
+ "Mean",
+ "Pad",
+ "MirrorPad",
+ "ArgMax",
+ "ArgMin",
+ "ExpandDims",
+ "ResizeNearestNeighbor",
+ "ResizeBilinear",
+ "Tile",
+ "Transpose",
+ "Mfcc",
+ )
+ ),
+ purpose_from_list([TensorPurpose.FeatureMap, TensorPurpose.FeatureMap]),
+ ),
+ (
+ set(("QuantizedReshape", "QuantizedResizeBilinear")),
+ purpose_from_list(
+ [TensorPurpose.FeatureMap, TensorPurpose.FeatureMap, TensorPurpose.FeatureMap, TensorPurpose.FeatureMap]
+ ),
+ ),
+ (
+ set(("QuantizedBiasAdd", "QuantizedAdd", "QuantizedMul")),
+ purpose_from_list(
+ [
+ TensorPurpose.FeatureMap,
+ TensorPurpose.FeatureMap,
+ TensorPurpose.FeatureMap,
+ TensorPurpose.FeatureMap,
+ TensorPurpose.FeatureMap,
+ TensorPurpose.FeatureMap,
+ ]
+ ),
+ ),
+ (
+ set(
+ (
+ "Dequantize",
+ "Quantize",
+ "QuantizeV2",
+ "QuantizedRelu",
+ "QuantizedRelu1",
+ "QuantizedRelu6",
+ "QuantizedAvgPool",
+ "QuantizedMaxPool",
+ "Slice",
+ "SplitV",
+ )
+ ),
+ purpose_from_list([TensorPurpose.FeatureMap, TensorPurpose.FeatureMap, TensorPurpose.FeatureMap]),
+ ),
+ (
+ set(("BatchToSpaceND", "SpaceToBatchND", "DepthToSpaceND", "SpaceToDepthND")),
+ purpose_from_list([TensorPurpose.FeatureMap, TensorPurpose.FeatureMap, TensorPurpose.FeatureMap]),
+ ),
+ (
+ set(("BlockLSTM",)),
+ purpose_from_list(
+ [
+ TensorPurpose.FeatureMap,
+ TensorPurpose.FeatureMap,
+ TensorPurpose.FeatureMap,
+ TensorPurpose.FeatureMap,
+ TensorPurpose.Weights,
+ TensorPurpose.FeatureMap,
+ TensorPurpose.FeatureMap,
+ TensorPurpose.FeatureMap,
+ TensorPurpose.FeatureMap,
+ ]
+ ),
+ ),
+ (set(("SplitSliceRead",)), purpose_from_list([TensorPurpose.FeatureMap, TensorPurpose.FeatureMap])),
+ (set(("Shape", "ConcatSliceWrite", "AudioSpectrogram")), purpose_from_list([TensorPurpose.FeatureMap])),
+ (
+ set(("StridedSlice",)),
+ purpose_from_list(
+ [TensorPurpose.FeatureMap, TensorPurpose.FeatureMap, TensorPurpose.FeatureMap, TensorPurpose.FeatureMap]
+ ),
+ ),
+ (set(("Fill", "Pack", "Range")), all_parameter),
+ (
+ set(("Requantize",)),
+ purpose_from_list(
+ [
+ TensorPurpose.FeatureMap,
+ TensorPurpose.FeatureMap,
+ TensorPurpose.FeatureMap,
+ TensorPurpose.FeatureMap,
+ TensorPurpose.FeatureMap,
+ ]
+ ),
+ ),
+ (set(("Placeholder", "SubgraphInput", "Const", "VariableV2")), purpose_from_list([])),
+ (set(("FakeQuantWithMinMaxArgs", "FakeQuantWithMinMaxVars")), input0_from_output_rest_parameter),
+ (
+ set(("Square", "Sqrt", "Log", "Less", "Enter", "Exit", "Identity", "StopGradient", "Merge", "Switch")),
+ inputs_from_output,
+ ),
+ (None, all_fm),
+]
+
+
+for ops, input_purpose in tensor_purposes:
+ if ops is None:
+ continue
+ for op in ops:
+ assert len(op) > 1, "string literal has been decomposed"
+
+
+def mark_tensor_purpose(nng, arch, verbose_tensor_purpose=False):
+ def mark_tensor_helper(tens, purpose):
+
+ if tens.purpose == TensorPurpose.Unknown or tens.purpose == purpose:
+ tens.purpose = purpose
+ else:
+ assert 0, "Cannot resolve tensor purpose %s and %s for tensor %s" % (tens.purpose, purpose, tens)
+ tens.mem_area = arch.tensor_storage_mem_area[tens.purpose]
+
+ if len(tens.ops) == 1 and tens.ops[0].type == "Const":
+ tens.mem_area = (
+ arch.permanent_storage_mem_area
+ ) # special case constants, as they must be in permanent storage
+
+ def rewrite_mark_tensor_purpose(op, arch):
+ # find disconnected outputs and mark as parameters
+ for tens in op.outputs:
+ if not tens.consumers():
+ mark_tensor_helper(tens, TensorPurpose.FeatureMap)
+
+ for ops, input_purpose in tensor_purposes:
+ if ops is None or op.type in ops:
+ if ops is None:
+ print(
+ "warning: don't know how to mark up purpose for",
+ op.type,
+ op.inputs,
+ "triggering all feature map fallback",
+ )
+ for idx, tens in enumerate(op.inputs):
+ purpose = input_purpose(op, idx)
+ mark_tensor_helper(tens, purpose)
+ break
+ return op
+
+ for sg in nng.subgraphs:
+ sg = rewrite_graph.rewrite_graph_pre_order(sg, arch, [], [rewrite_mark_tensor_purpose])
+ for tens in sg.output_tensors:
+ mark_tensor_helper(tens, TensorPurpose.FeatureMap)
+
+ if verbose_tensor_purpose:
+ nng.print_graph_with_tensors()
+
+ return nng
+
+
+reshape_operations = set(
+ (
+ "Reshape",
+ "QuantizedReshape",
+ "ExpandDims",
+ "Squeeze",
+ "BatchToSpaceND",
+ "SpaceToBatchND",
+ "DepthToSpaceND",
+ "SpaceToDepthND",
+ "Placeholder",
+ )
+)
+
+
+def mark_tensor_format(nng, arch, verbose_tensor_format=False):
+ formats_for_tensor = {}
+
+ def init_tens(tens):
+ if tens.purpose == TensorPurpose.FeatureMap:
+ fmt = arch.default_feature_map_format
+ elif tens.purpose == TensorPurpose.Weights:
+ fmt = arch.default_weight_format
+ else:
+ assert 0, "unknown tensor purpose %s" % (tens.purpose,)
+ return fmt
+
+ def find_npu_usage_of_tensor(tens):
+ for op in tens.consumers():
+ if op.type == "DMA":
+ return find_npu_usage_of_tensor(op.outputs[0])
+ if "npu_block_type" in op.attrs:
+ return op.attrs["npu_block_type"]
+ return NpuBlockType.Default
+
+ def visit_tens(tens, ps):
+ if tens not in formats_for_tensor:
+ fmt = init_tens(tens)
+ else:
+ fmt = formats_for_tensor[tens]
+
+ formats_for_tensor[tens] = fmt
+
+ for sg in nng.subgraphs:
+ for ps in sg.passes:
+ for tens in ps.outputs:
+ visit_tens(tens, ps)
+ for tens in ps.intermediates:
+ visit_tens(tens, ps)
+ for tens in ps.inputs:
+ visit_tens(tens, ps)
+
+ for tens, fmt in formats_for_tensor.items():
+ tens.set_format(fmt, arch)
+ if fmt == TensorFormat.WeightsCompressed and tens.values is not None:
+ npu_block_type = find_npu_usage_of_tensor(tens)
+ if len(tens.ops) == 1 and tens.ops[0].type == "DMA":
+ weight_compressor.compress_weights(tens, arch, npu_block_type, Block(32, 32, 32), 32)
+ # Alias compressed weights back into source tensor
+ src_tens = tens.ops[0].inputs[0]
+ src_tens.compressed_values = tens.compressed_values
+ src_tens.storage_shape = tens.storage_shape
+ src_tens.brick_size = tens.brick_size
+ src_tens.weight_compression_scales = tens.weight_compression_scales
+ src_tens.weight_compressed_offsets = tens.weight_compressed_offsets
+ src_tens.compression_scale_for_worst_weight_stream = tens.compression_scale_for_worst_weight_stream
+ src_tens.storage_compression_scale = tens.storage_compression_scale
+
+ if verbose_tensor_format:
+ nng.print_passes_with_tensors()
diff --git a/ethosu/vela/model_reader.py b/ethosu/vela/model_reader.py
new file mode 100644
index 00000000..6d7a3a4f
--- /dev/null
+++ b/ethosu/vela/model_reader.py
@@ -0,0 +1,45 @@
+# Copyright (C) 2020 Arm Limited or its affiliates. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the License); you may
+# not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an AS IS BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+# Description:
+# Dispatcher for reading a neural network model.
+
+
+class ModelReaderOptions:
+ def __init__(self, batch_size=1):
+ self.batch_size = batch_size
+
+ def __str__(self):
+ return type(self).__name__ + ": " + str(self.__dict__)
+
+ __repr__ = __str__
+
+
+def read_model(fname, options, feed_dict={}, output_node_names=[], initialisation_nodes=[]):
+ if fname.endswith(".tflite"):
+ from . import tflite_reader
+
+ nng = tflite_reader.read_tflite(
+ fname,
+ options.batch_size,
+ feed_dict=feed_dict,
+ output_node_names=output_node_names,
+ initialisation_nodes=initialisation_nodes,
+ )
+ else:
+ assert 0, "Unknown model format"
+ return nng
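+
+
+# Minimal usage sketch (illustration only; "model.tflite" is a placeholder path): dispatch is done
+# purely on the file extension, and TFLite is currently the only supported input format.
+def _example_read_tflite_model():
+    options = ModelReaderOptions(batch_size=1)
+    return read_model("model.tflite", options)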
diff --git a/ethosu/vela/nn_graph.py b/ethosu/vela/nn_graph.py
new file mode 100644
index 00000000..8d335bd8
--- /dev/null
+++ b/ethosu/vela/nn_graph.py
@@ -0,0 +1,548 @@
+# Copyright (C) 2020 Arm Limited or its affiliates. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the License); you may
+# not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an AS IS BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+# Description:
+# Neural network graph classes and enums.
+# Pass - A packed pass containing one or more Operations.
+# CascadedPass - A scheduled pass containing one or more Passes, as well as a scheduling strategy and block
+# configurations.
+# Subgraph - Holds a neural network subgraph, pointing at Tensors, Operations, Passes, and CascadedPasses.
+# Graph - A full neural network graph with one or more Subgraphs.
+
+import enum
+from .data_type import BaseType, DataType
+from .tensor import MemArea, TensorPurpose, TensorSubPurpose, TensorFormat, Tensor
+from .operation import Operation, NpuBlockType
+
+
+class PassPlacement(enum.Enum):
+ Unknown = 0
+ Cpu = 1
+ Npu = 2
+ MemoryOnly = 3
+ StartupInit = 4
+
+
+class TensorAllocator(enum.Enum):
+ LinearAlloc = 1
+ Greedy = 2
+
+ def __str__(self):
+ return self.name
+
+
+class Pass:
+ def __init__(self, name, placement, is_element_wise, npu_block_type):
+ self.inputs = []
+ self.intermediates = []
+ self.outputs = []
+ self.ops = []
+ self.primary_op = None
+ self.ifm_tensor = None
+ self.ifm2_tensor = None
+ self.ofm_tensor = None
+ self.weight_tensor = None
+ self.scale_tensor = None
+ self.name = name
+ self.cascade = None
+ self.placement = placement
+
+        # TODO: rename is_element_wise because it is not the same as an ElementWise operator. It is used by the tensor
+        # allocation and requires that the OFM and IFM have the exact same address, i.e. they completely overlap.
+ self.is_element_wise = is_element_wise
+ self.npu_block_type = npu_block_type
+ self.block_config = None # will be filled in by scheduler
+ self.shared_buffer = None # will be filled in by scheduler
+
+ self.predecessors = []
+ self.successors = []
+
+ def __str__(self):
+ return "<nng.Pass '%s', %s, ops=%s>" % (self.name, self.placement, [op.type for op in self.ops])
+
+ __repr__ = __str__
+
+ def get_primary_op_ifm_weights(self):
+ if not self.primary_op:
+ return None, None
+ return self.primary_op.get_ifm_ifm2_weights_ofm()[::2]
+
+ def get_primary_op_ifm_ifm2_weights_ofm(self):
+ if not self.primary_op:
+ return None, None, None, None
+ return self.primary_op.get_ifm_ifm2_weights_ofm()
+
+ def get_primary_op_ifm_weights_biases_ofm(self):
+ if not self.primary_op:
+ return None, None, None, None
+ return self.primary_op.get_ifm_weights_biases_ofm()
+
+
+class SchedulingStrategy(enum.Enum):
+ Unknown = -1
+ IfmStream = 0
+ WeightStream = 1
+
+
+class SchedulerRewrite(enum.Enum):
+ Nop = 0
+ ChangeTensorSubPurpose = 1
+
+
+class CascadedPass:
+ def __init__(self, name, strat, inputs, intermediates, outputs, passes, placement, is_element_wise):
+ self.name = name
+ self.strategy = strat
+ self.inputs = inputs
+ self.intermediates = intermediates
+ self.outputs = outputs
+ self.passes = passes
+ self.placement = placement
+ self.is_element_wise = is_element_wise
+
+ self.predecessors = []
+ self.successors = []
+
+ def __str__(self):
+ return "<nng.CascadedPass strategy=%s x %s '%s', passes=%s, block_configs=%s>" % (
+ self.strategy,
+ len(self.passes),
+ self.name,
+ [ps.name for ps in self.passes],
+ [ps.block_config for ps in self.passes],
+ )
+
+ __repr__ = __str__
+
+
+class Subgraph:
+ def __init__(self, name="<unnamed>", placement=PassPlacement.Cpu):
+ self.output_tensors = []
+ self.input_tensors = []
+ self.original_inputs = [] # Preserve the original input order
+ self.passes = []
+ self.cascaded_passes = []
+ self.name = name
+ self.high_level_command_stream = []
+ self.placement = placement
+ self.command_stream_tensor = None
+ self.flash_tensor = None
+
+ self.memory_used = {}
+
+ def __str__(self):
+ return "<nng.Subgraph '%s', n_passes=%d, n_cascaded_passes=%d>" % (
+ self.name,
+ len(self.passes),
+ len(self.cascaded_passes),
+ )
+
+ __repr__ = __str__
+
+ def update_consumers(self):
+ visit_op_set = set()
+ visit_tensor_set = set()
+ self.input_tensors = []
+
+ print_visit = False
+
+ def visit_op(op):
+ if op in visit_op_set:
+ return
+
+ visit_op_set.add(op)
+ for inp in op.inputs:
+ if print_visit:
+ print(inp, "adding consumer", op)
+ visit_tensor(inp)
+ inp.consumer_list.append(op)
+
+ if op.type in set(("Placeholder", "SubgraphInput")):
+ assert len(op.outputs) == 1
+ self.input_tensors.append(op.outputs[0])
+
+ for out in op.outputs:
+ if out not in visit_tensor_set:
+ out.consumer_list = [] # reset unvisited output, just in case
+
+ def visit_tensor(tens):
+ if tens in visit_tensor_set:
+ return
+ visit_tensor_set.add(tens)
+ tens.consumer_list = []
+ for op in tens.ops:
+ visit_op(op)
+
+ for ps in self.passes:
+ for tens in ps.outputs + ps.inputs:
+ tens.consumer_list = [] # reset unvisited tensors to start with
+
+ for tens in self.output_tensors:
+ visit_tensor(tens)
+ tens.consumer_list.append(None) # special op to indicate that the graph consumes the result
+
+ print_visit = True
+ for ps in self.passes:
+ for op in ps.ops:
+ visit_op(op)
+ for tens in ps.inputs:
+ visit_tensor(tens)
+
+ def build_pass_links(self):
+ for idx, ps in enumerate(self.passes):
+ ps.time = 2 * idx
+ ps.predecessors = []
+ ps.successors = []
+
+ for ps in self.passes:
+ for tens in ps.inputs:
+ for op in tens.ops:
+ pred_pass = op.scheduled_pass
+ assert pred_pass.time < ps.time
+ if ps not in pred_pass.successors:
+ pred_pass.successors.append(ps)
+
+ if pred_pass not in ps.predecessors:
+ ps.predecessors.append(pred_pass)
+
+ assert tens in pred_pass.outputs
+
+ def build_pass_dag_predecessors(self):
+ for ps in self.passes:
+ ps.dag_predecessors = []
+
+ class State(enum.Enum):
+ NotVisited = 0
+ BeingVisited = 1
+ Visited = 2
+
+ pass_visit_dict = {}
+
+ def visit_pass(ps):
+ state = pass_visit_dict.get(ps, State.NotVisited)
+ if state == State.Visited:
+ return True
+ elif state == State.BeingVisited:
+ return False # this is a loop, need to remove this link
+ elif state == State.NotVisited:
+ pass_visit_dict[ps] = State.BeingVisited
+
+ ps.dag_predecessors = []
+ for pred in ps.predecessors:
+ if visit_pass(pred):
+ ps.dag_predecessors.append(pred)
+
+ pass_visit_dict[ps] = State.Visited
+ return True
+
+ for ps in self.passes:
+ if not ps.successors:
+ visit_pass(ps)
+
+ def build_cascaded_pass_links(self):
+ for cps in self.cascaded_passes:
+ cps.predecessors = []
+ cps.successors = []
+
+ for cps in self.cascaded_passes:
+ for tens in cps.inputs:
+ for op in tens.ops:
+ pred_cpass = op.scheduled_pass.cascade
+ if cps not in pred_cpass.successors:
+ pred_cpass.successors.append(cps)
+
+ if pred_cpass not in cps.predecessors:
+ cps.predecessors.append(pred_cpass)
+
+ assert tens in pred_cpass.outputs
+
+ def refresh_after_modification(self):
+ self.update_consumers()
+
+ def prune_startup_init_pass(self):
+ assert len(self.passes) >= 1
+ ps = self.passes[0]
+ assert ps.placement == PassPlacement.StartupInit
+
+ ps.outputs = [out_tens for out_tens in ps.outputs if len(out_tens.consumers()) > 0]
+ ps.ops = [op for op in ps.ops if op.outputs[0] in ps.outputs]
+
+ def get_all_ops(self):
+ all_ops = []
+ visit_op_set = set()
+ visit_tensor_set = set()
+
+ def visit_op(op):
+ if op in visit_op_set:
+ return
+ visit_op_set.add(op)
+ for inp in op.inputs:
+ visit_tensor(inp)
+
+ all_ops.append(op)
+
+ def visit_tensor(tens):
+ if tens in visit_tensor_set:
+ return
+ visit_tensor_set.add(tens)
+ for op in tens.ops:
+ visit_op(op)
+
+ for tens in self.output_tensors:
+ visit_tensor(tens)
+
+ return all_ops
+
+ def print_operators(self):
+ all_ops = self.get_all_ops()
+ unique_ops = []
+ print("print_operators")
+ for op in all_ops:
+ if op.type in set(("Const", "Identity", "Placeholder")):
+ continue
+
+ attrs = op.attrs
+ if (
+ op.type == "Conv2D"
+ or op.type == "DepthwiseConv2dNative"
+ or op.type == "Conv2DBiasAct"
+ or op.type == "DepthwiseConv2dBiasAct"
+ ):
+ kshape = op.inputs[1].shape
+ attrs["kshape"] = [kshape[0], kshape[1]]
+ attrs["type"] = op.type
+ attrs.pop("use_cudnn_on_gpu", None)
+ if attrs not in unique_ops:
+ unique_ops.append(attrs)
+ # print attributes in human readable format
+ a = attrs.copy()
+ s = a.pop("type")
+ data_format = a.pop("data_format", None)
+ if data_format and data_format != b"NHWC":
+ s += " " + str(data_format)
+ t = a.pop("T", None)
+ if t:
+ s += " " + str(t)[9:-2]
+ srct = a.pop("SrcT", None)
+ if srct:
+ s += " " + str(srct)[9:-2]
+ dstt = a.pop("DstT", None)
+ if dstt:
+ s += "->" + str(dstt)[9:-2]
+ print(s + " " + str(a))
+
+ def print_graph(self):
+ all_ops = self.get_all_ops()
+ for idx, op in enumerate(all_ops):
+ print(idx, op.type, op.name)
+
+ def print_graph_with_tensors(self):
+ all_ops = self.get_all_ops()
+ for idx, op in enumerate(all_ops):
+ print(idx, op.type, op.name)
+ for idx, tens in enumerate(op.inputs):
+ print(" Input %02d %20s %20s %s" % (idx, tens.purpose.name, tens.mem_area.name, tens))
+ for idx, tens in enumerate(op.outputs):
+ print(" Output %02d %20s %20s %s" % (idx, tens.purpose.name, tens.mem_area.name, tens))
+ print()
+
+ def print_graph_with_tensor_quantization(self):
+ all_ops = self.get_all_ops()
+ for idx, op in enumerate(all_ops):
+ print(idx, op.type, op.name)
+ for idx, tens in enumerate(op.inputs):
+ q = tens.quantization
+ if q is None:
+ print(" Input %02d %10s NO QUANTIZATION INFO %s" % (idx, tens.dtype, tens.name))
+ else:
+ print(
+ " Input %02d %10s min=%s max=%s scale=%s zero_point=%s %s"
+ % (idx, tens.dtype, q.min, q.max, q.scale_f32, q.zero_point, tens.name)
+ )
+ for idx, tens in enumerate(op.outputs):
+ q = tens.quantization
+ if q is None:
+ print(" Output %02d %10s NO QUANTIZATION INFO %s" % (idx, tens.dtype, tens.name))
+ else:
+ print(
+ " Output %02d %10s min=%s max=%s scale=%s zero_point=%s %s"
+ % (idx, tens.dtype, q.min, q.max, q.scale_f32, q.zero_point, tens.name)
+ )
+ print()
+
+ def print_passes(self):
+ for idx, ps in enumerate(self.passes):
+ print("%03d %s" % (idx * 2, ps))
+
+ def print_passes_with_tensors(self):
+ for idx, ps in enumerate(self.passes):
+ print("%3d %s" % (idx * 2, ps))
+ for idx, tens in enumerate(ps.inputs):
+ print(
+ " Input %2d %-15s %-15s %-15s %s"
+ % (idx, tens.purpose.name, tens.mem_area.name, tens.format.name, tens.name)
+ )
+ for idx, tens in enumerate(ps.intermediates):
+ print(
+ " Intermediate %2d %-15s %-15s %-15s %s"
+ % (idx, tens.purpose.name, tens.mem_area.name, tens.format.name, tens.name)
+ )
+ for idx, tens in enumerate(ps.outputs):
+ print(
+ " Output %2d %-15s %-15s %-15s %s"
+ % (idx, tens.purpose.name, tens.mem_area.name, tens.format.name, tens.name)
+ )
+ print()
+
+ def print_cascaded_passes(self):
+ for idx, ps in enumerate(self.cascaded_passes):
+ print("%3d %s SRAM used %.1f KB" % (idx * 2, ps, ps.sram_used / 1024))
+
+ def print_cascaded_passes_with_tensors(self):
+ for idx, ps in enumerate(self.cascaded_passes):
+ print("%3d %s SRAM used %.1f KB" % (idx * 2, ps, ps.sram_used / 1024))
+ for idx, tens in enumerate(ps.inputs):
+ print(
+ " Input %2d %-15s %-15s %-15s %s"
+ % (idx, tens.purpose.name, tens.mem_area.name, tens.format.name, tens.name)
+ )
+ for idx, tens in enumerate(ps.intermediates):
+ print(
+ " Intermediate %2d %-15s %-15s %-15s %s"
+ % (idx, tens.purpose.name, tens.mem_area.name, tens.format.name, tens.name)
+ )
+ for idx, tens in enumerate(ps.outputs):
+ print(
+ " Output %2d %-15s %-15s %-15s %s"
+ % (idx, tens.purpose.name, tens.mem_area.name, tens.format.name, tens.name)
+ )
+ print()
+
+ def print_cascaded_passes_with_tensor_sizes(self):
+ for idx, ps in enumerate(self.cascaded_passes):
+ print("%3d %s SRAM used %.1f KB" % (idx * 2, ps, ps.sram_used / 1024))
+ for idx, tens in enumerate(ps.inputs):
+ print(
+ " Input %2d %7.1f KB %-24s %-15s %-15s %-20s %s"
+ % (
+ idx,
+ tens.storage_size() / 1024,
+ tens.storage_shape,
+ tens.mem_area.name,
+ tens.purpose.name,
+ tens.format.name,
+ tens.name,
+ )
+ )
+ for idx, tens in enumerate(ps.intermediates):
+ print(
+ " Intermediate %2d %7.1f KB %-24s %-15s %-15s %-20s %s"
+ % (
+ idx,
+ tens.storage_size() / 1024,
+ tens.storage_shape,
+ tens.mem_area.name,
+ tens.purpose.name,
+ tens.format.name,
+ tens.name,
+ )
+ )
+ for idx, tens in enumerate(ps.outputs):
+ print(
+ " Output %2d %7.1f KB %-24s %-15s %-15s %-20s %s"
+ % (
+ idx,
+ tens.storage_size() / 1024,
+ tens.storage_shape,
+ tens.mem_area.name,
+ tens.purpose.name,
+ tens.format.name,
+ tens.name,
+ )
+ )
+ print()
+
+ def print_high_level_command_stream(self):
+ for idx, cmd in enumerate(self.high_level_command_stream):
+ print("%3d %s" % (idx, cmd))
+
+
+class Graph:
+ def __init__(self, name="<unnamed>", batch_size=1):
+ self.name = name
+ self.batch_size = batch_size
+ self.subgraphs = []
+
+ self.memory_used = {}
+ self.bits_per_element = {}
+ self.total_size = {}
+ self.total_elements = {}
+
+ def get_root_subgraph(self):
+ return self.subgraphs[0]
+
+ def prune_startup_init_pass(self):
+ for sg in self.subgraphs:
+ sg.prune_startup_init_pass()
+
+ def update_consumers(self):
+ for sg in self.subgraphs:
+ sg.update_consumers()
+
+ def refresh_after_modification(self):
+ for sg in self.subgraphs:
+ sg.refresh_after_modification()
+
+ def print_operators(self):
+ for sg in self.subgraphs:
+ sg.print_operators()
+
+ def print_graph(self):
+ for sg in self.subgraphs:
+ sg.print_graph()
+
+ def print_graph_with_tensors(self):
+ for sg in self.subgraphs:
+ sg.print_graph_with_tensors()
+
+ def print_graph_with_tensor_quantization(self):
+ for sg in self.subgraphs:
+ sg.print_graph_with_tensor_quantization()
+
+ def print_passes(self):
+ for sg in self.subgraphs:
+ sg.print_passes()
+
+ def print_passes_with_tensors(self):
+ for sg in self.subgraphs:
+ sg.print_passes_with_tensors()
+
+ def print_cascaded_passes(self):
+ for sg in self.subgraphs:
+ sg.print_cascaded_passes()
+
+ def print_cascaded_passes_with_tensors(self):
+ for sg in self.subgraphs:
+ sg.print_cascaded_passes_with_tensors()
+
+ def print_cascaded_passes_with_tensor_sizes(self):
+ for sg in self.subgraphs:
+ sg.print_cascaded_passes_with_tensor_sizes()
+
+ def print_high_level_command_stream(self):
+ for sg in self.subgraphs:
+ sg.print_high_level_command_stream()
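+
+
+if __name__ == "__main__":
+    # Minimal illustrative sketch of the containment hierarchy described at the top of this
+    # file (not used by Vela itself): a Graph owns Subgraphs, and the first Subgraph is
+    # treated as the root. Runnable via "python -m ethosu.vela.nn_graph".
+    example_graph = Graph("example_network", batch_size=1)
+    example_graph.subgraphs.append(Subgraph("example_subgraph", PassPlacement.Cpu))
+    assert example_graph.get_root_subgraph() is example_graph.subgraphs[0]
+    example_graph.print_graph()  # no operators yet, so this prints nothing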
diff --git a/ethosu/vela/npu_performance.py b/ethosu/vela/npu_performance.py
new file mode 100644
index 00000000..84cc4931
--- /dev/null
+++ b/ethosu/vela/npu_performance.py
@@ -0,0 +1,516 @@
+# Copyright (C) 2020 Arm Limited or its affiliates. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the License); you may
+# not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an AS IS BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+# Description:
+# NPU performance estimation functions to estimate performance of a Pass and CascadedPass. Uses a model that takes the
+# maximum of the 'cycles required for bandwidth' and 'cycles required for computing'.
+#
+# Called during scheduling to evaluate different proposals, as well as post-scheduling to provide a final performance
+# estimate.
+
+import enum
+from . import numeric_util
+import numpy as np
+from .tensor import TensorPurpose, MemArea, TensorFormat, shape_num_elements, Tensor, TensorBlockTraversal
+from .operation import Operation
+from .data_type import DataType, BaseType
+from .nn_graph import PassPlacement, NpuBlockType, SchedulerRewrite, Pass
+from .architecture_features import Block, Kernel
+
+
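+# Illustrative sketch of the cycle model described above (not used by the estimator itself):
+# a pass is assumed to cost the maximum of a bandwidth-bound and a compute-bound cycle count.
+# The bytes/cycle and MACs/cycle figures below are hypothetical example numbers.
+def _example_cycle_estimate(total_bytes, total_macs, bytes_per_cycle=16.0, macs_per_cycle=256.0):
+    bandwidth_cycles = total_bytes / bytes_per_cycle
+    compute_cycles = total_macs / macs_per_cycle
+    return max(bandwidth_cycles, compute_cycles)
+
+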
+def rolling_buffer_dims_from_passes(arch, ps1, block_config_ps1, ps2, block_config_ps2):
+ ps2_strides = (1, 1, 1, 1)
+ ps2_dilation = (1, 1, 1, 1)
+ for op in ps2.ops:
+ if "strides" in op.attrs:
+ ps2_strides = op.attrs["strides"]
+ if "dilation" in op.attrs:
+ ps2_dilation = op.attrs["dilation"]
+
+ ifm_idx, _, weight_idx, _, _ = op.get_ifm_ifm2_weight_bias_ofm_indices()
+
+ rolling_buffer_sizes = []
+
+ weight_tensor = op.inputs[weight_idx]
+
+ ofm_block = Block(block_config_ps2[-3], block_config_ps2[-4], block_config_ps2[-1])
+ kernel = Kernel(
+ weight_tensor.shape[1], weight_tensor.shape[0], ps2_strides[2], ps2_strides[1], ps2_dilation[2], ps2_dilation[1]
+ )
+ kernel_block = Block(weight_tensor.shape[1], weight_tensor.shape[0], 65536)
+
+ if ps2.npu_block_type in set((NpuBlockType.ConvolutionMxN, NpuBlockType.VectorProduct)):
+ ifm_block_depth = arch.calc_ifm_block_depth(
+ op.inputs[ifm_idx].shape[-1], op.inputs[ifm_idx].dtype.size_in_bits()
+ )
+ else:
+ ifm_block_depth = block_config_ps2[-1]
+
+ ifm_block = arch.get_ifm_block_size(ifm_block_depth, ofm_block, kernel, kernel_block)
+
+    # The height calculated here assumes the worst case
+ height = numeric_util.round_up(ifm_block.height + block_config_ps1[0], block_config_ps1[0])
+ width = ifm_block.width
+
+ rolling_buffer_sizes.append(height)
+ rolling_buffer_sizes.append(width)
+
+ return rolling_buffer_sizes
+
+
+class PassCycles(enum.IntEnum):
+ Dpu = 0
+ ElementWise = 1
+ Cpu = 2
+ SramAccess = 3
+ TotalPerPass = 4
+ DramAccess = 5
+ OnChipFlashAccess = 6
+ OffChipFlashAccess = 7
+ Total = 8
+ Size = 9
+
+ def display_name(self):
+ return (
+ "DPU",
+ "Element wise",
+ "CPU",
+ "SRAM Access",
+ "Total per Pass",
+ "DRAM Access",
+ "On-chip Flash Access",
+ "Off-chip Flash Access",
+ "Total",
+ "Size",
+ )[self.value]
+
+ def identifier_name(self):
+ return (
+ "dpu",
+ "element_wise",
+ "cpu",
+ "sram_access",
+ "total_per_pass",
+ "dram_access",
+ "on_chip_flash_access",
+ "off_chip_flash_access",
+ "total",
+ "size",
+ )[self.value]
+
+ @staticmethod
+ def all():
+ return (
+ PassCycles.Dpu,
+ PassCycles.ElementWise,
+ PassCycles.Cpu,
+ PassCycles.SramAccess,
+ PassCycles.DramAccess,
+ PassCycles.OnChipFlashAccess,
+ PassCycles.OffChipFlashAccess,
+ PassCycles.Total,
+ )
+
+
+class MacCount(enum.IntEnum):
+ NeuralNetworkMacs = 0
+ HardwareMacs = 1
+ Size = 2
+
+ def display_name(self):
+ return ("Neural Network Macs", "Hardware Macs", "Size")[self.value]
+
+ def identifier_name(self):
+ return ("nn_macs", "hardware_macs", "size")[self.value]
+
+ @staticmethod
+ def all():
+ return (MacCount.NeuralNetworkMacs, MacCount.HardwareMacs)
+
+
+class BandwidthDirection(enum.IntEnum):
+ Read = 0
+ Write = 1
+ Size = 2
+
+ def display_name(self):
+ return self.name
+
+ def identifier_name(self):
+ return self.name.lower()
+
+ @staticmethod
+ def all():
+ return (BandwidthDirection.Read, BandwidthDirection.Write)
+
+
+def make_bandwidth_array():
+ return np.zeros((MemArea.Size, TensorPurpose.Size, BandwidthDirection.Size))
+
+
+def make_macs_array():
+ return np.zeros(MacCount.Size, np.int)
+
+
+def make_cycles_array():
+ return np.zeros(PassCycles.Size)
+
+
+def make_metrics_arrays():
+ return (make_bandwidth_array(), make_macs_array(), make_cycles_array())
+
+
+def get_n_blocks_and_area(
+ ifm_brick_size, ifm_height_width, orig_skirt, clamped_skirt, block_config, min_block_size, strides
+):
+
+ ifm_block_config = (block_config[0] * strides[1], block_config[1] * strides[2])
+
+ n_normal_blocks = []
+ remainder_size = []
+ for i in range(2):
+ non_skirt_dim = ifm_height_width[i] - orig_skirt[i] - orig_skirt[2 + i]
+ n_blocks = non_skirt_dim // ifm_block_config[i]
+ n_normal_blocks.append(n_blocks)
+ remainder_dim = numeric_util.round_up(
+ ((non_skirt_dim - n_blocks * ifm_block_config[i] - 1) // strides[i + 1]) + 1, min_block_size[i]
+ )
+ remainder_size.append(remainder_dim)
+
+ # this will actually calculate reads into the edge padding.
+
+ # there are four cases in total, handling the edges that will not fill a complete block.
+
+ # 0000000001
+ # 0000000001
+ # 0000000001
+ # 0000000001
+ # 0000000001
+ # 0000000001
+ # 2222222223
+ total_blocks = 0
+ total_area = 0
+
+ block_setup = (
+ (n_normal_blocks[0] * n_normal_blocks[1], block_config),
+ (1 * n_normal_blocks[1], (remainder_size[0], block_config[1])),
+ (n_normal_blocks[0] * 1, (block_config[0], remainder_size[1])),
+ (1 * 1, remainder_size),
+ )
+
+ for n_blocks, block_size in block_setup:
+ if block_size[0] == 0 or block_size[1] == 0:
+ continue
+ read_dims = [0, 0]
+ for i in range(2):
+ read_dims[i] = (
+ numeric_util.round_up(clamped_skirt[i], ifm_brick_size[i + 1])
+ + block_size[i] * strides[i + 1]
+ + numeric_util.round_up(clamped_skirt[2 + i], ifm_brick_size[i + 1])
+ )
+ assert n_blocks >= 0
+ total_blocks += n_blocks
+ total_area += n_blocks * read_dims[0] * read_dims[1]
+ assert total_blocks >= 1
+ return total_blocks, total_area, block_setup
+
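+
+# Hedged, self-contained check of the block decomposition above, using hypothetical sizes:
+# a 16x16 IFM with no skirt, unit strides and a 4x4 block config is covered by 16 full
+# blocks and no edge remainders, each block reading a 4x4 input area.
+def _example_block_decomposition():
+    blocks, area, setup = get_n_blocks_and_area(
+        ifm_brick_size=(1, 1, 1, 16),
+        ifm_height_width=(16, 16),
+        orig_skirt=(0, 0, 0, 0),
+        clamped_skirt=[0, 0, 0, 0],
+        block_config=(4, 4, 8, 8),
+        min_block_size=(1, 1),
+        strides=(1, 1, 1, 1),
+    )
+    assert blocks == 16 and area == 16 * 4 * 4
+    return setup
+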
+
+def performance_metrics_for_pass(arch, ps, block_config=None, rewrite_list=[], force_outputs_to_fast_storage=False):
+ if block_config is None:
+ block_config = ps.block_config
+ bws = make_bandwidth_array()
+ macs = make_macs_array()
+ cycles = make_cycles_array()
+ blocks = 0
+ ifm_read_multiple = 1
+ weight_read_multiple = 0
+
+ if ps.placement in set((PassPlacement.MemoryOnly, PassPlacement.StartupInit)):
+ return bws, macs, cycles, blocks, ifm_read_multiple, weight_read_multiple # nothing real happening in this pass
+
+ min_block_size = arch.min_block_sizes[ps.npu_block_type]
+
+ skirt = (0, 0, 0, 0)
+ explicit_padding = (0, 0, 0, 0)
+ primary_op = ps.primary_op
+ replacement_read_bws = {}
+ if primary_op:
+ skirt = primary_op.attrs.get("skirt", skirt)
+ explicit_padding = primary_op.attrs.get("explicit_padding", explicit_padding)
+ assert primary_op.attrs["npu_block_type"] == ps.npu_block_type
+ npu_block_type = primary_op.attrs["npu_block_type"]
+
+ ifm_tensor, _, weight_tensor, ofm_tensor = ps.get_primary_op_ifm_ifm2_weights_ofm()
+
+ npu_convolution_ops = set((NpuBlockType.ConvolutionMxN, NpuBlockType.ConvolutionDepthWise))
+ if (npu_block_type == NpuBlockType.Pooling and len(ifm_tensor.shape) == 4) or (
+ npu_block_type in npu_convolution_ops
+ ):
+
+ batch_size = ifm_tensor.shape[0]
+ ifm_tensor_shape = list(ifm_tensor.shape)
+ ifm_depth = ifm_tensor.bandwidth_shape[3]
+
+ # add in padding
+ ifm_tensor_shape[1] += explicit_padding[0] + explicit_padding[2] # height += top and bottom
+ ifm_tensor_shape[2] += explicit_padding[1] + explicit_padding[3] # width += left and right
+
+ strides = primary_op.attrs["strides"]
+ if npu_block_type != NpuBlockType.Pooling:
+ weight_tensor_shape = weight_tensor.shape
+ weight_tensor_bandwidth_shape = weight_tensor.bandwidth_shape
+ weight_tensor_element_size = weight_tensor.element_size()
+ weight_tensor_bandwidth_compression_scale = weight_tensor.bandwidth_compression_scale
+ nn_ops = (
+ int(ofm_tensor.shape[0])
+ * int(ofm_tensor.shape[1])
+ * int(ofm_tensor.shape[2])
+ * int(weight_tensor_shape[0])
+ * int(weight_tensor_shape[1])
+ * int(weight_tensor_shape[2])
+ * int(weight_tensor_shape[3])
+ / int(strides[1])
+ / int(strides[2])
+ )
+ else:
+ weight_tensor_shape = [
+ primary_op.attrs["ksize"][1],
+ primary_op.attrs["ksize"][2],
+ 1,
+ ifm_tensor_shape[3],
+ ]
+ weight_tensor_bandwidth_shape = weight_tensor_shape
+ weight_tensor_element_size = 0
+ weight_tensor_bandwidth_compression_scale = 0.0
+ nn_ops = 0 # pooling doesn't count as NN ops
+
+ kernel_dims = weight_tensor_shape[:2]
+
+ sub_kernel_limits = arch.sub_kernel_limits[npu_block_type]
+ # count the sub kernels; the IFM block needs to be refetched for each of them
+ n_sub_kernels_y = numeric_util.round_up_divide(kernel_dims[0], sub_kernel_limits[0])
+ n_sub_kernels_x = numeric_util.round_up_divide(kernel_dims[1], sub_kernel_limits[1])
+ n_sub_kernels = n_sub_kernels_y * n_sub_kernels_x
+
+ clamped_skirt = list(skirt)
+ clamped_skirt[2] = min(clamped_skirt[2], sub_kernel_limits[0] - 1 - clamped_skirt[0])
+ clamped_skirt[3] = min(clamped_skirt[3], sub_kernel_limits[1] - 1 - clamped_skirt[1])
+ n_blocks, area, block_setup = get_n_blocks_and_area(
+ ifm_tensor.brick_size,
+ ifm_tensor_shape[1:3],
+ skirt,
+ clamped_skirt,
+ block_config,
+ min_block_size,
+ strides,
+ )
+
+ blocks = n_blocks * numeric_util.round_up_divide(weight_tensor_shape[3], block_config[3])
+
+ n_weight_stages = numeric_util.round_up_divide(weight_tensor_bandwidth_shape[3], block_config[3])
+ if npu_block_type == NpuBlockType.ConvolutionDepthWise or npu_block_type == NpuBlockType.Pooling:
+ n_weight_stages = 1 # force to no reread
+
+ ifm_tensor_bw = (
+ n_sub_kernels
+ * batch_size
+ * area
+ * ifm_depth
+ * n_weight_stages
+ * ifm_tensor.element_size()
+ * ifm_tensor.bandwidth_compression_scale
+ )
+ replacement_read_bws[ifm_tensor] = ifm_tensor_bw
+ ifm_read_multiple = n_weight_stages
+
+ replacement_read_bws[weight_tensor] = (
+ batch_size
+ * shape_num_elements(weight_tensor_bandwidth_shape)
+ * weight_tensor_element_size
+ * weight_tensor_bandwidth_compression_scale
+ * n_blocks
+ ) # read once per block and batch
+ weight_read_multiple = n_blocks
+
+ n_kernel_xy = kernel_dims[0] * kernel_dims[1]
+ n_input_channels_at_a_time = block_config[2]
+
+ if npu_block_type == NpuBlockType.Pooling or weight_tensor.block_traversal in set(
+ (TensorBlockTraversal.PartKernelFirst, TensorBlockTraversal.DepthWise)
+ ):
+ n_input_channels_at_a_time = numeric_util.round_up_divide(n_input_channels_at_a_time, 4)
+ n_kernel_xy = max(
+ n_kernel_xy, 4
+ ) # need at least 4, as this is the minimum duty cycle for secondary accumulator writes
+ if weight_tensor is not None:
+ n_kernel_xy = numeric_util.round_up(
+ n_kernel_xy, 4
+ ) # weights need to be read in blocks of 4
+
+ num_mac_ops = 0
+ for n_blocks_for_size, block_size in block_setup:
+ num_mac_ops += (
+ batch_size
+ * n_blocks_for_size
+ * block_size[0]
+ * block_size[1]
+ * numeric_util.round_up(weight_tensor_shape[2], n_input_channels_at_a_time)
+ * numeric_util.round_up(weight_tensor_shape[3], block_config[3])
+ * n_kernel_xy
+ )
+
+ if npu_block_type == NpuBlockType.Pooling:
+ # TODO: improve pooling estimation
+ cycles[PassCycles.Dpu] = num_mac_ops / arch.num_macs_per_cycle / 2
+ else:
+ cycles[PassCycles.Dpu] = num_mac_ops / arch.num_macs_per_cycle
+ macs[MacCount.NeuralNetworkMacs] += nn_ops
+ macs[MacCount.HardwareMacs] += num_mac_ops
+
+ elif npu_block_type == NpuBlockType.VectorProduct:
+ nn_macs = (
+ ifm_tensor.shape[0]
+ * numeric_util.round_up(weight_tensor.shape[-2], block_config[2])
+ * numeric_util.round_up(weight_tensor.shape[-1], block_config[3])
+ )
+ num_mac_ops = nn_macs
+
+ cycles[PassCycles.Dpu] = num_mac_ops / arch.num_macs_per_cycle
+ macs[MacCount.NeuralNetworkMacs] += nn_macs
+ macs[MacCount.HardwareMacs] += num_mac_ops
+
+ blocks = 1 * numeric_util.round_up_divide(weight_tensor.shape[-1], block_config[3])
+
+ non_zero_fraction = 1.0
+ if ifm_tensor.values is not None:
+ nz_vector = np.amax(ifm_tensor.values != 0, axis=0) # max across batch axis
+ non_zero_fraction = np.average(nz_vector)
+
+ replacement_read_bws[ifm_tensor] = ifm_tensor.bandwidth()
+ replacement_read_bws[weight_tensor] = weight_tensor.bandwidth() * non_zero_fraction
+ ifm_read_multiple = 1
+ weight_read_multiple = non_zero_fraction
+ else:
+ if ps.placement == PassPlacement.Npu and len(ps.outputs):
+ # Assume element-wise operation going through the element pipelines.
+ # Work out how many elements we have and calculate performance.
+ out = ps.outputs[0]
+ elms = out.elements()
+
+ cycles[PassCycles.ElementWise] = numeric_util.round_up_divide(elms, arch.num_elem_wise_units)
+
+ if ps.placement == PassPlacement.Cpu:
+ cycles[PassCycles.Cpu] = arch.cpu_cycle_estimate(ps.ops[0])
+
+ # apply the desired rewrites
+ for rewrite_op, tens, _, _, _, ps_to_rewrite in rewrite_list:
+ if ps != ps_to_rewrite:
+ continue
+ if rewrite_op == SchedulerRewrite.Nop:
+ pass # these are fine, no bandwidth changes
+ elif rewrite_op in (SchedulerRewrite.ChangeTensorSubPurpose,):
+ bws[arch.fast_storage_mem_area][tens.purpose][BandwidthDirection.Read] += replacement_read_bws[tens]
+ replacement_read_bws[tens] = 0
+
+ for tens in ps.outputs:
+ if force_outputs_to_fast_storage:
+ bws[arch.fast_storage_mem_area][tens.purpose][BandwidthDirection.Write] += tens.bandwidth()
+ else:
+ bws[tens.mem_area][tens.purpose][BandwidthDirection.Write] += tens.bandwidth()
+
+ for tens in ps.intermediates:
+ bws[tens.mem_area][tens.purpose][BandwidthDirection.Write] += tens.bandwidth()
+
+ if tens in replacement_read_bws:
+ bw = replacement_read_bws[tens]
+ else:
+ bw = tens.bandwidth()
+
+ bws[tens.mem_area][tens.purpose][BandwidthDirection.Read] += bw
+
+ for tens in ps.inputs:
+ if tens in replacement_read_bws:
+ bw = replacement_read_bws[tens]
+ else:
+ bw = tens.bandwidth()
+
+ bws[tens.mem_area][tens.purpose][BandwidthDirection.Read] += bw
+
+ cycles[PassCycles.SramAccess] = np.sum(bws[MemArea.Sram]) / arch.memory_bandwidths_per_cycle[MemArea.Sram]
+ cycles[PassCycles.TotalPerPass] = np.max(cycles[: PassCycles.TotalPerPass])
+
+    # quick build of access counts for only the current pass, even though these aren't the final numbers
+ update_summary_cycles(arch, bws, macs, cycles)
+
+ return bws, macs, cycles, blocks, ifm_read_multiple, weight_read_multiple
+
+
+def update_summary_cycles(arch, bws, macs, cycles):
+ cycles[PassCycles.DramAccess] = np.sum(bws[MemArea.Dram]) / arch.memory_bandwidths_per_cycle[MemArea.Dram]
+ cycles[PassCycles.OnChipFlashAccess] = (
+ np.sum(bws[MemArea.OnChipFlash]) / arch.memory_bandwidths_per_cycle[MemArea.OnChipFlash]
+ )
+ cycles[PassCycles.OffChipFlashAccess] = (
+ np.sum(bws[MemArea.OffChipFlash]) / arch.memory_bandwidths_per_cycle[MemArea.OffChipFlash]
+ )
+
+ cycles[PassCycles.Total] = np.max(cycles[: PassCycles.Total])
+ return cycles
+
+
+def collate_stats_for_cascaded_pass(arch, bws, macs, cycles):
+ return bws, macs, cycles
+
+
+def performance_for_cascaded_pass(arch, cps):
+ total_bws = make_bandwidth_array()
+ total_macs = make_macs_array()
+ total_cycles = make_cycles_array()
+
+ for ps in cps.passes:
+ bws, macs, cycles, blocks, _, _ = performance_metrics_for_pass(arch, ps)
+ ps.bandwidths = bws
+ ps.macs = macs
+ ps.cycles = cycles
+ ps.n_blocks = blocks
+ total_bws += bws
+ total_macs += macs
+ total_cycles += cycles
+
+ bws, macs, cycles = collate_stats_for_cascaded_pass(arch, total_bws, total_macs, total_cycles)
+ cps.bandwidths = bws
+ cps.macs = macs
+ cps.cycles = cycles
+ return bws, macs, cycles
+
+
+def calc_performance_for_network(nng, arch):
+ total_bws = make_bandwidth_array()
+ total_macs = np.zeros(MacCount.Size)
+ total_cycles = np.zeros(PassCycles.Size)
+
+ for sg in nng.subgraphs:
+ for cps in sg.cascaded_passes:
+ bws, macs, cycles = performance_for_cascaded_pass(arch, cps)
+ total_bws += bws
+ total_macs += macs
+ total_cycles += cycles
+ total_cycles += arch.inter_pass_cycle_delay
+
+ nng.bandwidths = total_bws
+ nng.macs = total_macs
+ nng.cycles = total_cycles
diff --git a/ethosu/vela/npu_serialisation.py b/ethosu/vela/npu_serialisation.py
new file mode 100644
index 00000000..4542c25b
--- /dev/null
+++ b/ethosu/vela/npu_serialisation.py
@@ -0,0 +1,145 @@
+# Copyright (C) 2020 Arm Limited or its affiliates. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the License); you may
+# not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an AS IS BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+# Description:
+# Serialises and packs an NPU subgraph into tensors.
+
+from .nn_graph import PassPlacement
+from .tensor import MemArea, Tensor, TensorPurpose, TensorFormat
+from .operation import Operation
+from .data_type import DataType
+import numpy as np
+from . import driver_actions
+import struct
+
+
+def make_memory_tensor(name, mem_area, sz, want_values, arch):
+ tens = Tensor([sz], DataType.uint8, name)
+ tens.mem_area = mem_area
+ tens.purpose = TensorPurpose.FeatureMap
+ tens.set_format(TensorFormat.NHWC, arch)
+ if want_values:
+ tens.values = np.zeros(tens.shape, np.uint8)
+ return tens
+
+
+def copy_compressed_values_to_memory_tensor(memory_tensor, src_tensor):
+ start_addr = src_tensor.address
+ for compressed_values in src_tensor.compressed_values:
+ end_addr = start_addr + len(compressed_values)
+ memory_tensor.values[start_addr:end_addr] = compressed_values
+ start_addr = end_addr
+
+
+def serialise_npu_subgraph_into_tensors(nng, sg, arch, scratch_tens, flash_tens):
+ if sg.placement != PassPlacement.Npu:
+ return scratch_tens, flash_tens
+
+ flash_area = arch.permanent_storage_mem_area
+ scratch_area = MemArea.Sram
+
+ flash_size = sg.memory_used.get(flash_area, 0)
+ scratch_size = sg.memory_used.get(scratch_area, 0)
+
+ # Prepare driver actions for this command tensor
+ da_list = []
+ driver_actions.emit_fourcc(da_list, "COP1")
+ driver_actions.emit_config(da_list, 0, 1, arch)
+ driver_actions.emit_cmd_stream_header(da_list, len(sg.register_command_stream))
+
+ # Append command stream words
+ da_list.extend(sg.register_command_stream)
+
+ # Convert to bytes
+ payload_bytes = struct.pack("<{0}I".format(len(da_list)), *da_list)
+
+ command_stream_size_bytes = len(payload_bytes)
+
+ # Adjust the bits per element calculation to exclude metadata generated by Vela
+ nng.total_size[flash_area] = nng.total_size.get(flash_area, 0) - flash_size - command_stream_size_bytes
+ nng.total_elements[flash_area] = nng.total_elements.get(flash_area, 0) - flash_size - command_stream_size_bytes
+ nng.total_size[scratch_area] = nng.total_size.get(scratch_area, 0) - scratch_size
+ nng.total_elements[scratch_area] = nng.total_elements.get(scratch_area, 0) - scratch_size
+
+    if scratch_tens is None and flash_tens is None:
+ # First Npu subgraph, create scratch and flash tensors
+ sg.scratch_tensor = make_memory_tensor(sg.name + "_scratch", scratch_area, scratch_size, False, arch)
+ sg.scratch_tensor.purpose = TensorPurpose.Scratch
+ sg.flash_tensor = make_memory_tensor(sg.name + "_flash", flash_area, flash_size, True, arch)
+ else:
+ sg.scratch_tensor = scratch_tens
+ sg.scratch_tensor.shape[0] += scratch_size
+ sg.flash_tensor = flash_tens
+ sg.flash_tensor.shape[0] += flash_size
+
+ for cps in sg.cascaded_passes:
+ for ps in cps.passes:
+            if ps.placement == PassPlacement.Npu and ps.weight_tensor is not None:
+ # For DMA ops, ps.weight_tensor is referring to the SRAM weight tensor and therefore the address
+ # is pointing at the destination address of where the weights should be placed in SRAM.
+ # This ensures that the Flash weight tensor is used instead and thus gets the correct address.
+ if ps.weight_tensor.ops[0].type == "DMA":
+ copy_compressed_values_to_memory_tensor(sg.flash_tensor, ps.weight_tensor.ops[0].inputs[0])
+ else:
+ copy_compressed_values_to_memory_tensor(sg.flash_tensor, ps.weight_tensor)
+
+ copy_compressed_values_to_memory_tensor(sg.flash_tensor, ps.scale_tensor)
+
+ sg.command_stream_tensor = make_memory_tensor(
+ sg.name + "_command_stream", flash_area, command_stream_size_bytes, True, arch
+ )
+ sg.command_stream_tensor.values = np.frombuffer(payload_bytes, dtype=np.uint8)
+
+ return sg.scratch_tensor, sg.flash_tensor
+
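+
+# Minimal sketch of the payload packing used above (illustration only, not part of Vela's
+# API): a list of 32-bit command words is serialised little-endian with struct and then
+# viewed as a uint8 array, which is how the command stream tensor values are filled.
+def _example_pack_words(words):
+    payload = struct.pack("<{0}I".format(len(words)), *words)
+    return np.frombuffer(payload, dtype=np.uint8)
+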
+
+def add_const_tens_to_startup_cascaded_pass(startup_cps, tens):
+ op = Operation("Const", tens.name + "_const")
+ op.outputs = [tens]
+ tens.ops = [op]
+ startup_cps.passes[0].ops.insert(0, op)
+ startup_cps.passes[0].outputs.insert(0, tens)
+ startup_cps.outputs.insert(0, tens)
+
+
+def rewrite_npu_call_ops(nng, sg, arch):
+ if sg.placement != PassPlacement.Cpu:
+ return
+
+ startup_cps = sg.cascaded_passes[0]
+
+ for idx, cps in enumerate(sg.cascaded_passes):
+ for ps in cps.passes:
+ for op in ps.ops:
+ if op.type == "NpuOp":
+ callee = op.attrs["subgraph"]
+ op.attrs["custom_options"] = {"type": op.type}
+
+ sz = 0
+ for tens in [callee.scratch_tensor, callee.flash_tensor, callee.command_stream_tensor]:
+ op.inputs.insert(0, tens)
+ ps.inputs.insert(0, tens)
+ cps.inputs.insert(0, tens)
+ if tens != callee.scratch_tensor:
+ add_const_tens_to_startup_cascaded_pass(startup_cps, tens)
+ sz += tens.storage_size()
+
+ for prev_cps in sg.cascaded_passes[: idx + 1]:
+ prev_cps.sram_used += sz
+
+ if callee.scratch_tensor is not None:
+ cps.sram_used += callee.scratch_tensor.storage_size()
diff --git a/ethosu/vela/numeric_util.py b/ethosu/vela/numeric_util.py
new file mode 100644
index 00000000..e5bc88b8
--- /dev/null
+++ b/ethosu/vela/numeric_util.py
@@ -0,0 +1,89 @@
+# Copyright (C) 2020 Arm Limited or its affiliates. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the License); you may
+# not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an AS IS BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+# Description:
+# Numerical utilities for various types of rounding etc.
+
+import math
+import numpy as np
+
+
+def round_up(a, b):
+ return ((a + b - 1) // b) * b
+
+
+def round_up_divide(a, b):
+ return (a + b - 1) // b
+
+
+def round_up_to_int(v):
+ return int(math.ceil(v))
+
+
+def round_down_to_power_of_two(v):
+ assert v > 0
+ while v & (v - 1):
+ v &= v - 1
+
+ return v
+
+
+def round_up_to_power_of_two(v):
+ return round_down_to_power_of_two(2 * v - 1)
+
+
+def round_down_log2(v):
+ return int(math.floor(np.log2(v)))
+
+
+def round_up_log2(v):
+ return int(math.ceil(np.log2(v)))
+
+
+def round_to_int(v):
+ return np.rint(v).astype(np.int64)
+
+
+# Performs rounding away from zero.
+# n.b. This is identical to C++11 std::round()
+def round_away_zero(f):
+ r = -0.5 if (f < 0) else 0.5
+ return np.trunc(f + r)
+
+
+def quantise_float32(f, scale=1.0, zero_point=0):
+ return zero_point + int(round_away_zero(np.float32(f) / np.float32(scale)))
+
+
+def clamp_tanh(x):
+ if x <= -4:
+ y = -1.0
+ elif x >= 4:
+ y = 1.0
+ else:
+ y = math.tanh(x)
+ return y
+
+
+def clamp_sigmoid(x):
+ if x <= -8:
+ y = 0.0
+ elif x >= 8:
+ y = 1.0
+ else:
+ y = 1 / (1 + math.exp(-x))
+ return y
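+
+
+def _example_checks():
+    # Hedged illustrative checks, not used elsewhere in Vela. Unlike Python's round(),
+    # which rounds 2.5 to 2 (round-half-to-even), round_away_zero rounds halves away from
+    # zero, matching C++11 std::round(); quantise_float32 then applies the usual
+    # zero_point + round(value / scale) affine quantisation.
+    assert round_away_zero(2.5) == 3.0
+    assert round_away_zero(-2.5) == -3.0
+    assert quantise_float32(0.1, scale=0.05, zero_point=128) == 130
+    assert round_down_to_power_of_two(100) == 64
+    assert round_up_to_power_of_two(100) == 128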
diff --git a/ethosu/vela/operation.py b/ethosu/vela/operation.py
new file mode 100644
index 00000000..d2f2806a
--- /dev/null
+++ b/ethosu/vela/operation.py
@@ -0,0 +1,285 @@
+# Copyright (C) 2020 Arm Limited or its affiliates. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the License); you may
+# not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an AS IS BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+# Description:
+# Internal representation of a Neural Network Operation.
+
+import enum
+
+
+class NpuBlockType(enum.Enum):
+ Default = 0
+ ConvolutionMxN = 1
+ VectorProduct = 2
+ Pooling = 3
+ ConvolutionDepthWise = 4
+ ElementWise = 5
+
+
+class Operation:
+ """Class representing a Neural Network operation. Has a name, a type,
+input and output tensors, as well as an attribute dictionary."""
+
+ __slots__ = "type", "name", "attrs", "inputs", "outputs", "flops", "scheduled_pass", "run_on_npu"
+
+ def __init__(self, op_type, name):
+ self.type = op_type
+ self.name = name
+ self.attrs = {}
+ self.inputs = []
+ self.outputs = []
+ self.flops = 0
+ self.run_on_npu = True
+ self.scheduled_pass = None
+
+ def clone(self, suffix="_clone"):
+ res = Operation(self.type, self.name + suffix)
+
+ res.attrs = dict(self.attrs)
+ res.inputs = list(self.inputs)
+ res.outputs = list(self.outputs)
+ res.flops = self.flops
+ res.scheduled_pass = self.scheduled_pass
+
+ return res
+
+ def __str__(self):
+ return "<nng.Operation '%s' type=%s>" % (self.name, self.type)
+
+ __repr__ = __str__
+
+ def get_ifm_ifm2_weight_bias_ofm_indices(self):
+ ifm_idx = -1
+ ifm2_idx = -1
+ weight_idx = -1
+ bias_idx = -1
+ ofm_idx = -1
+ npu_block_type = self.attrs.get("npu_block_type", NpuBlockType.Default)
+ if npu_block_type in set((NpuBlockType.ConvolutionMxN, NpuBlockType.ConvolutionDepthWise)):
+ ifm_idx = 0
+ weight_idx = 1
+ ofm_idx = 0
+
+ if self.type in set(("Conv2DBiasAct", "DepthwiseConv2dBiasAct", "TransposeConvAct")):
+ if len(self.inputs) >= 3:
+ bias_idx = 2
+
+ elif npu_block_type == NpuBlockType.Pooling:
+ ifm_idx = 0
+ ofm_idx = 0
+ elif npu_block_type == NpuBlockType.VectorProduct:
+ ifm_idx = 0
+ weight_idx = 1
+ ofm_idx = 0
+
+ if self.type in set(("FullyConnectedAct",)):
+ if len(self.inputs) >= 3:
+ bias_idx = 2
+
+ if self.type == "BlockLSTM":
+ ifm_idx = 3
+ weight_idx = 4
+ ofm_idx = 6
+
+ elif npu_block_type == NpuBlockType.ElementWise:
+ ifm_idx = 0
+ ifm2_idx = 1
+ ofm_idx = 0
+
+ # LeakyRelu and Abs have a single IFM
+ if self.type in set(("LeakyRelu", "Abs")):
+ ifm2_idx = -1
+
+ elif self.type == "Conv2DBackpropInput":
+ ifm_idx = 2
+ weight_idx = 1
+ ofm_idx = 0
+
+ elif self.type in set(("Squeeze", "Reshape", "QuantizedReshape", "ExpandDims")):
+ ifm_idx = 0
+ ofm_idx = 0
+
+ elif self.is_split_op():
+ ifm_idx = 0
+ ofm_idx = 0
+ if self.type == "Split":
+ ifm_idx = 1
+
+ elif self.is_concat_op():
+ ifms, _ = self.get_concat_inputs_axis()
+ ifm_idx = self.inputs.index(ifms[0])
+ if len(ifms) > 1:
+ ifm2_idx = self.inputs.index(ifms[1])
+ ofm_idx = 0
+
+ return ifm_idx, ifm2_idx, weight_idx, bias_idx, ofm_idx
+
+ def get_ifm_ifm2_weights_ofm(self):
+ ifm_tensor = None
+ ifm2_tensor = None
+ weight_tensor = None
+ ofm_tensor = None
+
+ ifm_idx, ifm2_idx, weight_idx, bias_idx, ofm_idx = self.get_ifm_ifm2_weight_bias_ofm_indices()
+ if ifm_idx != -1:
+ ifm_tensor = self.inputs[ifm_idx]
+ if ifm2_idx != -1:
+ ifm2_tensor = self.inputs[ifm2_idx]
+ if weight_idx != -1:
+ weight_tensor = self.inputs[weight_idx]
+ if ofm_idx != -1:
+ ofm_tensor = self.outputs[ofm_idx]
+
+ return ifm_tensor, ifm2_tensor, weight_tensor, ofm_tensor
+
+ def get_ifm_weights_biases_ofm(self):
+ ifm_tensor = None
+ weight_tensor = None
+ bias_tensor = None
+ ofm_tensor = None
+
+ ifm_idx, _, weight_idx, bias_idx, ofm_idx = self.get_ifm_ifm2_weight_bias_ofm_indices()
+ if ifm_idx != -1:
+ ifm_tensor = self.inputs[ifm_idx]
+ if weight_idx != -1:
+ weight_tensor = self.inputs[weight_idx]
+ if bias_idx != -1:
+ bias_tensor = self.inputs[bias_idx]
+ if ofm_idx != -1:
+ ofm_tensor = self.outputs[ofm_idx]
+
+ return ifm_tensor, weight_tensor, bias_tensor, ofm_tensor
+
+ concat_ops = set(("Concat", "ConcatV2", "QuantizedConcat", "ConcatTFLite", "PackReshaped"))
+
+ def is_concat_op(self):
+ return self.type in Operation.concat_ops
+
+ def get_concat_inputs_axis(self):
+ assert self.is_concat_op()
+
+ if self.type == "ConcatV2":
+ axis_tensor = self.inputs[-1]
+ inputs = self.inputs[:-1]
+ elif self.type == "Concat":
+ axis_tensor = self.inputs[0]
+ inputs = self.inputs[1:]
+ elif self.type == "QuantizedConcat":
+ axis_tensor = self.inputs[0]
+ inputs = self.inputs[1:]
+ inputs = inputs[: len(inputs) // 3] # Skip min/max
+
+ if self.type == "ConcatTFLite":
+ inputs = self.inputs
+ axis = self.attrs["axis"]
+ elif self.type == "PackReshaped":
+ # Requires fixup_pack_input to be called before this point
+ inputs = self.inputs
+ axis = self.attrs["axis"]
+ assert len(self.inputs) == self.attrs["values_count"]
+ else:
+ assert len(axis_tensor.ops) == 1 and axis_tensor.ops[0].type == "Const"
+ axis = int(axis_tensor.values)
+
+ return inputs, axis
+
+ split_ops = set(("Split", "StridedSlice", "Slice", "UnpackReshaped"))
+
+ def is_split_op(self):
+ return self.type in Operation.split_ops
+
+ def get_split_inputs_axis(self):
+ assert self.is_split_op()
+
+ offset_start = None
+ offset_end = None
+ axis = None
+ if self.type == "Split":
+ # TODO: Extend split capabilities
+ # If num_or_size_splits is an integer, then value is split along dimension axis into num_split smaller
+ # tensors. This requires that num_split evenly divides value.shape[axis].
+ # If num_or_size_splits is a 1-D Tensor (or list), we call it size_splits and value is split into
+ # len(size_splits) elements. The shape of the i-th element has the same size as the value except along
+ # dimension axis where the size is size_splits[i].
+ num_splits = self.attrs.get("num_splits")
+ axis_tens = self.inputs[0]
+ assert len(axis_tens.ops) == 1 and axis_tens.ops[0].type == "Const"
+ axis = int(axis_tens.values)
+ input_tens = self.inputs[1]
+ outputs = self.outputs
+ assert num_splits == len(outputs)
+
+ elif self.type == "Slice":
+ input_tens, begin_tens, size_tens = self.inputs
+ outputs = self.outputs
+ offset_start = [0] * len(input_tens.shape)
+ offset_end = [0] * len(input_tens.shape)
+
+ for idx in range(len(begin_tens.values)):
+ # Check if the op should slice in dimension idx
+ if size_tens.values[idx] != input_tens.shape[idx]:
+ offset_start[idx] = begin_tens.values[idx]
+ offset_end[idx] = size_tens.values[idx] + offset_start[idx]
+
+ elif self.type == "StridedSlice":
+ input_tens, begin_tens, end_tens, strides_tens = self.inputs
+ outputs = self.outputs
+ out_tens = outputs[0]
+ offset_start = [0] * len(outputs[0].shape)
+ offset_end = [0] * len(outputs[0].shape)
+
+ # Extract masks
+ begin_mask = self.attrs["begin_mask"]
+ ellipsis_mask = self.attrs["ellipsis_mask"]
+ end_mask = self.attrs["end_mask"]
+ new_axis_mask = self.attrs["new_axis_mask"]
+ shrink_axis_mask = self.attrs["shrink_axis_mask"]
+ # TODO: Either extend this to support these different masks or check
+ # for this at an earlier stage and place the op on Cpu if needed
+ assert begin_mask == end_mask
+ assert new_axis_mask == ellipsis_mask == 0
+ # shrink_axis_mask is not supported by the Operation class but the operation
+ # may have the attribute modified and handled in the graph optimization phase.
+ assert shrink_axis_mask == 0
+ assert len(input_tens.shape) == len(out_tens.shape)
+
+ for idx in range(len(input_tens.shape)):
+                # If the i-th bit in begin_mask is set then the value of begin[i] should be ignored
+ if (begin_mask & (1 << idx)) == 0:
+ # Check if the op should slice in dimension idx
+ if end_tens.values[idx] != input_tens.shape[idx] or (
+ end_tens.values[idx] == input_tens.shape[idx] and begin_tens.values[idx] != 0
+ ):
+ offset_start[idx] = begin_tens.values[idx]
+ offset_end[idx] = end_tens.values[idx]
+
+ else:
+ # Don't slice in this axis, instead use fullest possible range
+ continue
+
+ elif self.type == "UnpackReshaped":
+ # Requires fixup_unpack_output to be called before this point
+ input_tens = self.inputs[0]
+ outputs = self.outputs
+ axis = self.attrs["axis"]
+ num_splits = self.attrs["num"]
+            # The number of outputs has to equal the value of the dimension to unpack
+ assert num_splits == len(outputs) == input_tens.shape[axis]
+ else:
+ assert False
+
+ return input_tens, outputs, axis, offset_start, offset_end
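+
+
+if __name__ == "__main__":
+    # Minimal illustrative sketch (not part of Vela): the index helpers only look at
+    # op.attrs and the number of inputs, so plain placeholder strings stand in for the
+    # real Tensor objects here.
+    conv = Operation("Conv2DBiasAct", "example_conv")
+    conv.attrs["npu_block_type"] = NpuBlockType.ConvolutionMxN
+    conv.inputs = ["ifm", "weights", "bias"]
+    conv.outputs = ["ofm"]
+    # Expected result: IFM at index 0, no IFM2, weights at 1, bias at 2, OFM at 0
+    assert conv.get_ifm_ifm2_weight_bias_ofm_indices() == (0, -1, 1, 2, 0)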
diff --git a/ethosu/vela/pass_packing.py b/ethosu/vela/pass_packing.py
new file mode 100644
index 00000000..663520fc
--- /dev/null
+++ b/ethosu/vela/pass_packing.py
@@ -0,0 +1,489 @@
+# Copyright (C) 2020 Arm Limited or its affiliates. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the License); you may
+# not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an AS IS BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+# Description:
+# Packs a subgraph with Neural Network Operations into Passes. Each Pass has one or more Operations.
+
+from .nn_graph import Operation, Pass, PassPlacement, TensorPurpose, NpuBlockType, Tensor
+import collections
+import enum
+from .data_type import BaseType, DataType
+
+
+class PassFlags(enum.Flag):
+ Empty = 0
+ Pre = 1
+ Main = 2
+ Post = 4
+ Mac = 8
+ Dma = 32
+ ElementWise = 256
+ Npu = 512
+ Cpu = 1024
+ StartupInit = 2048
+ MemoryOnly = 4096
+ PostFusingLimited = 8192
+
+
+npu_pre_ops = set(("QuantizedResizeBilinear", "SplitSliceRead",))
+
+mac_main_ops = set(
+ (
+ # convolutions
+ "Conv2DBiasAct",
+ "Conv2D",
+ "QuantizedConv2D",
+ "Conv2DBackpropInputSwitched",
+ # depth-wise convolutions
+ "DepthwiseConv2dBiasAct",
+ "DepthwiseConv2dNative",
+ "QuantizedDepthwiseConv2D",
+ # FC layers
+ "QuantizedMatMul",
+ "MatMul",
+ "FullyConnectedAct",
+ # RNN/LSTM/GRU
+ "BlockLSTM",
+ # pooling
+ "QuantizedMaxPool",
+ "QuantizedAvgPool",
+ "AvgPool",
+ "MaxPool",
+ "AvgPoolAct",
+ "MaxPoolAct",
+ )
+)
+
+binary_elem_wise_main_ops = set(
+ (
+ # binary element-wise
+ "AddAct",
+ "MulAct",
+ "SubAct",
+ "QuantizedAdd",
+ "QuantizedSub",
+ "QuantizedMul",
+ "Mul",
+ "Add",
+ "Sub",
+ "Minimum",
+ "Maximum",
+ )
+)
+
+unary_elem_wise_main_ops = set(("LeakyRelu", "Abs")) # Unary element-wise operations
+
+elem_wise_main_ops = binary_elem_wise_main_ops | unary_elem_wise_main_ops
+
+activation_ops = set(("QuantizedRelu", "QuantizedRelu1", "QuantizedRelu6", "Relu", "Relu6", "ReluN1To1"))
+npu_post_ops = activation_ops | set(
+ # Bias-add operations: Get rid of these once we have rewrites from Conv2D + BiasAdd + Activation to Conv2DBiasAct.
+ ("Mul", "Add", "QuantizedBiasAdd", "Requantize", "QuantizedBatchNorm", "BiasAdd", "FusedBatchNorm")
+)
+
+npu_post_fuse_limited_ops = set(
+ # Set of post operators that should not be fused with main/elementwise ops
+ ("ConcatSliceWrite", "Sigmoid", "Tanh")
+)
+
+elem_wise_ops = elem_wise_main_ops | activation_ops | set(("Sigmoid", "Tanh"))
+
+
+quantization_ops = set(("Dequantize", "QuantizeV2", "Max", "Min"))
+cpu_ops = (
+ set(("Softmax", "QuantizedSoftmax", "LRN", "Shape", "QuantizedPad", "Pad", "AddN"))
+ | quantization_ops
+)
+
+npu_dma_ops = set(("DMA",))
+startup_init_ops = set(("Const", "VariableV2", "Placeholder", "SubgraphInput"))
+memory_only_ops = set(("Squeeze", "Reshape", "QuantizedReshape", "ExpandDims",))
+
+
+test_sequence = [
+ (
+ # ops_set
+ npu_post_ops,
+ # incompatible_pack_flags
+ PassFlags.Cpu | PassFlags.MemoryOnly | PassFlags.Pre | PassFlags.Main,
+ # flags_to_set
+ PassFlags.Npu | PassFlags.Post,
+ # flags_to_clear
+ PassFlags.Empty,
+ ),
+ (
+ # ops_set
+ npu_post_fuse_limited_ops,
+ # incompatible_pack_flags
+ PassFlags.Cpu | PassFlags.MemoryOnly | PassFlags.Pre | PassFlags.Main,
+ # flags_to_set
+ PassFlags.Npu | PassFlags.PostFusingLimited,
+ # flags_to_clear
+ PassFlags.Empty,
+ ),
+ (
+ # ops_set
+ mac_main_ops,
+ # incompatible_pack_flags
+ PassFlags.Cpu
+ | PassFlags.MemoryOnly
+ | PassFlags.ElementWise
+ | PassFlags.Pre
+ | PassFlags.Main
+ | PassFlags.PostFusingLimited,
+ # flags_to_set
+ PassFlags.Npu | PassFlags.Mac | PassFlags.Main,
+ # flags_to_clear
+ PassFlags.Empty,
+ ),
+ (
+ # ops_set
+ elem_wise_main_ops,
+ # incompatible_pack_flags
+ PassFlags.Cpu
+ | PassFlags.MemoryOnly
+ | PassFlags.Mac
+ | PassFlags.Pre
+ | PassFlags.Main
+ | PassFlags.PostFusingLimited,
+ # flags_to_set
+ PassFlags.Npu | PassFlags.ElementWise | PassFlags.Main,
+ # flags_to_clear
+ PassFlags.Empty,
+ ),
+ (
+ # ops_set
+ npu_pre_ops,
+ # incompatible_pack_flags
+ PassFlags.Cpu | PassFlags.MemoryOnly,
+ # flags_to_set
+ PassFlags.Npu | PassFlags.Mac | PassFlags.Pre | PassFlags.ElementWise,
+ # flags_to_clear
+ PassFlags.Empty,
+ ),
+ (
+ # ops_set
+ npu_dma_ops,
+ # incompatible_pack_flags
+ PassFlags.Cpu | PassFlags.MemoryOnly,
+ # flags_to_set
+ PassFlags.Npu | PassFlags.Dma,
+ # flags_to_clear
+ PassFlags.Empty
+ ),
+ (
+ # ops_set
+ startup_init_ops,
+ # incompatible_pack_flags
+ PassFlags.Npu | PassFlags.Cpu | PassFlags.MemoryOnly,
+ # flags_to_set
+ PassFlags.StartupInit | PassFlags.Main,
+ # flags_to_clear
+ PassFlags.Empty,
+ ),
+ (
+ # ops_set
+ memory_only_ops,
+ # incompatible_pack_flags
+ PassFlags.Npu | PassFlags.Cpu,
+ # flags_to_set
+ PassFlags.MemoryOnly | PassFlags.Main,
+ # flags_to_clear
+ PassFlags.Empty
+ ),
+ (
+ # ops_set
+ cpu_ops,
+ # incompatible_pack_flags
+ PassFlags.Npu | PassFlags.MemoryOnly | PassFlags.Main,
+ # flags_to_set
+ PassFlags.Cpu | PassFlags.Main,
+ # flags_to_clear
+ PassFlags.Empty
+ ),
+ ( # This last one is a fallback for unrecognised operations
+ # ops_set
+ None,
+ # incompatible_pack_flags
+ PassFlags.Npu | PassFlags.MemoryOnly | PassFlags.Main,
+ # flags_to_set
+ PassFlags.Cpu | PassFlags.Main,
+ # flags_to_clear
+ PassFlags.Empty
+ ),
+]
+
+# Some sanity checking
+for (operation_set, incompatible_pack_flags, flags_to_set, flags_to_clear) in test_sequence:
+ assert not flags_to_clear & flags_to_set
+
+ if operation_set is not None:
+ for op in operation_set:
+            assert len(op) > 1  # This catches a bare string literal being decomposed into single characters
+
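+
+# Illustrative sketch of the packing test applied in pack_into_passes below (not used by
+# the packer itself): an operation may join the current pass only if the flags accumulated
+# so far do not intersect the entry's incompatible_pack_flags.
+def _example_can_pack(curr_flags, incompatible_pack_flags):
+    return not (curr_flags & incompatible_pack_flags)
+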
+
+def pack_into_passes(nng, arch, verbose_packing=False):
+ def visit_op(op, ignored):
+ visit_op_refcount[op] += 1
+
+ if visit_op_refcount[op] == 1: # First-time visit, go and fix up unused output tensors
+ for tens in op.outputs:
+ if len(tens.consumers()) == 0:
+ visit_op_refcount[op] += 1
+
+ assert visit_op_refcount[op] <= len(op.outputs)
+ if visit_op_refcount[op] == len(op.outputs):
+
+ if op.type in startup_init_ops:
+ startup_list.append(op)
+ else:
+ _, _, _, ofm_tensor = op.get_ifm_ifm2_weights_ofm()
+ if ofm_tensor is None:
+ ofm_tensor = op.outputs[0]
+ build_pass((op,), ofm_tensor)
+
+ def build_pass(start_ops_to_process, ofm_tensor=None):
+ reverse_ops_list = []
+ curr_flags = PassFlags.Empty
+ npu_block_type = NpuBlockType.Default
+
+ reverse_intermediates = []
+ input_set = set()
+ ifm_tensor = None
+ primary_op = None
+
+ to_process = collections.deque()
+ for start_op in start_ops_to_process:
+ to_process.append((start_op, None))
+
+ while to_process:
+ curr_op, tens = to_process.popleft()
+
+ if curr_op in reverse_ops_list:
+ continue
+
+ for operation_set, incompatible_pack_flags, flags_to_set, flags_to_clear in test_sequence:
+ if operation_set is None or curr_op.type in operation_set:
+ if not (curr_flags & incompatible_pack_flags):
+ if flags_to_set & PassFlags.Npu:
+ if not curr_op.run_on_npu:
+ continue
+
+ reverse_ops_list.append(curr_op)
+ new_block_type = curr_op.attrs.get("npu_block_type", NpuBlockType.Default)
+ if new_block_type != NpuBlockType.Default:
+ assert npu_block_type == NpuBlockType.Default
+ npu_block_type = new_block_type # Only one major block type per pass
+ assert primary_op is None
+ primary_op = curr_op
+
+ curr_flags &= ~flags_to_clear
+ curr_flags |= flags_to_set
+
+ if flags_to_set & PassFlags.Npu:
+ if flags_to_set & (
+ PassFlags.Mac | PassFlags.ElementWise | PassFlags.Post | PassFlags.PostFusingLimited
+ ):
+ assert len(curr_op.inputs) >= 1
+ if curr_op.type == "BlockLSTM":
+ ifm_tensor = curr_op.inputs[3]
+ else:
+ ifm_tensor = curr_op.inputs[0]
+ assert ifm_tensor.purpose == TensorPurpose.FeatureMap
+
+ if flags_to_set & PassFlags.Dma:
+ # DMAs are special - Output buffers need to be preserved as intermediates,
+ # if the pass consumes the results
+ if tens is not None:
+ reverse_intermediates.append(tens)
+
+ if operation_set is None:
+ print("Warning:", curr_op.type, "operation is unknown or unsupported, placing on CPU")
+
+ for inp in curr_op.inputs:
+ can_pack = True
+ if len(inp.ops) == 1:
+ next_op = inp.ops[0]
+ for outp in next_op.outputs:
+ consumers = outp.consumers()
+ if len(consumers) > 1 or (len(consumers) == 1 and consumers[0] != curr_op):
+ can_pack = False
+ break
+ else:
+ can_pack = False
+
+ if can_pack:
+ to_process.append((next_op, inp))
+ else:
+ assert inp is not None
+ input_set.add(inp)
+
+ break
+
+ else:
+ # This operation is not compatible with already packed operations, just register the tensor as an input
+ assert tens is not None
+ input_set.add(tens)
+
+ if curr_flags & PassFlags.Npu and not curr_flags & (PassFlags.ElementWise | PassFlags.Mac):
+            # If the pass has no MAC operation, choose to run the ambidextrous operations on the
+            # element-wise unit
+ curr_flags |= PassFlags.ElementWise
+
+ is_element_wise = True
+ for op in reverse_ops_list:
+            if op.type not in elem_wise_ops and op.type not in npu_dma_ops:
+ is_element_wise = False
+ break
+
+ placement = PassPlacement.Unknown
+ if curr_flags & PassFlags.Npu:
+ assert placement == PassPlacement.Unknown
+ placement = PassPlacement.Npu
+ if curr_flags & PassFlags.Cpu:
+ assert placement == PassPlacement.Unknown
+ placement = PassPlacement.Cpu
+ if curr_flags & PassFlags.MemoryOnly:
+ assert placement == PassPlacement.Unknown
+ placement = PassPlacement.MemoryOnly
+ if curr_flags & PassFlags.StartupInit:
+ assert placement == PassPlacement.Unknown
+ placement = PassPlacement.StartupInit
+ assert placement != PassPlacement.Unknown
+
+ ops_list = list(reversed(reverse_ops_list))
+ intermediates = list(reversed(reverse_intermediates))
+
+        if primary_op is None:
+            primary_op = create_primary_op(ops_list)
+            if primary_op is not None:
+ visit_tensor_refcount[primary_op.inputs[0]] += 1
+ npu_block_type = primary_op.attrs["npu_block_type"]
+ for input_tens in primary_op.inputs:
+ if input_tens not in input_set:
+ input_set.add(input_tens)
+
+ ordered_input_list = []
+ input_refcounts = collections.defaultdict(int)
+ for op in ops_list:
+ for inp in op.inputs:
+ if inp in input_set:
+ if input_refcounts[inp] == 0:
+ ordered_input_list.append(inp)
+ input_refcounts[inp] += 1
+
+ name = ops_list[0].name
+ non_dma_ops = [op for op in ops_list if op.type != "DMA"]
+ if non_dma_ops:
+ name = non_dma_ops[0].name
+ ps = Pass(name, placement, is_element_wise, npu_block_type)
+ ps.ops = ops_list
+ ps.primary_op = primary_op
+ ps.inputs = ordered_input_list
+ ps.intermediates = intermediates
+ ps.outputs = list(ops_list[-1].outputs)
+ ps.ifm_tensor = ifm_tensor
+
+ # ElementWise operation, 2 IFMs
+ if ps.primary_op and ps.primary_op.type in binary_elem_wise_main_ops:
+ ps.ifm_tensor = ps.inputs[0]
+
+ if len(ps.inputs) == 1:
+ # Only 1 input, IFM and IFM2 are the same tensor
+ ps.ifm2_tensor = ps.inputs[0]
+ else:
+ ps.ifm2_tensor = ps.inputs[1]
+ else:
+ ps.ifm_tensor = ifm_tensor
+ ps.ifm2_tensor = None
+
+ ps.ofm_tensor = ofm_tensor
+ assert ps.placement != PassPlacement.Npu or ps.ofm_tensor is not None
+ ps.weight_tensor = ps.get_primary_op_ifm_weights()[1]
+ ps.scale_tensor = ps.get_primary_op_ifm_weights_biases_ofm()[2]
+
+ for op in ps.ops:
+ op.scheduled_pass = ps
+
+ reverse_pass_list.append(ps)
+
+ for inp, refcount in input_refcounts.items():
+ for _ in range(refcount):
+ visit_tensor(inp)
+
+ return ps
+
+ def visit_tensor(tens):
+ visit_tensor_refcount[tens] += 1
+ assert visit_tensor_refcount[tens] <= len(tens.consumers())
+ if visit_tensor_refcount[tens] == len(tens.consumers()):
+ for op in reversed(tens.ops):
+ visit_op(op, tens)
+
+ def create_primary_op(ops_list):
+ if any(op.type in (npu_pre_ops | npu_post_ops | npu_post_fuse_limited_ops) for op in ops_list):
+ # Configure a 1x1 AvgPool and attach the op onto it
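+ # Note (illustrative): the ops collected here are pre/post/fused-type ops with no compute
+ # block of their own, so a 1x1, stride-1, VALID-padded average pool is created as the
+ # primary op for them to attach to; with these parameters the pooling itself effectively
+ # passes the feature map through unchanged.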
+ op = ops_list[0]
+ inp = op.inputs[0]
+ avgpool_name = op.name + "_avgpool"
+ avgpool_op = Operation("AvgPool", avgpool_name)
+ avgpool_op.inputs = [inp]
+ avgpool_op.inputs[0].consumer_list.append(avgpool_op)
+ avgpool_op.attrs["padding"] = b"VALID"
+ avgpool_op.attrs["npu_block_type"] = NpuBlockType.Pooling
+ avgpool_op.attrs["stride_w"] = 1
+ avgpool_op.attrs["stride_h"] = 1
+ avgpool_op.attrs["filter_width"] = 1
+ avgpool_op.attrs["filter_height"] = 1
+ avgpool_op.attrs["strides"] = [1, 1, 1, 1]
+ avgpool_op.attrs["ksize"] = [1, 1, 1, 1]
+ avgpool_op.attrs["skirt"] = [0, 0, 0, 0]
+ avgpool_op.attrs["explicit_padding"] = [0, 0, 0, 0]
+ avgpool_out = inp.clone("_avgpooled")
+ avgpool_out.consumer_list.append(op)
+ avgpool_out.ops = [avgpool_op]
+ avgpool_op.outputs = [avgpool_out]
+
+ op.inputs[0] = avgpool_out
+ ops_list.insert(0, avgpool_op)
+
+ return avgpool_op
+
+ return None
+
+ for sg in nng.subgraphs:
+ reverse_pass_list = []
+ visit_op_refcount = collections.defaultdict(int)
+ visit_tensor_refcount = collections.defaultdict(int)
+
+ startup_list = []
+
+ for tens in sg.output_tensors:
+ visit_tensor(tens)
+
+ if startup_list:
+ startup_ps = build_pass(startup_list)
+ startup_ps.outputs = [op.outputs[0] for op in startup_list] # Need to fixup the outputs
+ startup_ps.name = "startup_weight_initialisation"
+
+ sg.passes = list(reversed(reverse_pass_list))
+ sg.build_pass_links()
+
+ if verbose_packing:
+ nng.print_passes()
+
+ return nng
diff --git a/ethosu/vela/range_set.py b/ethosu/vela/range_set.py
new file mode 100644
index 00000000..64de9709
--- /dev/null
+++ b/ethosu/vela/range_set.py
@@ -0,0 +1,154 @@
+# Copyright (C) 2020 Arm Limited or its affiliates. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the License); you may
+# not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an AS IS BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+# Description:
+# Helper classes to track memory accesses for calculating dependencies between Commands.
+
+from enum import IntEnum
+from collections import defaultdict
+from functools import lru_cache
+
+
+class RangeSet:
+ """A Range set class to track ranges and whether they intersect.
+Intended for e.g. tracking sets of memory ranges and whether two commands use the same memory areas."""
+
+ def __init__(self, start=None, end=None, ranges=None):
+ if ranges is None:
+ ranges = []
+
+ self.ranges = ranges # track a list of (start, end) tuples, always in ascending order sorted by start.
+
+ if start is not None and start != end:
+ assert start < end
+ self.ranges.append((start, end))
+
+ def __or__(self, other):
+ combined_ranges = list(sorted(self.ranges + other.ranges))
+ return RangeSet(ranges=combined_ranges)
+
+ def __ior__(self, other):
+ self.ranges = list(sorted(self.ranges + other.ranges))
+ return self
+
+ def intersects(self, other):
+ a_ranges = self.ranges
+ b_ranges = other.ranges
+
+ a_idx = 0
+ b_idx = 0
+
+ while a_idx < len(a_ranges) and b_idx < len(b_ranges):
+ ar = a_ranges[a_idx]
+ br = b_ranges[b_idx]
+ if max(ar[0], br[0]) < min(ar[1], br[1]):
+ return True # intersection
+
+ # advance one of the two upwards
+ if ar[0] < br[0]:
+ a_idx += 1
+ else:
+ assert ar[0] != br[0]
+ # note: ar[0] == br[0] cannot happen here - equal starts would already have been caught as an intersection above
+ b_idx += 1
+
+ return False
+
+ def __str__(self):
+ return "<RangeSet %s>" % (["%#x:%#x" % (int(start), int(end)) for start, end in self.ranges],)
+
+ __repr__ = __str__
+
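+# Illustrative usage (ranges behave as half-open [start, end) intervals):
+#
+#   a = RangeSet(0, 100)
+#   b = RangeSet(100, 200)
+#   assert not a.intersects(b)                    # [0, 100) and [100, 200) only touch
+#   assert (a | b).intersects(RangeSet(50, 150))  # the combined set overlaps [50, 150)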
+
+class MemoryRangeSet:
+ """Extended version of the RangeSet class that handles having different memory areas"""
+
+ def __init__(self, mem_area=None, start=None, end=None, regions=None):
+
+ if regions is None:
+ regions = {}
+ self.regions = regions
+
+ if mem_area is not None:
+ self.regions[mem_area] = RangeSet(start, end)
+
+ def __or__(self, other):
+ combined_regions = {
+ mem_area: (self.regions.get(mem_area, RangeSet()) | other.regions.get(mem_area, RangeSet()))
+ for mem_area in (self.regions.keys() | other.regions.keys())
+ }
+ return MemoryRangeSet(regions=combined_regions)
+
+ def __ior__(self, other):
+ self.regions = {
+ mem_area: (self.regions.get(mem_area, RangeSet()) | other.regions.get(mem_area, RangeSet()))
+ for mem_area in (self.regions.keys() | other.regions.keys())
+ }
+ return self
+
+ def intersects(self, other):
+ for mem_area in self.regions.keys() & other.regions.keys():
+ if self.regions[mem_area].intersects(other.regions[mem_area]):
+ return True
+ return False
+
+ def __str__(self):
+ s = "<MemoryRangeSet>"
+ for mem_area, rng in self.regions.items():
+ s += "%s: %s\t" % (mem_area, rng)
+ return s
+
+ __repr__ = __str__
+
+
+class AccessDirection(IntEnum):
+ Read = 0
+ Write = 1
+ Size = 2
+
+
+class MemoryAccessSet:
+ """Tracks memory ranges, but also access patterns to know which accesses actually are in conflict"""
+
+ def __init__(self):
+ self.accesses = [MemoryRangeSet() for i in range(AccessDirection.Size)]
+
+ def add(self, memory_range_set, access):
+ self.accesses[access] |= memory_range_set
+
+ @lru_cache(maxsize=None)
+ def conflicts(self, other):
+
+ # True dependencies, or write -> read
+ if self.accesses[AccessDirection.Write].intersects(other.accesses[AccessDirection.Read]):
+ return True
+
+ # Anti-dependencies, or read -> write
+ if self.accesses[AccessDirection.Read].intersects(other.accesses[AccessDirection.Write]):
+ return True
+
+ # Output dependencies, or write -> write
+ if self.accesses[AccessDirection.Write].intersects(other.accesses[AccessDirection.Write]):
+ return True
+
+ # read -> read does not cause a conflict
+ return False
+
+ def __str__(self):
+ return "Read: %s\nWrite: %s\n\n" % (self.accesses[AccessDirection.Read], self.accesses[AccessDirection.Write])
+
+ __repr__ = __str__
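+
+
+# Illustrative usage of MemoryAccessSet: a write by one command followed by a read of the same area
+# by a later command is a conflict (true dependency), whereas two reads of the same area are not.
+# Any hashable value can serve as the mem_area key; the compiler presumably passes a MemArea value.
+#
+#   producer = MemoryAccessSet()
+#   producer.add(MemoryRangeSet("Sram", 0, 1024), AccessDirection.Write)
+#   consumer = MemoryAccessSet()
+#   consumer.add(MemoryRangeSet("Sram", 0, 1024), AccessDirection.Read)
+#   assert producer.conflicts(consumer)
+#   assert not consumer.conflicts(MemoryAccessSet())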
diff --git a/ethosu/vela/register_command_stream_generator.py b/ethosu/vela/register_command_stream_generator.py
new file mode 100644
index 00000000..5563b969
--- /dev/null
+++ b/ethosu/vela/register_command_stream_generator.py
@@ -0,0 +1,945 @@
+# Copyright (C) 2020 Arm Limited or its affiliates. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the License); you may
+# not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an AS IS BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+# Description:
+# Register level (low-level) command stream generation for Ethos-U55. Takes a high-level command stream and generates
+# all the register settings, calculates dependencies between commands and inserts wait operations, and generates a
+# bit stream suitable for interpretation by the Ethos-U55 processor.
+
+from collections import defaultdict
+from enum import Enum, IntEnum
+from .high_level_command_stream import CommandType
+from .ethos_u55_regs.ethos_u55_regs import *
+from .tensor import MemArea, TensorBlockTraversal
+from .operation import NpuBlockType
+from .numeric_util import quantise_float32, round_up, round_away_zero, round_up_to_int, clamp_sigmoid, clamp_tanh
+from .data_type import BaseType
+import numpy as np
+from .shared_buffer_allocation import SharedBufferAllocation
+from .architecture_features import SharedBufferArea, SHRAMElements, ArchitectureFeatures
+from .nn_graph import TensorFormat, SchedulingStrategy
+from .range_set import (
+ MemoryAccessSet,
+ AccessDirection,
+)
+from .mark_tensors import (
+ reshape_operations,
+)
+from .architecture_features import Block, Kernel, Rect
+from . import scaling
+
+
+class RegisterMachine:
+ def __init__(self):
+ self.n_banks = 1
+ self.registers = [defaultdict(lambda: None) for _ in range(self.n_banks)]
+ self.bank_idx = 0
+
+ def set_register(self, reg, value):
+ is_changed = self.registers[self.bank_idx][reg] != value
+ self.registers[self.bank_idx][reg] = value
+ # is_changed = True # force command
+ return is_changed
+
+ def switch_bank(self):
+ self.bank_idx = (self.bank_idx + 1) % self.n_banks
+
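+# Note (illustrative): RegisterMachine.set_register() reports whether a write would actually change
+# the register, which lets CommandStreamEmitter below drop redundant register-setting commands;
+# get_reg_machine() keeps the DMA registers in a separate instance from the compute registers.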
+
+class CmdMode(IntEnum):
+ NoPayload = 0x0000
+ Payload32 = 0x4000
+ Mask = 0xC000
+ CmdOpMask = 0x03FF
+
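+# Note (illustrative): a command word packs the 10-bit opcode in bits [9:0], the payload mode in
+# bits [15:14] and a 16-bit parameter in bits [31:16]; Payload32 (cmd1) commands are followed by a
+# second 32-bit word carrying the offset, as emitted by cmd0_with_param()/cmd1_with_offset() below.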
+
+class BasePointerIndex(IntEnum):
+ ReadOnly = 0 # base address slot index for weights and scaling
+ Scratch = 1 # base address slot index for scratch memory area
+
+
+# TODO: Replace with definitions from ethos_u55_regs
+class IFM2Broadcast(IntEnum):
+ BroadcastHdim = 1 << 0
+ BroadcastWdim = 1 << 1
+ BroadcastCdim = 1 << 2
+ ReverseOperandOrder = 1 << 6
+ UseIFM2Scalar = 1 << 7
+
+
+class CommandStreamEmitter:
+ def __init__(self):
+ self.cmd_stream = []
+ self.reg_machine = [RegisterMachine(), RegisterMachine()]
+ self.last_absolute_wait = defaultdict(int)
+
+ def get_reg_machine(self, cmd):
+ if "DMA" in cmd.name:
+ return self.reg_machine[1]
+ else:
+ return self.reg_machine[0]
+
+ def size_in_bytes(self):
+ sz = 0
+ for cmd in self.cmd_stream:
+ sz += len(cmd) * 4
+ return sz
+
+ def to_list(self):
+ return [elem for cmd in self.cmd_stream for elem in cmd]
+
+ def print_cmds(self):
+ print("Code: Command: Param: Payload:")
+ for words_for_one_command in self.cmd_stream:
+ code = words_for_one_command[0] & 0x0000FFFF # lower 16 bits
+ param = words_for_one_command[0] >> 16 # higher 16 bits
+
+ payload_mode = CmdMode(code & CmdMode.Mask)
+
+ # code and command
+ s = " 0x%04x " % code
+ if payload_mode == CmdMode.NoPayload:
+ s += str(cmd0(code & CmdMode.CmdOpMask))
+ else:
+ s += str(cmd1(code & CmdMode.CmdOpMask))
+
+ s = s.ljust(40)
+ s += "%5d" % param
+
+ # payload
+ if payload_mode == CmdMode.Payload32:
+ s += " 0x%08x (%d)" % (words_for_one_command[1], words_for_one_command[1])
+ else:
+ s += " -"
+
+ print(s)
+
+ def cmd0_with_param(self, cmd, param):
+ if isinstance(param, Enum):
+ param = int(param.value)
+ else:
+ param = int(param)
+ param = param & 0xFFFF
+ command = cmd.value | (param << 16)
+ if not self.get_reg_machine(cmd).set_register(cmd, (command, param)):
+ return
+
+ # This is not a redundant command, actually write it
+ self.cmd_stream.append((command,))
+
+ def cmd1_with_offset(self, cmd, offset, param=0x0):
+ offset = int(offset) & 0xFFFFFFFFF
+ command = cmd.value | CmdMode.Payload32.value | (param << 16)
+
+ if not self.get_reg_machine(cmd).set_register(cmd, (command, offset)):
+ return
+
+ # This is not a redundant command, actually write it
+ self.cmd_stream.append((command, offset))
+
+ def cmd_wait(self, cmd, param, absolute_wait_time):
+ if absolute_wait_time <= self.last_absolute_wait[cmd]:
+ return
+
+ self.last_absolute_wait[cmd] = absolute_wait_time
+ param = int(param)
+ command = ((param & 0xFFFF) << 16) | cmd.value
+ self.cmd_stream.append((command,))
+
+ def cmd_do_operation(self, cmd, param=0):
+ param = int(param)
+ command = ((param & 0xFFFF) << 16) | cmd.value
+
+ self.cmd_stream.append((command,))
+ self.get_reg_machine(cmd).switch_bank()
+
+
+def calc_command_dependencies(cmd_stream, arch):
+ cmd_starts = {}
+ cmd_ends = {}
+ memory_accesses = {}
+
+ # Keep track of accumulated number of commands in command stream.
+ # First element kernel ops: (# of blocks, # of commands)
+ # Second element DMA ops: (# of commands)
+ pos = np.array((np.array((0, 0)), np.array([0])))
+
+ dependencies = {}
+
+ for cmd in cmd_stream:
+ cmd_starts[cmd] = pos
+ op_count = cmd.get_operation_count()
+ # Keep track of both num blocks and commands
+ cmd_add = 0 if (op_count[0] == 0) else 1
+ pos = np.array((pos[0] + np.array((op_count[0], cmd_add)), pos[1] + np.array([op_count[1]])))
+ cmd_ends[cmd] = np.array((pos[0], pos[1]))
+ memory_accesses[cmd] = cmd.get_memory_accesses()
+
+ for idx, cmd in enumerate(cmd_stream):
+ curr_accesses = memory_accesses[cmd]
+ # Keep track of command dependency.
+ # First element kernel ops: (# of blocks, # of commands)
+ # Second element DMA ops: (# of commands)
+ dep_offsets = np.array((np.array((-1, -1)), np.array([-1])))
+ dep_cmds = [None] * CommandType.Size.value
+ if idx > 0:
+ # Look at the previous commands in backwards order
+ for prev_cmd in cmd_stream[idx - 1 :: -1]:
+ assert prev_cmd is not cmd
+ if dep_cmds[prev_cmd.cmdtype] is None:
+ is_dependency = False
+ if cmd.cmdtype == CommandType.NpuStripe and prev_cmd.cmdtype == CommandType.NpuStripe:
+ # Special handling here, as dpu -> dpu operations require additional care
+ if not SharedBufferAllocation.is_compatible(prev_cmd.ps.shared_buffer, cmd.ps.shared_buffer):
+ is_dependency = True
+ elif memory_accesses[prev_cmd].conflicts(curr_accesses):
+ is_dependency = True
+ else:
+ if memory_accesses[prev_cmd].conflicts(curr_accesses):
+ is_dependency = True
+
+ if is_dependency:
+ new_offset = cmd_ends[prev_cmd][prev_cmd.cmdtype]
+ if new_offset[0] > dep_offsets[prev_cmd.cmdtype][0]:
+ dep_cmds[prev_cmd.cmdtype] = prev_cmd
+ dep_offsets[prev_cmd.cmdtype] = new_offset
+
+ # Check if we've got dependencies for all commands, in which case we can early out
+ for dep in dep_cmds:
+ if dep is None:
+ break
+ else:
+ break # all handled
+
+ # Convert absolute to relative dependencies, using None to signal the special case of no
+ # dependency of this kind
+ res = [None] * CommandType.Size.value
+ for i in range(CommandType.Size.value):
+ if dep_cmds[i] is not None:
+ res[i] = cmd_starts[cmd][i] - dep_offsets[i]
+
+ dependencies[cmd] = cmd_starts[cmd], res
+
+ return dependencies
+
+
+def get_op_kernel(ps):
+ if ps.primary_op is None:
+ return None
+
+ strides = ps.primary_op.attrs.get("strides", (1, 1, 1, 1))
+ dilation = ps.primary_op.attrs.get("dilation", (1, 1, 1, 1))
+ if ps.weight_tensor:
+ if ps.npu_block_type in set((NpuBlockType.VectorProduct, NpuBlockType.ElementWise)):
+ k_h = 1
+ k_w = 1
+ else:
+ k_h = ps.weight_tensor.shape[0]
+ k_w = ps.weight_tensor.shape[1]
+ else:
+ k_h = ps.primary_op.attrs.get("filter_height", 1)
+ k_w = ps.primary_op.attrs.get("filter_width", 1)
+
+ return Kernel(k_w, k_h, strides[2], strides[1], dilation[2], dilation[1])
+
+
+def full_shape(shape, fill):
+ return ([fill] * (4 - len(shape))) + shape
+
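+# For illustration: full_shape([20, 30, 16], 1) == [1, 20, 30, 16] - shorter shapes are padded up to
+# four dimensions from the left with the given fill value.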
+
+def has_prev_op_dependency(prev_cmd, cmd):
+ if prev_cmd is None:
+ return False
+ if (prev_cmd.cmdtype == cmd.cmdtype == CommandType.NpuStripe) and (prev_cmd.ps != cmd.ps):
+ if prev_cmd.ofm_tensor == cmd.ifm_tensor:
+ return True
+ else:
+ return prev_cmd.ofm_tensor.equivalence_id == cmd.ifm_tensor.equivalence_id
+ return False
+
+
+def get_op_ofm_rect(cmd):
+ start = full_shape(cmd.ofm_box.start_coord, 0)
+ end = full_shape(cmd.ofm_box.end_coord, 1)
+ return Rect(start[-2], start[-3], start[-1], end[-2] - 1, end[-3] - 1, end[-1] - 1)
+
+
+def get_op_ifm_rect(cmd):
+ start = full_shape(cmd.ifm_box.start_coord, 0)
+ end = full_shape(cmd.ifm_box.end_coord, 1)
+ return Rect(start[-2], start[-3], start[-1], end[-2] - 1, end[-3] - 1, end[-1] - 1)
+
+
+def get_op_ifmofm_block_depth(arch, cmd):
+ # Note: NOT equivalent to the normal ifm block depth calculation since
+ # it takes into account 'depthless' block operations by returning full
+ # depth
+ if cmd.ps.npu_block_type in (NpuBlockType.ConvolutionDepthWise, NpuBlockType.Pooling, NpuBlockType.ElementWise):
+ return cmd.ofm_box.get_size_shape()[-1]
+
+ return arch.calc_ifm_block_depth(cmd.ifm_box.get_size_shape()[-1], cmd.ifm_tensor.dtype.bits)
+
+
+def get_op_padding_lt(cmd):
+ if cmd.ps.npu_block_type not in (
+ NpuBlockType.ConvolutionDepthWise,
+ NpuBlockType.Pooling,
+ NpuBlockType.ConvolutionMxN,
+ ):
+ return (0, 0)
+
+ explicit_padding = list(cmd.ps.primary_op.attrs["explicit_padding"]) # (top, left, bottom, right)
+
+ # Check if this is for horizontal ifm streaming
+ if not (cmd.is_first_h_stripe and cmd.is_last_h_stripe):
+ explicit_padding[0] = cmd.pad_top
+ explicit_padding[2] = cmd.pad_bottom
+
+ return (explicit_padding[1], explicit_padding[0])
+
+
+def generate_register_command_stream(nng, sg, arch, verbose=False):
+ emit = CommandStreamEmitter()
+
+ base_ptr_idx_map = {
+ MemArea.Sram: BasePointerIndex.Scratch,
+ MemArea.OnChipFlash: BasePointerIndex.ReadOnly,
+ MemArea.OffChipFlash: BasePointerIndex.ReadOnly,
+ MemArea.Dram: BasePointerIndex.ReadOnly,
+ }
+
+ # Maps an AccumulatorType enum to the corresponding acc_format value
+ acc_format_map = {
+ SHRAMElements.Acc16: acc_format.FP_S5_10.value,
+ SHRAMElements.Acc32: acc_format.INT_32BIT.value,
+ SHRAMElements.Acc40: acc_format.INT_40BIT.value,
+ }
+
+ # Maps an elementwise op type to an elementwise_mode enum value used by NPU_OP_ELEMENTWISE
+ elementwise_mode_map = {
+ "MulAct": elementwise_mode.MUL.value,
+ "AddAct": elementwise_mode.ADD.value,
+ "SubAct": elementwise_mode.SUB.value,
+ "Minimum": elementwise_mode.MIN.value,
+ "Maximum": elementwise_mode.MAX.value,
+ "LeakyRelu": elementwise_mode.LRELU.value,
+ "Abs": elementwise_mode.ABS.value,
+ }
+
+ cmd_stream = []
+ for cmd in sg.high_level_command_stream:
+ if cmd.cmdtype == CommandType.NpuStripe and cmd.ps.npu_block_type == NpuBlockType.Default:
+ print("Warning: Skipping register command stream generation for", cmd.ps)
+ else:
+ cmd_stream.append(cmd)
+
+ dependencies = calc_command_dependencies(cmd_stream, arch)
+
+ # Initialise operator dependency state
+ prev_ifm_rect = cur_ifm_rect = None
+ prev_ifm_block_depth = cur_ifm_block_depth = None
+ prev_ofm_rect = cur_ofm_rect = None
+ prev_ofm_block = cur_ofm_block = None
+ prev_kernel = cur_kernel = None
+ prev_cmd = None
+
+ def emit_wait_commands(cmd):
+ # The command is fully set up, emit whatever wait commands we need
+ absolute_dep, relative_dep = dependencies[cmd]
+ if relative_dep[CommandType.NpuStripe] is not None:
+ if cmd.cmdtype == CommandType.DMA:
+ param = relative_dep[CommandType.NpuStripe][1]
+ if param <= 3:
+ emit.cmd_wait(cmd0.NPU_OP_KERNEL_WAIT, param, absolute_dep[CommandType.NpuStripe][1])
+ else:
+ param = relative_dep[CommandType.NpuStripe][0]
+ param = min(param, 0xFFFF) # Clamp to allowable wait amount
+
+ if relative_dep[CommandType.DMA] is not None:
+ param = relative_dep[CommandType.DMA][0]
+ param = min(param, 0xF) # Clamp to allowable wait amount
+ emit.cmd_wait(cmd0.NPU_OP_DMA_WAIT, param, absolute_dep[CommandType.DMA][0])
+ prev_cmd = None # Clear any dependency
+
+ # Start by issuing REGION commands since they remain the same
+ emit.cmd0_with_param(cmd0.NPU_SET_IFM_REGION, BasePointerIndex.Scratch)
+ emit.cmd0_with_param(cmd0.NPU_SET_IFM2_REGION, BasePointerIndex.Scratch)
+ emit.cmd0_with_param(cmd0.NPU_SET_OFM_REGION, BasePointerIndex.Scratch)
+ for cmd in cmd_stream:
+ if cmd.cmdtype == CommandType.DMA:
+ start_coord = cmd.box.start_coord
+
+ src_addr = cmd.in_tensor.address_for_coordinate(start_coord)
+ dst_addr = cmd.out_tensor.address_for_coordinate(start_coord)
+
+ if cmd.in_tensor.compressed_values is not None:
+ stream_index = cmd.in_tensor.compressed_stream_index_from_coord(start_coord)
+ sz = cmd.in_tensor.size_of_compressed_stream(stream_index)
+ else:
+ sz = cmd.in_tensor.address_for_coordinate(cmd.box.end_coord, is_top_box=True) - src_addr
+
+ # TODO: Yoda support needs to use feature_maps_not_in_fast_storage and force_outputs_to_fast_storage
+ emit.cmd0_with_param(cmd0.NPU_SET_DMA0_SRC_REGION, base_ptr_idx_map[cmd.in_tensor.mem_area])
+ emit.cmd1_with_offset(cmd1.NPU_SET_DMA0_SRC, src_addr)
+ emit.cmd0_with_param(cmd0.NPU_SET_DMA0_DST_REGION, base_ptr_idx_map[cmd.out_tensor.mem_area])
+ emit.cmd1_with_offset(cmd1.NPU_SET_DMA0_DST, dst_addr)
+ emit.cmd1_with_offset(cmd1.NPU_SET_DMA0_LEN, sz)
+ dma_channel = 0
+ mode = 0 # From external to external
+
+ emit_wait_commands(cmd)
+ emit.cmd_do_operation(cmd0.NPU_OP_DMA_START, dma_channel * 16 + mode)
+
+ elif cmd.cmdtype == CommandType.NpuStripe:
+
+ ps = cmd.ps
+ primary_op = ps.primary_op
+ npu_block_type = ps.npu_block_type
+ # Specifies if global scale from the NPU_SET_OFM_SCALE register should be used instead of per-channel scale
+ use_global_scale = False
+ # Specifies type of rounding to be used.
+ rounding_mode = rounding.TFL
+ fmf = primary_op.attrs.get("fused_memory_function", None)
+ faf = primary_op.attrs.get("fused_activation_function", None)
+
+ # Specifies which operand to apply scaling to in bitexact elementwise ADD/SUB
+ op_to_scale = 0
+
+ # Update state history
+ prev_ifm_rect = cur_ifm_rect
+ prev_ifm_block_depth = cur_ifm_block_depth
+ prev_ofm_rect = cur_ofm_rect
+ prev_ofm_block = cur_ofm_block
+ prev_kernel = cur_kernel
+
+ block_config = ps.block_config
+ emit.cmd0_with_param(cmd0.NPU_SET_OFM_BLK_HEIGHT_M1, block_config[0] - 1)
+ emit.cmd0_with_param(cmd0.NPU_SET_OFM_BLK_WIDTH_M1, block_config[1] - 1)
+ emit.cmd0_with_param(cmd0.NPU_SET_OFM_BLK_DEPTH_M1, block_config[3] - 1)
+
+ shared_buffer = ps.shared_buffer
+
+ if npu_block_type == NpuBlockType.ElementWise:
+ ifm2_broadcast = 0
+
+ if cmd.ifm_tensor.shape == []:
+ # The scalar has to be the ifm2 tensor so switch the ifms
+ cmd.ifm_tensor, cmd.ifm2_tensor = cmd.ifm2_tensor, cmd.ifm_tensor
+ cmd.ifm_box, cmd.ifm2_box = cmd.ifm2_box, cmd.ifm_box
+
+ # Set ReverseOperandOrder bit to IFM2_BROADCAST
+ ifm2_broadcast |= IFM2Broadcast.ReverseOperandOrder
+
+ # Calculate scales needed for arithmetic elementwise operators
+ if primary_op.type in set(("AddAct", "MulAct", "SubAct",)):
+ input_scale = cmd.ifm_tensor.quantization.scale_f32
+ input2_scale = cmd.ifm2_tensor.quantization.scale_f32
+ output_scale = cmd.ofm_tensor.quantization.scale_f32
+ use_global_scale = True
+
+ if primary_op.type == "MulAct":
+ if (faf == "Sigmoid") or (faf == "Tanh"):
+ output_scale = 1 / 0x3000
+
+ ofm_scale, shift = scaling.elementwise_mul_scale(input_scale, input2_scale, output_scale)
+ emit.cmd1_with_offset(cmd1.NPU_SET_OFM_SCALE, ofm_scale, shift)
+ else: # AddAct/SubAct
+ if (faf == "Sigmoid") or (faf == "Tanh"):
+ output_scale = 1 / 0x3000
+
+ if input_scale == input2_scale:
+ opa_scale, opb_scale, ofm_scale, shift = scaling.simplified_elementwise_add_sub_scale(
+ input_scale, input2_scale, output_scale
+ )
+ opa_shift = 0 # Unused for this case
+ else:
+ # Use advanced implementation only when input scales differ
+ bitdepth = cmd.ifm_tensor.dtype.bits
+ (
+ opa_scale,
+ opa_shift,
+ ofm_scale,
+ shift,
+ op_to_scale,
+ ) = scaling.advanced_elementwise_add_sub_scale(
+ input_scale, input2_scale, output_scale, bitdepth
+ )
+ opb_scale = 0 # Unused for this case
+ if ifm2_broadcast & IFM2Broadcast.ReverseOperandOrder:
+ # If the operand order is reversed we also have to swap which operand is scaled
+ if op_to_scale == scaling.OperandToScale.OPa:
+ op_to_scale = scaling.OperandToScale.OPb
+ else:
+ op_to_scale = scaling.OperandToScale.OPa
+
+ emit.cmd1_with_offset(cmd1.NPU_SET_OPA_SCALE, opa_scale, opa_shift)
+ emit.cmd1_with_offset(cmd1.NPU_SET_OPB_SCALE, opb_scale)
+ emit.cmd1_with_offset(cmd1.NPU_SET_OFM_SCALE, ofm_scale, shift)
+
+ if primary_op.type in set(("LeakyRelu", "Abs",)):
+ output_scale = cmd.ofm_tensor.quantization.scale_f32
+ use_global_scale = True
+
+ if primary_op.type == "LeakyRelu":
+ output_scale *= primary_op.attrs["alpha"]
+
+ ofm_scale, shift = scaling.quantise_scale(output_scale)
+ emit.cmd1_with_offset(cmd1.NPU_SET_OFM_SCALE, ofm_scale, shift)
+
+ # For elementwise set the required SHRAM to be equal to the total size of SHRAM
+ shram_required = arch.shram_total_banks
+ emit.cmd0_with_param(cmd0.NPU_SET_IFM_IB_END, shram_required)
+
+ # Acc buffers not needed so set AB_START to size of SHRAM
+ emit.cmd0_with_param(cmd0.NPU_SET_AB_START, arch.shram_total_banks)
+
+ # Is not a unary operator
+ if cmd.ifm2_tensor is not None:
+ if cmd.ifm2_tensor.shape == []:
+ # IFM2 is a constant, set UseIFM2Scalar bit to IFM2_BROADCAST
+ ifm2_broadcast |= IFM2Broadcast.UseIFM2Scalar
+ else:
+ ifm_box_shape = cmd.ifm_box.get_size_shape()
+ ifm2_box_shape = cmd.ifm2_box.get_size_shape()
+
+ if len(cmd.ifm_tensor.shape) > 1 and ifm_box_shape[1] != ifm2_box_shape[1]:
+ # Broadcast in 'H' dimension
+ assert cmd.ifm2_tensor.shape[1] == 1
+ ifm2_broadcast |= IFM2Broadcast.BroadcastHdim
+
+ if len(cmd.ifm_tensor.shape) > 2 and ifm_box_shape[2] != ifm2_box_shape[2]:
+ # Broadcast in 'W' dimension
+ assert cmd.ifm2_tensor.shape[2] == 1
+ ifm2_broadcast |= IFM2Broadcast.BroadcastWdim
+
+ if len(cmd.ifm_tensor.shape) > 3 and ifm_box_shape[3] != ifm2_box_shape[3]:
+ # Broadcast in 'C' dimension
+ assert cmd.ifm2_tensor.shape[3] == 1
+ ifm2_broadcast |= IFM2Broadcast.BroadcastCdim
+
+ # Set IFM2_IB_START to the latter half of the IB space
+ ifm_ib_start = shared_buffer.bank_locations[SharedBufferArea.IFM]
+ emit.cmd0_with_param(
+ cmd0.NPU_SET_IFM2_IB_START, (shram_required - ifm_ib_start) / 2 + ifm_ib_start
+ )
+
+ emit.cmd0_with_param(cmd0.NPU_SET_IFM2_BROADCAST, ifm2_broadcast)
+
+ else:
+ emit.cmd0_with_param(
+ cmd0.NPU_SET_IFM_IB_END,
+ shared_buffer.bank_locations[SharedBufferArea.IFM]
+ + shared_buffer.banks_required[SharedBufferArea.IFM],
+ )
+ emit.cmd0_with_param(cmd0.NPU_SET_AB_START, shared_buffer.bank_locations[SharedBufferArea.Accumulators])
+
+ emit.cmd0_with_param(cmd0.NPU_SET_ACC_FORMAT, acc_format_map[shared_buffer.use_accumulator_element])
+
+ emit.cmd0_with_param(cmd0.NPU_SET_IFM_UPSCALE, 0)
+
+ if npu_block_type in set(
+ (NpuBlockType.ConvolutionMxN, NpuBlockType.ConvolutionDepthWise, NpuBlockType.Pooling)
+ ):
+ # Set up padding
+ explicit_padding = list(primary_op.attrs["explicit_padding"]) # (top, left, bottom, right)
+
+ # Check if this is for horizontal ifm streaming
+ if not (cmd.is_first_h_stripe and cmd.is_last_h_stripe):
+ explicit_padding[0] = cmd.pad_top
+ explicit_padding[2] = cmd.pad_bottom
+
+ # Indexing from the end since a 1x1 AvgPool with non 4-dimensional input/output might have
+ # been added because an activation function needed to be fused.
+ if cmd.ifm_box.start_coord[-2] > 0:
+ explicit_padding[1] = 0
+ if cmd.ifm_box.end_coord[-2] < cmd.ifm_tensor.shape[-2]:
+ explicit_padding[3] = 0
+
+ emit.cmd0_with_param(cmd0.NPU_SET_IFM_PAD_TOP, explicit_padding[0])
+ emit.cmd0_with_param(cmd0.NPU_SET_IFM_PAD_LEFT, explicit_padding[1])
+ emit.cmd0_with_param(cmd0.NPU_SET_IFM_PAD_BOTTOM, explicit_padding[2])
+ emit.cmd0_with_param(cmd0.NPU_SET_IFM_PAD_RIGHT, explicit_padding[3])
+
+ stride = primary_op.attrs["strides"][2] - 1
+ stride |= (primary_op.attrs["strides"][1] - 1) << 1
+
+ if npu_block_type == NpuBlockType.Pooling:
+ k_height, k_width = primary_op.attrs["ksize"][1:3]
+ emit.cmd0_with_param(cmd0.NPU_SET_KERNEL_HEIGHT_M1, k_height - 1)
+ emit.cmd0_with_param(cmd0.NPU_SET_KERNEL_WIDTH_M1, k_width - 1)
+
+ valid_padding = sum(explicit_padding) == 0
+
+ if primary_op.type in set(("AvgPool", "AvgPoolAct")) and valid_padding:
+ # For valid padding vela has to output scaling values
+ if faf == "Sigmoid" or faf == "Tanh":
+ rescale = 0x3000 * cmd.ifm_tensor.quantization.scale_f32
+ rescale_bits = len(bin(round_up_to_int(rescale))) - 2 + 1
+
+ scale, shift = scaling.quantise_pooling_scale(k_height * k_width, rescale_bits)
+ scale = int(round_away_zero(scale * rescale))
+ else:
+ # If the average pool is fused with a concat or other memory operation, rescaling might be needed.
+ # k_height == k_width == 1 is always true in this case.
+ # Normally the scale is maximised to get maximum precision, which means that if rescale != 1,
+ # the scale needs to account for the number of bits required for rescaling.
+ rescale = cmd.ifm_tensor.quantization.scale_f32 / cmd.ofm_tensor.quantization.scale_f32
+ rescale_bits = 0
+ if k_height == k_width == 1:
+ if fmf == "ConcatSliceWrite":
+ rounding_mode = rounding.NATURAL
+ if rescale > 1:
+ rescale_bits = len(bin(round_up_to_int(rescale))) - 2 + 1
+ elif rescale < 1:
+ rescale_bits = -(len(bin(round_up_to_int(1 / rescale))) - 2 - 1)
+ scale, shift = scaling.quantise_pooling_scale(k_height * k_width, rescale_bits)
+ scale = int(round_away_zero(scale * rescale))
+
+ emit.cmd1_with_offset(cmd1.NPU_SET_OFM_SCALE, scale, shift)
+ # Valid-padded average pool should use the global scale from
+ # NPU_SET_OFM_SCALE register, which is set above.
+ use_global_scale = True
+
+ else: # Convolution
+ assert cmd.weight_tensor.block_traversal != TensorBlockTraversal.Default
+ emit.cmd0_with_param(cmd0.NPU_SET_KERNEL_HEIGHT_M1, cmd.weight_tensor.shape[0] - 1)
+ emit.cmd0_with_param(cmd0.NPU_SET_KERNEL_WIDTH_M1, cmd.weight_tensor.shape[1] - 1)
+ if cmd.weight_tensor.block_traversal == TensorBlockTraversal.PartKernelFirst:
+ # Part-kernel-first weight ordering
+ assert npu_block_type == NpuBlockType.ConvolutionMxN
+ stride |= 1 << 2
+
+ emit.cmd0_with_param(cmd0.NPU_SET_KERNEL_STRIDE, stride)
+
+ elif npu_block_type in set((NpuBlockType.VectorProduct,)):
+ # Vector product is implemented using a 1x1 convolution so need
+ # to setup the appropriate padding and kernel info
+ emit.cmd0_with_param(cmd0.NPU_SET_IFM_PAD_TOP, 0)
+ emit.cmd0_with_param(cmd0.NPU_SET_IFM_PAD_LEFT, 0)
+ emit.cmd0_with_param(cmd0.NPU_SET_IFM_PAD_BOTTOM, 0)
+ emit.cmd0_with_param(cmd0.NPU_SET_IFM_PAD_RIGHT, 0)
+
+ # kernel stride reg = 0 means stride(1,1) + depth first weight
+ # order + dilation(0,0) + kernel_split_size=8
+ emit.cmd0_with_param(cmd0.NPU_SET_KERNEL_STRIDE, 0)
+
+ emit.cmd0_with_param(cmd0.NPU_SET_KERNEL_HEIGHT_M1, 0)
+ emit.cmd0_with_param(cmd0.NPU_SET_KERNEL_WIDTH_M1, 0)
+
+ if npu_block_type in set(
+ (NpuBlockType.ConvolutionMxN, NpuBlockType.ConvolutionDepthWise, NpuBlockType.VectorProduct)
+ ):
+ # Emit Weight base address commands, only maps the area required for
+ # this command's weights from the larger tensor.
+ stream_index = cmd.weight_tensor.compressed_stream_index_from_coord(cmd.weight_box.start_coord)
+ weight_addr = cmd.weight_tensor.address_for_coordinate(cmd.weight_box.start_coord)
+ weight_len = cmd.weight_tensor.size_of_compressed_stream(stream_index)
+ # Select weight/scale region depending on where permanent storage was defined
+ weight_region = base_ptr_idx_map[cmd.weight_tensor.mem_area]
+ if arch.permanent_storage_mem_area == MemArea.Sram:
+ weight_region = BasePointerIndex.ReadOnly
+ emit.cmd0_with_param(cmd0.NPU_SET_WEIGHT_REGION, weight_region)
+ emit.cmd1_with_offset(cmd1.NPU_SET_WEIGHT_BASE, weight_addr)
+ emit.cmd1_with_offset(cmd1.NPU_SET_WEIGHT_LENGTH, weight_len)
+
+ # Emit Scale & Bias base address commands, with length matching the amount required by
+ # the weight tensors.
+ if cmd.scale_tensor is not None:
+ # Get address and size of the scale/bias data area
+ scale_addr = cmd.scale_tensor.address_for_coordinate(cmd.weight_box.start_coord[-1:])
+ scale_len = (
+ cmd.scale_tensor.address_for_coordinate(cmd.weight_box.end_coord[-1:], True) - scale_addr
+ )
+ # Emit base address for NPU to access scale & bias data
+ scale_region = base_ptr_idx_map[cmd.scale_tensor.mem_area]
+ if arch.permanent_storage_mem_area == MemArea.Sram:
+ scale_region = BasePointerIndex.ReadOnly
+ emit.cmd0_with_param(cmd0.NPU_SET_SCALE_REGION, scale_region)
+ emit.cmd1_with_offset(cmd1.NPU_SET_SCALE_BASE, scale_addr)
+ emit.cmd1_with_offset(cmd1.NPU_SET_SCALE_LENGTH, round_up(scale_len, 16))
+
+ ofm_quant = cmd.ofm_tensor.quantization
+ ofm_quant_qmin = cmd.ofm_tensor.quantization.quant_min
+ ofm_quant_qmax = cmd.ofm_tensor.quantization.quant_max
+ ifm_min = cmd.ifm_tensor.quantization.min
+ ifm_max = cmd.ifm_tensor.quantization.max
+
+ # Emit commands for any fused activation function
+ if faf is None:
+ emit.cmd0_with_param(cmd0.NPU_SET_ACTIVATION, activation.NONE)
+ # Even if no activation function, values need to be set to override previous values
+ faf_min = ofm_quant_qmin
+ faf_max = ofm_quant_qmax
+ elif faf == "Relu":
+ emit.cmd0_with_param(cmd0.NPU_SET_ACTIVATION, activation.NONE)
+ faf_min = quantise_float32(0.0, ofm_quant.scale_f32, ofm_quant.zero_point)
+ faf_max = ofm_quant_qmax
+ elif faf == "Relu6":
+ emit.cmd0_with_param(cmd0.NPU_SET_ACTIVATION, activation.NONE)
+ faf_min = quantise_float32(0.0, ofm_quant.scale_f32, ofm_quant.zero_point)
+ faf_max = quantise_float32(6.0, ofm_quant.scale_f32, ofm_quant.zero_point)
+ elif faf == "ReluN1To1":
+ emit.cmd0_with_param(cmd0.NPU_SET_ACTIVATION, activation.NONE)
+ faf_min = quantise_float32(-1.0, ofm_quant.scale_f32, ofm_quant.zero_point)
+ faf_max = quantise_float32(1.0, ofm_quant.scale_f32, ofm_quant.zero_point)
+ elif faf == "Tanh":
+ emit.cmd0_with_param(cmd0.NPU_SET_ACTIVATION, activation.TANH)
+ faf_min = quantise_float32(clamp_tanh(ifm_min), ofm_quant.scale_f32, ofm_quant.zero_point)
+ faf_max = quantise_float32(clamp_tanh(ifm_max), ofm_quant.scale_f32, ofm_quant.zero_point)
+ elif faf == "Sigmoid":
+ emit.cmd0_with_param(cmd0.NPU_SET_ACTIVATION, activation.SIGMOID)
+ faf_min = quantise_float32(clamp_sigmoid(ifm_min), ofm_quant.scale_f32, ofm_quant.zero_point)
+ faf_max = quantise_float32(clamp_sigmoid(ifm_max), ofm_quant.scale_f32, ofm_quant.zero_point)
+ else:
+ raise Exception("Unsupported fused_activation_function = " + faf)
+
+ # Activation range needs to be set based upon the quantisation range and the fused activation range
+ emit.cmd0_with_param(cmd0.NPU_SET_ACTIVATION_MIN, max(ofm_quant_qmin, faf_min))
+ emit.cmd0_with_param(cmd0.NPU_SET_ACTIVATION_MAX, min(ofm_quant_qmax, faf_max))
+
+ out_shape = cmd.ofm_box.get_size_shape()
+ if len(out_shape) >= 4:
+ emit.cmd0_with_param(cmd0.NPU_SET_OFM_HEIGHT_M1, out_shape[-3] - 1)
+ else:
+ emit.cmd0_with_param(cmd0.NPU_SET_OFM_HEIGHT_M1, 0)
+ if len(out_shape) >= 2:
+ emit.cmd0_with_param(cmd0.NPU_SET_OFM_WIDTH_M1, out_shape[-2] - 1)
+ else:
+ emit.cmd0_with_param(cmd0.NPU_SET_OFM_WIDTH_M1, 0)
+ emit.cmd0_with_param(cmd0.NPU_SET_OFM_DEPTH_M1, out_shape[-1] - 1)
+
+ if npu_block_type in set((NpuBlockType.ConvolutionMxN, NpuBlockType.VectorProduct)):
+ in_shape = cmd.ifm_box.get_size_shape()
+ emit.cmd0_with_param(cmd0.NPU_SET_IFM_DEPTH_M1, in_shape[-1] - 1)
+ else:
+ emit.cmd0_with_param(cmd0.NPU_SET_IFM_DEPTH_M1, out_shape[-1] - 1)
+
+ for tens, box, ptr_ops, stride_ops, zero_point_op in (
+ (
+ cmd.ifm_tensor,
+ cmd.ifm_box,
+ (cmd1.NPU_SET_IFM_BASE0, cmd1.NPU_SET_IFM_BASE1, cmd1.NPU_SET_IFM_BASE2, cmd1.NPU_SET_IFM_BASE3),
+ (cmd1.NPU_SET_IFM_STRIDE_C, cmd1.NPU_SET_IFM_STRIDE_Y, cmd1.NPU_SET_IFM_STRIDE_X),
+ cmd0.NPU_SET_IFM_ZERO_POINT,
+ ),
+ (
+ cmd.ifm2_tensor,
+ cmd.ifm2_box,
+ (
+ cmd1.NPU_SET_IFM2_BASE0,
+ cmd1.NPU_SET_IFM2_BASE1,
+ cmd1.NPU_SET_IFM2_BASE2,
+ cmd1.NPU_SET_IFM2_BASE3,
+ ),
+ (cmd1.NPU_SET_IFM2_STRIDE_C, cmd1.NPU_SET_IFM2_STRIDE_Y, cmd1.NPU_SET_IFM2_STRIDE_X),
+ cmd0.NPU_SET_IFM2_ZERO_POINT,
+ ),
+ (
+ cmd.ofm_tensor,
+ cmd.ofm_box,
+ (cmd1.NPU_SET_OFM_BASE0, cmd1.NPU_SET_OFM_BASE1, cmd1.NPU_SET_OFM_BASE2, cmd1.NPU_SET_OFM_BASE3),
+ (cmd1.NPU_SET_OFM_STRIDE_C, cmd1.NPU_SET_OFM_STRIDE_Y, cmd1.NPU_SET_OFM_STRIDE_X),
+ cmd0.NPU_SET_OFM_ZERO_POINT,
+ ),
+ ):
+
+ if tens is None:
+ continue
+
+ need_zero_point = (faf is not None) or (fmf == "ConcatSliceWrite")
+ if (
+ primary_op.type in set(("AvgPool", "AvgPoolAct")) and not need_zero_point
+ ) or tens.quantization is None:
+ # Actual integer operation, just set scale to 1 and zero point to 0
+ emit.cmd0_with_param(zero_point_op, 0)
+ else:
+ assert tens.quantization.zero_point is not None, "need an actual zero point set"
+ emit.cmd0_with_param(zero_point_op, int(tens.quantization.zero_point))
+
+ if tens.shape == []:
+ # Empty shape, elementwise constant
+ ifm2_scalar = tens.quant_values.astype(np.uint8)
+ assert ifm2_scalar.size == 1
+ emit.cmd0_with_param(cmd0.NPU_SET_IFM2_SCALAR, ifm2_scalar.item(0))
+ continue
+
+ height_0, height_1, width_0, addresses = tens.addresses_for_rolling_buffer(
+ box.start_coord, box.end_coord
+ )
+ if npu_block_type != NpuBlockType.VectorProduct:
+ if tens == cmd.ifm_tensor:
+ emit.cmd0_with_param(cmd0.NPU_SET_IFM_HEIGHT0_M1, height_0 - 1)
+ emit.cmd0_with_param(cmd0.NPU_SET_IFM_HEIGHT1_M1, height_1 - 1)
+ emit.cmd0_with_param(cmd0.NPU_SET_IFM_WIDTH0_M1, width_0 - 1)
+ elif tens == cmd.ofm_tensor:
+ emit.cmd0_with_param(cmd0.NPU_SET_OFM_HEIGHT0_M1, height_0 - 1)
+ emit.cmd0_with_param(cmd0.NPU_SET_OFM_HEIGHT1_M1, height_1 - 1)
+ emit.cmd0_with_param(cmd0.NPU_SET_OFM_WIDTH0_M1, width_0 - 1)
+ elif tens == cmd.ifm2_tensor:
+ emit.cmd0_with_param(cmd0.NPU_SET_IFM2_HEIGHT0_M1, height_0 - 1)
+ emit.cmd0_with_param(cmd0.NPU_SET_IFM2_HEIGHT1_M1, height_1 - 1)
+ emit.cmd0_with_param(cmd0.NPU_SET_IFM2_WIDTH0_M1, width_0 - 1)
+ else:
+ if len(out_shape) == 2:
+ # TODO: N is put in the W-dimension for now.
+ # It should be spread over H and W, but then the block size selection
+ # and stride calculation would need to be changed.
+ if tens == cmd.ifm_tensor:
+ emit.cmd0_with_param(cmd0.NPU_SET_IFM_WIDTH0_M1, out_shape[-2] - 1)
+ elif tens == cmd.ofm_tensor:
+ emit.cmd0_with_param(cmd0.NPU_SET_OFM_WIDTH0_M1, out_shape[-2] - 1)
+ else:
+ assert False
+
+ for idx, addr in enumerate(addresses):
+ if addr is None:
+ addresses[idx] = 0
+
+ emit.cmd1_with_offset(ptr_ops[0], addresses[0])
+ emit.cmd1_with_offset(ptr_ops[1], addresses[1])
+ emit.cmd1_with_offset(ptr_ops[2], addresses[2])
+ emit.cmd1_with_offset(ptr_ops[3], addresses[3])
+
+ strides = tens.get_strides()
+ emit.cmd1_with_offset(stride_ops[0], strides[1]) # stride between 16-byte channel blocks (C)
+ emit.cmd1_with_offset(stride_ops[2], strides[3]) # stride between horizontal values (W)
+ emit.cmd1_with_offset(stride_ops[1], strides[2]) # stride between vertical values (H)
+
+ if tens.format == TensorFormat.NHCWB16:
+ # Check that all BasePointer addresses are aligned to 16 bytes
+ assert (int(addresses[0]) % 16) == 0
+ assert (int(addresses[1]) % 16) == 0
+ assert (int(addresses[2]) % 16) == 0
+ assert (int(addresses[3]) % 16) == 0
+
+ ofm_dtype = cmd.ofm_tensor.dtype
+ assert ofm_dtype.type & BaseType.Int
+ prec = 0
+ if ofm_dtype.size_in_bits() == 8:
+ prec = 0
+ elif ofm_dtype.size_in_bits() == 16:
+ prec = 2
+ else:
+ assert 0
+
+ if ofm_dtype.type & BaseType.Signed:
+ prec += 1
+
+ if use_global_scale:
+ # Set global scale bit, as opposed to using per channel scale
+ prec |= 1 << 8
+
+ if cmd.ofm_tensor.format == TensorFormat.NHCWB16:
+ prec |= 1 << 6
+
+ prec |= rounding_mode.value << 14
+
+ emit.cmd0_with_param(cmd0.NPU_SET_OFM_PRECISION, prec)
+
+ prec = None
+ weight_bits = 8
+ if cmd.weight_tensor is not None:
+ weight_bits = cmd.weight_tensor.dtype.size_in_bits()
+
+ ifm_dtype = cmd.ifm_tensor.dtype
+
+ assert weight_bits == 8, "Unsupported weight bit depth"
+ assert ifm_dtype.size_in_bits() in {8, 16}
+
+ if ifm_dtype.size_in_bits() == 8:
+ if ifm_dtype.type & BaseType.Signed:
+ prec = ifm_precision.W8_S8
+ else:
+ prec = ifm_precision.W8_U8
+ elif ifm_dtype.size_in_bits() == 16:
+ if ifm_dtype.type & BaseType.Signed:
+ prec = ifm_precision.W8_S16
+ else:
+ prec = ifm_precision.W8_U16
+
+ ifm_prec = prec.value
+ ifm2_prec = ifm_prec
+
+ if cmd.ifm_tensor.format == TensorFormat.NHCWB16:
+ ifm_prec |= 1 << 6
+
+ ifm_prec |= op_to_scale << 8
+
+ emit.cmd0_with_param(cmd0.NPU_SET_IFM_PRECISION, ifm_prec)
+
+ if cmd.ifm2_tensor is not None:
+ if cmd.ifm2_tensor.format == TensorFormat.NHCWB16:
+ ifm2_prec |= 1 << 6
+ emit.cmd0_with_param(cmd0.NPU_SET_IFM2_PRECISION, ifm2_prec)
+
+ emit_wait_commands(cmd)
+
+ # Get op parameters
+ cur_ifm_block_depth = get_op_ifmofm_block_depth(arch, cmd)
+ cur_ofm_block = Block(ps.block_config[1], ps.block_config[0], ps.block_config[3])
+ cur_ofm_rect = get_op_ofm_rect(cmd)
+ cur_ifm_rect = get_op_ifm_rect(cmd)
+ cur_kernel = get_op_kernel(cmd.ps)
+ cur_padLT = get_op_padding_lt(cmd)
+ if (prev_kernel is not None) and (cur_kernel is not None) and has_prev_op_dependency(prev_cmd, cmd):
+ if cmd.ifm_tensor.shape == prev_cmd.ofm_tensor.shape:
+ blockdep = arch.calc_block_dep(
+ prev_ifm_rect,
+ prev_ofm_rect,
+ prev_ifm_block_depth,
+ prev_ofm_block,
+ prev_kernel,
+ cur_ifm_rect,
+ cur_ofm_rect,
+ cur_ifm_block_depth,
+ cur_ofm_block,
+ cur_kernel,
+ cur_padLT,
+ )
+ else:
+ blockdep = 0
+ else:
+ blockdep = ArchitectureFeatures.MAX_BLOCKDEP
+
+ # Set between every op (dependent or not)
+ blockdep = min(blockdep, arch.max_blockdep)
+ emit.cmd0_with_param(cmd0.NPU_SET_BLOCKDEP, blockdep)
+ prev_cmd = cmd
+
+ if npu_block_type == NpuBlockType.ConvolutionMxN:
+ emit.cmd_do_operation(cmd0.NPU_OP_CONV)
+ elif npu_block_type == NpuBlockType.ConvolutionDepthWise:
+ emit.cmd_do_operation(cmd0.NPU_OP_DEPTHWISE)
+ elif npu_block_type == NpuBlockType.VectorProduct:
+ # Vector product is implemented using a 1x1 convolution
+ emit.cmd_do_operation(cmd0.NPU_OP_CONV)
+ elif npu_block_type == NpuBlockType.Pooling:
+ param = "Max" not in primary_op.type
+ emit.cmd_do_operation(cmd0.NPU_OP_POOL, param=param)
+ elif npu_block_type == NpuBlockType.ElementWise:
+ param = elementwise_mode_map[primary_op.type]
+ emit.cmd_do_operation(cmd0.NPU_OP_ELEMENTWISE, param)
+ else:
+ print("Warning: Skipping register command stream generation for", ps)
+
+ # Fill in final part of command stream:
+ emit.cmd_do_operation(cmd0.NPU_OP_STOP, param=0xFFFF)
+
+ sg.register_command_stream = emit.to_list()
+ if verbose:
+ emit.print_cmds()
+ print("number of commands", len(emit.cmd_stream))
+ print("command stream length in words", len(sg.register_command_stream))
diff --git a/ethosu/vela/rewrite_graph.py b/ethosu/vela/rewrite_graph.py
new file mode 100644
index 00000000..e6e24e62
--- /dev/null
+++ b/ethosu/vela/rewrite_graph.py
@@ -0,0 +1,171 @@
+# Copyright (C) 2020 Arm Limited or its affiliates. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the License); you may
+# not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an AS IS BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+# Description:
+# Functions for abstracting out the traversal and rewriting of graphs so that the optimisation passes can focus on the
+# correct operation.
+#
+# Requires two lists, one of functions that rewrite Tensors, and one of functions that rewrite Operations.
+#
+# Pre-order traversal supports rewrites, so functions may return something other than the original value.
+#
+# Post-order traversal does not support rewrites, so functions must return the original value.
+
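+# Illustrative sketch of a rewrite function: it takes (op_or_tensor, arch) and returns either the
+# same object (meaning no change) or a replacement; rewrite_graph_pre_order() keeps re-applying the
+# rewrite list until the result stops changing.
+#
+#   def identity_rewrite(op, arch):
+#       return op  # returning the input unchanged means "no rewrite"
+#
+#   sg = rewrite_graph_pre_order(sg, arch, [], [identity_rewrite])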
+
+def rewrite_graph_pre_order(sg, arch, tensor_rewrite_list, op_rewrite_list, rewrite_unsupported=True):
+
+ op_visit_dict = dict()
+ tens_visit_dict = dict()
+
+ def visit_op(op):
+ if op in op_visit_dict:
+ return op_visit_dict[op]
+ res = op
+ prev_res = None
+ while prev_res != res:
+ prev_res = res
+ for rewrite in op_rewrite_list:
+ if res.run_on_npu or rewrite_unsupported:
+ res = rewrite(res, arch)
+
+ op_visit_dict[op] = res
+ op_visit_dict[res] = res
+
+ inputs = res.inputs
+ res.inputs = []
+ for tens in inputs:
+ res.inputs.append(visit_tens(tens))
+
+ outputs = res.outputs
+ res.outputs = []
+ for tens in outputs:
+ res.outputs.append(visit_tens(tens))
+
+ return res
+
+ def visit_tens(tens):
+ if tens in tens_visit_dict:
+ return tens_visit_dict[tens]
+
+ res = tens
+ prev_res = None
+ while prev_res != res:
+ prev_res = res
+ for rewrite in tensor_rewrite_list:
+ res = rewrite(res, arch)
+
+ tens_visit_dict[tens] = res
+ tens_visit_dict[res] = res
+
+ ops = res.ops
+ res.ops = []
+ for op in ops:
+ res.ops.append(visit_op(op))
+ return res
+
+ sg.output_tensors = [visit_tens(tens) for tens in sg.output_tensors]
+ sg.refresh_after_modification()
+
+ return sg
+
+
+def visit_graph_post_order(sg, arch, tensor_visit_list, op_visit_list):
+
+ op_visit_dict = dict()
+ tens_visit_dict = dict()
+
+ def visit_op(op):
+ if op in op_visit_dict:
+ return op_visit_dict[op]
+ op_visit_dict[op] = op
+
+ for tens in op.inputs:
+ visit_tens(tens)
+
+ for visit in op_visit_list:
+ visit(op, arch)
+
+ for tens in op.outputs:
+ visit_tens(tens)
+
+ return op
+
+ def visit_tens(tens):
+ if tens in tens_visit_dict:
+ return tens_visit_dict[tens]
+
+ tens_visit_dict[tens] = tens
+
+ for op in tens.ops:
+ visit_op(op)
+
+ for visit in tensor_visit_list:
+ visit(tens, arch)
+
+ return tens
+
+ for tens in sg.output_tensors:
+ visit_tens(tens)
+
+ sg.refresh_after_modification()
+
+ return sg
+
+
+def verify_graph_health(nng):
+
+ for sg in nng.subgraphs:
+ verify_subgraph_health(sg)
+
+ return True
+
+
+def verify_subgraph_health(sg):
+ op_visit_dict = dict()
+ tens_visit_dict = dict()
+
+ def visit_op(op):
+ if op in op_visit_dict:
+ return op_visit_dict[op]
+ op_visit_dict[op] = op
+
+ for tens in op.inputs:
+ assert op in tens.consumers()
+ visit_tens(tens)
+
+ for tens in op.outputs:
+ assert op in tens.ops
+ visit_tens(tens)
+
+ return op
+
+ def visit_tens(tens):
+ if tens in tens_visit_dict:
+ return tens_visit_dict[tens]
+
+ tens_visit_dict[tens] = tens
+
+ for op in tens.ops:
+ assert tens in op.outputs
+ visit_op(op)
+
+ return tens
+
+ for tens in sg.output_tensors:
+ visit_tens(tens)
+
+ return True
diff --git a/ethosu/vela/scaling.py b/ethosu/vela/scaling.py
new file mode 100644
index 00000000..b255f938
--- /dev/null
+++ b/ethosu/vela/scaling.py
@@ -0,0 +1,91 @@
+# Copyright (C) 2020 Arm Limited or its affiliates. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the License); you may
+# not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an AS IS BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+# Description:
+# Contains various scaling calculations for weights, elementwise operations, pooling etc.
+
+import math
+from .numeric_util import round_away_zero
+from enum import IntEnum
+
+
+class OperandToScale(IntEnum):
+ OPa = 1
+ OPb = 2
+
+
+# Quantise floating point scale value into 32-bit int scale and 6-bit shift
+def quantise_scale(scale):
+ significand, exponent = math.frexp(scale)
+ significand_q31 = int(round_away_zero(significand * (1 << 31)))
+ exponent_q31 = exponent - 31
+ shift = exponent_q31 * -1
+
+ if shift >= (1 << 6):
+ # Shift outside of valid range, set scale to 0
+ return 0, 16
+
+ return significand_q31, shift
+
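+# Worked example (illustrative): quantise_scale(0.5) returns (1 << 30, 31), since frexp(0.5) gives
+# (0.5, 0), the significand quantises to 2**30 and the shift becomes 31 - 0 = 31; applying
+# (x * (1 << 30)) >> 31 then multiplies x by 0.5 as intended.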
+
+# Calculate global OFM scale for Average Pooling
+def quantise_pooling_scale(nr_kernel_elements, rescale_bits=0):
+ _, k = math.frexp(nr_kernel_elements - 1)
+ N = 31 - rescale_bits
+ scale = ((1 << (N + k)) + (1 << k)) // nr_kernel_elements
+ shift = N + k
+
+ assert shift < (1 << 6)
+
+ return scale, shift
+
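+# Worked example (illustrative): for a 3x3 average pool, quantise_pooling_scale(9) returns
+# (3817748709, 35), and 3817748709 / 2**35 is approximately 1/9, i.e. the reciprocal of the number
+# of kernel elements expressed as a fixed-point multiply and shift.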
+
+# Calculate elementwise Mul OFM scale+shift
+def elementwise_mul_scale(input_scale, input2_scale, output_scale):
+ output_rescale = (input_scale * input2_scale) / output_scale
+ out_scale, out_shift = quantise_scale(output_rescale)
+ return out_scale, out_shift
+
+
+# Simplified version of calculating elementwise Add/Sub scales
+def simplified_elementwise_add_sub_scale(input1_scale, input2_scale, output_scale, input_shift=16):
+ max_input_scale = max(input1_scale, input2_scale)
+
+ input1_rescale = input1_scale * (1 << input_shift) / (2 * max_input_scale)
+ input2_rescale = input2_scale * (1 << input_shift) / (2 * max_input_scale)
+ output_rescale = (2 * max_input_scale) / (output_scale * (1 << input_shift))
+
+ out_scale, out_shift = quantise_scale(output_rescale)
+
+ return input1_rescale, input2_rescale, out_scale, out_shift
+
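+# Note (illustrative): the scheme above first brings both inputs to a common intermediate scale of
+# 2 * max_input_scale / (1 << input_shift) via input1_rescale/input2_rescale, performs the add/sub
+# at that scale, and then requantises the result to the output scale using out_scale and out_shift.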
+
+# Advanced version of calculating elementwise Add/Sub scales
+def advanced_elementwise_add_sub_scale(input1_scale, input2_scale, output_scale, bitdepth):
+ # Always scale the smaller of the input scales
+ max_input_scale = max(input1_scale, input2_scale)
+ min_input_scale = min(input1_scale, input2_scale)
+ input_shift = 20 if bitdepth == 8 else 14
+ op_to_scale = OperandToScale.OPa if input1_scale < input2_scale else OperandToScale.OPb
+
+ input1_rescale, _, out_scale, out_shift = simplified_elementwise_add_sub_scale(
+ min_input_scale, max_input_scale, output_scale, input_shift
+ )
+
+ in_scale, in_shift = quantise_scale(input1_rescale)
+
+ return in_scale, in_shift, out_scale, out_shift, op_to_scale
diff --git a/ethosu/vela/scheduler.py b/ethosu/vela/scheduler.py
new file mode 100644
index 00000000..c35c1566
--- /dev/null
+++ b/ethosu/vela/scheduler.py
@@ -0,0 +1,949 @@
+# Copyright (C) 2020 Arm Limited or its affiliates. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the License); you may
+# not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an AS IS BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+# Description:
+# The scheduler costs various strategies for scheduling the network in order to select the block configuration.
+
+import enum
+from .nn_graph import (
+ TensorPurpose,
+ TensorSubPurpose,
+ TensorFormat,
+ MemArea,
+ SchedulingStrategy,
+ CascadedPass,
+ PassPlacement,
+ SchedulerRewrite,
+ Operation,
+ NpuBlockType,
+)
+from . import live_range
+import numpy as np
+from . import npu_performance
+from . import stats_writer
+from .npu_performance import make_bandwidth_array, make_macs_array, make_cycles_array, make_metrics_arrays, PassCycles
+import time, copy
+from .high_level_command_stream_generator import calc_allowed_ofm_ifm_overlap_for_pass_list
+from .shared_buffer_allocation import (
+ find_block_configs_suitable_for_pass_and_shared_buffer,
+ shared_buffer_allocation_for_pass_and_block_config,
+)
+from functools import lru_cache
+
+
+class ParetoMetric(enum.Enum):
+ BwCycMem = 1
+ BwCycMemBlkH = 2
+
+ def __str__(self):
+ return self.name
+
+
+class SchedulerOptions:
+ def __init__(
+ self,
+ use_cascading=True,
+ use_ifm_ofm_overlap=True,
+ verbose_schedule=False,
+ verbose_pareto_frontier_schedules=False,
+ use_ifm_streaming=True,
+ pareto_metric=ParetoMetric.BwCycMem,
+ ):
+ self.use_cascading = use_cascading
+ self.use_ifm_ofm_overlap = use_ifm_ofm_overlap
+ self.verbose_schedule = verbose_schedule
+ self.verbose_pareto_frontier_schedules = verbose_pareto_frontier_schedules
+ self.use_ifm_streaming = use_ifm_streaming
+ self.pareto_metric = pareto_metric
+
+ def __str__(self):
+ return type(self).__name__ + ": " + str(self.__dict__)
+
+ __repr__ = __str__
+
+
+class Strategy:
+ __slots__ = "strat", "param", "passes", "block_configs", "rewrite_list", "bws", "macs", "cycles", "sram_used"
+
+ def __init__(self, strat, param, passes, block_configs, rewrite_list, bws, macs, cycles, sram_used):
+ self.strat = strat
+ self.param = param
+ self.passes = passes
+ self.block_configs = block_configs
+ self.rewrite_list = (
+ rewrite_list # list of (SchedulerRewrite, Tensor, new sub purpose, purpose param a, purpose param b, pass)
+ )
+ self.bws = bws
+ self.macs = macs
+ self.cycles = cycles
+ self.sram_used = sram_used
+
+ def __eq__(self, other):
+ if self.strat != other.strat:
+ return False
+ if self.param != other.param:
+ return False
+ if self.block_configs != other.block_configs:
+ return False
+ if self.passes != other.passes:
+ return False
+ if (self.bws != other.bws).any():
+ return False
+ if (self.macs != other.macs).any():
+ return False
+ if (self.cycles != other.cycles).any():
+ return False
+ if self.sram_used != other.sram_used:
+ return False
+ return True
+
+ def empty(self):
+ return not self.passes
+
+ def key(self):
+ return self.passes[-1]
+
+ def clone(self):
+ return Strategy(
+ self.strat,
+ self.param,
+ self.passes,
+ self.block_configs,
+ self.rewrite_list,
+ self.bws,
+ self.macs,
+ self.cycles,
+ self.sram_used,
+ )
+
+ def __str__(self):
+ return "<scheduler.Strategy: %s %s %s %s %s %s %s>" % (
+ self.strat,
+ self.passes,
+ self.rewrite_list,
+ self.bws,
+ self.macs,
+ self.cycles,
+ self.sram_used,
+ )
+
+ __repr__ = __str__
+
+
+class StrategySet:
+ __slots__ = "strats", "bws", "macs", "cycles", "max_sram_used", "total_sram_used"
+
+ def __init__(self, strats=None):
+ if strats is None:
+ strats = dict()
+ self.strats = strats # final pass in packed pass -> Strategy
+ self.bws, self.macs, self.cycles = make_metrics_arrays()
+ self.max_sram_used = 0
+ self.total_sram_used = 0
+
+ def update_statistics(self):
+ self.bws = make_bandwidth_array()
+ self.max_sram_used = 0
+ for ps, strat in self.strats.items():
+ self.bws += strat.bws
+ self.macs += strat.macs
+ self.cycles += strat.cycles
+ self.max_sram_used = max(self.max_sram_used, strat.sram_used)
+ self.total_sram_used += strat.sram_used
+
+ def clone_add_strategy(self, new_strat):
+ key = new_strat.key()
+ if key in self.strats:
+ assert new_strat == self.strats[key]
+ return self
+ else:
+ new_strats = dict(self.strats)
+ new_strats[key] = new_strat
+ new_set = StrategySet(new_strats)
+ new_set.bws = self.bws + new_strat.bws
+ new_set.macs = self.macs + new_strat.macs
+ new_set.cycles = self.cycles + new_strat.cycles
+ new_set.max_sram_used = max(self.max_sram_used, new_strat.sram_used)
+ new_set.total_sram_used = self.total_sram_used + new_strat.sram_used
+ return new_set
+
+ def __eq__(self, other):
+ if (self.bws != other.bws).any():
+ return False
+ if (self.macs != other.macs).any():
+ return False
+ if (self.cycles != other.cycles).any():
+ return False
+ if self.max_sram_used != other.max_sram_used:
+ return False
+ if self.total_sram_used != other.total_sram_used:
+ return False
+ if self.strats != other.strats:
+ return False
+ return True
+
+ def __str__(self):
+ return "<scheduler.StrategySet: max_sram_used=%s passes_covered=%s>" % (
+ self.max_sram_used,
+ list(ps.name for ps in self.strats),
+ )
+
+ __repr__ = __str__
+
+
+empty_strategy = Strategy(
+ SchedulingStrategy.Unknown, None, [], [], [], make_bandwidth_array(), make_macs_array(), make_cycles_array(), 0
+)
+INFINITY = 1e30
+
+ABORT_SEARCH = []
+
+
+def flatten_list_of_lists(lstlst):
+ lst = []
+ for v in lstlst:
+ lst.extend(v)
+ return lst
+
+
+class DynamicProgrammingScheduler:
+ def __init__(self, nng, sg, arch, sram_limit, options: SchedulerOptions):
+ self.nng = nng
+ self.sg = sg
+ self.arch = arch
+ self.sram_limit = sram_limit
+ self.options = copy.copy(options)
+ self.use_cascading = options.use_cascading
+
+ self.use_ifm_ofm_overlap = options.use_ifm_ofm_overlap
+ if self.arch.feature_map_storage_mem_area != MemArea.Sram:
+ self.use_ifm_ofm_overlap = False # force off IFM/OFM overlap if IFMs and OFMs are not in the SRAM
+
+ self.verbose_schedule = options.verbose_schedule
+ self.verbose_pareto_frontier_schedules = options.verbose_pareto_frontier_schedules
+ self.mem_area = MemArea.Sram
+
+ self.bandwidth_weights = arch.bandwidth_weights
+ self.cycles_weight = arch.cycles_weight
+ self.max_sram_used_weight = arch.max_sram_used_weight
+
+ self.n_combinations_searched = 0
+
+ self.feature_maps_not_in_fast_storage = (
+ arch.tensor_storage_mem_area[TensorPurpose.FeatureMap] != arch.fast_storage_mem_area
+ )
+
+ self.pareto_max_candidates = 16
+
+ self.ifm_stream_npu_blocks = set(
+ (NpuBlockType.ConvolutionMxN, NpuBlockType.ConvolutionDepthWise, NpuBlockType.Pooling,)
+ )
+
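+ # view_values/order_values define a structured dtype view ("d,d,d,d") and a field ordering
+ # so that the tuples returned by pareto_metric can be sorted lexicographically with np.argsort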
+ num_pareto_metrics = 4
+ view_values = ",".join(["d"] * num_pareto_metrics)
+ order_values = ["f%d" % (idx,) for idx in range(num_pareto_metrics)]
+
+ def pareto_metric(self, candidate):
+ strat, strat_set = candidate
+ total_cycles = strat.cycles[PassCycles.Total] + strat_set.cycles[PassCycles.Total]
+ bws = strat.bws + strat_set.bws
+ last_block_height = 0
+ if self.options.pareto_metric == ParetoMetric.BwCycMemBlkH and len(strat.block_configs) > 0:
+ last_block_height = strat.block_configs[-1][0]
+
+ return (
+ np.tensordot(bws, self.bandwidth_weights, axes=3) + total_cycles * self.cycles_weight,
+ strat_set.max_sram_used,
+ strat.sram_used,
+ last_block_height,
+ )
+
+ def filter_pareto_frontier(self, candidates, remove_equally_good_candidates):
+
+ candidates = [cand for cand in candidates if max(cand[0].sram_used, cand[1].max_sram_used) <= self.sram_limit]
+
+ if len(candidates) <= 1:
+ return candidates
+ assert remove_equally_good_candidates
+ start = time.time()
+ pareto_vals = np.zeros((len(candidates), DynamicProgrammingScheduler.num_pareto_metrics))
+ ids = np.arange(len(candidates), dtype=np.int32)
+ for idx, cand in enumerate(candidates):
+ pareto_vals[idx] = self.pareto_metric(cand)
+
+ sort_order = np.argsort(
+ pareto_vals.view(DynamicProgrammingScheduler.view_values),
+ order=DynamicProgrammingScheduler.order_values,
+ axis=0,
+ kind="stable",
+ ).flatten()
+ pareto_vals = pareto_vals[sort_order]
+ ids = ids[sort_order]
+
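+ # Greedily keep the lexicographically best remaining candidate, then retain only the
+ # candidates that beat it on at least one metric, i.e. those it does not dominate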
+ pareto_frontier = []
+ while len(ids) > 0:
+ pareto_frontier.append(candidates[ids[0]])
+ not_dominated_by_first = (pareto_vals < pareto_vals[0]).any(axis=1)
+ ids = ids[not_dominated_by_first]
+ pareto_vals = pareto_vals[not_dominated_by_first]
+
+ if len(pareto_frontier) > self.pareto_max_candidates:
+ pareto_frontier = self.sort_by_candidate_metric(pareto_frontier)
+ pareto_frontier = pareto_frontier[: self.pareto_max_candidates]
+
+ return pareto_frontier
+
+ def candidate_metric(self, candidate):
+ strat, strat_set = candidate
+ max_sram_used = max(strat_set.max_sram_used, strat.sram_used)
+ bws = strat.bws + strat_set.bws
+ total_cycles = strat.cycles[PassCycles.Total] + strat_set.cycles[PassCycles.Total]
+
+ return (
+ max_sram_used * self.max_sram_used_weight
+ + np.tensordot(bws, self.bandwidth_weights, axes=3)
+ + total_cycles * self.cycles_weight
+ )
+
+ def sort_by_candidate_metric(self, candidate_list):
+ sorted_list = list(sorted(candidate_list, key=self.candidate_metric))
+ return sorted_list
+
+ def best_candidate(self, candidate_list):
+ if len(candidate_list) == 0:
+ return ABORT_SEARCH
+ if len(candidate_list) == 1:
+ return candidate_list[0]
+ sorted_list = self.sort_by_candidate_metric(candidate_list)
+ return sorted_list[0]
+
+ def graduate_strat(self, strat_type, sram_used, old_strat_data):
+ res = []
+ for old_strat, old_strat_set in old_strat_data:
+ if old_strat.sram_used + sram_used > self.sram_limit:
+ continue # This strategy is bad, drop it
+ if old_strat_set.max_sram_used > self.sram_limit:
+ continue # This strategy is bad, drop it
+ assert old_strat.strat == SchedulingStrategy.Unknown
+
+ new_strat = old_strat.clone()
+ new_strat.strat = strat_type
+ new_strat.sram_used = old_strat.sram_used + sram_used
+
+ if self.use_ifm_ofm_overlap:
+ overlap = calc_allowed_ofm_ifm_overlap_for_pass_list(
+ new_strat.strat, new_strat.passes, new_strat.block_configs
+ )
+ new_strat.sram_used -= overlap
+
+ new_strat_set = old_strat_set.clone_add_strategy(new_strat)
+ res.append((empty_strategy, new_strat_set))
+ return self.filter_pareto_frontier(res, remove_equally_good_candidates=True)
+
+ def append_sram(self, sram_used, old_strat_data):
+ res = []
+ for old_strat, strat_set in old_strat_data:
+ assert old_strat.strat == SchedulingStrategy.Unknown
+ assert old_strat.sram_used == 0
+ new_strat = old_strat.clone()
+ new_strat.sram_used = old_strat.sram_used + sram_used
+
+ res.append((new_strat, strat_set))
+ return res
+
+ def append_sram_block_config_performance_metrics(self, sram_used, block_config, metrics, old_strat_data):
+ res = []
+ for old_strat, strat_set in old_strat_data:
+ assert old_strat.strat == SchedulingStrategy.Unknown
+ new_strat = old_strat.clone()
+ bws, macs, cycles = metrics[:3]
+
+ new_strat.sram_used = old_strat.sram_used + sram_used
+ new_strat.block_configs = old_strat.block_configs + [block_config]
+ new_strat.bws = old_strat.bws + bws
+ new_strat.macs = old_strat.macs + macs
+ new_strat.cycles = old_strat.cycles + cycles
+ new_strat.bws, new_strat.macs, new_strat.cycles = npu_performance.collate_stats_for_cascaded_pass(
+ self.arch, new_strat.bws, new_strat.macs, new_strat.cycles
+ )
+
+ res.append((new_strat, strat_set))
+ return res
+
+ def append_sram_pass_block_config_performance_metrics_rewrite_list(
+ self, sram_used, new_pass, block_config, metrics, rewrite_list, old_strat_data
+ ):
+ res = []
+ for old_strat, strat_set in old_strat_data:
+ assert old_strat.strat == SchedulingStrategy.Unknown
+ new_strat = old_strat.clone()
+ bws, macs, cycles = metrics[:3]
+ new_strat.sram_used = old_strat.sram_used + sram_used
+ new_strat.block_configs = old_strat.block_configs + [block_config]
+ new_strat.bws = old_strat.bws + bws
+ new_strat.macs = old_strat.macs + macs
+ new_strat.cycles = old_strat.cycles + cycles
+ new_strat.passes = old_strat.passes + [new_pass]
+ new_strat.bws, new_strat.macs, new_strat.cycles = npu_performance.collate_stats_for_cascaded_pass(
+ self.arch, new_strat.bws, new_strat.macs, new_strat.cycles
+ )
+ new_strat.rewrite_list = old_strat.rewrite_list + rewrite_list
+ res.append((new_strat, strat_set))
+ return res
+
+ def append_sram_rewrite_list(self, sram_used, rewrite_list, old_strat_data):
+ res = []
+ for old_strat, strat_set in old_strat_data:
+ assert old_strat.strat == SchedulingStrategy.Unknown
+ new_strat = old_strat.clone()
+ new_strat.sram_used = old_strat.sram_used + sram_used
+ new_strat.rewrite_list = old_strat.rewrite_list + rewrite_list
+ res.append((new_strat, strat_set))
+ return res
+
+ def pass_to_strat(self, strat_data):
+ res = {}
+ for strat in strat_data[1].strats.values():
+ for ps in strat.passes:
+ res[ps] = strat
+ return res
+
+ def compatible_strats(self, a, b):
+ intersection = a.keys() & b.keys()
+ for k in intersection:
+ if a[k] != b[k]:
+ return False
+ return True
+
+ def collate_strats_for_passes(self, all_passes):
+ if len(all_passes) == 0:
+ return [(empty_strategy, StrategySet(dict()))]
+ if len(all_passes) == 1:
+ return all_passes[0] # save some space in the common case
+ all_strands = [[self.pass_to_strat(strat_data) for strat_data in strand] for strand in all_passes]
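+ # Incrementally build combinations of one alternative per strand, keeping only those whose
+ # shared passes are assigned identical strategies (see compatible_strats)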
+ prev_combos = [dict()]
+ for j, strand in enumerate(all_strands):
+ new_combos = []
+ for i, alt in enumerate(strand):
+ for prev in prev_combos:
+ if self.compatible_strats(prev, alt):
+ cmb = dict(prev)
+ cmb.update(all_passes[j][i][1].strats)
+ new_combos.append(cmb)
+ prev_combos = new_combos
+
+ res = []
+ for d in prev_combos:
+ s = StrategySet(d)
+ s.update_statistics()
+ res.append((empty_strategy, s))
+ return res
+
+ def search_all_but_one_predecessor(self, ps, pred_pass, pred_pass_data):
+ # get the rest of the predecessors
+ other_predecessors = [pred for pred in ps.dag_predecessors if pred != pred_pass]
+ other_predecessor_data = self.search_pass_list(other_predecessors)
+
+ # The predecessor strategy data contains an incomplete strategy that we need to continue
+ # building on, whereas the other predecessors have completed strategies.
+ # Merge them, but keep the incomplete strategy as well.
+
+ res = []
+ for pred_pass_strat, pred_pass_strat_set in pred_pass_data:
+ all_strats = [
+ [(empty_strategy, pred_pass_strat_set)], # pred strat data but with a dummy empty strategy
+ other_predecessor_data, # this one is fine to use as-is
+ ]
+ collated_strat_data = self.collate_strats_for_passes(all_strats)
+ strat_data = [(pred_pass_strat, strat_set) for _, strat_set in collated_strat_data]
+ res.extend(strat_data)
+ return res
+
+ def calc_non_local_mem_usage(self):
+ ignore_subgraph_input_output_tensors = self.sg.placement == PassPlacement.Cpu
+ range_set = live_range.extract_live_ranges_from_passes(
+ self.sg,
+ self.mem_area,
+ mark_output_tensors_overlapping_with_input_tensors=True,
+ ignore_subgraph_input_output_tensors=ignore_subgraph_input_output_tensors,
+ )
+ range_dict = range_set.ranges
+
+ # Find the ranges that overlap passes but are not inputs/outputs of those passes.
+ # The dynamic programming search does not count these, so they must be added in manually.
+ end_pos = max(ps.time for ps in self.sg.passes) + 2
+ mem_usage = np.zeros(end_pos) + self.sg.base_sram_used
+ non_local_mem_usage = np.zeros(end_pos, dtype=np.int64)
+
+ for tens, rng in range_dict.items():
+ storage_size = tens.storage_size()
+ assert tens.mem_area == self.mem_area
+ mem_usage[rng.start_time : rng.end_time] += storage_size
+
+ for ps in self.sg.passes:
+ local_mem_usage = 0
+ for tens in ps.inputs + ps.outputs + ps.intermediates:
+ if tens.mem_area != self.mem_area:
+ continue
+
+ local_mem_usage += tens.storage_size()
+
+ non_local_mem_usage[ps.time] = mem_usage[ps.time] - local_mem_usage
+
+ self.non_local_mem_usage = non_local_mem_usage
+
+ def search(self):
+ self.calc_non_local_mem_usage()
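+ # The search runs backwards from the subgraph outputs: passes with no successors seed the
+ # dynamic programming and their predecessors are explored recursively via search_output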
+ starting_passes = [ps for ps in self.sg.passes if not ps.successors]
+ strat_data = self.search_pass_list(starting_passes)
+
+ _, best_set = self.best_candidate(strat_data)
+
+ if self.verbose_pareto_frontier_schedules:
+ print(
+ "Scheduler searched %d combinations and found %d candidate schedules along the Pareto frontier"
+ % (self.n_combinations_searched, len(strat_data))
+ )
+ for idx, (_, strat_set) in enumerate(strat_data):
+ extra = ""
+ if strat_set == best_set:
+ extra = "(Best candidate)"
+ print("Candidate", idx, extra)
+ memory_used = {MemArea.Sram: strat_set.max_sram_used}
+ stats_writer.print_performance_metrics_for_strat(
+ self.arch,
+ "",
+ strat_set.cycles,
+ strat_set.macs,
+ strat_set.bws,
+ self.nng.batch_size,
+ memory_used,
+ len(self.sg.passes),
+ len(strat_set.strats),
+ )
+
+ return best_set
+
+ def search_pass_list(self, pass_list):
+ all_strats = []
+ for ps in pass_list:
+ strat = self.search_output(ps)
+ all_strats.append(strat)
+ strat_data = self.collate_strats_for_passes(all_strats)
+ for strd in strat_data:
+ for ps in pass_list:
+ assert ps in strd[1].strats # should have strategies for everything we asked to search
+ return strat_data
+
+ def search_predecessors(self, ps):
+
+ # Use the DAG predecessors to protect against graphs with loops; collate_strats_for_passes
+ # will sort this out later so that we have strategies for all passes
+
+ pass_list = ps.dag_predecessors
+ strat_data = self.search_pass_list(pass_list)
+
+ return strat_data
+
+ @lru_cache(maxsize=None)
+ def search_output(self, ps):
+
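+ # The lru_cache decorator provides the dynamic programming memoisation: each pass is
+ # searched at most once and the cached candidate list is reused by every caller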
+ assert ps in self.sg.passes
+ candidate_list = []
+
+ candidate_list.extend(self.search_weight_streaming_output(ps))
+
+ if self.options.use_ifm_streaming:
+ candidate_list.extend(self.search_ifm_streaming_output(ps))
+
+ best = self.filter_pareto_frontier(candidate_list, remove_equally_good_candidates=True)
+
+ if not best:
+ print(
+ "Warning: Dynamic programming search failed for pass %s, invoking fallback strategy"
+ % (ps.name,)
+ )
+ return self.search_predecessors(ps)
+
+ return best
+
+ def search_ifm_streaming_output(self, ps):
+ if ps.placement != PassPlacement.Npu:
+ return ABORT_SEARCH
+ if ps.npu_block_type not in self.ifm_stream_npu_blocks:
+ return ABORT_SEARCH
+ strat_data = self.search_ifm_streaming_body(ps, False)
+
+ sram_used = self.non_local_mem_usage[ps.time]
+ for tens in ps.outputs:
+ if tens.mem_area == self.mem_area:
+ sram_used += tens.storage_size()
+
+ return self.graduate_strat(SchedulingStrategy.IfmStream, sram_used, strat_data)
+
+ @lru_cache(maxsize=None)
+ def search_ifm_streaming_body(self, ps, force_outputs_to_fast_storage):
+ if ps.placement != PassPlacement.Npu:
+ return ABORT_SEARCH
+ if ps.npu_block_type not in self.ifm_stream_npu_blocks:
+ return ABORT_SEARCH
+ ifm_input_search_results = self.search_ifm_streaming_input(ps)
+ res = []
+
+ base_sram_used = 0
+ for tens in ps.intermediates:
+ if tens.mem_area == self.mem_area:
+ base_sram_used += tens.storage_size()
+
+ all_block_configs = self.get_block_configs(ps)
+ for block_config in all_block_configs:
+ all_strats = []
+
+ if self.use_cascading:
+ all_strats.extend(self.search_ifm_streaming_partial(ps, block_config))
+
+ all_strats.extend(ifm_input_search_results)
+
+ rewrite_list = []
+ sram_used = base_sram_used
+
+ metrics = npu_performance.performance_metrics_for_pass(
+ self.arch,
+ ps,
+ block_config,
+ rewrite_list=rewrite_list,
+ force_outputs_to_fast_storage=force_outputs_to_fast_storage,
+ )
+
+ res.extend(
+ self.append_sram_pass_block_config_performance_metrics_rewrite_list(
+ sram_used, ps, block_config, metrics, rewrite_list, all_strats
+ )
+ )
+
+ self.n_combinations_searched += len(res)
+ res = self.filter_pareto_frontier(res, remove_equally_good_candidates=True)
+ return res
+
+ def search_ifm_streaming_partial(self, ps, block_config):
+ if ps.placement != PassPlacement.Npu:
+ return ABORT_SEARCH
+
+ if len(ps.inputs) < 1:
+ return ABORT_SEARCH
+
+ ifm_tensor = ps.ifm_tensor
+
+ if ifm_tensor is None:
+ return ABORT_SEARCH
+ if ifm_tensor.purpose != TensorPurpose.FeatureMap:
+ return ABORT_SEARCH
+ if not ifm_tensor.storage_shape or len(ifm_tensor.storage_shape) != 4:
+ return ABORT_SEARCH
+
+ pred_pass_list = []
+ for pred_candidate in ps.dag_predecessors:
+ if len(pred_candidate.outputs) == 1 and pred_candidate.outputs[0] == ifm_tensor:
+ # we found a predecessor that produces this IFM tensor
+ if len(pred_candidate.successors) == 1 and pred_candidate.successors[0] == ps:
+ # and it only has one successor, namely us
+ if pred_candidate.placement == PassPlacement.Npu:
+ if pred_candidate.npu_block_type in self.ifm_stream_npu_blocks:
+ # and it is on the Npu and fusable - it's a candidate
+ pred_pass_list.append(pred_candidate)
+
+ if not pred_pass_list:
+ return ABORT_SEARCH
+
+ all_candidates = []
+ for pred_pass in pred_pass_list:
+ # recurse into the next pass
+ ifm_strat_data = self.search_ifm_streaming_body(pred_pass, self.feature_maps_not_in_fast_storage)
+
+ strat_data = self.search_all_but_one_predecessor(ps, pred_pass, ifm_strat_data)
+ for strat_opt in strat_data:
+
+ pred_pass_block_config = strat_opt[0].block_configs[-1]
+ rolling_buffer_dims = npu_performance.rolling_buffer_dims_from_passes(
+ self.arch, pred_pass, pred_pass_block_config, ps, block_config
+ )
+ if rolling_buffer_dims is None:
+ continue # this does not pack properly, skip it.
+
+ sram_used = 0
+ for tens in ps.inputs:
+ if tens != ifm_tensor:
+ if tens.mem_area == self.mem_area:
+ sram_used += tens.storage_size()
+
+ rolling_buffer_y, rolling_buffer_x = rolling_buffer_dims
+
+ rewrite_list = [
+ (
+ SchedulerRewrite.ChangeTensorSubPurpose,
+ ifm_tensor,
+ TensorSubPurpose.RollingBufferY,
+ rolling_buffer_y,
+ None,
+ ps,
+ )
+ ]
+ sram_used += ifm_tensor.storage_size_for_sub_purpose(
+ TensorSubPurpose.RollingBufferY, rolling_buffer_y, None
+ )
+
+ all_candidates.extend(self.append_sram_rewrite_list(sram_used, rewrite_list, [strat_opt]))
+
+ self.n_combinations_searched += len(all_candidates)
+ return all_candidates
+
+ def get_block_configs(self, ps):
+ if ps.placement != PassPlacement.Npu:
+ return [(1, 1, 1, 1)] # default
+
+ block_configs = find_block_configs_suitable_for_pass_and_shared_buffer(self.arch, ps)
+
+ # Take a limited number of the largest blocks
+ if self.arch.block_config_limit > 0:
+ # Sort by block area, followed by depth
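+ # The key packs block area into the upper bits and depth into the low 8 bits, so that
+ # area dominates the ordering (assuming the depth component fits in 8 bits)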
+ block_configs.sort(key=lambda cfg: (cfg[0] * cfg[1]) << 8 | cfg[3], reverse=True)
+ bound = min(len(block_configs), self.arch.block_config_limit)
+ # We take 'n' from the fat end of the list, and 'n' from the thin end of the list.
+ tmp = block_configs[:bound]
+ tmp.extend(block_configs[max(bound, len(block_configs) - bound) :])
+ block_configs = tmp
+
+ return block_configs
+
+ def search_ifm_streaming_input(self, ps):
+ sram_used = 0
+ for tens in ps.inputs:
+ if tens.mem_area == self.mem_area:
+ sram_used += tens.storage_size()
+
+ return self.append_sram(sram_used, self.search_predecessors(ps))
+
+ def search_weight_streaming_output(self, ps):
+ strat_data = self.search_weight_streaming_body(ps)
+
+ sram_used = self.non_local_mem_usage[ps.time]
+ for tens in ps.outputs:
+ if tens.mem_area == self.mem_area:
+ sram_used += tens.storage_size()
+
+ return self.graduate_strat(SchedulingStrategy.WeightStream, sram_used, strat_data)
+
+ @lru_cache(maxsize=None)
+ def search_weight_streaming_body(self, ps):
+
+ strat_data = self.search_weight_streaming_input(ps)
+
+ res = []
+
+ all_block_configs = self.get_block_configs(ps)
+
+ for block_config in all_block_configs:
+
+ sram_used = 0
+ rewrite_list = []
+
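+ # Weights are double-buffered over the OFM depth slice (block_config[3]), presumably so
+ # that the next slice of weights can be fetched while the current one is being consumed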
+ for tens in ps.intermediates:
+ if tens.mem_area == self.mem_area:
+ if tens.purpose == TensorPurpose.Weights:
+ sram_used += tens.storage_size_for_sub_purpose(
+ TensorSubPurpose.DoubleBuffer, block_config[3]
+ )
+ rewrite_list.append(
+ (
+ SchedulerRewrite.ChangeTensorSubPurpose,
+ tens,
+ TensorSubPurpose.DoubleBuffer,
+ block_config[3],
+ None,
+ ps,
+ )
+ )
+ else:
+ sram_used += tens.storage_size()
+
+ metrics = npu_performance.performance_metrics_for_pass(
+ self.arch, ps, block_config, rewrite_list=rewrite_list
+ )
+
+ res.extend(
+ self.append_sram_pass_block_config_performance_metrics_rewrite_list(
+ sram_used, ps, block_config, metrics, rewrite_list, strat_data
+ )
+ )
+
+ self.n_combinations_searched += len(res)
+ res = self.filter_pareto_frontier(res, remove_equally_good_candidates=True)
+ return res
+
+ def search_weight_streaming_input(self, ps):
+ sram_used = 0
+ for tens in ps.inputs:
+ if tens.mem_area == self.mem_area:
+ sram_used += tens.storage_size()
+
+ return self.append_sram(sram_used, self.search_predecessors(ps))
+
+ def apply_result(self, strat_set, arch):
+ pass_to_cascaded_pass = dict()
+ for _, strat in strat_set.strats.items():
+ # First rewrite the tensors that need it, e.g. to create rolling buffers
+ inputs = []
+ intermediates = []
+ outputs = []
+
+ for ps in strat.passes:
+ inputs += ps.inputs
+ intermediates += ps.intermediates
+ outputs += ps.outputs
+
+ for tens in set(inputs) & set(outputs):
+ # tensors that are in both sets are intermediates
+
+ # find pass with input/output tensor, and check if they are both placed on NPU
+ input_placement = None
+ output_placement = None
+ for ps in strat.passes:
+ if tens in ps.inputs:
+ input_placement = ps.placement
+ if tens in ps.outputs:
+ output_placement = ps.placement
+ if input_placement == output_placement == PassPlacement.Npu:
+ tens.set_format(TensorFormat.NHCWB16, arch)
+
+ intermediates.append(tens)
+ inputs.remove(tens)
+ outputs.remove(tens)
+
+ for rewrite_op, tens, sub_purpose, param_a, param_b, ps in strat.rewrite_list:
+ if rewrite_op == SchedulerRewrite.ChangeTensorSubPurpose:
+ tens.mem_area = self.arch.fast_storage_mem_area
+ tens.set_new_sub_purpose(sub_purpose, param_a, param_b)
+ else:
+ assert 0, "unknown rewrite_op " + str(rewrite_op)
+
+ is_element_wise = True
+ for ps in strat.passes:
+ assert ps.placement == strat.passes[0].placement
+ if not ps.is_element_wise:
+ is_element_wise = False
+ break
+
+ cascaded_pass = CascadedPass(
+ strat.passes[0].name,
+ strat.strat,
+ inputs,
+ intermediates,
+ outputs,
+ strat.passes,
+ strat.passes[0].placement,
+ is_element_wise,
+ )
+ assert strat.sram_used >= 0
+ cascaded_pass.sram_used = strat.sram_used
+
+ for idx, ps in enumerate(strat.passes):
+ assert ps not in pass_to_cascaded_pass
+ pass_to_cascaded_pass[ps] = cascaded_pass
+ ps.cascade = cascaded_pass
+ ps.block_config = strat.block_configs[idx]
+
+ if ps.placement == PassPlacement.Npu:
+ ps.shared_buffer = shared_buffer_allocation_for_pass_and_block_config(
+ self.arch, ps, ps.block_config
+ )
+ assert ps.shared_buffer is not None
+
+ for op in ps.ops:
+ subgraph = op.attrs.get("subgraph")
+ if subgraph:
+ subgraph.base_sram_used = cascaded_pass.sram_used
+
+ # all passes should have a cascaded pass now
+ if len(pass_to_cascaded_pass) != len(self.sg.passes):
+ print(
+ "mismatch: we have %d passes, but only %d have cascaded passes associated"
+ % (len(self.sg.passes), len(pass_to_cascaded_pass))
+ )
+ for ps in self.sg.passes:
+ if ps not in pass_to_cascaded_pass:
+ print("%3d pass missing cascaded pass %s" % (ps.time, ps))
+
+ assert len(pass_to_cascaded_pass) == len(self.sg.passes)
+ # we have all the passes, but we need to put them in order and build predecessor/successor links.
+
+ visit_pass_set = set()
+ cascaded_passes = []
+
+ def visit_pass(ps):
+ if ps in visit_pass_set:
+ return
+ visit_pass_set.add(ps)
+
+ cps = ps.cascade
+ dont_traverse = set(cps.passes)
+
+ for ps in cps.passes:
+ for pred in ps.predecessors:
+ if pred in dont_traverse:
+ continue
+ visit_pass(pred)
+
+ cascaded_passes.append(cps)
+
+ starting_passes = [ps for ps in self.sg.passes if not ps.successors]
+ for ps in starting_passes:
+ visit_pass(ps)
+
+ # reorder so startup init cascaded passes come first
+ def is_startup_cascaded_pass(cps):
+ if not cps.passes:
+ return False
+ return cps.placement == PassPlacement.StartupInit
+
+ cascaded_passes = [cps for cps in cascaded_passes if is_startup_cascaded_pass(cps)] + [
+ cps for cps in cascaded_passes if not is_startup_cascaded_pass(cps)
+ ]
+
+ self.sg.cascaded_passes = cascaded_passes
+ self.sg.build_cascaded_pass_links()
+
+
+def schedule_passes(nng, arch, options: SchedulerOptions):
+
+ for sg in nng.subgraphs:
+ sg.base_sram_used = 0
+
+ for sg in nng.subgraphs:
+ # re-entering the same nodes from different contexts requires us to
+ # build a simplified directed acyclic (DAG) version of the graph to
+ # use for traversal, rather than using a visit dictionary. this avoids
+ # recursing infinitely due to loops.
+ sg.build_pass_dag_predecessors()
+
+ dps = DynamicProgrammingScheduler(nng, sg, arch, arch.sram_size, options)
+
+ strat_set = dps.search()
+
+ dps.apply_result(strat_set, arch)
+
+ if options.verbose_schedule:
+ sg.print_cascaded_passes()
diff --git a/ethosu/vela/shared_buffer_allocation.py b/ethosu/vela/shared_buffer_allocation.py
new file mode 100644
index 00000000..b5408d19
--- /dev/null
+++ b/ethosu/vela/shared_buffer_allocation.py
@@ -0,0 +1,199 @@
+# Copyright (C) 2020 Arm Limited or its affiliates. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the License); you may
+# not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an AS IS BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+# Description:
+# Shared buffer allocation works out how to allocate the Ethos-U55 shared buffer for a given pass.
+
+import numpy as np
+from .nn_graph import NpuBlockType
+from .numeric_util import round_up_divide, round_up
+from .architecture_features import Block, Kernel, SHRAMElements, SharedBufferArea, ArchitectureFeatures
+from . import pass_packing
+
+
+class SharedBufferAllocation:
+ def __init__(self, arch, ps):
+ self.arch = arch
+
+ self.bank_locations = np.zeros(SharedBufferArea.Size)
+ self.banks_required = np.zeros(SharedBufferArea.Size)
+
+ ifm_tensor, ifm2_tensor, weight_tensor, ofm_tensor = ps.get_primary_op_ifm_ifm2_weights_ofm()
+
+ strides = (1, 1, 1, 1)
+ dilation = (1, 1, 1, 1)
+ self.kernel = Kernel(1, 1)
+ is_elementwise = ps.npu_block_type == NpuBlockType.ElementWise
+
+ if ps.primary_op:
+ strides = ps.primary_op.attrs.get("strides", strides)
+ dilation = ps.primary_op.attrs.get("dilation", dilation)
+ k_h = 1
+ k_w = 1
+ if weight_tensor:
+ if ps.primary_op.type != "FullyConnectedAct":
+ k_h = weight_tensor.shape[0]
+ k_w = weight_tensor.shape[1]
+ else:
+ k_h = ps.primary_op.attrs.get("filter_height", 1)
+ k_w = ps.primary_op.attrs.get("filter_width", 1)
+
+ self.kernel = Kernel(k_w, k_h, strides[2], strides[1], dilation[2], dilation[1])
+
+ self.is_equal_depth_op = is_elementwise or ps.npu_block_type in (
+ NpuBlockType.ConvolutionDepthWise,
+ NpuBlockType.Pooling,
+ )
+ self.strides = strides
+
+ self.use_accumulator_element = SHRAMElements.Acc32
+ if is_elementwise:
+ self.use_ifm_element = SHRAMElements.IFM8_Elementwise
+ else:
+ self.use_ifm_element = SHRAMElements.IFM8
+
+ self.ifm_bits = 0
+ self.ifm_depth = 0
+ if ifm_tensor:
+ self.ifm_bits = ifm_tensor.dtype.size_in_bits()
+ if ifm_tensor.shape == [] and is_elementwise:
+ # Elementwise operator with scalar in ifm, use ifm2 depth
+ self.ifm_depth = ifm2_tensor.shape[-1]
+ else:
+ self.ifm_depth = ifm_tensor.shape[-1]
+ if self.ifm_bits == 16:
+ self.use_accumulator_element = SHRAMElements.Acc40
+ self.use_ifm_element = self.use_ifm_element + 1
+ assert (self.use_ifm_element == SHRAMElements.IFM16) or (
+ self.use_ifm_element == SHRAMElements.IFM16_Elementwise
+ )
+ else:
+ assert self.ifm_bits == 8, "Unexpected IFM bitdepth"
+
+ self.ifm_block_depth = arch.calc_ifm_block_depth(self.ifm_depth, self.ifm_bits)
+ self.ofm_tensor = ofm_tensor
+
+ self.banks_required[SharedBufferArea.Weights] = arch.shram_reserved_weight_banks
+ self.banks_required[SharedBufferArea.OFM] = arch.shram_reserved_output_banks
+
+ def is_valid(self):
+ # Assign zero-based bank starts (first element remains zero)
+ self.bank_locations[1:] = np.cumsum(self.banks_required)[:-1]
+
+ # Accumulator area is measured from the end of the buffer
+ self.bank_locations[SharedBufferArea.Accumulators] = (
+ self.arch.shram_total_banks - self.banks_required[SharedBufferArea.Accumulators]
+ )
+ ifm_end = self.bank_locations[SharedBufferArea.IFM] + self.banks_required[SharedBufferArea.IFM]
+ return ifm_end <= self.bank_locations[SharedBufferArea.Accumulators]
+
+ def try_block(self, ofm_block: Block):
+ # Get IFM block configuration
+ ifm_block_depth = ofm_block.depth if self.is_equal_depth_op else self.ifm_block_depth
+ ifm_block = self.arch.get_ifm_block_size(ifm_block_depth, ofm_block, self.kernel)
+ ifm_config = self.arch.get_block_config(ifm_block.width, ifm_block.height, ifm_block.depth)
+ if ifm_config is None:
+ return None
+
+ # Get OFM block configuration
+ ofm_config = self.arch.get_block_config(ofm_block.width, ofm_block.height, ofm_block.depth)
+ if ofm_config is None:
+ return None
+
+ # Update bank counts for IFM and Accumulator
+ self.banks_required[SharedBufferArea.IFM] = ifm_config.banks[self.use_ifm_element]
+ self.banks_required[SharedBufferArea.Accumulators] = ofm_config.banks[self.use_accumulator_element]
+
+ # Validating calculates bank layout and returns validity
+ if not self.is_valid():
+ return None
+
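+ # The block config is returned as (OFM height, OFM width, IFM block depth, OFM depth)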
+ return (ofm_block.height, ofm_block.width, ifm_block.depth, ofm_block.depth)
+
+ def generate_used_mask(self, active_set):
+ res = np.zeros(self.arch.shram_total_banks, dtype=np.int64)
+ for kind in active_set:
+ start = int(self.bank_locations[kind])
+ end = start + int(self.banks_required[kind])
+ res[start:end] = 1
+ return res
+
+ def is_compatible(first, second):
+ """See if the bank allocations of two convolutions are compatible,
+ so that they can run back-to-back without a fence in between"""
+
+ first_set = set((SharedBufferArea.OFM, SharedBufferArea.Accumulators))
+ second_set = set((SharedBufferArea.IFM, SharedBufferArea.Weights))
+
+ first_mask = first.generate_used_mask(first_set)
+ second_mask = second.generate_used_mask(second_set)
+
+ if np.sum(first_mask & second_mask):
+ # overlap
+ return False
+
+ return True
+
+
+def shared_buffer_allocation_for_pass_and_block_config(arch, ps, block_config):
+ alloc = SharedBufferAllocation(arch, ps)
+ assert (alloc.ifm_block_depth == block_config[2]) or alloc.is_equal_depth_op
+ if alloc.try_block(Block(block_config[1], block_config[0], block_config[3])):
+ return alloc
+
+ return None
+
+
+def find_block_configs_suitable_for_pass_and_shared_buffer(arch, ps):
+ alloc = SharedBufferAllocation(arch, ps)
+
+ if arch.override_block_config:
+ config = alloc.try_block(arch.override_block_config)
+ assert config, "Block config override cannot be used"
+ return [config]
+
+ # Constrain the search space if the OFM is smaller than the max block size
+ # - Add other block search constraints here if required
+ if len(alloc.ofm_tensor.shape) == 2:
+ max_block_height = max_block_width = alloc.ofm_tensor.shape[0]
+ else:
+ max_block_width = alloc.ofm_tensor.shape[-2]
+ max_block_height = alloc.ofm_tensor.shape[-3]
+
+ # Common block depth
+ max_block_depth = alloc.ofm_tensor.shape[-1]
+
+ # Constrain to valid ranges before search
+ max_block_width = min(arch.ofm_block_max.width, max_block_width)
+ max_block_height = min(arch.ofm_block_max.height, max_block_height)
+ max_block_depth = min(arch.ofm_block_max.depth, max_block_depth)
+
+ valid_block_configs = []
+ # Try a range of block shapes against this pass
+ for w in range(arch.ofm_ublock.width, max_block_width + arch.ofm_ublock.width, arch.ofm_ublock.width):
+ for h in range(arch.ofm_ublock.height, max_block_height + arch.ofm_ublock.height, arch.ofm_ublock.height):
+ # Try valid OFM block depths
+ for c in range(arch.ofm_ublock.depth, max_block_depth + arch.ofm_ublock.depth, arch.ofm_ublock.depth):
+ # OFM block depth has the constraint that if it causes the OFM to be
+ # split, it must be a multiple of the OFM split size
+ if (c >= max_block_depth) or (c < max_block_depth and (c % ArchitectureFeatures.OFMSplitDepth) == 0):
+ config = alloc.try_block(Block(w, h, c))
+ if config:
+ valid_block_configs.append(config)
+
+ assert len(valid_block_configs) > 0
+ return valid_block_configs
diff --git a/ethosu/vela/stats_writer.py b/ethosu/vela/stats_writer.py
new file mode 100644
index 00000000..c4b4cd9e
--- /dev/null
+++ b/ethosu/vela/stats_writer.py
@@ -0,0 +1,367 @@
+# Copyright (C) 2020 Arm Limited or its affiliates. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the License); you may
+# not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an AS IS BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+# Description:
+# Writes out per-pass and summary performance statistics to CSV files.
+
+import numpy as np
+from .nn_graph import MemArea, TensorPurpose, PassPlacement
+from .npu_performance import PassCycles, MacCount, BandwidthDirection
+import csv
+from .numeric_util import round_up_to_int
+import sys
+
+
+def write_summary_metrics_csv(nng, summary_filename, arch):
+ with open(summary_filename, "w") as f:
+ writer = csv.writer(f)
+
+ labels = [
+ "experiment",
+ "network",
+ ]
+
+ labels += (
+ ["accelerator_configuration", "system_config", "npu_clock", "sram_size"]
+ + [area.identifier_name() + "_bandwidth" for area in MemArea.all()]
+ + ["weights_storage_area", "feature_map_storage_area"]
+ )
+
+ labels += [
+ "inferences_per_second",
+ "batch_size",
+ "inference_time",
+ "passes_before_fusing",
+ "passes_after_fusing",
+ ]
+ labels += [area.identifier_name() + "_memory_used" for area in MemArea.all()]
+ labels += ["on_chip_flash_bits_per_element", "off_chip_flash_bits_per_element"]
+
+ for mem_area in MemArea.all():
+ labels += [
+ mem_area.identifier_name() + "_feature_map_read_bytes",
+ mem_area.identifier_name() + "_feature_map_write_bytes",
+ mem_area.identifier_name() + "_weight_read_bytes",
+ mem_area.identifier_name() + "_weight_write_bytes",
+ mem_area.identifier_name() + "_total_bytes",
+ ]
+
+ labels += ["nn_macs", "hardware_macs", "nn_tops", "hardware_tops"]
+
+ labels += ["cycles_" + kind.identifier_name() for kind in PassCycles.all()]
+
+ writer.writerow(labels)
+
+ data_items = [
+ "default",
+ nng.name,
+ ]
+
+ if arch:
+ data_items += (
+ [arch.accelerator_config, arch.system_config, arch.npu_clock, arch.sram_size / 1024]
+ + [arch.memory_bandwidths_per_second[mem_area] / 1000.0 / 1000 / 1000 for mem_area in MemArea.all()]
+ + [
+ arch.tensor_storage_mem_area[TensorPurpose.Weights].display_name(),
+ arch.tensor_storage_mem_area[TensorPurpose.FeatureMap].display_name(),
+ ]
+ )
+
+ midpoint_inference_time = nng.cycles[PassCycles.Total] / arch.npu_clock
+ midpoint_fps = 1 / midpoint_inference_time
+
+ n_passes = sum(len(sg.passes) for sg in nng.subgraphs)
+ n_cascaded_passes = sum(len(sg.cascaded_passes) for sg in nng.subgraphs)
+
+ data_items += [midpoint_fps, nng.batch_size, midpoint_inference_time, n_passes, n_cascaded_passes]
+ data_items += [nng.memory_used.get(mem_area, 0) / 1024.0 for mem_area in MemArea.all()]
+
+ data_items += [
+ nng.bits_per_element.get(MemArea.OnChipFlash, 0.0),
+ nng.bits_per_element.get(MemArea.OffChipFlash, 0.0),
+ ]
+
+ for mem_area in MemArea.all():
+ bws = nng.bandwidths[mem_area]
+ total_bw = np.sum(bws)
+ weight_bws = bws[TensorPurpose.Weights]
+ fm_bws = bws[TensorPurpose.FeatureMap]
+ data_items += [
+ fm_bws[BandwidthDirection.Read],
+ fm_bws[BandwidthDirection.Write],
+ weight_bws[BandwidthDirection.Read],
+ weight_bws[BandwidthDirection.Write],
+ total_bw,
+ ]
+
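+ # The Tops/s figures count each MAC as two operations (multiply and accumulate), hence
+ # the factor of 2 below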
+ data_items += [
+ nng.macs[MacCount.NeuralNetworkMacs],
+ nng.macs[MacCount.HardwareMacs],
+ nng.macs[MacCount.NeuralNetworkMacs] * 2 * midpoint_fps / 1e12,
+ nng.macs[MacCount.HardwareMacs] * 2 * midpoint_fps / 1e12,
+ ]
+
+ data_items += [nng.cycles[kind] for kind in PassCycles.all()]
+
+ writer.writerow(data_items)
+
+
+def write_pass_metrics_csv(nng, pass_filename):
+
+ with open(pass_filename, "w") as f:
+ writer = csv.writer(f)
+
+ purpose_list = (
+ ("total", (TensorPurpose.Weights, TensorPurpose.FeatureMap)),
+ ("weights", (TensorPurpose.Weights,)),
+ ("feature_map", (TensorPurpose.FeatureMap,)),
+ )
+
+ direction_list = (
+ ("total", (BandwidthDirection.Read, BandwidthDirection.Write)),
+ ("read", (BandwidthDirection.Read,)),
+ ("write", (BandwidthDirection.Write,)),
+ )
+ bandwidth_names = []
+ bandwidth_indices = []
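+ # Build one CSV column per (memory area, purpose, direction) combination; the index tuples
+ # are used later to sum the matching bandwidth entries for each pass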
+ for mem_area in MemArea.all():
+ for purpose, purpose_candidates in purpose_list:
+ for direction, direction_candidates in direction_list:
+ label = "bytes_%s_%s_%s" % (mem_area.identifier_name(), purpose, direction)
+ bandwidth_names.append(label)
+ bandwidth_indices.append((mem_area, purpose_candidates, direction_candidates))
+
+ all_macs = MacCount.all()
+ all_cycles = (
+ PassCycles.Total,
+ PassCycles.Dpu,
+ PassCycles.ElementWise,
+ PassCycles.Cpu,
+ PassCycles.SramAccess,
+ PassCycles.DramAccess,
+ PassCycles.OnChipFlashAccess,
+ PassCycles.OffChipFlashAccess,
+ )
+ writer.writerow(
+ [
+ "name",
+ "operators",
+ "placement",
+ "streaming_strategy",
+ "block_config_height",
+ "block_config_width",
+ "block_config_input_channels",
+ "block_config_output_channels",
+ "n_blocks_in_pass",
+ ]
+ + ["cycles_" + v.identifier_name() for v in all_cycles]
+ + [v.identifier_name() for v in all_macs]
+ + bandwidth_names
+ + ["sram_used"]
+ )
+
+ def write_subgraph(sg):
+ for cps in sg.cascaded_passes:
+ if cps.placement == PassPlacement.StartupInit:
+ continue # skip the dummy init pass
+
+ for ps in cps.passes:
+ if len(ps.ops) == 1 and ps.ops[0].type == "NpuOp":
+ # just treat this as a call, unroll it
+ write_subgraph(ps.ops[0].attrs["subgraph"])
+ continue
+ stats = [ps.name, " ".join(op.type for op in ps.ops)]
+ stats += [ps.placement.name]
+ stats += [cps.strategy.name]
+ stats += list(ps.block_config)
+ stats += [ps.n_blocks]
+ stats += [round_up_to_int(ps.cycles[v]) for v in all_cycles]
+ stats += [round_up_to_int(ps.macs[v]) for v in all_macs]
+ for indices in bandwidth_indices:
+ res = 0
+ i = indices[0]
+ for j in indices[1]:
+ for k in indices[2]:
+ res += round_up_to_int(ps.bandwidths[i, j, k])
+ stats.append(res)
+ stats += [ps.sram_used]
+
+ writer.writerow(stats)
+
+ write_subgraph(nng.get_root_subgraph())
+
+
+def print_performance_metrics_for_strat(
+ arch,
+ name,
+ cycles,
+ macs,
+ bandwidths,
+ batch_size,
+ memory_used,
+ num_passes,
+ num_cascaded_passes,
+ n_operations=0,
+ cpu_operations=[],
+ bits_per_element=None,
+ show_cpu_operations=False,
+ f=sys.stdout,
+):
+
+ orig_mem_areas_labels = [(v, v.display_name()) for v in MemArea.all()]
+
+ midpoint_inference_time = cycles[PassCycles.Total] / arch.npu_clock
+ midpoint_fps = 1 / midpoint_inference_time
+
+ mem_area_labels = [
+ (mem_area, label) for mem_area, label in orig_mem_areas_labels if np.sum(bandwidths[mem_area]) > 0
+ ]
+
+ if name:
+ print("", file=f)
+ print("Network summary for", name, file=f)
+ print("Accelerator configuration %20s" % (arch.accelerator_config,), file=f)
+ print("System configuration %20s" % (arch.system_config,), file=f)
+ print("Accelerator clock %12d MHz" % (arch.npu_clock / 1e6,), file=f)
+ for mem_area, label in mem_area_labels:
+ print(
+ "Design peak %-25s %12.2f GB/s"
+ % (label + " bandwidth", arch.memory_bandwidths_per_second[mem_area] / 1000.0 / 1000 / 1000,),
+ file=f,
+ )
+
+ print(file=f)
+ for mem_area, label in mem_area_labels:
+ if mem_area not in memory_used:
+ continue
+
+ aug_label = label + " used"
+
+ extra = ""
+ if (mem_area == MemArea.OnChipFlash or mem_area == MemArea.OffChipFlash) and bits_per_element is not None:
+ extra = " (%.2f bits per element)" % (bits_per_element[mem_area],)
+
+ print("Total %-25s %12.2f KiB%s" % (aug_label, memory_used[mem_area] / 1024.0, extra), file=f)
+
+ print(file=f)
+ print("%d passes fused into %d" % (num_passes, num_cascaded_passes), file=f)
+
+ n_cpu_operations = len(cpu_operations)
+ if n_operations > 0:
+ print(
+ "%d/%d (%4.1f %%) operations falling back to the CPU"
+ % (n_cpu_operations, n_operations, n_cpu_operations / n_operations * 100),
+ file=f,
+ )
+
+ if show_cpu_operations:
+ for op in cpu_operations:
+
+ def format_tens_list(lst):
+ return " ".join(str(list(tens.shape)) for tens in lst)
+
+ print(
+ "CPU operation: %s, inputs %s, outputs %s"
+ % (op.type, format_tens_list(op.inputs), format_tens_list(op.outputs)),
+ file=f,
+ )
+
+ print("", file=f)
+
+ for mem_area, label in mem_area_labels:
+ bws = bandwidths[mem_area]
+ total_bw = np.sum(bws)
+ weight_bws = bws[TensorPurpose.Weights]
+ fm_bws = bws[TensorPurpose.FeatureMap]
+ aug_label = label + " bandwidth"
+ print(
+ "Average %-25s %12.2f GB/s" % (aug_label, total_bw * midpoint_fps / 1000.0 / 1000.0 / 1000.0,),
+ file=f,
+ )
+ print(
+ "Input %-25s %12.2f MB/batch"
+ % (aug_label, np.sum(fm_bws[BandwidthDirection.Read]) / 1000.0 / 1000.0,),
+ file=f,
+ )
+ print("Weight %-25s %12.2f MB/batch" % (aug_label, np.sum(weight_bws) / 1000.0 / 1000.0,), file=f)
+ print(
+ "Output %-25s %12.2f MB/batch"
+ % (aug_label, np.sum(fm_bws[BandwidthDirection.Write]) / 1000.0 / 1000.0,),
+ file=f,
+ )
+ print("Total %-25s %12.2f MB/batch" % (aug_label, total_bw / 1000.0 / 1000.0,), file=f)
+ print(
+ "Total %-25s per input %9.2f MB/inference (batch size %d)"
+ % (aug_label, total_bw / 1000.0 / 1000.0 / batch_size, batch_size),
+ file=f,
+ )
+ print(file=f)
+
+ print("Neural network macs %12d MACs/batch" % (macs[MacCount.NeuralNetworkMacs],), file=f)
+ print("Hardware macs %12d MACs/batch" % (macs[MacCount.HardwareMacs],), file=f)
+ print(
+ "Network Tops/s %12.2f Tops/s"
+ % (macs[MacCount.NeuralNetworkMacs] * 2 * midpoint_fps / 1e12),
+ file=f,
+ )
+ print(
+ "Hardware Tops/s %12.2f Tops/s"
+ % (macs[MacCount.HardwareMacs] * 2 * midpoint_fps / 1e12),
+ file=f,
+ )
+ print(file=f)
+
+ for kind in PassCycles.all():
+ aug_label = kind.display_name() + " cycles"
+ cyc = cycles[kind]
+ print("%-30s %12d cycles/batch" % (aug_label, cyc,), file=f)
+ print(file=f)
+
+ print(
+ "Batch Inference time %7.2f ms, %7.2f inferences/s (batch size %d)"
+ % (midpoint_inference_time * 1000, midpoint_fps, batch_size),
+ file=f,
+ )
+ print(file=f)
+
+
+def print_performance_metrics(nng, arch, show_cpu_operations=False, f=sys.stdout):
+ n_passes = sum(len(sg.passes) for sg in nng.subgraphs)
+ n_cascaded_passes = sum(len(sg.cascaded_passes) for sg in nng.subgraphs)
+ n_operations = sum(len(ps.ops) for sg in nng.subgraphs for ps in sg.passes)
+ cpu_operations = sum((ps.ops for sg in nng.subgraphs for ps in sg.passes if ps.placement == PassPlacement.Cpu), [])
+ return print_performance_metrics_for_strat(
+ arch,
+ nng.name,
+ nng.cycles,
+ nng.macs,
+ nng.bandwidths,
+ nng.batch_size,
+ nng.memory_used,
+ n_passes,
+ n_cascaded_passes,
+ n_operations,
+ cpu_operations,
+ nng.bits_per_element,
+ show_cpu_operations,
+ f,
+ )
+
+
+def write_human_friendly_metrics(nng, arch, filename):
+ with open(filename, "w") as f:
+ print_performance_metrics(nng, arch, f=f)
diff --git a/ethosu/vela/supported_operators.py b/ethosu/vela/supported_operators.py
new file mode 100644
index 00000000..23135f8a
--- /dev/null
+++ b/ethosu/vela/supported_operators.py
@@ -0,0 +1,243 @@
+# Copyright (C) 2020 Arm Limited or its affiliates. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the License); you may
+# not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an AS IS BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+# Description:
+# The SupportedOperators class which is a collection of all supported operators and parameter checks.
+
+from .data_type import BaseType
+
+
+class SupportedOperators:
+ def __init__(self):
+ # Categorised lists of supported operators
+ self.npu_pre_ops = set(("QuantizedResizeBilinear", "SplitSliceRead"))
+ self.convolution_ops = set(("Conv2DBiasAct", "Conv2D", "QuantizedConv2D", "Conv2DBackpropInputSwitched"))
+ self.depthwise_convolution_ops = set(
+ ("DepthwiseConv2dBiasAct", "DepthwiseConv2dNative", "QuantizedDepthwiseConv2D")
+ )
+ self.max_pooling_ops = set(("QuantizedMaxPool", "MaxPool", "MaxPoolAct"))
+ self.avg_pooling_ops = set(("QuantizedAvgPool", "AvgPool", "AvgPoolAct"))
+ self.pooling_ops = self.max_pooling_ops | self.avg_pooling_ops
+ self.fc_vector_products = set(("QuantizedMatMul", "MatMul", "FullyConnectedAct"))
+ self.mac_main_ops = (
+ # convolutions
+ self.convolution_ops
+ # depth-wise convolutions
+ | self.depthwise_convolution_ops
+ # pooling
+ | self.pooling_ops
+ # FC layers
+ | self.fc_vector_products
+ # RNN/LSTM/GRU
+ | set(("BlockLSTM",))
+ )
+ self.elem_wise_main_ops = set(
+ (
+ # element-wise
+ "AddAct",
+ "MulAct",
+ "SubAct",
+ "QuantizedAdd",
+ "QuantizedSub",
+ "QuantizedMul",
+ "Mul",
+ "Add",
+ "Sub",
+ "Minimum",
+ "Maximum",
+ )
+ )
+ self.activation_ops = set(
+ ("QuantizedRelu", "QuantizedRelu1", "QuantizedRelu6", "Relu", "Relu6", "ReluN1To1", "Sigmoid", "Tanh")
+ )
+ self.npu_post_ops = (
+ # activation functions
+ self.activation_ops
+ # concatenation write direction
+ | set(("ConcatSliceWrite",))
+ # bias add and batch norm
+ | set(("QuantizedBiasAdd", "Requantize", "QuantizedBatchNorm", "BiasAdd", "FusedBatchNorm"))
+ )
+ self.split_ops = set(("Split", "StridedSlice", "Slice", "UnpackReshaped", "Unpack"))
+ self.concat_ops = set(("Concat", "ConcatV2", "QuantizedConcat", "ConcatTFLite", "PackReshaped", "Pack"))
+ self.memory_only_ops = (
+ set(("Squeeze", "Reshape", "QuantizedReshape", "ExpandDims")) | self.concat_ops | self.split_ops
+ )
+ self.supported_fused_activations = set(("Relu", "Relu6", "ReluN1To1", "Tanh", "Sigmoid"))
+ self.supported_operators = (
+ self.npu_pre_ops | self.mac_main_ops | self.elem_wise_main_ops | self.npu_post_ops | self.memory_only_ops
+ )
+ # Setup supported operator restriction checkers
+ self.supported_operator_restrictions = {}
+ self.supported_operator_restrictions.update(
+ {op: self.check_convolution_restrictions for op in self.convolution_ops}
+ )
+ self.supported_operator_restrictions.update(
+ {op: self.check_depthwise_convolution_restrictions for op in self.depthwise_convolution_ops}
+ )
+ self.supported_operator_restrictions.update({op: self.check_pooling_restrictions for op in self.pooling_ops})
+ self.supported_operator_restrictions.update(
+ {op: self.check_vector_product_restrictions for op in self.fc_vector_products}
+ )
+ self.supported_operator_restrictions.update(
+ {op: self.check_element_wise_restrictions for op in self.elem_wise_main_ops}
+ )
+ self.supported_operator_restrictions.update(
+ {op: self.check_memory_only_restrictions for op in self.memory_only_ops}
+ )
+
+ def is_operator_supported(self, op):
+ if op.type not in self.supported_operators:
+ return False
+ if not self.check_generic_restrictions(op):
+ return False
+ if op.type in self.supported_operator_restrictions:
+ return self.supported_operator_restrictions[op.type](op)
+ return True
+
+ def check_generic_restrictions(self, op):
+ # check fully defined shapes
+ for t in op.inputs + op.outputs:
+ if not t.has_fully_defined_shape():
+ print("Warning:", op, "has inputs/outputs of undefined shape, placing on CPU")
+ return False
+
+ # check data type
+ tensors = [t for t in op.get_ifm_ifm2_weights_ofm() if t is not None]
+ if not tensors:
+ tensors = op.inputs
+ for t in tensors:
+ if not (t.dtype.type & BaseType.Int):
+ return False
+ if t.element_size() > 2 and op.type != "Requantize":
+ return False
+ # check size
+ if any(dim > 65536 for dim in t.shape):
+ return False
+
+ # check fused activations
+ if (
+ "fused_activation_function" in op.attrs
+ and op.attrs["fused_activation_function"] is not None
+ and op.attrs["fused_activation_function"] not in self.supported_fused_activations
+ ):
+ return False
+ return True
+
+ def check_convolution_restrictions(self, op):
+ # check stride
+ if op.attrs["stride_w"] > 2 or op.attrs["stride_h"] > 2:
+ return False
+
+ # check dilation
+ dilation_w_factor = op.attrs.get("dilation_w_factor", 1)
+ dilation_h_factor = op.attrs.get("dilation_h_factor", 1)
+ if dilation_w_factor > 2 or dilation_h_factor > 2:
+ return False
+
+ # check data type
+ ifm_tensor, _, weight_tensor, _ = op.get_ifm_ifm2_weights_ofm()
+ if weight_tensor.element_size() > 1:
+ return False
+
+ # check kernel size
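+ # Effective kernel extent with dilation: k + (k - 1) * (dilation - 1)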
+ dilated_weight_w = weight_tensor.shape[0] + (weight_tensor.shape[0] - 1) * (dilation_w_factor - 1)
+ dilated_weight_h = weight_tensor.shape[1] + (weight_tensor.shape[1] - 1) * (dilation_h_factor - 1)
+ if (
+ dilated_weight_w > 64
+ or dilated_weight_h > 64
+ or dilated_weight_w * dilated_weight_h * weight_tensor.shape[2] > 127 * 65536
+ ):
+ return False
+
+ # check batch size
+ if ifm_tensor.shape[0] != 1:
+ return False
+ return True
+
+ def check_depthwise_convolution_restrictions(self, op):
+ # check depth
+ ifm_tensor, _, _, ofm_tensor = op.get_ifm_ifm2_weights_ofm()
+ if op.attrs["depth_multiplier"] > 1 and not (
+ (ifm_tensor.shape[3] == 1) and (ofm_tensor.shape[3] == op.attrs["depth_multiplier"])
+ ):
+ return False
+ return self.check_convolution_restrictions(op)
+
+ def check_pooling_restrictions(self, op):
+ # check stride
+ if op.attrs["stride_w"] > 2 or op.attrs["stride_h"] > 2:
+ return False
+
+ # check data type
+ ifm_tensor, _, _, ofm_tensor = op.get_ifm_ifm2_weights_ofm()
+ if ifm_tensor.dtype != ofm_tensor.dtype:
+ return False
+
+ # check batch size
+ if ifm_tensor.shape[0] != 1:
+ return False
+
+ if op.type in self.avg_pooling_ops:
+ # check kernel size
+ if op.attrs["padding"] == b"SAME" and (op.attrs["filter_width"] > 8 or op.attrs["filter_height"] > 8):
+ return False
+ if op.attrs["padding"] == b"VALID" and (op.attrs["filter_width"] > 256 or op.attrs["filter_height"] > 256):
+ return False
+
+ if op.type in self.max_pooling_ops:
+ # check data type
+ if ifm_tensor.dtype != ofm_tensor.dtype:
+ return False
+ # check kernel size
+ if op.attrs["filter_width"] > 256 or op.attrs["filter_height"] > 256: # any padding
+ return False
+ return True
+
+ def check_vector_product_restrictions(self, op):
+ # check data type
+ ifm_tensor, _, weight_tensor, _ = op.get_ifm_ifm2_weights_ofm()
+ if weight_tensor.element_size() > 1:
+ return False
+
+ return True
+
+ def check_element_wise_restrictions(self, op):
+ # check data type
+ ifm_tensor, ifm2_tensor, _, ofm_tensor = op.get_ifm_ifm2_weights_ofm()
+ if op.type in ("Minimum", "Maximum") and ifm_tensor.dtype != ofm_tensor.dtype:
+ return False
+
+ # check batch size
+ if (len(ifm_tensor.shape) > 2 and ifm_tensor.shape[0] != 1) or (
+ len(ifm2_tensor.shape) > 2 and ifm2_tensor.shape[0] != 1
+ ):
+ return False
+
+ # check scalar size
+ if (hasattr(ifm_tensor.values, "__len__") and len(ifm_tensor.values) > 1) or (
+ hasattr(ifm2_tensor.values, "__len__") and len(ifm2_tensor.values) > 1
+ ):
+ return False
+ return True
+
+ def check_memory_only_restrictions(self, op):
+ # check stride size
+ if op.type == "StridedSlice":
+ if len(op.inputs) > 3 and any(stride != 1 for stride in op.inputs[3].values):
+ return False
+ return True
diff --git a/ethosu/vela/tensor.py b/ethosu/vela/tensor.py
new file mode 100644
index 00000000..46040a46
--- /dev/null
+++ b/ethosu/vela/tensor.py
@@ -0,0 +1,629 @@
+# Copyright (C) 2020 Arm Limited or its affiliates. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the License); you may
+# not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an AS IS BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+# Description:
+# Internal representation of a Neural Network Tensor.
+
+import enum
+from . import numeric_util
+import numpy as np
+from . import data_type
+import uuid
+from .range_set import MemoryRangeSet
+from .numeric_util import round_up_divide
+
+
+class MemArea(enum.IntFlag):
+ Unknown = 0
+ Sram = 1
+ Dram = 2
+ OnChipFlash = 3
+ OffChipFlash = 4
+ Size = OffChipFlash + 1
+
+ def display_name(self):
+ return ("Unknown", "SRAM", "DRAM", "On-chip Flash", "Off-chip Flash", "Size")[self.value]
+
+ def identifier_name(self):
+ return ("unknown", "sram", "dram", "on_chip_flash", "off_chip_flash", "size")[self.value]
+
+ def all():
+ return (MemArea.Sram, MemArea.Dram, MemArea.OnChipFlash, MemArea.OffChipFlash)
+
+ def __str__(self):
+ return self.name
+
+
+class TensorPurpose(enum.IntFlag):
+ Unknown = 0
+ Weights = 1
+ FeatureMap = 2
+ Scratch = 3
+ Size = 4
+
+ def display_name(self):
+ return ("Unknown", "Weights", "FeatureMap", "Scratch", "Size")[self.value]
+
+ def identifier_name(self):
+ return ("unknown", "weights", "feature_map", "scratch", "size")[self.value]
+
+ def all():
+ return (TensorPurpose.Weights, TensorPurpose.FeatureMap)
+
+
+class TensorSubPurpose(enum.Enum):
+ Standard = 0
+ DoubleBuffer = 1
+ RollingBufferX = 2
+ RollingBufferY = 3
+ RollingBufferXY = 4
+
+ def display_name(self):
+ return ("Standard", "Double Buffer", "Rolling Buffer X", "Rolling Buffer Y", "Rolling Buffer XY")[self.value]
+
+ def identifier_name(self):
+ return ("standard", "double_buffer", "rolling_buffer_x", "rolling_buffer_y", "rolling_buffer_xy")[self.value]
+
+ def all():
+ return (
+ TensorSubPurpose.Standard,
+ TensorSubPurpose.DoubleBuffer,
+ TensorSubPurpose.RollingBufferX,
+ TensorSubPurpose.RollingBufferY,
+ TensorSubPurpose.RollingBufferXY,
+ )
+
+
+class TensorFormat(enum.Flag):
+ Unknown = 0
+ WeightsCompressed = 1
+ NHWC = 2
+ NHCWB16 = 3
+
+ def __str__(self):
+ return self.name
+
+
+class TensorBlockTraversal(enum.Enum):
+ Default = 0
+ DepthWise = 1
+ DepthFirst = 2
+ PartKernelFirst = 3
+
+
+def shape_num_elements(shp):
+ elems = 1
+ if shp is None:
+ return None
+ for d in shp:
+ if d is None:
+ return None
+ elems *= d
+ return elems
+
+
+def shape_fully_defined(shp):
+ if shp is None:
+ return False
+ for d in shp:
+ if d is None:
+ return False
+ return True
+
+
+def shape_round_to_quantum(shp, quantum):
+ new_shp = list(shp)
+
+ # Traverse backwards using length of shape since there may be more rounding quantums than shape elements
+ for i in range(-1, -len(shp) - 1, -1):
+ if new_shp[i] is not None:
+ new_shp[i] = numeric_util.round_up(new_shp[i], quantum[i])
+ return new_shp
+
+
+class QuantizationParameters:
+ __slots__ = "min", "max", "num_bits", "narrow_range", "scale_f32", "zero_point", "quant_min", "quant_max"
+
+ def __init__(self, min=None, max=None, num_bits=None, narrow_range=None):
+ self.min = min
+ self.max = max
+
+ self.num_bits = num_bits
+ self.narrow_range = narrow_range
+
+ self.scale_f32 = None
+ self.zero_point = None
+ self.quant_min = None
+ self.quant_max = None
+
+ def __str__(self):
+ return "<nng.QuantizationParameters min=%s max=%s, num_bits=%s, scale=%s, zero_point=%s>" % (
+ self.min,
+ self.max,
+ self.num_bits,
+ self.scale_f32,
+ self.zero_point,
+ )
+
+ __repr__ = __str__
+
+ def clone(self):
+ res = QuantizationParameters()
+ res.min = self.min
+ res.max = self.max
+
+ res.num_bits = self.num_bits
+ res.narrow_range = self.narrow_range
+
+ res.scale_f32 = self.scale_f32
+ res.zero_point = self.zero_point
+ res.quant_min = self.quant_min
+ res.quant_max = self.quant_max
+ return res
+
+ def dequantize(self, values):
+ if self.zero_point.size == 1 and self.scale_f32.size == 1:
+ # same scale is used for all values
+ res = (values.astype(np.float64) - self.zero_point) * self.scale_f32
+ else:
+ # a different scale is used for different sets of values
+ values_as_float = values.astype(np.float64)
+
+ # this is not compatible with the format of depthwise weights,
+ # where input is at index 3 (Output, Kh, Kw, Input)
+ # return the quantized values
+ return np.ndarray((values_as_float.shape))
+
+ shape = values_as_float.shape[0]
+ assert self.zero_point.size == self.scale_f32.size == shape
+ res = np.ndarray(values_as_float.shape)
+ for i in range(shape):
+ res[i] = (values_as_float[i] - self.zero_point[i]) * self.scale_f32[i]
+
+ return res
+
+
+class Tensor:
+ __slots__ = (
+ "shape",
+ "storage_shape",
+ "bandwidth_shape",
+ "dtype",
+ "name",
+ "ops",
+ "consumer_list",
+ "values",
+ "quant_values",
+ "compressed_values",
+ "mem_area",
+ "format",
+ "purpose",
+ "sub_purpose",
+ "alignment",
+ "weight_transpose_depthwise",
+ "storage_compression_scale",
+ "bandwidth_compression_scale",
+ "compression_scale_for_worst_weight_stream",
+ "weight_compression_scales",
+ "weight_compression_config",
+ "storage_rounding_quantum",
+ "brick_size",
+ "address",
+ "quantization",
+ "weight_compressed_offsets",
+ "element_size_bytes",
+ "reshaped",
+ "block_traversal",
+ "offset",
+ "cpu_tensor",
+ "npu_tensor",
+ "equivalence_id",
+ )
+ AllocationQuantum = 16
+
+ def __init__(self, shape, dtype, name):
+ self.shape = shape
+ self.storage_shape = shape
+ self.bandwidth_shape = shape
+ self.dtype = dtype
+ self.name = name
+ self.equivalence_id = uuid.uuid4()
+
+ self.ops = []
+ self.consumer_list = []
+ # Below attributes are only set if a tensor has been cloned,
+ # either from Cpu -> Npu or vice versa. Needed for offline allocation
+ self.cpu_tensor = None # reference to the corresponding Cpu tensor
+ self.npu_tensor = None # reference to the corresponding Npu tensor
+
+ self.values = None
+ self.quant_values = None
+ self.compressed_values = None
+ self.mem_area = MemArea.Unknown
+ self.format = TensorFormat.Unknown
+ self.purpose = TensorPurpose.Unknown
+ self.sub_purpose = TensorSubPurpose.Standard
+ self.alignment = Tensor.AllocationQuantum
+ self.weight_transpose_depthwise = False
+
+ self.storage_compression_scale = 1.0
+ self.bandwidth_compression_scale = 1.0
+ self.compression_scale_for_worst_weight_stream = 1.0
+ self.weight_compression_scales = None
+ self.weight_compression_config = None
+ self.weight_compressed_offsets = []
+ self.storage_rounding_quantum = (1, 1, 1, 1)
+ self.brick_size = (1, 1, 1, 1)
+ self.address = 0 # start address of the tensor; filled in by the tensor allocator
+ self.element_size_bytes = 0
+
+ # quantization parameters
+ self.quantization = None
+
+ self.reshaped = False
+ self.block_traversal = TensorBlockTraversal.Default
+
+ def element_size(self):
+ if self.element_size_bytes == 0:
+ return self.dtype.size_in_bits() / 8
+ return self.element_size_bytes
+
+ def clone(self, suffix="_clone"):
+ res = Tensor(self.shape, self.dtype, self.name + suffix)
+ res.storage_shape = list(self.storage_shape)
+ res.bandwidth_shape = list(self.bandwidth_shape)
+
+ res.ops = []
+ res.consumer_list = []
+ res.equivalence_id = self.equivalence_id
+
+ res.values = self.values
+ res.quant_values = self.quant_values
+ res.compressed_values = self.compressed_values
+ res.mem_area = self.mem_area
+ res.format = self.format
+ res.purpose = self.purpose
+ res.sub_purpose = self.sub_purpose
+ res.alignment = self.alignment
+ res.weight_transpose_depthwise = self.weight_transpose_depthwise
+
+ res.storage_compression_scale = self.storage_compression_scale
+ res.bandwidth_compression_scale = self.bandwidth_compression_scale
+ res.compression_scale_for_worst_weight_stream = self.compression_scale_for_worst_weight_stream
+ res.weight_compression_scales = self.weight_compression_scales
+ res.storage_rounding_quantum = self.storage_rounding_quantum
+ res.brick_size = self.brick_size
+ res.address = 0
+
+ if self.quantization is not None:
+ res.quantization = self.quantization.clone()
+ else:
+ res.quantization = None
+
+ return res
+
+ def clone_into_fast_storage(self, arch):
+ res = self.clone(suffix="_fast_storage")
+ res.mem_area = arch.fast_storage_mem_area
+ return res
+
+ def set_format(self, fmt, arch):
+ self.format = fmt
+ shape_len = 0
+ try:
+ shape_len = len(self.shape)
+ except TypeError:
+ pass
+
+ self.storage_rounding_quantum = arch.storage_rounding_quantums[self.format]
+ self.storage_rounding_quantum = self.storage_rounding_quantum[-shape_len:]
+ if self.format == TensorFormat.NHCWB16:
+ self.storage_rounding_quantum = self.storage_rounding_quantum[:-1] + (
+ int(self.storage_rounding_quantum[-1] / self.dtype.size_in_bytes()),
+ )
+ self.brick_size = arch.brick_sizes[self.format]
+ self.brick_size = self.brick_size[-shape_len:]
+ if self.shape is None:
+ return
+
+ self.bandwidth_shape = shape_round_to_quantum(self.shape, self.brick_size)
+ self.storage_shape = shape_round_to_quantum(self.shape, self.storage_rounding_quantum)
+
+ if fmt == TensorFormat.WeightsCompressed:
+ compression_ratio = 5 / 8
+ self.storage_compression_scale = compression_ratio
+ self.bandwidth_compression_scale = compression_ratio
+ self.compression_scale_for_worst_weight_stream = compression_ratio
+
+ def storage_elements(self):
+ elems = shape_num_elements(self.storage_shape)
+ if elems is None:
+ return 0
+ return elems
+
+ def elements(self):
+ elems = shape_num_elements(self.shape)
+ if elems is None:
+ return 0
+ return elems
+
+ def has_fully_defined_shape(self):
+ return shape_fully_defined(self.shape)
+
+ def storage_size(self):
+ raw_size = self.storage_elements() * self.element_size()
+ if raw_size == 0:
+ raw_size = 1 # force it to take up space
+ rounded_size = numeric_util.round_up(numeric_util.round_up_to_int(raw_size), self.alignment)
+ return rounded_size
+
+ def storage_size_for_sub_purpose(self, sub_purpose, param_a=None, param_b=None):
+ alt_shape = self.storage_shape_for_sub_purpose(sub_purpose, param_a, param_b)
+ elems = shape_num_elements(alt_shape)
+ if elems is None:
+ return 0
+ if sub_purpose == TensorSubPurpose.DoubleBuffer:
+ raw_size = elems * self.element_size() * self.compression_scale_for_worst_weight_stream
+ else:
+ raw_size = elems * self.element_size() * self.storage_compression_scale
+ rounded_size = numeric_util.round_up(numeric_util.round_up_to_int(raw_size), self.alignment)
+ return rounded_size
+
+ def storage_shape_for_sub_purpose(self, sub_purpose, param_a, param_b):
+ shp = list(self.storage_shape)
+ if sub_purpose == TensorSubPurpose.DoubleBuffer:
+ assert len(shp) >= 2
+ shp[-1] = min(shp[-1], param_a * 2)
+ elif sub_purpose == TensorSubPurpose.RollingBufferX:
+ assert len(shp) == 4
+ shp[0] = 1
+ shp[2] = min(shp[2], param_a)
+ elif sub_purpose == TensorSubPurpose.RollingBufferY:
+ assert len(shp) == 4
+ shp[0] = 1
+ shp[1] = min(shp[1], param_a)
+ elif sub_purpose == TensorSubPurpose.RollingBufferXY:
+ assert len(shp) == 4
+ shp[0] = 1
+ shp[2] = min(shp[2], param_a)
+ shp[1] = min(shp[1], param_b)
+ elif sub_purpose == TensorSubPurpose.Standard:
+ pass
+ else:
+ assert 0, "did not expect new sub purpose %s" % (sub_purpose,)
+ return shp
+
+ def set_new_sub_purpose(self, sub_purpose, param_a=None, param_b=None):
+ self.storage_shape = self.storage_shape_for_sub_purpose(sub_purpose, param_a, param_b)
+ self.sub_purpose = sub_purpose
+ if sub_purpose == TensorSubPurpose.DoubleBuffer:
+ self.storage_compression_scale = self.compression_scale_for_worst_weight_stream
+
+ def bandwidth(self):
+ elems = shape_num_elements(self.bandwidth_shape)
+ if elems is None:
+ return 0
+ return elems * self.element_size() * self.bandwidth_compression_scale
+
+ def consumers(self):
+ return self.consumer_list
+
+ def get_address_ranges_for_coordinates(self, start_coord, end_coord):
+ if self.sub_purpose in set(
+ (TensorSubPurpose.RollingBufferX, TensorSubPurpose.RollingBufferY, TensorSubPurpose.RollingBufferXY)
+ ):
+ # build dummy coordinates that cover the entire buffer
+ start_coord = [0] * len(start_coord)
+ end_coord = [min(self.storage_shape[i], self.shape[i]) for i in range(len(end_coord))]
+
+ start = self.address_for_coordinate(start_coord, is_top_box=False)
+ end = self.address_for_coordinate(end_coord, is_top_box=True)
+ return MemoryRangeSet(self.mem_area, start, end)
+
+ def addresses_for_rolling_buffer(self, start_coord, end_coord):
+ # returns ( box_height0, box_height1, box_width, [address_tl, address_tr, address_bl, address_br] ); box_height1 is currently always equal to box_height0
+
+ if len(start_coord) < 4:
+ box_height0 = 1
+ box_width = 1
+
+ if len(start_coord) >= 2:
+ box_width = end_coord[-2] - start_coord[-2]
+
+ return box_height0, box_height0, box_width, [self.address_for_coordinate(start_coord), None, None, None]
+
+ crossing_y = numeric_util.round_up(start_coord[1] + 1, self.storage_shape[1])
+ crossing_x = numeric_util.round_up(start_coord[2] + 1, self.storage_shape[2])
+
+ crossing_y = min(crossing_y, end_coord[1])
+ crossing_x = min(crossing_x, end_coord[2])
+
+ box_height0 = crossing_y - start_coord[1]
+ box_width = crossing_x - start_coord[2]
+
+ addresses = [None] * 4
+ addresses[0] = self.address_for_coordinate(start_coord)
+
+ if end_coord[2] > crossing_x:
+ addresses[1] = self.address_for_coordinate([start_coord[0], start_coord[1], crossing_x, start_coord[3]])
+ raise Exception("Striping in vertical direction is not supported")
+ if end_coord[1] > crossing_y:
+ addresses[2] = self.address_for_coordinate([start_coord[0], crossing_y, start_coord[2], start_coord[3]])
+ if end_coord[1] > crossing_y and end_coord[2] > crossing_x:
+ addresses[3] = self.address_for_coordinate([start_coord[0], crossing_y, crossing_x, start_coord[3]])
+
+ return box_height0, box_height0, box_width, addresses
+
+ def address_for_coordinate(self, coord, is_top_box=False):
+ return self.address + self.address_offset_for_coordinate(coord, is_top_box)
+
+ def get_strides_and_coord(self, coord=None):
+ if coord is None:
+ coord = [0] * len(self.storage_shape)
+
+ augmented_coord = coord
+ augmented_shape = self.storage_shape
+ while len(augmented_shape) < 4:
+ augmented_shape = [1] + augmented_shape
+
+ while len(augmented_coord) < 4:
+ augmented_coord = [0] + augmented_coord
+
+ assert len(augmented_coord) == len(augmented_shape)
+
+ if self.format == TensorFormat.NHWC:
+ augmented_shape = [augmented_shape[0], augmented_shape[3]] + augmented_shape[1:3] + [1]
+ augmented_coord = [augmented_coord[0], augmented_coord[3]] + augmented_coord[1:3] + [0]
+ stride_order = [4, 1, 3, 2, 0]
+
+ elif self.format == TensorFormat.NHCWB16:
+ channel_divisor = int(16 / self.element_size())
+ augmented_shape = augmented_shape[0:4] + [1]
+ augmented_coord = (
+ [augmented_coord[0], augmented_coord[3] // channel_divisor]
+ + augmented_coord[1:3]
+ + [augmented_coord[3] % channel_divisor]
+ )
+
+ if augmented_shape[1] == 0:
+ augmented_shape[1] = 1
+
+ else:
+ assert self.format in set((TensorFormat.Unknown, TensorFormat.WeightsCompressed))
+ return None, None
+
+ strides = [0] * len(augmented_shape)
+ stride = self.element_size() * self.storage_compression_scale
+
+ if self.format != TensorFormat.NHCWB16:
+ for i in stride_order:
+ strides[i] = stride
+ stride *= augmented_shape[i]
+ else:
+ assert len(strides) == 5
+ channel_divisor = int(16 / self.element_size())
+ strides[4] = stride
+ strides[3] = channel_divisor # STRIDE_X
+ strides[1] = strides[3] * augmented_shape[2] # STRIDE_C
+ strides[2] = augmented_shape[2] * augmented_shape[3] # STRIDE_Y
+ strides[0] = strides[2] * augmented_shape[1] # STRIDE_N
+
+ return strides, augmented_coord
+
+ def get_strides(self):
+ strides, _ = self.get_strides_and_coord()
+
+ return strides
+
+ def compressed_stream_index_from_coord(self, coord):
+ assert self.format == TensorFormat.WeightsCompressed
+ assert len(self.compressed_values) > 0
+ assert len(self.compressed_values) + 1 == len(self.weight_compressed_offsets)
+
+ depth = coord[-1]
+ brick_depth = self.brick_size[-1]
+ # Clamp position at final element index
+ if depth > self.shape[-1]:
+ depth = self.shape[-1]
+
+ # Always round up to next boundary
+ index = round_up_divide(depth, brick_depth)
+
+ # Check boundaries on all but last weight set (which may be shorter
+ # than the brick we divided it up into)
+ if index < len(self.weight_compressed_offsets) - 1:
+ # There are no half-way points in the weights
+ if (depth % brick_depth) != 0:
+ raise Exception("Offset into weights must be aligned to a brick")
+
+ return index
+
+ def size_of_compressed_stream(self, index):
+ assert 0 <= index < len(self.compressed_values)
+ return len(self.compressed_values[index])
+
+ def is_last_index_in_compressed_stream(self, index):
+ assert 0 <= index < len(self.compressed_values)
+ return index == len(self.compressed_values) - 1
+
+ def address_offset_for_coordinate(self, orig_coord, is_top_box=False):
+ address_offset = 0
+ coord = orig_coord
+
+ coord = coord[-len(self.storage_shape) :]
+
+ if self.sub_purpose == TensorSubPurpose.Standard:
+ for idx, c in enumerate(coord):
+ if is_top_box:
+ assert c > 0 and c <= self.shape[idx]
+ else:
+ assert c >= 0 and c < self.shape[idx]
+
+ if self.format == TensorFormat.WeightsCompressed:
+ if len(self.weight_compressed_offsets) == 0:
+ return 0
+
+ if len(self.ops) == 1 and self.ops[0].type == "DMA" and self.sub_purpose == TensorSubPurpose.DoubleBuffer:
+ depth = orig_coord[-1]
+ brick_depth = self.brick_size[-1]
+ # Clamp position at final element index
+ if depth > self.shape[-1]:
+ depth = self.shape[-1]
+
+ # Always round up to next boundary
+ index = round_up_divide(depth, brick_depth)
+ index = index % 2
+
+ if len(self.compressed_values) <= 2:
+ if is_top_box and index == 0:
+ for cv in self.compressed_values:
+ address_offset += len(cv)
+ else:
+ address_offset = index * len(self.compressed_values[0])
+ else:
+ if is_top_box and index == 0:
+ address_offset = self.storage_shape[-1]
+ else:
+ address_offset = index * (self.storage_shape[-1] // 2)
+ else:
+ index = self.compressed_stream_index_from_coord(orig_coord)
+ assert index < len(self.weight_compressed_offsets)
+ address_offset = self.weight_compressed_offsets[index]
+ else:
+ if is_top_box:
+ coord = [c - 1 for c in coord]
+
+ # handle wraparound for partial buffers; make sure to do this after subtracting the top box
+ coord = [c % self.storage_shape[idx] for idx, c in enumerate(coord)]
+
+ strides, augmented_coord = self.get_strides_and_coord(coord)
+ if strides is None:
+ return None
+
+ if is_top_box:
+ address_offset += 1 * strides[-1] # one element
+
+ address_offset += np.dot(augmented_coord, strides)
+
+ assert address_offset >= 0
+ assert address_offset <= self.storage_size()
+ return address_offset
+
+ def __str__(self):
+ return "<nng.Tensor '%s' shape=%s dtype=%s>" % (self.name, self.shape, self.dtype)
+
+ __repr__ = __str__
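
The dequantize method above applies the standard affine mapping q -> (q - zero_point) * scale; in the per-tensor branch this is a single vectorised NumPy expression. A minimal standalone sketch of that formula, with illustrative quantization parameters only:

    import numpy as np

    # Hypothetical int8 quantization parameters
    scale_f32 = 0.05
    zero_point = -3
    quant_values = np.array([-3, 0, 17, 120], dtype=np.int8)

    # Same affine mapping as QuantizationParameters.dequantize (per-tensor case)
    dequantized = (quant_values.astype(np.float64) - zero_point) * scale_f32
    print(dequantized)  # [0.   0.15 1.   6.15]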
diff --git a/ethosu/vela/tensor_allocation.py b/ethosu/vela/tensor_allocation.py
new file mode 100644
index 00000000..94aa6088
--- /dev/null
+++ b/ethosu/vela/tensor_allocation.py
@@ -0,0 +1,139 @@
+# Copyright (C) 2020 Arm Limited or its affiliates. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the License); you may
+# not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an AS IS BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+# Description:
+# Wrapping function for tensor address allocation, i.e. assigning addresses to tensors based on the allowable
+# overlaps calculated by the live range analysis.
+
+import math
+
+import numpy as np
+
+from . import live_range, numeric_util
+from .greedy_allocation import allocate_live_ranges as greedy_allocate_live_ranges
+from .nn_graph import PassPlacement, TensorAllocator
+from .tensor import MemArea
+
+
+def linear_allocate_live_ranges(live_ranges, alloc_granularity=256):
+ total_sz = 0
+ allocated_tensors = []
+
+ # just assign increasing addresses
+ for tens, lr in live_ranges.ranges.items():
+ if tens in allocated_tensors:
+ continue
+
+ lr.set_address(total_sz)
+ allocated_tensors += lr.tensors
+ total_sz += numeric_util.round_up(int(math.ceil(lr.size)), alloc_granularity)
+
+ return total_sz
+
+
+def mark_sram_used_for_cascaded_passes(sg, lrs):
+ end_pos = max(ps.time for ps in sg.cascaded_passes) + 2
+ mem_usage = np.zeros(end_pos, dtype=np.int64)
+
+ for tens, rng in lrs.ranges.items():
+ storage_size = tens.storage_size()
+ mem_usage[rng.start_time : rng.end_time] += storage_size
+
+ for cps in sg.cascaded_passes:
+ sram_used = max(mem_usage[cps.time], mem_usage[cps.time + 1])
+ cps.sram_used = sram_used
+ for ps in cps.passes:
+ ps.sram_used = sram_used
+
+
+def print_allocation(lrs, mem_area, sg, verbose_allocation, show_minimum_possible_allocation):
+ if verbose_allocation:
+ if mem_area == MemArea.Sram:
+ print("allocation for", mem_area, "- non-constant tensors in Cpu and Npu subgraphs")
+ else:
+ print("allocation for", mem_area, "- constant tensors in", sg.placement.name, "subgraph(s)")
+ for start_time, start, end, name, end_time in sorted(
+ (
+ lr.start_time,
+ tens.address,
+ tens.address + int(math.ceil(tens.storage_size())),
+ tens.name + " " + str(tens.purpose),
+ lr.end_time,
+ )
+ for tens, lr in lrs.ranges.items()
+ ):
+ name = name.replace("\x00", "")
+ print("%9d: %#12x - %#12x: %3d - %3d %s" % ((end - start), start, end, start_time, end_time, name))
+ print()
+
+ if show_minimum_possible_allocation and mem_area == MemArea.Sram:
+ min_possible_allocation = max(cps.sram_used for cps in sg.cascaded_passes)
+ print(
+ "Min possible allocation %d bytes / %.1f KB / %.1f MB"
+ % (min_possible_allocation, min_possible_allocation / 1024, min_possible_allocation / 1024 / 1024)
+ )
+
+
+def allocate_tensors(
+ nng,
+ sg,
+ arch,
+ mem_area,
+ use_ifm_ofm_overlap=True,
+ tensor_allocator=TensorAllocator.Greedy,
+ verbose_allocation=False,
+ show_minimum_possible_allocation=False,
+ lr_graph=None,
+):
+ ignore_subgraph_input_output_tensors = False
+ lrs = live_range.extract_live_ranges_from_cascaded_passes(
+ sg,
+ mem_area,
+ mark_output_tensors_overlapping_with_input_tensors=False,
+ use_ifm_ofm_overlap=use_ifm_ofm_overlap,
+ ignore_subgraph_input_output_tensors=ignore_subgraph_input_output_tensors,
+ lr_graph=lr_graph,
+ )
+
+ if lrs.ranges:
+ tens_alloc = tensor_allocator
+ if tens_alloc == TensorAllocator.Greedy:
+ total_sz = greedy_allocate_live_ranges(sg, arch, lrs, mem_area, verbose_allocation)
+ elif tens_alloc == TensorAllocator.LinearAlloc:
+ total_sz = linear_allocate_live_ranges(lrs)
+ else:
+ assert 0
+
+ sg.memory_used[mem_area] = total_sz
+
+ nng.total_size[mem_area] = nng.total_size.get(mem_area, 0) + sum(tens.storage_size() for tens in lrs.ranges)
+ nng.total_elements[mem_area] = nng.total_elements.get(mem_area, 0) + sum(tens.elements() for tens in lrs.ranges)
+
+ print_allocation(lrs, mem_area, sg, verbose_allocation, show_minimum_possible_allocation)
+
+ if mem_area == MemArea.Sram:
+ # Mark Sram usage for all subgraphs
+ for sg_ in nng.subgraphs:
+ mark_sram_used_for_cascaded_passes(sg_, lrs)
+
+ if sg == nng.get_root_subgraph():
+ nng.memory_used = sg.memory_used
+ for mem_area in nng.total_elements.keys():
+ try:
+ nng.bits_per_element[mem_area] = nng.total_size[mem_area] * 8 / nng.total_elements[mem_area]
+ except ZeroDivisionError:
+ nng.bits_per_element[mem_area] = 0.0
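
For intuition, linear_allocate_live_ranges above simply hands out increasing addresses, padding each live range up to the allocation granularity. A standalone sketch of the same arithmetic, using a local round_up helper assumed to behave like numeric_util.round_up (round up to the nearest multiple) and hypothetical live-range sizes:

    import math


    def round_up(value, quantum):
        # round value up to the nearest multiple of quantum
        return ((value + quantum - 1) // quantum) * quantum


    sizes = [1000, 300, 4096]  # live-range sizes in bytes
    total_sz = 0
    for size in sizes:
        address = total_sz  # this live range starts at the current high-water mark
        total_sz += round_up(int(math.ceil(size)), 256)
        print("address %#x size %d" % (address, size))

    print("footprint", total_sz)  # 1024 + 512 + 4096 = 5632 bytes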
diff --git a/ethosu/vela/tflite/AbsOptions.py b/ethosu/vela/tflite/AbsOptions.py
new file mode 100644
index 00000000..0cbfb8c0
--- /dev/null
+++ b/ethosu/vela/tflite/AbsOptions.py
@@ -0,0 +1,22 @@
+# automatically generated by the FlatBuffers compiler, do not modify
+
+# namespace: tflite
+
+import flatbuffers
+
+class AbsOptions(object):
+ __slots__ = ['_tab']
+
+ @classmethod
+ def GetRootAsAbsOptions(cls, buf, offset):
+ n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset)
+ x = AbsOptions()
+ x.Init(buf, n + offset)
+ return x
+
+ # AbsOptions
+ def Init(self, buf, pos):
+ self._tab = flatbuffers.table.Table(buf, pos)
+
+def AbsOptionsStart(builder): builder.StartObject(0)
+def AbsOptionsEnd(builder): return builder.EndObject()
diff --git a/ethosu/vela/tflite/ActivationFunctionType.py b/ethosu/vela/tflite/ActivationFunctionType.py
new file mode 100644
index 00000000..6d8ec952
--- /dev/null
+++ b/ethosu/vela/tflite/ActivationFunctionType.py
@@ -0,0 +1,11 @@
+# automatically generated by the FlatBuffers compiler, do not modify
+
+# namespace: tflite
+
+class ActivationFunctionType(object):
+ NONE = 0
+ RELU = 1
+ RELU_N1_TO_1 = 2
+ RELU6 = 3
+ TANH = 4
+ SIGN_BIT = 5
diff --git a/ethosu/vela/tflite/AddNOptions.py b/ethosu/vela/tflite/AddNOptions.py
new file mode 100644
index 00000000..b5c2ddb7
--- /dev/null
+++ b/ethosu/vela/tflite/AddNOptions.py
@@ -0,0 +1,22 @@
+# automatically generated by the FlatBuffers compiler, do not modify
+
+# namespace: tflite
+
+import flatbuffers
+
+class AddNOptions(object):
+ __slots__ = ['_tab']
+
+ @classmethod
+ def GetRootAsAddNOptions(cls, buf, offset):
+ n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset)
+ x = AddNOptions()
+ x.Init(buf, n + offset)
+ return x
+
+ # AddNOptions
+ def Init(self, buf, pos):
+ self._tab = flatbuffers.table.Table(buf, pos)
+
+def AddNOptionsStart(builder): builder.StartObject(0)
+def AddNOptionsEnd(builder): return builder.EndObject()
diff --git a/ethosu/vela/tflite/AddOptions.py b/ethosu/vela/tflite/AddOptions.py
new file mode 100644
index 00000000..d6cbfcf5
--- /dev/null
+++ b/ethosu/vela/tflite/AddOptions.py
@@ -0,0 +1,30 @@
+# automatically generated by the FlatBuffers compiler, do not modify
+
+# namespace: tflite
+
+import flatbuffers
+
+class AddOptions(object):
+ __slots__ = ['_tab']
+
+ @classmethod
+ def GetRootAsAddOptions(cls, buf, offset):
+ n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset)
+ x = AddOptions()
+ x.Init(buf, n + offset)
+ return x
+
+ # AddOptions
+ def Init(self, buf, pos):
+ self._tab = flatbuffers.table.Table(buf, pos)
+
+ # AddOptions
+ def FusedActivationFunction(self):
+ o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4))
+ if o != 0:
+ return self._tab.Get(flatbuffers.number_types.Int8Flags, o + self._tab.Pos)
+ return 0
+
+def AddOptionsStart(builder): builder.StartObject(1)
+def AddOptionsAddFusedActivationFunction(builder, fusedActivationFunction): builder.PrependInt8Slot(0, fusedActivationFunction, 0)
+def AddOptionsEnd(builder): return builder.EndObject()
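
These generated modules are used in pairs: the module-level *Start/*Add/*End functions serialise a table through a flatbuffers.Builder, and the GetRootAs* class method reads it back. A small round-trip sketch for AddOptions, assuming only that the flatbuffers Python runtime is installed:

    import flatbuffers

    from ethosu.vela.tflite import AddOptions
    from ethosu.vela.tflite.ActivationFunctionType import ActivationFunctionType

    # Serialise an AddOptions table with a fused RELU activation
    builder = flatbuffers.Builder(0)
    AddOptions.AddOptionsStart(builder)
    AddOptions.AddOptionsAddFusedActivationFunction(builder, ActivationFunctionType.RELU)
    builder.Finish(AddOptions.AddOptionsEnd(builder))

    # Read it back through the generated accessor
    opts = AddOptions.AddOptions.GetRootAsAddOptions(builder.Output(), 0)
    assert opts.FusedActivationFunction() == ActivationFunctionType.RELU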
diff --git a/ethosu/vela/tflite/ArgMaxOptions.py b/ethosu/vela/tflite/ArgMaxOptions.py
new file mode 100644
index 00000000..fbf1415e
--- /dev/null
+++ b/ethosu/vela/tflite/ArgMaxOptions.py
@@ -0,0 +1,30 @@
+# automatically generated by the FlatBuffers compiler, do not modify
+
+# namespace: tflite
+
+import flatbuffers
+
+class ArgMaxOptions(object):
+ __slots__ = ['_tab']
+
+ @classmethod
+ def GetRootAsArgMaxOptions(cls, buf, offset):
+ n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset)
+ x = ArgMaxOptions()
+ x.Init(buf, n + offset)
+ return x
+
+ # ArgMaxOptions
+ def Init(self, buf, pos):
+ self._tab = flatbuffers.table.Table(buf, pos)
+
+ # ArgMaxOptions
+ def OutputType(self):
+ o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4))
+ if o != 0:
+ return self._tab.Get(flatbuffers.number_types.Int8Flags, o + self._tab.Pos)
+ return 0
+
+def ArgMaxOptionsStart(builder): builder.StartObject(1)
+def ArgMaxOptionsAddOutputType(builder, outputType): builder.PrependInt8Slot(0, outputType, 0)
+def ArgMaxOptionsEnd(builder): return builder.EndObject()
diff --git a/ethosu/vela/tflite/ArgMinOptions.py b/ethosu/vela/tflite/ArgMinOptions.py
new file mode 100644
index 00000000..120fdca2
--- /dev/null
+++ b/ethosu/vela/tflite/ArgMinOptions.py
@@ -0,0 +1,30 @@
+# automatically generated by the FlatBuffers compiler, do not modify
+
+# namespace: tflite
+
+import flatbuffers
+
+class ArgMinOptions(object):
+ __slots__ = ['_tab']
+
+ @classmethod
+ def GetRootAsArgMinOptions(cls, buf, offset):
+ n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset)
+ x = ArgMinOptions()
+ x.Init(buf, n + offset)
+ return x
+
+ # ArgMinOptions
+ def Init(self, buf, pos):
+ self._tab = flatbuffers.table.Table(buf, pos)
+
+ # ArgMinOptions
+ def OutputType(self):
+ o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4))
+ if o != 0:
+ return self._tab.Get(flatbuffers.number_types.Int8Flags, o + self._tab.Pos)
+ return 0
+
+def ArgMinOptionsStart(builder): builder.StartObject(1)
+def ArgMinOptionsAddOutputType(builder, outputType): builder.PrependInt8Slot(0, outputType, 0)
+def ArgMinOptionsEnd(builder): return builder.EndObject()
diff --git a/ethosu/vela/tflite/BatchToSpaceNDOptions.py b/ethosu/vela/tflite/BatchToSpaceNDOptions.py
new file mode 100644
index 00000000..3ddcfd3f
--- /dev/null
+++ b/ethosu/vela/tflite/BatchToSpaceNDOptions.py
@@ -0,0 +1,22 @@
+# automatically generated by the FlatBuffers compiler, do not modify
+
+# namespace: tflite
+
+import flatbuffers
+
+class BatchToSpaceNDOptions(object):
+ __slots__ = ['_tab']
+
+ @classmethod
+ def GetRootAsBatchToSpaceNDOptions(cls, buf, offset):
+ n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset)
+ x = BatchToSpaceNDOptions()
+ x.Init(buf, n + offset)
+ return x
+
+ # BatchToSpaceNDOptions
+ def Init(self, buf, pos):
+ self._tab = flatbuffers.table.Table(buf, pos)
+
+def BatchToSpaceNDOptionsStart(builder): builder.StartObject(0)
+def BatchToSpaceNDOptionsEnd(builder): return builder.EndObject()
diff --git a/ethosu/vela/tflite/BidirectionalSequenceLSTMOptions.py b/ethosu/vela/tflite/BidirectionalSequenceLSTMOptions.py
new file mode 100644
index 00000000..8d8b7bea
--- /dev/null
+++ b/ethosu/vela/tflite/BidirectionalSequenceLSTMOptions.py
@@ -0,0 +1,62 @@
+# automatically generated by the FlatBuffers compiler, do not modify
+
+# namespace: tflite
+
+import flatbuffers
+
+class BidirectionalSequenceLSTMOptions(object):
+ __slots__ = ['_tab']
+
+ @classmethod
+ def GetRootAsBidirectionalSequenceLSTMOptions(cls, buf, offset):
+ n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset)
+ x = BidirectionalSequenceLSTMOptions()
+ x.Init(buf, n + offset)
+ return x
+
+ # BidirectionalSequenceLSTMOptions
+ def Init(self, buf, pos):
+ self._tab = flatbuffers.table.Table(buf, pos)
+
+ # BidirectionalSequenceLSTMOptions
+ def FusedActivationFunction(self):
+ o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4))
+ if o != 0:
+ return self._tab.Get(flatbuffers.number_types.Int8Flags, o + self._tab.Pos)
+ return 0
+
+ # BidirectionalSequenceLSTMOptions
+ def CellClip(self):
+ o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(6))
+ if o != 0:
+ return self._tab.Get(flatbuffers.number_types.Float32Flags, o + self._tab.Pos)
+ return 0.0
+
+ # BidirectionalSequenceLSTMOptions
+ def ProjClip(self):
+ o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(8))
+ if o != 0:
+ return self._tab.Get(flatbuffers.number_types.Float32Flags, o + self._tab.Pos)
+ return 0.0
+
+ # BidirectionalSequenceLSTMOptions
+ def MergeOutputs(self):
+ o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(10))
+ if o != 0:
+ return bool(self._tab.Get(flatbuffers.number_types.BoolFlags, o + self._tab.Pos))
+ return False
+
+ # BidirectionalSequenceLSTMOptions
+ def TimeMajor(self):
+ o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(12))
+ if o != 0:
+ return bool(self._tab.Get(flatbuffers.number_types.BoolFlags, o + self._tab.Pos))
+ return True
+
+def BidirectionalSequenceLSTMOptionsStart(builder): builder.StartObject(5)
+def BidirectionalSequenceLSTMOptionsAddFusedActivationFunction(builder, fusedActivationFunction): builder.PrependInt8Slot(0, fusedActivationFunction, 0)
+def BidirectionalSequenceLSTMOptionsAddCellClip(builder, cellClip): builder.PrependFloat32Slot(1, cellClip, 0.0)
+def BidirectionalSequenceLSTMOptionsAddProjClip(builder, projClip): builder.PrependFloat32Slot(2, projClip, 0.0)
+def BidirectionalSequenceLSTMOptionsAddMergeOutputs(builder, mergeOutputs): builder.PrependBoolSlot(3, mergeOutputs, 0)
+def BidirectionalSequenceLSTMOptionsAddTimeMajor(builder, timeMajor): builder.PrependBoolSlot(4, timeMajor, 1)
+def BidirectionalSequenceLSTMOptionsEnd(builder): return builder.EndObject()
diff --git a/ethosu/vela/tflite/BidirectionalSequenceRNNOptions.py b/ethosu/vela/tflite/BidirectionalSequenceRNNOptions.py
new file mode 100644
index 00000000..673af6b9
--- /dev/null
+++ b/ethosu/vela/tflite/BidirectionalSequenceRNNOptions.py
@@ -0,0 +1,46 @@
+# automatically generated by the FlatBuffers compiler, do not modify
+
+# namespace: tflite
+
+import flatbuffers
+
+class BidirectionalSequenceRNNOptions(object):
+ __slots__ = ['_tab']
+
+ @classmethod
+ def GetRootAsBidirectionalSequenceRNNOptions(cls, buf, offset):
+ n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset)
+ x = BidirectionalSequenceRNNOptions()
+ x.Init(buf, n + offset)
+ return x
+
+ # BidirectionalSequenceRNNOptions
+ def Init(self, buf, pos):
+ self._tab = flatbuffers.table.Table(buf, pos)
+
+ # BidirectionalSequenceRNNOptions
+ def TimeMajor(self):
+ o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4))
+ if o != 0:
+ return bool(self._tab.Get(flatbuffers.number_types.BoolFlags, o + self._tab.Pos))
+ return False
+
+ # BidirectionalSequenceRNNOptions
+ def FusedActivationFunction(self):
+ o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(6))
+ if o != 0:
+ return self._tab.Get(flatbuffers.number_types.Int8Flags, o + self._tab.Pos)
+ return 0
+
+ # BidirectionalSequenceRNNOptions
+ def MergeOutputs(self):
+ o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(8))
+ if o != 0:
+ return bool(self._tab.Get(flatbuffers.number_types.BoolFlags, o + self._tab.Pos))
+ return False
+
+def BidirectionalSequenceRNNOptionsStart(builder): builder.StartObject(3)
+def BidirectionalSequenceRNNOptionsAddTimeMajor(builder, timeMajor): builder.PrependBoolSlot(0, timeMajor, 0)
+def BidirectionalSequenceRNNOptionsAddFusedActivationFunction(builder, fusedActivationFunction): builder.PrependInt8Slot(1, fusedActivationFunction, 0)
+def BidirectionalSequenceRNNOptionsAddMergeOutputs(builder, mergeOutputs): builder.PrependBoolSlot(2, mergeOutputs, 0)
+def BidirectionalSequenceRNNOptionsEnd(builder): return builder.EndObject()
diff --git a/ethosu/vela/tflite/Buffer.py b/ethosu/vela/tflite/Buffer.py
new file mode 100644
index 00000000..754dee3b
--- /dev/null
+++ b/ethosu/vela/tflite/Buffer.py
@@ -0,0 +1,46 @@
+# automatically generated by the FlatBuffers compiler, do not modify
+
+# namespace: tflite
+
+import flatbuffers
+
+class Buffer(object):
+ __slots__ = ['_tab']
+
+ @classmethod
+ def GetRootAsBuffer(cls, buf, offset):
+ n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset)
+ x = Buffer()
+ x.Init(buf, n + offset)
+ return x
+
+ # Buffer
+ def Init(self, buf, pos):
+ self._tab = flatbuffers.table.Table(buf, pos)
+
+ # Buffer
+ def Data(self, j):
+ o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4))
+ if o != 0:
+ a = self._tab.Vector(o)
+ return self._tab.Get(flatbuffers.number_types.Uint8Flags, a + flatbuffers.number_types.UOffsetTFlags.py_type(j * 1))
+ return 0
+
+ # Buffer
+ def DataAsNumpy(self):
+ o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4))
+ if o != 0:
+ return self._tab.GetVectorAsNumpy(flatbuffers.number_types.Uint8Flags, o)
+ return 0
+
+ # Buffer
+ def DataLength(self):
+ o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4))
+ if o != 0:
+ return self._tab.VectorLen(o)
+ return 0
+
+def BufferStart(builder): builder.StartObject(1)
+def BufferAddData(builder, data): builder.PrependUOffsetTRelativeSlot(0, flatbuffers.number_types.UOffsetTFlags.py_type(data), 0)
+def BufferStartDataVector(builder, numElems): return builder.StartVector(1, numElems, 1)
+def BufferEnd(builder): return builder.EndObject()
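
Buffer is the one table here that carries a raw byte vector; FlatBuffers vectors are prepended back-to-front before the table that references them. A round-trip sketch, assuming a flatbuffers 1.x runtime (where EndVector still takes the element count):

    import flatbuffers

    from ethosu.vela.tflite import Buffer

    builder = flatbuffers.Builder(0)
    payload = bytes([1, 2, 3, 4])

    # Build the data vector first, prepending bytes in reverse order
    Buffer.BufferStartDataVector(builder, len(payload))
    for byte in reversed(payload):
        builder.PrependUint8(byte)
    data_offset = builder.EndVector(len(payload))

    # Then build the Buffer table that points at the vector
    Buffer.BufferStart(builder)
    Buffer.BufferAddData(builder, data_offset)
    builder.Finish(Buffer.BufferEnd(builder))

    buf = Buffer.Buffer.GetRootAsBuffer(builder.Output(), 0)
    print(buf.DataLength())   # 4
    print(buf.DataAsNumpy())  # array([1, 2, 3, 4], dtype=uint8)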
diff --git a/ethosu/vela/tflite/BuiltinOperator.py b/ethosu/vela/tflite/BuiltinOperator.py
new file mode 100644
index 00000000..27136538
--- /dev/null
+++ b/ethosu/vela/tflite/BuiltinOperator.py
@@ -0,0 +1,131 @@
+# automatically generated by the FlatBuffers compiler, do not modify
+
+# namespace: tflite
+
+class BuiltinOperator(object):
+ ADD = 0
+ AVERAGE_POOL_2D = 1
+ CONCATENATION = 2
+ CONV_2D = 3
+ DEPTHWISE_CONV_2D = 4
+ DEPTH_TO_SPACE = 5
+ DEQUANTIZE = 6
+ EMBEDDING_LOOKUP = 7
+ FLOOR = 8
+ FULLY_CONNECTED = 9
+ HASHTABLE_LOOKUP = 10
+ L2_NORMALIZATION = 11
+ L2_POOL_2D = 12
+ LOCAL_RESPONSE_NORMALIZATION = 13
+ LOGISTIC = 14
+ LSH_PROJECTION = 15
+ LSTM = 16
+ MAX_POOL_2D = 17
+ MUL = 18
+ RELU = 19
+ RELU_N1_TO_1 = 20
+ RELU6 = 21
+ RESHAPE = 22
+ RESIZE_BILINEAR = 23
+ RNN = 24
+ SOFTMAX = 25
+ SPACE_TO_DEPTH = 26
+ SVDF = 27
+ TANH = 28
+ CONCAT_EMBEDDINGS = 29
+ SKIP_GRAM = 30
+ CALL = 31
+ CUSTOM = 32
+ EMBEDDING_LOOKUP_SPARSE = 33
+ PAD = 34
+ UNIDIRECTIONAL_SEQUENCE_RNN = 35
+ GATHER = 36
+ BATCH_TO_SPACE_ND = 37
+ SPACE_TO_BATCH_ND = 38
+ TRANSPOSE = 39
+ MEAN = 40
+ SUB = 41
+ DIV = 42
+ SQUEEZE = 43
+ UNIDIRECTIONAL_SEQUENCE_LSTM = 44
+ STRIDED_SLICE = 45
+ BIDIRECTIONAL_SEQUENCE_RNN = 46
+ EXP = 47
+ TOPK_V2 = 48
+ SPLIT = 49
+ LOG_SOFTMAX = 50
+ DELEGATE = 51
+ BIDIRECTIONAL_SEQUENCE_LSTM = 52
+ CAST = 53
+ PRELU = 54
+ MAXIMUM = 55
+ ARG_MAX = 56
+ MINIMUM = 57
+ LESS = 58
+ NEG = 59
+ PADV2 = 60
+ GREATER = 61
+ GREATER_EQUAL = 62
+ LESS_EQUAL = 63
+ SELECT = 64
+ SLICE = 65
+ SIN = 66
+ TRANSPOSE_CONV = 67
+ SPARSE_TO_DENSE = 68
+ TILE = 69
+ EXPAND_DIMS = 70
+ EQUAL = 71
+ NOT_EQUAL = 72
+ LOG = 73
+ SUM = 74
+ SQRT = 75
+ RSQRT = 76
+ SHAPE = 77
+ POW = 78
+ ARG_MIN = 79
+ FAKE_QUANT = 80
+ REDUCE_PROD = 81
+ REDUCE_MAX = 82
+ PACK = 83
+ LOGICAL_OR = 84
+ ONE_HOT = 85
+ LOGICAL_AND = 86
+ LOGICAL_NOT = 87
+ UNPACK = 88
+ REDUCE_MIN = 89
+ FLOOR_DIV = 90
+ REDUCE_ANY = 91
+ SQUARE = 92
+ ZEROS_LIKE = 93
+ FILL = 94
+ FLOOR_MOD = 95
+ RANGE = 96
+ RESIZE_NEAREST_NEIGHBOR = 97
+ LEAKY_RELU = 98
+ SQUARED_DIFFERENCE = 99
+ MIRROR_PAD = 100
+ ABS = 101
+ SPLIT_V = 102
+ UNIQUE = 103
+ CEIL = 104
+ REVERSE_V2 = 105
+ ADD_N = 106
+ GATHER_ND = 107
+ COS = 108
+ WHERE = 109
+ RANK = 110
+ ELU = 111
+ REVERSE_SEQUENCE = 112
+ MATRIX_DIAG = 113
+ QUANTIZE = 114
+ MATRIX_SET_DIAG = 115
+ ROUND = 116
+ HARD_SWISH = 117
+ IF = 118
+ WHILE = 119
+ NON_MAX_SUPPRESSION_V4 = 120
+ NON_MAX_SUPPRESSION_V5 = 121
+ SCATTER_ND = 122
+ SELECT_V2 = 123
+ DENSIFY = 124
+ SEGMENT_SUM = 125
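
BuiltinOperator is a plain namespace of integer opcodes, so a reverse code-to-name lookup is occasionally useful when inspecting a model. A trivial standalone sketch:

    from ethosu.vela.tflite.BuiltinOperator import BuiltinOperator

    # Map opcode -> name, skipping Python's implicit dunder attributes
    opcode_names = {
        value: name for name, value in vars(BuiltinOperator).items() if not name.startswith("_")
    }

    print(opcode_names[3])    # CONV_2D
    print(opcode_names[122])  # SCATTER_ND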
diff --git a/ethosu/vela/tflite/BuiltinOptions.py b/ethosu/vela/tflite/BuiltinOptions.py
new file mode 100644
index 00000000..babbcb15
--- /dev/null
+++ b/ethosu/vela/tflite/BuiltinOptions.py
@@ -0,0 +1,106 @@
+# automatically generated by the FlatBuffers compiler, do not modify
+
+# namespace: tflite
+
+class BuiltinOptions(object):
+ NONE = 0
+ Conv2DOptions = 1
+ DepthwiseConv2DOptions = 2
+ ConcatEmbeddingsOptions = 3
+ LSHProjectionOptions = 4
+ Pool2DOptions = 5
+ SVDFOptions = 6
+ RNNOptions = 7
+ FullyConnectedOptions = 8
+ SoftmaxOptions = 9
+ ConcatenationOptions = 10
+ AddOptions = 11
+ L2NormOptions = 12
+ LocalResponseNormalizationOptions = 13
+ LSTMOptions = 14
+ ResizeBilinearOptions = 15
+ CallOptions = 16
+ ReshapeOptions = 17
+ SkipGramOptions = 18
+ SpaceToDepthOptions = 19
+ EmbeddingLookupSparseOptions = 20
+ MulOptions = 21
+ PadOptions = 22
+ GatherOptions = 23
+ BatchToSpaceNDOptions = 24
+ SpaceToBatchNDOptions = 25
+ TransposeOptions = 26
+ ReducerOptions = 27
+ SubOptions = 28
+ DivOptions = 29
+ SqueezeOptions = 30
+ SequenceRNNOptions = 31
+ StridedSliceOptions = 32
+ ExpOptions = 33
+ TopKV2Options = 34
+ SplitOptions = 35
+ LogSoftmaxOptions = 36
+ CastOptions = 37
+ DequantizeOptions = 38
+ MaximumMinimumOptions = 39
+ ArgMaxOptions = 40
+ LessOptions = 41
+ NegOptions = 42
+ PadV2Options = 43
+ GreaterOptions = 44
+ GreaterEqualOptions = 45
+ LessEqualOptions = 46
+ SelectOptions = 47
+ SliceOptions = 48
+ TransposeConvOptions = 49
+ SparseToDenseOptions = 50
+ TileOptions = 51
+ ExpandDimsOptions = 52
+ EqualOptions = 53
+ NotEqualOptions = 54
+ ShapeOptions = 55
+ PowOptions = 56
+ ArgMinOptions = 57
+ FakeQuantOptions = 58
+ PackOptions = 59
+ LogicalOrOptions = 60
+ OneHotOptions = 61
+ LogicalAndOptions = 62
+ LogicalNotOptions = 63
+ UnpackOptions = 64
+ FloorDivOptions = 65
+ SquareOptions = 66
+ ZerosLikeOptions = 67
+ FillOptions = 68
+ BidirectionalSequenceLSTMOptions = 69
+ BidirectionalSequenceRNNOptions = 70
+ UnidirectionalSequenceLSTMOptions = 71
+ FloorModOptions = 72
+ RangeOptions = 73
+ ResizeNearestNeighborOptions = 74
+ LeakyReluOptions = 75
+ SquaredDifferenceOptions = 76
+ MirrorPadOptions = 77
+ AbsOptions = 78
+ SplitVOptions = 79
+ UniqueOptions = 80
+ ReverseV2Options = 81
+ AddNOptions = 82
+ GatherNdOptions = 83
+ CosOptions = 84
+ WhereOptions = 85
+ RankOptions = 86
+ ReverseSequenceOptions = 87
+ MatrixDiagOptions = 88
+ QuantizeOptions = 89
+ MatrixSetDiagOptions = 90
+ HardSwishOptions = 91
+ IfOptions = 92
+ WhileOptions = 93
+ DepthToSpaceOptions = 94
+ NonMaxSuppressionV4Options = 95
+ NonMaxSuppressionV5Options = 96
+ ScatterNdOptions = 97
+ SelectV2Options = 98
+ DensifyOptions = 99
+ SegmentSumOptions = 100
diff --git a/ethosu/vela/tflite/CallOptions.py b/ethosu/vela/tflite/CallOptions.py
new file mode 100644
index 00000000..5ae2eeae
--- /dev/null
+++ b/ethosu/vela/tflite/CallOptions.py
@@ -0,0 +1,30 @@
+# automatically generated by the FlatBuffers compiler, do not modify
+
+# namespace: tflite
+
+import flatbuffers
+
+class CallOptions(object):
+ __slots__ = ['_tab']
+
+ @classmethod
+ def GetRootAsCallOptions(cls, buf, offset):
+ n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset)
+ x = CallOptions()
+ x.Init(buf, n + offset)
+ return x
+
+ # CallOptions
+ def Init(self, buf, pos):
+ self._tab = flatbuffers.table.Table(buf, pos)
+
+ # CallOptions
+ def Subgraph(self):
+ o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4))
+ if o != 0:
+ return self._tab.Get(flatbuffers.number_types.Uint32Flags, o + self._tab.Pos)
+ return 0
+
+def CallOptionsStart(builder): builder.StartObject(1)
+def CallOptionsAddSubgraph(builder, subgraph): builder.PrependUint32Slot(0, subgraph, 0)
+def CallOptionsEnd(builder): return builder.EndObject()
diff --git a/ethosu/vela/tflite/CastOptions.py b/ethosu/vela/tflite/CastOptions.py
new file mode 100644
index 00000000..70ae2e37
--- /dev/null
+++ b/ethosu/vela/tflite/CastOptions.py
@@ -0,0 +1,38 @@
+# automatically generated by the FlatBuffers compiler, do not modify
+
+# namespace: tflite
+
+import flatbuffers
+
+class CastOptions(object):
+ __slots__ = ['_tab']
+
+ @classmethod
+ def GetRootAsCastOptions(cls, buf, offset):
+ n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset)
+ x = CastOptions()
+ x.Init(buf, n + offset)
+ return x
+
+ # CastOptions
+ def Init(self, buf, pos):
+ self._tab = flatbuffers.table.Table(buf, pos)
+
+ # CastOptions
+ def InDataType(self):
+ o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4))
+ if o != 0:
+ return self._tab.Get(flatbuffers.number_types.Int8Flags, o + self._tab.Pos)
+ return 0
+
+ # CastOptions
+ def OutDataType(self):
+ o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(6))
+ if o != 0:
+ return self._tab.Get(flatbuffers.number_types.Int8Flags, o + self._tab.Pos)
+ return 0
+
+def CastOptionsStart(builder): builder.StartObject(2)
+def CastOptionsAddInDataType(builder, inDataType): builder.PrependInt8Slot(0, inDataType, 0)
+def CastOptionsAddOutDataType(builder, outDataType): builder.PrependInt8Slot(1, outDataType, 0)
+def CastOptionsEnd(builder): return builder.EndObject()
diff --git a/ethosu/vela/tflite/CombinerType.py b/ethosu/vela/tflite/CombinerType.py
new file mode 100644
index 00000000..1e3a61f3
--- /dev/null
+++ b/ethosu/vela/tflite/CombinerType.py
@@ -0,0 +1,8 @@
+# automatically generated by the FlatBuffers compiler, do not modify
+
+# namespace: tflite
+
+class CombinerType(object):
+ SUM = 0
+ MEAN = 1
+ SQRTN = 2
diff --git a/ethosu/vela/tflite/ConcatEmbeddingsOptions.py b/ethosu/vela/tflite/ConcatEmbeddingsOptions.py
new file mode 100644
index 00000000..9d26c510
--- /dev/null
+++ b/ethosu/vela/tflite/ConcatEmbeddingsOptions.py
@@ -0,0 +1,78 @@
+# automatically generated by the FlatBuffers compiler, do not modify
+
+# namespace: tflite
+
+import flatbuffers
+
+class ConcatEmbeddingsOptions(object):
+ __slots__ = ['_tab']
+
+ @classmethod
+ def GetRootAsConcatEmbeddingsOptions(cls, buf, offset):
+ n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset)
+ x = ConcatEmbeddingsOptions()
+ x.Init(buf, n + offset)
+ return x
+
+ # ConcatEmbeddingsOptions
+ def Init(self, buf, pos):
+ self._tab = flatbuffers.table.Table(buf, pos)
+
+ # ConcatEmbeddingsOptions
+ def NumChannels(self):
+ o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4))
+ if o != 0:
+ return self._tab.Get(flatbuffers.number_types.Int32Flags, o + self._tab.Pos)
+ return 0
+
+ # ConcatEmbeddingsOptions
+ def NumColumnsPerChannel(self, j):
+ o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(6))
+ if o != 0:
+ a = self._tab.Vector(o)
+ return self._tab.Get(flatbuffers.number_types.Int32Flags, a + flatbuffers.number_types.UOffsetTFlags.py_type(j * 4))
+ return 0
+
+ # ConcatEmbeddingsOptions
+ def NumColumnsPerChannelAsNumpy(self):
+ o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(6))
+ if o != 0:
+ return self._tab.GetVectorAsNumpy(flatbuffers.number_types.Int32Flags, o)
+ return 0
+
+ # ConcatEmbeddingsOptions
+ def NumColumnsPerChannelLength(self):
+ o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(6))
+ if o != 0:
+ return self._tab.VectorLen(o)
+ return 0
+
+ # ConcatEmbeddingsOptions
+ def EmbeddingDimPerChannel(self, j):
+ o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(8))
+ if o != 0:
+ a = self._tab.Vector(o)
+ return self._tab.Get(flatbuffers.number_types.Int32Flags, a + flatbuffers.number_types.UOffsetTFlags.py_type(j * 4))
+ return 0
+
+ # ConcatEmbeddingsOptions
+ def EmbeddingDimPerChannelAsNumpy(self):
+ o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(8))
+ if o != 0:
+ return self._tab.GetVectorAsNumpy(flatbuffers.number_types.Int32Flags, o)
+ return 0
+
+ # ConcatEmbeddingsOptions
+ def EmbeddingDimPerChannelLength(self):
+ o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(8))
+ if o != 0:
+ return self._tab.VectorLen(o)
+ return 0
+
+def ConcatEmbeddingsOptionsStart(builder): builder.StartObject(3)
+def ConcatEmbeddingsOptionsAddNumChannels(builder, numChannels): builder.PrependInt32Slot(0, numChannels, 0)
+def ConcatEmbeddingsOptionsAddNumColumnsPerChannel(builder, numColumnsPerChannel): builder.PrependUOffsetTRelativeSlot(1, flatbuffers.number_types.UOffsetTFlags.py_type(numColumnsPerChannel), 0)
+def ConcatEmbeddingsOptionsStartNumColumnsPerChannelVector(builder, numElems): return builder.StartVector(4, numElems, 4)
+def ConcatEmbeddingsOptionsAddEmbeddingDimPerChannel(builder, embeddingDimPerChannel): builder.PrependUOffsetTRelativeSlot(2, flatbuffers.number_types.UOffsetTFlags.py_type(embeddingDimPerChannel), 0)
+def ConcatEmbeddingsOptionsStartEmbeddingDimPerChannelVector(builder, numElems): return builder.StartVector(4, numElems, 4)
+def ConcatEmbeddingsOptionsEnd(builder): return builder.EndObject()
diff --git a/ethosu/vela/tflite/ConcatenationOptions.py b/ethosu/vela/tflite/ConcatenationOptions.py
new file mode 100644
index 00000000..c8e0b6ab
--- /dev/null
+++ b/ethosu/vela/tflite/ConcatenationOptions.py
@@ -0,0 +1,38 @@
+# automatically generated by the FlatBuffers compiler, do not modify
+
+# namespace: tflite
+
+import flatbuffers
+
+class ConcatenationOptions(object):
+ __slots__ = ['_tab']
+
+ @classmethod
+ def GetRootAsConcatenationOptions(cls, buf, offset):
+ n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset)
+ x = ConcatenationOptions()
+ x.Init(buf, n + offset)
+ return x
+
+ # ConcatenationOptions
+ def Init(self, buf, pos):
+ self._tab = flatbuffers.table.Table(buf, pos)
+
+ # ConcatenationOptions
+ def Axis(self):
+ o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4))
+ if o != 0:
+ return self._tab.Get(flatbuffers.number_types.Int32Flags, o + self._tab.Pos)
+ return 0
+
+ # ConcatenationOptions
+ def FusedActivationFunction(self):
+ o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(6))
+ if o != 0:
+ return self._tab.Get(flatbuffers.number_types.Int8Flags, o + self._tab.Pos)
+ return 0
+
+def ConcatenationOptionsStart(builder): builder.StartObject(2)
+def ConcatenationOptionsAddAxis(builder, axis): builder.PrependInt32Slot(0, axis, 0)
+def ConcatenationOptionsAddFusedActivationFunction(builder, fusedActivationFunction): builder.PrependInt8Slot(1, fusedActivationFunction, 0)
+def ConcatenationOptionsEnd(builder): return builder.EndObject()
diff --git a/ethosu/vela/tflite/Conv2DOptions.py b/ethosu/vela/tflite/Conv2DOptions.py
new file mode 100644
index 00000000..ef49f751
--- /dev/null
+++ b/ethosu/vela/tflite/Conv2DOptions.py
@@ -0,0 +1,70 @@
+# automatically generated by the FlatBuffers compiler, do not modify
+
+# namespace: tflite
+
+import flatbuffers
+
+class Conv2DOptions(object):
+ __slots__ = ['_tab']
+
+ @classmethod
+ def GetRootAsConv2DOptions(cls, buf, offset):
+ n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset)
+ x = Conv2DOptions()
+ x.Init(buf, n + offset)
+ return x
+
+ # Conv2DOptions
+ def Init(self, buf, pos):
+ self._tab = flatbuffers.table.Table(buf, pos)
+
+ # Conv2DOptions
+ def Padding(self):
+ o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4))
+ if o != 0:
+ return self._tab.Get(flatbuffers.number_types.Int8Flags, o + self._tab.Pos)
+ return 0
+
+ # Conv2DOptions
+ def StrideW(self):
+ o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(6))
+ if o != 0:
+ return self._tab.Get(flatbuffers.number_types.Int32Flags, o + self._tab.Pos)
+ return 0
+
+ # Conv2DOptions
+ def StrideH(self):
+ o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(8))
+ if o != 0:
+ return self._tab.Get(flatbuffers.number_types.Int32Flags, o + self._tab.Pos)
+ return 0
+
+ # Conv2DOptions
+ def FusedActivationFunction(self):
+ o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(10))
+ if o != 0:
+ return self._tab.Get(flatbuffers.number_types.Int8Flags, o + self._tab.Pos)
+ return 0
+
+ # Conv2DOptions
+ def DilationWFactor(self):
+ o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(12))
+ if o != 0:
+ return self._tab.Get(flatbuffers.number_types.Int32Flags, o + self._tab.Pos)
+ return 1
+
+ # Conv2DOptions
+ def DilationHFactor(self):
+ o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(14))
+ if o != 0:
+ return self._tab.Get(flatbuffers.number_types.Int32Flags, o + self._tab.Pos)
+ return 1
+
+def Conv2DOptionsStart(builder): builder.StartObject(6)
+def Conv2DOptionsAddPadding(builder, padding): builder.PrependInt8Slot(0, padding, 0)
+def Conv2DOptionsAddStrideW(builder, strideW): builder.PrependInt32Slot(1, strideW, 0)
+def Conv2DOptionsAddStrideH(builder, strideH): builder.PrependInt32Slot(2, strideH, 0)
+def Conv2DOptionsAddFusedActivationFunction(builder, fusedActivationFunction): builder.PrependInt8Slot(3, fusedActivationFunction, 0)
+def Conv2DOptionsAddDilationWFactor(builder, dilationWFactor): builder.PrependInt32Slot(4, dilationWFactor, 1)
+def Conv2DOptionsAddDilationHFactor(builder, dilationHFactor): builder.PrependInt32Slot(5, dilationHFactor, 1)
+def Conv2DOptionsEnd(builder): return builder.EndObject()
diff --git a/ethosu/vela/tflite/CosOptions.py b/ethosu/vela/tflite/CosOptions.py
new file mode 100644
index 00000000..7fbf8487
--- /dev/null
+++ b/ethosu/vela/tflite/CosOptions.py
@@ -0,0 +1,22 @@
+# automatically generated by the FlatBuffers compiler, do not modify
+
+# namespace: tflite
+
+import flatbuffers
+
+class CosOptions(object):
+ __slots__ = ['_tab']
+
+ @classmethod
+ def GetRootAsCosOptions(cls, buf, offset):
+ n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset)
+ x = CosOptions()
+ x.Init(buf, n + offset)
+ return x
+
+ # CosOptions
+ def Init(self, buf, pos):
+ self._tab = flatbuffers.table.Table(buf, pos)
+
+def CosOptionsStart(builder): builder.StartObject(0)
+def CosOptionsEnd(builder): return builder.EndObject()
diff --git a/ethosu/vela/tflite/CustomOptionsFormat.py b/ethosu/vela/tflite/CustomOptionsFormat.py
new file mode 100644
index 00000000..c2fc07c2
--- /dev/null
+++ b/ethosu/vela/tflite/CustomOptionsFormat.py
@@ -0,0 +1,6 @@
+# automatically generated by the FlatBuffers compiler, do not modify
+
+# namespace: tflite
+
+class CustomOptionsFormat(object):
+ FLEXBUFFERS = 0
diff --git a/ethosu/vela/tflite/CustomQuantization.py b/ethosu/vela/tflite/CustomQuantization.py
new file mode 100644
index 00000000..21ec0da4
--- /dev/null
+++ b/ethosu/vela/tflite/CustomQuantization.py
@@ -0,0 +1,46 @@
+# automatically generated by the FlatBuffers compiler, do not modify
+
+# namespace: tflite
+
+import flatbuffers
+
+class CustomQuantization(object):
+ __slots__ = ['_tab']
+
+ @classmethod
+ def GetRootAsCustomQuantization(cls, buf, offset):
+ n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset)
+ x = CustomQuantization()
+ x.Init(buf, n + offset)
+ return x
+
+ # CustomQuantization
+ def Init(self, buf, pos):
+ self._tab = flatbuffers.table.Table(buf, pos)
+
+ # CustomQuantization
+ def Custom(self, j):
+ o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4))
+ if o != 0:
+ a = self._tab.Vector(o)
+ return self._tab.Get(flatbuffers.number_types.Uint8Flags, a + flatbuffers.number_types.UOffsetTFlags.py_type(j * 1))
+ return 0
+
+ # CustomQuantization
+ def CustomAsNumpy(self):
+ o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4))
+ if o != 0:
+ return self._tab.GetVectorAsNumpy(flatbuffers.number_types.Uint8Flags, o)
+ return 0
+
+ # CustomQuantization
+ def CustomLength(self):
+ o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4))
+ if o != 0:
+ return self._tab.VectorLen(o)
+ return 0
+
+def CustomQuantizationStart(builder): builder.StartObject(1)
+def CustomQuantizationAddCustom(builder, custom): builder.PrependUOffsetTRelativeSlot(0, flatbuffers.number_types.UOffsetTFlags.py_type(custom), 0)
+def CustomQuantizationStartCustomVector(builder, numElems): return builder.StartVector(1, numElems, 1)
+def CustomQuantizationEnd(builder): return builder.EndObject()
diff --git a/ethosu/vela/tflite/DensifyOptions.py b/ethosu/vela/tflite/DensifyOptions.py
new file mode 100644
index 00000000..12cbfb29
--- /dev/null
+++ b/ethosu/vela/tflite/DensifyOptions.py
@@ -0,0 +1,22 @@
+# automatically generated by the FlatBuffers compiler, do not modify
+
+# namespace: tflite
+
+import flatbuffers
+
+class DensifyOptions(object):
+ __slots__ = ['_tab']
+
+ @classmethod
+ def GetRootAsDensifyOptions(cls, buf, offset):
+ n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset)
+ x = DensifyOptions()
+ x.Init(buf, n + offset)
+ return x
+
+ # DensifyOptions
+ def Init(self, buf, pos):
+ self._tab = flatbuffers.table.Table(buf, pos)
+
+def DensifyOptionsStart(builder): builder.StartObject(0)
+def DensifyOptionsEnd(builder): return builder.EndObject()
diff --git a/ethosu/vela/tflite/DepthToSpaceOptions.py b/ethosu/vela/tflite/DepthToSpaceOptions.py
new file mode 100644
index 00000000..97b93aa7
--- /dev/null
+++ b/ethosu/vela/tflite/DepthToSpaceOptions.py
@@ -0,0 +1,30 @@
+# automatically generated by the FlatBuffers compiler, do not modify
+
+# namespace: tflite
+
+import flatbuffers
+
+class DepthToSpaceOptions(object):
+ __slots__ = ['_tab']
+
+ @classmethod
+ def GetRootAsDepthToSpaceOptions(cls, buf, offset):
+ n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset)
+ x = DepthToSpaceOptions()
+ x.Init(buf, n + offset)
+ return x
+
+ # DepthToSpaceOptions
+ def Init(self, buf, pos):
+ self._tab = flatbuffers.table.Table(buf, pos)
+
+ # DepthToSpaceOptions
+ def BlockSize(self):
+ o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4))
+ if o != 0:
+ return self._tab.Get(flatbuffers.number_types.Int32Flags, o + self._tab.Pos)
+ return 0
+
+def DepthToSpaceOptionsStart(builder): builder.StartObject(1)
+def DepthToSpaceOptionsAddBlockSize(builder, blockSize): builder.PrependInt32Slot(0, blockSize, 0)
+def DepthToSpaceOptionsEnd(builder): return builder.EndObject()
diff --git a/ethosu/vela/tflite/DepthwiseConv2DOptions.py b/ethosu/vela/tflite/DepthwiseConv2DOptions.py
new file mode 100644
index 00000000..9689383b
--- /dev/null
+++ b/ethosu/vela/tflite/DepthwiseConv2DOptions.py
@@ -0,0 +1,78 @@
+# automatically generated by the FlatBuffers compiler, do not modify
+
+# namespace: tflite
+
+import flatbuffers
+
+class DepthwiseConv2DOptions(object):
+ __slots__ = ['_tab']
+
+ @classmethod
+ def GetRootAsDepthwiseConv2DOptions(cls, buf, offset):
+ n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset)
+ x = DepthwiseConv2DOptions()
+ x.Init(buf, n + offset)
+ return x
+
+ # DepthwiseConv2DOptions
+ def Init(self, buf, pos):
+ self._tab = flatbuffers.table.Table(buf, pos)
+
+ # DepthwiseConv2DOptions
+ def Padding(self):
+ o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4))
+ if o != 0:
+ return self._tab.Get(flatbuffers.number_types.Int8Flags, o + self._tab.Pos)
+ return 0
+
+ # DepthwiseConv2DOptions
+ def StrideW(self):
+ o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(6))
+ if o != 0:
+ return self._tab.Get(flatbuffers.number_types.Int32Flags, o + self._tab.Pos)
+ return 0
+
+ # DepthwiseConv2DOptions
+ def StrideH(self):
+ o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(8))
+ if o != 0:
+ return self._tab.Get(flatbuffers.number_types.Int32Flags, o + self._tab.Pos)
+ return 0
+
+ # DepthwiseConv2DOptions
+ def DepthMultiplier(self):
+ o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(10))
+ if o != 0:
+ return self._tab.Get(flatbuffers.number_types.Int32Flags, o + self._tab.Pos)
+ return 0
+
+ # DepthwiseConv2DOptions
+ def FusedActivationFunction(self):
+ o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(12))
+ if o != 0:
+ return self._tab.Get(flatbuffers.number_types.Int8Flags, o + self._tab.Pos)
+ return 0
+
+ # DepthwiseConv2DOptions
+ def DilationWFactor(self):
+ o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(14))
+ if o != 0:
+ return self._tab.Get(flatbuffers.number_types.Int32Flags, o + self._tab.Pos)
+ return 1
+
+ # DepthwiseConv2DOptions
+ def DilationHFactor(self):
+ o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(16))
+ if o != 0:
+ return self._tab.Get(flatbuffers.number_types.Int32Flags, o + self._tab.Pos)
+ return 1
+
+def DepthwiseConv2DOptionsStart(builder): builder.StartObject(7)
+def DepthwiseConv2DOptionsAddPadding(builder, padding): builder.PrependInt8Slot(0, padding, 0)
+def DepthwiseConv2DOptionsAddStrideW(builder, strideW): builder.PrependInt32Slot(1, strideW, 0)
+def DepthwiseConv2DOptionsAddStrideH(builder, strideH): builder.PrependInt32Slot(2, strideH, 0)
+def DepthwiseConv2DOptionsAddDepthMultiplier(builder, depthMultiplier): builder.PrependInt32Slot(3, depthMultiplier, 0)
+def DepthwiseConv2DOptionsAddFusedActivationFunction(builder, fusedActivationFunction): builder.PrependInt8Slot(4, fusedActivationFunction, 0)
+def DepthwiseConv2DOptionsAddDilationWFactor(builder, dilationWFactor): builder.PrependInt32Slot(5, dilationWFactor, 1)
+def DepthwiseConv2DOptionsAddDilationHFactor(builder, dilationHFactor): builder.PrependInt32Slot(6, dilationHFactor, 1)
+def DepthwiseConv2DOptionsEnd(builder): return builder.EndObject()
diff --git a/ethosu/vela/tflite/DequantizeOptions.py b/ethosu/vela/tflite/DequantizeOptions.py
new file mode 100644
index 00000000..5ef8b8dd
--- /dev/null
+++ b/ethosu/vela/tflite/DequantizeOptions.py
@@ -0,0 +1,22 @@
+# automatically generated by the FlatBuffers compiler, do not modify
+
+# namespace: tflite
+
+import flatbuffers
+
+class DequantizeOptions(object):
+ __slots__ = ['_tab']
+
+ @classmethod
+ def GetRootAsDequantizeOptions(cls, buf, offset):
+ n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset)
+ x = DequantizeOptions()
+ x.Init(buf, n + offset)
+ return x
+
+ # DequantizeOptions
+ def Init(self, buf, pos):
+ self._tab = flatbuffers.table.Table(buf, pos)
+
+def DequantizeOptionsStart(builder): builder.StartObject(0)
+def DequantizeOptionsEnd(builder): return builder.EndObject()
diff --git a/ethosu/vela/tflite/DimensionMetadata.py b/ethosu/vela/tflite/DimensionMetadata.py
new file mode 100644
index 00000000..c9fe7cd6
--- /dev/null
+++ b/ethosu/vela/tflite/DimensionMetadata.py
@@ -0,0 +1,76 @@
+# automatically generated by the FlatBuffers compiler, do not modify
+
+# namespace: tflite
+
+import flatbuffers
+
+class DimensionMetadata(object):
+ __slots__ = ['_tab']
+
+ @classmethod
+ def GetRootAsDimensionMetadata(cls, buf, offset):
+ n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset)
+ x = DimensionMetadata()
+ x.Init(buf, n + offset)
+ return x
+
+ # DimensionMetadata
+ def Init(self, buf, pos):
+ self._tab = flatbuffers.table.Table(buf, pos)
+
+ # DimensionMetadata
+ def Format(self):
+ o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4))
+ if o != 0:
+ return self._tab.Get(flatbuffers.number_types.Int8Flags, o + self._tab.Pos)
+ return 0
+
+ # DimensionMetadata
+ def DenseSize(self):
+ o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(6))
+ if o != 0:
+ return self._tab.Get(flatbuffers.number_types.Int32Flags, o + self._tab.Pos)
+ return 0
+
+ # DimensionMetadata
+ def ArraySegmentsType(self):
+ o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(8))
+ if o != 0:
+ return self._tab.Get(flatbuffers.number_types.Uint8Flags, o + self._tab.Pos)
+ return 0
+
+ # DimensionMetadata
+ def ArraySegments(self):
+ o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(10))
+ if o != 0:
+ from flatbuffers.table import Table
+ obj = Table(bytearray(), 0)
+ self._tab.Union(obj, o)
+ return obj
+ return None
+
+ # DimensionMetadata
+ def ArrayIndicesType(self):
+ o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(12))
+ if o != 0:
+ return self._tab.Get(flatbuffers.number_types.Uint8Flags, o + self._tab.Pos)
+ return 0
+
+ # DimensionMetadata
+ def ArrayIndices(self):
+ o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(14))
+ if o != 0:
+ from flatbuffers.table import Table
+ obj = Table(bytearray(), 0)
+ self._tab.Union(obj, o)
+ return obj
+ return None
+
+def DimensionMetadataStart(builder): builder.StartObject(6)
+def DimensionMetadataAddFormat(builder, format): builder.PrependInt8Slot(0, format, 0)
+def DimensionMetadataAddDenseSize(builder, denseSize): builder.PrependInt32Slot(1, denseSize, 0)
+def DimensionMetadataAddArraySegmentsType(builder, arraySegmentsType): builder.PrependUint8Slot(2, arraySegmentsType, 0)
+def DimensionMetadataAddArraySegments(builder, arraySegments): builder.PrependUOffsetTRelativeSlot(3, flatbuffers.number_types.UOffsetTFlags.py_type(arraySegments), 0)
+def DimensionMetadataAddArrayIndicesType(builder, arrayIndicesType): builder.PrependUint8Slot(4, arrayIndicesType, 0)
+def DimensionMetadataAddArrayIndices(builder, arrayIndices): builder.PrependUOffsetTRelativeSlot(5, flatbuffers.number_types.UOffsetTFlags.py_type(arrayIndices), 0)
+def DimensionMetadataEnd(builder): return builder.EndObject()
diff --git a/ethosu/vela/tflite/DimensionType.py b/ethosu/vela/tflite/DimensionType.py
new file mode 100644
index 00000000..310d8eed
--- /dev/null
+++ b/ethosu/vela/tflite/DimensionType.py
@@ -0,0 +1,7 @@
+# automatically generated by the FlatBuffers compiler, do not modify
+
+# namespace: tflite
+
+class DimensionType(object):
+ DENSE = 0
+ SPARSE_CSR = 1
diff --git a/ethosu/vela/tflite/DivOptions.py b/ethosu/vela/tflite/DivOptions.py
new file mode 100644
index 00000000..905a3be0
--- /dev/null
+++ b/ethosu/vela/tflite/DivOptions.py
@@ -0,0 +1,30 @@
+# automatically generated by the FlatBuffers compiler, do not modify
+
+# namespace: tflite
+
+import flatbuffers
+
+class DivOptions(object):
+ __slots__ = ['_tab']
+
+ @classmethod
+ def GetRootAsDivOptions(cls, buf, offset):
+ n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset)
+ x = DivOptions()
+ x.Init(buf, n + offset)
+ return x
+
+ # DivOptions
+ def Init(self, buf, pos):
+ self._tab = flatbuffers.table.Table(buf, pos)
+
+ # DivOptions
+ def FusedActivationFunction(self):
+ o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4))
+ if o != 0:
+ return self._tab.Get(flatbuffers.number_types.Int8Flags, o + self._tab.Pos)
+ return 0
+
+def DivOptionsStart(builder): builder.StartObject(1)
+def DivOptionsAddFusedActivationFunction(builder, fusedActivationFunction): builder.PrependInt8Slot(0, fusedActivationFunction, 0)
+def DivOptionsEnd(builder): return builder.EndObject()
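
The generated option modules above all follow the same flatc builder/accessor pattern. As a minimal sketch (illustrative only, not part of this patch), and assuming the flatbuffers Python runtime plus the ActivationFunctionType enum generated from the same schema (where RELU is 1), a DivOptions table can be built and read back like this:

# Illustrative sketch only -- not part of the patch.
import flatbuffers
from ethosu.vela.tflite import DivOptions
from ethosu.vela.tflite.ActivationFunctionType import ActivationFunctionType  # assumed enum, RELU == 1

builder = flatbuffers.Builder(0)
DivOptions.DivOptionsStart(builder)
DivOptions.DivOptionsAddFusedActivationFunction(builder, ActivationFunctionType.RELU)
builder.Finish(DivOptions.DivOptionsEnd(builder))

# Output() returns the finished buffer with the root table offset at position 0.
parsed = DivOptions.DivOptions.GetRootAsDivOptions(builder.Output(), 0)
print(parsed.FusedActivationFunction())  # prints 1 (RELU); omitted fields fall back to the schema default
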
diff --git a/ethosu/vela/tflite/EmbeddingLookupSparseOptions.py b/ethosu/vela/tflite/EmbeddingLookupSparseOptions.py
new file mode 100644
index 00000000..7d9c1442
--- /dev/null
+++ b/ethosu/vela/tflite/EmbeddingLookupSparseOptions.py
@@ -0,0 +1,30 @@
+# automatically generated by the FlatBuffers compiler, do not modify
+
+# namespace: tflite
+
+import flatbuffers
+
+class EmbeddingLookupSparseOptions(object):
+ __slots__ = ['_tab']
+
+ @classmethod
+ def GetRootAsEmbeddingLookupSparseOptions(cls, buf, offset):
+ n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset)
+ x = EmbeddingLookupSparseOptions()
+ x.Init(buf, n + offset)
+ return x
+
+ # EmbeddingLookupSparseOptions
+ def Init(self, buf, pos):
+ self._tab = flatbuffers.table.Table(buf, pos)
+
+ # EmbeddingLookupSparseOptions
+ def Combiner(self):
+ o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4))
+ if o != 0:
+ return self._tab.Get(flatbuffers.number_types.Int8Flags, o + self._tab.Pos)
+ return 0
+
+def EmbeddingLookupSparseOptionsStart(builder): builder.StartObject(1)
+def EmbeddingLookupSparseOptionsAddCombiner(builder, combiner): builder.PrependInt8Slot(0, combiner, 0)
+def EmbeddingLookupSparseOptionsEnd(builder): return builder.EndObject()
diff --git a/ethosu/vela/tflite/EqualOptions.py b/ethosu/vela/tflite/EqualOptions.py
new file mode 100644
index 00000000..f787ef85
--- /dev/null
+++ b/ethosu/vela/tflite/EqualOptions.py
@@ -0,0 +1,22 @@
+# automatically generated by the FlatBuffers compiler, do not modify
+
+# namespace: tflite
+
+import flatbuffers
+
+class EqualOptions(object):
+ __slots__ = ['_tab']
+
+ @classmethod
+ def GetRootAsEqualOptions(cls, buf, offset):
+ n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset)
+ x = EqualOptions()
+ x.Init(buf, n + offset)
+ return x
+
+ # EqualOptions
+ def Init(self, buf, pos):
+ self._tab = flatbuffers.table.Table(buf, pos)
+
+def EqualOptionsStart(builder): builder.StartObject(0)
+def EqualOptionsEnd(builder): return builder.EndObject()
diff --git a/ethosu/vela/tflite/ExpOptions.py b/ethosu/vela/tflite/ExpOptions.py
new file mode 100644
index 00000000..eac1456e
--- /dev/null
+++ b/ethosu/vela/tflite/ExpOptions.py
@@ -0,0 +1,22 @@
+# automatically generated by the FlatBuffers compiler, do not modify
+
+# namespace: tflite
+
+import flatbuffers
+
+class ExpOptions(object):
+ __slots__ = ['_tab']
+
+ @classmethod
+ def GetRootAsExpOptions(cls, buf, offset):
+ n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset)
+ x = ExpOptions()
+ x.Init(buf, n + offset)
+ return x
+
+ # ExpOptions
+ def Init(self, buf, pos):
+ self._tab = flatbuffers.table.Table(buf, pos)
+
+def ExpOptionsStart(builder): builder.StartObject(0)
+def ExpOptionsEnd(builder): return builder.EndObject()
diff --git a/ethosu/vela/tflite/ExpandDimsOptions.py b/ethosu/vela/tflite/ExpandDimsOptions.py
new file mode 100644
index 00000000..69d63665
--- /dev/null
+++ b/ethosu/vela/tflite/ExpandDimsOptions.py
@@ -0,0 +1,22 @@
+# automatically generated by the FlatBuffers compiler, do not modify
+
+# namespace: tflite
+
+import flatbuffers
+
+class ExpandDimsOptions(object):
+ __slots__ = ['_tab']
+
+ @classmethod
+ def GetRootAsExpandDimsOptions(cls, buf, offset):
+ n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset)
+ x = ExpandDimsOptions()
+ x.Init(buf, n + offset)
+ return x
+
+ # ExpandDimsOptions
+ def Init(self, buf, pos):
+ self._tab = flatbuffers.table.Table(buf, pos)
+
+def ExpandDimsOptionsStart(builder): builder.StartObject(0)
+def ExpandDimsOptionsEnd(builder): return builder.EndObject()
diff --git a/ethosu/vela/tflite/FakeQuantOptions.py b/ethosu/vela/tflite/FakeQuantOptions.py
new file mode 100644
index 00000000..46c371c3
--- /dev/null
+++ b/ethosu/vela/tflite/FakeQuantOptions.py
@@ -0,0 +1,54 @@
+# automatically generated by the FlatBuffers compiler, do not modify
+
+# namespace: tflite
+
+import flatbuffers
+
+class FakeQuantOptions(object):
+ __slots__ = ['_tab']
+
+ @classmethod
+ def GetRootAsFakeQuantOptions(cls, buf, offset):
+ n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset)
+ x = FakeQuantOptions()
+ x.Init(buf, n + offset)
+ return x
+
+ # FakeQuantOptions
+ def Init(self, buf, pos):
+ self._tab = flatbuffers.table.Table(buf, pos)
+
+ # FakeQuantOptions
+ def Min(self):
+ o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4))
+ if o != 0:
+ return self._tab.Get(flatbuffers.number_types.Float32Flags, o + self._tab.Pos)
+ return 0.0
+
+ # FakeQuantOptions
+ def Max(self):
+ o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(6))
+ if o != 0:
+ return self._tab.Get(flatbuffers.number_types.Float32Flags, o + self._tab.Pos)
+ return 0.0
+
+ # FakeQuantOptions
+ def NumBits(self):
+ o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(8))
+ if o != 0:
+ return self._tab.Get(flatbuffers.number_types.Int32Flags, o + self._tab.Pos)
+ return 0
+
+ # FakeQuantOptions
+ def NarrowRange(self):
+ o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(10))
+ if o != 0:
+ return bool(self._tab.Get(flatbuffers.number_types.BoolFlags, o + self._tab.Pos))
+ return False
+
+def FakeQuantOptionsStart(builder): builder.StartObject(4)
+def FakeQuantOptionsAddMin(builder, min): builder.PrependFloat32Slot(0, min, 0.0)
+def FakeQuantOptionsAddMax(builder, max): builder.PrependFloat32Slot(1, max, 0.0)
+def FakeQuantOptionsAddNumBits(builder, numBits): builder.PrependInt32Slot(2, numBits, 0)
+def FakeQuantOptionsAddNarrowRange(builder, narrowRange): builder.PrependBoolSlot(3, narrowRange, 0)
+def FakeQuantOptionsEnd(builder): return builder.EndObject()
diff --git a/ethosu/vela/tflite/FillOptions.py b/ethosu/vela/tflite/FillOptions.py
new file mode 100644
index 00000000..5a1e651a
--- /dev/null
+++ b/ethosu/vela/tflite/FillOptions.py
@@ -0,0 +1,22 @@
+# automatically generated by the FlatBuffers compiler, do not modify
+
+# namespace: tflite
+
+import flatbuffers
+
+class FillOptions(object):
+ __slots__ = ['_tab']
+
+ @classmethod
+ def GetRootAsFillOptions(cls, buf, offset):
+ n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset)
+ x = FillOptions()
+ x.Init(buf, n + offset)
+ return x
+
+ # FillOptions
+ def Init(self, buf, pos):
+ self._tab = flatbuffers.table.Table(buf, pos)
+
+def FillOptionsStart(builder): builder.StartObject(0)
+def FillOptionsEnd(builder): return builder.EndObject()
diff --git a/ethosu/vela/tflite/FloorDivOptions.py b/ethosu/vela/tflite/FloorDivOptions.py
new file mode 100644
index 00000000..64b474fb
--- /dev/null
+++ b/ethosu/vela/tflite/FloorDivOptions.py
@@ -0,0 +1,22 @@
+# automatically generated by the FlatBuffers compiler, do not modify
+
+# namespace: tflite
+
+import flatbuffers
+
+class FloorDivOptions(object):
+ __slots__ = ['_tab']
+
+ @classmethod
+ def GetRootAsFloorDivOptions(cls, buf, offset):
+ n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset)
+ x = FloorDivOptions()
+ x.Init(buf, n + offset)
+ return x
+
+ # FloorDivOptions
+ def Init(self, buf, pos):
+ self._tab = flatbuffers.table.Table(buf, pos)
+
+def FloorDivOptionsStart(builder): builder.StartObject(0)
+def FloorDivOptionsEnd(builder): return builder.EndObject()
diff --git a/ethosu/vela/tflite/FloorModOptions.py b/ethosu/vela/tflite/FloorModOptions.py
new file mode 100644
index 00000000..37c8e5a5
--- /dev/null
+++ b/ethosu/vela/tflite/FloorModOptions.py
@@ -0,0 +1,22 @@
+# automatically generated by the FlatBuffers compiler, do not modify
+
+# namespace: tflite
+
+import flatbuffers
+
+class FloorModOptions(object):
+ __slots__ = ['_tab']
+
+ @classmethod
+ def GetRootAsFloorModOptions(cls, buf, offset):
+ n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset)
+ x = FloorModOptions()
+ x.Init(buf, n + offset)
+ return x
+
+ # FloorModOptions
+ def Init(self, buf, pos):
+ self._tab = flatbuffers.table.Table(buf, pos)
+
+def FloorModOptionsStart(builder): builder.StartObject(0)
+def FloorModOptionsEnd(builder): return builder.EndObject()
diff --git a/ethosu/vela/tflite/FullyConnectedOptions.py b/ethosu/vela/tflite/FullyConnectedOptions.py
new file mode 100644
index 00000000..a6b4e40f
--- /dev/null
+++ b/ethosu/vela/tflite/FullyConnectedOptions.py
@@ -0,0 +1,46 @@
+# automatically generated by the FlatBuffers compiler, do not modify
+
+# namespace: tflite
+
+import flatbuffers
+
+class FullyConnectedOptions(object):
+ __slots__ = ['_tab']
+
+ @classmethod
+ def GetRootAsFullyConnectedOptions(cls, buf, offset):
+ n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset)
+ x = FullyConnectedOptions()
+ x.Init(buf, n + offset)
+ return x
+
+ # FullyConnectedOptions
+ def Init(self, buf, pos):
+ self._tab = flatbuffers.table.Table(buf, pos)
+
+ # FullyConnectedOptions
+ def FusedActivationFunction(self):
+ o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4))
+ if o != 0:
+ return self._tab.Get(flatbuffers.number_types.Int8Flags, o + self._tab.Pos)
+ return 0
+
+ # FullyConnectedOptions
+ def WeightsFormat(self):
+ o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(6))
+ if o != 0:
+ return self._tab.Get(flatbuffers.number_types.Int8Flags, o + self._tab.Pos)
+ return 0
+
+ # FullyConnectedOptions
+ def KeepNumDims(self):
+ o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(8))
+ if o != 0:
+ return bool(self._tab.Get(flatbuffers.number_types.BoolFlags, o + self._tab.Pos))
+ return False
+
+def FullyConnectedOptionsStart(builder): builder.StartObject(3)
+def FullyConnectedOptionsAddFusedActivationFunction(builder, fusedActivationFunction): builder.PrependInt8Slot(0, fusedActivationFunction, 0)
+def FullyConnectedOptionsAddWeightsFormat(builder, weightsFormat): builder.PrependInt8Slot(1, weightsFormat, 0)
+def FullyConnectedOptionsAddKeepNumDims(builder, keepNumDims): builder.PrependBoolSlot(2, keepNumDims, 0)
+def FullyConnectedOptionsEnd(builder): return builder.EndObject()
diff --git a/ethosu/vela/tflite/FullyConnectedOptionsWeightsFormat.py b/ethosu/vela/tflite/FullyConnectedOptionsWeightsFormat.py
new file mode 100644
index 00000000..d9a53887
--- /dev/null
+++ b/ethosu/vela/tflite/FullyConnectedOptionsWeightsFormat.py
@@ -0,0 +1,7 @@
+# automatically generated by the FlatBuffers compiler, do not modify
+
+# namespace: tflite
+
+class FullyConnectedOptionsWeightsFormat(object):
+ DEFAULT = 0
+ SHUFFLED4x16INT8 = 1
diff --git a/ethosu/vela/tflite/GatherNdOptions.py b/ethosu/vela/tflite/GatherNdOptions.py
new file mode 100644
index 00000000..f515eb5c
--- /dev/null
+++ b/ethosu/vela/tflite/GatherNdOptions.py
@@ -0,0 +1,22 @@
+# automatically generated by the FlatBuffers compiler, do not modify
+
+# namespace: tflite
+
+import flatbuffers
+
+class GatherNdOptions(object):
+ __slots__ = ['_tab']
+
+ @classmethod
+ def GetRootAsGatherNdOptions(cls, buf, offset):
+ n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset)
+ x = GatherNdOptions()
+ x.Init(buf, n + offset)
+ return x
+
+ # GatherNdOptions
+ def Init(self, buf, pos):
+ self._tab = flatbuffers.table.Table(buf, pos)
+
+def GatherNdOptionsStart(builder): builder.StartObject(0)
+def GatherNdOptionsEnd(builder): return builder.EndObject()
diff --git a/ethosu/vela/tflite/GatherOptions.py b/ethosu/vela/tflite/GatherOptions.py
new file mode 100644
index 00000000..9fbc3e40
--- /dev/null
+++ b/ethosu/vela/tflite/GatherOptions.py
@@ -0,0 +1,30 @@
+# automatically generated by the FlatBuffers compiler, do not modify
+
+# namespace: tflite
+
+import flatbuffers
+
+class GatherOptions(object):
+ __slots__ = ['_tab']
+
+ @classmethod
+ def GetRootAsGatherOptions(cls, buf, offset):
+ n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset)
+ x = GatherOptions()
+ x.Init(buf, n + offset)
+ return x
+
+ # GatherOptions
+ def Init(self, buf, pos):
+ self._tab = flatbuffers.table.Table(buf, pos)
+
+ # GatherOptions
+ def Axis(self):
+ o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4))
+ if o != 0:
+ return self._tab.Get(flatbuffers.number_types.Int32Flags, o + self._tab.Pos)
+ return 0
+
+def GatherOptionsStart(builder): builder.StartObject(1)
+def GatherOptionsAddAxis(builder, axis): builder.PrependInt32Slot(0, axis, 0)
+def GatherOptionsEnd(builder): return builder.EndObject()
diff --git a/ethosu/vela/tflite/GreaterEqualOptions.py b/ethosu/vela/tflite/GreaterEqualOptions.py
new file mode 100644
index 00000000..a29e200a
--- /dev/null
+++ b/ethosu/vela/tflite/GreaterEqualOptions.py
@@ -0,0 +1,22 @@
+# automatically generated by the FlatBuffers compiler, do not modify
+
+# namespace: tflite
+
+import flatbuffers
+
+class GreaterEqualOptions(object):
+ __slots__ = ['_tab']
+
+ @classmethod
+ def GetRootAsGreaterEqualOptions(cls, buf, offset):
+ n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset)
+ x = GreaterEqualOptions()
+ x.Init(buf, n + offset)
+ return x
+
+ # GreaterEqualOptions
+ def Init(self, buf, pos):
+ self._tab = flatbuffers.table.Table(buf, pos)
+
+def GreaterEqualOptionsStart(builder): builder.StartObject(0)
+def GreaterEqualOptionsEnd(builder): return builder.EndObject()
diff --git a/ethosu/vela/tflite/GreaterOptions.py b/ethosu/vela/tflite/GreaterOptions.py
new file mode 100644
index 00000000..59d63501
--- /dev/null
+++ b/ethosu/vela/tflite/GreaterOptions.py
@@ -0,0 +1,22 @@
+# automatically generated by the FlatBuffers compiler, do not modify
+
+# namespace: tflite
+
+import flatbuffers
+
+class GreaterOptions(object):
+ __slots__ = ['_tab']
+
+ @classmethod
+ def GetRootAsGreaterOptions(cls, buf, offset):
+ n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset)
+ x = GreaterOptions()
+ x.Init(buf, n + offset)
+ return x
+
+ # GreaterOptions
+ def Init(self, buf, pos):
+ self._tab = flatbuffers.table.Table(buf, pos)
+
+def GreaterOptionsStart(builder): builder.StartObject(0)
+def GreaterOptionsEnd(builder): return builder.EndObject()
diff --git a/ethosu/vela/tflite/HardSwishOptions.py b/ethosu/vela/tflite/HardSwishOptions.py
new file mode 100644
index 00000000..4f6a5200
--- /dev/null
+++ b/ethosu/vela/tflite/HardSwishOptions.py
@@ -0,0 +1,22 @@
+# automatically generated by the FlatBuffers compiler, do not modify
+
+# namespace: tflite
+
+import flatbuffers
+
+class HardSwishOptions(object):
+ __slots__ = ['_tab']
+
+ @classmethod
+ def GetRootAsHardSwishOptions(cls, buf, offset):
+ n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset)
+ x = HardSwishOptions()
+ x.Init(buf, n + offset)
+ return x
+
+ # HardSwishOptions
+ def Init(self, buf, pos):
+ self._tab = flatbuffers.table.Table(buf, pos)
+
+def HardSwishOptionsStart(builder): builder.StartObject(0)
+def HardSwishOptionsEnd(builder): return builder.EndObject()
diff --git a/ethosu/vela/tflite/IfOptions.py b/ethosu/vela/tflite/IfOptions.py
new file mode 100644
index 00000000..13f4e697
--- /dev/null
+++ b/ethosu/vela/tflite/IfOptions.py
@@ -0,0 +1,38 @@
+# automatically generated by the FlatBuffers compiler, do not modify
+
+# namespace: tflite
+
+import flatbuffers
+
+class IfOptions(object):
+ __slots__ = ['_tab']
+
+ @classmethod
+ def GetRootAsIfOptions(cls, buf, offset):
+ n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset)
+ x = IfOptions()
+ x.Init(buf, n + offset)
+ return x
+
+ # IfOptions
+ def Init(self, buf, pos):
+ self._tab = flatbuffers.table.Table(buf, pos)
+
+ # IfOptions
+ def ThenSubgraphIndex(self):
+ o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4))
+ if o != 0:
+ return self._tab.Get(flatbuffers.number_types.Int32Flags, o + self._tab.Pos)
+ return 0
+
+ # IfOptions
+ def ElseSubgraphIndex(self):
+ o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(6))
+ if o != 0:
+ return self._tab.Get(flatbuffers.number_types.Int32Flags, o + self._tab.Pos)
+ return 0
+
+def IfOptionsStart(builder): builder.StartObject(2)
+def IfOptionsAddThenSubgraphIndex(builder, thenSubgraphIndex): builder.PrependInt32Slot(0, thenSubgraphIndex, 0)
+def IfOptionsAddElseSubgraphIndex(builder, elseSubgraphIndex): builder.PrependInt32Slot(1, elseSubgraphIndex, 0)
+def IfOptionsEnd(builder): return builder.EndObject()
diff --git a/ethosu/vela/tflite/Int32Vector.py b/ethosu/vela/tflite/Int32Vector.py
new file mode 100644
index 00000000..e70851b2
--- /dev/null
+++ b/ethosu/vela/tflite/Int32Vector.py
@@ -0,0 +1,46 @@
+# automatically generated by the FlatBuffers compiler, do not modify
+
+# namespace: tflite
+
+import flatbuffers
+
+class Int32Vector(object):
+ __slots__ = ['_tab']
+
+ @classmethod
+ def GetRootAsInt32Vector(cls, buf, offset):
+ n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset)
+ x = Int32Vector()
+ x.Init(buf, n + offset)
+ return x
+
+ # Int32Vector
+ def Init(self, buf, pos):
+ self._tab = flatbuffers.table.Table(buf, pos)
+
+ # Int32Vector
+ def Values(self, j):
+ o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4))
+ if o != 0:
+ a = self._tab.Vector(o)
+ return self._tab.Get(flatbuffers.number_types.Int32Flags, a + flatbuffers.number_types.UOffsetTFlags.py_type(j * 4))
+ return 0
+
+ # Int32Vector
+ def ValuesAsNumpy(self):
+ o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4))
+ if o != 0:
+ return self._tab.GetVectorAsNumpy(flatbuffers.number_types.Int32Flags, o)
+ return 0
+
+ # Int32Vector
+ def ValuesLength(self):
+ o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4))
+ if o != 0:
+ return self._tab.VectorLen(o)
+ return 0
+
+def Int32VectorStart(builder): builder.StartObject(1)
+def Int32VectorAddValues(builder, values): builder.PrependUOffsetTRelativeSlot(0, flatbuffers.number_types.UOffsetTFlags.py_type(values), 0)
+def Int32VectorStartValuesVector(builder, numElems): return builder.StartVector(4, numElems, 4)
+def Int32VectorEnd(builder): return builder.EndObject()
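
Vector-valued fields use the generated Start...Vector helper together with the builder's Prepend/EndVector calls. A minimal round-trip sketch (not part of this patch), assuming the flatbuffers 1.x runtime contemporary with this commit, in which EndVector takes the element count:

# Illustrative sketch only -- not part of the patch.
import flatbuffers
from ethosu.vela.tflite import Int32Vector

builder = flatbuffers.Builder(0)
values = [7, 8, 9]
Int32Vector.Int32VectorStartValuesVector(builder, len(values))
for v in reversed(values):            # FlatBuffers vectors are built back-to-front
    builder.PrependInt32(v)
vec = builder.EndVector(len(values))  # flatbuffers 1.x signature

Int32Vector.Int32VectorStart(builder)
Int32Vector.Int32VectorAddValues(builder, vec)
builder.Finish(Int32Vector.Int32VectorEnd(builder))

parsed = Int32Vector.Int32Vector.GetRootAsInt32Vector(builder.Output(), 0)
print([parsed.Values(i) for i in range(parsed.ValuesLength())])  # [7, 8, 9]
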
diff --git a/ethosu/vela/tflite/L2NormOptions.py b/ethosu/vela/tflite/L2NormOptions.py
new file mode 100644
index 00000000..38bdf573
--- /dev/null
+++ b/ethosu/vela/tflite/L2NormOptions.py
@@ -0,0 +1,30 @@
+# automatically generated by the FlatBuffers compiler, do not modify
+
+# namespace: tflite
+
+import flatbuffers
+
+class L2NormOptions(object):
+ __slots__ = ['_tab']
+
+ @classmethod
+ def GetRootAsL2NormOptions(cls, buf, offset):
+ n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset)
+ x = L2NormOptions()
+ x.Init(buf, n + offset)
+ return x
+
+ # L2NormOptions
+ def Init(self, buf, pos):
+ self._tab = flatbuffers.table.Table(buf, pos)
+
+ # L2NormOptions
+ def FusedActivationFunction(self):
+ o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4))
+ if o != 0:
+ return self._tab.Get(flatbuffers.number_types.Int8Flags, o + self._tab.Pos)
+ return 0
+
+def L2NormOptionsStart(builder): builder.StartObject(1)
+def L2NormOptionsAddFusedActivationFunction(builder, fusedActivationFunction): builder.PrependInt8Slot(0, fusedActivationFunction, 0)
+def L2NormOptionsEnd(builder): return builder.EndObject()
diff --git a/ethosu/vela/tflite/LSHProjectionOptions.py b/ethosu/vela/tflite/LSHProjectionOptions.py
new file mode 100644
index 00000000..ad550be2
--- /dev/null
+++ b/ethosu/vela/tflite/LSHProjectionOptions.py
@@ -0,0 +1,30 @@
+# automatically generated by the FlatBuffers compiler, do not modify
+
+# namespace: tflite
+
+import flatbuffers
+
+class LSHProjectionOptions(object):
+ __slots__ = ['_tab']
+
+ @classmethod
+ def GetRootAsLSHProjectionOptions(cls, buf, offset):
+ n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset)
+ x = LSHProjectionOptions()
+ x.Init(buf, n + offset)
+ return x
+
+ # LSHProjectionOptions
+ def Init(self, buf, pos):
+ self._tab = flatbuffers.table.Table(buf, pos)
+
+ # LSHProjectionOptions
+ def Type(self):
+ o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4))
+ if o != 0:
+ return self._tab.Get(flatbuffers.number_types.Int8Flags, o + self._tab.Pos)
+ return 0
+
+def LSHProjectionOptionsStart(builder): builder.StartObject(1)
+def LSHProjectionOptionsAddType(builder, type): builder.PrependInt8Slot(0, type, 0)
+def LSHProjectionOptionsEnd(builder): return builder.EndObject()
diff --git a/ethosu/vela/tflite/LSHProjectionType.py b/ethosu/vela/tflite/LSHProjectionType.py
new file mode 100644
index 00000000..a7d6a313
--- /dev/null
+++ b/ethosu/vela/tflite/LSHProjectionType.py
@@ -0,0 +1,8 @@
+# automatically generated by the FlatBuffers compiler, do not modify
+
+# namespace: tflite
+
+class LSHProjectionType(object):
+ UNKNOWN = 0
+ SPARSE = 1
+ DENSE = 2
diff --git a/ethosu/vela/tflite/LSTMKernelType.py b/ethosu/vela/tflite/LSTMKernelType.py
new file mode 100644
index 00000000..fd657998
--- /dev/null
+++ b/ethosu/vela/tflite/LSTMKernelType.py
@@ -0,0 +1,7 @@
+# automatically generated by the FlatBuffers compiler, do not modify
+
+# namespace: tflite
+
+class LSTMKernelType(object):
+ FULL = 0
+ BASIC = 1
diff --git a/ethosu/vela/tflite/LSTMOptions.py b/ethosu/vela/tflite/LSTMOptions.py
new file mode 100644
index 00000000..93a83093
--- /dev/null
+++ b/ethosu/vela/tflite/LSTMOptions.py
@@ -0,0 +1,54 @@
+# automatically generated by the FlatBuffers compiler, do not modify
+
+# namespace: tflite
+
+import flatbuffers
+
+class LSTMOptions(object):
+ __slots__ = ['_tab']
+
+ @classmethod
+ def GetRootAsLSTMOptions(cls, buf, offset):
+ n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset)
+ x = LSTMOptions()
+ x.Init(buf, n + offset)
+ return x
+
+ # LSTMOptions
+ def Init(self, buf, pos):
+ self._tab = flatbuffers.table.Table(buf, pos)
+
+ # LSTMOptions
+ def FusedActivationFunction(self):
+ o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4))
+ if o != 0:
+ return self._tab.Get(flatbuffers.number_types.Int8Flags, o + self._tab.Pos)
+ return 0
+
+ # LSTMOptions
+ def CellClip(self):
+ o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(6))
+ if o != 0:
+ return self._tab.Get(flatbuffers.number_types.Float32Flags, o + self._tab.Pos)
+ return 0.0
+
+ # LSTMOptions
+ def ProjClip(self):
+ o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(8))
+ if o != 0:
+ return self._tab.Get(flatbuffers.number_types.Float32Flags, o + self._tab.Pos)
+ return 0.0
+
+ # LSTMOptions
+ def KernelType(self):
+ o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(10))
+ if o != 0:
+ return self._tab.Get(flatbuffers.number_types.Int8Flags, o + self._tab.Pos)
+ return 0
+
+def LSTMOptionsStart(builder): builder.StartObject(4)
+def LSTMOptionsAddFusedActivationFunction(builder, fusedActivationFunction): builder.PrependInt8Slot(0, fusedActivationFunction, 0)
+def LSTMOptionsAddCellClip(builder, cellClip): builder.PrependFloat32Slot(1, cellClip, 0.0)
+def LSTMOptionsAddProjClip(builder, projClip): builder.PrependFloat32Slot(2, projClip, 0.0)
+def LSTMOptionsAddKernelType(builder, kernelType): builder.PrependInt8Slot(3, kernelType, 0)
+def LSTMOptionsEnd(builder): return builder.EndObject()
diff --git a/ethosu/vela/tflite/LeakyReluOptions.py b/ethosu/vela/tflite/LeakyReluOptions.py
new file mode 100644
index 00000000..b61b21d5
--- /dev/null
+++ b/ethosu/vela/tflite/LeakyReluOptions.py
@@ -0,0 +1,30 @@
+# automatically generated by the FlatBuffers compiler, do not modify
+
+# namespace: tflite
+
+import flatbuffers
+
+class LeakyReluOptions(object):
+ __slots__ = ['_tab']
+
+ @classmethod
+ def GetRootAsLeakyReluOptions(cls, buf, offset):
+ n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset)
+ x = LeakyReluOptions()
+ x.Init(buf, n + offset)
+ return x
+
+ # LeakyReluOptions
+ def Init(self, buf, pos):
+ self._tab = flatbuffers.table.Table(buf, pos)
+
+ # LeakyReluOptions
+ def Alpha(self):
+ o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4))
+ if o != 0:
+ return self._tab.Get(flatbuffers.number_types.Float32Flags, o + self._tab.Pos)
+ return 0.0
+
+def LeakyReluOptionsStart(builder): builder.StartObject(1)
+def LeakyReluOptionsAddAlpha(builder, alpha): builder.PrependFloat32Slot(0, alpha, 0.0)
+def LeakyReluOptionsEnd(builder): return builder.EndObject()
diff --git a/ethosu/vela/tflite/LessEqualOptions.py b/ethosu/vela/tflite/LessEqualOptions.py
new file mode 100644
index 00000000..d49b7289
--- /dev/null
+++ b/ethosu/vela/tflite/LessEqualOptions.py
@@ -0,0 +1,22 @@
+# automatically generated by the FlatBuffers compiler, do not modify
+
+# namespace: tflite
+
+import flatbuffers
+
+class LessEqualOptions(object):
+ __slots__ = ['_tab']
+
+ @classmethod
+ def GetRootAsLessEqualOptions(cls, buf, offset):
+ n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset)
+ x = LessEqualOptions()
+ x.Init(buf, n + offset)
+ return x
+
+ # LessEqualOptions
+ def Init(self, buf, pos):
+ self._tab = flatbuffers.table.Table(buf, pos)
+
+def LessEqualOptionsStart(builder): builder.StartObject(0)
+def LessEqualOptionsEnd(builder): return builder.EndObject()
diff --git a/ethosu/vela/tflite/LessOptions.py b/ethosu/vela/tflite/LessOptions.py
new file mode 100644
index 00000000..469cb0b0
--- /dev/null
+++ b/ethosu/vela/tflite/LessOptions.py
@@ -0,0 +1,22 @@
+# automatically generated by the FlatBuffers compiler, do not modify
+
+# namespace: tflite
+
+import flatbuffers
+
+class LessOptions(object):
+ __slots__ = ['_tab']
+
+ @classmethod
+ def GetRootAsLessOptions(cls, buf, offset):
+ n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset)
+ x = LessOptions()
+ x.Init(buf, n + offset)
+ return x
+
+ # LessOptions
+ def Init(self, buf, pos):
+ self._tab = flatbuffers.table.Table(buf, pos)
+
+def LessOptionsStart(builder): builder.StartObject(0)
+def LessOptionsEnd(builder): return builder.EndObject()
diff --git a/ethosu/vela/tflite/LocalResponseNormalizationOptions.py b/ethosu/vela/tflite/LocalResponseNormalizationOptions.py
new file mode 100644
index 00000000..db875603
--- /dev/null
+++ b/ethosu/vela/tflite/LocalResponseNormalizationOptions.py
@@ -0,0 +1,54 @@
+# automatically generated by the FlatBuffers compiler, do not modify
+
+# namespace: tflite
+
+import flatbuffers
+
+class LocalResponseNormalizationOptions(object):
+ __slots__ = ['_tab']
+
+ @classmethod
+ def GetRootAsLocalResponseNormalizationOptions(cls, buf, offset):
+ n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset)
+ x = LocalResponseNormalizationOptions()
+ x.Init(buf, n + offset)
+ return x
+
+ # LocalResponseNormalizationOptions
+ def Init(self, buf, pos):
+ self._tab = flatbuffers.table.Table(buf, pos)
+
+ # LocalResponseNormalizationOptions
+ def Radius(self):
+ o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4))
+ if o != 0:
+ return self._tab.Get(flatbuffers.number_types.Int32Flags, o + self._tab.Pos)
+ return 0
+
+ # LocalResponseNormalizationOptions
+ def Bias(self):
+ o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(6))
+ if o != 0:
+ return self._tab.Get(flatbuffers.number_types.Float32Flags, o + self._tab.Pos)
+ return 0.0
+
+ # LocalResponseNormalizationOptions
+ def Alpha(self):
+ o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(8))
+ if o != 0:
+ return self._tab.Get(flatbuffers.number_types.Float32Flags, o + self._tab.Pos)
+ return 0.0
+
+ # LocalResponseNormalizationOptions
+ def Beta(self):
+ o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(10))
+ if o != 0:
+ return self._tab.Get(flatbuffers.number_types.Float32Flags, o + self._tab.Pos)
+ return 0.0
+
+def LocalResponseNormalizationOptionsStart(builder): builder.StartObject(4)
+def LocalResponseNormalizationOptionsAddRadius(builder, radius): builder.PrependInt32Slot(0, radius, 0)
+def LocalResponseNormalizationOptionsAddBias(builder, bias): builder.PrependFloat32Slot(1, bias, 0.0)
+def LocalResponseNormalizationOptionsAddAlpha(builder, alpha): builder.PrependFloat32Slot(2, alpha, 0.0)
+def LocalResponseNormalizationOptionsAddBeta(builder, beta): builder.PrependFloat32Slot(3, beta, 0.0)
+def LocalResponseNormalizationOptionsEnd(builder): return builder.EndObject()
diff --git a/ethosu/vela/tflite/LogSoftmaxOptions.py b/ethosu/vela/tflite/LogSoftmaxOptions.py
new file mode 100644
index 00000000..47893855
--- /dev/null
+++ b/ethosu/vela/tflite/LogSoftmaxOptions.py
@@ -0,0 +1,22 @@
+# automatically generated by the FlatBuffers compiler, do not modify
+
+# namespace: tflite
+
+import flatbuffers
+
+class LogSoftmaxOptions(object):
+ __slots__ = ['_tab']
+
+ @classmethod
+ def GetRootAsLogSoftmaxOptions(cls, buf, offset):
+ n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset)
+ x = LogSoftmaxOptions()
+ x.Init(buf, n + offset)
+ return x
+
+ # LogSoftmaxOptions
+ def Init(self, buf, pos):
+ self._tab = flatbuffers.table.Table(buf, pos)
+
+def LogSoftmaxOptionsStart(builder): builder.StartObject(0)
+def LogSoftmaxOptionsEnd(builder): return builder.EndObject()
diff --git a/ethosu/vela/tflite/LogicalAndOptions.py b/ethosu/vela/tflite/LogicalAndOptions.py
new file mode 100644
index 00000000..cee1cdb4
--- /dev/null
+++ b/ethosu/vela/tflite/LogicalAndOptions.py
@@ -0,0 +1,22 @@
+# automatically generated by the FlatBuffers compiler, do not modify
+
+# namespace: tflite
+
+import flatbuffers
+
+class LogicalAndOptions(object):
+ __slots__ = ['_tab']
+
+ @classmethod
+ def GetRootAsLogicalAndOptions(cls, buf, offset):
+ n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset)
+ x = LogicalAndOptions()
+ x.Init(buf, n + offset)
+ return x
+
+ # LogicalAndOptions
+ def Init(self, buf, pos):
+ self._tab = flatbuffers.table.Table(buf, pos)
+
+def LogicalAndOptionsStart(builder): builder.StartObject(0)
+def LogicalAndOptionsEnd(builder): return builder.EndObject()
diff --git a/ethosu/vela/tflite/LogicalNotOptions.py b/ethosu/vela/tflite/LogicalNotOptions.py
new file mode 100644
index 00000000..9971450c
--- /dev/null
+++ b/ethosu/vela/tflite/LogicalNotOptions.py
@@ -0,0 +1,22 @@
+# automatically generated by the FlatBuffers compiler, do not modify
+
+# namespace: tflite
+
+import flatbuffers
+
+class LogicalNotOptions(object):
+ __slots__ = ['_tab']
+
+ @classmethod
+ def GetRootAsLogicalNotOptions(cls, buf, offset):
+ n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset)
+ x = LogicalNotOptions()
+ x.Init(buf, n + offset)
+ return x
+
+ # LogicalNotOptions
+ def Init(self, buf, pos):
+ self._tab = flatbuffers.table.Table(buf, pos)
+
+def LogicalNotOptionsStart(builder): builder.StartObject(0)
+def LogicalNotOptionsEnd(builder): return builder.EndObject()
diff --git a/ethosu/vela/tflite/LogicalOrOptions.py b/ethosu/vela/tflite/LogicalOrOptions.py
new file mode 100644
index 00000000..e94a5dec
--- /dev/null
+++ b/ethosu/vela/tflite/LogicalOrOptions.py
@@ -0,0 +1,22 @@
+# automatically generated by the FlatBuffers compiler, do not modify
+
+# namespace: tflite
+
+import flatbuffers
+
+class LogicalOrOptions(object):
+ __slots__ = ['_tab']
+
+ @classmethod
+ def GetRootAsLogicalOrOptions(cls, buf, offset):
+ n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset)
+ x = LogicalOrOptions()
+ x.Init(buf, n + offset)
+ return x
+
+ # LogicalOrOptions
+ def Init(self, buf, pos):
+ self._tab = flatbuffers.table.Table(buf, pos)
+
+def LogicalOrOptionsStart(builder): builder.StartObject(0)
+def LogicalOrOptionsEnd(builder): return builder.EndObject()
diff --git a/ethosu/vela/tflite/MatrixDiagOptions.py b/ethosu/vela/tflite/MatrixDiagOptions.py
new file mode 100644
index 00000000..0f64e657
--- /dev/null
+++ b/ethosu/vela/tflite/MatrixDiagOptions.py
@@ -0,0 +1,22 @@
+# automatically generated by the FlatBuffers compiler, do not modify
+
+# namespace: tflite
+
+import flatbuffers
+
+class MatrixDiagOptions(object):
+ __slots__ = ['_tab']
+
+ @classmethod
+ def GetRootAsMatrixDiagOptions(cls, buf, offset):
+ n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset)
+ x = MatrixDiagOptions()
+ x.Init(buf, n + offset)
+ return x
+
+ # MatrixDiagOptions
+ def Init(self, buf, pos):
+ self._tab = flatbuffers.table.Table(buf, pos)
+
+def MatrixDiagOptionsStart(builder): builder.StartObject(0)
+def MatrixDiagOptionsEnd(builder): return builder.EndObject()
diff --git a/ethosu/vela/tflite/MatrixSetDiagOptions.py b/ethosu/vela/tflite/MatrixSetDiagOptions.py
new file mode 100644
index 00000000..14178cf8
--- /dev/null
+++ b/ethosu/vela/tflite/MatrixSetDiagOptions.py
@@ -0,0 +1,22 @@
+# automatically generated by the FlatBuffers compiler, do not modify
+
+# namespace: tflite
+
+import flatbuffers
+
+class MatrixSetDiagOptions(object):
+ __slots__ = ['_tab']
+
+ @classmethod
+ def GetRootAsMatrixSetDiagOptions(cls, buf, offset):
+ n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset)
+ x = MatrixSetDiagOptions()
+ x.Init(buf, n + offset)
+ return x
+
+ # MatrixSetDiagOptions
+ def Init(self, buf, pos):
+ self._tab = flatbuffers.table.Table(buf, pos)
+
+def MatrixSetDiagOptionsStart(builder): builder.StartObject(0)
+def MatrixSetDiagOptionsEnd(builder): return builder.EndObject()
diff --git a/ethosu/vela/tflite/MaximumMinimumOptions.py b/ethosu/vela/tflite/MaximumMinimumOptions.py
new file mode 100644
index 00000000..f0806e2d
--- /dev/null
+++ b/ethosu/vela/tflite/MaximumMinimumOptions.py
@@ -0,0 +1,22 @@
+# automatically generated by the FlatBuffers compiler, do not modify
+
+# namespace: tflite
+
+import flatbuffers
+
+class MaximumMinimumOptions(object):
+ __slots__ = ['_tab']
+
+ @classmethod
+ def GetRootAsMaximumMinimumOptions(cls, buf, offset):
+ n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset)
+ x = MaximumMinimumOptions()
+ x.Init(buf, n + offset)
+ return x
+
+ # MaximumMinimumOptions
+ def Init(self, buf, pos):
+ self._tab = flatbuffers.table.Table(buf, pos)
+
+def MaximumMinimumOptionsStart(builder): builder.StartObject(0)
+def MaximumMinimumOptionsEnd(builder): return builder.EndObject()
diff --git a/ethosu/vela/tflite/Metadata.py b/ethosu/vela/tflite/Metadata.py
new file mode 100644
index 00000000..273e51ee
--- /dev/null
+++ b/ethosu/vela/tflite/Metadata.py
@@ -0,0 +1,38 @@
+# automatically generated by the FlatBuffers compiler, do not modify
+
+# namespace: tflite
+
+import flatbuffers
+
+class Metadata(object):
+ __slots__ = ['_tab']
+
+ @classmethod
+ def GetRootAsMetadata(cls, buf, offset):
+ n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset)
+ x = Metadata()
+ x.Init(buf, n + offset)
+ return x
+
+ # Metadata
+ def Init(self, buf, pos):
+ self._tab = flatbuffers.table.Table(buf, pos)
+
+ # Metadata
+ def Name(self):
+ o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4))
+ if o != 0:
+ return self._tab.String(o + self._tab.Pos)
+ return None
+
+ # Metadata
+ def Buffer(self):
+ o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(6))
+ if o != 0:
+ return self._tab.Get(flatbuffers.number_types.Uint32Flags, o + self._tab.Pos)
+ return 0
+
+def MetadataStart(builder): builder.StartObject(2)
+def MetadataAddName(builder, name): builder.PrependUOffsetTRelativeSlot(0, flatbuffers.number_types.UOffsetTFlags.py_type(name), 0)
+def MetadataAddBuffer(builder, buffer): builder.PrependUint32Slot(1, buffer, 0)
+def MetadataEnd(builder): return builder.EndObject()
diff --git a/ethosu/vela/tflite/MirrorPadMode.py b/ethosu/vela/tflite/MirrorPadMode.py
new file mode 100644
index 00000000..8fb6396f
--- /dev/null
+++ b/ethosu/vela/tflite/MirrorPadMode.py
@@ -0,0 +1,7 @@
+# automatically generated by the FlatBuffers compiler, do not modify
+
+# namespace: tflite
+
+class MirrorPadMode(object):
+ REFLECT = 0
+ SYMMETRIC = 1
diff --git a/ethosu/vela/tflite/MirrorPadOptions.py b/ethosu/vela/tflite/MirrorPadOptions.py
new file mode 100644
index 00000000..254ae217
--- /dev/null
+++ b/ethosu/vela/tflite/MirrorPadOptions.py
@@ -0,0 +1,30 @@
+# automatically generated by the FlatBuffers compiler, do not modify
+
+# namespace: tflite
+
+import flatbuffers
+
+class MirrorPadOptions(object):
+ __slots__ = ['_tab']
+
+ @classmethod
+ def GetRootAsMirrorPadOptions(cls, buf, offset):
+ n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset)
+ x = MirrorPadOptions()
+ x.Init(buf, n + offset)
+ return x
+
+ # MirrorPadOptions
+ def Init(self, buf, pos):
+ self._tab = flatbuffers.table.Table(buf, pos)
+
+ # MirrorPadOptions
+ def Mode(self):
+ o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4))
+ if o != 0:
+ return self._tab.Get(flatbuffers.number_types.Int8Flags, o + self._tab.Pos)
+ return 0
+
+def MirrorPadOptionsStart(builder): builder.StartObject(1)
+def MirrorPadOptionsAddMode(builder, mode): builder.PrependInt8Slot(0, mode, 0)
+def MirrorPadOptionsEnd(builder): return builder.EndObject()
diff --git a/ethosu/vela/tflite/Model.py b/ethosu/vela/tflite/Model.py
new file mode 100644
index 00000000..cc9991ba
--- /dev/null
+++ b/ethosu/vela/tflite/Model.py
@@ -0,0 +1,150 @@
+# automatically generated by the FlatBuffers compiler, do not modify
+
+# namespace: tflite
+
+import flatbuffers
+
+class Model(object):
+ __slots__ = ['_tab']
+
+ @classmethod
+ def GetRootAsModel(cls, buf, offset):
+ n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset)
+ x = Model()
+ x.Init(buf, n + offset)
+ return x
+
+ # Model
+ def Init(self, buf, pos):
+ self._tab = flatbuffers.table.Table(buf, pos)
+
+ # Model
+ def Version(self):
+ o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4))
+ if o != 0:
+ return self._tab.Get(flatbuffers.number_types.Uint32Flags, o + self._tab.Pos)
+ return 0
+
+ # Model
+ def OperatorCodes(self, j):
+ o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(6))
+ if o != 0:
+ x = self._tab.Vector(o)
+ x += flatbuffers.number_types.UOffsetTFlags.py_type(j) * 4
+ x = self._tab.Indirect(x)
+ from .OperatorCode import OperatorCode
+ obj = OperatorCode()
+ obj.Init(self._tab.Bytes, x)
+ return obj
+ return None
+
+ # Model
+ def OperatorCodesLength(self):
+ o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(6))
+ if o != 0:
+ return self._tab.VectorLen(o)
+ return 0
+
+ # Model
+ def Subgraphs(self, j):
+ o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(8))
+ if o != 0:
+ x = self._tab.Vector(o)
+ x += flatbuffers.number_types.UOffsetTFlags.py_type(j) * 4
+ x = self._tab.Indirect(x)
+ from .SubGraph import SubGraph
+ obj = SubGraph()
+ obj.Init(self._tab.Bytes, x)
+ return obj
+ return None
+
+ # Model
+ def SubgraphsLength(self):
+ o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(8))
+ if o != 0:
+ return self._tab.VectorLen(o)
+ return 0
+
+ # Model
+ def Description(self):
+ o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(10))
+ if o != 0:
+ return self._tab.String(o + self._tab.Pos)
+ return None
+
+ # Model
+ def Buffers(self, j):
+ o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(12))
+ if o != 0:
+ x = self._tab.Vector(o)
+ x += flatbuffers.number_types.UOffsetTFlags.py_type(j) * 4
+ x = self._tab.Indirect(x)
+ from .Buffer import Buffer
+ obj = Buffer()
+ obj.Init(self._tab.Bytes, x)
+ return obj
+ return None
+
+ # Model
+ def BuffersLength(self):
+ o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(12))
+ if o != 0:
+ return self._tab.VectorLen(o)
+ return 0
+
+ # Model
+ def MetadataBuffer(self, j):
+ o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(14))
+ if o != 0:
+ a = self._tab.Vector(o)
+ return self._tab.Get(flatbuffers.number_types.Int32Flags, a + flatbuffers.number_types.UOffsetTFlags.py_type(j * 4))
+ return 0
+
+ # Model
+ def MetadataBufferAsNumpy(self):
+ o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(14))
+ if o != 0:
+ return self._tab.GetVectorAsNumpy(flatbuffers.number_types.Int32Flags, o)
+ return 0
+
+ # Model
+ def MetadataBufferLength(self):
+ o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(14))
+ if o != 0:
+ return self._tab.VectorLen(o)
+ return 0
+
+ # Model
+ def Metadata(self, j):
+ o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(16))
+ if o != 0:
+ x = self._tab.Vector(o)
+ x += flatbuffers.number_types.UOffsetTFlags.py_type(j) * 4
+ x = self._tab.Indirect(x)
+ from .Metadata import Metadata
+ obj = Metadata()
+ obj.Init(self._tab.Bytes, x)
+ return obj
+ return None
+
+ # Model
+ def MetadataLength(self):
+ o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(16))
+ if o != 0:
+ return self._tab.VectorLen(o)
+ return 0
+
+def ModelStart(builder): builder.StartObject(7)
+def ModelAddVersion(builder, version): builder.PrependUint32Slot(0, version, 0)
+def ModelAddOperatorCodes(builder, operatorCodes): builder.PrependUOffsetTRelativeSlot(1, flatbuffers.number_types.UOffsetTFlags.py_type(operatorCodes), 0)
+def ModelStartOperatorCodesVector(builder, numElems): return builder.StartVector(4, numElems, 4)
+def ModelAddSubgraphs(builder, subgraphs): builder.PrependUOffsetTRelativeSlot(2, flatbuffers.number_types.UOffsetTFlags.py_type(subgraphs), 0)
+def ModelStartSubgraphsVector(builder, numElems): return builder.StartVector(4, numElems, 4)
+def ModelAddDescription(builder, description): builder.PrependUOffsetTRelativeSlot(3, flatbuffers.number_types.UOffsetTFlags.py_type(description), 0)
+def ModelAddBuffers(builder, buffers): builder.PrependUOffsetTRelativeSlot(4, flatbuffers.number_types.UOffsetTFlags.py_type(buffers), 0)
+def ModelStartBuffersVector(builder, numElems): return builder.StartVector(4, numElems, 4)
+def ModelAddMetadataBuffer(builder, metadataBuffer): builder.PrependUOffsetTRelativeSlot(5, flatbuffers.number_types.UOffsetTFlags.py_type(metadataBuffer), 0)
+def ModelStartMetadataBufferVector(builder, numElems): return builder.StartVector(4, numElems, 4)
+def ModelAddMetadata(builder, metadata): builder.PrependUOffsetTRelativeSlot(6, flatbuffers.number_types.UOffsetTFlags.py_type(metadata), 0)
+def ModelStartMetadataVector(builder, numElems): return builder.StartVector(4, numElems, 4)
+def ModelEnd(builder): return builder.EndObject()
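
Model is the root table of a serialised .tflite file, and its accessors read lazily from the underlying buffer without unpacking it. As an illustrative sketch only (not part of this patch; the input file name is hypothetical), the generated accessors can be used to inspect a model:

# Illustrative sketch only -- not part of the patch.
from ethosu.vela.tflite.Model import Model

with open("network.tflite", "rb") as f:   # hypothetical input file
    buf = bytearray(f.read())

model = Model.GetRootAsModel(buf, 0)
print("schema version:", model.Version())
print("description   :", model.Description())
for i in range(model.OperatorCodesLength()):
    print("opcode", i, "builtin code:", model.OperatorCodes(i).BuiltinCode())
print("subgraphs:", model.SubgraphsLength())
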
diff --git a/ethosu/vela/tflite/MulOptions.py b/ethosu/vela/tflite/MulOptions.py
new file mode 100644
index 00000000..55b9506f
--- /dev/null
+++ b/ethosu/vela/tflite/MulOptions.py
@@ -0,0 +1,30 @@
+# automatically generated by the FlatBuffers compiler, do not modify
+
+# namespace: tflite
+
+import flatbuffers
+
+class MulOptions(object):
+ __slots__ = ['_tab']
+
+ @classmethod
+ def GetRootAsMulOptions(cls, buf, offset):
+ n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset)
+ x = MulOptions()
+ x.Init(buf, n + offset)
+ return x
+
+ # MulOptions
+ def Init(self, buf, pos):
+ self._tab = flatbuffers.table.Table(buf, pos)
+
+ # MulOptions
+ def FusedActivationFunction(self):
+ o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4))
+ if o != 0:
+ return self._tab.Get(flatbuffers.number_types.Int8Flags, o + self._tab.Pos)
+ return 0
+
+def MulOptionsStart(builder): builder.StartObject(1)
+def MulOptionsAddFusedActivationFunction(builder, fusedActivationFunction): builder.PrependInt8Slot(0, fusedActivationFunction, 0)
+def MulOptionsEnd(builder): return builder.EndObject()
diff --git a/ethosu/vela/tflite/NegOptions.py b/ethosu/vela/tflite/NegOptions.py
new file mode 100644
index 00000000..05d55c26
--- /dev/null
+++ b/ethosu/vela/tflite/NegOptions.py
@@ -0,0 +1,22 @@
+# automatically generated by the FlatBuffers compiler, do not modify
+
+# namespace: tflite
+
+import flatbuffers
+
+class NegOptions(object):
+ __slots__ = ['_tab']
+
+ @classmethod
+ def GetRootAsNegOptions(cls, buf, offset):
+ n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset)
+ x = NegOptions()
+ x.Init(buf, n + offset)
+ return x
+
+ # NegOptions
+ def Init(self, buf, pos):
+ self._tab = flatbuffers.table.Table(buf, pos)
+
+def NegOptionsStart(builder): builder.StartObject(0)
+def NegOptionsEnd(builder): return builder.EndObject()
diff --git a/ethosu/vela/tflite/NonMaxSuppressionV4Options.py b/ethosu/vela/tflite/NonMaxSuppressionV4Options.py
new file mode 100644
index 00000000..6ad10a2e
--- /dev/null
+++ b/ethosu/vela/tflite/NonMaxSuppressionV4Options.py
@@ -0,0 +1,22 @@
+# automatically generated by the FlatBuffers compiler, do not modify
+
+# namespace: tflite
+
+import flatbuffers
+
+class NonMaxSuppressionV4Options(object):
+ __slots__ = ['_tab']
+
+ @classmethod
+ def GetRootAsNonMaxSuppressionV4Options(cls, buf, offset):
+ n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset)
+ x = NonMaxSuppressionV4Options()
+ x.Init(buf, n + offset)
+ return x
+
+ # NonMaxSuppressionV4Options
+ def Init(self, buf, pos):
+ self._tab = flatbuffers.table.Table(buf, pos)
+
+def NonMaxSuppressionV4OptionsStart(builder): builder.StartObject(0)
+def NonMaxSuppressionV4OptionsEnd(builder): return builder.EndObject()
diff --git a/ethosu/vela/tflite/NonMaxSuppressionV5Options.py b/ethosu/vela/tflite/NonMaxSuppressionV5Options.py
new file mode 100644
index 00000000..99cbdbbf
--- /dev/null
+++ b/ethosu/vela/tflite/NonMaxSuppressionV5Options.py
@@ -0,0 +1,22 @@
+# automatically generated by the FlatBuffers compiler, do not modify
+
+# namespace: tflite
+
+import flatbuffers
+
+class NonMaxSuppressionV5Options(object):
+ __slots__ = ['_tab']
+
+ @classmethod
+ def GetRootAsNonMaxSuppressionV5Options(cls, buf, offset):
+ n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset)
+ x = NonMaxSuppressionV5Options()
+ x.Init(buf, n + offset)
+ return x
+
+ # NonMaxSuppressionV5Options
+ def Init(self, buf, pos):
+ self._tab = flatbuffers.table.Table(buf, pos)
+
+def NonMaxSuppressionV5OptionsStart(builder): builder.StartObject(0)
+def NonMaxSuppressionV5OptionsEnd(builder): return builder.EndObject()
diff --git a/ethosu/vela/tflite/NotEqualOptions.py b/ethosu/vela/tflite/NotEqualOptions.py
new file mode 100644
index 00000000..4c511e93
--- /dev/null
+++ b/ethosu/vela/tflite/NotEqualOptions.py
@@ -0,0 +1,22 @@
+# automatically generated by the FlatBuffers compiler, do not modify
+
+# namespace: tflite
+
+import flatbuffers
+
+class NotEqualOptions(object):
+ __slots__ = ['_tab']
+
+ @classmethod
+ def GetRootAsNotEqualOptions(cls, buf, offset):
+ n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset)
+ x = NotEqualOptions()
+ x.Init(buf, n + offset)
+ return x
+
+ # NotEqualOptions
+ def Init(self, buf, pos):
+ self._tab = flatbuffers.table.Table(buf, pos)
+
+def NotEqualOptionsStart(builder): builder.StartObject(0)
+def NotEqualOptionsEnd(builder): return builder.EndObject()
diff --git a/ethosu/vela/tflite/OneHotOptions.py b/ethosu/vela/tflite/OneHotOptions.py
new file mode 100644
index 00000000..793a3e75
--- /dev/null
+++ b/ethosu/vela/tflite/OneHotOptions.py
@@ -0,0 +1,30 @@
+# automatically generated by the FlatBuffers compiler, do not modify
+
+# namespace: tflite
+
+import flatbuffers
+
+class OneHotOptions(object):
+ __slots__ = ['_tab']
+
+ @classmethod
+ def GetRootAsOneHotOptions(cls, buf, offset):
+ n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset)
+ x = OneHotOptions()
+ x.Init(buf, n + offset)
+ return x
+
+ # OneHotOptions
+ def Init(self, buf, pos):
+ self._tab = flatbuffers.table.Table(buf, pos)
+
+ # OneHotOptions
+ def Axis(self):
+ o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4))
+ if o != 0:
+ return self._tab.Get(flatbuffers.number_types.Int32Flags, o + self._tab.Pos)
+ return 0
+
+def OneHotOptionsStart(builder): builder.StartObject(1)
+def OneHotOptionsAddAxis(builder, axis): builder.PrependInt32Slot(0, axis, 0)
+def OneHotOptionsEnd(builder): return builder.EndObject()
diff --git a/ethosu/vela/tflite/Operator.py b/ethosu/vela/tflite/Operator.py
new file mode 100644
index 00000000..cbae3dab
--- /dev/null
+++ b/ethosu/vela/tflite/Operator.py
@@ -0,0 +1,177 @@
+# automatically generated by the FlatBuffers compiler, do not modify
+
+# namespace: tflite
+
+import flatbuffers
+
+class Operator(object):
+ __slots__ = ['_tab']
+
+ @classmethod
+ def GetRootAsOperator(cls, buf, offset):
+ n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset)
+ x = Operator()
+ x.Init(buf, n + offset)
+ return x
+
+ # Operator
+ def Init(self, buf, pos):
+ self._tab = flatbuffers.table.Table(buf, pos)
+
+ # Operator
+ def OpcodeIndex(self):
+ o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4))
+ if o != 0:
+ return self._tab.Get(flatbuffers.number_types.Uint32Flags, o + self._tab.Pos)
+ return 0
+
+ # Operator
+ def Inputs(self, j):
+ o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(6))
+ if o != 0:
+ a = self._tab.Vector(o)
+ return self._tab.Get(flatbuffers.number_types.Int32Flags, a + flatbuffers.number_types.UOffsetTFlags.py_type(j * 4))
+ return 0
+
+ # Operator
+ def InputsAsNumpy(self):
+ o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(6))
+ if o != 0:
+ return self._tab.GetVectorAsNumpy(flatbuffers.number_types.Int32Flags, o)
+ return 0
+
+ # Operator
+ def InputsLength(self):
+ o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(6))
+ if o != 0:
+ return self._tab.VectorLen(o)
+ return 0
+
+ # Operator
+ def Outputs(self, j):
+ o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(8))
+ if o != 0:
+ a = self._tab.Vector(o)
+ return self._tab.Get(flatbuffers.number_types.Int32Flags, a + flatbuffers.number_types.UOffsetTFlags.py_type(j * 4))
+ return 0
+
+ # Operator
+ def OutputsAsNumpy(self):
+ o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(8))
+ if o != 0:
+ return self._tab.GetVectorAsNumpy(flatbuffers.number_types.Int32Flags, o)
+ return 0
+
+ # Operator
+ def OutputsLength(self):
+ o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(8))
+ if o != 0:
+ return self._tab.VectorLen(o)
+ return 0
+
+ # Operator
+ def BuiltinOptionsType(self):
+ o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(10))
+ if o != 0:
+ return self._tab.Get(flatbuffers.number_types.Uint8Flags, o + self._tab.Pos)
+ return 0
+
+ # Operator
+ def BuiltinOptions(self):
+ o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(12))
+ if o != 0:
+ from flatbuffers.table import Table
+ obj = Table(bytearray(), 0)
+ self._tab.Union(obj, o)
+ return obj
+ return None
+
+ # Operator
+ def CustomOptions(self, j):
+ o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(14))
+ if o != 0:
+ a = self._tab.Vector(o)
+ return self._tab.Get(flatbuffers.number_types.Uint8Flags, a + flatbuffers.number_types.UOffsetTFlags.py_type(j * 1))
+ return 0
+
+ # Operator
+ def CustomOptionsAsNumpy(self):
+ o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(14))
+ if o != 0:
+ return self._tab.GetVectorAsNumpy(flatbuffers.number_types.Uint8Flags, o)
+ return 0
+
+ # Operator
+ def CustomOptionsLength(self):
+ o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(14))
+ if o != 0:
+ return self._tab.VectorLen(o)
+ return 0
+
+ # Operator
+ def CustomOptionsFormat(self):
+ o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(16))
+ if o != 0:
+ return self._tab.Get(flatbuffers.number_types.Int8Flags, o + self._tab.Pos)
+ return 0
+
+ # Operator
+ def MutatingVariableInputs(self, j):
+ o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(18))
+ if o != 0:
+ a = self._tab.Vector(o)
+ return self._tab.Get(flatbuffers.number_types.BoolFlags, a + flatbuffers.number_types.UOffsetTFlags.py_type(j * 1))
+ return 0
+
+ # Operator
+ def MutatingVariableInputsAsNumpy(self):
+ o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(18))
+ if o != 0:
+ return self._tab.GetVectorAsNumpy(flatbuffers.number_types.BoolFlags, o)
+ return 0
+
+ # Operator
+ def MutatingVariableInputsLength(self):
+ o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(18))
+ if o != 0:
+ return self._tab.VectorLen(o)
+ return 0
+
+ # Operator
+ def Intermediates(self, j):
+ o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(20))
+ if o != 0:
+ a = self._tab.Vector(o)
+ return self._tab.Get(flatbuffers.number_types.Int32Flags, a + flatbuffers.number_types.UOffsetTFlags.py_type(j * 4))
+ return 0
+
+ # Operator
+ def IntermediatesAsNumpy(self):
+ o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(20))
+ if o != 0:
+ return self._tab.GetVectorAsNumpy(flatbuffers.number_types.Int32Flags, o)
+ return 0
+
+ # Operator
+ def IntermediatesLength(self):
+ o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(20))
+ if o != 0:
+ return self._tab.VectorLen(o)
+ return 0
+
+def OperatorStart(builder): builder.StartObject(9)
+def OperatorAddOpcodeIndex(builder, opcodeIndex): builder.PrependUint32Slot(0, opcodeIndex, 0)
+def OperatorAddInputs(builder, inputs): builder.PrependUOffsetTRelativeSlot(1, flatbuffers.number_types.UOffsetTFlags.py_type(inputs), 0)
+def OperatorStartInputsVector(builder, numElems): return builder.StartVector(4, numElems, 4)
+def OperatorAddOutputs(builder, outputs): builder.PrependUOffsetTRelativeSlot(2, flatbuffers.number_types.UOffsetTFlags.py_type(outputs), 0)
+def OperatorStartOutputsVector(builder, numElems): return builder.StartVector(4, numElems, 4)
+def OperatorAddBuiltinOptionsType(builder, builtinOptionsType): builder.PrependUint8Slot(3, builtinOptionsType, 0)
+def OperatorAddBuiltinOptions(builder, builtinOptions): builder.PrependUOffsetTRelativeSlot(4, flatbuffers.number_types.UOffsetTFlags.py_type(builtinOptions), 0)
+def OperatorAddCustomOptions(builder, customOptions): builder.PrependUOffsetTRelativeSlot(5, flatbuffers.number_types.UOffsetTFlags.py_type(customOptions), 0)
+def OperatorStartCustomOptionsVector(builder, numElems): return builder.StartVector(1, numElems, 1)
+def OperatorAddCustomOptionsFormat(builder, customOptionsFormat): builder.PrependInt8Slot(6, customOptionsFormat, 0)
+def OperatorAddMutatingVariableInputs(builder, mutatingVariableInputs): builder.PrependUOffsetTRelativeSlot(7, flatbuffers.number_types.UOffsetTFlags.py_type(mutatingVariableInputs), 0)
+def OperatorStartMutatingVariableInputsVector(builder, numElems): return builder.StartVector(1, numElems, 1)
+def OperatorAddIntermediates(builder, intermediates): builder.PrependUOffsetTRelativeSlot(8, flatbuffers.number_types.UOffsetTFlags.py_type(intermediates), 0)
+def OperatorStartIntermediatesVector(builder, numElems): return builder.StartVector(4, numElems, 4)
+def OperatorEnd(builder): return builder.EndObject()
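
Operator stores its builtin options as a FlatBuffers union: BuiltinOptions() returns a bare flatbuffers Table, which the caller re-initialises as the concrete options type indicated by BuiltinOptionsType(). A minimal sketch of that pattern (not part of this patch), using the FullyConnectedOptions module above as the example; a real caller would first check BuiltinOptionsType() against the generated BuiltinOptions enum before choosing the concrete class:

# Illustrative sketch only -- not part of the patch.
from ethosu.vela.tflite.FullyConnectedOptions import FullyConnectedOptions

def read_fully_connected_options(op):
    # op is a generated Operator instance whose options union holds FullyConnectedOptions
    union_table = op.BuiltinOptions()
    if union_table is None:
        return None
    opts = FullyConnectedOptions()
    opts.Init(union_table.Bytes, union_table.Pos)  # re-initialise the union payload as the concrete type
    return opts.FusedActivationFunction(), opts.WeightsFormat(), opts.KeepNumDims()
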
diff --git a/ethosu/vela/tflite/OperatorCode.py b/ethosu/vela/tflite/OperatorCode.py
new file mode 100644
index 00000000..dd525f53
--- /dev/null
+++ b/ethosu/vela/tflite/OperatorCode.py
@@ -0,0 +1,46 @@
+# automatically generated by the FlatBuffers compiler, do not modify
+
+# namespace: tflite
+
+import flatbuffers
+
+class OperatorCode(object):
+ __slots__ = ['_tab']
+
+ @classmethod
+ def GetRootAsOperatorCode(cls, buf, offset):
+ n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset)
+ x = OperatorCode()
+ x.Init(buf, n + offset)
+ return x
+
+ # OperatorCode
+ def Init(self, buf, pos):
+ self._tab = flatbuffers.table.Table(buf, pos)
+
+ # OperatorCode
+ def BuiltinCode(self):
+ o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4))
+ if o != 0:
+ return self._tab.Get(flatbuffers.number_types.Int8Flags, o + self._tab.Pos)
+ return 0
+
+ # OperatorCode
+ def CustomCode(self):
+ o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(6))
+ if o != 0:
+ return self._tab.String(o + self._tab.Pos)
+ return None
+
+ # OperatorCode
+ def Version(self):
+ o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(8))
+ if o != 0:
+ return self._tab.Get(flatbuffers.number_types.Int32Flags, o + self._tab.Pos)
+ return 1
+
+def OperatorCodeStart(builder): builder.StartObject(3)
+def OperatorCodeAddBuiltinCode(builder, builtinCode): builder.PrependInt8Slot(0, builtinCode, 0)
+def OperatorCodeAddCustomCode(builder, customCode): builder.PrependUOffsetTRelativeSlot(1, flatbuffers.number_types.UOffsetTFlags.py_type(customCode), 0)
+def OperatorCodeAddVersion(builder, version): builder.PrependInt32Slot(2, version, 1)
+def OperatorCodeEnd(builder): return builder.EndObject()
diff --git a/ethosu/vela/tflite/PackOptions.py b/ethosu/vela/tflite/PackOptions.py
new file mode 100644
index 00000000..6a8ee2bb
--- /dev/null
+++ b/ethosu/vela/tflite/PackOptions.py
@@ -0,0 +1,38 @@
+# automatically generated by the FlatBuffers compiler, do not modify
+
+# namespace: tflite
+
+import flatbuffers
+
+class PackOptions(object):
+ __slots__ = ['_tab']
+
+ @classmethod
+ def GetRootAsPackOptions(cls, buf, offset):
+ n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset)
+ x = PackOptions()
+ x.Init(buf, n + offset)
+ return x
+
+ # PackOptions
+ def Init(self, buf, pos):
+ self._tab = flatbuffers.table.Table(buf, pos)
+
+ # PackOptions
+ def ValuesCount(self):
+ o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4))
+ if o != 0:
+ return self._tab.Get(flatbuffers.number_types.Int32Flags, o + self._tab.Pos)
+ return 0
+
+ # PackOptions
+ def Axis(self):
+ o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(6))
+ if o != 0:
+ return self._tab.Get(flatbuffers.number_types.Int32Flags, o + self._tab.Pos)
+ return 0
+
+def PackOptionsStart(builder): builder.StartObject(2)
+def PackOptionsAddValuesCount(builder, valuesCount): builder.PrependInt32Slot(0, valuesCount, 0)
+def PackOptionsAddAxis(builder, axis): builder.PrependInt32Slot(1, axis, 0)
+def PackOptionsEnd(builder): return builder.EndObject()
diff --git a/ethosu/vela/tflite/PadOptions.py b/ethosu/vela/tflite/PadOptions.py
new file mode 100644
index 00000000..d0833c68
--- /dev/null
+++ b/ethosu/vela/tflite/PadOptions.py
@@ -0,0 +1,22 @@
+# automatically generated by the FlatBuffers compiler, do not modify
+
+# namespace: tflite
+
+import flatbuffers
+
+class PadOptions(object):
+ __slots__ = ['_tab']
+
+ @classmethod
+ def GetRootAsPadOptions(cls, buf, offset):
+ n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset)
+ x = PadOptions()
+ x.Init(buf, n + offset)
+ return x
+
+ # PadOptions
+ def Init(self, buf, pos):
+ self._tab = flatbuffers.table.Table(buf, pos)
+
+def PadOptionsStart(builder): builder.StartObject(0)
+def PadOptionsEnd(builder): return builder.EndObject()
diff --git a/ethosu/vela/tflite/PadV2Options.py b/ethosu/vela/tflite/PadV2Options.py
new file mode 100644
index 00000000..5ea0d70c
--- /dev/null
+++ b/ethosu/vela/tflite/PadV2Options.py
@@ -0,0 +1,22 @@
+# automatically generated by the FlatBuffers compiler, do not modify
+
+# namespace: tflite
+
+import flatbuffers
+
+class PadV2Options(object):
+ __slots__ = ['_tab']
+
+ @classmethod
+ def GetRootAsPadV2Options(cls, buf, offset):
+ n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset)
+ x = PadV2Options()
+ x.Init(buf, n + offset)
+ return x
+
+ # PadV2Options
+ def Init(self, buf, pos):
+ self._tab = flatbuffers.table.Table(buf, pos)
+
+def PadV2OptionsStart(builder): builder.StartObject(0)
+def PadV2OptionsEnd(builder): return builder.EndObject()
diff --git a/ethosu/vela/tflite/Padding.py b/ethosu/vela/tflite/Padding.py
new file mode 100644
index 00000000..168bf74c
--- /dev/null
+++ b/ethosu/vela/tflite/Padding.py
@@ -0,0 +1,7 @@
+# automatically generated by the FlatBuffers compiler, do not modify
+
+# namespace: tflite
+
+class Padding(object):
+ SAME = 0
+ VALID = 1
diff --git a/ethosu/vela/tflite/Pool2DOptions.py b/ethosu/vela/tflite/Pool2DOptions.py
new file mode 100644
index 00000000..b8b9f178
--- /dev/null
+++ b/ethosu/vela/tflite/Pool2DOptions.py
@@ -0,0 +1,70 @@
+# automatically generated by the FlatBuffers compiler, do not modify
+
+# namespace: tflite
+
+import flatbuffers
+
+class Pool2DOptions(object):
+ __slots__ = ['_tab']
+
+ @classmethod
+ def GetRootAsPool2DOptions(cls, buf, offset):
+ n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset)
+ x = Pool2DOptions()
+ x.Init(buf, n + offset)
+ return x
+
+ # Pool2DOptions
+ def Init(self, buf, pos):
+ self._tab = flatbuffers.table.Table(buf, pos)
+
+ # Pool2DOptions
+ def Padding(self):
+ o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4))
+ if o != 0:
+ return self._tab.Get(flatbuffers.number_types.Int8Flags, o + self._tab.Pos)
+ return 0
+
+ # Pool2DOptions
+ def StrideW(self):
+ o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(6))
+ if o != 0:
+ return self._tab.Get(flatbuffers.number_types.Int32Flags, o + self._tab.Pos)
+ return 0
+
+ # Pool2DOptions
+ def StrideH(self):
+ o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(8))
+ if o != 0:
+ return self._tab.Get(flatbuffers.number_types.Int32Flags, o + self._tab.Pos)
+ return 0
+
+ # Pool2DOptions
+ def FilterWidth(self):
+ o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(10))
+ if o != 0:
+ return self._tab.Get(flatbuffers.number_types.Int32Flags, o + self._tab.Pos)
+ return 0
+
+ # Pool2DOptions
+ def FilterHeight(self):
+ o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(12))
+ if o != 0:
+ return self._tab.Get(flatbuffers.number_types.Int32Flags, o + self._tab.Pos)
+ return 0
+
+ # Pool2DOptions
+ def FusedActivationFunction(self):
+ o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(14))
+ if o != 0:
+ return self._tab.Get(flatbuffers.number_types.Int8Flags, o + self._tab.Pos)
+ return 0
+
+def Pool2DOptionsStart(builder): builder.StartObject(6)
+def Pool2DOptionsAddPadding(builder, padding): builder.PrependInt8Slot(0, padding, 0)
+def Pool2DOptionsAddStrideW(builder, strideW): builder.PrependInt32Slot(1, strideW, 0)
+def Pool2DOptionsAddStrideH(builder, strideH): builder.PrependInt32Slot(2, strideH, 0)
+def Pool2DOptionsAddFilterWidth(builder, filterWidth): builder.PrependInt32Slot(3, filterWidth, 0)
+def Pool2DOptionsAddFilterHeight(builder, filterHeight): builder.PrependInt32Slot(4, filterHeight, 0)
+def Pool2DOptionsAddFusedActivationFunction(builder, fusedActivationFunction): builder.PrependInt8Slot(5, fusedActivationFunction, 0)
+def Pool2DOptionsEnd(builder): return builder.EndObject()
diff --git a/ethosu/vela/tflite/PowOptions.py b/ethosu/vela/tflite/PowOptions.py
new file mode 100644
index 00000000..666ca488
--- /dev/null
+++ b/ethosu/vela/tflite/PowOptions.py
@@ -0,0 +1,22 @@
+# automatically generated by the FlatBuffers compiler, do not modify
+
+# namespace: tflite
+
+import flatbuffers
+
+class PowOptions(object):
+ __slots__ = ['_tab']
+
+ @classmethod
+ def GetRootAsPowOptions(cls, buf, offset):
+ n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset)
+ x = PowOptions()
+ x.Init(buf, n + offset)
+ return x
+
+ # PowOptions
+ def Init(self, buf, pos):
+ self._tab = flatbuffers.table.Table(buf, pos)
+
+def PowOptionsStart(builder): builder.StartObject(0)
+def PowOptionsEnd(builder): return builder.EndObject()
diff --git a/ethosu/vela/tflite/QuantizationDetails.py b/ethosu/vela/tflite/QuantizationDetails.py
new file mode 100644
index 00000000..8d53af96
--- /dev/null
+++ b/ethosu/vela/tflite/QuantizationDetails.py
@@ -0,0 +1,7 @@
+# automatically generated by the FlatBuffers compiler, do not modify
+
+# namespace: tflite
+
+class QuantizationDetails(object):
+ NONE = 0
+ CustomQuantization = 1
diff --git a/ethosu/vela/tflite/QuantizationParameters.py b/ethosu/vela/tflite/QuantizationParameters.py
new file mode 100644
index 00000000..fcd686cf
--- /dev/null
+++ b/ethosu/vela/tflite/QuantizationParameters.py
@@ -0,0 +1,145 @@
+# automatically generated by the FlatBuffers compiler, do not modify
+
+# namespace: tflite
+
+import flatbuffers
+
+class QuantizationParameters(object):
+ __slots__ = ['_tab']
+
+ @classmethod
+ def GetRootAsQuantizationParameters(cls, buf, offset):
+ n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset)
+ x = QuantizationParameters()
+ x.Init(buf, n + offset)
+ return x
+
+ # QuantizationParameters
+ def Init(self, buf, pos):
+ self._tab = flatbuffers.table.Table(buf, pos)
+
+ # QuantizationParameters
+ def Min(self, j):
+ o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4))
+ if o != 0:
+ a = self._tab.Vector(o)
+ return self._tab.Get(flatbuffers.number_types.Float32Flags, a + flatbuffers.number_types.UOffsetTFlags.py_type(j * 4))
+ return 0
+
+ # QuantizationParameters
+ def MinAsNumpy(self):
+ o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4))
+ if o != 0:
+ return self._tab.GetVectorAsNumpy(flatbuffers.number_types.Float32Flags, o)
+ return 0
+
+ # QuantizationParameters
+ def MinLength(self):
+ o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4))
+ if o != 0:
+ return self._tab.VectorLen(o)
+ return 0
+
+ # QuantizationParameters
+ def Max(self, j):
+ o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(6))
+ if o != 0:
+ a = self._tab.Vector(o)
+ return self._tab.Get(flatbuffers.number_types.Float32Flags, a + flatbuffers.number_types.UOffsetTFlags.py_type(j * 4))
+ return 0
+
+ # QuantizationParameters
+ def MaxAsNumpy(self):
+ o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(6))
+ if o != 0:
+ return self._tab.GetVectorAsNumpy(flatbuffers.number_types.Float32Flags, o)
+ return 0
+
+ # QuantizationParameters
+ def MaxLength(self):
+ o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(6))
+ if o != 0:
+ return self._tab.VectorLen(o)
+ return 0
+
+ # QuantizationParameters
+ def Scale(self, j):
+ o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(8))
+ if o != 0:
+ a = self._tab.Vector(o)
+ return self._tab.Get(flatbuffers.number_types.Float32Flags, a + flatbuffers.number_types.UOffsetTFlags.py_type(j * 4))
+ return 0
+
+ # QuantizationParameters
+ def ScaleAsNumpy(self):
+ o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(8))
+ if o != 0:
+ return self._tab.GetVectorAsNumpy(flatbuffers.number_types.Float32Flags, o)
+ return 0
+
+ # QuantizationParameters
+ def ScaleLength(self):
+ o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(8))
+ if o != 0:
+ return self._tab.VectorLen(o)
+ return 0
+
+ # QuantizationParameters
+ def ZeroPoint(self, j):
+ o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(10))
+ if o != 0:
+ a = self._tab.Vector(o)
+ return self._tab.Get(flatbuffers.number_types.Int64Flags, a + flatbuffers.number_types.UOffsetTFlags.py_type(j * 8))
+ return 0
+
+ # QuantizationParameters
+ def ZeroPointAsNumpy(self):
+ o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(10))
+ if o != 0:
+ return self._tab.GetVectorAsNumpy(flatbuffers.number_types.Int64Flags, o)
+ return 0
+
+ # QuantizationParameters
+ def ZeroPointLength(self):
+ o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(10))
+ if o != 0:
+ return self._tab.VectorLen(o)
+ return 0
+
+ # QuantizationParameters
+ def DetailsType(self):
+ o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(12))
+ if o != 0:
+ return self._tab.Get(flatbuffers.number_types.Uint8Flags, o + self._tab.Pos)
+ return 0
+
+ # QuantizationParameters
+ def Details(self):
+ o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(14))
+ if o != 0:
+ from flatbuffers.table import Table
+ obj = Table(bytearray(), 0)
+ self._tab.Union(obj, o)
+ return obj
+ return None
+
+ # QuantizationParameters
+ def QuantizedDimension(self):
+ o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(16))
+ if o != 0:
+ return self._tab.Get(flatbuffers.number_types.Int32Flags, o + self._tab.Pos)
+ return 0
+
+def QuantizationParametersStart(builder): builder.StartObject(7)
+def QuantizationParametersAddMin(builder, min): builder.PrependUOffsetTRelativeSlot(0, flatbuffers.number_types.UOffsetTFlags.py_type(min), 0)
+def QuantizationParametersStartMinVector(builder, numElems): return builder.StartVector(4, numElems, 4)
+def QuantizationParametersAddMax(builder, max): builder.PrependUOffsetTRelativeSlot(1, flatbuffers.number_types.UOffsetTFlags.py_type(max), 0)
+def QuantizationParametersStartMaxVector(builder, numElems): return builder.StartVector(4, numElems, 4)
+def QuantizationParametersAddScale(builder, scale): builder.PrependUOffsetTRelativeSlot(2, flatbuffers.number_types.UOffsetTFlags.py_type(scale), 0)
+def QuantizationParametersStartScaleVector(builder, numElems): return builder.StartVector(4, numElems, 4)
+def QuantizationParametersAddZeroPoint(builder, zeroPoint): builder.PrependUOffsetTRelativeSlot(3, flatbuffers.number_types.UOffsetTFlags.py_type(zeroPoint), 0)
+def QuantizationParametersStartZeroPointVector(builder, numElems): return builder.StartVector(8, numElems, 8)
+def QuantizationParametersAddDetailsType(builder, detailsType): builder.PrependUint8Slot(4, detailsType, 0)
+def QuantizationParametersAddDetails(builder, details): builder.PrependUOffsetTRelativeSlot(5, flatbuffers.number_types.UOffsetTFlags.py_type(details), 0)
+def QuantizationParametersAddQuantizedDimension(builder, quantizedDimension): builder.PrependInt32Slot(6, quantizedDimension, 0)
+def QuantizationParametersEnd(builder): return builder.EndObject()
diff --git a/ethosu/vela/tflite/QuantizeOptions.py b/ethosu/vela/tflite/QuantizeOptions.py
new file mode 100644
index 00000000..28af8cc9
--- /dev/null
+++ b/ethosu/vela/tflite/QuantizeOptions.py
@@ -0,0 +1,22 @@
+# automatically generated by the FlatBuffers compiler, do not modify
+
+# namespace: tflite
+
+import flatbuffers
+
+class QuantizeOptions(object):
+ __slots__ = ['_tab']
+
+ @classmethod
+ def GetRootAsQuantizeOptions(cls, buf, offset):
+ n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset)
+ x = QuantizeOptions()
+ x.Init(buf, n + offset)
+ return x
+
+ # QuantizeOptions
+ def Init(self, buf, pos):
+ self._tab = flatbuffers.table.Table(buf, pos)
+
+def QuantizeOptionsStart(builder): builder.StartObject(0)
+def QuantizeOptionsEnd(builder): return builder.EndObject()
diff --git a/ethosu/vela/tflite/RNNOptions.py b/ethosu/vela/tflite/RNNOptions.py
new file mode 100644
index 00000000..3cfdb6af
--- /dev/null
+++ b/ethosu/vela/tflite/RNNOptions.py
@@ -0,0 +1,30 @@
+# automatically generated by the FlatBuffers compiler, do not modify
+
+# namespace: tflite
+
+import flatbuffers
+
+class RNNOptions(object):
+ __slots__ = ['_tab']
+
+ @classmethod
+ def GetRootAsRNNOptions(cls, buf, offset):
+ n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset)
+ x = RNNOptions()
+ x.Init(buf, n + offset)
+ return x
+
+ # RNNOptions
+ def Init(self, buf, pos):
+ self._tab = flatbuffers.table.Table(buf, pos)
+
+ # RNNOptions
+ def FusedActivationFunction(self):
+ o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4))
+ if o != 0:
+ return self._tab.Get(flatbuffers.number_types.Int8Flags, o + self._tab.Pos)
+ return 0
+
+def RNNOptionsStart(builder): builder.StartObject(1)
+def RNNOptionsAddFusedActivationFunction(builder, fusedActivationFunction): builder.PrependInt8Slot(0, fusedActivationFunction, 0)
+def RNNOptionsEnd(builder): return builder.EndObject()
diff --git a/ethosu/vela/tflite/RangeOptions.py b/ethosu/vela/tflite/RangeOptions.py
new file mode 100644
index 00000000..cb705b57
--- /dev/null
+++ b/ethosu/vela/tflite/RangeOptions.py
@@ -0,0 +1,22 @@
+# automatically generated by the FlatBuffers compiler, do not modify
+
+# namespace: tflite
+
+import flatbuffers
+
+class RangeOptions(object):
+ __slots__ = ['_tab']
+
+ @classmethod
+ def GetRootAsRangeOptions(cls, buf, offset):
+ n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset)
+ x = RangeOptions()
+ x.Init(buf, n + offset)
+ return x
+
+ # RangeOptions
+ def Init(self, buf, pos):
+ self._tab = flatbuffers.table.Table(buf, pos)
+
+def RangeOptionsStart(builder): builder.StartObject(0)
+def RangeOptionsEnd(builder): return builder.EndObject()
diff --git a/ethosu/vela/tflite/RankOptions.py b/ethosu/vela/tflite/RankOptions.py
new file mode 100644
index 00000000..4e4a5ecd
--- /dev/null
+++ b/ethosu/vela/tflite/RankOptions.py
@@ -0,0 +1,22 @@
+# automatically generated by the FlatBuffers compiler, do not modify
+
+# namespace: tflite
+
+import flatbuffers
+
+class RankOptions(object):
+ __slots__ = ['_tab']
+
+ @classmethod
+ def GetRootAsRankOptions(cls, buf, offset):
+ n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset)
+ x = RankOptions()
+ x.Init(buf, n + offset)
+ return x
+
+ # RankOptions
+ def Init(self, buf, pos):
+ self._tab = flatbuffers.table.Table(buf, pos)
+
+def RankOptionsStart(builder): builder.StartObject(0)
+def RankOptionsEnd(builder): return builder.EndObject()
diff --git a/ethosu/vela/tflite/ReducerOptions.py b/ethosu/vela/tflite/ReducerOptions.py
new file mode 100644
index 00000000..93bbde17
--- /dev/null
+++ b/ethosu/vela/tflite/ReducerOptions.py
@@ -0,0 +1,30 @@
+# automatically generated by the FlatBuffers compiler, do not modify
+
+# namespace: tflite
+
+import flatbuffers
+
+class ReducerOptions(object):
+ __slots__ = ['_tab']
+
+ @classmethod
+ def GetRootAsReducerOptions(cls, buf, offset):
+ n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset)
+ x = ReducerOptions()
+ x.Init(buf, n + offset)
+ return x
+
+ # ReducerOptions
+ def Init(self, buf, pos):
+ self._tab = flatbuffers.table.Table(buf, pos)
+
+ # ReducerOptions
+ def KeepDims(self):
+ o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4))
+ if o != 0:
+ return bool(self._tab.Get(flatbuffers.number_types.BoolFlags, o + self._tab.Pos))
+ return False
+
+def ReducerOptionsStart(builder): builder.StartObject(1)
+def ReducerOptionsAddKeepDims(builder, keepDims): builder.PrependBoolSlot(0, keepDims, 0)
+def ReducerOptionsEnd(builder): return builder.EndObject()
diff --git a/ethosu/vela/tflite/ReshapeOptions.py b/ethosu/vela/tflite/ReshapeOptions.py
new file mode 100644
index 00000000..157d45d9
--- /dev/null
+++ b/ethosu/vela/tflite/ReshapeOptions.py
@@ -0,0 +1,46 @@
+# automatically generated by the FlatBuffers compiler, do not modify
+
+# namespace: tflite
+
+import flatbuffers
+
+class ReshapeOptions(object):
+ __slots__ = ['_tab']
+
+ @classmethod
+ def GetRootAsReshapeOptions(cls, buf, offset):
+ n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset)
+ x = ReshapeOptions()
+ x.Init(buf, n + offset)
+ return x
+
+ # ReshapeOptions
+ def Init(self, buf, pos):
+ self._tab = flatbuffers.table.Table(buf, pos)
+
+ # ReshapeOptions
+ def NewShape(self, j):
+ o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4))
+ if o != 0:
+ a = self._tab.Vector(o)
+ return self._tab.Get(flatbuffers.number_types.Int32Flags, a + flatbuffers.number_types.UOffsetTFlags.py_type(j * 4))
+ return 0
+
+ # ReshapeOptions
+ def NewShapeAsNumpy(self):
+ o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4))
+ if o != 0:
+ return self._tab.GetVectorAsNumpy(flatbuffers.number_types.Int32Flags, o)
+ return 0
+
+ # ReshapeOptions
+ def NewShapeLength(self):
+ o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4))
+ if o != 0:
+ return self._tab.VectorLen(o)
+ return 0
+
+def ReshapeOptionsStart(builder): builder.StartObject(1)
+def ReshapeOptionsAddNewShape(builder, newShape): builder.PrependUOffsetTRelativeSlot(0, flatbuffers.number_types.UOffsetTFlags.py_type(newShape), 0)
+def ReshapeOptionsStartNewShapeVector(builder, numElems): return builder.StartVector(4, numElems, 4)
+def ReshapeOptionsEnd(builder): return builder.EndObject()
diff --git a/ethosu/vela/tflite/ResizeBilinearOptions.py b/ethosu/vela/tflite/ResizeBilinearOptions.py
new file mode 100644
index 00000000..fb05ca4b
--- /dev/null
+++ b/ethosu/vela/tflite/ResizeBilinearOptions.py
@@ -0,0 +1,38 @@
+# automatically generated by the FlatBuffers compiler, do not modify
+
+# namespace: tflite
+
+import flatbuffers
+
+class ResizeBilinearOptions(object):
+ __slots__ = ['_tab']
+
+ @classmethod
+ def GetRootAsResizeBilinearOptions(cls, buf, offset):
+ n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset)
+ x = ResizeBilinearOptions()
+ x.Init(buf, n + offset)
+ return x
+
+ # ResizeBilinearOptions
+ def Init(self, buf, pos):
+ self._tab = flatbuffers.table.Table(buf, pos)
+
+ # ResizeBilinearOptions
+ def AlignCorners(self):
+ o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(8))
+ if o != 0:
+ return bool(self._tab.Get(flatbuffers.number_types.BoolFlags, o + self._tab.Pos))
+ return False
+
+ # ResizeBilinearOptions
+ def HalfPixelCenters(self):
+ o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(10))
+ if o != 0:
+ return bool(self._tab.Get(flatbuffers.number_types.BoolFlags, o + self._tab.Pos))
+ return False
+
+def ResizeBilinearOptionsStart(builder): builder.StartObject(4)
+def ResizeBilinearOptionsAddAlignCorners(builder, alignCorners): builder.PrependBoolSlot(2, alignCorners, 0)
+def ResizeBilinearOptionsAddHalfPixelCenters(builder, halfPixelCenters): builder.PrependBoolSlot(3, halfPixelCenters, 0)
+def ResizeBilinearOptionsEnd(builder): return builder.EndObject()
diff --git a/ethosu/vela/tflite/ResizeNearestNeighborOptions.py b/ethosu/vela/tflite/ResizeNearestNeighborOptions.py
new file mode 100644
index 00000000..4b166e95
--- /dev/null
+++ b/ethosu/vela/tflite/ResizeNearestNeighborOptions.py
@@ -0,0 +1,30 @@
+# automatically generated by the FlatBuffers compiler, do not modify
+
+# namespace: tflite
+
+import flatbuffers
+
+class ResizeNearestNeighborOptions(object):
+ __slots__ = ['_tab']
+
+ @classmethod
+ def GetRootAsResizeNearestNeighborOptions(cls, buf, offset):
+ n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset)
+ x = ResizeNearestNeighborOptions()
+ x.Init(buf, n + offset)
+ return x
+
+ # ResizeNearestNeighborOptions
+ def Init(self, buf, pos):
+ self._tab = flatbuffers.table.Table(buf, pos)
+
+ # ResizeNearestNeighborOptions
+ def AlignCorners(self):
+ o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4))
+ if o != 0:
+ return bool(self._tab.Get(flatbuffers.number_types.BoolFlags, o + self._tab.Pos))
+ return False
+
+def ResizeNearestNeighborOptionsStart(builder): builder.StartObject(1)
+def ResizeNearestNeighborOptionsAddAlignCorners(builder, alignCorners): builder.PrependBoolSlot(0, alignCorners, 0)
+def ResizeNearestNeighborOptionsEnd(builder): return builder.EndObject()
diff --git a/ethosu/vela/tflite/ReverseSequenceOptions.py b/ethosu/vela/tflite/ReverseSequenceOptions.py
new file mode 100644
index 00000000..cbaf96db
--- /dev/null
+++ b/ethosu/vela/tflite/ReverseSequenceOptions.py
@@ -0,0 +1,38 @@
+# automatically generated by the FlatBuffers compiler, do not modify
+
+# namespace: tflite
+
+import flatbuffers
+
+class ReverseSequenceOptions(object):
+ __slots__ = ['_tab']
+
+ @classmethod
+ def GetRootAsReverseSequenceOptions(cls, buf, offset):
+ n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset)
+ x = ReverseSequenceOptions()
+ x.Init(buf, n + offset)
+ return x
+
+ # ReverseSequenceOptions
+ def Init(self, buf, pos):
+ self._tab = flatbuffers.table.Table(buf, pos)
+
+ # ReverseSequenceOptions
+ def SeqDim(self):
+ o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4))
+ if o != 0:
+ return self._tab.Get(flatbuffers.number_types.Int32Flags, o + self._tab.Pos)
+ return 0
+
+ # ReverseSequenceOptions
+ def BatchDim(self):
+ o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(6))
+ if o != 0:
+ return self._tab.Get(flatbuffers.number_types.Int32Flags, o + self._tab.Pos)
+ return 0
+
+def ReverseSequenceOptionsStart(builder): builder.StartObject(2)
+def ReverseSequenceOptionsAddSeqDim(builder, seqDim): builder.PrependInt32Slot(0, seqDim, 0)
+def ReverseSequenceOptionsAddBatchDim(builder, batchDim): builder.PrependInt32Slot(1, batchDim, 0)
+def ReverseSequenceOptionsEnd(builder): return builder.EndObject()
diff --git a/ethosu/vela/tflite/ReverseV2Options.py b/ethosu/vela/tflite/ReverseV2Options.py
new file mode 100644
index 00000000..dbac9362
--- /dev/null
+++ b/ethosu/vela/tflite/ReverseV2Options.py
@@ -0,0 +1,22 @@
+# automatically generated by the FlatBuffers compiler, do not modify
+
+# namespace: tflite
+
+import flatbuffers
+
+class ReverseV2Options(object):
+ __slots__ = ['_tab']
+
+ @classmethod
+ def GetRootAsReverseV2Options(cls, buf, offset):
+ n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset)
+ x = ReverseV2Options()
+ x.Init(buf, n + offset)
+ return x
+
+ # ReverseV2Options
+ def Init(self, buf, pos):
+ self._tab = flatbuffers.table.Table(buf, pos)
+
+def ReverseV2OptionsStart(builder): builder.StartObject(0)
+def ReverseV2OptionsEnd(builder): return builder.EndObject()
diff --git a/ethosu/vela/tflite/SVDFOptions.py b/ethosu/vela/tflite/SVDFOptions.py
new file mode 100644
index 00000000..6f391db1
--- /dev/null
+++ b/ethosu/vela/tflite/SVDFOptions.py
@@ -0,0 +1,38 @@
+# automatically generated by the FlatBuffers compiler, do not modify
+
+# namespace: tflite
+
+import flatbuffers
+
+class SVDFOptions(object):
+ __slots__ = ['_tab']
+
+ @classmethod
+ def GetRootAsSVDFOptions(cls, buf, offset):
+ n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset)
+ x = SVDFOptions()
+ x.Init(buf, n + offset)
+ return x
+
+ # SVDFOptions
+ def Init(self, buf, pos):
+ self._tab = flatbuffers.table.Table(buf, pos)
+
+ # SVDFOptions
+ def Rank(self):
+ o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4))
+ if o != 0:
+ return self._tab.Get(flatbuffers.number_types.Int32Flags, o + self._tab.Pos)
+ return 0
+
+ # SVDFOptions
+ def FusedActivationFunction(self):
+ o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(6))
+ if o != 0:
+ return self._tab.Get(flatbuffers.number_types.Int8Flags, o + self._tab.Pos)
+ return 0
+
+def SVDFOptionsStart(builder): builder.StartObject(2)
+def SVDFOptionsAddRank(builder, rank): builder.PrependInt32Slot(0, rank, 0)
+def SVDFOptionsAddFusedActivationFunction(builder, fusedActivationFunction): builder.PrependInt8Slot(1, fusedActivationFunction, 0)
+def SVDFOptionsEnd(builder): return builder.EndObject()
diff --git a/ethosu/vela/tflite/ScatterNdOptions.py b/ethosu/vela/tflite/ScatterNdOptions.py
new file mode 100644
index 00000000..e6bf3a11
--- /dev/null
+++ b/ethosu/vela/tflite/ScatterNdOptions.py
@@ -0,0 +1,22 @@
+# automatically generated by the FlatBuffers compiler, do not modify
+
+# namespace: tflite
+
+import flatbuffers
+
+class ScatterNdOptions(object):
+ __slots__ = ['_tab']
+
+ @classmethod
+ def GetRootAsScatterNdOptions(cls, buf, offset):
+ n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset)
+ x = ScatterNdOptions()
+ x.Init(buf, n + offset)
+ return x
+
+ # ScatterNdOptions
+ def Init(self, buf, pos):
+ self._tab = flatbuffers.table.Table(buf, pos)
+
+def ScatterNdOptionsStart(builder): builder.StartObject(0)
+def ScatterNdOptionsEnd(builder): return builder.EndObject()
diff --git a/ethosu/vela/tflite/SegmentSumOptions.py b/ethosu/vela/tflite/SegmentSumOptions.py
new file mode 100644
index 00000000..d1c32133
--- /dev/null
+++ b/ethosu/vela/tflite/SegmentSumOptions.py
@@ -0,0 +1,22 @@
+# automatically generated by the FlatBuffers compiler, do not modify
+
+# namespace: tflite
+
+import flatbuffers
+
+class SegmentSumOptions(object):
+ __slots__ = ['_tab']
+
+ @classmethod
+ def GetRootAsSegmentSumOptions(cls, buf, offset):
+ n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset)
+ x = SegmentSumOptions()
+ x.Init(buf, n + offset)
+ return x
+
+ # SegmentSumOptions
+ def Init(self, buf, pos):
+ self._tab = flatbuffers.table.Table(buf, pos)
+
+def SegmentSumOptionsStart(builder): builder.StartObject(0)
+def SegmentSumOptionsEnd(builder): return builder.EndObject()
diff --git a/ethosu/vela/tflite/SelectOptions.py b/ethosu/vela/tflite/SelectOptions.py
new file mode 100644
index 00000000..d67daf36
--- /dev/null
+++ b/ethosu/vela/tflite/SelectOptions.py
@@ -0,0 +1,22 @@
+# automatically generated by the FlatBuffers compiler, do not modify
+
+# namespace: tflite
+
+import flatbuffers
+
+class SelectOptions(object):
+ __slots__ = ['_tab']
+
+ @classmethod
+ def GetRootAsSelectOptions(cls, buf, offset):
+ n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset)
+ x = SelectOptions()
+ x.Init(buf, n + offset)
+ return x
+
+ # SelectOptions
+ def Init(self, buf, pos):
+ self._tab = flatbuffers.table.Table(buf, pos)
+
+def SelectOptionsStart(builder): builder.StartObject(0)
+def SelectOptionsEnd(builder): return builder.EndObject()
diff --git a/ethosu/vela/tflite/SelectV2Options.py b/ethosu/vela/tflite/SelectV2Options.py
new file mode 100644
index 00000000..5d03fc2d
--- /dev/null
+++ b/ethosu/vela/tflite/SelectV2Options.py
@@ -0,0 +1,22 @@
+# automatically generated by the FlatBuffers compiler, do not modify
+
+# namespace: tflite
+
+import flatbuffers
+
+class SelectV2Options(object):
+ __slots__ = ['_tab']
+
+ @classmethod
+ def GetRootAsSelectV2Options(cls, buf, offset):
+ n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset)
+ x = SelectV2Options()
+ x.Init(buf, n + offset)
+ return x
+
+ # SelectV2Options
+ def Init(self, buf, pos):
+ self._tab = flatbuffers.table.Table(buf, pos)
+
+def SelectV2OptionsStart(builder): builder.StartObject(0)
+def SelectV2OptionsEnd(builder): return builder.EndObject()
diff --git a/ethosu/vela/tflite/SequenceRNNOptions.py b/ethosu/vela/tflite/SequenceRNNOptions.py
new file mode 100644
index 00000000..74a4954a
--- /dev/null
+++ b/ethosu/vela/tflite/SequenceRNNOptions.py
@@ -0,0 +1,38 @@
+# automatically generated by the FlatBuffers compiler, do not modify
+
+# namespace: tflite
+
+import flatbuffers
+
+class SequenceRNNOptions(object):
+ __slots__ = ['_tab']
+
+ @classmethod
+ def GetRootAsSequenceRNNOptions(cls, buf, offset):
+ n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset)
+ x = SequenceRNNOptions()
+ x.Init(buf, n + offset)
+ return x
+
+ # SequenceRNNOptions
+ def Init(self, buf, pos):
+ self._tab = flatbuffers.table.Table(buf, pos)
+
+ # SequenceRNNOptions
+ def TimeMajor(self):
+ o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4))
+ if o != 0:
+ return bool(self._tab.Get(flatbuffers.number_types.BoolFlags, o + self._tab.Pos))
+ return False
+
+ # SequenceRNNOptions
+ def FusedActivationFunction(self):
+ o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(6))
+ if o != 0:
+ return self._tab.Get(flatbuffers.number_types.Int8Flags, o + self._tab.Pos)
+ return 0
+
+def SequenceRNNOptionsStart(builder): builder.StartObject(2)
+def SequenceRNNOptionsAddTimeMajor(builder, timeMajor): builder.PrependBoolSlot(0, timeMajor, 0)
+def SequenceRNNOptionsAddFusedActivationFunction(builder, fusedActivationFunction): builder.PrependInt8Slot(1, fusedActivationFunction, 0)
+def SequenceRNNOptionsEnd(builder): return builder.EndObject()
diff --git a/ethosu/vela/tflite/ShapeOptions.py b/ethosu/vela/tflite/ShapeOptions.py
new file mode 100644
index 00000000..2d24c05f
--- /dev/null
+++ b/ethosu/vela/tflite/ShapeOptions.py
@@ -0,0 +1,30 @@
+# automatically generated by the FlatBuffers compiler, do not modify
+
+# namespace: tflite
+
+import flatbuffers
+
+class ShapeOptions(object):
+ __slots__ = ['_tab']
+
+ @classmethod
+ def GetRootAsShapeOptions(cls, buf, offset):
+ n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset)
+ x = ShapeOptions()
+ x.Init(buf, n + offset)
+ return x
+
+ # ShapeOptions
+ def Init(self, buf, pos):
+ self._tab = flatbuffers.table.Table(buf, pos)
+
+ # ShapeOptions
+ def OutType(self):
+ o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4))
+ if o != 0:
+ return self._tab.Get(flatbuffers.number_types.Int8Flags, o + self._tab.Pos)
+ return 0
+
+def ShapeOptionsStart(builder): builder.StartObject(1)
+def ShapeOptionsAddOutType(builder, outType): builder.PrependInt8Slot(0, outType, 0)
+def ShapeOptionsEnd(builder): return builder.EndObject()
diff --git a/ethosu/vela/tflite/SkipGramOptions.py b/ethosu/vela/tflite/SkipGramOptions.py
new file mode 100644
index 00000000..0e8bdc1d
--- /dev/null
+++ b/ethosu/vela/tflite/SkipGramOptions.py
@@ -0,0 +1,46 @@
+# automatically generated by the FlatBuffers compiler, do not modify
+
+# namespace: tflite
+
+import flatbuffers
+
+class SkipGramOptions(object):
+ __slots__ = ['_tab']
+
+ @classmethod
+ def GetRootAsSkipGramOptions(cls, buf, offset):
+ n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset)
+ x = SkipGramOptions()
+ x.Init(buf, n + offset)
+ return x
+
+ # SkipGramOptions
+ def Init(self, buf, pos):
+ self._tab = flatbuffers.table.Table(buf, pos)
+
+ # SkipGramOptions
+ def NgramSize(self):
+ o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4))
+ if o != 0:
+ return self._tab.Get(flatbuffers.number_types.Int32Flags, o + self._tab.Pos)
+ return 0
+
+ # SkipGramOptions
+ def MaxSkipSize(self):
+ o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(6))
+ if o != 0:
+ return self._tab.Get(flatbuffers.number_types.Int32Flags, o + self._tab.Pos)
+ return 0
+
+ # SkipGramOptions
+ def IncludeAllNgrams(self):
+ o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(8))
+ if o != 0:
+ return bool(self._tab.Get(flatbuffers.number_types.BoolFlags, o + self._tab.Pos))
+ return False
+
+def SkipGramOptionsStart(builder): builder.StartObject(3)
+def SkipGramOptionsAddNgramSize(builder, ngramSize): builder.PrependInt32Slot(0, ngramSize, 0)
+def SkipGramOptionsAddMaxSkipSize(builder, maxSkipSize): builder.PrependInt32Slot(1, maxSkipSize, 0)
+def SkipGramOptionsAddIncludeAllNgrams(builder, includeAllNgrams): builder.PrependBoolSlot(2, includeAllNgrams, 0)
+def SkipGramOptionsEnd(builder): return builder.EndObject()
diff --git a/ethosu/vela/tflite/SliceOptions.py b/ethosu/vela/tflite/SliceOptions.py
new file mode 100644
index 00000000..4b41568d
--- /dev/null
+++ b/ethosu/vela/tflite/SliceOptions.py
@@ -0,0 +1,22 @@
+# automatically generated by the FlatBuffers compiler, do not modify
+
+# namespace: tflite
+
+import flatbuffers
+
+class SliceOptions(object):
+ __slots__ = ['_tab']
+
+ @classmethod
+ def GetRootAsSliceOptions(cls, buf, offset):
+ n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset)
+ x = SliceOptions()
+ x.Init(buf, n + offset)
+ return x
+
+ # SliceOptions
+ def Init(self, buf, pos):
+ self._tab = flatbuffers.table.Table(buf, pos)
+
+def SliceOptionsStart(builder): builder.StartObject(0)
+def SliceOptionsEnd(builder): return builder.EndObject()
diff --git a/ethosu/vela/tflite/SoftmaxOptions.py b/ethosu/vela/tflite/SoftmaxOptions.py
new file mode 100644
index 00000000..a7168534
--- /dev/null
+++ b/ethosu/vela/tflite/SoftmaxOptions.py
@@ -0,0 +1,30 @@
+# automatically generated by the FlatBuffers compiler, do not modify
+
+# namespace: tflite
+
+import flatbuffers
+
+class SoftmaxOptions(object):
+ __slots__ = ['_tab']
+
+ @classmethod
+ def GetRootAsSoftmaxOptions(cls, buf, offset):
+ n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset)
+ x = SoftmaxOptions()
+ x.Init(buf, n + offset)
+ return x
+
+ # SoftmaxOptions
+ def Init(self, buf, pos):
+ self._tab = flatbuffers.table.Table(buf, pos)
+
+ # SoftmaxOptions
+ def Beta(self):
+ o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4))
+ if o != 0:
+ return self._tab.Get(flatbuffers.number_types.Float32Flags, o + self._tab.Pos)
+ return 0.0
+
+def SoftmaxOptionsStart(builder): builder.StartObject(1)
+def SoftmaxOptionsAddBeta(builder, beta): builder.PrependFloat32Slot(0, beta, 0.0)
+def SoftmaxOptionsEnd(builder): return builder.EndObject()
diff --git a/ethosu/vela/tflite/SpaceToBatchNDOptions.py b/ethosu/vela/tflite/SpaceToBatchNDOptions.py
new file mode 100644
index 00000000..b61ef96f
--- /dev/null
+++ b/ethosu/vela/tflite/SpaceToBatchNDOptions.py
@@ -0,0 +1,22 @@
+# automatically generated by the FlatBuffers compiler, do not modify
+
+# namespace: tflite
+
+import flatbuffers
+
+class SpaceToBatchNDOptions(object):
+ __slots__ = ['_tab']
+
+ @classmethod
+ def GetRootAsSpaceToBatchNDOptions(cls, buf, offset):
+ n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset)
+ x = SpaceToBatchNDOptions()
+ x.Init(buf, n + offset)
+ return x
+
+ # SpaceToBatchNDOptions
+ def Init(self, buf, pos):
+ self._tab = flatbuffers.table.Table(buf, pos)
+
+def SpaceToBatchNDOptionsStart(builder): builder.StartObject(0)
+def SpaceToBatchNDOptionsEnd(builder): return builder.EndObject()
diff --git a/ethosu/vela/tflite/SpaceToDepthOptions.py b/ethosu/vela/tflite/SpaceToDepthOptions.py
new file mode 100644
index 00000000..d571174a
--- /dev/null
+++ b/ethosu/vela/tflite/SpaceToDepthOptions.py
@@ -0,0 +1,30 @@
+# automatically generated by the FlatBuffers compiler, do not modify
+
+# namespace: tflite
+
+import flatbuffers
+
+class SpaceToDepthOptions(object):
+ __slots__ = ['_tab']
+
+ @classmethod
+ def GetRootAsSpaceToDepthOptions(cls, buf, offset):
+ n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset)
+ x = SpaceToDepthOptions()
+ x.Init(buf, n + offset)
+ return x
+
+ # SpaceToDepthOptions
+ def Init(self, buf, pos):
+ self._tab = flatbuffers.table.Table(buf, pos)
+
+ # SpaceToDepthOptions
+ def BlockSize(self):
+ o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4))
+ if o != 0:
+ return self._tab.Get(flatbuffers.number_types.Int32Flags, o + self._tab.Pos)
+ return 0
+
+def SpaceToDepthOptionsStart(builder): builder.StartObject(1)
+def SpaceToDepthOptionsAddBlockSize(builder, blockSize): builder.PrependInt32Slot(0, blockSize, 0)
+def SpaceToDepthOptionsEnd(builder): return builder.EndObject()
diff --git a/ethosu/vela/tflite/SparseIndexVector.py b/ethosu/vela/tflite/SparseIndexVector.py
new file mode 100644
index 00000000..e2c9db78
--- /dev/null
+++ b/ethosu/vela/tflite/SparseIndexVector.py
@@ -0,0 +1,9 @@
+# automatically generated by the FlatBuffers compiler, do not modify
+
+# namespace: tflite
+
+class SparseIndexVector(object):
+ NONE = 0
+ Int32Vector = 1
+ Uint16Vector = 2
+ Uint8Vector = 3
diff --git a/ethosu/vela/tflite/SparseToDenseOptions.py b/ethosu/vela/tflite/SparseToDenseOptions.py
new file mode 100644
index 00000000..826eee08
--- /dev/null
+++ b/ethosu/vela/tflite/SparseToDenseOptions.py
@@ -0,0 +1,30 @@
+# automatically generated by the FlatBuffers compiler, do not modify
+
+# namespace: tflite
+
+import flatbuffers
+
+class SparseToDenseOptions(object):
+ __slots__ = ['_tab']
+
+ @classmethod
+ def GetRootAsSparseToDenseOptions(cls, buf, offset):
+ n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset)
+ x = SparseToDenseOptions()
+ x.Init(buf, n + offset)
+ return x
+
+ # SparseToDenseOptions
+ def Init(self, buf, pos):
+ self._tab = flatbuffers.table.Table(buf, pos)
+
+ # SparseToDenseOptions
+ def ValidateIndices(self):
+ o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4))
+ if o != 0:
+ return bool(self._tab.Get(flatbuffers.number_types.BoolFlags, o + self._tab.Pos))
+ return False
+
+def SparseToDenseOptionsStart(builder): builder.StartObject(1)
+def SparseToDenseOptionsAddValidateIndices(builder, validateIndices): builder.PrependBoolSlot(0, validateIndices, 0)
+def SparseToDenseOptionsEnd(builder): return builder.EndObject()
diff --git a/ethosu/vela/tflite/SparsityParameters.py b/ethosu/vela/tflite/SparsityParameters.py
new file mode 100644
index 00000000..de550a67
--- /dev/null
+++ b/ethosu/vela/tflite/SparsityParameters.py
@@ -0,0 +1,92 @@
+# automatically generated by the FlatBuffers compiler, do not modify
+
+# namespace: tflite
+
+import flatbuffers
+
+class SparsityParameters(object):
+ __slots__ = ['_tab']
+
+ @classmethod
+ def GetRootAsSparsityParameters(cls, buf, offset):
+ n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset)
+ x = SparsityParameters()
+ x.Init(buf, n + offset)
+ return x
+
+ # SparsityParameters
+ def Init(self, buf, pos):
+ self._tab = flatbuffers.table.Table(buf, pos)
+
+ # SparsityParameters
+ def TraversalOrder(self, j):
+ o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4))
+ if o != 0:
+ a = self._tab.Vector(o)
+ return self._tab.Get(flatbuffers.number_types.Int32Flags, a + flatbuffers.number_types.UOffsetTFlags.py_type(j * 4))
+ return 0
+
+ # SparsityParameters
+ def TraversalOrderAsNumpy(self):
+ o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4))
+ if o != 0:
+ return self._tab.GetVectorAsNumpy(flatbuffers.number_types.Int32Flags, o)
+ return 0
+
+ # SparsityParameters
+ def TraversalOrderLength(self):
+ o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4))
+ if o != 0:
+ return self._tab.VectorLen(o)
+ return 0
+
+ # SparsityParameters
+ def BlockMap(self, j):
+ o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(6))
+ if o != 0:
+ a = self._tab.Vector(o)
+ return self._tab.Get(flatbuffers.number_types.Int32Flags, a + flatbuffers.number_types.UOffsetTFlags.py_type(j * 4))
+ return 0
+
+ # SparsityParameters
+ def BlockMapAsNumpy(self):
+ o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(6))
+ if o != 0:
+ return self._tab.GetVectorAsNumpy(flatbuffers.number_types.Int32Flags, o)
+ return 0
+
+ # SparsityParameters
+ def BlockMapLength(self):
+ o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(6))
+ if o != 0:
+ return self._tab.VectorLen(o)
+ return 0
+
+ # SparsityParameters
+ def DimMetadata(self, j):
+ o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(8))
+ if o != 0:
+ x = self._tab.Vector(o)
+ x += flatbuffers.number_types.UOffsetTFlags.py_type(j) * 4
+ x = self._tab.Indirect(x)
+ from .DimensionMetadata import DimensionMetadata
+ obj = DimensionMetadata()
+ obj.Init(self._tab.Bytes, x)
+ return obj
+ return None
+
+ # SparsityParameters
+ def DimMetadataLength(self):
+ o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(8))
+ if o != 0:
+ return self._tab.VectorLen(o)
+ return 0
+
+def SparsityParametersStart(builder): builder.StartObject(3)
+def SparsityParametersAddTraversalOrder(builder, traversalOrder): builder.PrependUOffsetTRelativeSlot(0, flatbuffers.number_types.UOffsetTFlags.py_type(traversalOrder), 0)
+def SparsityParametersStartTraversalOrderVector(builder, numElems): return builder.StartVector(4, numElems, 4)
+def SparsityParametersAddBlockMap(builder, blockMap): builder.PrependUOffsetTRelativeSlot(1, flatbuffers.number_types.UOffsetTFlags.py_type(blockMap), 0)
+def SparsityParametersStartBlockMapVector(builder, numElems): return builder.StartVector(4, numElems, 4)
+def SparsityParametersAddDimMetadata(builder, dimMetadata): builder.PrependUOffsetTRelativeSlot(2, flatbuffers.number_types.UOffsetTFlags.py_type(dimMetadata), 0)
+def SparsityParametersStartDimMetadataVector(builder, numElems): return builder.StartVector(4, numElems, 4)
+def SparsityParametersEnd(builder): return builder.EndObject()
diff --git a/ethosu/vela/tflite/SplitOptions.py b/ethosu/vela/tflite/SplitOptions.py
new file mode 100644
index 00000000..3207525b
--- /dev/null
+++ b/ethosu/vela/tflite/SplitOptions.py
@@ -0,0 +1,30 @@
+# automatically generated by the FlatBuffers compiler, do not modify
+
+# namespace: tflite
+
+import flatbuffers
+
+class SplitOptions(object):
+ __slots__ = ['_tab']
+
+ @classmethod
+ def GetRootAsSplitOptions(cls, buf, offset):
+ n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset)
+ x = SplitOptions()
+ x.Init(buf, n + offset)
+ return x
+
+ # SplitOptions
+ def Init(self, buf, pos):
+ self._tab = flatbuffers.table.Table(buf, pos)
+
+ # SplitOptions
+ def NumSplits(self):
+ o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4))
+ if o != 0:
+ return self._tab.Get(flatbuffers.number_types.Int32Flags, o + self._tab.Pos)
+ return 0
+
+def SplitOptionsStart(builder): builder.StartObject(1)
+def SplitOptionsAddNumSplits(builder, numSplits): builder.PrependInt32Slot(0, numSplits, 0)
+def SplitOptionsEnd(builder): return builder.EndObject()
diff --git a/ethosu/vela/tflite/SplitVOptions.py b/ethosu/vela/tflite/SplitVOptions.py
new file mode 100644
index 00000000..418959de
--- /dev/null
+++ b/ethosu/vela/tflite/SplitVOptions.py
@@ -0,0 +1,30 @@
+# automatically generated by the FlatBuffers compiler, do not modify
+
+# namespace: tflite
+
+import flatbuffers
+
+class SplitVOptions(object):
+ __slots__ = ['_tab']
+
+ @classmethod
+ def GetRootAsSplitVOptions(cls, buf, offset):
+ n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset)
+ x = SplitVOptions()
+ x.Init(buf, n + offset)
+ return x
+
+ # SplitVOptions
+ def Init(self, buf, pos):
+ self._tab = flatbuffers.table.Table(buf, pos)
+
+ # SplitVOptions
+ def NumSplits(self):
+ o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4))
+ if o != 0:
+ return self._tab.Get(flatbuffers.number_types.Int32Flags, o + self._tab.Pos)
+ return 0
+
+def SplitVOptionsStart(builder): builder.StartObject(1)
+def SplitVOptionsAddNumSplits(builder, numSplits): builder.PrependInt32Slot(0, numSplits, 0)
+def SplitVOptionsEnd(builder): return builder.EndObject()
diff --git a/ethosu/vela/tflite/SquareOptions.py b/ethosu/vela/tflite/SquareOptions.py
new file mode 100644
index 00000000..56633f6a
--- /dev/null
+++ b/ethosu/vela/tflite/SquareOptions.py
@@ -0,0 +1,22 @@
+# automatically generated by the FlatBuffers compiler, do not modify
+
+# namespace: tflite
+
+import flatbuffers
+
+class SquareOptions(object):
+ __slots__ = ['_tab']
+
+ @classmethod
+ def GetRootAsSquareOptions(cls, buf, offset):
+ n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset)
+ x = SquareOptions()
+ x.Init(buf, n + offset)
+ return x
+
+ # SquareOptions
+ def Init(self, buf, pos):
+ self._tab = flatbuffers.table.Table(buf, pos)
+
+def SquareOptionsStart(builder): builder.StartObject(0)
+def SquareOptionsEnd(builder): return builder.EndObject()
diff --git a/ethosu/vela/tflite/SquaredDifferenceOptions.py b/ethosu/vela/tflite/SquaredDifferenceOptions.py
new file mode 100644
index 00000000..906855d1
--- /dev/null
+++ b/ethosu/vela/tflite/SquaredDifferenceOptions.py
@@ -0,0 +1,22 @@
+# automatically generated by the FlatBuffers compiler, do not modify
+
+# namespace: tflite
+
+import flatbuffers
+
+class SquaredDifferenceOptions(object):
+ __slots__ = ['_tab']
+
+ @classmethod
+ def GetRootAsSquaredDifferenceOptions(cls, buf, offset):
+ n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset)
+ x = SquaredDifferenceOptions()
+ x.Init(buf, n + offset)
+ return x
+
+ # SquaredDifferenceOptions
+ def Init(self, buf, pos):
+ self._tab = flatbuffers.table.Table(buf, pos)
+
+def SquaredDifferenceOptionsStart(builder): builder.StartObject(0)
+def SquaredDifferenceOptionsEnd(builder): return builder.EndObject()
diff --git a/ethosu/vela/tflite/SqueezeOptions.py b/ethosu/vela/tflite/SqueezeOptions.py
new file mode 100644
index 00000000..25b294dc
--- /dev/null
+++ b/ethosu/vela/tflite/SqueezeOptions.py
@@ -0,0 +1,46 @@
+# automatically generated by the FlatBuffers compiler, do not modify
+
+# namespace: tflite
+
+import flatbuffers
+
+class SqueezeOptions(object):
+ __slots__ = ['_tab']
+
+ @classmethod
+ def GetRootAsSqueezeOptions(cls, buf, offset):
+ n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset)
+ x = SqueezeOptions()
+ x.Init(buf, n + offset)
+ return x
+
+ # SqueezeOptions
+ def Init(self, buf, pos):
+ self._tab = flatbuffers.table.Table(buf, pos)
+
+ # SqueezeOptions
+ def SqueezeDims(self, j):
+ o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4))
+ if o != 0:
+ a = self._tab.Vector(o)
+ return self._tab.Get(flatbuffers.number_types.Int32Flags, a + flatbuffers.number_types.UOffsetTFlags.py_type(j * 4))
+ return 0
+
+ # SqueezeOptions
+ def SqueezeDimsAsNumpy(self):
+ o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4))
+ if o != 0:
+ return self._tab.GetVectorAsNumpy(flatbuffers.number_types.Int32Flags, o)
+ return 0
+
+ # SqueezeOptions
+ def SqueezeDimsLength(self):
+ o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4))
+ if o != 0:
+ return self._tab.VectorLen(o)
+ return 0
+
+def SqueezeOptionsStart(builder): builder.StartObject(1)
+def SqueezeOptionsAddSqueezeDims(builder, squeezeDims): builder.PrependUOffsetTRelativeSlot(0, flatbuffers.number_types.UOffsetTFlags.py_type(squeezeDims), 0)
+def SqueezeOptionsStartSqueezeDimsVector(builder, numElems): return builder.StartVector(4, numElems, 4)
+def SqueezeOptionsEnd(builder): return builder.EndObject()
diff --git a/ethosu/vela/tflite/StridedSliceOptions.py b/ethosu/vela/tflite/StridedSliceOptions.py
new file mode 100644
index 00000000..3bbb36b8
--- /dev/null
+++ b/ethosu/vela/tflite/StridedSliceOptions.py
@@ -0,0 +1,62 @@
+# automatically generated by the FlatBuffers compiler, do not modify
+
+# namespace: tflite
+
+import flatbuffers
+
+class StridedSliceOptions(object):
+ __slots__ = ['_tab']
+
+ @classmethod
+ def GetRootAsStridedSliceOptions(cls, buf, offset):
+ n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset)
+ x = StridedSliceOptions()
+ x.Init(buf, n + offset)
+ return x
+
+ # StridedSliceOptions
+ def Init(self, buf, pos):
+ self._tab = flatbuffers.table.Table(buf, pos)
+
+ # StridedSliceOptions
+ def BeginMask(self):
+ o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4))
+ if o != 0:
+ return self._tab.Get(flatbuffers.number_types.Int32Flags, o + self._tab.Pos)
+ return 0
+
+ # StridedSliceOptions
+ def EndMask(self):
+ o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(6))
+ if o != 0:
+ return self._tab.Get(flatbuffers.number_types.Int32Flags, o + self._tab.Pos)
+ return 0
+
+ # StridedSliceOptions
+ def EllipsisMask(self):
+ o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(8))
+ if o != 0:
+ return self._tab.Get(flatbuffers.number_types.Int32Flags, o + self._tab.Pos)
+ return 0
+
+ # StridedSliceOptions
+ def NewAxisMask(self):
+ o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(10))
+ if o != 0:
+ return self._tab.Get(flatbuffers.number_types.Int32Flags, o + self._tab.Pos)
+ return 0
+
+ # StridedSliceOptions
+ def ShrinkAxisMask(self):
+ o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(12))
+ if o != 0:
+ return self._tab.Get(flatbuffers.number_types.Int32Flags, o + self._tab.Pos)
+ return 0
+
+def StridedSliceOptionsStart(builder): builder.StartObject(5)
+def StridedSliceOptionsAddBeginMask(builder, beginMask): builder.PrependInt32Slot(0, beginMask, 0)
+def StridedSliceOptionsAddEndMask(builder, endMask): builder.PrependInt32Slot(1, endMask, 0)
+def StridedSliceOptionsAddEllipsisMask(builder, ellipsisMask): builder.PrependInt32Slot(2, ellipsisMask, 0)
+def StridedSliceOptionsAddNewAxisMask(builder, newAxisMask): builder.PrependInt32Slot(3, newAxisMask, 0)
+def StridedSliceOptionsAddShrinkAxisMask(builder, shrinkAxisMask): builder.PrependInt32Slot(4, shrinkAxisMask, 0)
+def StridedSliceOptionsEnd(builder): return builder.EndObject()
diff --git a/ethosu/vela/tflite/SubGraph.py b/ethosu/vela/tflite/SubGraph.py
new file mode 100644
index 00000000..eaa42fac
--- /dev/null
+++ b/ethosu/vela/tflite/SubGraph.py
@@ -0,0 +1,122 @@
+# automatically generated by the FlatBuffers compiler, do not modify
+
+# namespace: tflite
+
+import flatbuffers
+
+class SubGraph(object):
+ __slots__ = ['_tab']
+
+ @classmethod
+ def GetRootAsSubGraph(cls, buf, offset):
+ n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset)
+ x = SubGraph()
+ x.Init(buf, n + offset)
+ return x
+
+ # SubGraph
+ def Init(self, buf, pos):
+ self._tab = flatbuffers.table.Table(buf, pos)
+
+ # SubGraph
+ def Tensors(self, j):
+ o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4))
+ if o != 0:
+ x = self._tab.Vector(o)
+ x += flatbuffers.number_types.UOffsetTFlags.py_type(j) * 4
+ x = self._tab.Indirect(x)
+ from .Tensor import Tensor
+ obj = Tensor()
+ obj.Init(self._tab.Bytes, x)
+ return obj
+ return None
+
+ # SubGraph
+ def TensorsLength(self):
+ o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4))
+ if o != 0:
+ return self._tab.VectorLen(o)
+ return 0
+
+ # SubGraph
+ def Inputs(self, j):
+ o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(6))
+ if o != 0:
+ a = self._tab.Vector(o)
+ return self._tab.Get(flatbuffers.number_types.Int32Flags, a + flatbuffers.number_types.UOffsetTFlags.py_type(j * 4))
+ return 0
+
+ # SubGraph
+ def InputsAsNumpy(self):
+ o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(6))
+ if o != 0:
+ return self._tab.GetVectorAsNumpy(flatbuffers.number_types.Int32Flags, o)
+ return 0
+
+ # SubGraph
+ def InputsLength(self):
+ o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(6))
+ if o != 0:
+ return self._tab.VectorLen(o)
+ return 0
+
+ # SubGraph
+ def Outputs(self, j):
+ o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(8))
+ if o != 0:
+ a = self._tab.Vector(o)
+ return self._tab.Get(flatbuffers.number_types.Int32Flags, a + flatbuffers.number_types.UOffsetTFlags.py_type(j * 4))
+ return 0
+
+ # SubGraph
+ def OutputsAsNumpy(self):
+ o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(8))
+ if o != 0:
+ return self._tab.GetVectorAsNumpy(flatbuffers.number_types.Int32Flags, o)
+ return 0
+
+ # SubGraph
+ def OutputsLength(self):
+ o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(8))
+ if o != 0:
+ return self._tab.VectorLen(o)
+ return 0
+
+ # SubGraph
+ def Operators(self, j):
+ o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(10))
+ if o != 0:
+ x = self._tab.Vector(o)
+ x += flatbuffers.number_types.UOffsetTFlags.py_type(j) * 4
+ x = self._tab.Indirect(x)
+ from .Operator import Operator
+ obj = Operator()
+ obj.Init(self._tab.Bytes, x)
+ return obj
+ return None
+
+ # SubGraph
+ def OperatorsLength(self):
+ o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(10))
+ if o != 0:
+ return self._tab.VectorLen(o)
+ return 0
+
+ # SubGraph
+ def Name(self):
+ o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(12))
+ if o != 0:
+ return self._tab.String(o + self._tab.Pos)
+ return None
+
+def SubGraphStart(builder): builder.StartObject(5)
+def SubGraphAddTensors(builder, tensors): builder.PrependUOffsetTRelativeSlot(0, flatbuffers.number_types.UOffsetTFlags.py_type(tensors), 0)
+def SubGraphStartTensorsVector(builder, numElems): return builder.StartVector(4, numElems, 4)
+def SubGraphAddInputs(builder, inputs): builder.PrependUOffsetTRelativeSlot(1, flatbuffers.number_types.UOffsetTFlags.py_type(inputs), 0)
+def SubGraphStartInputsVector(builder, numElems): return builder.StartVector(4, numElems, 4)
+def SubGraphAddOutputs(builder, outputs): builder.PrependUOffsetTRelativeSlot(2, flatbuffers.number_types.UOffsetTFlags.py_type(outputs), 0)
+def SubGraphStartOutputsVector(builder, numElems): return builder.StartVector(4, numElems, 4)
+def SubGraphAddOperators(builder, operators): builder.PrependUOffsetTRelativeSlot(3, flatbuffers.number_types.UOffsetTFlags.py_type(operators), 0)
+def SubGraphStartOperatorsVector(builder, numElems): return builder.StartVector(4, numElems, 4)
+def SubGraphAddName(builder, name): builder.PrependUOffsetTRelativeSlot(4, flatbuffers.number_types.UOffsetTFlags.py_type(name), 0)
+def SubGraphEnd(builder): return builder.EndObject()
diff --git a/ethosu/vela/tflite/SubOptions.py b/ethosu/vela/tflite/SubOptions.py
new file mode 100644
index 00000000..eccd7aba
--- /dev/null
+++ b/ethosu/vela/tflite/SubOptions.py
@@ -0,0 +1,30 @@
+# automatically generated by the FlatBuffers compiler, do not modify
+
+# namespace: tflite
+
+import flatbuffers
+
+class SubOptions(object):
+ __slots__ = ['_tab']
+
+ @classmethod
+ def GetRootAsSubOptions(cls, buf, offset):
+ n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset)
+ x = SubOptions()
+ x.Init(buf, n + offset)
+ return x
+
+ # SubOptions
+ def Init(self, buf, pos):
+ self._tab = flatbuffers.table.Table(buf, pos)
+
+ # SubOptions
+ def FusedActivationFunction(self):
+ o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4))
+ if o != 0:
+ return self._tab.Get(flatbuffers.number_types.Int8Flags, o + self._tab.Pos)
+ return 0
+
+def SubOptionsStart(builder): builder.StartObject(1)
+def SubOptionsAddFusedActivationFunction(builder, fusedActivationFunction): builder.PrependInt8Slot(0, fusedActivationFunction, 0)
+def SubOptionsEnd(builder): return builder.EndObject()
diff --git a/ethosu/vela/tflite/Tensor.py b/ethosu/vela/tflite/Tensor.py
new file mode 100644
index 00000000..4c39b7cb
--- /dev/null
+++ b/ethosu/vela/tflite/Tensor.py
@@ -0,0 +1,126 @@
+# automatically generated by the FlatBuffers compiler, do not modify
+
+# namespace: tflite
+
+import flatbuffers
+
+class Tensor(object):
+ __slots__ = ['_tab']
+
+ @classmethod
+ def GetRootAsTensor(cls, buf, offset):
+ n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset)
+ x = Tensor()
+ x.Init(buf, n + offset)
+ return x
+
+ # Tensor
+ def Init(self, buf, pos):
+ self._tab = flatbuffers.table.Table(buf, pos)
+
+ # Tensor
+ def Shape(self, j):
+ o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4))
+ if o != 0:
+ a = self._tab.Vector(o)
+ return self._tab.Get(flatbuffers.number_types.Int32Flags, a + flatbuffers.number_types.UOffsetTFlags.py_type(j * 4))
+ return 0
+
+ # Tensor
+ def ShapeAsNumpy(self):
+ o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4))
+ if o != 0:
+ return self._tab.GetVectorAsNumpy(flatbuffers.number_types.Int32Flags, o)
+ return 0
+
+ # Tensor
+ def ShapeLength(self):
+ o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4))
+ if o != 0:
+ return self._tab.VectorLen(o)
+ return 0
+
+ # Tensor
+ def Type(self):
+ o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(6))
+ if o != 0:
+ return self._tab.Get(flatbuffers.number_types.Int8Flags, o + self._tab.Pos)
+ return 0
+
+ # Tensor
+ def Buffer(self):
+ o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(8))
+ if o != 0:
+ return self._tab.Get(flatbuffers.number_types.Uint32Flags, o + self._tab.Pos)
+ return 0
+
+ # Tensor
+ def Name(self):
+ o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(10))
+ if o != 0:
+ return self._tab.String(o + self._tab.Pos)
+ return None
+
+ # Tensor
+ def Quantization(self):
+ o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(12))
+ if o != 0:
+ x = self._tab.Indirect(o + self._tab.Pos)
+ from .QuantizationParameters import QuantizationParameters
+ obj = QuantizationParameters()
+ obj.Init(self._tab.Bytes, x)
+ return obj
+ return None
+
+ # Tensor
+ def IsVariable(self):
+ o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(14))
+ if o != 0:
+ return bool(self._tab.Get(flatbuffers.number_types.BoolFlags, o + self._tab.Pos))
+ return False
+
+ # Tensor
+ def Sparsity(self):
+ o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(16))
+ if o != 0:
+ x = self._tab.Indirect(o + self._tab.Pos)
+ from .SparsityParameters import SparsityParameters
+ obj = SparsityParameters()
+ obj.Init(self._tab.Bytes, x)
+ return obj
+ return None
+
+ # Tensor
+ def ShapeSignature(self, j):
+ o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(18))
+ if o != 0:
+ a = self._tab.Vector(o)
+ return self._tab.Get(flatbuffers.number_types.Int32Flags, a + flatbuffers.number_types.UOffsetTFlags.py_type(j * 4))
+ return 0
+
+ # Tensor
+ def ShapeSignatureAsNumpy(self):
+ o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(18))
+ if o != 0:
+ return self._tab.GetVectorAsNumpy(flatbuffers.number_types.Int32Flags, o)
+ return 0
+
+ # Tensor
+ def ShapeSignatureLength(self):
+ o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(18))
+ if o != 0:
+ return self._tab.VectorLen(o)
+ return 0
+
+def TensorStart(builder): builder.StartObject(8)
+def TensorAddShape(builder, shape): builder.PrependUOffsetTRelativeSlot(0, flatbuffers.number_types.UOffsetTFlags.py_type(shape), 0)
+def TensorStartShapeVector(builder, numElems): return builder.StartVector(4, numElems, 4)
+def TensorAddType(builder, type): builder.PrependInt8Slot(1, type, 0)
+def TensorAddBuffer(builder, buffer): builder.PrependUint32Slot(2, buffer, 0)
+def TensorAddName(builder, name): builder.PrependUOffsetTRelativeSlot(3, flatbuffers.number_types.UOffsetTFlags.py_type(name), 0)
+def TensorAddQuantization(builder, quantization): builder.PrependUOffsetTRelativeSlot(4, flatbuffers.number_types.UOffsetTFlags.py_type(quantization), 0)
+def TensorAddIsVariable(builder, isVariable): builder.PrependBoolSlot(5, isVariable, 0)
+def TensorAddSparsity(builder, sparsity): builder.PrependUOffsetTRelativeSlot(6, flatbuffers.number_types.UOffsetTFlags.py_type(sparsity), 0)
+def TensorAddShapeSignature(builder, shapeSignature): builder.PrependUOffsetTRelativeSlot(7, flatbuffers.number_types.UOffsetTFlags.py_type(shapeSignature), 0)
+def TensorStartShapeSignatureVector(builder, numElems): return builder.StartVector(4, numElems, 4)
+def TensorEnd(builder): return builder.EndObject()
diff --git a/ethosu/vela/tflite/TensorType.py b/ethosu/vela/tflite/TensorType.py
new file mode 100644
index 00000000..53c011bc
--- /dev/null
+++ b/ethosu/vela/tflite/TensorType.py
@@ -0,0 +1,15 @@
+# automatically generated by the FlatBuffers compiler, do not modify
+
+# namespace: tflite
+
+class TensorType(object):
+ FLOAT32 = 0
+ FLOAT16 = 1
+ INT32 = 2
+ UINT8 = 3
+ INT64 = 4
+ STRING = 5
+ BOOL = 6
+ INT16 = 7
+ COMPLEX64 = 8
+ INT8 = 9
diff --git a/ethosu/vela/tflite/TileOptions.py b/ethosu/vela/tflite/TileOptions.py
new file mode 100644
index 00000000..ec8396dc
--- /dev/null
+++ b/ethosu/vela/tflite/TileOptions.py
@@ -0,0 +1,22 @@
+# automatically generated by the FlatBuffers compiler, do not modify
+
+# namespace: tflite
+
+import flatbuffers
+
+class TileOptions(object):
+ __slots__ = ['_tab']
+
+ @classmethod
+ def GetRootAsTileOptions(cls, buf, offset):
+ n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset)
+ x = TileOptions()
+ x.Init(buf, n + offset)
+ return x
+
+ # TileOptions
+ def Init(self, buf, pos):
+ self._tab = flatbuffers.table.Table(buf, pos)
+
+def TileOptionsStart(builder): builder.StartObject(0)
+def TileOptionsEnd(builder): return builder.EndObject()
diff --git a/ethosu/vela/tflite/TopKV2Options.py b/ethosu/vela/tflite/TopKV2Options.py
new file mode 100644
index 00000000..ccd51033
--- /dev/null
+++ b/ethosu/vela/tflite/TopKV2Options.py
@@ -0,0 +1,22 @@
+# automatically generated by the FlatBuffers compiler, do not modify
+
+# namespace: tflite
+
+import flatbuffers
+
+class TopKV2Options(object):
+ __slots__ = ['_tab']
+
+ @classmethod
+ def GetRootAsTopKV2Options(cls, buf, offset):
+ n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset)
+ x = TopKV2Options()
+ x.Init(buf, n + offset)
+ return x
+
+ # TopKV2Options
+ def Init(self, buf, pos):
+ self._tab = flatbuffers.table.Table(buf, pos)
+
+def TopKV2OptionsStart(builder): builder.StartObject(0)
+def TopKV2OptionsEnd(builder): return builder.EndObject()
diff --git a/ethosu/vela/tflite/TransposeConvOptions.py b/ethosu/vela/tflite/TransposeConvOptions.py
new file mode 100644
index 00000000..423571c8
--- /dev/null
+++ b/ethosu/vela/tflite/TransposeConvOptions.py
@@ -0,0 +1,46 @@
+# automatically generated by the FlatBuffers compiler, do not modify
+
+# namespace: tflite
+
+import flatbuffers
+
+class TransposeConvOptions(object):
+ __slots__ = ['_tab']
+
+ @classmethod
+ def GetRootAsTransposeConvOptions(cls, buf, offset):
+ n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset)
+ x = TransposeConvOptions()
+ x.Init(buf, n + offset)
+ return x
+
+ # TransposeConvOptions
+ def Init(self, buf, pos):
+ self._tab = flatbuffers.table.Table(buf, pos)
+
+ # TransposeConvOptions
+ def Padding(self):
+ o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4))
+ if o != 0:
+ return self._tab.Get(flatbuffers.number_types.Int8Flags, o + self._tab.Pos)
+ return 0
+
+ # TransposeConvOptions
+ def StrideW(self):
+ o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(6))
+ if o != 0:
+ return self._tab.Get(flatbuffers.number_types.Int32Flags, o + self._tab.Pos)
+ return 0
+
+ # TransposeConvOptions
+ def StrideH(self):
+ o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(8))
+ if o != 0:
+ return self._tab.Get(flatbuffers.number_types.Int32Flags, o + self._tab.Pos)
+ return 0
+
+def TransposeConvOptionsStart(builder): builder.StartObject(3)
+def TransposeConvOptionsAddPadding(builder, padding): builder.PrependInt8Slot(0, padding, 0)
+def TransposeConvOptionsAddStrideW(builder, strideW): builder.PrependInt32Slot(1, strideW, 0)
+def TransposeConvOptionsAddStrideH(builder, strideH): builder.PrependInt32Slot(2, strideH, 0)
+def TransposeConvOptionsEnd(builder): return builder.EndObject()
diff --git a/ethosu/vela/tflite/TransposeOptions.py b/ethosu/vela/tflite/TransposeOptions.py
new file mode 100644
index 00000000..42c596d9
--- /dev/null
+++ b/ethosu/vela/tflite/TransposeOptions.py
@@ -0,0 +1,22 @@
+# automatically generated by the FlatBuffers compiler, do not modify
+
+# namespace: tflite
+
+import flatbuffers
+
+class TransposeOptions(object):
+ __slots__ = ['_tab']
+
+ @classmethod
+ def GetRootAsTransposeOptions(cls, buf, offset):
+ n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset)
+ x = TransposeOptions()
+ x.Init(buf, n + offset)
+ return x
+
+ # TransposeOptions
+ def Init(self, buf, pos):
+ self._tab = flatbuffers.table.Table(buf, pos)
+
+def TransposeOptionsStart(builder): builder.StartObject(0)
+def TransposeOptionsEnd(builder): return builder.EndObject()
diff --git a/ethosu/vela/tflite/Uint16Vector.py b/ethosu/vela/tflite/Uint16Vector.py
new file mode 100644
index 00000000..750e52a4
--- /dev/null
+++ b/ethosu/vela/tflite/Uint16Vector.py
@@ -0,0 +1,46 @@
+# automatically generated by the FlatBuffers compiler, do not modify
+
+# namespace: tflite
+
+import flatbuffers
+
+class Uint16Vector(object):
+ __slots__ = ['_tab']
+
+ @classmethod
+ def GetRootAsUint16Vector(cls, buf, offset):
+ n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset)
+ x = Uint16Vector()
+ x.Init(buf, n + offset)
+ return x
+
+ # Uint16Vector
+ def Init(self, buf, pos):
+ self._tab = flatbuffers.table.Table(buf, pos)
+
+ # Uint16Vector
+ def Values(self, j):
+ o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4))
+ if o != 0:
+ a = self._tab.Vector(o)
+ return self._tab.Get(flatbuffers.number_types.Uint16Flags, a + flatbuffers.number_types.UOffsetTFlags.py_type(j * 2))
+ return 0
+
+ # Uint16Vector
+ def ValuesAsNumpy(self):
+ o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4))
+ if o != 0:
+ return self._tab.GetVectorAsNumpy(flatbuffers.number_types.Uint16Flags, o)
+ return 0
+
+ # Uint16Vector
+ def ValuesLength(self):
+ o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4))
+ if o != 0:
+ return self._tab.VectorLen(o)
+ return 0
+
+def Uint16VectorStart(builder): builder.StartObject(1)
+def Uint16VectorAddValues(builder, values): builder.PrependUOffsetTRelativeSlot(0, flatbuffers.number_types.UOffsetTFlags.py_type(values), 0)
+def Uint16VectorStartValuesVector(builder, numElems): return builder.StartVector(2, numElems, 2)
+def Uint16VectorEnd(builder): return builder.EndObject()
diff --git a/ethosu/vela/tflite/Uint8Vector.py b/ethosu/vela/tflite/Uint8Vector.py
new file mode 100644
index 00000000..dc475f9f
--- /dev/null
+++ b/ethosu/vela/tflite/Uint8Vector.py
@@ -0,0 +1,46 @@
+# automatically generated by the FlatBuffers compiler, do not modify
+
+# namespace: tflite
+
+import flatbuffers
+
+class Uint8Vector(object):
+ __slots__ = ['_tab']
+
+ @classmethod
+ def GetRootAsUint8Vector(cls, buf, offset):
+ n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset)
+ x = Uint8Vector()
+ x.Init(buf, n + offset)
+ return x
+
+ # Uint8Vector
+ def Init(self, buf, pos):
+ self._tab = flatbuffers.table.Table(buf, pos)
+
+ # Uint8Vector
+ def Values(self, j):
+ o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4))
+ if o != 0:
+ a = self._tab.Vector(o)
+ return self._tab.Get(flatbuffers.number_types.Uint8Flags, a + flatbuffers.number_types.UOffsetTFlags.py_type(j * 1))
+ return 0
+
+ # Uint8Vector
+ def ValuesAsNumpy(self):
+ o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4))
+ if o != 0:
+ return self._tab.GetVectorAsNumpy(flatbuffers.number_types.Uint8Flags, o)
+ return 0
+
+ # Uint8Vector
+ def ValuesLength(self):
+ o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4))
+ if o != 0:
+ return self._tab.VectorLen(o)
+ return 0
+
+def Uint8VectorStart(builder): builder.StartObject(1)
+def Uint8VectorAddValues(builder, values): builder.PrependUOffsetTRelativeSlot(0, flatbuffers.number_types.UOffsetTFlags.py_type(values), 0)
+def Uint8VectorStartValuesVector(builder, numElems): return builder.StartVector(1, numElems, 1)
+def Uint8VectorEnd(builder): return builder.EndObject()
diff --git a/ethosu/vela/tflite/UnidirectionalSequenceLSTMOptions.py b/ethosu/vela/tflite/UnidirectionalSequenceLSTMOptions.py
new file mode 100644
index 00000000..1b0c112c
--- /dev/null
+++ b/ethosu/vela/tflite/UnidirectionalSequenceLSTMOptions.py
@@ -0,0 +1,54 @@
+# automatically generated by the FlatBuffers compiler, do not modify
+
+# namespace: tflite
+
+import flatbuffers
+
+class UnidirectionalSequenceLSTMOptions(object):
+ __slots__ = ['_tab']
+
+ @classmethod
+ def GetRootAsUnidirectionalSequenceLSTMOptions(cls, buf, offset):
+ n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset)
+ x = UnidirectionalSequenceLSTMOptions()
+ x.Init(buf, n + offset)
+ return x
+
+ # UnidirectionalSequenceLSTMOptions
+ def Init(self, buf, pos):
+ self._tab = flatbuffers.table.Table(buf, pos)
+
+ # UnidirectionalSequenceLSTMOptions
+ def FusedActivationFunction(self):
+ o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4))
+ if o != 0:
+ return self._tab.Get(flatbuffers.number_types.Int8Flags, o + self._tab.Pos)
+ return 0
+
+ # UnidirectionalSequenceLSTMOptions
+ def CellClip(self):
+ o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(6))
+ if o != 0:
+ return self._tab.Get(flatbuffers.number_types.Float32Flags, o + self._tab.Pos)
+ return 0.0
+
+ # UnidirectionalSequenceLSTMOptions
+ def ProjClip(self):
+ o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(8))
+ if o != 0:
+ return self._tab.Get(flatbuffers.number_types.Float32Flags, o + self._tab.Pos)
+ return 0.0
+
+ # UnidirectionalSequenceLSTMOptions
+ def TimeMajor(self):
+ o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(10))
+ if o != 0:
+ return bool(self._tab.Get(flatbuffers.number_types.BoolFlags, o + self._tab.Pos))
+ return False
+
+def UnidirectionalSequenceLSTMOptionsStart(builder): builder.StartObject(4)
+def UnidirectionalSequenceLSTMOptionsAddFusedActivationFunction(builder, fusedActivationFunction): builder.PrependInt8Slot(0, fusedActivationFunction, 0)
+def UnidirectionalSequenceLSTMOptionsAddCellClip(builder, cellClip): builder.PrependFloat32Slot(1, cellClip, 0.0)
+def UnidirectionalSequenceLSTMOptionsAddProjClip(builder, projClip): builder.PrependFloat32Slot(2, projClip, 0.0)
+def UnidirectionalSequenceLSTMOptionsAddTimeMajor(builder, timeMajor): builder.PrependBoolSlot(3, timeMajor, 0)
+def UnidirectionalSequenceLSTMOptionsEnd(builder): return builder.EndObject()
diff --git a/ethosu/vela/tflite/UniqueOptions.py b/ethosu/vela/tflite/UniqueOptions.py
new file mode 100644
index 00000000..841c6977
--- /dev/null
+++ b/ethosu/vela/tflite/UniqueOptions.py
@@ -0,0 +1,30 @@
+# automatically generated by the FlatBuffers compiler, do not modify
+
+# namespace: tflite
+
+import flatbuffers
+
+class UniqueOptions(object):
+ __slots__ = ['_tab']
+
+ @classmethod
+ def GetRootAsUniqueOptions(cls, buf, offset):
+ n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset)
+ x = UniqueOptions()
+ x.Init(buf, n + offset)
+ return x
+
+ # UniqueOptions
+ def Init(self, buf, pos):
+ self._tab = flatbuffers.table.Table(buf, pos)
+
+ # UniqueOptions
+ def IdxOutType(self):
+ o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4))
+ if o != 0:
+ return self._tab.Get(flatbuffers.number_types.Int8Flags, o + self._tab.Pos)
+ return 2
+
+def UniqueOptionsStart(builder): builder.StartObject(1)
+def UniqueOptionsAddIdxOutType(builder, idxOutType): builder.PrependInt8Slot(0, idxOutType, 2)
+def UniqueOptionsEnd(builder): return builder.EndObject()
diff --git a/ethosu/vela/tflite/UnpackOptions.py b/ethosu/vela/tflite/UnpackOptions.py
new file mode 100644
index 00000000..eed40193
--- /dev/null
+++ b/ethosu/vela/tflite/UnpackOptions.py
@@ -0,0 +1,38 @@
+# automatically generated by the FlatBuffers compiler, do not modify
+
+# namespace: tflite
+
+import flatbuffers
+
+class UnpackOptions(object):
+ __slots__ = ['_tab']
+
+ @classmethod
+ def GetRootAsUnpackOptions(cls, buf, offset):
+ n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset)
+ x = UnpackOptions()
+ x.Init(buf, n + offset)
+ return x
+
+ # UnpackOptions
+ def Init(self, buf, pos):
+ self._tab = flatbuffers.table.Table(buf, pos)
+
+ # UnpackOptions
+ def Num(self):
+ o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4))
+ if o != 0:
+ return self._tab.Get(flatbuffers.number_types.Int32Flags, o + self._tab.Pos)
+ return 0
+
+ # UnpackOptions
+ def Axis(self):
+ o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(6))
+ if o != 0:
+ return self._tab.Get(flatbuffers.number_types.Int32Flags, o + self._tab.Pos)
+ return 0
+
+def UnpackOptionsStart(builder): builder.StartObject(2)
+def UnpackOptionsAddNum(builder, num): builder.PrependInt32Slot(0, num, 0)
+def UnpackOptionsAddAxis(builder, axis): builder.PrependInt32Slot(1, axis, 0)
+def UnpackOptionsEnd(builder): return builder.EndObject()
diff --git a/ethosu/vela/tflite/WhereOptions.py b/ethosu/vela/tflite/WhereOptions.py
new file mode 100644
index 00000000..ab69f6aa
--- /dev/null
+++ b/ethosu/vela/tflite/WhereOptions.py
@@ -0,0 +1,22 @@
+# automatically generated by the FlatBuffers compiler, do not modify
+
+# namespace: tflite
+
+import flatbuffers
+
+class WhereOptions(object):
+ __slots__ = ['_tab']
+
+ @classmethod
+ def GetRootAsWhereOptions(cls, buf, offset):
+ n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset)
+ x = WhereOptions()
+ x.Init(buf, n + offset)
+ return x
+
+ # WhereOptions
+ def Init(self, buf, pos):
+ self._tab = flatbuffers.table.Table(buf, pos)
+
+def WhereOptionsStart(builder): builder.StartObject(0)
+def WhereOptionsEnd(builder): return builder.EndObject()
diff --git a/ethosu/vela/tflite/WhileOptions.py b/ethosu/vela/tflite/WhileOptions.py
new file mode 100644
index 00000000..7d5a6dfa
--- /dev/null
+++ b/ethosu/vela/tflite/WhileOptions.py
@@ -0,0 +1,38 @@
+# automatically generated by the FlatBuffers compiler, do not modify
+
+# namespace: tflite
+
+import flatbuffers
+
+class WhileOptions(object):
+ __slots__ = ['_tab']
+
+ @classmethod
+ def GetRootAsWhileOptions(cls, buf, offset):
+ n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset)
+ x = WhileOptions()
+ x.Init(buf, n + offset)
+ return x
+
+ # WhileOptions
+ def Init(self, buf, pos):
+ self._tab = flatbuffers.table.Table(buf, pos)
+
+ # WhileOptions
+ def CondSubgraphIndex(self):
+ o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4))
+ if o != 0:
+ return self._tab.Get(flatbuffers.number_types.Int32Flags, o + self._tab.Pos)
+ return 0
+
+ # WhileOptions
+ def BodySubgraphIndex(self):
+ o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(6))
+ if o != 0:
+ return self._tab.Get(flatbuffers.number_types.Int32Flags, o + self._tab.Pos)
+ return 0
+
+def WhileOptionsStart(builder): builder.StartObject(2)
+def WhileOptionsAddCondSubgraphIndex(builder, condSubgraphIndex): builder.PrependInt32Slot(0, condSubgraphIndex, 0)
+def WhileOptionsAddBodySubgraphIndex(builder, bodySubgraphIndex): builder.PrependInt32Slot(1, bodySubgraphIndex, 0)
+def WhileOptionsEnd(builder): return builder.EndObject()
diff --git a/ethosu/vela/tflite/ZerosLikeOptions.py b/ethosu/vela/tflite/ZerosLikeOptions.py
new file mode 100644
index 00000000..e6aa9639
--- /dev/null
+++ b/ethosu/vela/tflite/ZerosLikeOptions.py
@@ -0,0 +1,22 @@
+# automatically generated by the FlatBuffers compiler, do not modify
+
+# namespace: tflite
+
+import flatbuffers
+
+class ZerosLikeOptions(object):
+ __slots__ = ['_tab']
+
+ @classmethod
+ def GetRootAsZerosLikeOptions(cls, buf, offset):
+ n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset)
+ x = ZerosLikeOptions()
+ x.Init(buf, n + offset)
+ return x
+
+ # ZerosLikeOptions
+ def Init(self, buf, pos):
+ self._tab = flatbuffers.table.Table(buf, pos)
+
+def ZerosLikeOptionsStart(builder): builder.StartObject(0)
+def ZerosLikeOptionsEnd(builder): return builder.EndObject()
diff --git a/ethosu/vela/tflite/__init__.py b/ethosu/vela/tflite/__init__.py
new file mode 100644
index 00000000..e69de29b
--- /dev/null
+++ b/ethosu/vela/tflite/__init__.py
diff --git a/ethosu/vela/tflite_mapping.py b/ethosu/vela/tflite_mapping.py
new file mode 100644
index 00000000..8e46ef2e
--- /dev/null
+++ b/ethosu/vela/tflite_mapping.py
@@ -0,0 +1,644 @@
+# Copyright (C) 2020 Arm Limited or its affiliates. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the License); you may
+# not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an AS IS BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+# Description:
+# TensorFlow Lite mapping functions used by both reader and writer.
+# Contains a mapping from the various TensorFlow Lite enums and options structs, generated by the FlatBuffer code
+# generator, to Vela's internal format.
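+#
+# Illustrative example (not part of the mapping tables themselves): a tensor declared as
+# TensorType.UINT8 in the flatbuffer is looked up in datatype_map and becomes DataType.uint8
+# internally, and the writer uses the inverse map to go back again:
+#
+#     datatype_map[TensorType.UINT8]       # -> DataType.uint8
+#     datatype_inv_map[DataType.uint8]     # -> TensorType.UINT8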
+
+import numpy as np
+import struct
+
+from .data_type import DataType
+
+from .tflite.TensorType import TensorType
+from .tflite.BuiltinOperator import BuiltinOperator
+from .tflite.BuiltinOptions import BuiltinOptions
+
+
+from .tflite.Padding import Padding
+from .tflite.ActivationFunctionType import ActivationFunctionType
+
+from .tflite import Conv2DOptions
+from .tflite import DepthwiseConv2DOptions
+from .tflite import ConcatEmbeddingsOptions
+from .tflite import LSHProjectionOptions
+from .tflite import Pool2DOptions
+from .tflite import SVDFOptions
+from .tflite import RNNOptions
+from .tflite import FullyConnectedOptions
+from .tflite import SoftmaxOptions
+from .tflite import ConcatenationOptions
+from .tflite import AddOptions
+from .tflite import L2NormOptions
+from .tflite import LocalResponseNormalizationOptions
+from .tflite import LSTMOptions
+from .tflite import ResizeBilinearOptions
+from .tflite import CallOptions
+from .tflite import ReshapeOptions
+from .tflite import SkipGramOptions
+from .tflite import SpaceToDepthOptions
+from .tflite import EmbeddingLookupSparseOptions
+from .tflite import MulOptions
+from .tflite import PadOptions
+from .tflite import GatherOptions
+from .tflite import BatchToSpaceNDOptions
+from .tflite import SpaceToBatchNDOptions
+from .tflite import TransposeOptions
+from .tflite import ReducerOptions
+from .tflite import SubOptions
+from .tflite import DivOptions
+from .tflite import SqueezeOptions
+from .tflite import SequenceRNNOptions
+from .tflite import StridedSliceOptions
+from .tflite import ExpOptions
+from .tflite import TopKV2Options
+from .tflite import SplitOptions
+from .tflite import LogSoftmaxOptions
+from .tflite import CastOptions
+from .tflite import DequantizeOptions
+from .tflite import MaximumMinimumOptions
+from .tflite import ArgMaxOptions
+from .tflite import LessOptions
+from .tflite import NegOptions
+from .tflite import PadV2Options
+from .tflite import GreaterOptions
+from .tflite import GreaterEqualOptions
+from .tflite import LessEqualOptions
+from .tflite import SelectOptions
+from .tflite import SliceOptions
+from .tflite import TransposeConvOptions
+from .tflite import SparseToDenseOptions
+from .tflite import TileOptions
+from .tflite import ExpandDimsOptions
+from .tflite import EqualOptions
+from .tflite import NotEqualOptions
+from .tflite import ShapeOptions
+from .tflite import PowOptions
+from .tflite import ArgMinOptions
+from .tflite import FakeQuantOptions
+from .tflite import PackOptions
+from .tflite import LogicalOrOptions
+from .tflite import OneHotOptions
+from .tflite import LogicalAndOptions
+from .tflite import LogicalNotOptions
+from .tflite import UnpackOptions
+from .tflite import FloorDivOptions
+from .tflite import SquareOptions
+from .tflite import ZerosLikeOptions
+from .tflite import FillOptions
+from .tflite import BidirectionalSequenceLSTMOptions
+from .tflite import BidirectionalSequenceRNNOptions
+from .tflite import UnidirectionalSequenceLSTMOptions
+from .tflite import FloorModOptions
+from .tflite import RangeOptions
+from .tflite import ResizeNearestNeighborOptions
+from .tflite import LeakyReluOptions
+from .tflite import SquaredDifferenceOptions
+from .tflite import MirrorPadOptions
+from .tflite import AbsOptions
+from .tflite import SplitVOptions
+from .tflite import UniqueOptions
+from .tflite import ReverseV2Options
+from .tflite import AddNOptions
+from .tflite import GatherNdOptions
+from .tflite import CosOptions
+from .tflite import WhereOptions
+from .tflite import RankOptions
+from .tflite import ReverseSequenceOptions
+from .tflite import MatrixDiagOptions
+from .tflite import QuantizeOptions
+from .tflite import MatrixSetDiagOptions
+from .tflite import DensifyOptions
+from .tflite import DepthToSpaceOptions
+from .tflite import IfOptions
+from .tflite import NonMaxSuppressionV4Options
+from .tflite import NonMaxSuppressionV5Options
+from .tflite import ScatterNdOptions
+from .tflite import SegmentSumOptions
+from .tflite import SelectV2Options
+from .tflite import WhileOptions
+
+
+def inverse_map(map):
+ return {v: k for k, v in map.items()}
+
+
+datatype_map = {
+ TensorType.UINT8: DataType.uint8,
+ TensorType.INT8: DataType.int8,
+ TensorType.INT16: DataType.int16,
+ TensorType.INT32: DataType.int32,
+ TensorType.INT64: DataType.int64,
+ TensorType.FLOAT16: DataType.float16,
+ TensorType.FLOAT32: DataType.float32,
+ TensorType.STRING: DataType.string,
+ TensorType.BOOL: DataType.bool,
+ # no TensorType.COMPLEX64 for now
+}
+
+datatype_inv_map = inverse_map(datatype_map)
+datatype_inv_map[DataType.quint8] = TensorType.UINT8
+
+datatype_inv_map[DataType.qint8] = TensorType.INT8
+datatype_inv_map[DataType.qint16] = TensorType.INT16
+datatype_inv_map[DataType.qint32] = TensorType.INT32
+
+
+datatype_map_numpy = {
+ TensorType.UINT8: np.uint8,
+ TensorType.INT8: np.int8,
+ TensorType.INT16: np.int16,
+ TensorType.INT32: np.int32,
+ TensorType.INT64: np.int64,
+ TensorType.FLOAT16: np.float16,
+ TensorType.FLOAT32: np.float32,
+ TensorType.BOOL: np.bool_,
+}
+
+
+builtin_options_map = {
+ BuiltinOptions.Conv2DOptions: Conv2DOptions.Conv2DOptions,
+ BuiltinOptions.DepthwiseConv2DOptions: DepthwiseConv2DOptions.DepthwiseConv2DOptions,
+ BuiltinOptions.ConcatEmbeddingsOptions: ConcatEmbeddingsOptions.ConcatEmbeddingsOptions,
+ BuiltinOptions.LSHProjectionOptions: LSHProjectionOptions.LSHProjectionOptions,
+ BuiltinOptions.Pool2DOptions: Pool2DOptions.Pool2DOptions,
+ BuiltinOptions.SVDFOptions: SVDFOptions.SVDFOptions,
+ BuiltinOptions.RNNOptions: RNNOptions.RNNOptions,
+ BuiltinOptions.FullyConnectedOptions: FullyConnectedOptions.FullyConnectedOptions,
+ BuiltinOptions.SoftmaxOptions: SoftmaxOptions.SoftmaxOptions,
+ BuiltinOptions.ConcatenationOptions: ConcatenationOptions.ConcatenationOptions,
+ BuiltinOptions.AddOptions: AddOptions.AddOptions,
+ BuiltinOptions.L2NormOptions: L2NormOptions.L2NormOptions,
+ BuiltinOptions.LocalResponseNormalizationOptions: LocalResponseNormalizationOptions.LocalResponseNormalizationOptions, # noqa: E501
+ BuiltinOptions.LSTMOptions: LSTMOptions.LSTMOptions,
+ BuiltinOptions.ResizeBilinearOptions: ResizeBilinearOptions.ResizeBilinearOptions,
+ BuiltinOptions.CallOptions: CallOptions.CallOptions,
+ BuiltinOptions.ReshapeOptions: ReshapeOptions.ReshapeOptions,
+ BuiltinOptions.SkipGramOptions: SkipGramOptions.SkipGramOptions,
+ BuiltinOptions.SpaceToDepthOptions: SpaceToDepthOptions.SpaceToDepthOptions,
+ BuiltinOptions.EmbeddingLookupSparseOptions: EmbeddingLookupSparseOptions.EmbeddingLookupSparseOptions,
+ BuiltinOptions.MulOptions: MulOptions.MulOptions,
+ BuiltinOptions.PadOptions: PadOptions.PadOptions,
+ BuiltinOptions.GatherOptions: GatherOptions.GatherOptions,
+ BuiltinOptions.BatchToSpaceNDOptions: BatchToSpaceNDOptions.BatchToSpaceNDOptions,
+ BuiltinOptions.SpaceToBatchNDOptions: SpaceToBatchNDOptions.SpaceToBatchNDOptions,
+ BuiltinOptions.TransposeOptions: TransposeOptions.TransposeOptions,
+ BuiltinOptions.ReducerOptions: ReducerOptions.ReducerOptions,
+ BuiltinOptions.SubOptions: SubOptions.SubOptions,
+ BuiltinOptions.DivOptions: DivOptions.DivOptions,
+ BuiltinOptions.SqueezeOptions: SqueezeOptions.SqueezeOptions,
+ BuiltinOptions.SequenceRNNOptions: SequenceRNNOptions.SequenceRNNOptions,
+ BuiltinOptions.StridedSliceOptions: StridedSliceOptions.StridedSliceOptions,
+ BuiltinOptions.ExpOptions: ExpOptions.ExpOptions,
+ BuiltinOptions.TopKV2Options: TopKV2Options.TopKV2Options,
+ BuiltinOptions.SplitOptions: SplitOptions.SplitOptions,
+ BuiltinOptions.LogSoftmaxOptions: LogSoftmaxOptions.LogSoftmaxOptions,
+ BuiltinOptions.CastOptions: CastOptions.CastOptions,
+ BuiltinOptions.DequantizeOptions: DequantizeOptions.DequantizeOptions,
+ BuiltinOptions.MaximumMinimumOptions: MaximumMinimumOptions.MaximumMinimumOptions,
+ BuiltinOptions.ArgMaxOptions: ArgMaxOptions.ArgMaxOptions,
+ BuiltinOptions.LessOptions: LessOptions.LessOptions,
+ BuiltinOptions.NegOptions: NegOptions.NegOptions,
+ BuiltinOptions.PadV2Options: PadV2Options.PadV2Options,
+ BuiltinOptions.GreaterOptions: GreaterOptions.GreaterOptions,
+ BuiltinOptions.GreaterEqualOptions: GreaterEqualOptions.GreaterEqualOptions,
+ BuiltinOptions.LessEqualOptions: LessEqualOptions.LessEqualOptions,
+ BuiltinOptions.SelectOptions: SelectOptions.SelectOptions,
+ BuiltinOptions.SliceOptions: SliceOptions.SliceOptions,
+ BuiltinOptions.TransposeConvOptions: TransposeConvOptions.TransposeConvOptions,
+ BuiltinOptions.SparseToDenseOptions: SparseToDenseOptions.SparseToDenseOptions,
+ BuiltinOptions.TileOptions: TileOptions.TileOptions,
+ BuiltinOptions.ExpandDimsOptions: ExpandDimsOptions.ExpandDimsOptions,
+ BuiltinOptions.EqualOptions: EqualOptions.EqualOptions,
+ BuiltinOptions.NotEqualOptions: NotEqualOptions.NotEqualOptions,
+ BuiltinOptions.ShapeOptions: ShapeOptions.ShapeOptions,
+ BuiltinOptions.PowOptions: PowOptions.PowOptions,
+ BuiltinOptions.ArgMinOptions: ArgMinOptions.ArgMinOptions,
+ BuiltinOptions.FakeQuantOptions: FakeQuantOptions.FakeQuantOptions,
+ BuiltinOptions.PackOptions: PackOptions.PackOptions,
+ BuiltinOptions.LogicalOrOptions: LogicalOrOptions.LogicalOrOptions,
+ BuiltinOptions.OneHotOptions: OneHotOptions.OneHotOptions,
+ BuiltinOptions.LogicalAndOptions: LogicalAndOptions.LogicalAndOptions,
+ BuiltinOptions.LogicalNotOptions: LogicalNotOptions.LogicalNotOptions,
+ BuiltinOptions.UnpackOptions: UnpackOptions.UnpackOptions,
+ BuiltinOptions.FloorDivOptions: FloorDivOptions.FloorDivOptions,
+ BuiltinOptions.SquareOptions: SquareOptions.SquareOptions,
+ BuiltinOptions.ZerosLikeOptions: ZerosLikeOptions.ZerosLikeOptions,
+ BuiltinOptions.FillOptions: FillOptions.FillOptions,
+ BuiltinOptions.BidirectionalSequenceLSTMOptions: BidirectionalSequenceLSTMOptions.BidirectionalSequenceLSTMOptions,
+ BuiltinOptions.BidirectionalSequenceRNNOptions: BidirectionalSequenceRNNOptions.BidirectionalSequenceRNNOptions,
+ BuiltinOptions.UnidirectionalSequenceLSTMOptions: UnidirectionalSequenceLSTMOptions.UnidirectionalSequenceLSTMOptions, # noqa: E501
+ BuiltinOptions.FloorModOptions: FloorModOptions.FloorModOptions,
+ BuiltinOptions.RangeOptions: RangeOptions.RangeOptions,
+ BuiltinOptions.ResizeNearestNeighborOptions: ResizeNearestNeighborOptions.ResizeNearestNeighborOptions,
+ BuiltinOptions.LeakyReluOptions: LeakyReluOptions.LeakyReluOptions,
+ BuiltinOptions.SquaredDifferenceOptions: SquaredDifferenceOptions.SquaredDifferenceOptions,
+ BuiltinOptions.MirrorPadOptions: MirrorPadOptions.MirrorPadOptions,
+ BuiltinOptions.AbsOptions: AbsOptions.AbsOptions,
+ BuiltinOptions.SplitVOptions: SplitVOptions.SplitVOptions,
+ BuiltinOptions.UniqueOptions: UniqueOptions.UniqueOptions,
+ BuiltinOptions.ReverseV2Options: ReverseV2Options.ReverseV2Options,
+ BuiltinOptions.AddNOptions: AddNOptions.AddNOptions,
+ BuiltinOptions.GatherNdOptions: GatherNdOptions.GatherNdOptions,
+ BuiltinOptions.CosOptions: CosOptions.CosOptions,
+ BuiltinOptions.WhereOptions: WhereOptions.WhereOptions,
+ BuiltinOptions.RankOptions: RankOptions.RankOptions,
+ BuiltinOptions.ReverseSequenceOptions: ReverseSequenceOptions.ReverseSequenceOptions,
+ BuiltinOptions.MatrixDiagOptions: MatrixDiagOptions.MatrixDiagOptions,
+ BuiltinOptions.QuantizeOptions: QuantizeOptions.QuantizeOptions,
+ BuiltinOptions.MatrixSetDiagOptions: MatrixSetDiagOptions.MatrixSetDiagOptions,
+ BuiltinOptions.DensifyOptions: DensifyOptions.DensifyOptions,
+ BuiltinOptions.DepthToSpaceOptions: DepthToSpaceOptions.DepthToSpaceOptions,
+ BuiltinOptions.IfOptions: IfOptions.IfOptions,
+ BuiltinOptions.NonMaxSuppressionV4Options: NonMaxSuppressionV4Options.NonMaxSuppressionV4Options,
+ BuiltinOptions.NonMaxSuppressionV5Options: NonMaxSuppressionV5Options.NonMaxSuppressionV5Options,
+ BuiltinOptions.ScatterNdOptions: ScatterNdOptions.ScatterNdOptions,
+ BuiltinOptions.SegmentSumOptions: SegmentSumOptions.SegmentSumOptions,
+ BuiltinOptions.SelectV2Options: SelectV2Options.SelectV2Options,
+ BuiltinOptions.WhileOptions: WhileOptions.WhileOptions,
+}
+
+builtin_options_inv_map = inverse_map(builtin_options_map)
+
+
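+# e.g. underscore_to_camel_case("fused_activation_function") -> "FusedActivationFunction",
+# matching the accessor names in the generated options classes.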
+def underscore_to_camel_case(s):
+ return "".join(x.title() for x in s.split("_"))
+
+
+def padding_deserialize(x):
+ return padding_map[x]
+
+
+def padding_serialize(builder, x):
+ return padding_inv_map[x]
+
+
+def activation_deserialize(x):
+ return activation_function_map[x]
+
+
+def activation_serialize(builder, x):
+ return activation_function_inv_map[x]
+
+
+def datatype_deserialize(x):
+ return datatype_map[x]
+
+
+def datatype_serialize(builder, x):
+ return datatype_inv_map[x]
+
+
+def identity(x):
+ return x
+
+
+def identity_serialize(builder, x):
+ return x
+
+
+def write_byte_vector(builder, v):
+ builder.StartVector(1, len(v), 1)
+ for e in v[::-1]:
+ builder.PrependByte(e)
+ return builder.EndVector(len(v))
+
+
+def write_int_vector(builder, v):
+ builder.StartVector(4, len(v), 4)
+ for e in v[::-1]:
+ builder.PrependInt32(e)
+ return builder.EndVector(len(v))
+
+
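+# OptionsSerializer ties one generated TFLite options class to the Vela attribute names it
+# carries. Each entry in `members` is either a plain field name, a (name, is_vector) pair for
+# int-vector fields, or a (name, deserialize, serialize) triple for fields needing custom
+# conversion. A minimal sketch of how the serializers defined later in this file are built
+# and used (the variable names here are illustrative):
+#
+#     softmax_opts = OptionsSerializer("SoftmaxOptions", ("beta",))
+#     attrs = softmax_opts.deserialize(builtin_data, custom_data)   # -> {"beta": ...}
+#     offset, _ = softmax_opts.serialize(builder, attrs)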
+class OptionsSerializer:
+ def __init__(self, name, members=[]):
+ self.name = name
+ self.module = globals()[self.name]
+ self.cls = getattr(self.module, self.name)
+ self.builtin_opt_type = builtin_options_inv_map[self.cls]
+ self.custom_opt_format = 0
+ self.members = []
+ for mem in members:
+ deserialize = identity
+ serialize = identity_serialize
+ is_vector = False
+ if isinstance(mem, tuple):
+ if len(mem) == 3:
+ mem, deserialize, serialize = mem
+ elif len(mem) == 2:
+ mem, is_vector = mem
+ deserialize = tuple
+ serialize = write_int_vector
+ else:
+ assert 0
+ underscore_mem = mem
+ camelcase_mem = underscore_to_camel_case(mem)
+ self.members.append((underscore_mem, camelcase_mem, deserialize, serialize, is_vector))
+
+ def deserialize(self, builtin_data, custom_data):
+ attrs = {}
+ if builtin_data:
+ tfattrs = self.cls()
+ tfattrs.Init(builtin_data.Bytes, builtin_data.Pos)
+ for underscore_mem, camelcase_mem, deserialize, serialize, is_vector in self.members:
+ fun = camelcase_mem
+ if is_vector:
+ fun += "AsNumpy"
+
+ a = deserialize(getattr(tfattrs, fun)())
+ attrs[underscore_mem] = a
+ return attrs
+
+ def serialize(self, builder, attrs):
+ ser_attrs = []
+ for underscore_mem, camelcase_mem, deserialize, serialize, is_vector in self.members:
+ a = serialize(builder, attrs[underscore_mem])
+ ser_attrs.append((camelcase_mem, a))
+
+ getattr(self.module, self.name + "Start")(builder)
+
+ for camelcase_mem, a in ser_attrs:
+ getattr(self.module, self.name + "Add" + camelcase_mem)(builder, a)
+
+ return getattr(self.module, self.name + "End")(builder), None
+
+
+class CustomOptionsSerializer:
+ def __init__(self):
+ self.builtin_opt_type = 0
+ self.custom_opt_format = 0
+
+ def deserialize(self, builtin_data, custom_data):
+ attrs = {}
+ attrs["custom_options"] = custom_data
+ return attrs
+
+ def serialize(self, builder, attrs):
+
+ custom_opts = attrs.get("custom_options", [])
+ custom_data = []
+
+ # Set NPU op custom options for the TensorFlow Lite custom operator
+ if custom_opts["type"] == "NpuOp":
+ custom_data = [0x01, 0x04, 0x01] # NpuOp=1, FlexbufferFormat.UINT8=4, byte length=1
+
+ custom_data_bytes = struct.pack("<{0}B".format(len(custom_data)), *custom_data)
+ custom_offset = write_byte_vector(builder, custom_data_bytes)
+
+ return None, custom_offset
+
+
+padding_map = {
+ Padding.SAME: b"SAME",
+ Padding.VALID: b"VALID",
+}
+
+padding_inv_map = inverse_map(padding_map)
+
+
+activation_function_map = {
+ ActivationFunctionType.NONE: None,
+ ActivationFunctionType.RELU: "Relu",
+ ActivationFunctionType.RELU_N1_TO_1: "ReluN1To1",
+ ActivationFunctionType.RELU6: "Relu6",
+ ActivationFunctionType.TANH: "Tanh",
+ ActivationFunctionType.SIGN_BIT: "SignBit",
+}
+
+activation_function_inv_map = inverse_map(activation_function_map)
+
+fused_act = ("fused_activation_function", activation_deserialize, activation_serialize)
+padding = ("padding", padding_deserialize, padding_serialize)
+
+pool2d_opts = OptionsSerializer(
+ "Pool2DOptions", (padding, "stride_w", "stride_h", "filter_width", "filter_height", fused_act,)
+)
+
+depthwise_opts = OptionsSerializer(
+ "DepthwiseConv2DOptions",
+ (padding, "stride_w", "stride_h", "depth_multiplier", fused_act, "dilation_w_factor", "dilation_h_factor",),
+)
+
+conv2d_opts = OptionsSerializer(
+ "Conv2DOptions", (padding, "stride_w", "stride_h", fused_act, "dilation_w_factor", "dilation_h_factor",)
+)
+
+lstm_opts = OptionsSerializer("LSTMOptions", (fused_act, "cell_clip", "proj_clip", "kernel_type"))
+
+unidir_seq_lstm_opts = OptionsSerializer(
+ "UnidirectionalSequenceLSTMOptions", (fused_act, "cell_clip", "proj_clip", "time_major")
+)
+
+bidir_seq_lstm_opts = OptionsSerializer(
+ "BidirectionalSequenceLSTMOptions", (fused_act, "cell_clip", "proj_clip", "merge_outputs", "time_major")
+)
+
+rnn_opts = OptionsSerializer("RNNOptions", (fused_act,))
+
+seq_rnn_opts = OptionsSerializer("SequenceRNNOptions", ("time_major", fused_act,))
+
+bidir_seq_rnn_opts = OptionsSerializer("BidirectionalSequenceRNNOptions", ("time_major", fused_act, "merge_outputs",))
+
+
+reducer_opts = OptionsSerializer("ReducerOptions", ("keep_dims",))
+
+is_int_vec = True
+
+custom_prefix = "Custom_"
+
+builtin_operator_map = {
+ BuiltinOperator.ADD: ("AddAct", OptionsSerializer("AddOptions", (fused_act,))),
+ BuiltinOperator.AVERAGE_POOL_2D: ("AvgPoolAct", pool2d_opts),
+ BuiltinOperator.CONCATENATION: ("ConcatTFLite", OptionsSerializer("ConcatenationOptions", ("axis", fused_act))),
+ BuiltinOperator.CONV_2D: ("Conv2DBiasAct", conv2d_opts),
+ BuiltinOperator.DEPTHWISE_CONV_2D: ("DepthwiseConv2dBiasAct", depthwise_opts),
+ BuiltinOperator.DEPTH_TO_SPACE: ("DepthToSpace", OptionsSerializer("DepthToSpaceOptions", ("block_size",))),
+ BuiltinOperator.DEQUANTIZE: ("Dequantize", OptionsSerializer("DequantizeOptions")),
+ BuiltinOperator.EMBEDDING_LOOKUP: (None, None),
+ BuiltinOperator.FLOOR: ("Floor", None),
+ BuiltinOperator.FULLY_CONNECTED: (
+ "FullyConnectedAct",
+ OptionsSerializer("FullyConnectedOptions", (fused_act, "weights_format")),
+ ),
+ BuiltinOperator.HASHTABLE_LOOKUP: (None, None),
+ # BuiltinOperator.L2_NORMALIZATION : "L2NormAct",
+ BuiltinOperator.L2_POOL_2D: (None, pool2d_opts),
+ BuiltinOperator.LOCAL_RESPONSE_NORMALIZATION: (
+ "LRN",
+ OptionsSerializer("LocalResponseNormalizationOptions", ("radius", "bias", "alpha", "beta")),
+ ),
+ BuiltinOperator.LOGISTIC: ("Sigmoid", None),
+ # BuiltinOperator.LSH_PROJECTION : "",
+ BuiltinOperator.LSTM: ("LstmAct", lstm_opts),
+ BuiltinOperator.MAX_POOL_2D: ("MaxPool", pool2d_opts),
+ BuiltinOperator.MUL: ("MulAct", OptionsSerializer("MulOptions", (fused_act,))),
+ BuiltinOperator.RELU: ("Relu", None),
+ BuiltinOperator.RELU_N1_TO_1: (None, None),
+ BuiltinOperator.RELU6: ("Relu6", None),
+ BuiltinOperator.RESHAPE: ("Reshape", OptionsSerializer("ReshapeOptions", (("new_shape", is_int_vec),))),
+ BuiltinOperator.RESIZE_BILINEAR: (
+ "ResizeBilinear",
+ OptionsSerializer("ResizeBilinearOptions", ("align_corners", "half_pixel_centers")),
+ ),
+ BuiltinOperator.RNN: ("RnnAct", rnn_opts),
+ BuiltinOperator.SOFTMAX: ("Softmax", OptionsSerializer("SoftmaxOptions", ("beta",))),
+ BuiltinOperator.SPACE_TO_DEPTH: ("SpaceToDepth", OptionsSerializer("SpaceToDepthOptions", ("block_size",))),
+ BuiltinOperator.SVDF: ("SvdfAct", OptionsSerializer("SVDFOptions", ("rank", fused_act))),
+ BuiltinOperator.TANH: ("Tanh", None),
+ # BuiltinOperator.CONCAT_EMBEDDINGS : "",
+ # BuiltinOperator.SKIP_GRAM : "",
+ # BuiltinOperator.CALL : "",
+ BuiltinOperator.EMBEDDING_LOOKUP_SPARSE: (None, OptionsSerializer("EmbeddingLookupSparseOptions", ("combiner",))),
+ BuiltinOperator.PAD: ("Pad", OptionsSerializer("PadOptions")),
+ BuiltinOperator.UNIDIRECTIONAL_SEQUENCE_RNN: ("UnidirectionalSequenceRnnAct", seq_rnn_opts),
+ BuiltinOperator.GATHER: ("GatherV2", OptionsSerializer("GatherOptions", ("axis",))),
+ BuiltinOperator.BATCH_TO_SPACE_ND: ("BatchToSpaceND", OptionsSerializer("BatchToSpaceNDOptions")),
+ BuiltinOperator.SPACE_TO_BATCH_ND: ("SpaceToBatchND", OptionsSerializer("SpaceToBatchNDOptions")),
+ BuiltinOperator.TRANSPOSE: ("Transpose", OptionsSerializer("TransposeOptions")),
+ BuiltinOperator.MEAN: ("Mean", None),
+ BuiltinOperator.SUB: ("SubAct", OptionsSerializer("SubOptions", (fused_act,))),
+ BuiltinOperator.DIV: ("DivAct", OptionsSerializer("DivOptions", (fused_act,))),
+ BuiltinOperator.SQUEEZE: ("Squeeze", OptionsSerializer("SqueezeOptions", (("squeeze_dims", is_int_vec),))),
+ BuiltinOperator.UNIDIRECTIONAL_SEQUENCE_LSTM: ("UnidirectionalSequenceLstmAct", unidir_seq_lstm_opts),
+ BuiltinOperator.STRIDED_SLICE: (
+ "StridedSlice",
+ OptionsSerializer(
+ "StridedSliceOptions", ("begin_mask", "end_mask", "ellipsis_mask", "new_axis_mask", "shrink_axis_mask")
+ ),
+ ),
+ BuiltinOperator.BIDIRECTIONAL_SEQUENCE_RNN: ("BidirectionalSequenceRnnAct", bidir_seq_rnn_opts),
+ BuiltinOperator.EXP: ("Exp", OptionsSerializer("ExpOptions")),
+ BuiltinOperator.TOPK_V2: ("TopKV2", OptionsSerializer("TopKV2Options")),
+ BuiltinOperator.SPLIT: ("Split", OptionsSerializer("SplitOptions", ("num_splits",))),
+ BuiltinOperator.LOG_SOFTMAX: ("LogSoftmax", OptionsSerializer("LogSoftmaxOptions")),
+ # BuiltinOperator.DELEGATE : "",
+ BuiltinOperator.BIDIRECTIONAL_SEQUENCE_LSTM: ("BidirectionalSequenceLstmAct", bidir_seq_lstm_opts),
+ BuiltinOperator.CAST: (
+ "Cast",
+ OptionsSerializer(
+ "CastOptions",
+ (
+ ("in_data_type", datatype_deserialize, datatype_serialize),
+ ("out_data_type", datatype_deserialize, datatype_serialize),
+ ),
+ ),
+ ),
+ # BuiltinOperator.PRELU : "",
+ BuiltinOperator.MAXIMUM: ("Maximum", OptionsSerializer("MaximumMinimumOptions")),
+ BuiltinOperator.ARG_MAX: (
+ "ArgMax",
+ OptionsSerializer("ArgMaxOptions", (("output_type", datatype_deserialize, datatype_serialize),)),
+ ),
+ BuiltinOperator.MINIMUM: ("Minimum", OptionsSerializer("MaximumMinimumOptions")),
+ BuiltinOperator.LESS: ("Less", None),
+ BuiltinOperator.NEG: ("Neg", None),
+ BuiltinOperator.PADV2: ("PadV2", None),
+ BuiltinOperator.GREATER: ("Greater", None),
+ BuiltinOperator.GREATER_EQUAL: ("GreaterEqual", None),
+ BuiltinOperator.LESS_EQUAL: ("LessEqual", None),
+ BuiltinOperator.SELECT: ("Select", None),
+ BuiltinOperator.SLICE: ("Slice", None),
+ BuiltinOperator.SIN: ("Sin", None),
+ BuiltinOperator.TRANSPOSE_CONV: (
+ "Conv2DBackpropInput",
+ OptionsSerializer("TransposeConvOptions", (padding, "stride_w", "stride_h")),
+ ),
+ BuiltinOperator.SPARSE_TO_DENSE: (
+ "SparseToDense",
+ OptionsSerializer("SparseToDenseOptions", ("validate_indices",)),
+ ),
+ BuiltinOperator.TILE: ("Tile", OptionsSerializer("TileOptions")),
+ BuiltinOperator.EXPAND_DIMS: ("ExpandDims", None),
+ BuiltinOperator.EQUAL: ("Equal", None),
+ BuiltinOperator.NOT_EQUAL: ("NotEqual", None),
+ BuiltinOperator.LOG: ("Log", None),
+ BuiltinOperator.SUM: ("Sum", None),
+ BuiltinOperator.SQRT: ("Sqrt", None),
+ BuiltinOperator.RSQRT: ("Rsqrt", None),
+ BuiltinOperator.SHAPE: (
+ "Shape",
+ OptionsSerializer("ShapeOptions", (("out_type", datatype_deserialize, datatype_serialize),)),
+ ),
+ BuiltinOperator.POW: ("Pow", None),
+ BuiltinOperator.ARG_MIN: (
+ "ArgMin",
+ OptionsSerializer("ArgMinOptions", (("output_type", datatype_deserialize, datatype_serialize),)),
+ ),
+ BuiltinOperator.FAKE_QUANT: (
+ "FakeQuantWithMinMaxArgs",
+ OptionsSerializer("FakeQuantOptions", ("min", "max", "num_bits", "narrow_range")),
+ ),
+ BuiltinOperator.REDUCE_PROD: ("Prod", reducer_opts),
+ BuiltinOperator.REDUCE_MAX: ("Max", reducer_opts),
+ BuiltinOperator.PACK: ("Pack", OptionsSerializer("PackOptions", ("values_count", "axis"))),
+ BuiltinOperator.LOGICAL_OR: ("LogicalOr", None),
+ BuiltinOperator.ONE_HOT: ("OneHot", OptionsSerializer("OneHotOptions", ("axis",))),
+ BuiltinOperator.LOGICAL_AND: ("LogicalAnd", None),
+ BuiltinOperator.LOGICAL_NOT: ("LogicalNot", None),
+ BuiltinOperator.UNPACK: ("Unpack", OptionsSerializer("UnpackOptions", ("num", "axis"))),
+ BuiltinOperator.REDUCE_MIN: ("Min", reducer_opts),
+ BuiltinOperator.FLOOR_DIV: ("FloorDiv", None),
+ BuiltinOperator.REDUCE_ANY: ("Any", reducer_opts),
+ BuiltinOperator.SQUARE: ("Square", None),
+ BuiltinOperator.ZEROS_LIKE: ("ZerosLike", None),
+ BuiltinOperator.FILL: ("Fill", None),
+ BuiltinOperator.FLOOR_MOD: ("FloorMod", None),
+ BuiltinOperator.RANGE: ("Range", None),
+ BuiltinOperator.RESIZE_NEAREST_NEIGHBOR: (
+ "ResizeNearestNeighbor",
+ OptionsSerializer("ResizeNearestNeighborOptions", ("align_corners",)),
+ ),
+ BuiltinOperator.LEAKY_RELU: ("LeakyRelu", OptionsSerializer("LeakyReluOptions", ("alpha",))),
+ BuiltinOperator.SQUARED_DIFFERENCE: ("SquaredDifference", None),
+ BuiltinOperator.MIRROR_PAD: ("MirrorPad", OptionsSerializer("MirrorPadOptions", ("mode",))),
+ BuiltinOperator.ABS: ("Abs", None),
+ BuiltinOperator.SPLIT_V: ("SplitV", OptionsSerializer("SplitVOptions", ("num_splits",))),
+ BuiltinOperator.UNIQUE: (
+ "Unique",
+ OptionsSerializer("UniqueOptions", (("idx_out_type", datatype_deserialize, datatype_serialize),)),
+ ),
+ BuiltinOperator.CEIL: ("Ceil", None),
+ BuiltinOperator.REVERSE_V2: ("ReverseV2", None),
+ BuiltinOperator.ADD_N: ("AddN", None),
+ BuiltinOperator.GATHER_ND: ("GatherNd", None),
+ BuiltinOperator.COS: ("Cos", None),
+ BuiltinOperator.WHERE: ("Where", None),
+ BuiltinOperator.RANK: ("Rank", None),
+ BuiltinOperator.ELU: ("Elu", None),
+ BuiltinOperator.REVERSE_SEQUENCE: (
+ "ReverseSequence",
+ OptionsSerializer("ReverseSequenceOptions", ("seq_dim", "batch_dim")),
+ ),
+ BuiltinOperator.MATRIX_DIAG: ("MatrixDiag", None),
+ BuiltinOperator.QUANTIZE: ("Quantize", None),
+ BuiltinOperator.MATRIX_SET_DIAG: ("MatrixSetDiag", None),
+ BuiltinOperator.IF: ("If", OptionsSerializer("IfOptions", ("then_subgraph_index", "else_subgraph_index"))),
+ BuiltinOperator.WHILE: ("While", OptionsSerializer("WhileOptions", ("cond_subgraph_index", "body_subgraph_index"))),
+ BuiltinOperator.NON_MAX_SUPPRESSION_V4: ("NonMaxSuppressionV4", OptionsSerializer("NonMaxSuppressionV4Options")),
+ BuiltinOperator.NON_MAX_SUPPRESSION_V5: ("NonMaxSuppressionV5", OptionsSerializer("NonMaxSuppressionV5Options")),
+ BuiltinOperator.SCATTER_ND: ("ScatterNd", OptionsSerializer("ScatterNdOptions")),
+ BuiltinOperator.SELECT_V2: ("SelectV2", OptionsSerializer("SelectV2Options")),
+ BuiltinOperator.DENSIFY: ("Densify", OptionsSerializer("DensifyOptions")),
+ BuiltinOperator.SEGMENT_SUM: ("SegmentSum", OptionsSerializer("SegmentSumOptions")),
+ BuiltinOperator.CUSTOM: (custom_prefix, CustomOptionsSerializer()),
+}
+
+builtin_operator_inv_map = {v[0]: (k, v[1]) for k, v in builtin_operator_map.items()}
+
+builtin_operator_inv_map["NpuOp"] = (BuiltinOperator.CUSTOM, CustomOptionsSerializer())
diff --git a/ethosu/vela/tflite_reader.py b/ethosu/vela/tflite_reader.py
new file mode 100644
index 00000000..535847d7
--- /dev/null
+++ b/ethosu/vela/tflite_reader.py
@@ -0,0 +1,252 @@
+# Copyright (C) 2020 Arm Limited or its affiliates. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the License); you may
+# not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an AS IS BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+# Description:
+# Functions used to read from a TensorFlow Lite format file.
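+#
+# Typical use (sketch; "model.tflite" is a placeholder path):
+#
+#     nng = read_tflite("model.tflite", batch_size=1)
+#     # nng is a Graph containing one Subgraph per TFLite subgraph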
+
+from .tflite.Model import Model
+from .tflite.BuiltinOperator import BuiltinOperator
+
+import numpy as np
+import os.path
+from .nn_graph import Graph, Operation, Subgraph
+from .tensor import Tensor, QuantizationParameters
+
+from .tflite_mapping import builtin_operator_map, datatype_map, datatype_map_numpy, DataType
+
+
+def decode_str(s):
+ if s is None:
+ return ""
+ return s.decode("utf-8")
+
+
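+# Permute a constant tensor's shape/values according to `reorder` and attach a Const op as
+# its producer. Callers below use (1, 2, 3, 0) for Conv2D/DepthwiseConv2D weights and
+# (1, 0) for FullyConnected weights.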
+def reshape_tensor_add_const_op(tens, reorder):
+ if not tens.reshaped:
+ original_shape = tens.shape
+ tens.name = tens.name + "_reshape"
+ tens.shape = [original_shape[idx] for idx in reorder]
+ tens.bandwidth_shape = tens.shape
+ tens.storage_shape = tens.shape
+
+ if tens.values is not None:
+ tens.values = tens.values.transpose(reorder)
+
+ if tens.quant_values is not None:
+ tens.quant_values = tens.quant_values.transpose(reorder)
+
+ op = Operation("Const", tens.name)
+ op.outputs = [tens]
+ tens.ops = [op]
+ tens.reshaped = True
+
+
+class TFLiteSubgraph:
+ def __init__(self, graph, subgraph):
+ self.graph = graph
+ self.name = decode_str(subgraph.Name())
+
+ self.tensors = []
+ for idx in range(subgraph.TensorsLength()):
+ self.tensors.append(self.parse_tensor(subgraph.Tensors(idx)))
+
+ for idx in range(subgraph.OperatorsLength()):
+ self.parse_operator(subgraph.Operators(idx))
+
+ self.outputs = [self.tensors[idx] for idx in subgraph.OutputsAsNumpy()]
+ self.inputs = [self.tensors[idx] for idx in subgraph.InputsAsNumpy()]
+
+ # Fix up tensors without operations. Generate either Placeholder or Constant ops
+ for tens in self.inputs:
+ assert not tens.ops
+ op = Operation("Placeholder", tens.name)
+ op.outputs = [tens]
+ tens.ops = [op]
+
+ for tens in self.tensors:
+ if not tens.ops:
+ op = Operation("Const", tens.name)
+ op.outputs = [tens]
+ tens.ops = [op]
+
+ def parse_tensor(self, tens_data):
+ np_shape = tens_data.ShapeAsNumpy()
+ shape = list(np_shape) if type(np_shape) is np.ndarray else []
+ name = decode_str(tens_data.Name())
+ dtype = datatype_map[tens_data.Type()]
+
+ tens = Tensor(shape, dtype, name)
+
+ quant = tens_data.Quantization()
+
+ def len1_array_to_scalar(arr):
+ # The following flatbuffer quantisation fields all return a scalar value of 0 if they are not defined in
+ # the input buffer. This is represented in Vela by using None.
+ # Otherwise, the fields return a single- or multi-element array, in which case single-element arrays
+ # are converted to scalars.
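+ # e.g. 0 -> None, array([0.5]) -> 0.5, array([0.5, 0.25]) -> array([0.5, 0.25])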
+ if isinstance(arr, int) and arr == 0:
+ return None
+ if len(arr) == 1:
+ return arr[0]
+ return arr
+
+ tens.quantization = QuantizationParameters()
+ tens.quantization.min = len1_array_to_scalar(quant.MinAsNumpy())
+ tens.quantization.max = len1_array_to_scalar(quant.MaxAsNumpy())
+ tens.quantization.scale_f32 = len1_array_to_scalar(quant.ScaleAsNumpy())
+ tens.quantization.zero_point = len1_array_to_scalar(quant.ZeroPointAsNumpy())
+
+ if dtype == DataType.uint8:
+ tens.quantization.quant_min = 0
+ tens.quantization.quant_max = (1 << dtype.bits) - 1
+ elif dtype in set((DataType.int8, DataType.int16, DataType.int32, DataType.int64)):
+ tens.quantization.quant_min = -(1 << (dtype.bits - 1))
+ tens.quantization.quant_max = (1 << (dtype.bits - 1)) - 1
+ else:
+ raise Exception("DataType '" + str(dtype) + "' is not supported for quantization.")
+
+ if tens.quantization.scale_f32 is None and tens.quantization.zero_point is None:
+ tens.quantization = None
+
+ tens.values = None
+ buf = self.graph.buffers[tens_data.Buffer()]
+ if buf is not None:
+ tens.values = np.array(buf.view(datatype_map_numpy[tens_data.Type()]).reshape(shape))
+ if tens.quantization is not None:
+ tens.quant_values = tens.values
+ tens.values = tens.quantization.dequantize(tens.quant_values)
+ return tens
+
+ def parse_operator(self, op_data):
+ op_type, opt_serializer = self.graph.operator_codes[op_data.OpcodeIndex()]
+ inputs = [self.tensors[idx] for idx in op_data.InputsAsNumpy()]
+ outputs = [self.tensors[idx] for idx in op_data.OutputsAsNumpy()]
+ name = "unknown_op_name"
+ if len(outputs):
+ name = outputs[0].name
+ op = Operation(op_type, name)
+ op.inputs = inputs
+ op.outputs = outputs
+ for out in op.outputs:
+ out.ops = [op]
+
+ activation_function_to_split_out = None
+
+ if op_type.startswith("DepthwiseConv2d") or op_type.startswith("Conv2D"):
+ reshape_tensor_add_const_op(inputs[1], (1, 2, 3, 0))
+
+ if op_type.startswith("FullyConnected"):
+ reshape_tensor_add_const_op(inputs[1], (1, 0))
+
+ if opt_serializer is not None:
+ op.attrs = opt_serializer.deserialize(op_data.BuiltinOptions(), op_data.CustomOptionsAsNumpy())
+
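+ # Convert TFLite's scalar stride/filter/dilation fields into the (1, h, w, 1) tuples
+ # used internally, e.g. stride_h=2, stride_w=1 -> strides=(1, 2, 1, 1).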
+ if "stride_w" in op.attrs:
+ op.attrs["strides"] = (1, op.attrs["stride_h"], op.attrs["stride_w"], 1)
+ if "filter_width" in op.attrs:
+ op.attrs["ksize"] = (1, op.attrs["filter_height"], op.attrs["filter_width"], 1)
+ if "dilation_w_factor" in op.attrs:
+ op.attrs["dilation"] = (1, op.attrs["dilation_h_factor"], op.attrs["dilation_w_factor"], 1)
+ if "depth_multiplier" in op.attrs:
+ op.attrs["channel_multiplier"] = op.attrs["depth_multiplier"]
+
+ if "fused_activation_function" in op.attrs:
+ if op_type in set(("ConcatTFLite",)):
+ act = op.attrs["fused_activation_function"]
+ del op.attrs["fused_activation_function"]
+ if act is not None:
+ activation_function_to_split_out = act
+
+ if activation_function_to_split_out is not None:
+ act_op = Operation(activation_function_to_split_out, name + activation_function_to_split_out)
+ out_tens = op.outputs[0]
+ intermediate_tens = out_tens.clone("_act_intermediate")
+ out_tens.ops = [act_op]
+ act_op.outputs = [out_tens]
+ intermediate_tens.ops = [op]
+ op.outputs[0] = intermediate_tens
+ act_op.inputs = [intermediate_tens]
+
+
+class TFLiteGraph:
+ def __init__(
+ self,
+ filename,
+ batch_size=1,
+ feed_dict={},
+ output_node_names=[],
+ initialisation_nodes=[],
+ ):
+
+ self.op_times = {}
+ if batch_size is None:
+ batch_size = 1
+ self.batch_size = batch_size
+ self.name = os.path.splitext(os.path.basename(filename))[0]
+ self.initialisation_nodes = initialisation_nodes
+
+ with open(filename, "rb") as f:
+ buf = bytearray(f.read())
+
+ model = Model.GetRootAsModel(buf, 0)
+
+ self.buffers = []
+ for idx in range(model.BuffersLength()):
+ self.buffers.append(self.parse_buffer(model.Buffers(idx)))
+
+ self.operator_codes = []
+ for idx in range(model.OperatorCodesLength()):
+ self.operator_codes.append(self.parse_operator_code(model.OperatorCodes(idx)))
+
+ self.subgraphs = []
+ for idx in range(model.SubgraphsLength()):
+ self.subgraphs.append(TFLiteSubgraph(self, model.Subgraphs(idx)))
+
+ self.nng = Graph(self.name, self.batch_size)
+ for tflite_sg in self.subgraphs:
+ sg = Subgraph(tflite_sg.name)
+ sg.original_inputs = tflite_sg.inputs # Preserve the original input order
+ sg.output_tensors = tflite_sg.outputs
+ self.nng.subgraphs.append(sg)
+
+ def parse_buffer(self, buf_data):
+ if buf_data.DataLength() == 0:
+ return None
+ data = buf_data.DataAsNumpy()
+ return data
+
+ def parse_operator_code(self, code):
+ c = code.BuiltinCode()
+ op_type, ser = builtin_operator_map[c]
+ if c == BuiltinOperator.CUSTOM:
+ op_type += decode_str(code.CustomCode())
+ return op_type, ser
+
+
+def read_tflite(
+ filename,
+ batch_size=1,
+ feed_dict={},
+ output_node_names=[],
+ initialisation_nodes=[],
+):
+ tflite_graph = TFLiteGraph(
+ filename, batch_size, feed_dict, output_node_names, initialisation_nodes
+ )
+ nng = tflite_graph.nng
+ nng.refresh_after_modification()
+ return nng
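+
+
+# A minimal usage sketch (illustrative only; "model.tflite" is a hypothetical path,
+# not something provided by this module):
+#
+#   nng = read_tflite("model.tflite", batch_size=1)
+#   nng.print_operators()
+#   for sg in nng.subgraphs:
+#       print(sg.name, len(sg.output_tensors), "output tensor(s)")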
diff --git a/ethosu/vela/tflite_writer.py b/ethosu/vela/tflite_writer.py
new file mode 100644
index 00000000..f55d1ce5
--- /dev/null
+++ b/ethosu/vela/tflite_writer.py
@@ -0,0 +1,424 @@
+# Copyright (C) 2020 Arm Limited or its affiliates. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the License); you may
+# not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an AS IS BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+# Description:
+# Functions used to write to a TensorFlow Lite format file. Supports adding a file identifier.
+
+import flatbuffers
+
+from .tflite import Tensor
+from .tflite import QuantizationParameters
+from .tflite import Model
+from .tflite import SubGraph
+from .tflite import OperatorCode
+from .tflite import Operator
+from .tflite import Buffer
+from .tflite import Metadata
+
+import numpy as np
+
+from .tflite_mapping import datatype_inv_map, builtin_operator_inv_map, custom_prefix, BuiltinOperator
+from .nn_graph import PassPlacement
+from .tensor import TensorPurpose, MemArea
+from flatbuffers.builder import UOffsetTFlags
+
+tflite_version = 3
+tflite_file_identifier = "TFL" + str(tflite_version)
+
+
+import flatbuffers.number_types as N
+from flatbuffers import encode
+
+
+def FinishWithFileIdentifier(self, rootTable, fid):
+ if fid is None or len(fid) != 4:
+ raise Exception("fid must be 4 chars")
+
+ flags = N.Uint8Flags
+ prepSize = 4
+ self.Prep(self.minalign, prepSize + len(fid))
+ for i in range(3, -1, -1):
+ self.head = self.head - flags.bytewidth
+ encode.Write(flags.packer_type, self.Bytes, self.Head(), ord(fid[i]))
+
+ return self.Finish(rootTable)
+
+
+flatbuffers.Builder.FinishWithFileIdentifier = FinishWithFileIdentifier
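+
+# Illustrative note (a sketch, not executed here): FlatBuffers places the 4-character
+# identifier immediately after the 4-byte root offset, so a serialised model can be
+# recognised with a check along the lines of:
+#
+#   buf = builder.Output()
+#   assert buf[4:8] == b"TFL3"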
+
+
+def make_vector(v):
+ try:
+ len(v)
+ return v
+ except TypeError:
+ return [v]
+
+
+class TFLiteSerialiser:
+ def __init__(self, nng):
+ self.builder = flatbuffers.Builder(0)
+ self.nng = nng
+
+ self.scratch_buf_id = 0 # Always assign scratch to buffer 0
+ self.buffer_offsets_map = {}
+ self.buffers_to_write = []  # Populated when the tensors are serialised
+
+ self.input_tensors = []
+ self.ops_to_ignore = set(("Const", "Placeholder", "SubgraphInput"))
+
+ self.tensors_to_reshape = {}
+
+ self.subgraphs_to_write = [sg for sg in self.nng.subgraphs if sg.placement == PassPlacement.Cpu]
+
+ all_ops = []
+ for sg in self.subgraphs_to_write:
+ for ps in sg.passes:
+ for op in ps.ops:
+ if op.type not in self.ops_to_ignore:
+ all_ops.append(op)
+ if op.type.startswith("Conv2D") or op.type.startswith("DepthwiseConv2d"):
+ self.tensors_to_reshape[op.inputs[1]] = (3, 0, 1, 2)
+ if op.type.startswith("FullyConnected"):
+ self.tensors_to_reshape[op.inputs[1]] = (1, 0)
+
+ self.operator_codes = list(sorted(set(op.type for op in all_ops)))
+ self.operator_code_map = {}
+
+ def write_byte_vector(self, v, alignment=1):
+ builder = self.builder
+ builder.StartVector(1, len(v), alignment)
+ for e in v[::-1]:
+ builder.PrependByte(e)
+ return builder.EndVector(len(v))
+
+ def write_int_vector(self, v):
+ builder = self.builder
+ builder.StartVector(4, len(v), 4)
+ for e in v[::-1]:
+ builder.PrependInt32(e)
+ return builder.EndVector(len(v))
+
+ def write_long_vector(self, v):
+ builder = self.builder
+ builder.StartVector(8, len(v), 8)
+ for e in v[::-1]:
+ builder.PrependInt64(e)
+ return builder.EndVector(len(v))
+
+ def write_float_vector(self, v):
+ builder = self.builder
+ builder.StartVector(4, len(v), 4)
+ for e in v[::-1]:
+ builder.PrependFloat32(e)
+ return builder.EndVector(len(v))
+
+ def write_offset_vector(self, v):
+ builder = self.builder
+ builder.StartVector(4, len(v), 4)
+ for e in v[::-1]:
+ builder.PrependUOffsetTRelative(e)
+ return builder.EndVector(len(v))
+
+ def assign_buffers_to_tensors(self, tensors):
+ buffer_map = {}
+ scratch_tensor = [tens for tens in tensors if tens.purpose == TensorPurpose.Scratch][0]
+ buf_idx = 1
+
+ for tens in tensors:
+ if tens.mem_area == scratch_tensor.mem_area:
+ buffer_map[tens] = self.scratch_buf_id
+ else:
+ buffer_map[tens] = buf_idx
+ buf_idx += 1
+
+ # Initialise buffers_to_write to a length equal to the number of buffers so
+ # that entries can be assigned at the correct index during tensor serialisation
+ self.buffers_to_write = [None] * (buf_idx)
+
+ return buffer_map
+
+ def serialise_operator_code(self, idx, code):
+ builder = self.builder
+ custom_code_offset = None
+ if code.startswith(custom_prefix):
+ tf_code, opt_serializer = builtin_operator_inv_map[custom_prefix]
+ custom_code_offset = builder.CreateString(code[len(custom_prefix) :])
+ else:
+ try:
+ tf_code, opt_serializer = builtin_operator_inv_map[code]
+ except KeyError:
+ print(
+ "Warning: Writing operation %s, which does not have a direct TensorFlow Lite mapping, as a custom operation"
+ % (code,)
+ )
+ tf_code, opt_serializer = builtin_operator_inv_map[custom_prefix]
+
+ if tf_code == BuiltinOperator.CUSTOM:
+ assert code == "NpuOp" # Currently only support serialising NPU operators as a custom op
+ custom_code_offset = builder.CreateString("ethos-u")
+
+ self.operator_code_map[code] = (idx, tf_code, opt_serializer)
+
+ OperatorCode.OperatorCodeStart(builder)
+ OperatorCode.OperatorCodeAddBuiltinCode(builder, tf_code)
+ if custom_code_offset is not None:
+ OperatorCode.OperatorCodeAddCustomCode(builder, custom_code_offset)
+
+ return OperatorCode.OperatorCodeEnd(builder)
+
+ def serialise_quantization_parameters(self, quant):
+ builder = self.builder
+
+ min = None
+ max = None
+ scale = None
+ zero_point = None
+ if quant is not None:
+ if quant.min is not None:
+ min = self.write_float_vector(make_vector(quant.min))
+ if quant.max is not None:
+ max = self.write_float_vector(make_vector(quant.max))
+ if quant.scale_f32 is not None:
+ scale = self.write_float_vector(make_vector(quant.scale_f32))
+ if quant.zero_point is not None:
+ zero_point = self.write_long_vector(make_vector(quant.zero_point))
+
+ QuantizationParameters.QuantizationParametersStart(builder)
+ if min is not None:
+ QuantizationParameters.QuantizationParametersAddMin(builder, min)
+ if max is not None:
+ QuantizationParameters.QuantizationParametersAddMax(builder, max)
+ if scale is not None:
+ QuantizationParameters.QuantizationParametersAddScale(builder, scale)
+ if zero_point is not None:
+ QuantizationParameters.QuantizationParametersAddZeroPoint(builder, zero_point)
+ return QuantizationParameters.QuantizationParametersEnd(builder)
+
+ def serialise_tensor(self, tens):
+ builder = self.builder
+ tens_shape = tens.shape
+ values = tens.quant_values
+ if values is None:
+ values = tens.values
+
+ if values is None:
+ values = np.empty(shape=(0), dtype=np.uint8)
+
+ if tens in self.tensors_to_reshape:
+ reorder = self.tensors_to_reshape[tens]
+ tens_shape = [tens_shape[idx] for idx in reorder]
+ values = values.transpose(reorder)
+
+ if tens.purpose == TensorPurpose.Scratch:
+ tens_shape = [0]
+ self.buffers_to_write[self.scratch_buf_id] = values.flatten().view(np.uint8)
+
+ buf_id = self.buffer_map[tens]
+ if buf_id != self.scratch_buf_id:
+ self.buffers_to_write[buf_id] = values.flatten().view(np.uint8)
+
+ shape = self.write_int_vector(tens_shape)
+
+ name = builder.CreateString(tens.name)
+ quant = self.serialise_quantization_parameters(tens.quantization)
+
+ Tensor.TensorStart(builder)
+ Tensor.TensorAddShape(builder, shape)
+ Tensor.TensorAddType(builder, datatype_inv_map[tens.dtype])
+ # All tensors must have a valid backing buffer, even if it is empty.
+ # Empty buffers should be kept unique for TensorFlow Lite Micro
+ Tensor.TensorAddBuffer(builder, buf_id)
+ Tensor.TensorAddName(builder, name)
+ Tensor.TensorAddQuantization(builder, quant)
+
+ res = Tensor.TensorEnd(builder)
+ return res
+
+ def serialise_operator(self, op):
+ builder = self.builder
+
+ inputs_offset = self.write_int_vector([self.tensor_map[tens] for tens in op.inputs])
+ outputs_offset = self.write_int_vector([self.tensor_map[tens] for tens in op.outputs])
+
+ op_idx, tflop, opt_serializer = self.operator_code_map[op.type]
+
+ builtin_opt_offset = None
+ custom_opt_offset = None
+ if opt_serializer is not None:
+ attrs = dict(op.attrs)
+ if "strides" in attrs:
+ attrs["stride_h"] = attrs["strides"][1]
+ attrs["stride_w"] = attrs["strides"][2]
+ if "ksize" in attrs:
+ attrs["filter_height"] = attrs["ksize"][1]
+ attrs["filter_width"] = attrs["ksize"][2]
+ if "dilation" in attrs:
+ attrs["dilation_h_factor"] = attrs["dilation"][1]
+ attrs["dilation_w_factor"] = attrs["dilation"][2]
+ if "channel_multiplier" in attrs:
+ attrs["depth_multiplier"] = attrs["channel_multiplier"]
+
+ builtin_opt_offset, custom_opt_offset = opt_serializer.serialize(builder, attrs)
+
+ mutating_variable_inputs_offset = self.write_byte_vector([])
+ Operator.OperatorStart(builder)
+ Operator.OperatorAddOpcodeIndex(builder, op_idx)
+ Operator.OperatorAddInputs(builder, inputs_offset)
+ Operator.OperatorAddOutputs(builder, outputs_offset)
+
+ if builtin_opt_offset is not None:
+ Operator.OperatorAddBuiltinOptionsType(builder, opt_serializer.builtin_opt_type)
+ Operator.OperatorAddBuiltinOptions(builder, builtin_opt_offset)
+ if custom_opt_offset is not None:
+ Operator.OperatorAddCustomOptions(builder, custom_opt_offset)
+ Operator.OperatorAddCustomOptionsFormat(builder, opt_serializer.custom_opt_format)
+
+ Operator.OperatorAddMutatingVariableInputs(builder, mutating_variable_inputs_offset)
+ return Operator.OperatorEnd(builder)
+
+ def serialise_subgraph(self, sg):
+ builder = self.builder
+ tensor_set = set()
+
+ all_ops = []
+ for ps in sg.passes:
+ for op in ps.ops:
+ if op.type not in self.ops_to_ignore:
+ all_ops.append(op)
+
+ for op in all_ops:
+ for tens in op.inputs + op.outputs:
+ tensor_set.add(tens)
+
+ all_tensors = [tens for nm, idx, tens in sorted((tens.name, idx, tens) for idx, tens in enumerate(tensor_set))]
+
+ self.tensor_map = {tens: idx for idx, tens in enumerate(all_tensors)}
+ self.buffer_map = self.assign_buffers_to_tensors(all_tensors)
+
+ tensors_offset = self.write_offset_vector([self.serialise_tensor(tens) for tens in all_tensors])
+
+ # Add the Scratch Tensor as input to the NPU subgraph to get it allocated by TensorFlow Lite Micro
+ scratch_tensor_idx = [v for k, v in self.tensor_map.items() if k.name.endswith("scratch")]
+
+ # Make sure the input_tensors haven't been modified
+ assert all(inp in sg.original_inputs for inp in sg.input_tensors)
+ inputs_offset = self.write_int_vector(
+ [self.tensor_map[tens] for tens in sg.original_inputs] + scratch_tensor_idx
+ )
+ outputs_offset = self.write_int_vector([self.tensor_map[tens] for tens in sg.output_tensors])
+
+ operators_offset = self.write_offset_vector([self.serialise_operator(op) for op in all_ops])
+
+ SubGraph.SubGraphStart(builder)
+ SubGraph.SubGraphAddTensors(builder, tensors_offset)
+ SubGraph.SubGraphAddInputs(builder, inputs_offset)
+ SubGraph.SubGraphAddOutputs(builder, outputs_offset)
+
+ SubGraph.SubGraphAddOperators(builder, operators_offset)
+
+ return SubGraph.SubGraphEnd(builder)
+
+ def write_aligned_bytes(self, buf):
+ builder = self.builder
+ builder.nested = True
+ data = bytes(buf)
+ length_bytes = UOffsetTFlags.py_type(len(data))
+ builder.Prep(16, length_bytes) # Reserve aligned storage
+ builder.head = UOffsetTFlags.py_type(builder.Head() - length_bytes) # Update FlatBuffer internal pointer
+ builder.Bytes[builder.Head() : builder.Head() + length_bytes] = data # Assign bytes to aligned area
+ return builder.EndVector(length_bytes)
+
+ def serialise_buffer(self, buf):
+ builder = self.builder
+ data = None
+ if buf is not None:
+ data = self.write_aligned_bytes(buf)
+ Buffer.BufferStart(builder)
+ if data is not None:
+ Buffer.BufferAddData(builder, data)
+ return Buffer.BufferEnd(builder)
+
+ def serialise_metadata(self, metadata):
+ builder = self.builder
+ name = builder.CreateString(metadata[0])
+
+ Metadata.MetadataStart(builder)
+ Metadata.MetadataAddName(builder, name)
+ Metadata.MetadataAddBuffer(builder, metadata[1])
+
+ return Metadata.MetadataEnd(builder)
+
+ def serialise_model(self):
+ builder = self.builder
+ operator_code_offset = self.write_offset_vector(
+ [self.serialise_operator_code(idx, code) for idx, code in enumerate(self.operator_codes)]
+ )
+
+ description = builder.CreateString("Vela Optimised")
+
+ subgraph_offset = self.write_offset_vector([self.serialise_subgraph(sg) for sg in self.subgraphs_to_write])
+
+ # Fill the metadata buffer
+ version = np.int32(0)
+ subgraph_idx = np.int32(len(self.subgraphs_to_write)) # Only 1 supported currently
+ nbr_tensors = np.int32(len(self.tensor_map))
+
+ # An offset of -1 indicates that the tensor will be allocated online by TensorFlow Lite Micro
+ offsets = [np.int32(-1)] * nbr_tensors
+
+ # Ensure that the order of the offsets matches the order of the tensors
+ for tens, idx in self.tensor_map.items():
+ if tens.mem_area == MemArea.Sram:
+ offsets[idx] = np.int32(tens.address)
+
+ metadata_buffer = np.array([version, subgraph_idx, nbr_tensors] + offsets)
+ self.buffers_to_write.append(metadata_buffer)
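+ # Worked example (sketch only): a model with three tensors, where tensor
+ # indices 0 and 1 are in SRAM at addresses 0 and 64 and index 2 is not,
+ # gives the int32 metadata buffer [0, 1, 3, 0, 64, -1]
+ # (version, number of subgraphs, number of tensors, then the per-tensor offsets).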
+
+ buffers_offset = self.write_offset_vector([self.serialise_buffer(buf) for buf in self.buffers_to_write])
+
+ metadata_list = [("OfflineMemoryAllocation", len(self.buffers_to_write) - 1)]
+ metadata_offset = self.write_offset_vector([self.serialise_metadata(metadata) for metadata in metadata_list])
+
+ Model.ModelStart(builder)
+ Model.ModelAddVersion(builder, tflite_version)
+ Model.ModelAddOperatorCodes(builder, operator_code_offset)
+ Model.ModelAddSubgraphs(builder, subgraph_offset)
+ Model.ModelAddDescription(builder, description)
+ Model.ModelAddBuffers(builder, buffers_offset)
+ Model.ModelAddMetadata(builder, metadata_offset)
+ return Model.ModelEnd(builder)
+
+ def serialise(self):
+
+ model = self.serialise_model()
+
+ self.builder.FinishWithFileIdentifier(model, tflite_file_identifier)
+
+ return self.builder.Output()
+
+ def write(self, filename):
+ with open(filename, "wb") as f:
+ f.write(self.serialise())
+
+
+def write_tflite(nng, filename):
+ writer = TFLiteSerialiser(nng)
+ buf = writer.serialise()
+
+ with open(filename, "wb") as f:
+ f.write(buf)
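+
+
+# A minimal usage sketch (illustrative only; assumes a compiled graph object `nng`
+# and a hypothetical output path):
+#
+#   write_tflite(nng, "output/model_vela.tflite")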
diff --git a/ethosu/vela/vela.py b/ethosu/vela/vela.py
new file mode 100644
index 00000000..f07aec89
--- /dev/null
+++ b/ethosu/vela/vela.py
@@ -0,0 +1,334 @@
+# Copyright (C) 2020 Arm Limited or its affiliates. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the License); you may
+# not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an AS IS BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+# Description:
+# Main entry point for the Vela compiler.
+#
+# Provides command line interface, options parsing, and network loading before calling the compiler driver.
+
+import sys
+import os.path
+import os
+import time
+import subprocess
+import configparser
+import argparse
+import ast
+
+from . import architecture_features
+from . import stats_writer
+from . import tflite_writer
+from . import model_reader
+from . import compiler_driver
+from . import scheduler
+from ._version import __version__
+from .scheduler import ParetoMetric
+from .nn_graph import MemArea, TensorFormat, TensorAllocator, PassPlacement
+
+
+def process(fname, arch, model_reader_options, compiler_options, scheduler_options):
+ if compiler_options.timing:
+ start = time.time()
+
+ nng = model_reader.read_model(fname, model_reader_options)
+
+ if not nng:
+ print("reading of", fname, "failed")
+ assert False
+
+ if compiler_options.verbose_operators:
+ nng.print_operators()
+
+ if compiler_options.timing:
+ stop = time.time()
+ print("Model reading took %f s" % (stop - start))
+ start = time.time()
+
+ compiler_driver.compiler_driver(nng, arch, compiler_options, scheduler_options)
+
+ passes_csv_file = "%s/%s_pass-breakdown_%s.csv" % (compiler_options.output_dir, nng.name, arch.system_config)
+ stats_writer.write_pass_metrics_csv(nng, passes_csv_file)
+
+ summary_csv_file = "%s/%s_summary_%s.csv" % (compiler_options.output_dir, nng.name, arch.system_config)
+ stats_writer.write_summary_metrics_csv(nng, summary_csv_file, arch)
+
+ stats_writer.print_performance_metrics(nng, show_cpu_operations=compiler_options.show_cpu_operations, arch=arch)
+
+ if fname.endswith(".tflite"):
+ tflite_writer.write_tflite(nng, "%s/%s_vela.tflite" % (compiler_options.output_dir, nng.name))
+
+ if compiler_options.timing:
+ stop = time.time()
+ print("Compiler driver took %f s" % (stop - start))
+
+ return nng
+
+
+def print_subgraph_io_summary(nng):
+ """Print a summary of all the input and output tensor sizes for all subgraphs.
+ Also displays the total tensor size and the memory used area for sram.
+ """
+
+ print("Subgraph IO Summary")
+ print("-------------------")
+ print("NNG: {0}".format(nng.name))
+ max_sg_size = 0
+ for sg in reversed(nng.subgraphs):
+ print(" Subgraph: {0} = {1}".format(sg.name, sg.placement))
+ sg_size = 0
+
+ if sg.placement == PassPlacement.Npu:
+ for tens in sg.input_tensors + [sg.scratch_tensor] + sg.output_tensors:
+ if tens in sg.input_tensors:
+ tens_dir = "In"
+ elif tens in sg.output_tensors:
+ tens_dir = "Out"
+ else:
+ tens_dir = "In/Out"
+
+ size = tens.elements() * tens.element_size() / 1024.0
+ sg_size = sg_size + size
+ print(" Tensor [{0}]: {1} = {2} KiB".format(tens_dir, tens.name, size))
+
+ print(" Total Size = {0} KiB".format(sg_size))
+ print(" SRAM Memory Used = {0} KiB".format(sg.memory_used.get(MemArea.Sram, 0) / 1024.0))
+ max_sg_size = max(sg_size, max_sg_size)
+
+ print(" Maximum Subgraph Size = {0} KiB".format(max_sg_size))
+
+
+def main(args=None):
+ if args is None:
+ args = sys.argv[1:]
+
+ parser = argparse.ArgumentParser(prog="vela", description="Neural network model compiler for Ethos-U55")
+
+ parser.add_argument(
+ "network", metavar="NETWORK", type=str, default=None, nargs=None, help="Filename of network to process"
+ )
+
+ parser.add_argument("--version", action="version", version=__version__)
+ parser.add_argument(
+ "--output-dir", type=str, default="output", help="Output directory to write files to (default: %(default)s)"
+ )
+ parser.add_argument("--config", type=str, help="Location of vela configuration file")
+ parser.add_argument("--batch-size", type=int, default=1, help="Batch size (default: %(default)s)")
+
+ parser.add_argument("--verbose-graph", action="store_true", help="Verbose graph rewriter")
+ parser.add_argument("--verbose-quantization", action="store_true", help="Verbose quantization")
+ parser.add_argument("--verbose-packing", action="store_true", help="Verbose pass packing")
+ parser.add_argument("--verbose-tensor-purpose", action="store_true", help="Verbose tensor purpose")
+ parser.add_argument("--verbose-tensor-format", action="store_true", help="Verbose tensor format")
+ parser.add_argument("--verbose-schedule", action="store_true", help="Verbose schedule")
+ parser.add_argument(
+ "--verbose-pareto-frontier-schedules",
+ action="store_true",
+ help="Show all schedules along the pareto frontier of optimisation criteria",
+ )
+ parser.add_argument("--verbose-allocation", action="store_true", help="Verbose tensor allocation")
+ parser.add_argument(
+ "--verbose-high-level-command-stream", action="store_true", help="Verbose high level command stream"
+ )
+ parser.add_argument(
+ "--verbose-register-command-stream", action="store_true", help="Verbose register command stream"
+ )
+ parser.add_argument("--verbose-operators", action="store_true", help="Verbose operator list")
+
+ parser.add_argument(
+ "--show-minimum-possible-allocation", action="store_true", help="Show the minimum possible allocation"
+ )
+ parser.add_argument(
+ "--show-cpu-operations", action="store_true", help="Show the operations that fall back to the CPU"
+ )
+ parser.add_argument(
+ "--cascading",
+ type=ast.literal_eval,
+ default=True,
+ choices=[True, False],
+ help="Controls the packing of multiple passes into a cascade (default: %(default)s)",
+ )
+ parser.add_argument(
+ "--ifm-ofm-overlap",
+ type=ast.literal_eval,
+ default=True,
+ choices=[True, False],
+ help="Controls the overlapping of IFM and OFM buffers (default: %(default)s)",
+ )
+ parser.add_argument("--force-block-config", type=str, default="", help="Force a specific block configuration HxWxC")
+ parser.add_argument(
+ "--inter-pass-cycle-delay",
+ type=int,
+ default=0,
+ help="Artificial delay between passes, measured in NPU cycles (default: %(default)s)",
+ )
+ parser.add_argument("--timing", action="store_true", help="Time the compiler doing operations")
+ parser.add_argument(
+ "--accelerator-config",
+ type=str,
+ default="ethos-u55-256",
+ choices=list(architecture_features.ArchitectureFeatures.accelerator_configs.keys()),
+ help="Accelerator configuration to use (default: %(default)s)",
+ )
+ parser.add_argument(
+ "--system-config",
+ type=str,
+ default="internal-default",
+ help="System configuration to use (default: %(default)s)",
+ )
+ parser.add_argument(
+ "--dram-bandwidth",
+ type=float,
+ default=0.0,
+ help="DRAM memory bandwidth in GB/s, use zero to select the value from system config (default: %(default)s)",
+ )
+ parser.add_argument(
+ "--permanent-storage",
+ default=MemArea.OffChipFlash,
+ type=lambda s: MemArea[s],
+ choices=list(MemArea)[3:-1],
+ help=(
+ "Memory area for permanent storage. To store the weights and other constant data in SRAM select "
+ "'OnChipFlash' (default: %(default)s)"
+ ),
+ )
+ parser.add_argument(
+ "--tensor-allocator",
+ default=TensorAllocator.Greedy,
+ type=lambda s: TensorAllocator[s],
+ choices=list(TensorAllocator),
+ help="Tensor Allocator algorithm (default: %(default)s)",
+ )
+ parser.add_argument(
+ "--show-subgraph-io-summary",
+ action="store_true",
+ help="Shows a summary of all the subgraphs and their inputs and outputs",
+ )
+ parser.add_argument(
+ "--ifm-streaming",
+ type=ast.literal_eval,
+ default=True,
+ choices=[True, False],
+ help="Controls scheduler IFM streaming search (default: %(default)s)",
+ )
+ parser.add_argument(
+ "--block-config-limit",
+ type=int,
+ default=16,
+ help="Limit block config search space, use zero for unlimited (default: %(default)s)",
+ )
+ parser.add_argument(
+ "--global-memory-clock-scale",
+ type=float,
+ default=1.0,
+ help=(
+ "Performs an additional scaling of the individual memory clock scales specified by the system config "
+ "(default: %(default)s)"
+ ),
+ )
+ parser.add_argument(
+ "--pareto-metric",
+ default=ParetoMetric.BwCycMem,
+ type=lambda s: ParetoMetric[s],
+ choices=list(ParetoMetric),
+ help="Controls the calculation of the pareto metric (default: %(default)s)",
+ )
+ parser.add_argument(
+ "--recursion-limit",
+ type=int,
+ default=10000,
+ help="Set the recursion depth limit, may result in RecursionError if too low (default: %(default)s)",
+ )
+ parser.add_argument(
+ "--max-block-dependency",
+ type=int,
+ default=architecture_features.ArchitectureFeatures.MAX_BLOCKDEP,
+ choices=range(0, architecture_features.ArchitectureFeatures.MAX_BLOCKDEP + 1),
+ help=(
+ "Set the maximum value that can be used for the block dependency between npu kernel operations "
+ "(default: %(default)s)"
+ ),
+ )
+
+ args = parser.parse_args(args=args)
+
+ # Read configuration file
+ config_file = args.config
+ config = None
+ if config_file is not None:
+ with open(config_file) as f:
+ config = configparser.ConfigParser()
+ config.read_file(f)
+
+ if args.network is None:
+ parser.error("the following argument is required: NETWORK")
+
+ sys.setrecursionlimit(args.recursion_limit)
+
+ if args.force_block_config:
+ force_block_config = architecture_features.Block.from_string(args.force_block_config)
+ else:
+ force_block_config = None
+
+ arch = architecture_features.ArchitectureFeatures(
+ vela_config=config,
+ system_config=args.system_config,
+ accelerator_config=args.accelerator_config,
+ permanent_storage=args.permanent_storage,
+ inter_pass_cycle_delay=args.inter_pass_cycle_delay,
+ dram_bandwidth=args.dram_bandwidth,
+ override_block_config=force_block_config,
+ block_config_limit=args.block_config_limit,
+ global_memory_clock_scale=args.global_memory_clock_scale,
+ max_blockdep=args.max_block_dependency,
+ )
+
+ compiler_options = compiler_driver.CompilerOptions(
+ verbose_graph=args.verbose_graph,
+ verbose_quantization=args.verbose_quantization,
+ verbose_packing=args.verbose_packing,
+ verbose_tensor_purpose=args.verbose_tensor_purpose,
+ verbose_tensor_format=args.verbose_tensor_format,
+ verbose_allocation=args.verbose_allocation,
+ verbose_high_level_command_stream=args.verbose_high_level_command_stream,
+ verbose_register_command_stream=args.verbose_register_command_stream,
+ verbose_operators=args.verbose_operators,
+ show_minimum_possible_allocation=args.show_minimum_possible_allocation,
+ show_cpu_operations=args.show_cpu_operations,
+ tensor_allocator=args.tensor_allocator,
+ timing=args.timing,
+ output_dir=args.output_dir,
+ )
+
+ scheduler_options = scheduler.SchedulerOptions(
+ use_cascading=args.cascading,
+ use_ifm_ofm_overlap=args.ifm_ofm_overlap,
+ verbose_schedule=args.verbose_schedule,
+ verbose_pareto_frontier_schedules=args.verbose_pareto_frontier_schedules,
+ use_ifm_streaming=args.ifm_streaming,
+ pareto_metric=args.pareto_metric,
+ )
+
+ model_reader_options = model_reader.ModelReaderOptions(batch_size=args.batch_size)
+
+ os.makedirs(args.output_dir, exist_ok=True)
+
+ nng = process(args.network, arch, model_reader_options, compiler_options, scheduler_options)
+
+ if args.show_subgraph_io_summary:
+ print_subgraph_io_summary(nng)
+
+ return 0
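+
+
+# Example invocations (illustrative only; the network file name is hypothetical):
+#
+#   vela network.tflite
+#   vela network.tflite --accelerator-config ethos-u55-256 --output-dir ./out
+#
+# Both write the optimised .tflite and the CSV reports into the output directory.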
diff --git a/ethosu/vela/weight_compressor.py b/ethosu/vela/weight_compressor.py
new file mode 100644
index 00000000..0b4ac696
--- /dev/null
+++ b/ethosu/vela/weight_compressor.py
@@ -0,0 +1,387 @@
+# Copyright (C) 2020 Arm Limited or its affiliates. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the License); you may
+# not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an AS IS BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+# Description:
+# Compresses and pads the weights. It also calculates the scales and packs them with the biases.
+
+import os
+import sys
+import enum
+import math
+import numpy as np
+from collections import namedtuple
+from .numeric_util import round_up
+from .scaling import quantise_scale
+from .tensor import TensorPurpose, TensorSubPurpose, TensorFormat, TensorBlockTraversal
+from .operation import NpuBlockType
+from .architecture_features import Block
+from .nn_graph import SchedulingStrategy
+from .data_type import DataType
+
+from ethosu import mlw_codec
+
+
+def encode(weight_stream):
+ assert np.amin(weight_stream) >= -255
+ assert np.amax(weight_stream) <= 255
+
+ # Encode flattened signed weight stream
+ compressed = mlw_codec.encode(weight_stream)
+
+ # pad with 0xFF as needed so the length of the weight stream
+ # is a multiple of 16
+
+ while (len(compressed) % 16) != 0:
+ compressed.append(0xFF)
+
+ return compressed
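+
+
+# Illustrative sketch of the padding above: if the codec returns, say, 21 bytes
+# for a stream, 11 bytes of 0xFF are appended so that the length becomes 32,
+# the next multiple of 16.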
+
+
+def generate_brick(arch, brick_weights, ofm_block, block_traversal, ifm_bitdepth):
+ is_depthwise = block_traversal == TensorBlockTraversal.DepthWise
+ is_partkernel = block_traversal == TensorBlockTraversal.PartKernelFirst
+ subkernel_max = arch.subkernel_max
+ ofm_ublock = arch.ofm_ublock
+ ifm_ublock = arch.ifm_ublock
+ # Expect weights formatted HWIO
+ ofm_depth = brick_weights.shape[-1]
+ ifm_depth = brick_weights.shape[-2]
+ kernel_width = brick_weights.shape[-3]
+ kernel_height = brick_weights.shape[-4]
+ # IFM block depth
+ if is_partkernel or (ifm_bitdepth == 16):
+ # IFM block depth is always 16 for part-kernel-first
+ ifm_block_depth = 16
+ elif ifm_bitdepth == 8:
+ ifm_block_depth = 32
+ else:
+ assert False
+
+ stream = []
+
+ # Top level striping - OFM blocks in the entire brick's depth
+ for ofm_block_z in range(0, ofm_depth, ofm_block.depth):
+ clipped_ofm_block_depth = min(ofm_block.depth, ofm_depth - ofm_block_z)
+ # IFM blocks required for the brick
+ for ifm_block_z in range(0, (1 if is_depthwise else ifm_depth), ifm_block_depth):
+ if is_depthwise:
+ clipped_ifm_block_depth = ifm_ublock.depth
+ else:
+ clipped_ifm_block_depth = (
+ min(ifm_block_depth, ifm_depth - ifm_block_z) if is_partkernel else ifm_block_depth
+ )
+ # Weight decomposition
+ # Subkernel Splitting (H)
+ for subkernel_y in range(0, kernel_height, subkernel_max.height):
+ sub_height = min(kernel_height - subkernel_y, subkernel_max.height)
+ # Subkernel splitting (W)
+ for subkernel_x in range(0, kernel_width, subkernel_max.width):
+ sub_width = min(kernel_width - subkernel_x, subkernel_max.width)
+ subkernel_elements = sub_width * sub_height
+ # Part kernel first works across the kernel H/W and needs padding
+ if is_partkernel:
+ if ifm_bitdepth == 16 and subkernel_elements % 2 != 0:
+ subkernel_elements = int(math.ceil(subkernel_elements / 2) * 2)
+ elif ifm_bitdepth == 8 and subkernel_elements % 4 != 0:
+ subkernel_elements = int(math.ceil(subkernel_elements / 4) * 4)
+
+ # Depthwise Conv requires a multiple of 4 kernel elements in its weight block;
+ # this is different from normal conv, which is considered "weights depth-first"
+ elif is_depthwise:
+ subkernel_elements = int(math.ceil(subkernel_elements / 4.0) * 4)
+
+ ifm_block_depth_outer = clipped_ifm_block_depth if is_partkernel else 1
+ ifm_block_depth_inner = 1 if is_partkernel else clipped_ifm_block_depth
+ # IFM Ublocks in IFM-block over depth for part-kernel-first mode
+ # For depth-first IFM Ublocks are traversed after subkernel elements so this loop is ignored.
+ for ifm_ublk_outer in range(0, ifm_block_depth_outer, ifm_ublock.depth):
+ # OFM Ublocks in OFM-block over depth
+ for ofm_ublk in range(0, clipped_ofm_block_depth, ofm_ublock.depth):
+ # HW Kernel element traversal - cannot be a H/W loop due to element
+ # padding requirement on depthwise/part-kernel configurations
+ for element in range(subkernel_elements):
+ kx = element % sub_width
+ ky = element // sub_width
+ # IFM Ublocks in IFM-block over depth (only 1 ublock if depthwise)
+ # In case of part-kernel-first IFM Ublock traversal have already been handled
+ # and this loop is ignored.
+ for ifm_ublk_inner in range(0, ifm_block_depth_inner, ifm_ublock.depth):
+ # Feed OFM ublock elements
+ for ofm_ublock_z in range(ofm_ublock.depth):
+ # Source IFM ublock elements (only 1 element deep if depthwise)
+ for ifm_ublock_z in range(1 if is_depthwise else ifm_ublock.depth):
+ # Source position within the current subkernel
+ wx = subkernel_x + kx
+ wy = subkernel_y + ky
+ # Source IFM/OFM slices
+ ifm_ublk = ifm_ublk_inner + ifm_ublk_outer
+ ifm_z = ifm_block_z + ifm_ublk + ifm_ublock_z
+ ofm_z = ofm_block_z + ofm_ublk + ofm_ublock_z
+ if (ifm_z >= ifm_depth) or (ofm_z >= ofm_depth) or (ky >= sub_height):
+ stream.append(0)
+ else:
+ stream.append(brick_weights[wy][wx][ifm_z][ofm_z])
+ return stream
+
+
+# Compress the weights
+def compress_weights(tens, arch, npu_block_type, ofm_block, ofm_depth_step, min_val=None, max_val=None):
+ assert tens.purpose == TensorPurpose.Weights
+ assert tens.format == TensorFormat.WeightsCompressed
+
+ WeightCompressionConfig = namedtuple("WeightCompressionConfig", ["npu_block_type", "ofm_block", "ofm_depth_step"])
+
+ # check if weights have already been compressed
+ wcc = tens.weight_compression_config
+ if wcc is not None:
+ assert wcc.npu_block_type == npu_block_type, "Weights not used by the same operator type"
+
+ if wcc.ofm_block == ofm_block and wcc.ofm_depth_step == ofm_depth_step:
+ return
+
+ assert tens.quantization is not None
+ assert tens.quantization.scale_f32 is not None
+ assert tens.quantization.zero_point is not None
+
+ zero_point = tens.quantization.zero_point
+ quant_buf = tens.quant_values.astype(np.int64)
+
+ # Early zero-point correction
+ weights = quant_buf - zero_point
+
+ if len(weights.shape) == 2:
+ weights = np.expand_dims(np.expand_dims(weights, axis=0), axis=0)
+ weights_shape = (weights.shape[0], 1, 1, weights.shape[1])
+ else:
+ weights_shape = weights.shape
+
+ compression_scales = []
+ compressed_offsets = []
+ encoded_streams = []
+ offset = 0
+ max_single_buffer_len = 0
+
+ ifm_bitdepth = tens.consumer_list[0].inputs[0].dtype.size_in_bits()
+ ifm_depth = weights.shape[-2]
+ if npu_block_type == NpuBlockType.ConvolutionDepthWise:
+ tens.block_traversal = TensorBlockTraversal.DepthWise
+ if npu_block_type == NpuBlockType.ConvolutionMxN:
+ # Determine which block traversal strategy has better DPU utilization
+ kernel_size = weights_shape[0] * weights_shape[1]
+ depth_utilization = weights_shape[2] / round_up(weights_shape[2], 32 if ifm_bitdepth == 8 else 16)
+ part_kernel_utilization = (weights_shape[2] / round_up(weights_shape[2], 8)) * (
+ kernel_size / round_up(kernel_size, 4 if ifm_bitdepth == 8 else 2)
+ )
+ if part_kernel_utilization >= depth_utilization or ifm_depth <= 8:
+ # Part-kernel first is always better for ifm depths <= 8
+ tens.block_traversal = TensorBlockTraversal.PartKernelFirst
+ else:
+ tens.block_traversal = TensorBlockTraversal.DepthFirst
+
+ # Slice weight stream up depth-ways into bricks and compress
+ full_ofm_depth = quant_buf.shape[-1]
+ for idx in range(0, full_ofm_depth, ofm_depth_step):
+ # Get the weights necessary for this brick
+ count = min(full_ofm_depth - idx, ofm_depth_step)
+ brick_weights = weights[:, :, :, idx : idx + count]
+
+ # Encode all weights into one chunk
+ raw_stream = generate_brick(arch, brick_weights, ofm_block, tens.block_traversal, ifm_bitdepth)
+ encoded = encode(raw_stream)
+ encoded_streams.append(encoded)
+
+ # Remember maximum encoded length for DoubleBuffering
+ if max_single_buffer_len < len(encoded):
+ max_single_buffer_len = len(encoded)
+
+ # Remember where we put it for linear addressing
+ compressed_offsets.append(offset)
+ offset += len(encoded)
+ assert offset % 16 == 0
+
+ # Compression scale tracking
+ compression_scales.append(len(encoded) / len(raw_stream))
+
+ # Also track complete length in the offsets array
+ compressed_offsets.append(offset)
+
+ if tens.sub_purpose == TensorSubPurpose.DoubleBuffer and len(encoded_streams) > 2:
+ offset = 2 * max_single_buffer_len
+ assert offset % 16 == 0
+
+ tens.storage_shape = [1, 1, 1, offset]
+ tens.weight_compression_scales = compression_scales
+ tens.weight_compression_config = WeightCompressionConfig(npu_block_type, ofm_block, ofm_depth_step)
+ tens.weight_compressed_offsets = compressed_offsets
+ tens.compression_scale_for_worst_weight_stream = np.amax(compression_scales)
+ tens.storage_compression_scale = tens.bandwidth_compression_scale = np.average(compression_scales)
+ tens.compressed_values = encoded_streams
+ tens.brick_size = (weights_shape[0], weights_shape[1], weights_shape[2], min(tens.shape[-1], ofm_depth_step))
+
+
+def calc_scales_and_pack_biases(tens, arch, oc_quantum, rescale_for_faf=False):
+ assert tens.purpose == TensorPurpose.FeatureMap
+ assert tens.format == TensorFormat.NHWC
+ # the connected operator should expect a bias input unless it is a FullyConnected
+ assert "Bias" in tens.consumer_list[0].type or tens.consumer_list[0].type.startswith("FullyConnected")
+ # the input bias tensor is the same as that connected to the operator
+ assert tens is tens.consumer_list[0].inputs[2]
+ # the operator should only have a single output
+ assert len(tens.consumer_list[0].outputs) == 1
+
+ def pack_bias_and_scale(bias, scale, shift):
+ bias = np.int64(bias)
+ assert -(1 << (40 - 1)) <= bias < (1 << (40 - 1)) # signed 40-bit range
+ assert 0 <= scale < (1 << 32) # unsigned 32-bit range
+ assert 0 <= shift < (1 << 6) # unsigned 6-bit range
+
+ # pack the 80 bit value = [0(2-bits),shift(6-bits),scale(32-bits),bias(40-bits)]
+ data = bytearray(10)
+ data[0] = (bias >> (0 * 8)) & 0xFF
+ data[1] = (bias >> (1 * 8)) & 0xFF
+ data[2] = (bias >> (2 * 8)) & 0xFF
+ data[3] = (bias >> (3 * 8)) & 0xFF
+ data[4] = (bias >> (4 * 8)) & 0xFF
+ data[5] = (scale >> (0 * 8)) & 0xFF
+ data[6] = (scale >> (1 * 8)) & 0xFF
+ data[7] = (scale >> (2 * 8)) & 0xFF
+ data[8] = (scale >> (3 * 8)) & 0xFF
+ data[9] = shift & 0x3F
+ return data
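+ # Worked example (sketch only): pack_bias_and_scale(-1, 1 << 31, 36) yields the
+ # ten bytes ff ff ff ff ff 00 00 00 80 24, i.e. the 40-bit bias in the low five
+ # bytes, the 32-bit scale in the next four and the 6-bit shift in the last byte.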
+
+ biases = tens.quant_values
+
+ first_consumer_op = tens.consumer_list[0]
+ ifm_dtype = first_consumer_op.inputs[0].dtype
+ ifm_scale = first_consumer_op.inputs[0].quantization.scale_f32
+ ofm_scale = first_consumer_op.outputs[0].quantization.scale_f32
+ weight_scales = first_consumer_op.inputs[1].quantization.scale_f32
+
+ # biases can have multiple consumers for rnn cells. if so, then check that they are all the same
+ for op in tens.consumer_list[1:]:
+ assert ifm_scale == op.inputs[0].quantization.scale_f32
+ assert ofm_scale == op.outputs[0].quantization.scale_f32
+ assert weight_scales == op.inputs[1].quantization.scale_f32
+
+ if not hasattr(weight_scales, "__iter__"):
+ # If weight_scales is not already an iterable make it into a list
+ weight_scales = [weight_scales]
+
+ # Convert scales to np.double (from np.float32) to conform to TensorFlow Lite which
+ # uses double during scaling calculations
+ # TensorFlow Lite casts the scales slightly differently for uint8 and int8
+ if not rescale_for_faf:
+ if ifm_dtype == DataType.uint8:
+ scales = [np.double(ifm_scale * weight_scale) / np.double(ofm_scale) for weight_scale in weight_scales]
+ elif ifm_dtype == DataType.int8:
+ scales = [
+ (np.double(ifm_scale) * np.double(weight_scale)) / np.double(ofm_scale)
+ for weight_scale in weight_scales
+ ]
+ else:
+ assert False, str(ifm_dtype) + " not implemented"
+ else:
+ if ifm_dtype == DataType.uint8:
+ scales = [np.double(ifm_scale * weight_scale * 0x3000) for weight_scale in weight_scales]
+ elif ifm_dtype == DataType.int8:
+ scales = [(np.double(ifm_scale * 0x3000) * np.double(weight_scale)) for weight_scale in weight_scales]
+ else:
+ assert False, str(ifm_dtype) + " not implemented"
+
+ # quantise all of the weight scales into (scale_factor, shift)
+ quantised_scales = [quantise_scale(scale) for scale in scales]
+
+ for _, shift in quantised_scales:
+ assert shift >= 16
+
+ # pack the biases and scales
+ tens.compressed_values = []
+ if len(quantised_scales) == 1:
+ # If only 1 quantised scale is used, repeat that value for the length of the biases
+ quantised_scales = [quantised_scales[0]] * len(biases)
+
+ assert len(quantised_scales) == len(biases)
+ for i, bias in enumerate(biases):
+ tens.compressed_values.append(pack_bias_and_scale(bias, *quantised_scales[i]))
+
+ tens.element_size_bytes = 10
+
+ # Figure out if we need padded storage (extra whole elements)
+ padding = (len(tens.compressed_values) * tens.element_size_bytes) % 16
+ if padding != 0:
+ padding = 16 - padding
+
+ # This adds enough padding to allow over-reads
+ while padding > 0:
+ tens.compressed_values.append(pack_bias_and_scale(0, 0, 0))
+ padding = padding - tens.element_size_bytes
+
+ tens.storage_shape = [len(tens.compressed_values)]
+
+
+def update_pass_weight_and_scale_tensors(nng, arch):
+ def find_npu_usage_of_tensor(tens):
+ # TODO: This function is identical to the one in mark_tensors.py. A common version should be used.
+ for op in tens.consumers():
+ if op.type == "DMA":
+ return find_npu_usage_of_tensor(op.outputs[0])
+ if "npu_block_type" in op.attrs:
+ return op.attrs["npu_block_type"]
+ return NpuBlockType.Default
+
+ for sg in nng.subgraphs:
+ for ps in sg.passes:
+ if ps.weight_tensor is not None:
+ npu_usage_of_tensor = find_npu_usage_of_tensor(ps.weight_tensor)
+ if npu_usage_of_tensor == NpuBlockType.ConvolutionDepthWise:
+ ps.weight_tensor.quant_values = np.transpose(ps.weight_tensor.quant_values, (0, 1, 3, 2))
+ ps.weight_tensor.shape = ps.weight_tensor.storage_shape = ps.weight_tensor.bandwidth_shape = list(
+ ps.weight_tensor.quant_values.shape
+ )
+ ps.weight_tensor.weight_transpose_depthwise = True
+
+ needs_dma = len(ps.weight_tensor.ops) == 1 and ps.weight_tensor.ops[0].type == "DMA"
+ if ps.cascade.strategy == SchedulingStrategy.WeightStream and needs_dma:
+ ofm_depth_step = ps.block_config[-1]
+ else:
+ ofm_depth_step = ps.weight_tensor.shape[-1]
+
+ compress_weights(
+ ps.weight_tensor,
+ arch,
+ npu_usage_of_tensor,
+ Block(ps.block_config[-3], ps.block_config[-4], ps.block_config[-1]),
+ ofm_depth_step,
+ )
+ # Update source tensor
+ if len(ps.weight_tensor.ops) == 1 and ps.weight_tensor.ops[0].type == "DMA":
+ src_tens = ps.weight_tensor.ops[0].inputs[0]
+ src_tens.shape = ps.weight_tensor.shape
+ src_tens.weight_transpose_depthwise = ps.weight_tensor.weight_transpose_depthwise
+ src_tens.quant_values = ps.weight_tensor.quant_values
+ src_tens.compressed_values = ps.weight_tensor.compressed_values
+ src_tens.storage_shape = [1, 1, 1, ps.weight_tensor.weight_compressed_offsets[-1]]
+ src_tens.brick_size = ps.weight_tensor.brick_size
+ src_tens.weight_compression_scales = ps.weight_tensor.weight_compression_scales
+ src_tens.weight_compressed_offsets = ps.weight_tensor.weight_compressed_offsets
+
+ if ps.scale_tensor is not None:
+ rescale_for_faf = False
+ activation_ops = set(("Sigmoid", "Tanh"))
+ if (ps.ops[-1].type in activation_ops) and (ps.npu_block_type != NpuBlockType.ElementWise):
+ rescale_for_faf = True
+ calc_scales_and_pack_biases(ps.scale_tensor, arch, ps.block_config[3], rescale_for_faf)
diff --git a/setup.py b/setup.py
new file mode 100644
index 00000000..1a1ae845
--- /dev/null
+++ b/setup.py
@@ -0,0 +1,63 @@
+# Copyright (C) 2020 Arm Limited or its affiliates. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the License); you may
+# not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an AS IS BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+# Description:
+# Packaging for the Vela compiler
+
+from os import path
+from setuptools import setup, find_namespace_packages, Extension
+
+# Read the contents of README.md file
+this_directory = path.abspath(path.dirname(__file__))
+with open(path.join(this_directory, "README.md"), encoding="utf-8") as f:
+ long_description = f.read()
+
+mlw_module = Extension(
+ "ethosu.mlw_codec",
+ ["ethosu/mlw_codec/mlw_encode.c", "ethosu/mlw_codec/mlw_decode.c", "ethosu/mlw_codec/mlw_codecmodule.c"],
+)
+
+setup(
+ name="ethos-u-vela",
+ use_scm_version=True,
+ description="Optimise TensorFlow Lite models for Ethos-U55 NPU.",
+ long_description=long_description,
+ long_description_content_type="text/markdown",
+ url="https://git.mlplatform.org/ml/ethos-u/ethos-u-vela.git/",
+ author="Arm Ltd.",
+ license="Apache License 2.0",
+ classifiers=[
+ "Development Status :: 4 - Beta",
+ "Intended Audience :: Developers",
+ "License :: OSI Approved :: Apache Software License",
+ "Operating System :: POSIX :: Linux",
+ "Programming Language :: C",
+ "Programming Language :: Python :: 3",
+ "Programming Language :: Python :: 3.6",
+ "Programming Language :: Python :: 3.7",
+ "Programming Language :: Python :: 3.8",
+ "Topic :: Scientific/Engineering :: Artificial Intelligence",
+ "Topic :: Software Development :: Compilers",
+ ],
+ keywords=["ethos-u", "vela compiler", "tflite", "npu"],
+ packages=find_namespace_packages(include=["ethosu.*"]),
+ python_requires="~=3.6", # We support only 3.6+
+ install_requires=["flatbuffers==1.11.0", "numpy>=1.16.6"],
+ entry_points={"console_scripts": ["vela = ethosu.vela.vela:main"]},
+ ext_modules=[mlw_module],
+ setup_requires=["setuptools_scm"],
+)
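+
+# Typical local install (illustrative only; building the mlw_codec extension
+# requires a C toolchain, and setuptools_scm derives the version from a git checkout):
+#
+#   pip install .
+#
+# This also exposes the "vela" console script declared in entry_points above.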