From 79d07d2cbf1c5013ab40bb46a6ccd4c569966536 Mon Sep 17 00:00:00 2001 From: Tim Hall Date: Mon, 27 Apr 2020 18:20:16 +0100 Subject: Add Vela codebase - Added modules ethosu.vela and ethosu.mlw_codec. - Added README and various configuration files. Change-Id: I3690f8c8f5966306ecddaeb2793c30ca9c6e2eee --- Pipfile | 9 + Pipfile.lock | 56 + README.md | 112 + ethosu/mlw_codec/makefile | 49 + ethosu/mlw_codec/mlw_codecmodule.c | 174 ++ ethosu/mlw_codec/mlw_common.h | 29 + ethosu/mlw_codec/mlw_decode.c | 300 ++ ethosu/mlw_codec/mlw_decode.h | 42 + ethosu/mlw_codec/mlw_encode.c | 874 ++++++ ethosu/mlw_codec/mlw_encode.h | 45 + ethosu/mlw_codec/mlw_main.c | 177 ++ ethosu/mlw_codec/test_mlw_codec.py | 43 + ethosu/vela/__init__.py | 20 + ethosu/vela/__main__.py | 22 + ethosu/vela/_version.py | 19 + ethosu/vela/architecture_features.py | 618 ++++ ethosu/vela/compiler_driver.py | 204 ++ ethosu/vela/data_type.py | 116 + ethosu/vela/driver_actions.py | 107 + ethosu/vela/ethos_u55_regs/ethos_u55_regs.py | 3138 ++++++++++++++++++++ ethosu/vela/extract_npu_subgraphs.py | 253 ++ ethosu/vela/graph_optimiser.py | 485 +++ ethosu/vela/greedy_allocation.py | 95 + ethosu/vela/high_level_command_stream.py | 365 +++ ethosu/vela/high_level_command_stream_generator.py | 315 ++ ethosu/vela/insert_dma.py | 60 + ethosu/vela/live_range.py | 324 ++ ethosu/vela/mark_tensors.py | 363 +++ ethosu/vela/model_reader.py | 45 + ethosu/vela/nn_graph.py | 548 ++++ ethosu/vela/npu_performance.py | 516 ++++ ethosu/vela/npu_serialisation.py | 145 + ethosu/vela/numeric_util.py | 89 + ethosu/vela/operation.py | 285 ++ ethosu/vela/pass_packing.py | 489 +++ ethosu/vela/range_set.py | 154 + ethosu/vela/register_command_stream_generator.py | 945 ++++++ ethosu/vela/rewrite_graph.py | 171 ++ ethosu/vela/scaling.py | 91 + ethosu/vela/scheduler.py | 949 ++++++ ethosu/vela/shared_buffer_allocation.py | 199 ++ ethosu/vela/stats_writer.py | 367 +++ ethosu/vela/supported_operators.py | 243 ++ ethosu/vela/tensor.py | 629 ++++ ethosu/vela/tensor_allocation.py | 139 + ethosu/vela/tflite/AbsOptions.py | 22 + ethosu/vela/tflite/ActivationFunctionType.py | 11 + ethosu/vela/tflite/AddNOptions.py | 22 + ethosu/vela/tflite/AddOptions.py | 30 + ethosu/vela/tflite/ArgMaxOptions.py | 30 + ethosu/vela/tflite/ArgMinOptions.py | 30 + ethosu/vela/tflite/BatchToSpaceNDOptions.py | 22 + .../tflite/BidirectionalSequenceLSTMOptions.py | 62 + .../vela/tflite/BidirectionalSequenceRNNOptions.py | 46 + ethosu/vela/tflite/Buffer.py | 46 + ethosu/vela/tflite/BuiltinOperator.py | 131 + ethosu/vela/tflite/BuiltinOptions.py | 106 + ethosu/vela/tflite/CallOptions.py | 30 + ethosu/vela/tflite/CastOptions.py | 38 + ethosu/vela/tflite/CombinerType.py | 8 + ethosu/vela/tflite/ConcatEmbeddingsOptions.py | 78 + ethosu/vela/tflite/ConcatenationOptions.py | 38 + ethosu/vela/tflite/Conv2DOptions.py | 70 + ethosu/vela/tflite/CosOptions.py | 22 + ethosu/vela/tflite/CustomOptionsFormat.py | 6 + ethosu/vela/tflite/CustomQuantization.py | 46 + ethosu/vela/tflite/DensifyOptions.py | 22 + ethosu/vela/tflite/DepthToSpaceOptions.py | 30 + ethosu/vela/tflite/DepthwiseConv2DOptions.py | 78 + ethosu/vela/tflite/DequantizeOptions.py | 22 + ethosu/vela/tflite/DimensionMetadata.py | 76 + ethosu/vela/tflite/DimensionType.py | 7 + ethosu/vela/tflite/DivOptions.py | 30 + ethosu/vela/tflite/EmbeddingLookupSparseOptions.py | 30 + ethosu/vela/tflite/EqualOptions.py | 22 + ethosu/vela/tflite/ExpOptions.py | 22 + ethosu/vela/tflite/ExpandDimsOptions.py | 22 + ethosu/vela/tflite/FakeQuantOptions.py | 54 + 
ethosu/vela/tflite/FillOptions.py | 22 + ethosu/vela/tflite/FloorDivOptions.py | 22 + ethosu/vela/tflite/FloorModOptions.py | 22 + ethosu/vela/tflite/FullyConnectedOptions.py | 46 + .../tflite/FullyConnectedOptionsWeightsFormat.py | 7 + ethosu/vela/tflite/GatherNdOptions.py | 22 + ethosu/vela/tflite/GatherOptions.py | 30 + ethosu/vela/tflite/GreaterEqualOptions.py | 22 + ethosu/vela/tflite/GreaterOptions.py | 22 + ethosu/vela/tflite/HardSwishOptions.py | 22 + ethosu/vela/tflite/IfOptions.py | 38 + ethosu/vela/tflite/Int32Vector.py | 46 + ethosu/vela/tflite/L2NormOptions.py | 30 + ethosu/vela/tflite/LSHProjectionOptions.py | 30 + ethosu/vela/tflite/LSHProjectionType.py | 8 + ethosu/vela/tflite/LSTMKernelType.py | 7 + ethosu/vela/tflite/LSTMOptions.py | 54 + ethosu/vela/tflite/LeakyReluOptions.py | 30 + ethosu/vela/tflite/LessEqualOptions.py | 22 + ethosu/vela/tflite/LessOptions.py | 22 + .../tflite/LocalResponseNormalizationOptions.py | 54 + ethosu/vela/tflite/LogSoftmaxOptions.py | 22 + ethosu/vela/tflite/LogicalAndOptions.py | 22 + ethosu/vela/tflite/LogicalNotOptions.py | 22 + ethosu/vela/tflite/LogicalOrOptions.py | 22 + ethosu/vela/tflite/MatrixDiagOptions.py | 22 + ethosu/vela/tflite/MatrixSetDiagOptions.py | 22 + ethosu/vela/tflite/MaximumMinimumOptions.py | 22 + ethosu/vela/tflite/Metadata.py | 38 + ethosu/vela/tflite/MirrorPadMode.py | 7 + ethosu/vela/tflite/MirrorPadOptions.py | 30 + ethosu/vela/tflite/Model.py | 150 + ethosu/vela/tflite/MulOptions.py | 30 + ethosu/vela/tflite/NegOptions.py | 22 + ethosu/vela/tflite/NonMaxSuppressionV4Options.py | 22 + ethosu/vela/tflite/NonMaxSuppressionV5Options.py | 22 + ethosu/vela/tflite/NotEqualOptions.py | 22 + ethosu/vela/tflite/OneHotOptions.py | 30 + ethosu/vela/tflite/Operator.py | 177 ++ ethosu/vela/tflite/OperatorCode.py | 46 + ethosu/vela/tflite/PackOptions.py | 38 + ethosu/vela/tflite/PadOptions.py | 22 + ethosu/vela/tflite/PadV2Options.py | 22 + ethosu/vela/tflite/Padding.py | 7 + ethosu/vela/tflite/Pool2DOptions.py | 70 + ethosu/vela/tflite/PowOptions.py | 22 + ethosu/vela/tflite/QuantizationDetails.py | 7 + ethosu/vela/tflite/QuantizationParameters.py | 145 + ethosu/vela/tflite/QuantizeOptions.py | 22 + ethosu/vela/tflite/RNNOptions.py | 30 + ethosu/vela/tflite/RangeOptions.py | 22 + ethosu/vela/tflite/RankOptions.py | 22 + ethosu/vela/tflite/ReducerOptions.py | 30 + ethosu/vela/tflite/ReshapeOptions.py | 46 + ethosu/vela/tflite/ResizeBilinearOptions.py | 38 + ethosu/vela/tflite/ResizeNearestNeighborOptions.py | 30 + ethosu/vela/tflite/ReverseSequenceOptions.py | 38 + ethosu/vela/tflite/ReverseV2Options.py | 22 + ethosu/vela/tflite/SVDFOptions.py | 38 + ethosu/vela/tflite/ScatterNdOptions.py | 22 + ethosu/vela/tflite/SegmentSumOptions.py | 22 + ethosu/vela/tflite/SelectOptions.py | 22 + ethosu/vela/tflite/SelectV2Options.py | 22 + ethosu/vela/tflite/SequenceRNNOptions.py | 38 + ethosu/vela/tflite/ShapeOptions.py | 30 + ethosu/vela/tflite/SkipGramOptions.py | 46 + ethosu/vela/tflite/SliceOptions.py | 22 + ethosu/vela/tflite/SoftmaxOptions.py | 30 + ethosu/vela/tflite/SpaceToBatchNDOptions.py | 22 + ethosu/vela/tflite/SpaceToDepthOptions.py | 30 + ethosu/vela/tflite/SparseIndexVector.py | 9 + ethosu/vela/tflite/SparseToDenseOptions.py | 30 + ethosu/vela/tflite/SparsityParameters.py | 92 + ethosu/vela/tflite/SplitOptions.py | 30 + ethosu/vela/tflite/SplitVOptions.py | 30 + ethosu/vela/tflite/SquareOptions.py | 22 + ethosu/vela/tflite/SquaredDifferenceOptions.py | 22 + ethosu/vela/tflite/SqueezeOptions.py | 46 + 
ethosu/vela/tflite/StridedSliceOptions.py | 62 + ethosu/vela/tflite/SubGraph.py | 122 + ethosu/vela/tflite/SubOptions.py | 30 + ethosu/vela/tflite/Tensor.py | 126 + ethosu/vela/tflite/TensorType.py | 15 + ethosu/vela/tflite/TileOptions.py | 22 + ethosu/vela/tflite/TopKV2Options.py | 22 + ethosu/vela/tflite/TransposeConvOptions.py | 46 + ethosu/vela/tflite/TransposeOptions.py | 22 + ethosu/vela/tflite/Uint16Vector.py | 46 + ethosu/vela/tflite/Uint8Vector.py | 46 + .../tflite/UnidirectionalSequenceLSTMOptions.py | 54 + ethosu/vela/tflite/UniqueOptions.py | 30 + ethosu/vela/tflite/UnpackOptions.py | 38 + ethosu/vela/tflite/WhereOptions.py | 22 + ethosu/vela/tflite/WhileOptions.py | 38 + ethosu/vela/tflite/ZerosLikeOptions.py | 22 + ethosu/vela/tflite/__init__.py | 0 ethosu/vela/tflite_mapping.py | 644 ++++ ethosu/vela/tflite_reader.py | 252 ++ ethosu/vela/tflite_writer.py | 424 +++ ethosu/vela/vela.py | 334 +++ ethosu/vela/weight_compressor.py | 387 +++ setup.py | 63 + 180 files changed, 21180 insertions(+) create mode 100644 Pipfile create mode 100644 Pipfile.lock create mode 100644 README.md create mode 100644 ethosu/mlw_codec/makefile create mode 100644 ethosu/mlw_codec/mlw_codecmodule.c create mode 100644 ethosu/mlw_codec/mlw_common.h create mode 100644 ethosu/mlw_codec/mlw_decode.c create mode 100644 ethosu/mlw_codec/mlw_decode.h create mode 100644 ethosu/mlw_codec/mlw_encode.c create mode 100644 ethosu/mlw_codec/mlw_encode.h create mode 100644 ethosu/mlw_codec/mlw_main.c create mode 100644 ethosu/mlw_codec/test_mlw_codec.py create mode 100644 ethosu/vela/__init__.py create mode 100644 ethosu/vela/__main__.py create mode 100644 ethosu/vela/_version.py create mode 100644 ethosu/vela/architecture_features.py create mode 100644 ethosu/vela/compiler_driver.py create mode 100644 ethosu/vela/data_type.py create mode 100644 ethosu/vela/driver_actions.py create mode 100644 ethosu/vela/ethos_u55_regs/ethos_u55_regs.py create mode 100644 ethosu/vela/extract_npu_subgraphs.py create mode 100644 ethosu/vela/graph_optimiser.py create mode 100644 ethosu/vela/greedy_allocation.py create mode 100644 ethosu/vela/high_level_command_stream.py create mode 100644 ethosu/vela/high_level_command_stream_generator.py create mode 100644 ethosu/vela/insert_dma.py create mode 100644 ethosu/vela/live_range.py create mode 100644 ethosu/vela/mark_tensors.py create mode 100644 ethosu/vela/model_reader.py create mode 100644 ethosu/vela/nn_graph.py create mode 100644 ethosu/vela/npu_performance.py create mode 100644 ethosu/vela/npu_serialisation.py create mode 100644 ethosu/vela/numeric_util.py create mode 100644 ethosu/vela/operation.py create mode 100644 ethosu/vela/pass_packing.py create mode 100644 ethosu/vela/range_set.py create mode 100644 ethosu/vela/register_command_stream_generator.py create mode 100644 ethosu/vela/rewrite_graph.py create mode 100644 ethosu/vela/scaling.py create mode 100644 ethosu/vela/scheduler.py create mode 100644 ethosu/vela/shared_buffer_allocation.py create mode 100644 ethosu/vela/stats_writer.py create mode 100644 ethosu/vela/supported_operators.py create mode 100644 ethosu/vela/tensor.py create mode 100644 ethosu/vela/tensor_allocation.py create mode 100644 ethosu/vela/tflite/AbsOptions.py create mode 100644 ethosu/vela/tflite/ActivationFunctionType.py create mode 100644 ethosu/vela/tflite/AddNOptions.py create mode 100644 ethosu/vela/tflite/AddOptions.py create mode 100644 ethosu/vela/tflite/ArgMaxOptions.py create mode 100644 ethosu/vela/tflite/ArgMinOptions.py create mode 100644 
ethosu/vela/tflite/BatchToSpaceNDOptions.py create mode 100644 ethosu/vela/tflite/BidirectionalSequenceLSTMOptions.py create mode 100644 ethosu/vela/tflite/BidirectionalSequenceRNNOptions.py create mode 100644 ethosu/vela/tflite/Buffer.py create mode 100644 ethosu/vela/tflite/BuiltinOperator.py create mode 100644 ethosu/vela/tflite/BuiltinOptions.py create mode 100644 ethosu/vela/tflite/CallOptions.py create mode 100644 ethosu/vela/tflite/CastOptions.py create mode 100644 ethosu/vela/tflite/CombinerType.py create mode 100644 ethosu/vela/tflite/ConcatEmbeddingsOptions.py create mode 100644 ethosu/vela/tflite/ConcatenationOptions.py create mode 100644 ethosu/vela/tflite/Conv2DOptions.py create mode 100644 ethosu/vela/tflite/CosOptions.py create mode 100644 ethosu/vela/tflite/CustomOptionsFormat.py create mode 100644 ethosu/vela/tflite/CustomQuantization.py create mode 100644 ethosu/vela/tflite/DensifyOptions.py create mode 100644 ethosu/vela/tflite/DepthToSpaceOptions.py create mode 100644 ethosu/vela/tflite/DepthwiseConv2DOptions.py create mode 100644 ethosu/vela/tflite/DequantizeOptions.py create mode 100644 ethosu/vela/tflite/DimensionMetadata.py create mode 100644 ethosu/vela/tflite/DimensionType.py create mode 100644 ethosu/vela/tflite/DivOptions.py create mode 100644 ethosu/vela/tflite/EmbeddingLookupSparseOptions.py create mode 100644 ethosu/vela/tflite/EqualOptions.py create mode 100644 ethosu/vela/tflite/ExpOptions.py create mode 100644 ethosu/vela/tflite/ExpandDimsOptions.py create mode 100644 ethosu/vela/tflite/FakeQuantOptions.py create mode 100644 ethosu/vela/tflite/FillOptions.py create mode 100644 ethosu/vela/tflite/FloorDivOptions.py create mode 100644 ethosu/vela/tflite/FloorModOptions.py create mode 100644 ethosu/vela/tflite/FullyConnectedOptions.py create mode 100644 ethosu/vela/tflite/FullyConnectedOptionsWeightsFormat.py create mode 100644 ethosu/vela/tflite/GatherNdOptions.py create mode 100644 ethosu/vela/tflite/GatherOptions.py create mode 100644 ethosu/vela/tflite/GreaterEqualOptions.py create mode 100644 ethosu/vela/tflite/GreaterOptions.py create mode 100644 ethosu/vela/tflite/HardSwishOptions.py create mode 100644 ethosu/vela/tflite/IfOptions.py create mode 100644 ethosu/vela/tflite/Int32Vector.py create mode 100644 ethosu/vela/tflite/L2NormOptions.py create mode 100644 ethosu/vela/tflite/LSHProjectionOptions.py create mode 100644 ethosu/vela/tflite/LSHProjectionType.py create mode 100644 ethosu/vela/tflite/LSTMKernelType.py create mode 100644 ethosu/vela/tflite/LSTMOptions.py create mode 100644 ethosu/vela/tflite/LeakyReluOptions.py create mode 100644 ethosu/vela/tflite/LessEqualOptions.py create mode 100644 ethosu/vela/tflite/LessOptions.py create mode 100644 ethosu/vela/tflite/LocalResponseNormalizationOptions.py create mode 100644 ethosu/vela/tflite/LogSoftmaxOptions.py create mode 100644 ethosu/vela/tflite/LogicalAndOptions.py create mode 100644 ethosu/vela/tflite/LogicalNotOptions.py create mode 100644 ethosu/vela/tflite/LogicalOrOptions.py create mode 100644 ethosu/vela/tflite/MatrixDiagOptions.py create mode 100644 ethosu/vela/tflite/MatrixSetDiagOptions.py create mode 100644 ethosu/vela/tflite/MaximumMinimumOptions.py create mode 100644 ethosu/vela/tflite/Metadata.py create mode 100644 ethosu/vela/tflite/MirrorPadMode.py create mode 100644 ethosu/vela/tflite/MirrorPadOptions.py create mode 100644 ethosu/vela/tflite/Model.py create mode 100644 ethosu/vela/tflite/MulOptions.py create mode 100644 ethosu/vela/tflite/NegOptions.py create mode 100644 
ethosu/vela/tflite/NonMaxSuppressionV4Options.py create mode 100644 ethosu/vela/tflite/NonMaxSuppressionV5Options.py create mode 100644 ethosu/vela/tflite/NotEqualOptions.py create mode 100644 ethosu/vela/tflite/OneHotOptions.py create mode 100644 ethosu/vela/tflite/Operator.py create mode 100644 ethosu/vela/tflite/OperatorCode.py create mode 100644 ethosu/vela/tflite/PackOptions.py create mode 100644 ethosu/vela/tflite/PadOptions.py create mode 100644 ethosu/vela/tflite/PadV2Options.py create mode 100644 ethosu/vela/tflite/Padding.py create mode 100644 ethosu/vela/tflite/Pool2DOptions.py create mode 100644 ethosu/vela/tflite/PowOptions.py create mode 100644 ethosu/vela/tflite/QuantizationDetails.py create mode 100644 ethosu/vela/tflite/QuantizationParameters.py create mode 100644 ethosu/vela/tflite/QuantizeOptions.py create mode 100644 ethosu/vela/tflite/RNNOptions.py create mode 100644 ethosu/vela/tflite/RangeOptions.py create mode 100644 ethosu/vela/tflite/RankOptions.py create mode 100644 ethosu/vela/tflite/ReducerOptions.py create mode 100644 ethosu/vela/tflite/ReshapeOptions.py create mode 100644 ethosu/vela/tflite/ResizeBilinearOptions.py create mode 100644 ethosu/vela/tflite/ResizeNearestNeighborOptions.py create mode 100644 ethosu/vela/tflite/ReverseSequenceOptions.py create mode 100644 ethosu/vela/tflite/ReverseV2Options.py create mode 100644 ethosu/vela/tflite/SVDFOptions.py create mode 100644 ethosu/vela/tflite/ScatterNdOptions.py create mode 100644 ethosu/vela/tflite/SegmentSumOptions.py create mode 100644 ethosu/vela/tflite/SelectOptions.py create mode 100644 ethosu/vela/tflite/SelectV2Options.py create mode 100644 ethosu/vela/tflite/SequenceRNNOptions.py create mode 100644 ethosu/vela/tflite/ShapeOptions.py create mode 100644 ethosu/vela/tflite/SkipGramOptions.py create mode 100644 ethosu/vela/tflite/SliceOptions.py create mode 100644 ethosu/vela/tflite/SoftmaxOptions.py create mode 100644 ethosu/vela/tflite/SpaceToBatchNDOptions.py create mode 100644 ethosu/vela/tflite/SpaceToDepthOptions.py create mode 100644 ethosu/vela/tflite/SparseIndexVector.py create mode 100644 ethosu/vela/tflite/SparseToDenseOptions.py create mode 100644 ethosu/vela/tflite/SparsityParameters.py create mode 100644 ethosu/vela/tflite/SplitOptions.py create mode 100644 ethosu/vela/tflite/SplitVOptions.py create mode 100644 ethosu/vela/tflite/SquareOptions.py create mode 100644 ethosu/vela/tflite/SquaredDifferenceOptions.py create mode 100644 ethosu/vela/tflite/SqueezeOptions.py create mode 100644 ethosu/vela/tflite/StridedSliceOptions.py create mode 100644 ethosu/vela/tflite/SubGraph.py create mode 100644 ethosu/vela/tflite/SubOptions.py create mode 100644 ethosu/vela/tflite/Tensor.py create mode 100644 ethosu/vela/tflite/TensorType.py create mode 100644 ethosu/vela/tflite/TileOptions.py create mode 100644 ethosu/vela/tflite/TopKV2Options.py create mode 100644 ethosu/vela/tflite/TransposeConvOptions.py create mode 100644 ethosu/vela/tflite/TransposeOptions.py create mode 100644 ethosu/vela/tflite/Uint16Vector.py create mode 100644 ethosu/vela/tflite/Uint8Vector.py create mode 100644 ethosu/vela/tflite/UnidirectionalSequenceLSTMOptions.py create mode 100644 ethosu/vela/tflite/UniqueOptions.py create mode 100644 ethosu/vela/tflite/UnpackOptions.py create mode 100644 ethosu/vela/tflite/WhereOptions.py create mode 100644 ethosu/vela/tflite/WhileOptions.py create mode 100644 ethosu/vela/tflite/ZerosLikeOptions.py create mode 100644 ethosu/vela/tflite/__init__.py create mode 100644 
ethosu/vela/tflite_mapping.py create mode 100644 ethosu/vela/tflite_reader.py create mode 100644 ethosu/vela/tflite_writer.py create mode 100644 ethosu/vela/vela.py create mode 100644 ethosu/vela/weight_compressor.py create mode 100644 setup.py diff --git a/Pipfile b/Pipfile new file mode 100644 index 00000000..300bef65 --- /dev/null +++ b/Pipfile @@ -0,0 +1,9 @@ +[[source]] +name = "pypi" +url = "https://pypi.org/simple" +verify_ssl = true + +[dev-packages] + +[packages] +ethos-u-vela = {editable = true,path = "."} diff --git a/Pipfile.lock b/Pipfile.lock new file mode 100644 index 00000000..6fa01549 --- /dev/null +++ b/Pipfile.lock @@ -0,0 +1,56 @@ +{ + "_meta": { + "hash": { + "sha256": "2d930644f3f81f11dae3317cae890fe083479342c80da44161b46ac83d6972d5" + }, + "pipfile-spec": 6, + "requires": {}, + "sources": [ + { + "name": "pypi", + "url": "https://pypi.org/simple", + "verify_ssl": true + } + ] + }, + "default": { + "ethos-u-vela": { + "editable": true, + "path": "." + }, + "flatbuffers": { + "hashes": [ + "sha256:776a959c5f70b41819fa75de44ed14fd984fa1a79b378f27e6f4fff338cbdca2", + "sha256:f24185db54193540e3d684dc98aa7c2d89882341641548ceb36fd2589fef6c4e" + ], + "version": "==1.11.0" + }, + "numpy": { + "hashes": [ + "sha256:1598a6de323508cfeed6b7cd6c4efb43324f4692e20d1f76e1feec7f59013448", + "sha256:1b0ece94018ae21163d1f651b527156e1f03943b986188dd81bc7e066eae9d1c", + "sha256:2e40be731ad618cb4974d5ba60d373cdf4f1b8dcbf1dcf4d9dff5e212baf69c5", + "sha256:4ba59db1fcc27ea31368af524dcf874d9277f21fd2e1f7f1e2e0c75ee61419ed", + "sha256:59ca9c6592da581a03d42cc4e270732552243dc45e87248aa8d636d53812f6a5", + "sha256:5e0feb76849ca3e83dd396254e47c7dba65b3fa9ed3df67c2556293ae3e16de3", + "sha256:6d205249a0293e62bbb3898c4c2e1ff8a22f98375a34775a259a0523111a8f6c", + "sha256:6fcc5a3990e269f86d388f165a089259893851437b904f422d301cdce4ff25c8", + "sha256:82847f2765835c8e5308f136bc34018d09b49037ec23ecc42b246424c767056b", + "sha256:87902e5c03355335fc5992a74ba0247a70d937f326d852fc613b7f53516c0963", + "sha256:9ab21d1cb156a620d3999dd92f7d1c86824c622873841d6b080ca5495fa10fef", + "sha256:a1baa1dc8ecd88fb2d2a651671a84b9938461e8a8eed13e2f0a812a94084d1fa", + "sha256:a244f7af80dacf21054386539699ce29bcc64796ed9850c99a34b41305630286", + "sha256:a35af656a7ba1d3decdd4fae5322b87277de8ac98b7d9da657d9e212ece76a61", + "sha256:b1fe1a6f3a6f355f6c29789b5927f8bd4f134a4bd9a781099a7c4f66af8850f5", + "sha256:b5ad0adb51b2dee7d0ee75a69e9871e2ddfb061c73ea8bc439376298141f77f5", + "sha256:ba3c7a2814ec8a176bb71f91478293d633c08582119e713a0c5351c0f77698da", + "sha256:cd77d58fb2acf57c1d1ee2835567cd70e6f1835e32090538f17f8a3a99e5e34b", + "sha256:cdb3a70285e8220875e4d2bc394e49b4988bdb1298ffa4e0bd81b2f613be397c", + "sha256:deb529c40c3f1e38d53d5ae6cd077c21f1d49e13afc7936f7f868455e16b64a0", + "sha256:e7894793e6e8540dbeac77c87b489e331947813511108ae097f1715c018b8f3d" + ], + "version": "==1.18.2" + } + }, + "develop": {} +} diff --git a/README.md b/README.md new file mode 100644 index 00000000..03ad7fec --- /dev/null +++ b/README.md @@ -0,0 +1,112 @@ +# Vela +This tool is used to compile a [TensorFlow Lite for Microcontrollers](https://www.tensorflow.org/lite/microcontrollers) neural network model into an optimised version that can run on an embedded system containing an [Ethos-U55 NPU](https://www.arm.com/products/silicon-ip-cpu/ethos/ethos-u55). + +The optimised model will contain TensorFlow Lite Custom operators for those parts of the model that can be accelerated by the Ethos-U55. 
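One way to see which parts of a compiled model became custom (NPU) operators is to walk its operator codes with the flatbuffers classes this patch adds under `ethosu/vela/tflite/`. The sketch below is illustrative only and is not part of the patch; it assumes the generated classes follow the standard flatc Python API, and the output path shown is hypothetical.
```
from ethosu.vela.tflite.Model import Model
from ethosu.vela.tflite.BuiltinOperator import BuiltinOperator

# Load the compiled model and list its operator codes. Accelerated parts of
# the network appear as CUSTOM operator codes; everything else keeps its
# built-in TensorFlow Lite operator code and runs on the CPU.
with open("output/my_model_vela.tflite", "rb") as f:
    model = Model.GetRootAsModel(bytearray(f.read()), 0)

for i in range(model.OperatorCodesLength()):
    opcode = model.OperatorCodes(i)
    if opcode.BuiltinCode() == BuiltinOperator.CUSTOM:
        print("custom operator:", opcode.CustomCode())
    else:
        print("builtin operator code:", opcode.BuiltinCode())
```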
Parts of the model that cannot be accelerated are left unchanged and will instead run on the Cortex-M series CPU using an appropriate kernel (such as the [Arm](https://www.arm.com) optimised [CMSIS-NN](https://github.com/ARM-software/CMSIS_5/tree/develop/CMSIS/NN) kernels). + +After compilation the optimised model can only be run on an Ethos-U55 NPU embedded system. + +The tool will also generate performance estimates (EXPERIMENTAL) for the compiled model. + +## Environment +Vela runs on the Linux operating system. + +## Prerequisites +The following should be installed prior to the installation of Vela: + - Python >= 3.6 + - GNU toolchain (GCC, Binutils and libraries) or alternative C compiler/linker toolchain + +## Installation +Before running, the Vela package must be installed along with all its dependencies. To do this, first change to the directory that contains this README.md file. Then use the command: +``` +pip3 install -U setuptools>=40.1.0 +pip3 install . +``` + +Or, if you use the `pipenv` virtual environment tool: +``` +pipenv install . +``` + +## Running +Vela is run with an input `.tflite` file passed on the command line. This file contains the neural network to be compiled. The tool then outputs an optimised version with a `_vela.tflite` file suffix, along with the performance estimate (EXPERIMENTAL) CSV files, all to the output directory. + +If you use the `pipenv` virtual environment tool then first start by spawning a shell in the virtual environment: +``` +pipenv shell +``` +After that, running Vela is the same regardless of whether you are in a virtual environment or not. + +Example usage: +1) Compile the network `my_model.tflite`. The optimised version will be output to `./output/my_model_vela.tflite`. +``` +vela my_model.tflite +``` +2) Compile the network `/path/to/my_model.tflite` and specify the output to go in the directory `./results_dir/`. +``` +vela --output-dir ./results_dir /path/to/my_model.tflite +``` +3) To get a list of all available options: +``` +vela --help +``` +4) To specify information about the embedded system's configuration, use Vela's system configuration file. The following command selects the `MySysConfig` settings that are described in the `sys_cfg_vela.ini` system configuration file. More details can be found in the next section. +``` +vela --config sys_cfg_vela.ini --system-config MySysConfig my_model.tflite +``` + +### Vela's System Configuration file +This is used to describe various properties of the embedded system that the network will run in. + +Example of a Vela system configuration file: +``` +; File: sys_cfg_vela.ini +; The file contains two parts; a system config part and a CPU operator +; performance part. + +; System config +; Specifies properties such as the core clock speed, the size and speed of the +; four potential memory areas, and for various types of data which memory area +; is used to store them. The cpu property is used to link with the CPU operator +; performance. +; The four potential memory areas are: Sram, Dram, OnChipFlash, OffChipFlash.
+ +[SysConfig.MySysConfig] +npu_freq=500e6 +cpu=MyCpu +Sram_clock_scale=1 +Sram_port_width=64 +Dram_clock_scale=1 +Dram_port_width=64 +OnChipFlash_clock_scale=1 +OnChipFlash_port_width=64 +OffChipFlash_clock_scale=0.25 +OffChipFlash_port_width=32 +permanent_storage_mem_area=OffChipFlash +feature_map_storage_mem_area=Sram +fast_storage_mem_area=Sram + +; CPU operator performance +; Specifies properties that are used by a linear model to estimate the +; performance for any operations that will be run on the CPU (such as those not +; supported by the NPU). Setting the intercept and slope to 0 will result in +; the operator being excluded from the performance estimation. This is the same +; as not specifying the operator. If an explicit cpu is specified rather than +; using the default then the cpu name must match the cpu specified in the +; SysConfig section. + +[CpuPerformance.MyCpuOperator] +default.intercept=0.0 +default.slope=1.0 + +MyCpu.intercept=0.0 +MyCpu.slope=1.0 +``` + +## Contribution Guidelines and Pull Requests +Contributions are accepted under [Apache License 2.0](LICENSE.txt). Only submit contributions where you have authored all of the code. + +## Resources +* [Ethos-U55](https://www.arm.com/products/silicon-ip-cpu/ethos/ethos-u55) + +## License +Vela is licensed under [Apache License 2.0](LICENSE.txt) diff --git a/ethosu/mlw_codec/makefile b/ethosu/mlw_codec/makefile new file mode 100644 index 00000000..6eb418dd --- /dev/null +++ b/ethosu/mlw_codec/makefile @@ -0,0 +1,49 @@ +# Copyright (C) 2020 Arm Limited or its affiliates. All rights reserved. +# +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +# Description: +# Makefile to build mlw_codec + +UNAME=$(shell uname -o) + +CFLAGS=-Wall -Wno-unused-function -Wno-unused-variable + +ifeq ($(DEBUG),1) + CFLAGS+=-g -O0 -DDEBUG +else + CFLAGS+=-O3 +endif + +LIBSRCS=mlw_encode.c mlw_decode.c +LIBHDRS=mlw_encode.h mlw_decode.h mlw_common.h + +ifeq ($(UNAME),Cygwin) + MLWEXE=mlw_codec.exe +else + MLWEXE=mlw_codec +endif + +all: mlwexe + +.PHONY: mlwexe +mlwexe: $(MLWEXE) + +clean: + rm -f $(MLWEXE) + +$(MLWEXE): mlw_main.c $(LIBSRCS) $(LIBHDRS) makefile + gcc $(CFLAGS) mlw_main.c $(LIBSRCS) -o $(MLWEXE) -lm diff --git a/ethosu/mlw_codec/mlw_codecmodule.c b/ethosu/mlw_codec/mlw_codecmodule.c new file mode 100644 index 00000000..de945ab3 --- /dev/null +++ b/ethosu/mlw_codec/mlw_codecmodule.c @@ -0,0 +1,174 @@ +/* + * Copyright (c) 2020 Arm Limited. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#define PY_SSIZE_T_CLEAN +#include <Python.h> + +#include "mlw_decode.h" +#include "mlw_encode.h" + +/* C extension wrapper for mlw_encode + * + * This method is exposed directly in Python with a + * prototype of the form: + * + * output = mlw_codec.encode(input, verbose=0) + * + * input: [int] + * verbose: int + * output: bytearray + */ + +static PyObject * +method_encode (PyObject *self, PyObject *args) +{ + /* Object to hold the input integer list. */ + PyObject *input_list_object; + + /* Object to hold the input verbosity integer, the verbose argument + * is optional so defaulted to 0. + */ + int verbose = 0; + + /* Arguments to the method are delivered as a tuple, unpack the + * tuple to get the individual arguments, note the second is + * optional. + */ + if (!PyArg_ParseTuple(args, "O|i", &input_list_object, &verbose)) + return NULL; + + /* Unpack the length of the input integer list. */ + int input_length = PyObject_Length (input_list_object); + if (input_length < 0) + input_length = 0; + + /* We need to marshall the integer list into an input buffer + * suitable for mlw_encode, use a temporary heap allocated buffer + * for that purpose. + */ + int16_t *input_buffer = (int16_t *) malloc(sizeof(int16_t) * input_length); + if (input_buffer == NULL) + return PyErr_NoMemory(); + + /* Unpack the input integer list into the temporary buffer, + * treating non-integer items as zero. + */ + for (int i = 0; i < input_length; i++) + { + PyObject *item; + item = PyList_GetItem(input_list_object, i); + if (!PyLong_Check(item)) + input_buffer[i] = 0; + else + input_buffer[i] = PyLong_AsLong(item); + } + + /* We don't know the output length required, we guess worst case, + * the mlw_encode call will do a resize (downwards) anyway. + */ + uint8_t *output_buffer = malloc(input_length); + if (output_buffer == NULL) + return PyErr_NoMemory(); + + int output_length = mlw_encode(input_buffer, input_length, &output_buffer, verbose); + + PyObject *output_byte_array = PyByteArray_FromStringAndSize ((char *) output_buffer, output_length); + + /* Discard the temporary input and output buffers. */ + free (input_buffer); + free (output_buffer); + + return output_byte_array; +} + +/* C extension wrapper for mlw_decode + * + * This method is exposed directly in Python with a + * prototype of the form: + * + * output = mlw_codec.decode(input, verbose=0) + * + * input: bytearray + * verbose: int + * output: [int] + */ + +static PyObject * +method_decode(PyObject *self, PyObject *args) +{ + /* Object to hold the input bytearray. */ + PyObject *input_bytearray_object; + + /* Object to hold the input verbosity integer, the verbose argument + * is optional so defaulted to 0. + */ + int verbose = 0; + + /* Arguments to the method are delivered as a tuple, unpack the + * tuple to get the individual arguments, note the second is + * optional. + */ + if (!PyArg_ParseTuple(args, "Y|i", &input_bytearray_object, &verbose)) + return NULL; + + /* Unpack the input buffer and length from the bytearray object. */ + uint8_t *input_buffer = (uint8_t *) PyByteArray_AsString(input_bytearray_object); + int input_length = PyByteArray_Size(input_bytearray_object); + + /* We don't know the output length required, we guess, but the guess + * will be too small, the mlw_decode call will do a resize (upwards) + * anyway.
+ */ + int16_t *output_buffer = malloc (input_length); + if (output_buffer == NULL) + return PyErr_NoMemory(); + + int output_length = mlw_decode (input_buffer, input_length, &output_buffer, verbose); + + /* Construct a new integer list and marshall the output buffer + * contents into the list. */ + PyObject *output_list = PyList_New(output_length); + for (int i = 0; i + +#ifndef __MLW_COMMON_H__ +#define __MLW_COMMON_H__ + +#define ZDIV_DISABLE 6 // not alternating mode +#define ZDIV_EOS 7 // indicates end of stream + +#define WDIV_UNCOMPRESSED 7 // indicates uncompressed weights + +#endif diff --git a/ethosu/mlw_codec/mlw_decode.c b/ethosu/mlw_codec/mlw_decode.c new file mode 100644 index 00000000..92aaea67 --- /dev/null +++ b/ethosu/mlw_codec/mlw_decode.c @@ -0,0 +1,300 @@ +/* + * Copyright (c) 2020 Arm Limited. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "mlw_common.h" +#include "mlw_decode.h" + + +/////////////////////////////// Read from bitstream + +typedef struct bitbuf { + uint8_t *buf; + int buf_size; // in bytes + int pos; // bit pos of next bit + int log_symbols; +} bitbuf_t; + + +// size in byte +static void bitbuf_init( bitbuf_t *bb, uint8_t *buf, int size, int log_symbols) { + bb->buf = buf; + bb->pos = 0; + bb->buf_size = size; + bb->log_symbols = log_symbols; +} + +static int bitbuf_getbit( bitbuf_t *bb) { + int byte_pos = bb->pos>>3; + int bit_pos = bb->pos&7; + if ( byte_pos < 0 || byte_pos >= bb->buf_size ) { + printf("bitbuf_getbit: underrun, bit_pos %3d byte_pos %3d buf_size %3d\n", bit_pos, byte_pos, bb->buf_size); + exit(1); + } + int bit = bb->buf[ byte_pos ] & (1<pos++; + return bit; +} + +static int bitbuf_get( bitbuf_t *bb, const char *name, int len) { + int i, data=0, save_pos=bb->pos; + if (len>0) { + for(i=0; ilog_symbols) + printf("bitbuf: pos %3d %7s len %d data %x\n", save_pos, name, len, data); + } + return data; +} + +// Decode the given weight stream +// inbuf compressed bitstream +// inbuf_size size of compressed bitstream in bytes +// outbuf uncompressed 9bit signed weights, buffer malloced +// verbose if non-zero, printf log +// Return value is the number of uncompressed weights +int mlw_decode( uint8_t *inbuf, int inbuf_size, int16_t **outbuf, int verbose) { + int nvalues; + int w_grc_div; + int w_grc_trunc; + int w_uncompressed; + int z_grc_div, z_prev_grc_div=0; + int new_palette; + int palsize=0, palbits=0; + int direct_offset=0; + int16_t palette[512]; + int first=1; + int use_zero_run, i, j; + int outbuf_size=0; + int nchunks=0; + + *outbuf=0; + + bitbuf_t bitbuf_s, *bb=&bitbuf_s; + bitbuf_init( bb, inbuf, inbuf_size, (verbose&2)?1:0 ); + + // Loop over all slices + while(1) { + // Decode slice header + z_grc_div = bitbuf_get( bb, "ZDIV", 3 ); + while(z_grc_div==ZDIV_EOS) { // TODO: change to ZDIV_PAD + // End of stream + // Byte align + bitbuf_get( bb, "BYTEALIGN", 
(8-(bb->pos&7))&7 ); + first=1; + if ( (bb->pos/8) == inbuf_size) { + // Quit if we actually reached end of input stream + break; + } + z_grc_div = bitbuf_get( bb, "ZDIV", 3 ); + } + if ( (bb->pos/8) == inbuf_size) { + break; // reached end of input stream + } + assert(z_grc_div<4 || z_grc_div==ZDIV_DISABLE); + use_zero_run = z_grc_div!=ZDIV_DISABLE; // alternating grc + nvalues = bitbuf_get( bb, "SLICELEN", 15 )+1; + w_grc_div = bitbuf_get( bb, "WDIV", 3 ); + w_grc_trunc = bitbuf_get( bb, "WTRUNC", 1 ); + new_palette = bitbuf_get( bb, "NEWPAL", 1 ); + if (first) { + // the first slice must have a palette/direct mode setup + assert(new_palette); + first=0; + } + if (!new_palette) { + // At the moment it is not supported to change between alternating + // and non-alternating without redefining the palette (this is because + // the zero is not included in the palette in case of alternating) + int prev_use_zero_run = z_prev_grc_div!=ZDIV_DISABLE; + (void)(prev_use_zero_run); + assert( use_zero_run == prev_use_zero_run); + } + z_prev_grc_div = z_grc_div; + if (new_palette) { + direct_offset = bitbuf_get( bb, "DIROFS", 5 ); + palsize = bitbuf_get( bb, "PALSIZE", 5 ); + if (palsize>0) + palsize++; + palbits = bitbuf_get( bb, "PALBITS", 3 )+2; + for(i=0; i0) { + // Uncompressed bits is given by palette size. + uncompressed_bits=0; + while( (1<=0 && use_zero_run && z_pos5 ? 8 : 12; + for(i=0; i>1; + } + cnt += code; + if (code<2 || w_grc_trunc) { + w_q[w_nsymbols++] = cnt; + cnt=0; + } + } + w_carry = cnt; + w_pos += w_nsymbols; + } + if (w_prev_enable) { + for(i=0; i>1; + (*outbuf)[k++] = sign ? -mag : mag; + if (use_zero_run) { + for(j=0; j + +#ifndef __MLW_DECODE_H__ +#define __MLW_DECODE_H__ + +#ifdef _MSC_VER + #define EXPORTED __declspec(dllexport) +#else + #define EXPORTED __attribute__((visibility("default"))) +#endif + +#if __cplusplus +extern "C" +{ +#endif + +EXPORTED +int mlw_decode(uint8_t *inbuf, int inbuf_size, int16_t **outbuf, int verbose); + +#if __cplusplus +} +#endif + +#endif diff --git a/ethosu/mlw_codec/mlw_encode.c b/ethosu/mlw_codec/mlw_encode.c new file mode 100644 index 00000000..ac25fc52 --- /dev/null +++ b/ethosu/mlw_codec/mlw_encode.c @@ -0,0 +1,874 @@ +/* + * Copyright (c) 2020 Arm Limited. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "mlw_common.h" +#include "mlw_encode.h" + +#define DPRINTF(...) +//#define DPRINTF(...) 
printf(__VA_ARGS__) + +#define ZERO_RUN_THRES 4 + +#define min(a,b) ((a)<(b)?(a):(b)) +#define max(a,b) ((a)>(b)?(a):(b)) + +typedef struct palette { + int16_t lut[32]; + int16_t inv_lut[512]; + int palsize; // number of palette entries + int palbits; // bit width of palette entries + int use_zero_runs; // zeros are coded separately + int only_palette; // no values outside the palette + int direct_offset; // added to the decoded weight index before direct conversion to sign/mag + int only_zeros; // special case that the section is all zeros +} palette_t; + +static int is_power_of_two( int x ) { + return ((x-1) & x)==0; +} + +static int get_palette_index_bits( int size ) { + int i; + for(i=7; i>=0; i--) + if (size > (1< (i-last_restart_idx)/4; + + if (got_palette) { + // Check if the next value is not covered by the current palette + if ( prev_idx[ buf[i]+256 ] < last_restart_idx ) { + // New value: increase the palette size + palette_size++; + DPRINTF("Note: at pos %d extend palette to size %d\n", i, palette_size); + if ( is_power_of_two(palette_size-1-exclude_zero) ) { + if ( (i - last_restart_idx - zero_cnt) > 512 || (palette_size-exclude_zero)>32 ) { + // create a new palette because we extend a long lasting palette to require one more index bit + DPRINTF("Note: at pos %d create new palette because previous has to increase one more index bit. last_restart_idx %d n %d zero_cnt %d\n", i, last_restart_idx, i - last_restart_idx, zero_cnt ); + assert( restart_i < max_palettes ); + DPRINTF("restart %d pos %d\n", restart_i, i); + restart_pos[restart_i++] = i; + last_restart_idx = i; + got_palette=0; + zero_cnt=0; + } + } + } + } + + prev_idx[ buf[i]+256 ] = i; + if (buf[i]==0) + zero_cnt++; + + static const int window_sizes[5][2] = {{32,1}, {64,1}, {128,1}, {256,1}, {512,1}}; + int k; + // loop over window sizes + for(k=0; k<5; k++) { + // Every Nth non-zero value, count what would be the size of a palette covering the last N NZ. 
+ int N = window_sizes[k][0] * (got_palette?2:1); + if ( (i - last_restart_idx - zero_cnt) > 0 && ((i - last_restart_idx - zero_cnt) % N)==0 ) { + // Search backward to the position N nonzero values earlier + int nzcnt=0; + for( j=i; j>last_restart_idx; j--) { + if ( buf[j]!=0 ) { + if (nzcnt==N+1) + break; + nzcnt++; + } + } + int restart_idx = j; + + // Calculate the size of a new palette (starting at restart_idx) + int new_palette_size=0; + for(j=0; j<512; j++) { + if ( prev_idx[j] >= restart_idx ) { + new_palette_size++; + } + } + + int create_new_palette=0; + if (got_palette) { + int new_size_bits = get_palette_index_bits( new_palette_size - exclude_zero ); + int old_size_bits = get_palette_index_bits( palette_size - exclude_zero ); + int savings = N*(old_size_bits*15-new_size_bits*15)/16 - new_palette_size*8 - 20; + if ( savings>0 ) { + // Create new palette because it can be smaller than the existing palette + create_new_palette=1; + DPRINTF("Note: at pos %d restart smaller palette\n", restart_idx); + } + } else { + if ( (new_palette_size-exclude_zero) <= 32) { + int new_size_bits = get_palette_index_bits( new_palette_size - exclude_zero ); + // estimate if we will make savings by using palette mode + int savings = N*(90-new_size_bits*15)/16 - new_palette_size*8 - 20; + create_new_palette = savings>0; + } + } + if (create_new_palette) { + palette_size=new_palette_size; + got_palette=1; + last_restart_idx = restart_idx; + DPRINTF("Note: at pos %d create palette of size %d\n", last_restart_idx, new_palette_size); + if ( restart_pos[restart_i-1] != last_restart_idx) { + assert( restart_i < max_palettes ); + restart_pos[restart_i++] = last_restart_idx; + } + zero_cnt=0; + for( j=last_restart_idx; j<=i; j++) + if (buf[j]==0) + zero_cnt++; + } + } + } + } + // Reallocate to actual size + *palette_restart_positions = (int*)realloc( restart_pos, restart_i*sizeof(int) ); + return restart_i; +} + +// Calculate frequency table +static void calc_freq( const int16_t *buf, int size, int freq[512] ) { + int i; + memset(freq, 0, 512*sizeof(int)); + for(i=0; ibb ? -1 : aa0) { + all_max_val = max(all_max_val, palval); + } + } + + // Count number of non-used weight values around zero (0, -1, +1, -2, +2 etc) + for(i=0; i<31; i++) { + if ((freq64[i]>>16)!=0) + break; + } + p->direct_offset = i; + + // Sort in descending frequency order + qsort(freq64, 512, sizeof(uint64_t), cmp_uint64); + + // Identify special case that there are no weights to code + // in the weight index stream (i.e. all weights are zeros) + p->only_zeros = (freq64[0]>>16)==0; + if (p->only_zeros) { + p->direct_offset=0; + } + + // Check if all weights fit into the palette (and the palette is not empty) + p->only_palette = (freq64[0]>>16)>0 && (freq64[32]>>16)==0; + + int max_palette_size; + if (p->only_palette) { + max_palette_size = 32; + } else { + // For direct-lut we must make sure that the encoded weight + // index is not > 511. We do that by limiting the palette size + // such that the greatest value can be reached after subtracting + // the palette size. 
+ max_palette_size = min(32, 511-all_max_val); + if (max_palette_size==1) { + max_palette_size=0; // because palette of size 1 is not supported + } + } + + // Setup the 32 entry palette + int palette_max_val = 0, val, cnt, pal_cnt=0; + for(i=0; i>16; + val = freq64[i]&0xffff; + if ( cnt==0 ) + break; + p->lut[i] = val; + palette_max_val = max(palette_max_val, val); + pal_cnt+=cnt; + } + if (i==1) + i++; // palette size of 1 is not supported, make it 2 + + // Heuristic for when to use the palette. If more than half of the + // weights are in the palette then we use it. This ensures we don't + // use palette for e.g. rectangular distributions. + int palbits_val; + if (pal_cnt > all_cnt/2) { + p->palsize = i; + palbits_val = palette_max_val; + } else { + // No palette + p->palsize = 0; + // If no palette, then palbits is used to specify the + // number of bits required for uncompressed mode, i.e. + // the number of bits for the greatest weight value + palbits_val = all_max_val; + } + + // the palette entry bit width + // minimum 2bits (because PALBITS is in range 2..9) + int palbits=2; + while( (1<palbits = palbits; + p->use_zero_runs = use_zero_runs; +} + +// Return 1 if zero runs should be used +// If palette_size is 512, then palette is not used (in that case the palette is setup +// with the standard alternating unsigned to signed mapping) +static int find_palette( const int16_t *inbuf, int inbuf_size, palette_t *p) { + int freq[512], i; + + // Calculate frequencies of the given weight stream + calc_freq( inbuf, inbuf_size, freq); + + // Find two most common values + int most_common_freq[2]={0}, most_common_val[2]={0}; + for(i=0; i<512; i++) { + if ( freq[i] > most_common_freq[0] ) { + most_common_freq[1] = most_common_freq[0]; + most_common_val[1] = most_common_val[0]; + most_common_freq[0] = freq[i]; + most_common_val[0] = i-256; + } else if ( freq[i] > most_common_freq[1] ) { + most_common_freq[1] = freq[i]; + most_common_val[1] = i-256; + } + } + + // Decide if zero-runs (alternating mode) should be used: + // * zero should be the most common symbol + // * zero should be sufficiently more common than the second most common symbol + int use_zero_runs = most_common_val[0]==0 && most_common_freq[0] > ZERO_RUN_THRES*most_common_freq[1]; + + // Create the palette + create_palette( freq, use_zero_runs, p); + + return use_zero_runs; +} + +static void create_inverse_palette( palette_t *p) { + int i; + memset( p->inv_lut, 0, sizeof(p->inv_lut)); + for(i=0; i<512; i++) { + int val = i; + int sign = val&1; + int mag = val>>1; + int weight = sign ? -mag : mag; + if (weight+256 < 512) + p->inv_lut[ weight+256 ] = i + p->palsize - p->direct_offset; + } + for(i=0; ipalsize; i++) { + int val = p->lut[i]; + int sign = val&1; + int mag = val>>1; + int weight = sign ? -mag : mag; + if (weight+256 < 512) + p->inv_lut[ weight+256 ] = i; + } +} + +#define NWCFG 13 +#define NZCFG 4 // restrict search to ZDIV=0..3 +#define MAX_ZWCFG (max(NWCFG,NZCFG)) + +// search state +typedef struct search_state { + int bitcnt; // number of bits to reach this state + uint8_t prev_cfg; // previous grc parameter config +} search_state_t; + +// (trunc<<4) | div, 0x20 means uncompressed +static const char w_grc_params[] = { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x20 }; +static const char z_grc_params[] = { 0x00, 0x01, 0x02, 0x03, 0x04 }; + + + +// An algorithm similar to the Viterbi algorithm is used to search for a +// good GRC parameter sequence for the given input value sequence. 
+// The inval buffer can contain weights, weight indices or runs. +// The return value is the resulting number of bitstream sections. +static int search_grc_params( const int *inval_buf, + int n_inval, + int zrun_mode, + int uncompressed_bits, + uint8_t *grc_param_cfg, + int *grc_param_pos, + int max_grc_param_cfg, + int *existing_grc_param_pos, + int n_existing_grc_param_pos, + int *bitcnt ) +{ + int n_cfg = zrun_mode ? NZCFG : NWCFG; + const char *grc_params = zrun_mode ? z_grc_params : w_grc_params; + int i,j; + + search_state_t *state[MAX_ZWCFG]; + for(i=0; i>4; + int q = value>>div; + int bits = trunc ? min(q+1,2) + div : q+1+div; + if (!zrun_mode && ((trunc && q>2) || q>31)) + bits=10000; // it's not possible to code the current value; give it a high cost + if (trunc==2) + bits=uncompressed_bits; + + if ( best_bitcnt + cmd_cost < state[j][i].bitcnt ) { + // Change GRC parameters + state[j][i+1].prev_cfg = best_cfg; + state[j][i+1].bitcnt = best_bitcnt + cmd_cost + bits; + } else { + // Keep same GRC parameters + state[j][i+1].prev_cfg = j; + state[j][i+1].bitcnt = state[j][i].bitcnt + bits; + } + } + } + + + // Best GRC parameter + int best_bitcnt=0x7fffffff, best_cfg=0; + for(j=0; j=0; i--) { + if (state[cfg][i].prev_cfg != cfg || i==0) { + n_cmds++; + cfg = state[cfg][i].prev_cfg; + } + } + + (void)(max_grc_param_cfg); + assert(n_cmds<=max_grc_param_cfg); + + cfg = best_cfg; + j=n_cmds-1; + int endpos=n_inval; + for(i=n_inval; i>=0; i--) { + if (state[cfg][i].prev_cfg != cfg || i==0) { + grc_param_cfg[j] = cfg; + grc_param_pos[j] = endpos; + j--; + cfg = state[cfg][i].prev_cfg; + endpos = i-1; + } + } + assert(j==-1); + + for(i=0; ibuf = buf; + bb->pos = 0; + bb->buf_size = size; + bb->log_symbols = log_symbols; +} + +static void bitbuf_putbit( bitbuf_t *bb, int bit) { + int byte_pos = bb->pos>>3; + int bit_pos = bb->pos&7; + assert( byte_pos >= 0 ); + assert( byte_pos < bb->buf_size ); + bb->buf[ byte_pos ] = (bb->buf[ byte_pos ] & ~(1<pos += 1; +} + +static void bitbuf_put( bitbuf_t *bb, const char *name, int len, int data) { + int i; + if (len>0) { + if (bb->log_symbols) + printf("bitbuf: pos %3d %7s len %d data %x\n", bb->pos, name, len, data); + for(i=0; i>i)&1); + } + } +} + +// Return new bitpos +static int encode_slice( const int *w_value, + const int *z_value, + int nvalues, + palette_t *p, + int new_palette, + int uncompressed_bits, + int w_cfg, + int z_cfg, + uint8_t *bitbuf, + int bitbuf_size, + int bitpos, + int verbose ) +{ + int i,j; + bitbuf_t bitbuf_s, *bb=&bitbuf_s; + bitbuf_init( bb, bitbuf, bitbuf_size, verbose&2?1:0 ); + bb->pos = bitpos; + + assert(nvalues<32768); + // GRC parameters for this slice + int w_grc_div = w_grc_params[w_cfg] & 15; + int w_grc_trunc = (w_grc_params[w_cfg] >> 4)==1; + int w_uncompressed = (w_grc_params[w_cfg] >> 4)==2; + int z_grc_div = z_grc_params[z_cfg] & 15; + + if (w_uncompressed) { + w_grc_div = uncompressed_bits; + } + + int zdiv = p->use_zero_runs ? z_grc_div : ZDIV_DISABLE; + int wdiv = !w_uncompressed ? 
w_grc_div : WDIV_UNCOMPRESSED; + + if (verbose&1) { + printf("slice: bitoffset %7d slicelen %5d zdiv %d wdiv %d wtrunc %d newpal %d palbits %d palsize %2d\n", + bb->pos, nvalues, zdiv, wdiv, w_grc_trunc, new_palette, p->palbits, p->palsize); + } + + // Write slice header + bitbuf_put( bb, "ZDIV", 3, zdiv); + bitbuf_put( bb, "SLICELEN", 15, nvalues-1 ); + bitbuf_put( bb, "WDIV", 3, wdiv); + bitbuf_put( bb, "WTRUNC", 1, w_grc_trunc ); + bitbuf_put( bb, "NEWPAL", 1, new_palette ); + if (new_palette) { + bitbuf_put( bb, "DIROFS", 5, p->direct_offset ); + bitbuf_put( bb, "PALSIZE", 5, max(0, p->palsize-1)); + bitbuf_put( bb, "PALBITS", 3, p->palbits-2 ); + for(i=0; ipalsize; i++) { + bitbuf_put( bb, "PALETTE", p->palbits, p->lut[i] ); + } + } + + int z_nvalues = nvalues + (new_palette?1:0); + int w_pos=0, z_pos=0; + int w_unary0=0, w_unary1=0, w_unary1_len=0, w_q=-1, w_r=0; + int z_unary=0, z_q=-1, z_r=0; + int w_nsymbols=0, w_remain[12]={0}; + int w_prev_enable=0, w_prev_nsymbols=0, w_prev_remain[12]={0}; + int z_nsymbols=0, z_remain[12]={0}; + int z_prev_enable=0, z_prev_nsymbols=0, z_prev_remain[12]={0}; + int z_unary_len = z_grc_div<3 ? 12 : 8; + do { + int balance = p->use_zero_runs ? w_pos - z_pos : 0; + int w_enable = balance<8 && w_pos=0 && p->use_zero_runs && z_pos5 ? 8 : 12; + while(j>w_grc_div; + w_r = value&((1<=0 && j0 ? (1<0) { + w_unary1 |= w_q>1 ? (1<=0) { + w_remain[w_nsymbols] = w_r; + w_nsymbols++; + w_pos++; + } + } + } + + if (z_enable) { + // Encode chunk (zrun) + j=0; + z_nsymbols=0; + z_unary=0; + while(j>z_grc_div; + z_r = value&((1<=0 && j0 ? (1<=0) { + z_remain[z_nsymbols] = z_r; + z_nsymbols++; + z_pos++; + } + } + } + + // Write chunk to bitstream + if (w_enable && !w_uncompressed) { + bitbuf_put( bb, "WUNARY0", 12, w_unary0); + } + if (z_enable) { + bitbuf_put( bb, "ZUNARY", z_unary_len, z_unary); + } + if (w_enable && !w_uncompressed) { + bitbuf_put( bb, "WUNARY1", w_unary1_len, w_unary1); + } + if (w_prev_enable) { + for(i=0; ipos; +} + + +// return new bitpos +static int encode_section( const int16_t *inbuf, + int size, + palette_t *p, + uint8_t *bitbuf, + int bitbuf_size, + int bitpos, + int verbose ) +{ + int uncompressed_bits; + + // Uncompressed mode can only be used if either all weights + // are in the palette OR if the palette is not used. + if (p->only_palette) { + // Uncompressed bits derived from palette size + uncompressed_bits=0; + while( (1<palsize ) + uncompressed_bits++; + } else if (p->palsize==0) { + // Uncompressed bits is palbits (which is the bitdepth of the greatest weight) + uncompressed_bits = p->palbits; + } else { + // Don't use uncompressed + uncompressed_bits = 100; + } + + int *weight_values = malloc( size*sizeof(int) ); + int *zrun_values = malloc( size*sizeof(int) ); + + // Get weights (or weight indicies) AND zero-runs from the input weight stream. + int i=0, n_weights = 0, zcnt; + while(1) { + if (p->use_zero_runs) { + zcnt=0; + // Count zero run + // Special case: if all weights in the section are zero, we must + // still ensure we have one coded weight so the the slice length + // doesn't become 0. 
Therefore we skip the first zero run and code + // the zero explicitly as a weight value instead + if (!p->only_zeros || i>0) { + while( iinv_lut[inbuf[i]+256]; + weight_values[n_weights] = value; + n_weights++; + i++; + } + + // Search for good GRC parameters for the weight stream + int n_w_slice, w_bitcnt; + uint8_t *w_slice_cfg; + int *w_slice_pos; + w_slice_cfg = malloc( size ); + w_slice_pos = malloc( size*sizeof(int) ); + n_w_slice = search_grc_params( weight_values, n_weights, 0, uncompressed_bits, w_slice_cfg, w_slice_pos, size, 0, 0, &w_bitcnt); + if (n_weights==0) + n_w_slice = 0; + + // Search for good GRC parameters for the zrun stream + int n_z_slice=0, z_bitcnt=0; + uint8_t *z_slice_cfg=0; + int *z_slice_pos=0; + if (p->use_zero_runs) { + z_slice_cfg = malloc( size ); + z_slice_pos = malloc( size*sizeof(int) ); + n_z_slice = search_grc_params( zrun_values, n_weights+1, 1, 0, z_slice_cfg, z_slice_pos, size, w_slice_pos, n_w_slice, &z_bitcnt); + } + + // Encode bitstream slice + int pos=0, i_w_slice=0, i_z_slice=0, new_palette=1; + while(posuse_zero_runs ? zrun_values+pos+(!new_palette) : 0; + bitpos = encode_slice( weight_values+pos, zrun_buf, len, + p, new_palette, uncompressed_bits, + w_slice_cfg[i_w_slice], p->use_zero_runs ? z_slice_cfg[i_z_slice] : 0, + bitbuf, bitbuf_size, bitpos, verbose ); + new_palette = 0; + + if (i_w_sliceuse_zero_runs) { + free(z_slice_cfg); + free(z_slice_pos); + } + free(weight_values); + free(zrun_values); + + return bitpos; +} + +// Encode the given weight stream +// inbuf uncompressed 9bit signed weights +// inbuf_size number of weights +// outbuf compressed bitstream, buffer is malloced +// verbose if non-zero, printf log +// Return value is the size in bytes of the compressed output +// Return -1 if error +int mlw_encode( int16_t *inbuf, int inbuf_size, uint8_t **outbuf, int verbose) { + int i; + // Range check + for(i=0; i255) { + printf("ERROR: weight out of range at index %d, weight value is %d (valid range is -255..255)\n", i, inbuf[i]); + return -1; + } + } + + int bitbuf_size = inbuf_size*2+1024; + *outbuf = malloc( bitbuf_size ); + + // Analyse input data to find palette re-programming points + int n_restarts; + int *palette_restart_pos; + n_restarts = search_palette_sections( inbuf, inbuf_size, &palette_restart_pos); + + // Compress each section (using a single palette) separately + int bitpos=0; + for(i=0; ipos = bitpos; + bitbuf_put( bb, "ZDIV", 3, ZDIV_EOS); + bitbuf_put( bb, "BYTEALIGN", (8-(bb->pos&7))&7, 0xff ); + + // Pad with 0xff until 64bit aligned + while( bb->pos & 127 ) { + bitbuf_put( bb, "PAD", 8, 0xff ); + } + bitpos = bb->pos; + } + assert((bitpos&127)==0); + int outbuf_size = bitpos/8; + *outbuf = realloc( *outbuf, outbuf_size); + + free(palette_restart_pos); + + return outbuf_size; +} + +void mlw_free_outbuf( uint8_t *outbuf ) { + if (outbuf) + free(outbuf); +} diff --git a/ethosu/mlw_codec/mlw_encode.h b/ethosu/mlw_codec/mlw_encode.h new file mode 100644 index 00000000..a995ac6e --- /dev/null +++ b/ethosu/mlw_codec/mlw_encode.h @@ -0,0 +1,45 @@ +/* + * Copyright (c) 2020 Arm Limited. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include <stdint.h> + +#ifndef __MLW_ENCODE_H__ +#define __MLW_ENCODE_H__ + +#ifdef _MSC_VER + #define EXPORTED __declspec(dllexport) +#else + #define EXPORTED __attribute__((visibility("default"))) +#endif + +#if __cplusplus +extern "C" +{ +#endif + +EXPORTED +int mlw_encode(int16_t *inbuf, int inbuf_size, uint8_t **outbuf, int verbose); + +EXPORTED +void mlw_free_outbuf(uint8_t *outbuf); + +#if __cplusplus +} +#endif + +#endif diff --git a/ethosu/mlw_codec/mlw_main.c b/ethosu/mlw_codec/mlw_main.c new file mode 100644 index 00000000..9f720495 --- /dev/null +++ b/ethosu/mlw_codec/mlw_main.c @@ -0,0 +1,177 @@ +/* + * Copyright (c) 2020 Arm Limited. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "mlw_encode.h" +#include "mlw_decode.h" + +static void fatal_error(const char *format, ...) { + va_list ap; + va_start (ap, format); + vfprintf(stderr, format, ap); + va_end(ap); + exit(1); +} + +static void print_usage(void) { + printf("Usage:\n"); + printf(" Encode: ./mlw_codec [<options>] [-o <outfile>] infiles.bin\n"); + printf(" Decode: ./mlw_codec [<options>] -d [-o <outfile>] infiles.mlw\n"); + printf("\n"); + printf("Options:\n"); + printf(" -w The uncompressed weight file is an int16_t (word) stream.\n"); + printf(" This is to support 9bit signed weights. Little endian is assumed.\n"); + printf(" The default format is int8_t (byte) stream (if -w is not specified)\n"); + printf("\n"); +} + +// Read file into allocated buffer. Return length in bytes.
+static int read_file( FILE *f, uint8_t **buf) { + + fseek(f, 0, SEEK_END); + int size = ftell(f); + fseek(f, 0, SEEK_SET); + *buf = malloc(size); + assert(*buf); + int rsize = fread(*buf, 1, size, f); + assert(rsize==size); + fclose(f); + return size; +} + + +#define MAX_INFILES 1000 + +int main(int argc, char *argv[]) +{ + int c, decode=0, inbuf_size, outbuf_size; + char *infile_name[MAX_INFILES], *outfile_name=0; + uint8_t *inbuf=0, *outbuf=0; + FILE *infile, *outfile=0; + int verbose=0, infile_idx=0; + int int16_format=0; + + if (argc==1) { + print_usage(); + exit(1); + } + + // Parse command line options + while( optind < argc) { + // Parse options + while ((c = getopt (argc, argv, "di:o:v:w?")) != -1) { + switch (c) { + case 'd': + decode=1; + break; + case 'i': + assert(infile_idx".format(self.width, self.height, self.depth) + + @classmethod + def from_string(cls, s): + w, h, c = (int(v) for v in s.split("x")) + return cls(w, h, c) + + +class Rect: + def __init__(self, x, y, z, x2, y2, z2): + self.x = x + self.y = y + self.z = z + self.x2 = x2 + self.y2 = y2 + self.z2 = z2 + + def start(self): + return PointXYZ(self.x, self.y, self.z) + + def end(self): + return PointXYZ(self.x2, self.y2, self.z2) + + def size(self): + return Block(self.x2 - self.x + 1, self.y2 - self.y + 1, self.z2 - self.z + 1) + + def __repr__(self): + return "".format(self.x, self.y, self.z, self.x2, self.y2, self.z2) + + +class Kernel: + def __init__(self, w, h, sx=1, sy=1, dx=1, dy=1): + assert sx > 0 and sy > 0 + assert dx > 0 and dy > 0 + self.width = w + self.height = h + self.stride = PointXY(sx, sy) + self.dilation = PointXY(dx, dy) + + +class SHRAMElements: + IFM8 = 0 + IFM16 = 1 + IFM8_Elementwise = 2 + IFM16_Elementwise = 3 + Acc16 = 4 + Acc32 = 5 + Acc40 = 6 + Last = Acc40 + BitSizes = np.array([8, 16, 8, 16, 16, 32, 40], np.int32) + + +class SHRAMBlockConfig: + def __init__(self, sizes, banks): + assert len(banks) == SHRAMElements.Last + 1 + self.sizes = sizes + self.banks = banks + + +# Area indices must match Ethos-U55 SHRAM layout spec +class SharedBufferArea(enum.IntEnum): + OFM = 0 + Weights = 1 + IFM = 2 + Accumulators = 3 + Size = Accumulators + 1 + + +class ArchitectureFeatures: + """This class is a container for various parameters of the Ethos-U55 core +and system configuration that can be tuned, either by command line +parameters or by the Ethos-U55 architects. The class is often passed +around to passes that need to do architecture-dependent actions. 
+ +Note the difference between ArchitectureFeatures and CompilerOptions +- ArchitectureFeatures is for changing the Ethos-U55 and system architecture +- CompilerOptions is for changing the behaviour of the compiler + +""" + + ArchitectureConfig = namedtuple( + "ArchitectureConfig", "macs cores ofm_ublock ifm_ublock shram_banks shram_granules elem_units" + ) + accelerator_configs = { + "ethos-u55-256": ArchitectureConfig(256, 1, Block(2, 2, 8), Block(2, 2, 8), 48, [8, 8, 8, 8, 8, 16, 20], 8), + "ethos-u55-128": ArchitectureConfig(128, 1, Block(2, 1, 8), Block(2, 2, 8), 24, [4, 4, 4, 4, 4, 8, 12], 4), + "ethos-u55-64": ArchitectureConfig(64, 1, Block(1, 1, 8), Block(1, 1, 8), 16, [2, 2, 2, 2, 4, 4, 8], 2), + "ethos-u55-32": ArchitectureConfig(32, 1, Block(1, 1, 4), Block(1, 1, 8), 16, [2, 2, 2, 2, 4, 4, 4], 1), + } + + OFMSplitDepth = 16 + + def __init__( + self, + vela_config: ConfigParser, + accelerator_config, + system_config, + permanent_storage, + inter_pass_cycle_delay, + dram_bandwidth, + override_block_config, + block_config_limit, + global_memory_clock_scale, + max_blockdep, + ): + accelerator_config = accelerator_config.lower() + self.vela_config = vela_config + self.accelerator_config = accelerator_config + if not self.accelerator_config in ArchitectureFeatures.accelerator_configs: + raise Exception("Unknown accelerator configuration " + self.accelerator_config) + accel_config = ArchitectureFeatures.accelerator_configs[self.accelerator_config] + self.config = accel_config + + self.system_config = system_config + + is_yoda_system = "yoda-" in self.accelerator_config + + if is_yoda_system: + self.sram_size = 256 * 1024 + else: + self.sram_size = 200 * 1024 * 1024 + + self.ncores = accel_config.cores + self.ofm_ublock = accel_config.ofm_ublock + self.ifm_ublock = accel_config.ifm_ublock + self.subkernel_max = Block(8, 8, 65536) + self.ofm_block_max = Block(64, 32, 128) + self.override_block_config = override_block_config + self.block_config_limit = block_config_limit + + self.global_memory_clock_scale = global_memory_clock_scale + if self.global_memory_clock_scale <= 0.0 or self.global_memory_clock_scale > 1.0: + raise Exception( + "Invalid global_memory_clock_scale = " + + str(self.global_memory_clock_scale) + + " (must be > 0.0 and <= 1.0)" + ) + + self.max_blockdep = max_blockdep + + dpu_min_height = accel_config.ofm_ublock.height + dpu_min_width = accel_config.ofm_ublock.width + dpu_dot_product_width = 8 + dpu_min_ofm_channels = accel_config.ofm_ublock.depth + + self.num_elem_wise_units = accel_config.elem_units + self.num_macs_per_cycle = dpu_min_height * dpu_min_width * dpu_dot_product_width * dpu_min_ofm_channels + + self.memory_clock_scales = np.zeros(MemArea.Size) + self.memory_port_widths = np.zeros(MemArea.Size) + + # Get system configuration + self.__read_sys_config() + + # apply the global memory clock scales to the individual ones from the system config + for mem in MemArea.all(): + self.memory_clock_scales[mem] *= self.global_memory_clock_scale + + self.memory_clocks = self.memory_clock_scales * self.npu_clock + self.memory_bandwidths_per_cycle = self.memory_port_widths * self.memory_clock_scales / 8 + + if dram_bandwidth != 0: + self.memory_bandwidths_per_cycle[MemArea.Dram] = dram_bandwidth * 1e9 / self.npu_clock + + self.memory_bandwidths_per_second = self.memory_bandwidths_per_cycle * self.npu_clock + + # sizes as N x H x W x C. 
we need to round up to these when allocating storage + self.storage_rounding_quantums = { + TensorFormat.Unknown: (1, 1, 1, 1), + TensorFormat.WeightsCompressed: (1, 1, 1, 1), + TensorFormat.NHWC: (1, 1, 1, 1), + TensorFormat.NHCWB16: (1, 1, 1, 16), + } + + # brick sizes as N x H x W x C. We have to fetch whole bricks at a time + self.brick_sizes = { + TensorFormat.Unknown: (1, 1, 1, 1), + TensorFormat.WeightsCompressed: (1, 1, 1, 1), + TensorFormat.NHWC: (1, 1, 1, 1), + TensorFormat.NHCWB16: (1, 1, 1, 16), + } + + self.inter_pass_cycle_delay = inter_pass_cycle_delay + + self.default_weight_format = TensorFormat.WeightsCompressed + self.default_feature_map_format = TensorFormat.NHWC + + if permanent_storage != MemArea.OffChipFlash: + self.permanent_storage_mem_area = permanent_storage + + self.tensor_storage_mem_area = { + # permanent mem_area + TensorPurpose.Weights: self.permanent_storage_mem_area, + TensorPurpose.FeatureMap: self.feature_map_storage_mem_area, + } + + self.tensor_load_mem_area = dict(self.tensor_storage_mem_area) + + if self.tensor_storage_mem_area[TensorPurpose.Weights] in (MemArea.OffChipFlash,): + self.tensor_load_mem_area[TensorPurpose.Weights] = MemArea.Sram + + self.min_block_sizes = { + NpuBlockType.Default: (dpu_min_height, dpu_min_width), + NpuBlockType.VectorProduct: (1, 1), + NpuBlockType.ConvolutionMxN: (dpu_min_height, dpu_min_width), + NpuBlockType.Pooling: (dpu_min_height, dpu_min_width), + NpuBlockType.ConvolutionDepthWise: (dpu_min_height, dpu_min_width), + NpuBlockType.ElementWise: (1, 1), + } + + self.sub_kernel_limits = { + NpuBlockType.Default: (8, 8), + NpuBlockType.VectorProduct: (1, 1), + NpuBlockType.ConvolutionMxN: (8, 8), + NpuBlockType.Pooling: (8, 8), + NpuBlockType.ConvolutionDepthWise: (8, 8), + NpuBlockType.ElementWise: (1, 1), + } + + # weights for scheduler search + from .npu_performance import make_bandwidth_array + + self.bandwidth_weights = make_bandwidth_array() + self.bandwidth_weights[MemArea.Sram] = 1.0 + self.bandwidth_weights[MemArea.Dram] = 10.0 + self.bandwidth_weights[MemArea.OnChipFlash] = 2.0 + self.bandwidth_weights[MemArea.OffChipFlash] = 20.0 + self.cycles_weight = 40 + self.max_sram_used_weight = 1000 + + if is_yoda_system: + self.max_sram_used_weight = 0 + + # Shared Buffer Block allocations + self.shram_bank_size = 1024 # bytes + self.shram_size_bytes = accel_config.shram_banks * self.shram_bank_size + self.shram_reserved_output_banks = 2 + self.shram_reserved_weight_banks = 0 + self.shram_reserved_unused_banks = 2 if accel_config.shram_banks > 16 else 0 + self.shram_total_banks = accel_config.shram_banks - self.shram_reserved_unused_banks + self.shram_bank_granules = np.array(accel_config.shram_granules, np.int32) + + # Build a map of acceptable IFM/OFM block configurations up to the maximum + # IFM/OFM block size. + ifm_block_max = self.get_ifm_block_size(32, self.ofm_block_max, Kernel(8, 8)) + self.block_config_map = dict() + self.generate_block_config_map(Block(ifm_block_max.width, ifm_block_max.height, 128)) + + # Setup supported operators and restriction checkers class + self.supported_operators = SupportedOperators() + + # Calculate block configuration for ALL known IFM operations and + # accumulator sizes. Consumers will need to select their preferred + # operation and bit-width at read-time. 
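# A minimal standalone sketch of the shared-buffer bank arithmetic that the
# generate_block_config() method defined just below performs for one block shape.
# The helper name sketch_block_banks() is hypothetical; it assumes round_up_divide()
# is ceiling division and round_up() rounds to a multiple of its second argument.
# The 1024-byte bank size matches self.shram_bank_size above, and the granule list
# is the ethos-u55-128 entry from accelerator_configs.
import numpy as np

def sketch_block_banks(width, height, depth, bank_granules, bank_size=1024):
    # Bits per element for IFM8, IFM16, IFM8_Elementwise, IFM16_Elementwise,
    # Acc16, Acc32 and Acc40 (the SHRAMElements.BitSizes order)
    bit_sizes = np.array([8, 16, 8, 16, 16, 32, 40], np.int32)
    size_bytes = (bit_sizes * (height * width * depth)) // 8
    size_banks = -(-size_bytes // bank_size)   # ceiling division, one count per element type
    size_banks *= 2                            # double-buffer the IFM/accumulator data
    granules = np.array(bank_granules, np.int32)
    return ((size_banks + granules - 1) // granules) * granules  # round up to the granule

# For an 8x8x16 block with the ethos-u55-128 granules this gives 4 banks for 8-bit IFM
# data, 8 banks for 32-bit accumulators and 12 banks for 40-bit accumulators once
# double-buffered and granule-aligned.
print(sketch_block_banks(8, 8, 16, [4, 4, 4, 4, 4, 8, 12]))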
+ def generate_block_config(self, width, height, depth): + # Number of bytes required for any SRAM element for a FM of given dimensions + size_bytes = (SHRAMElements.BitSizes * (height * width * depth)) // 8 + # Convert byte size (rounded) to size in banks + size_banks = round_up_divide(size_bytes, self.shram_bank_size) + size_banks *= 2 # Double buffer the IFM/Acc (need twice as many banks) + # Round bank requirement to bank granularity + required_banks = round_up(size_banks, self.shram_bank_granules) + return SHRAMBlockConfig(size_bytes, required_banks) + + @staticmethod + def make_block_config_key(width, height, depth): + return (int(height), int(width), int(depth)) + + def get_block_config(self, width, height, depth): + assert depth <= self.ofm_block_max.depth + key = ArchitectureFeatures.make_block_config_key(width, height, depth) + config = self.block_config_map.get(key, None) + return config + + # Generate a key:value map of possible block configurations, where the + # key is compounded from the block dimensions: 0x00HHWWCC + def generate_block_config_map(self, block: Block): + for h in range(1, block.height + 1): + for w in range(1, block.width + 1): + # All possible IFM/OFM depth values + for c in [4, 8, 12, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128]: + key = ArchitectureFeatures.make_block_config_key(w, h, c) + self.block_config_map[key] = self.generate_block_config(w, h, c) + + def calc_ifm_block_depth(self, ifm_depth, ifm_bits): + assert ifm_bits == 8 or ifm_bits == 16 + assert ifm_depth > 0 + ifm_depth = round_up(ifm_depth, self.ifm_ublock.depth) + max_block_depth = 32 if ifm_bits == 8 else 16 + return min(max_block_depth, ifm_depth) + + # Calculate the size of the IFM block given a depth, target OFM block and a kernel + def get_ifm_block_size( + self, ifm_block_depth, ofm_block: Block, kernel: Kernel, subkernel: Block = Block(8, 8, 65536) + ): + upscaling = 1 + # Height + ifm_odd_2x_height_enable = 0 + dilated_kernel_height = ((kernel.height - 1) * kernel.dilation.y) + 1 + ifm_block_height = ( + (ofm_block.height - 1) * kernel.stride.y + + min(subkernel.height, dilated_kernel_height) + + ifm_odd_2x_height_enable + ) // upscaling + + if kernel.stride.y == 1: + ifm_block_height = round_up(ifm_block_height, self.ofm_ublock.height) + elif kernel.stride.y == 2: + if (self.ofm_ublock.height == 2) and (ifm_block_height % 4 == 2): + ifm_block_height = ifm_block_height + 2 + else: + ifm_block_height = round_up(ifm_block_height, self.ofm_ublock.height) + else: + assert False + + # Width + ifm_odd_2x_width_enable = 0 + dilated_kernel_width = ((kernel.width - 1) * kernel.dilation.x) + 1 + ifm_block_width = ( + (ofm_block.width - 1) * kernel.stride.x + + min(subkernel.width, dilated_kernel_width) + + ifm_odd_2x_width_enable + ) // upscaling + + if kernel.stride.x == 1: + ifm_block_width = round_up(ifm_block_width, self.ofm_ublock.width) + elif kernel.stride.x == 2: + if (self.ofm_ublock.width == 2) and (ifm_block_width % 4 == 2): + ifm_block_width = ifm_block_width + 2 + else: + ifm_block_width = round_up(ifm_block_width, self.ofm_ublock.width) + else: + assert False + + return Block(ifm_block_width, ifm_block_height, ifm_block_depth) + + @staticmethod + def intersects(start_a, end_a, start_b, end_b): + start_x = max(start_a[0], start_b[0]) + end_x = min(end_a[0], end_b[0]) + start_y = max(start_a[1], start_b[1]) + end_y = min(end_a[1], end_b[1]) + start_z = max(start_a[2], start_b[2]) + end_z = min(end_a[2], end_b[2]) + return ((end_x - start_x) > 0) and ((end_y - 
start_y) > 0) and ((end_z - start_z) > 0) + + # Block job dependency: + # Does the VOLUME of IFMs for block job B(0) overlap with VOLUME of OFMs block jobs A(8,9,10) + # + # A | B + # ----------------------+------------------ + # .... 3,4,5,6,7,8,9,10 | 0,1,2,3,4,5,6,8 10 < JOB NUMBER + # |<------->| dependency offset + # + MAX_BLOCKDEP = 3 + + # Get the coordinates of a block offset from either the end (negative) + # or the start (zero or positive) of the given 3d area + def get_offset_block_coords(self, area: Rect, block: Block, offset): + size = area.size() + # Dimensions of the region, in blocks + width_blocks = round_up_divide(size.width, block.width) + height_blocks = round_up_divide(size.height, block.height) + depth_blocks = round_up_divide(size.depth, block.depth) + total_blocks = width_blocks * height_blocks * depth_blocks + if offset < 0: + index = total_blocks + offset + else: + index = offset + + if index >= total_blocks: + return None + + # Coordinates of the indexed block + coord_z = block.depth * (index % depth_blocks) + coord_y = block.height * (index // (depth_blocks * width_blocks)) + coord_x = block.width * ((index // depth_blocks) % width_blocks) + + return (coord_x + area.x, coord_y + area.y, coord_z + area.z) + + def get_first_job_input_volume( + self, ifm: Rect, ofm: Rect, ifm_block_depth, ofm_block: Block, kernel: Kernel, padLT, block_offset + ): + # Get ifm block size (jobs are invisibly decomposed into subkernels) + ifm_block = self.get_ifm_block_size(ifm_block_depth, ofm_block, kernel, self.ofm_block_max) + ifm_depth_blocks = round_up_divide(ifm.size().depth, ifm_block_depth) + + # Which OFM block are we calculating + ofm_coord = self.get_offset_block_coords(ofm, ofm_block, block_offset // ifm_depth_blocks) + if ofm_coord is None: + return None + + # Coordinate of the source IFM block + ifm_coord_x = max(0, ofm_coord[0] * kernel.stride.x - padLT[0]) + ifm_coord_y = max(0, ofm_coord[1] * kernel.stride.y - padLT[1]) + ifm_coord_z = ifm.z + (block_offset % ifm_depth_blocks) * ifm_block.depth + + # IFM block that will be sampled for the FIRST+block_offset job in the next operator's OFM + start_coord = (ifm_coord_x, ifm_coord_y, ifm_coord_z) + end_coord = ( + start_coord[0] + ifm_block.width, + start_coord[1] + ifm_block.height, + start_coord[2] + ifm_block.depth, + ) + + return (start_coord, end_coord, 1) # start, end, total jobs + + def get_prev_job_output_volume( + self, ifm: Block, ofm: Rect, ifm_block_depth, ofm_block: Block, kernel: Kernel, block_offset + ): + assert block_offset >= 0 + + # Get OFM block's volume coordinates + start_coord = self.get_offset_block_coords(ofm, ofm_block, -1 - block_offset) + if start_coord is None: + return None + end_coord = ( + start_coord[0] + ofm_block.width, + start_coord[1] + ofm_block.height, + start_coord[2] + ofm_block.depth, + ) + + # Calculate how many IFM blocks this OFM block requires (i.e how many jobs) + ifm_block = self.get_ifm_block_size(ifm_block_depth, ofm_block, kernel, self.ofm_block_max) + ifm_depth_blocks = round_up_divide(ifm.size().depth, ifm_block_depth) + ifm_depth_blocks = 1 # Overwrite with 1 to force OFM block dependency, not IFM + + return (start_coord, end_coord, ifm_depth_blocks) # start, end, total jobs for this OFM block + + def calc_block_dep( + self, + prev_ifm: Block, + prev_ofm: Block, + prev_ifm_block_depth, + prev_ofm_block: Block, + prev_kernel: Kernel, + ifm: Block, + ofm: Block, + ifm_block_depth, + ofm_block: Block, + kernel: Kernel, + padLT, + ): + + blockdep = 
ArchitectureFeatures.MAX_BLOCKDEP + + # Iterate over the next BLOCKDEP inputs, checking to see if a sliding window + # of IFM area overlaps with any previous OFM block generation. + elapsed_jobs = 0 + ifm_depth = ifm.size().depth + for forward_offset in range(ArchitectureFeatures.MAX_BLOCKDEP): + # This is the IFM block we want to sample from + in_area = self.get_first_job_input_volume( + ifm, ofm, ifm_block_depth, ofm_block, kernel, padLT, forward_offset + ) + if in_area is None: + break + + # Try several previous-OFM blocks in the past (they still might comprise multiple IFM jobs) + outstanding_jobs = 0 + for block_offset in range(ArchitectureFeatures.MAX_BLOCKDEP): + # This is the OFM block being generated by the previous op + out_area = self.get_prev_job_output_volume( + prev_ifm, prev_ofm, prev_ifm_block_depth, prev_ofm_block, prev_kernel, block_offset + ) + if out_area is None: + break + + # Block dependency is the max number of allowed outstanding jobs + # in the pipeline. Selected by determining how many jobs occur + # in between two operators' overlapping OFM->IFM block volumes + if ArchitectureFeatures.intersects(in_area[0], in_area[1], out_area[0], out_area[1]): + break + # Early exit if no intersections and we've seen enough jobs in the pipeline + elif outstanding_jobs > ArchitectureFeatures.MAX_BLOCKDEP: + break + + # This OFM had this many jobs (accumulate over multiple OFM blocks) + outstanding_jobs += out_area[2] + + blockdep = min(blockdep, elapsed_jobs + outstanding_jobs) + elapsed_jobs += in_area[2] + # Early exit if no intersections and we've seen enough jobs in the pipeline + if elapsed_jobs > ArchitectureFeatures.MAX_BLOCKDEP: + break + + return blockdep + + def cpu_cycle_estimate(self, op): + """ + Gets estimated performance of a CPU operation, based on a linear model of intercept, slope, + specified in the vela config file, in ConfigParser file format (.ini file). + Example configuration snippet: + [CpuPerformance.MyOperationType] + Cortex-Mx.intercept= + Cortex-Mx.slope= + """ + section = "CpuPerformance." + op.type + if self.vela_config is not None and section in self.vela_config: + op_config = self.vela_config[section] + try: + intercept = float(op_config.get(self.cpu_config + ".intercept", op_config["default.intercept"])) + slope = float(op_config.get(self.cpu_config + ".slope", op_config["default.slope"])) + n_elements = op.inputs[0].elements() + cycles = intercept + n_elements * slope + return cycles + except: + print("Error: Reading CPU cycle estimate in vela configuration file, section {}".format(section)) + raise + + print("Warning: No configured CPU performance estimate for", op.type) + return 0 + + def __read_sys_config(self): + """ + Gets the system configuration with the given name from the vela configuration file + Example configuration snippet: + [SysConfig.MyConfigName] + npu_freq= + cpu=Cortex-Mx + ... + """ + # Get system configuration from the vela configuration file + if self.vela_config is None: + print("Warning: Using default values for system configuration") + else: + section_key = "SysConfig." 
+ self.system_config + if not section_key in self.vela_config: + raise Exception("Unknown system configuration " + self.system_config) + + try: + self.npu_clock = float(self.__sys_config("npu_freq", "500e6")) + self.cpu_config = self.__sys_config("cpu", "Cortex-M7") + + self.memory_clock_scales[MemArea.Sram] = float(self.__sys_config("Sram_clock_scale", "1")) + self.memory_port_widths[MemArea.Sram] = int(self.__sys_config("Sram_port_width", "64")) + + self.memory_clock_scales[MemArea.OnChipFlash] = float(self.__sys_config("OnChipFlash_clock_scale", "1")) + self.memory_port_widths[MemArea.OnChipFlash] = int(self.__sys_config("OnChipFlash_port_width", "64")) + + self.memory_clock_scales[MemArea.OffChipFlash] = float( + self.__sys_config("OffChipFlash_clock_scale", "0.25") + ) + self.memory_port_widths[MemArea.OffChipFlash] = int(self.__sys_config("OffChipFlash_port_width", "32")) + + self.memory_clock_scales[MemArea.Dram] = float(self.__sys_config("Dram_clock_scale", "1")) + self.memory_port_widths[MemArea.Dram] = int(self.__sys_config("Dram_port_width", "32")) + + self.fast_storage_mem_area = MemArea[self.__sys_config("fast_storage_mem_area", "Sram")] + self.feature_map_storage_mem_area = MemArea[self.__sys_config("feature_map_storage_mem_area", "Sram")] + self.permanent_storage_mem_area = MemArea[self.__sys_config("permanent_storage_mem_area", "OffChipFlash")] + if self.permanent_storage_mem_area not in set((MemArea.OnChipFlash, MemArea.OffChipFlash)): + raise Exception( + "Invalid permanent_storage_mem_area = " + + str(self.permanent_storage_mem_area) + + " (must be 'OnChipFlash' or 'OffChipFlash'). To store the weights and other constant data in SRAM" + " select 'OnChipFlash'" + ) + except: + print("Error: Reading System Configuration in vela configuration file, section {}".format(section_key)) + raise + + def __sys_config(self, key, default_value): + """ + Gets the system configuration value with the given key from the vela config file. + """ + if self.vela_config is None: + return default_value + section = "SysConfig." + self.system_config + result = self.vela_config[section].get(key, None) + if result is None: + raise Exception("Error: System Configuration Missing key {} in section [{}] ".format(key, section)) + return result diff --git a/ethosu/vela/compiler_driver.py b/ethosu/vela/compiler_driver.py new file mode 100644 index 00000000..7f8c4ca4 --- /dev/null +++ b/ethosu/vela/compiler_driver.py @@ -0,0 +1,204 @@ +# Copyright (C) 2020 Arm Limited or its affiliates. All rights reserved. +# +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +# Description: +# Contains the main sequencing of the compiler. + +from . import graph_optimiser +from . import mark_tensors +from . import insert_dma +from . import pass_packing +from . import scheduler +from . import tensor_allocation +from . import npu_performance +import time + +from . import high_level_command_stream +from . import high_level_command_stream_generator +from . 
import register_command_stream_generator +from . import extract_npu_subgraphs +from . import npu_serialisation +from . import weight_compressor +from . import live_range +from .tensor import MemArea +from .nn_graph import TensorAllocator, PassPlacement +from .rewrite_graph import verify_graph_health, verify_subgraph_health + + +class CompilerOptions: + """Set of options to change compiler behaviour - verbosity, targets, turning off passes. + +Note the difference between ArchitectureFeatures and CompilerOptions +- ArchitectureFeatures is for changing the Ethos-U55 and system architecture +- CompilerOptions is for changing the behaviour of the compiler +""" + + def __init__( + self, + verbose_graph=False, + verbose_quantization=False, + verbose_packing=False, + verbose_tensor_purpose=False, + verbose_tensor_format=False, + verbose_allocation=False, + verbose_high_level_command_stream=False, + verbose_register_command_stream=False, + verbose_operators=False, + show_minimum_possible_allocation=False, + show_cpu_operations=False, + tensor_allocator=TensorAllocator.Greedy, + timing=False, + output_dir="outputs", + ): + + self.verbose_graph = verbose_graph + self.verbose_quantization = verbose_quantization + self.verbose_packing = verbose_packing + self.verbose_tensor_purpose = verbose_tensor_purpose + self.verbose_tensor_format = verbose_tensor_format + self.verbose_allocation = verbose_allocation + self.verbose_high_level_command_stream = verbose_high_level_command_stream + self.verbose_register_command_stream = verbose_register_command_stream + self.verbose_operators = verbose_operators + self.show_minimum_possible_allocation = show_minimum_possible_allocation + self.show_cpu_operations = show_cpu_operations + self.tensor_allocator = tensor_allocator + self.timing = timing + self.output_dir = output_dir + + def __str__(self): + return type(self).__name__ + ": " + str(self.__dict__) + + __repr__ = __str__ + + +def compiler_driver(nng, arch, options, scheduler_options): + assert verify_graph_health(nng) + nng = graph_optimiser.optimise_graph_a(nng, arch, options.verbose_graph) + assert verify_graph_health(nng) + + if options.verbose_quantization: + nng.print_graph_with_tensor_quantization() + + nng = graph_optimiser.optimise_graph_b(nng, arch, options.verbose_graph) + assert verify_graph_health(nng) + + nng = mark_tensors.mark_tensor_purpose(nng, arch, options.verbose_tensor_purpose) + assert verify_graph_health(nng) + nng = insert_dma.insert_dma_commands(nng, arch, options.verbose_graph) + assert verify_graph_health(nng) + pass_packing.pack_into_passes(nng, arch, options.verbose_packing) + assert verify_graph_health(nng) + + extract_npu_subgraphs.extract_npu_subgraphs(nng, arch) + + mark_tensors.mark_tensor_format(nng, arch, options.verbose_tensor_format) + assert verify_graph_health(nng) + if options.timing: + start = time.time() + + # Run the scheduler + scheduler.schedule_passes(nng, arch, scheduler_options) + + if options.timing: + stop = time.time() + print("Scheduling took %f s" % (stop - start)) + start = time.time() + + # Update the compressed weights now that we have determined the + # block config, and calc and pack the scales and biases + weight_compressor.update_pass_weight_and_scale_tensors(nng, arch) + + # Memory area for all non-constant tensors (Cpu and Npu) + non_const_mem_area = MemArea.Sram + + # LiveRanges for constant tensors for all Npu subgraphs + permanent_storage = arch.permanent_storage_mem_area + lr_graph_flash = live_range.LiveRangeGraph() + + # Placeholders for 
scratch and flash tensors that are common for all Npu subgraphs + scratch_tens = None + flash_tens = None + + # Calculate live ranges for all constant Npu tensors, in permanent storage + for sg in nng.subgraphs: + if sg.placement == PassPlacement.Npu: + lr_graph_flash = live_range.extract_live_ranges_from_cascaded_passes( + sg, permanent_storage, ignore_subgraph_input_output_tensors=True, lr_graph=lr_graph_flash + ) + + # Allocate all Npu constant tensors to the first Npu subgraph since it is + # processed first during serialization into tensors + first_npu_sg = nng.subgraphs[1] + assert first_npu_sg.placement == PassPlacement.Npu + tensor_allocation.allocate_tensors( + nng, + first_npu_sg, + arch, + permanent_storage, + scheduler_options.use_ifm_ofm_overlap, + options.tensor_allocator, + options.verbose_allocation, + options.show_minimum_possible_allocation, + lr_graph_flash, + ) + + # Allocate all non-constant tensors to the root, i.e. Cpu, subgraph. This step + # will start at the root subgraph's input and traverse from top to bottom. When + # it comes across an Npu-op it will extract live ranges for it's corresponding + # Npu subgraph and add them to the root's live range graph. Finally, all of the + # non-constant tensors are allocated together + root_sg = nng.get_root_subgraph() + tensor_allocation.allocate_tensors( + nng, + root_sg, + arch, + non_const_mem_area, + scheduler_options.use_ifm_ofm_overlap, + options.tensor_allocator, + options.verbose_allocation, + options.show_minimum_possible_allocation, + ) + + # Generate command streams and serialise Npu-ops into tensors + for sg in nng.subgraphs: + high_level_command_stream_generator.generate_high_level_command_stream( + nng, sg, arch, options.verbose_high_level_command_stream + ) + register_command_stream_generator.generate_register_command_stream( + nng, sg, arch, options.verbose_register_command_stream + ) + scratch_tens, flash_tens = npu_serialisation.serialise_npu_subgraph_into_tensors( + nng, sg, arch, scratch_tens, flash_tens + ) + + npu_serialisation.rewrite_npu_call_ops(nng, root_sg, arch) + + # Allocate all Cpu constant tensors, this is done last because the Npu-ops + # have to be serialized into flash and scratch tensors first + tensor_allocation.allocate_tensors( + nng, + root_sg, + arch, + permanent_storage, + scheduler_options.use_ifm_ofm_overlap, + options.tensor_allocator, + options.verbose_allocation, + options.show_minimum_possible_allocation, + ) + + npu_performance.calc_performance_for_network(nng, arch) diff --git a/ethosu/vela/data_type.py b/ethosu/vela/data_type.py new file mode 100644 index 00000000..1d3e94ed --- /dev/null +++ b/ethosu/vela/data_type.py @@ -0,0 +1,116 @@ +# Copyright (C) 2020 Arm Limited or its affiliates. All rights reserved. +# +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +# Description: +# Defines the basic numeric type classes for tensors. 
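# A short usage sketch of the DataType class defined below in this module, assuming
# the ethosu.vela package added by this patch is importable and that round_up_divide()
# is ceiling division. Each predefined type carries a base type and a bit width, prints
# as a short name built from its stem, and reports its storage size in whole bytes.
from ethosu.vela.data_type import BaseType, DataType

assert str(DataType.int16) == "int16"                       # stem "int%s" formatted with the bit width
assert DataType.int16.size_in_bytes() == 2                  # 16 bits -> 2 bytes
assert DataType.quint4.size_in_bytes() == 1                 # sub-byte types round up to one byte
assert DataType.uint8 == DataType(BaseType.UnsignedInt, 8)  # equality compares (type, bits)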
+ +from .numeric_util import round_up_divide +import enum + + +class BaseType(enum.Flag): + Signed = 1 + Unsigned = 2 + Asymmetric = 4 + Int = 8 + SignedInt = Int | Signed + UnsignedInt = Int | Unsigned + AsymmSInt = Int | Asymmetric | Signed + AsymmUInt = Int | Asymmetric | Unsigned + Float = 16 + BFloat = 32 + Bool = 64 + String = 128 + Resource = 256 + Variant = 512 + + +class DataType: + """Defines a data type. Consists of a base type, and the number of bits used for this type""" + + __slots__ = "type", "bits" + + def __init__(self, type_, bits): + self.type = type_ + self.bits = bits + + def __eq__(self, other): + return self.type == other.type and self.bits == other.bits + + def __hash__(self): + return hash((self.type, self.bits)) + + def size_in_bytes(self): + return round_up_divide(self.bits, 8) + + def size_in_bits(self): + return self.bits + + def __str__(self): + stem, needs_format = DataType.stem_name[self.type] + if not needs_format: + return stem + else: + return stem % (self.bits,) + + __repr__ = __str__ + + stem_name = { + BaseType.UnsignedInt: ("uint%s", True), + BaseType.SignedInt: ("int%s", True), + BaseType.AsymmUInt: ("quint%s", True), + BaseType.AsymmSInt: ("qint%s", True), + BaseType.Float: ("float%s", True), + BaseType.BFloat: ("bfloat%s", True), + BaseType.Bool: ("bool", False), + BaseType.String: ("string", False), + BaseType.Resource: ("resource", False), + BaseType.Variant: ("variant", False), + } + + +# generate the standard set of data types +DataType.int8 = DataType(BaseType.SignedInt, 8) +DataType.int16 = DataType(BaseType.SignedInt, 16) +DataType.int32 = DataType(BaseType.SignedInt, 32) +DataType.int64 = DataType(BaseType.SignedInt, 64) + +DataType.uint8 = DataType(BaseType.UnsignedInt, 8) +DataType.uint16 = DataType(BaseType.UnsignedInt, 16) +DataType.uint32 = DataType(BaseType.UnsignedInt, 32) +DataType.uint64 = DataType(BaseType.UnsignedInt, 64) + +DataType.quint4 = DataType(BaseType.AsymmUInt, 4) +DataType.quint8 = DataType(BaseType.AsymmUInt, 8) +DataType.quint12 = DataType(BaseType.AsymmUInt, 12) +DataType.quint16 = DataType(BaseType.AsymmUInt, 16) +DataType.quint32 = DataType(BaseType.AsymmUInt, 32) + +DataType.qint4 = DataType(BaseType.AsymmSInt, 4) +DataType.qint8 = DataType(BaseType.AsymmSInt, 8) +DataType.qint12 = DataType(BaseType.AsymmSInt, 12) +DataType.qint16 = DataType(BaseType.AsymmSInt, 16) +DataType.qint32 = DataType(BaseType.AsymmSInt, 32) + +DataType.float16 = DataType(BaseType.Float, 16) +DataType.float32 = DataType(BaseType.Float, 32) +DataType.float64 = DataType(BaseType.Float, 64) + +DataType.string = DataType(BaseType.String, 64) +DataType.bool = DataType(BaseType.Bool, 8) +DataType.resource = DataType(BaseType.Resource, 8) +DataType.variant = DataType(BaseType.Variant, 8) diff --git a/ethosu/vela/driver_actions.py b/ethosu/vela/driver_actions.py new file mode 100644 index 00000000..86c4a369 --- /dev/null +++ b/ethosu/vela/driver_actions.py @@ -0,0 +1,107 @@ +# Copyright (C) 2020 Arm Limited or its affiliates. All rights reserved. +# +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + + +# Description: +# Creates driver actions that are embedded in the custom operator payload. + +import numpy as np +from typing import List +from .ethos_u55_regs.ethos_u55_regs import * + + +class DACommands: + Reserved = 0x00 + Config = 0x01 + Config_PatchShift = 4 + CmdStream = 0x02 + ReadAPB = 0x03 + ReadAPB_CountShift = 12 + ReadAPB_IndexMask = (1 << ReadAPB_CountShift) - 1 + DumpSHRAM = 0x04 + NOP = 0x05 + + +def make_da_tag(id: int, reserved: int, param: int) -> int: + tag: int = id + tag |= reserved << 8 + tag |= param << 16 + return tag + + +def emit_fourcc(data: List[int], fourcc: str): + assert data != None + assert fourcc != None + assert len(fourcc) == 4 + value: int = 0 + value = fourcc[0].encode()[0] + value |= fourcc[1].encode()[0] << 8 + value |= fourcc[2].encode()[0] << 16 + value |= fourcc[3].encode()[0] << 24 + data.append(value) + + +def build_id_word(): + arch_major_rev, arch_minor_rev, arch_patch_rev = (int(x) for x in ARCH_VER.split(".")) + n = id_r() + n.set_arch_major_rev(arch_major_rev) + n.set_arch_minor_rev(arch_minor_rev) + n.set_arch_patch_rev(arch_patch_rev) + return n.word + + +def build_config_word(arch): + macs_cc = arch.config.macs + log2_macs_cc = int(np.log2(macs_cc) + 0.5) + shram_size = int(arch.shram_size_bytes / 1024) + n = config_r() + n.set_shram_size(shram_size) + n.set_cmd_stream_version(0) # may be incremented in the future + n.set_macs_per_cc(log2_macs_cc) + return n.word + + +def emit_config(data: List[int], rel: int, patch: int, arch): + assert data != None + data.append(make_da_tag(DACommands.Config, 0, (patch << DACommands.Config_PatchShift) | rel)) + data.append(build_config_word(arch)) + data.append(build_id_word()) + + +def emit_cmd_stream_header(data: List[int], length: int): + assert data != None + # Insert NOPs to align start of command stream to 16 bytes + num_nops = 4 - ((len(data) + 1) % 4) + for _ in range(num_nops): + data.append(make_da_tag(DACommands.NOP, 0, 0)) + + # Use the reserved 8 bit as the length high + length_high = (length & 0x00FF0000) >> 16 + length_low = length & 0x0000FFFF + data.append(make_da_tag(DACommands.CmdStream, length_high, length_low)) + + +def emit_reg_read(data: List[int], reg_index: int, reg_count: int = 1): + assert data != None + assert reg_index >= 0 + assert reg_count >= 1 + payload: int = (reg_index & DACommands.ReadAPB_IndexMask) | ((reg_count << DACommands.ReadAPB_CountShift) - 1) + data.append(make_da_tag(DACommands.ReadAPB, 0, payload)) + + +def emit_dump_shram(data: List[int]): + assert data != None + data.append(make_da_tag(DACommands.DumpSHRAM, 0, 0)) diff --git a/ethosu/vela/ethos_u55_regs/ethos_u55_regs.py b/ethosu/vela/ethos_u55_regs/ethos_u55_regs.py new file mode 100644 index 00000000..37f7a67a --- /dev/null +++ b/ethosu/vela/ethos_u55_regs/ethos_u55_regs.py @@ -0,0 +1,3138 @@ +# Copyright (C) 2020 Arm Limited or its affiliates. All rights reserved. +# +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
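# A small standalone sketch of the 32-bit driver-action tag layout produced by
# make_da_tag() in driver_actions.py above: command id in bits [7:0], the "reserved"
# byte in bits [15:8] and a 16-bit parameter in bits [31:16]. emit_cmd_stream_header()
# reuses the reserved byte to carry bits [23:16] of the command-stream length.
# The helper name sketch_da_tag() and the example length value are hypothetical.
def sketch_da_tag(cmd_id, reserved, param):
    return cmd_id | (reserved << 8) | (param << 16)

CMD_STREAM = 0x02   # DACommands.CmdStream
length = 0x123456   # arbitrary 24-bit length value for illustration
tag = sketch_da_tag(CMD_STREAM, (length & 0x00FF0000) >> 16, length & 0x0000FFFF)
assert tag == 0x34561202  # param 0x3456 | reserved 0x12 | id 0x02

# emit_fourcc() packs four ASCII characters little-endian into one word, e.g.:
assert (ord('a') | (ord('b') << 8) | (ord('c') << 16) | (ord('d') << 24)) == 0x64636261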
+# See the License for the specific language governing permissions and +# limitations under the License. + +from ctypes import * +from enum import Enum + +ARCH_VER = '0.154.0' + + +class DEBUG_INTERNAL(Enum): + SHARED_BUFFER0 = 0x0400 + SHARED_BUFFER1 = 0x0404 + SHARED_BUFFER2 = 0x0408 + SHARED_BUFFER3 = 0x040C + SHARED_BUFFER4 = 0x0410 + SHARED_BUFFER5 = 0x0414 + SHARED_BUFFER6 = 0x0418 + SHARED_BUFFER7 = 0x041C + SHARED_BUFFER8 = 0x0420 + SHARED_BUFFER9 = 0x0424 + SHARED_BUFFER10 = 0x0428 + SHARED_BUFFER11 = 0x042C + SHARED_BUFFER12 = 0x0430 + SHARED_BUFFER13 = 0x0434 + SHARED_BUFFER14 = 0x0438 + SHARED_BUFFER15 = 0x043C + SHARED_BUFFER16 = 0x0440 + SHARED_BUFFER17 = 0x0444 + SHARED_BUFFER18 = 0x0448 + SHARED_BUFFER19 = 0x044C + SHARED_BUFFER20 = 0x0450 + SHARED_BUFFER21 = 0x0454 + SHARED_BUFFER22 = 0x0458 + SHARED_BUFFER23 = 0x045C + SHARED_BUFFER24 = 0x0460 + SHARED_BUFFER25 = 0x0464 + SHARED_BUFFER26 = 0x0468 + SHARED_BUFFER27 = 0x046C + SHARED_BUFFER28 = 0x0470 + SHARED_BUFFER29 = 0x0474 + SHARED_BUFFER30 = 0x0478 + SHARED_BUFFER31 = 0x047C + SHARED_BUFFER32 = 0x0480 + SHARED_BUFFER33 = 0x0484 + SHARED_BUFFER34 = 0x0488 + SHARED_BUFFER35 = 0x048C + SHARED_BUFFER36 = 0x0490 + SHARED_BUFFER37 = 0x0494 + SHARED_BUFFER38 = 0x0498 + SHARED_BUFFER39 = 0x049C + SHARED_BUFFER40 = 0x04A0 + SHARED_BUFFER41 = 0x04A4 + SHARED_BUFFER42 = 0x04A8 + SHARED_BUFFER43 = 0x04AC + SHARED_BUFFER44 = 0x04B0 + SHARED_BUFFER45 = 0x04B4 + SHARED_BUFFER46 = 0x04B8 + SHARED_BUFFER47 = 0x04BC + SHARED_BUFFER48 = 0x04C0 + SHARED_BUFFER49 = 0x04C4 + SHARED_BUFFER50 = 0x04C8 + SHARED_BUFFER51 = 0x04CC + SHARED_BUFFER52 = 0x04D0 + SHARED_BUFFER53 = 0x04D4 + SHARED_BUFFER54 = 0x04D8 + SHARED_BUFFER55 = 0x04DC + SHARED_BUFFER56 = 0x04E0 + SHARED_BUFFER57 = 0x04E4 + SHARED_BUFFER58 = 0x04E8 + SHARED_BUFFER59 = 0x04EC + SHARED_BUFFER60 = 0x04F0 + SHARED_BUFFER61 = 0x04F4 + SHARED_BUFFER62 = 0x04F8 + SHARED_BUFFER63 = 0x04FC + SHARED_BUFFER64 = 0x0500 + SHARED_BUFFER65 = 0x0504 + SHARED_BUFFER66 = 0x0508 + SHARED_BUFFER67 = 0x050C + SHARED_BUFFER68 = 0x0510 + SHARED_BUFFER69 = 0x0514 + SHARED_BUFFER70 = 0x0518 + SHARED_BUFFER71 = 0x051C + SHARED_BUFFER72 = 0x0520 + SHARED_BUFFER73 = 0x0524 + SHARED_BUFFER74 = 0x0528 + SHARED_BUFFER75 = 0x052C + SHARED_BUFFER76 = 0x0530 + SHARED_BUFFER77 = 0x0534 + SHARED_BUFFER78 = 0x0538 + SHARED_BUFFER79 = 0x053C + SHARED_BUFFER80 = 0x0540 + SHARED_BUFFER81 = 0x0544 + SHARED_BUFFER82 = 0x0548 + SHARED_BUFFER83 = 0x054C + SHARED_BUFFER84 = 0x0550 + SHARED_BUFFER85 = 0x0554 + SHARED_BUFFER86 = 0x0558 + SHARED_BUFFER87 = 0x055C + SHARED_BUFFER88 = 0x0560 + SHARED_BUFFER89 = 0x0564 + SHARED_BUFFER90 = 0x0568 + SHARED_BUFFER91 = 0x056C + SHARED_BUFFER92 = 0x0570 + SHARED_BUFFER93 = 0x0574 + SHARED_BUFFER94 = 0x0578 + SHARED_BUFFER95 = 0x057C + SHARED_BUFFER96 = 0x0580 + SHARED_BUFFER97 = 0x0584 + SHARED_BUFFER98 = 0x0588 + SHARED_BUFFER99 = 0x058C + SHARED_BUFFER100 = 0x0590 + SHARED_BUFFER101 = 0x0594 + SHARED_BUFFER102 = 0x0598 + SHARED_BUFFER103 = 0x059C + SHARED_BUFFER104 = 0x05A0 + SHARED_BUFFER105 = 0x05A4 + SHARED_BUFFER106 = 0x05A8 + SHARED_BUFFER107 = 0x05AC + SHARED_BUFFER108 = 0x05B0 + SHARED_BUFFER109 = 0x05B4 + SHARED_BUFFER110 = 0x05B8 + SHARED_BUFFER111 = 0x05BC + SHARED_BUFFER112 = 0x05C0 + SHARED_BUFFER113 = 0x05C4 + SHARED_BUFFER114 = 0x05C8 + SHARED_BUFFER115 = 0x05CC + SHARED_BUFFER116 = 0x05D0 + SHARED_BUFFER117 = 0x05D4 + SHARED_BUFFER118 = 0x05D8 + SHARED_BUFFER119 = 0x05DC + SHARED_BUFFER120 = 0x05E0 + SHARED_BUFFER121 = 0x05E4 + SHARED_BUFFER122 = 0x05E8 + 
SHARED_BUFFER123 = 0x05EC + SHARED_BUFFER124 = 0x05F0 + SHARED_BUFFER125 = 0x05F4 + SHARED_BUFFER126 = 0x05F8 + SHARED_BUFFER127 = 0x05FC + SHARED_BUFFER128 = 0x0600 + SHARED_BUFFER129 = 0x0604 + SHARED_BUFFER130 = 0x0608 + SHARED_BUFFER131 = 0x060C + SHARED_BUFFER132 = 0x0610 + SHARED_BUFFER133 = 0x0614 + SHARED_BUFFER134 = 0x0618 + SHARED_BUFFER135 = 0x061C + SHARED_BUFFER136 = 0x0620 + SHARED_BUFFER137 = 0x0624 + SHARED_BUFFER138 = 0x0628 + SHARED_BUFFER139 = 0x062C + SHARED_BUFFER140 = 0x0630 + SHARED_BUFFER141 = 0x0634 + SHARED_BUFFER142 = 0x0638 + SHARED_BUFFER143 = 0x063C + SHARED_BUFFER144 = 0x0640 + SHARED_BUFFER145 = 0x0644 + SHARED_BUFFER146 = 0x0648 + SHARED_BUFFER147 = 0x064C + SHARED_BUFFER148 = 0x0650 + SHARED_BUFFER149 = 0x0654 + SHARED_BUFFER150 = 0x0658 + SHARED_BUFFER151 = 0x065C + SHARED_BUFFER152 = 0x0660 + SHARED_BUFFER153 = 0x0664 + SHARED_BUFFER154 = 0x0668 + SHARED_BUFFER155 = 0x066C + SHARED_BUFFER156 = 0x0670 + SHARED_BUFFER157 = 0x0674 + SHARED_BUFFER158 = 0x0678 + SHARED_BUFFER159 = 0x067C + SHARED_BUFFER160 = 0x0680 + SHARED_BUFFER161 = 0x0684 + SHARED_BUFFER162 = 0x0688 + SHARED_BUFFER163 = 0x068C + SHARED_BUFFER164 = 0x0690 + SHARED_BUFFER165 = 0x0694 + SHARED_BUFFER166 = 0x0698 + SHARED_BUFFER167 = 0x069C + SHARED_BUFFER168 = 0x06A0 + SHARED_BUFFER169 = 0x06A4 + SHARED_BUFFER170 = 0x06A8 + SHARED_BUFFER171 = 0x06AC + SHARED_BUFFER172 = 0x06B0 + SHARED_BUFFER173 = 0x06B4 + SHARED_BUFFER174 = 0x06B8 + SHARED_BUFFER175 = 0x06BC + SHARED_BUFFER176 = 0x06C0 + SHARED_BUFFER177 = 0x06C4 + SHARED_BUFFER178 = 0x06C8 + SHARED_BUFFER179 = 0x06CC + SHARED_BUFFER180 = 0x06D0 + SHARED_BUFFER181 = 0x06D4 + SHARED_BUFFER182 = 0x06D8 + SHARED_BUFFER183 = 0x06DC + SHARED_BUFFER184 = 0x06E0 + SHARED_BUFFER185 = 0x06E4 + SHARED_BUFFER186 = 0x06E8 + SHARED_BUFFER187 = 0x06EC + SHARED_BUFFER188 = 0x06F0 + SHARED_BUFFER189 = 0x06F4 + SHARED_BUFFER190 = 0x06F8 + SHARED_BUFFER191 = 0x06FC + SHARED_BUFFER192 = 0x0700 + SHARED_BUFFER193 = 0x0704 + SHARED_BUFFER194 = 0x0708 + SHARED_BUFFER195 = 0x070C + SHARED_BUFFER196 = 0x0710 + SHARED_BUFFER197 = 0x0714 + SHARED_BUFFER198 = 0x0718 + SHARED_BUFFER199 = 0x071C + SHARED_BUFFER200 = 0x0720 + SHARED_BUFFER201 = 0x0724 + SHARED_BUFFER202 = 0x0728 + SHARED_BUFFER203 = 0x072C + SHARED_BUFFER204 = 0x0730 + SHARED_BUFFER205 = 0x0734 + SHARED_BUFFER206 = 0x0738 + SHARED_BUFFER207 = 0x073C + SHARED_BUFFER208 = 0x0740 + SHARED_BUFFER209 = 0x0744 + SHARED_BUFFER210 = 0x0748 + SHARED_BUFFER211 = 0x074C + SHARED_BUFFER212 = 0x0750 + SHARED_BUFFER213 = 0x0754 + SHARED_BUFFER214 = 0x0758 + SHARED_BUFFER215 = 0x075C + SHARED_BUFFER216 = 0x0760 + SHARED_BUFFER217 = 0x0764 + SHARED_BUFFER218 = 0x0768 + SHARED_BUFFER219 = 0x076C + SHARED_BUFFER220 = 0x0770 + SHARED_BUFFER221 = 0x0774 + SHARED_BUFFER222 = 0x0778 + SHARED_BUFFER223 = 0x077C + SHARED_BUFFER224 = 0x0780 + SHARED_BUFFER225 = 0x0784 + SHARED_BUFFER226 = 0x0788 + SHARED_BUFFER227 = 0x078C + SHARED_BUFFER228 = 0x0790 + SHARED_BUFFER229 = 0x0794 + SHARED_BUFFER230 = 0x0798 + SHARED_BUFFER231 = 0x079C + SHARED_BUFFER232 = 0x07A0 + SHARED_BUFFER233 = 0x07A4 + SHARED_BUFFER234 = 0x07A8 + SHARED_BUFFER235 = 0x07AC + SHARED_BUFFER236 = 0x07B0 + SHARED_BUFFER237 = 0x07B4 + SHARED_BUFFER238 = 0x07B8 + SHARED_BUFFER239 = 0x07BC + SHARED_BUFFER240 = 0x07C0 + SHARED_BUFFER241 = 0x07C4 + SHARED_BUFFER242 = 0x07C8 + SHARED_BUFFER243 = 0x07CC + SHARED_BUFFER244 = 0x07D0 + SHARED_BUFFER245 = 0x07D4 + SHARED_BUFFER246 = 0x07D8 + SHARED_BUFFER247 = 0x07DC + SHARED_BUFFER248 = 0x07E0 + SHARED_BUFFER249 = 0x07E4 
+ SHARED_BUFFER250 = 0x07E8 + SHARED_BUFFER251 = 0x07EC + SHARED_BUFFER252 = 0x07F0 + SHARED_BUFFER253 = 0x07F4 + SHARED_BUFFER254 = 0x07F8 + SHARED_BUFFER255 = 0x07FC + SIZE = 0x0800 + +class HW_DEBUG_INTERNAL(Enum): + CLKFORCE = 0x0140 + DEBUG = 0x0144 + DEBUG2 = 0x0148 + DEBUGCORE = 0x014C + SIZE = 0x0150 + +class NPU_BP(Enum): + BASEP0 = 0x0080 + BASEP1 = 0x0084 + BASEP2 = 0x0088 + BASEP3 = 0x008C + BASEP4 = 0x0090 + BASEP5 = 0x0094 + BASEP6 = 0x0098 + BASEP7 = 0x009C + BASEP8 = 0x00A0 + BASEP9 = 0x00A4 + BASEP10 = 0x00A8 + BASEP11 = 0x00AC + BASEP12 = 0x00B0 + BASEP13 = 0x00B4 + BASEP14 = 0x00B8 + BASEP15 = 0x00BC + SIZE = 0x00C0 + +class NPU_IDS(Enum): + REVISION = 0x0FC0 + PID4 = 0x0FD0 + PID5 = 0x0FD4 + PID6 = 0x0FD8 + PID7 = 0x0FDC + PID0 = 0x0FE0 + PID1 = 0x0FE4 + PID2 = 0x0FE8 + PID3 = 0x0FEC + CID0 = 0x0FF0 + CID1 = 0x0FF4 + CID2 = 0x0FF8 + CID3 = 0x0FFC + SIZE = 0x1000 + +class NPU_REG(Enum): + ID = 0x0000 + STATUS = 0x0004 + CMD = 0x0008 + RESET = 0x000C + QBASE0 = 0x0010 + QBASE1 = 0x0014 + QREAD = 0x0018 + QCONFIG = 0x001C + QSIZE = 0x0020 + PROT = 0x0024 + CONFIG = 0x0028 + LOCK = 0x002C + REGIONCFG = 0x003C + AXI_LIMIT0 = 0x0040 + AXI_LIMIT1 = 0x0044 + AXI_LIMIT2 = 0x0048 + AXI_LIMIT3 = 0x004C + SIZE = 0x0050 + +class PMU_INTERNAL(Enum): + PMCR = 0x0180 + PMCNTENSET = 0x0184 + PMCNTENCLR = 0x0188 + PMOVSSET = 0x018C + PMOVSCLR = 0x0190 + PMINTSET = 0x0194 + PMINTCLR = 0x0198 + PMCCNTR_LO = 0x01A0 + PMCCNTR_HI = 0x01A4 + PMCCNTR_CFG = 0x01A8 + PMCAXI_CHAN = 0x01AC + PMEVCNTR0 = 0x0300 + PMEVCNTR1 = 0x0304 + PMEVCNTR2 = 0x0308 + PMEVCNTR3 = 0x030C + PMEVTYPER0 = 0x0380 + PMEVTYPER1 = 0x0384 + PMEVTYPER2 = 0x0388 + PMEVTYPER3 = 0x038C + SIZE = 0x0390 + +class TSU_DEBUG_INTERNAL(Enum): + IFM_PAD_TOP = 0x0800 + IFM_PAD_LEFT = 0x0804 + IFM_PAD_RIGHT = 0x0808 + IFM_PAD_BOTTOM = 0x080C + IFM_DEPTH_M1 = 0x0810 + IFM_PRECISION = 0x0814 + IFM_UPSCALE = 0x081C + IFM_ZERO_POINT = 0x0824 + IFM_WIDTH0_M1 = 0x0828 + IFM_HEIGHT0_M1 = 0x082C + IFM_HEIGHT1_M1 = 0x0830 + IFM_IB_END = 0x0834 + IFM_REGION = 0x083C + OFM_WIDTH_M1 = 0x0844 + OFM_HEIGHT_M1 = 0x0848 + OFM_DEPTH_M1 = 0x084C + OFM_PRECISION = 0x0850 + OFM_BLK_WIDTH_M1 = 0x0854 + OFM_BLK_HEIGHT_M1 = 0x0858 + OFM_BLK_DEPTH_M1 = 0x085C + OFM_ZERO_POINT = 0x0860 + OFM_WIDTH0_M1 = 0x0868 + OFM_HEIGHT0_M1 = 0x086C + OFM_HEIGHT1_M1 = 0x0870 + OFM_REGION = 0x087C + KERNEL_WIDTH_M1 = 0x0880 + KERNEL_HEIGHT_M1 = 0x0884 + KERNEL_STRIDE = 0x0888 + PARALLEL_MODE = 0x088C + ACC_FORMAT = 0x0890 + ACTIVATION = 0x0894 + ACTIVATION_MIN = 0x0898 + ACTIVATION_MAX = 0x089C + WEIGHT_REGION = 0x08A0 + SCALE_REGION = 0x08A4 + AB_START = 0x08B4 + BLOCKDEP = 0x08BC + DMA0_SRC_REGION = 0x08C0 + DMA0_DST_REGION = 0x08C4 + DMA0_SIZE0 = 0x08C8 + DMA0_SIZE1 = 0x08CC + IFM2_BROADCAST = 0x0900 + IFM2_SCALAR = 0x0904 + IFM2_PRECISION = 0x0914 + IFM2_ZERO_POINT = 0x0924 + IFM2_WIDTH0_M1 = 0x0928 + IFM2_HEIGHT0_M1 = 0x092C + IFM2_HEIGHT1_M1 = 0x0930 + IFM2_IB_START = 0x0934 + IFM2_REGION = 0x093C + IFM_BASE0 = 0x0A00 + IFM_BASE0_HI = 0x0A04 + IFM_BASE1 = 0x0A08 + IFM_BASE1_HI = 0x0A0C + IFM_BASE2 = 0x0A10 + IFM_BASE2_HI = 0x0A14 + IFM_BASE3 = 0x0A18 + IFM_BASE3_HI = 0x0A1C + IFM_STRIDE_X = 0x0A20 + IFM_STRIDE_X_HI = 0x0A24 + IFM_STRIDE_Y = 0x0A28 + IFM_STRIDE_Y_HI = 0x0A2C + IFM_STRIDE_C = 0x0A30 + IFM_STRIDE_C_HI = 0x0A34 + OFM_BASE0 = 0x0A40 + OFM_BASE0_HI = 0x0A44 + OFM_BASE1 = 0x0A48 + OFM_BASE1_HI = 0x0A4C + OFM_BASE2 = 0x0A50 + OFM_BASE2_HI = 0x0A54 + OFM_BASE3 = 0x0A58 + OFM_BASE3_HI = 0x0A5C + OFM_STRIDE_X = 0x0A60 + OFM_STRIDE_X_HI = 0x0A64 + OFM_STRIDE_Y = 
0x0A68 + OFM_STRIDE_Y_HI = 0x0A6C + OFM_STRIDE_C = 0x0A70 + OFM_STRIDE_C_HI = 0x0A74 + WEIGHT_BASE = 0x0A80 + WEIGHT_BASE_HI = 0x0A84 + WEIGHT_LENGTH = 0x0A88 + WEIGHT_LENGTH_HI = 0x0A8C + SCALE_BASE = 0x0A90 + SCALE_BASE_HI = 0x0A94 + SCALE_LENGTH = 0x0A98 + OFM_SCALE = 0x0AA0 + OFM_SCALE_SHIFT = 0x0AA4 + OPA_SCALE = 0x0AA8 + OPA_SCALE_SHIFT = 0x0AAC + OPB_SCALE = 0x0AB0 + DMA0_SRC = 0x0AC0 + DMA0_SRC_HI = 0x0AC4 + DMA0_DST = 0x0AC8 + DMA0_DST_HI = 0x0ACC + DMA0_LEN = 0x0AD0 + DMA0_LEN_HI = 0x0AD4 + DMA0_SKIP0 = 0x0AD8 + DMA0_SKIP0_HI = 0x0ADC + DMA0_SKIP1 = 0x0AE0 + DMA0_SKIP1_HI = 0x0AE4 + IFM2_BASE0 = 0x0B00 + IFM2_BASE0_HI = 0x0B04 + IFM2_BASE1 = 0x0B08 + IFM2_BASE1_HI = 0x0B0C + IFM2_BASE2 = 0x0B10 + IFM2_BASE2_HI = 0x0B14 + IFM2_BASE3 = 0x0B18 + IFM2_BASE3_HI = 0x0B1C + IFM2_STRIDE_X = 0x0B20 + IFM2_STRIDE_X_HI = 0x0B24 + IFM2_STRIDE_Y = 0x0B28 + IFM2_STRIDE_Y_HI = 0x0B2C + IFM2_STRIDE_C = 0x0B30 + IFM2_STRIDE_C_HI = 0x0B34 + WEIGHT1_BASE = 0x0B40 + WEIGHT1_BASE_HI = 0x0B44 + WEIGHT1_LENGTH = 0x0B48 + WEIGHT1_LENGTH_HI = 0x0B4C + SCALE1_BASE = 0x0B50 + SCALE1_BASE_HI = 0x0B54 + SCALE1_LENGTH = 0x0B58 + SIZE = 0x0B5C + +class TSU_DEBUG_RO_INTERNAL(Enum): + KERNEL_X = 0x0200 + KERNEL_Y = 0x0204 + KERNEL_W_M1 = 0x0208 + KERNEL_H_M1 = 0x020C + OFM_CBLK_WIDTH_M1 = 0x0210 + OFM_CBLK_HEIGHT_M1 = 0x0214 + OFM_CBLK_DEPTH_M1 = 0x0218 + IFM_CBLK_DEPTH_M1 = 0x021C + OFM_X = 0x0220 + OFM_Y = 0x0224 + OFM_Z = 0x0228 + IFM_Z = 0x022C + PAD_TOP = 0x0230 + PAD_LEFT = 0x0234 + IFM_CBLK_WIDTH = 0x0238 + IFM_CBLK_HEIGHT = 0x023C + DMA_IFM_SRC = 0x0240 + DMA_IFM_SRC_HI = 0x0244 + DMA_IFM_DST = 0x0248 + DMA_OFM_SRC = 0x024C + DMA_OFM_DST = 0x0250 + DMA_OFM_DST_HI = 0x0254 + DMA_WEIGHT_SRC = 0x0258 + DMA_WEIGHT_SRC_HI = 0x025C + DMA_CMD_SRC = 0x0260 + DMA_CMD_SRC_HI = 0x0264 + DMA_CMD_SIZE = 0x0268 + DMA_M2M_SRC = 0x026C + DMA_M2M_SRC_HI = 0x0270 + DMA_M2M_DST = 0x0274 + DMA_M2M_DST_HI = 0x0278 + CURRENT_QREAD = 0x027C + DMA_SCALE_SRC = 0x0280 + DMA_SCALE_SRC_HI = 0x0284 + CURRENT_CMD = 0x02BC + SIZE = 0x02C0 + + + +class acc_format(Enum): + INT_32BIT = 0 + INT_40BIT = 1 + FP_S5_10 = 2 + +class activation(Enum): + NONE = 0 + TANH = 3 + SIGMOID = 4 + LUT_START = 16 + LUT_END = 23 + +class clip_range(Enum): + OFM_PRECISION = 0 + FORCE_UINT8 = 2 + FORCE_INT8 = 3 + FORCE_INT16 = 5 + +class cmd0(Enum): + NPU_OP_STOP = 0x000 + NPU_OP_IRQ = 0x001 + NPU_OP_CONV = 0x002 + NPU_OP_DEPTHWISE = 0x003 + NPU_OP_POOL = 0x005 + NPU_OP_ELEMENTWISE = 0x006 + NPU_OP_DMA_START = 0x010 + NPU_OP_DMA_WAIT = 0x011 + NPU_OP_KERNEL_WAIT = 0x012 + NPU_OP_PMU_MASK = 0x013 + NPU_SET_IFM_PAD_TOP = 0x100 + NPU_SET_IFM_PAD_LEFT = 0x101 + NPU_SET_IFM_PAD_RIGHT = 0x102 + NPU_SET_IFM_PAD_BOTTOM = 0x103 + NPU_SET_IFM_DEPTH_M1 = 0x104 + NPU_SET_IFM_PRECISION = 0x105 + NPU_SET_IFM_UPSCALE = 0x107 + NPU_SET_IFM_ZERO_POINT = 0x109 + NPU_SET_IFM_WIDTH0_M1 = 0x10A + NPU_SET_IFM_HEIGHT0_M1 = 0x10B + NPU_SET_IFM_HEIGHT1_M1 = 0x10C + NPU_SET_IFM_IB_END = 0x10D + NPU_SET_IFM_REGION = 0x10F + NPU_SET_OFM_WIDTH_M1 = 0x111 + NPU_SET_OFM_HEIGHT_M1 = 0x112 + NPU_SET_OFM_DEPTH_M1 = 0x113 + NPU_SET_OFM_PRECISION = 0x114 + NPU_SET_OFM_BLK_WIDTH_M1 = 0x115 + NPU_SET_OFM_BLK_HEIGHT_M1 = 0x116 + NPU_SET_OFM_BLK_DEPTH_M1 = 0x117 + NPU_SET_OFM_ZERO_POINT = 0x118 + NPU_SET_OFM_WIDTH0_M1 = 0x11A + NPU_SET_OFM_HEIGHT0_M1 = 0x11B + NPU_SET_OFM_HEIGHT1_M1 = 0x11C + NPU_SET_OFM_REGION = 0x11F + NPU_SET_KERNEL_WIDTH_M1 = 0x120 + NPU_SET_KERNEL_HEIGHT_M1 = 0x121 + NPU_SET_KERNEL_STRIDE = 0x122 + NPU_SET_PARALLEL_MODE = 0x123 + NPU_SET_ACC_FORMAT = 0x124 + 
NPU_SET_ACTIVATION = 0x125 + NPU_SET_ACTIVATION_MIN = 0x126 + NPU_SET_ACTIVATION_MAX = 0x127 + NPU_SET_WEIGHT_REGION = 0x128 + NPU_SET_SCALE_REGION = 0x129 + NPU_SET_AB_START = 0x12D + NPU_SET_BLOCKDEP = 0x12F + NPU_SET_DMA0_SRC_REGION = 0x130 + NPU_SET_DMA0_DST_REGION = 0x131 + NPU_SET_DMA0_SIZE0 = 0x132 + NPU_SET_DMA0_SIZE1 = 0x133 + NPU_SET_IFM2_BROADCAST = 0x180 + NPU_SET_IFM2_SCALAR = 0x181 + NPU_SET_IFM2_PRECISION = 0x185 + NPU_SET_IFM2_ZERO_POINT = 0x189 + NPU_SET_IFM2_WIDTH0_M1 = 0x18A + NPU_SET_IFM2_HEIGHT0_M1 = 0x18B + NPU_SET_IFM2_HEIGHT1_M1 = 0x18C + NPU_SET_IFM2_IB_START = 0x18D + NPU_SET_IFM2_REGION = 0x18F + +class cmd1(Enum): + NPU_SET_IFM_BASE0 = 0x000 + NPU_SET_IFM_BASE1 = 0x001 + NPU_SET_IFM_BASE2 = 0x002 + NPU_SET_IFM_BASE3 = 0x003 + NPU_SET_IFM_STRIDE_X = 0x004 + NPU_SET_IFM_STRIDE_Y = 0x005 + NPU_SET_IFM_STRIDE_C = 0x006 + NPU_SET_OFM_BASE0 = 0x010 + NPU_SET_OFM_BASE1 = 0x011 + NPU_SET_OFM_BASE2 = 0x012 + NPU_SET_OFM_BASE3 = 0x013 + NPU_SET_OFM_STRIDE_X = 0x014 + NPU_SET_OFM_STRIDE_Y = 0x015 + NPU_SET_OFM_STRIDE_C = 0x016 + NPU_SET_WEIGHT_BASE = 0x020 + NPU_SET_WEIGHT_LENGTH = 0x021 + NPU_SET_SCALE_BASE = 0x022 + NPU_SET_SCALE_LENGTH = 0x023 + NPU_SET_OFM_SCALE = 0x024 + NPU_SET_OPA_SCALE = 0x025 + NPU_SET_OPB_SCALE = 0x026 + NPU_SET_DMA0_SRC = 0x030 + NPU_SET_DMA0_DST = 0x031 + NPU_SET_DMA0_LEN = 0x032 + NPU_SET_DMA0_SKIP0 = 0x033 + NPU_SET_DMA0_SKIP1 = 0x034 + NPU_SET_IFM2_BASE0 = 0x080 + NPU_SET_IFM2_BASE1 = 0x081 + NPU_SET_IFM2_BASE2 = 0x082 + NPU_SET_IFM2_BASE3 = 0x083 + NPU_SET_IFM2_STRIDE_X = 0x084 + NPU_SET_IFM2_STRIDE_Y = 0x085 + NPU_SET_IFM2_STRIDE_C = 0x086 + NPU_SET_WEIGHT1_BASE = 0x090 + NPU_SET_WEIGHT1_LENGTH = 0x091 + NPU_SET_SCALE1_BASE = 0x092 + NPU_SET_SCALE1_LENGTH = 0x093 + +class data_format(Enum): + NHWC = 0 + NHCWB16 = 1 + +class elementwise_mode(Enum): + MUL = 0 + ADD = 1 + SUB = 2 + MIN = 3 + MAX = 4 + LRELU = 5 + ABS = 6 + CLZ = 7 + SHR = 8 + SHL = 9 + +class ifm_precision(Enum): + W8_U8 = 0 + W8_S8 = 1 + W8_U16 = 4 + W8_S16 = 5 + W8_S32 = 9 + +class ifm_scale_mode(Enum): + SCALE_16BIT = 0 + SCALE_OPA_32BIT = 1 + SCALE_OPB_32BIT = 2 + +class memory_type(Enum): + AXI0_OUTSTANDING_COUNTER0 = 0 + AXI0_OUTSTANDING_COUNTER1 = 1 + AXI1_OUTSTANDING_COUNTER2 = 2 + AXI1_OUTSTANDING_COUNTER3 = 3 + +class ofm_precision(Enum): + U8 = 0 + S8 = 1 + U16 = 2 + S16 = 3 + S32 = 5 + +class pmu_event_type(Enum): + CYCLE = 0x11 + NPU_IDLE = 0x20 + MAC_ACTIVE = 0x30 + MAC_ACTIVE_8BIT = 0x31 + MAC_ACTIVE_16BIT = 0x32 + MAC_DPU_ACTIVE = 0x33 + MAC_STALLED_BY_WD_ACC = 0x34 + MAC_STALLED_BY_WD = 0x35 + MAC_STALLED_BY_ACC = 0x36 + MAC_STALLED_BY_IB = 0x37 + AO_ACTIVE = 0x40 + AO_ACTIVE_8BIT = 0x41 + AO_ACTIVE_16BIT = 0x42 + AO_STALLED_BY_OFMP_OB = 0x43 + AO_STALLED_BY_OFMP = 0x44 + AO_STALLED_BY_OB = 0x45 + AO_STALLED_BY_ACC_IB = 0x46 + AO_STALLED_BY_ACC = 0x47 + AO_STALLED_BY_IB = 0x48 + WD_ACTIVE = 0x50 + WD_STALLED = 0x51 + WD_STALLED_BY_WS = 0x52 + WD_STALLED_BY_WD_BUF = 0x53 + WD_PARSE_ACTIVE = 0x54 + WD_PARSE_STALLED = 0x55 + WD_PARSE_STALLED_IN = 0x56 + WD_PARSE_STALLED_OUT = 0x57 + AXI0_RD_TRANS_ACCEPTED = 0x80 + AXI0_RD_TRANS_COMPLETED = 0x81 + AXI0_RD_DATA_BEAT_RECEIVED = 0x82 + AXI0_RD_TRAN_REQ_STALLED = 0x83 + AXI0_WR_TRANS_ACCEPTED = 0x84 + AXI0_WR_TRANS_COMPLETED_M = 0x85 + AXI0_WR_TRANS_COMPLETED_S = 0x86 + AXI0_WR_DATA_BEAT_WRITTEN = 0x87 + AXI0_WR_TRAN_REQ_STALLED = 0x88 + AXI0_WR_DATA_BEAT_STALLED = 0x89 + AXI0_ENABLED_CYCLES = 0x8c + AXI0_RD_STALL_LIMIT = 0x8e + AXI0_WR_STALL_LIMIT = 0x8f + AXI1_RD_TRANS_ACCEPTED = 0x180 + AXI1_RD_TRANS_COMPLETED = 
0x181 + AXI1_RD_DATA_BEAT_RECEIVED = 0x182 + AXI1_RD_TRAN_REQ_STALLED = 0x183 + AXI1_WR_TRANS_ACCEPTED = 0x184 + AXI1_WR_TRANS_COMPLETED_M = 0x185 + AXI1_WR_TRANS_COMPLETED_S = 0x186 + AXI1_WR_DATA_BEAT_WRITTEN = 0x187 + AXI1_WR_TRAN_REQ_STALLED = 0x188 + AXI1_WR_DATA_BEAT_STALLED = 0x189 + AXI1_ENABLED_CYCLES = 0x18c + AXI1_RD_STALL_LIMIT = 0x18e + AXI1_WR_STALL_LIMIT = 0x18f + AXI_LATENCY_ANY = 0xa0 + AXI_LATENCY_32 = 0xa1 + AXI_LATENCY_64 = 0xa2 + AXI_LATENCY_128 = 0xa3 + AXI_LATENCY_256 = 0xa4 + AXI_LATENCY_512 = 0xa5 + AXI_LATENCY_1024 = 0xa6 + +class pooling_mode(Enum): + MAX = 0 + AVERAGE = 1 + REDUCE_SUM = 2 + +class privilege_level(Enum): + USER = 0 + PRIVILEGED = 1 + +class product(Enum): + ETHOS_U55 = 0 + +class resampling_mode(Enum): + NONE = 0 + NEAREST = 1 + TRANSPOSE = 2 + +class rounding(Enum): + TFL = 0 + TRUNCATE = 1 + NATURAL = 2 + +class security_level(Enum): + SECURE = 0 + NON_SECURE = 1 + +class state(Enum): + STOPPED = 0 + RUNNING = 1 + +class stride_mode(Enum): + STRIDE_MODE_1D = 0 + STRIDE_MODE_2D = 1 + STRIDE_MODE_3D = 2 + + +class clkforce_r(Union): + class _bitfield(Structure): + _fields_ = [ + ("top_level_clk", c_uint32, 1), + ("cc_clk", c_uint32, 1), + ("dma_clk", c_uint32, 1), + ("mac_clk", c_uint32, 1), + ("ao_clk", c_uint32, 1), + ("wd_clk", c_uint32, 1), + ("reserved0", c_uint32, 26), + ] + _fields_ = [("bits", _bitfield), + ("word", c_uint32)] + def set_top_level_clk(self, value): self.bits.top_level_clk = value + def get_top_level_clk(self): value = self.bits.top_level_clk; return value + def set_cc_clk(self, value): self.bits.cc_clk = value + def get_cc_clk(self): value = self.bits.cc_clk; return value + def set_dma_clk(self, value): self.bits.dma_clk = value + def get_dma_clk(self): value = self.bits.dma_clk; return value + def set_mac_clk(self, value): self.bits.mac_clk = value + def get_mac_clk(self): value = self.bits.mac_clk; return value + def set_ao_clk(self, value): self.bits.ao_clk = value + def get_ao_clk(self): value = self.bits.ao_clk; return value + def set_wd_clk(self, value): self.bits.wd_clk = value + def get_wd_clk(self): value = self.bits.wd_clk; return value + + +class basep0_r(Union): + class _bitfield(Structure): + _fields_ = [ + ("addr_word", c_uint32, 32), + ] + _fields_ = [("bits", _bitfield), + ("word", c_uint32)] + def set_addr_word(self, value): self.bits.addr_word = value + def get_addr_word(self): value = self.bits.addr_word; return value + + +class basep1_r(Union): + class _bitfield(Structure): + _fields_ = [ + ("addr_word", c_uint32, 32), + ] + _fields_ = [("bits", _bitfield), + ("word", c_uint32)] + def set_addr_word(self, value): self.bits.addr_word = value + def get_addr_word(self): value = self.bits.addr_word; return value + + +class basep2_r(Union): + class _bitfield(Structure): + _fields_ = [ + ("addr_word", c_uint32, 32), + ] + _fields_ = [("bits", _bitfield), + ("word", c_uint32)] + def set_addr_word(self, value): self.bits.addr_word = value + def get_addr_word(self): value = self.bits.addr_word; return value + + +class basep3_r(Union): + class _bitfield(Structure): + _fields_ = [ + ("addr_word", c_uint32, 32), + ] + _fields_ = [("bits", _bitfield), + ("word", c_uint32)] + def set_addr_word(self, value): self.bits.addr_word = value + def get_addr_word(self): value = self.bits.addr_word; return value + + +class basep4_r(Union): + class _bitfield(Structure): + _fields_ = [ + ("addr_word", c_uint32, 32), + ] + _fields_ = [("bits", _bitfield), + ("word", c_uint32)] + def set_addr_word(self, value): self.bits.addr_word 
= value + def get_addr_word(self): value = self.bits.addr_word; return value + + +class basep5_r(Union): + class _bitfield(Structure): + _fields_ = [ + ("addr_word", c_uint32, 32), + ] + _fields_ = [("bits", _bitfield), + ("word", c_uint32)] + def set_addr_word(self, value): self.bits.addr_word = value + def get_addr_word(self): value = self.bits.addr_word; return value + + +class basep6_r(Union): + class _bitfield(Structure): + _fields_ = [ + ("addr_word", c_uint32, 32), + ] + _fields_ = [("bits", _bitfield), + ("word", c_uint32)] + def set_addr_word(self, value): self.bits.addr_word = value + def get_addr_word(self): value = self.bits.addr_word; return value + + +class basep7_r(Union): + class _bitfield(Structure): + _fields_ = [ + ("addr_word", c_uint32, 32), + ] + _fields_ = [("bits", _bitfield), + ("word", c_uint32)] + def set_addr_word(self, value): self.bits.addr_word = value + def get_addr_word(self): value = self.bits.addr_word; return value + + +class basep8_r(Union): + class _bitfield(Structure): + _fields_ = [ + ("addr_word", c_uint32, 32), + ] + _fields_ = [("bits", _bitfield), + ("word", c_uint32)] + def set_addr_word(self, value): self.bits.addr_word = value + def get_addr_word(self): value = self.bits.addr_word; return value + + +class basep9_r(Union): + class _bitfield(Structure): + _fields_ = [ + ("addr_word", c_uint32, 32), + ] + _fields_ = [("bits", _bitfield), + ("word", c_uint32)] + def set_addr_word(self, value): self.bits.addr_word = value + def get_addr_word(self): value = self.bits.addr_word; return value + + +class basep10_r(Union): + class _bitfield(Structure): + _fields_ = [ + ("addr_word", c_uint32, 32), + ] + _fields_ = [("bits", _bitfield), + ("word", c_uint32)] + def set_addr_word(self, value): self.bits.addr_word = value + def get_addr_word(self): value = self.bits.addr_word; return value + + +class basep11_r(Union): + class _bitfield(Structure): + _fields_ = [ + ("addr_word", c_uint32, 32), + ] + _fields_ = [("bits", _bitfield), + ("word", c_uint32)] + def set_addr_word(self, value): self.bits.addr_word = value + def get_addr_word(self): value = self.bits.addr_word; return value + + +class basep12_r(Union): + class _bitfield(Structure): + _fields_ = [ + ("addr_word", c_uint32, 32), + ] + _fields_ = [("bits", _bitfield), + ("word", c_uint32)] + def set_addr_word(self, value): self.bits.addr_word = value + def get_addr_word(self): value = self.bits.addr_word; return value + + +class basep13_r(Union): + class _bitfield(Structure): + _fields_ = [ + ("addr_word", c_uint32, 32), + ] + _fields_ = [("bits", _bitfield), + ("word", c_uint32)] + def set_addr_word(self, value): self.bits.addr_word = value + def get_addr_word(self): value = self.bits.addr_word; return value + + +class basep14_r(Union): + class _bitfield(Structure): + _fields_ = [ + ("addr_word", c_uint32, 32), + ] + _fields_ = [("bits", _bitfield), + ("word", c_uint32)] + def set_addr_word(self, value): self.bits.addr_word = value + def get_addr_word(self): value = self.bits.addr_word; return value + + +class basep15_r(Union): + class _bitfield(Structure): + _fields_ = [ + ("addr_word", c_uint32, 32), + ] + _fields_ = [("bits", _bitfield), + ("word", c_uint32)] + def set_addr_word(self, value): self.bits.addr_word = value + def get_addr_word(self): value = self.bits.addr_word; return value + + +class pid4_r(Union): + class _bitfield(Structure): + _fields_ = [ + ("pid4", c_uint32, 32), + ] + _fields_ = [("bits", _bitfield), + ("word", c_uint32)] + def set_pid4(self, value): self.bits.pid4 = value + def 
get_pid4(self): value = self.bits.pid4; return value + + +class pid5_r(Union): + class _bitfield(Structure): + _fields_ = [ + ("pid5", c_uint32, 32), + ] + _fields_ = [("bits", _bitfield), + ("word", c_uint32)] + def set_pid5(self, value): self.bits.pid5 = value + def get_pid5(self): value = self.bits.pid5; return value + + +class pid6_r(Union): + class _bitfield(Structure): + _fields_ = [ + ("pid6", c_uint32, 32), + ] + _fields_ = [("bits", _bitfield), + ("word", c_uint32)] + def set_pid6(self, value): self.bits.pid6 = value + def get_pid6(self): value = self.bits.pid6; return value + + +class pid7_r(Union): + class _bitfield(Structure): + _fields_ = [ + ("pid7", c_uint32, 32), + ] + _fields_ = [("bits", _bitfield), + ("word", c_uint32)] + def set_pid7(self, value): self.bits.pid7 = value + def get_pid7(self): value = self.bits.pid7; return value + + +class pid0_r(Union): + class _bitfield(Structure): + _fields_ = [ + ("pid0", c_uint32, 32), + ] + _fields_ = [("bits", _bitfield), + ("word", c_uint32)] + def set_pid0(self, value): self.bits.pid0 = value + def get_pid0(self): value = self.bits.pid0; return value + + +class pid1_r(Union): + class _bitfield(Structure): + _fields_ = [ + ("pid1", c_uint32, 32), + ] + _fields_ = [("bits", _bitfield), + ("word", c_uint32)] + def set_pid1(self, value): self.bits.pid1 = value + def get_pid1(self): value = self.bits.pid1; return value + + +class pid2_r(Union): + class _bitfield(Structure): + _fields_ = [ + ("pid2", c_uint32, 32), + ] + _fields_ = [("bits", _bitfield), + ("word", c_uint32)] + def set_pid2(self, value): self.bits.pid2 = value + def get_pid2(self): value = self.bits.pid2; return value + + +class pid3_r(Union): + class _bitfield(Structure): + _fields_ = [ + ("pid3", c_uint32, 32), + ] + _fields_ = [("bits", _bitfield), + ("word", c_uint32)] + def set_pid3(self, value): self.bits.pid3 = value + def get_pid3(self): value = self.bits.pid3; return value + + +class cid0_r(Union): + class _bitfield(Structure): + _fields_ = [ + ("cid0", c_uint32, 32), + ] + _fields_ = [("bits", _bitfield), + ("word", c_uint32)] + def set_cid0(self, value): self.bits.cid0 = value + def get_cid0(self): value = self.bits.cid0; return value + + +class cid1_r(Union): + class _bitfield(Structure): + _fields_ = [ + ("cid1", c_uint32, 32), + ] + _fields_ = [("bits", _bitfield), + ("word", c_uint32)] + def set_cid1(self, value): self.bits.cid1 = value + def get_cid1(self): value = self.bits.cid1; return value + + +class cid2_r(Union): + class _bitfield(Structure): + _fields_ = [ + ("cid2", c_uint32, 32), + ] + _fields_ = [("bits", _bitfield), + ("word", c_uint32)] + def set_cid2(self, value): self.bits.cid2 = value + def get_cid2(self): value = self.bits.cid2; return value + + +class cid3_r(Union): + class _bitfield(Structure): + _fields_ = [ + ("cid3", c_uint32, 32), + ] + _fields_ = [("bits", _bitfield), + ("word", c_uint32)] + def set_cid3(self, value): self.bits.cid3 = value + def get_cid3(self): value = self.bits.cid3; return value + + +class id_r(Union): + class _bitfield(Structure): + _fields_ = [ + ("version_status", c_uint32, 4), + ("version_minor", c_uint32, 4), + ("version_major", c_uint32, 4), + ("product_major", c_uint32, 4), + ("arch_patch_rev", c_uint32, 4), + ("arch_minor_rev", c_uint32, 8), + ("arch_major_rev", c_uint32, 4), + ] + _fields_ = [("bits", _bitfield), + ("word", c_uint32)] + def set_version_status(self, value): self.bits.version_status = value + def get_version_status(self): value = self.bits.version_status; return value + def 
set_version_minor(self, value): self.bits.version_minor = value + def get_version_minor(self): value = self.bits.version_minor; return value + def set_version_major(self, value): self.bits.version_major = value + def get_version_major(self): value = self.bits.version_major; return value + def set_product_major(self, value): self.bits.product_major = value + def get_product_major(self): value = self.bits.product_major; return value + def set_arch_patch_rev(self, value): self.bits.arch_patch_rev = value + def get_arch_patch_rev(self): value = self.bits.arch_patch_rev; return value + def set_arch_minor_rev(self, value): self.bits.arch_minor_rev = value + def get_arch_minor_rev(self): value = self.bits.arch_minor_rev; return value + def set_arch_major_rev(self, value): self.bits.arch_major_rev = value + def get_arch_major_rev(self): value = self.bits.arch_major_rev; return value + + +class status_r(Union): + class _bitfield(Structure): + _fields_ = [ + ("state", c_uint32, 1), + ("irq_raised", c_uint32, 1), + ("bus_status", c_uint32, 1), + ("reset_status", c_uint32, 1), + ("cmd_parse_error", c_uint32, 1), + ("cmd_end_reached", c_uint32, 1), + ("pmu_irq_raised", c_uint32, 1), + ("wd_fault", c_uint32, 1), + ("reserved0", c_uint32, 3), + ("faulting_interface", c_uint32, 1), + ("faulting_channel", c_uint32, 4), + ("irq_history_mask", c_uint32, 16), + ] + _fields_ = [("bits", _bitfield), + ("word", c_uint32)] + def set_state(self, value): self.bits.state = value + def get_state(self): value = self.bits.state; return value + def set_irq_raised(self, value): self.bits.irq_raised = value + def get_irq_raised(self): value = self.bits.irq_raised; return value + def set_bus_status(self, value): self.bits.bus_status = value + def get_bus_status(self): value = self.bits.bus_status; return value + def set_reset_status(self, value): self.bits.reset_status = value + def get_reset_status(self): value = self.bits.reset_status; return value + def set_cmd_parse_error(self, value): self.bits.cmd_parse_error = value + def get_cmd_parse_error(self): value = self.bits.cmd_parse_error; return value + def set_cmd_end_reached(self, value): self.bits.cmd_end_reached = value + def get_cmd_end_reached(self): value = self.bits.cmd_end_reached; return value + def set_pmu_irq_raised(self, value): self.bits.pmu_irq_raised = value + def get_pmu_irq_raised(self): value = self.bits.pmu_irq_raised; return value + def set_wd_fault(self, value): self.bits.wd_fault = value + def get_wd_fault(self): value = self.bits.wd_fault; return value + def set_faulting_interface(self, value): self.bits.faulting_interface = value + def get_faulting_interface(self): value = self.bits.faulting_interface; return value + def set_faulting_channel(self, value): self.bits.faulting_channel = value + def get_faulting_channel(self): value = self.bits.faulting_channel; return value + def set_irq_history_mask(self, value): self.bits.irq_history_mask = value + def get_irq_history_mask(self): value = self.bits.irq_history_mask; return value + + +class cmd_r(Union): + class _bitfield(Structure): + _fields_ = [ + ("transition_to_running_state", c_uint32, 1), + ("clear_irq", c_uint32, 1), + ("clock_q_enable", c_uint32, 1), + ("power_q_enable", c_uint32, 1), + ("stop_request", c_uint32, 1), + ("reserved0", c_uint32, 11), + ("clear_irq_history", c_uint32, 16), + ] + _fields_ = [("bits", _bitfield), + ("word", c_uint32)] + def set_transition_to_running_state(self, value): self.bits.transition_to_running_state = value + def get_transition_to_running_state(self): value 
= self.bits.transition_to_running_state; return value + def set_clear_irq(self, value): self.bits.clear_irq = value + def get_clear_irq(self): value = self.bits.clear_irq; return value + def set_clock_q_enable(self, value): self.bits.clock_q_enable = value + def get_clock_q_enable(self): value = self.bits.clock_q_enable; return value + def set_power_q_enable(self, value): self.bits.power_q_enable = value + def get_power_q_enable(self): value = self.bits.power_q_enable; return value + def set_stop_request(self, value): self.bits.stop_request = value + def get_stop_request(self): value = self.bits.stop_request; return value + def set_clear_irq_history(self, value): self.bits.clear_irq_history = value + def get_clear_irq_history(self): value = self.bits.clear_irq_history; return value + + +class reset_r(Union): + class _bitfield(Structure): + _fields_ = [ + ("pending_cpl", c_uint32, 1), + ("pending_csl", c_uint32, 1), + ("reserved0", c_uint32, 30), + ] + _fields_ = [("bits", _bitfield), + ("word", c_uint32)] + def set_pending_cpl(self, value): self.bits.pending_cpl = value + def get_pending_cpl(self): value = self.bits.pending_cpl; return value + def set_pending_csl(self, value): self.bits.pending_csl = value + def get_pending_csl(self): value = self.bits.pending_csl; return value + + +class qbase0_r(Union): + class _bitfield(Structure): + _fields_ = [ + ("qbase0", c_uint32, 32), + ] + _fields_ = [("bits", _bitfield), + ("word", c_uint32)] + def set_qbase0(self, value): self.bits.qbase0 = value + def get_qbase0(self): value = self.bits.qbase0; return value + + +class qbase1_r(Union): + class _bitfield(Structure): + _fields_ = [ + ("qbase1", c_uint32, 32), + ] + _fields_ = [("bits", _bitfield), + ("word", c_uint32)] + def set_qbase1(self, value): self.bits.qbase1 = value + def get_qbase1(self): value = self.bits.qbase1; return value + + +class qread_r(Union): + class _bitfield(Structure): + _fields_ = [ + ("qread", c_uint32, 32), + ] + _fields_ = [("bits", _bitfield), + ("word", c_uint32)] + def set_qread(self, value): self.bits.qread = value + def get_qread(self): value = self.bits.qread; return value + + +class qconfig_r(Union): + class _bitfield(Structure): + _fields_ = [ + ("qconfig", c_uint32, 32), + ] + _fields_ = [("bits", _bitfield), + ("word", c_uint32)] + def set_qconfig(self, value): self.bits.qconfig = value + def get_qconfig(self): value = self.bits.qconfig; return value + + +class qsize_r(Union): + class _bitfield(Structure): + _fields_ = [ + ("qsize", c_uint32, 32), + ] + _fields_ = [("bits", _bitfield), + ("word", c_uint32)] + def set_qsize(self, value): self.bits.qsize = value + def get_qsize(self): value = self.bits.qsize; return value + + +class prot_r(Union): + class _bitfield(Structure): + _fields_ = [ + ("active_cpl", c_uint32, 1), + ("active_csl", c_uint32, 1), + ("reserved0", c_uint32, 30), + ] + _fields_ = [("bits", _bitfield), + ("word", c_uint32)] + def set_active_cpl(self, value): self.bits.active_cpl = value + def get_active_cpl(self): value = self.bits.active_cpl; return value + def set_active_csl(self, value): self.bits.active_csl = value + def get_active_csl(self): value = self.bits.active_csl; return value + + +class config_r(Union): + class _bitfield(Structure): + _fields_ = [ + ("macs_per_cc", c_uint32, 4), + ("cmd_stream_version", c_uint32, 4), + ("shram_size", c_uint32, 8), + ("reserved0", c_uint32, 12), + ("product", c_uint32, 4), + ] + _fields_ = [("bits", _bitfield), + ("word", c_uint32)] + def set_macs_per_cc(self, value): self.bits.macs_per_cc = value 
+ def get_macs_per_cc(self): value = self.bits.macs_per_cc; return value + def set_cmd_stream_version(self, value): self.bits.cmd_stream_version = value + def get_cmd_stream_version(self): value = self.bits.cmd_stream_version; return value + def set_shram_size(self, value): self.bits.shram_size = value + def get_shram_size(self): value = self.bits.shram_size; return value + def set_product(self, value): self.bits.product = value + def get_product(self): value = self.bits.product; return value + + +class lock_r(Union): + class _bitfield(Structure): + _fields_ = [ + ("lock", c_uint32, 32), + ] + _fields_ = [("bits", _bitfield), + ("word", c_uint32)] + def set_lock(self, value): self.bits.lock = value + def get_lock(self): value = self.bits.lock; return value + + +class regioncfg_r(Union): + class _bitfield(Structure): + _fields_ = [ + ("region0", c_uint32, 2), + ("region1", c_uint32, 2), + ("region2", c_uint32, 2), + ("region3", c_uint32, 2), + ("region4", c_uint32, 2), + ("region5", c_uint32, 2), + ("region6", c_uint32, 2), + ("region7", c_uint32, 2), + ("reserved0", c_uint32, 16), + ] + _fields_ = [("bits", _bitfield), + ("word", c_uint32)] + def set_region0(self, value): self.bits.region0 = value + def get_region0(self): value = self.bits.region0; return value + def set_region1(self, value): self.bits.region1 = value + def get_region1(self): value = self.bits.region1; return value + def set_region2(self, value): self.bits.region2 = value + def get_region2(self): value = self.bits.region2; return value + def set_region3(self, value): self.bits.region3 = value + def get_region3(self): value = self.bits.region3; return value + def set_region4(self, value): self.bits.region4 = value + def get_region4(self): value = self.bits.region4; return value + def set_region5(self, value): self.bits.region5 = value + def get_region5(self): value = self.bits.region5; return value + def set_region6(self, value): self.bits.region6 = value + def get_region6(self): value = self.bits.region6; return value + def set_region7(self, value): self.bits.region7 = value + def get_region7(self): value = self.bits.region7; return value + + +class axi_limit0_r(Union): + class _bitfield(Structure): + _fields_ = [ + ("max_beats", c_uint32, 2), + ("reserved0", c_uint32, 2), + ("memtype", c_uint32, 4), + ("reserved1", c_uint32, 8), + ("max_outstanding_read_m1", c_uint32, 8), + ("max_outstanding_write_m1", c_uint32, 8), + ] + _fields_ = [("bits", _bitfield), + ("word", c_uint32)] + def set_max_beats(self, value): self.bits.max_beats = value + def get_max_beats(self): value = self.bits.max_beats; return value + def set_memtype(self, value): self.bits.memtype = value + def get_memtype(self): value = self.bits.memtype; return value + def set_max_outstanding_read_m1(self, value): self.bits.max_outstanding_read_m1 = value + def get_max_outstanding_read_m1(self): value = self.bits.max_outstanding_read_m1; return value + def set_max_outstanding_write_m1(self, value): self.bits.max_outstanding_write_m1 = value + def get_max_outstanding_write_m1(self): value = self.bits.max_outstanding_write_m1; return value + + +class axi_limit1_r(Union): + class _bitfield(Structure): + _fields_ = [ + ("max_beats", c_uint32, 2), + ("reserved0", c_uint32, 2), + ("memtype", c_uint32, 4), + ("reserved1", c_uint32, 8), + ("max_outstanding_read_m1", c_uint32, 8), + ("max_outstanding_write_m1", c_uint32, 8), + ] + _fields_ = [("bits", _bitfield), + ("word", c_uint32)] + def set_max_beats(self, value): self.bits.max_beats = value + def get_max_beats(self): 
value = self.bits.max_beats; return value + def set_memtype(self, value): self.bits.memtype = value + def get_memtype(self): value = self.bits.memtype; return value + def set_max_outstanding_read_m1(self, value): self.bits.max_outstanding_read_m1 = value + def get_max_outstanding_read_m1(self): value = self.bits.max_outstanding_read_m1; return value + def set_max_outstanding_write_m1(self, value): self.bits.max_outstanding_write_m1 = value + def get_max_outstanding_write_m1(self): value = self.bits.max_outstanding_write_m1; return value + + +class axi_limit2_r(Union): + class _bitfield(Structure): + _fields_ = [ + ("max_beats", c_uint32, 2), + ("reserved0", c_uint32, 2), + ("memtype", c_uint32, 4), + ("reserved1", c_uint32, 8), + ("max_outstanding_read_m1", c_uint32, 8), + ("max_outstanding_write_m1", c_uint32, 8), + ] + _fields_ = [("bits", _bitfield), + ("word", c_uint32)] + def set_max_beats(self, value): self.bits.max_beats = value + def get_max_beats(self): value = self.bits.max_beats; return value + def set_memtype(self, value): self.bits.memtype = value + def get_memtype(self): value = self.bits.memtype; return value + def set_max_outstanding_read_m1(self, value): self.bits.max_outstanding_read_m1 = value + def get_max_outstanding_read_m1(self): value = self.bits.max_outstanding_read_m1; return value + def set_max_outstanding_write_m1(self, value): self.bits.max_outstanding_write_m1 = value + def get_max_outstanding_write_m1(self): value = self.bits.max_outstanding_write_m1; return value + + +class axi_limit3_r(Union): + class _bitfield(Structure): + _fields_ = [ + ("max_beats", c_uint32, 2), + ("reserved0", c_uint32, 2), + ("memtype", c_uint32, 4), + ("reserved1", c_uint32, 8), + ("max_outstanding_read_m1", c_uint32, 8), + ("max_outstanding_write_m1", c_uint32, 8), + ] + _fields_ = [("bits", _bitfield), + ("word", c_uint32)] + def set_max_beats(self, value): self.bits.max_beats = value + def get_max_beats(self): value = self.bits.max_beats; return value + def set_memtype(self, value): self.bits.memtype = value + def get_memtype(self): value = self.bits.memtype; return value + def set_max_outstanding_read_m1(self, value): self.bits.max_outstanding_read_m1 = value + def get_max_outstanding_read_m1(self): value = self.bits.max_outstanding_read_m1; return value + def set_max_outstanding_write_m1(self, value): self.bits.max_outstanding_write_m1 = value + def get_max_outstanding_write_m1(self): value = self.bits.max_outstanding_write_m1; return value + + +class pmcr_r(Union): + class _bitfield(Structure): + _fields_ = [ + ("cnt_en", c_uint32, 1), + ("event_cnt_rst", c_uint32, 1), + ("cycle_cnt_rst", c_uint32, 1), + ("mask_en", c_uint32, 1), + ("reserved0", c_uint32, 7), + ("num_event_cnt", c_uint32, 5), + ("reserved1", c_uint32, 16), + ] + _fields_ = [("bits", _bitfield), + ("word", c_uint32)] + def set_cnt_en(self, value): self.bits.cnt_en = value + def get_cnt_en(self): value = self.bits.cnt_en; return value + def set_event_cnt_rst(self, value): self.bits.event_cnt_rst = value + def get_event_cnt_rst(self): value = self.bits.event_cnt_rst; return value + def set_cycle_cnt_rst(self, value): self.bits.cycle_cnt_rst = value + def get_cycle_cnt_rst(self): value = self.bits.cycle_cnt_rst; return value + def set_mask_en(self, value): self.bits.mask_en = value + def get_mask_en(self): value = self.bits.mask_en; return value + def set_num_event_cnt(self, value): self.bits.num_event_cnt = value + def get_num_event_cnt(self): value = self.bits.num_event_cnt; return value + + +class 
pmcntenset_r(Union): + class _bitfield(Structure): + _fields_ = [ + ("event_cnt_0", c_uint32, 1), + ("event_cnt_1", c_uint32, 1), + ("event_cnt_2", c_uint32, 1), + ("event_cnt_3", c_uint32, 1), + ("reserved0", c_uint32, 27), + ("cycle_cnt", c_uint32, 1), + ] + _fields_ = [("bits", _bitfield), + ("word", c_uint32)] + def set_event_cnt_0(self, value): self.bits.event_cnt_0 = value + def get_event_cnt_0(self): value = self.bits.event_cnt_0; return value + def set_event_cnt_1(self, value): self.bits.event_cnt_1 = value + def get_event_cnt_1(self): value = self.bits.event_cnt_1; return value + def set_event_cnt_2(self, value): self.bits.event_cnt_2 = value + def get_event_cnt_2(self): value = self.bits.event_cnt_2; return value + def set_event_cnt_3(self, value): self.bits.event_cnt_3 = value + def get_event_cnt_3(self): value = self.bits.event_cnt_3; return value + def set_cycle_cnt(self, value): self.bits.cycle_cnt = value + def get_cycle_cnt(self): value = self.bits.cycle_cnt; return value + + +class pmcntenclr_r(Union): + class _bitfield(Structure): + _fields_ = [ + ("event_cnt_0", c_uint32, 1), + ("event_cnt_1", c_uint32, 1), + ("event_cnt_2", c_uint32, 1), + ("event_cnt_3", c_uint32, 1), + ("reserved0", c_uint32, 27), + ("cycle_cnt", c_uint32, 1), + ] + _fields_ = [("bits", _bitfield), + ("word", c_uint32)] + def set_event_cnt_0(self, value): self.bits.event_cnt_0 = value + def get_event_cnt_0(self): value = self.bits.event_cnt_0; return value + def set_event_cnt_1(self, value): self.bits.event_cnt_1 = value + def get_event_cnt_1(self): value = self.bits.event_cnt_1; return value + def set_event_cnt_2(self, value): self.bits.event_cnt_2 = value + def get_event_cnt_2(self): value = self.bits.event_cnt_2; return value + def set_event_cnt_3(self, value): self.bits.event_cnt_3 = value + def get_event_cnt_3(self): value = self.bits.event_cnt_3; return value + def set_cycle_cnt(self, value): self.bits.cycle_cnt = value + def get_cycle_cnt(self): value = self.bits.cycle_cnt; return value + + +class pmovsset_r(Union): + class _bitfield(Structure): + _fields_ = [ + ("event_cnt_0_ovf", c_uint32, 1), + ("event_cnt_1_ovf", c_uint32, 1), + ("event_cnt_2_ovf", c_uint32, 1), + ("event_cnt_3_ovf", c_uint32, 1), + ("reserved0", c_uint32, 27), + ("cycle_cnt_ovf", c_uint32, 1), + ] + _fields_ = [("bits", _bitfield), + ("word", c_uint32)] + def set_event_cnt_0_ovf(self, value): self.bits.event_cnt_0_ovf = value + def get_event_cnt_0_ovf(self): value = self.bits.event_cnt_0_ovf; return value + def set_event_cnt_1_ovf(self, value): self.bits.event_cnt_1_ovf = value + def get_event_cnt_1_ovf(self): value = self.bits.event_cnt_1_ovf; return value + def set_event_cnt_2_ovf(self, value): self.bits.event_cnt_2_ovf = value + def get_event_cnt_2_ovf(self): value = self.bits.event_cnt_2_ovf; return value + def set_event_cnt_3_ovf(self, value): self.bits.event_cnt_3_ovf = value + def get_event_cnt_3_ovf(self): value = self.bits.event_cnt_3_ovf; return value + def set_cycle_cnt_ovf(self, value): self.bits.cycle_cnt_ovf = value + def get_cycle_cnt_ovf(self): value = self.bits.cycle_cnt_ovf; return value + + +class pmovsclr_r(Union): + class _bitfield(Structure): + _fields_ = [ + ("event_cnt_0_ovf", c_uint32, 1), + ("event_cnt_1_ovf", c_uint32, 1), + ("event_cnt_2_ovf", c_uint32, 1), + ("event_cnt_3_ovf", c_uint32, 1), + ("reserved0", c_uint32, 27), + ("cycle_cnt_ovf", c_uint32, 1), + ] + _fields_ = [("bits", _bitfield), + ("word", c_uint32)] + def set_event_cnt_0_ovf(self, value): self.bits.event_cnt_0_ovf = value + def 
get_event_cnt_0_ovf(self): value = self.bits.event_cnt_0_ovf; return value + def set_event_cnt_1_ovf(self, value): self.bits.event_cnt_1_ovf = value + def get_event_cnt_1_ovf(self): value = self.bits.event_cnt_1_ovf; return value + def set_event_cnt_2_ovf(self, value): self.bits.event_cnt_2_ovf = value + def get_event_cnt_2_ovf(self): value = self.bits.event_cnt_2_ovf; return value + def set_event_cnt_3_ovf(self, value): self.bits.event_cnt_3_ovf = value + def get_event_cnt_3_ovf(self): value = self.bits.event_cnt_3_ovf; return value + def set_cycle_cnt_ovf(self, value): self.bits.cycle_cnt_ovf = value + def get_cycle_cnt_ovf(self): value = self.bits.cycle_cnt_ovf; return value + + +class pmintset_r(Union): + class _bitfield(Structure): + _fields_ = [ + ("event_cnt_0_int", c_uint32, 1), + ("event_cnt_1_int", c_uint32, 1), + ("event_cnt_2_int", c_uint32, 1), + ("event_cnt_3_int", c_uint32, 1), + ("reserved0", c_uint32, 27), + ("cycle_cnt_int", c_uint32, 1), + ] + _fields_ = [("bits", _bitfield), + ("word", c_uint32)] + def set_event_cnt_0_int(self, value): self.bits.event_cnt_0_int = value + def get_event_cnt_0_int(self): value = self.bits.event_cnt_0_int; return value + def set_event_cnt_1_int(self, value): self.bits.event_cnt_1_int = value + def get_event_cnt_1_int(self): value = self.bits.event_cnt_1_int; return value + def set_event_cnt_2_int(self, value): self.bits.event_cnt_2_int = value + def get_event_cnt_2_int(self): value = self.bits.event_cnt_2_int; return value + def set_event_cnt_3_int(self, value): self.bits.event_cnt_3_int = value + def get_event_cnt_3_int(self): value = self.bits.event_cnt_3_int; return value + def set_cycle_cnt_int(self, value): self.bits.cycle_cnt_int = value + def get_cycle_cnt_int(self): value = self.bits.cycle_cnt_int; return value + + +class pmintclr_r(Union): + class _bitfield(Structure): + _fields_ = [ + ("event_cnt_0_int", c_uint32, 1), + ("event_cnt_1_int", c_uint32, 1), + ("event_cnt_2_int", c_uint32, 1), + ("event_cnt_3_int", c_uint32, 1), + ("reserved0", c_uint32, 27), + ("cycle_cnt_int", c_uint32, 1), + ] + _fields_ = [("bits", _bitfield), + ("word", c_uint32)] + def set_event_cnt_0_int(self, value): self.bits.event_cnt_0_int = value + def get_event_cnt_0_int(self): value = self.bits.event_cnt_0_int; return value + def set_event_cnt_1_int(self, value): self.bits.event_cnt_1_int = value + def get_event_cnt_1_int(self): value = self.bits.event_cnt_1_int; return value + def set_event_cnt_2_int(self, value): self.bits.event_cnt_2_int = value + def get_event_cnt_2_int(self): value = self.bits.event_cnt_2_int; return value + def set_event_cnt_3_int(self, value): self.bits.event_cnt_3_int = value + def get_event_cnt_3_int(self): value = self.bits.event_cnt_3_int; return value + def set_cycle_cnt_int(self, value): self.bits.cycle_cnt_int = value + def get_cycle_cnt_int(self): value = self.bits.cycle_cnt_int; return value + + +class pmccntr_lo_r(Union): + class _bitfield(Structure): + _fields_ = [ + ("cycle_cnt_lo", c_uint32, 32), + ] + _fields_ = [("bits", _bitfield), + ("word", c_uint32)] + def set_cycle_cnt_lo(self, value): self.bits.cycle_cnt_lo = value + def get_cycle_cnt_lo(self): value = self.bits.cycle_cnt_lo; return value + + +class pmccntr_hi_r(Union): + class _bitfield(Structure): + _fields_ = [ + ("cycle_cnt_hi", c_uint32, 16), + ("reserved0", c_uint32, 16), + ] + _fields_ = [("bits", _bitfield), + ("word", c_uint32)] + def set_cycle_cnt_hi(self, value): self.bits.cycle_cnt_hi = value + def get_cycle_cnt_hi(self): value = 
self.bits.cycle_cnt_hi; return value + + +class pmccntr_cfg_r(Union): + class _bitfield(Structure): + _fields_ = [ + ("cycle_cnt_cfg_start", c_uint32, 10), + ("reserved0", c_uint32, 6), + ("cycle_cnt_cfg_stop", c_uint32, 10), + ("reserved1", c_uint32, 6), + ] + _fields_ = [("bits", _bitfield), + ("word", c_uint32)] + def set_cycle_cnt_cfg_start(self, value): self.bits.cycle_cnt_cfg_start = value + def get_cycle_cnt_cfg_start(self): value = self.bits.cycle_cnt_cfg_start; return value + def set_cycle_cnt_cfg_stop(self, value): self.bits.cycle_cnt_cfg_stop = value + def get_cycle_cnt_cfg_stop(self): value = self.bits.cycle_cnt_cfg_stop; return value + + +class pmcaxi_chan_r(Union): + class _bitfield(Structure): + _fields_ = [ + ("axi_chan", c_uint32, 4), + ("reserved0", c_uint32, 3), + ("rw", c_uint32, 1), + ("axi_cnt", c_uint32, 2), + ("reserved1", c_uint32, 22), + ] + _fields_ = [("bits", _bitfield), + ("word", c_uint32)] + def set_axi_chan(self, value): self.bits.axi_chan = value + def get_axi_chan(self): value = self.bits.axi_chan; return value + def set_rw(self, value): self.bits.rw = value + def get_rw(self): value = self.bits.rw; return value + def set_axi_cnt(self, value): self.bits.axi_cnt = value + def get_axi_cnt(self): value = self.bits.axi_cnt; return value + + +class pmevtyper0_r(Union): + class _bitfield(Structure): + _fields_ = [ + ("ev_type", c_uint32, 10), + ("reserved0", c_uint32, 22), + ] + _fields_ = [("bits", _bitfield), + ("word", c_uint32)] + def set_ev_type(self, value): self.bits.ev_type = value + def get_ev_type(self): value = self.bits.ev_type; return value + + +class pmevtyper1_r(Union): + class _bitfield(Structure): + _fields_ = [ + ("ev_type", c_uint32, 10), + ("reserved0", c_uint32, 22), + ] + _fields_ = [("bits", _bitfield), + ("word", c_uint32)] + def set_ev_type(self, value): self.bits.ev_type = value + def get_ev_type(self): value = self.bits.ev_type; return value + + +class pmevtyper2_r(Union): + class _bitfield(Structure): + _fields_ = [ + ("ev_type", c_uint32, 10), + ("reserved0", c_uint32, 22), + ] + _fields_ = [("bits", _bitfield), + ("word", c_uint32)] + def set_ev_type(self, value): self.bits.ev_type = value + def get_ev_type(self): value = self.bits.ev_type; return value + + +class pmevtyper3_r(Union): + class _bitfield(Structure): + _fields_ = [ + ("ev_type", c_uint32, 10), + ("reserved0", c_uint32, 22), + ] + _fields_ = [("bits", _bitfield), + ("word", c_uint32)] + def set_ev_type(self, value): self.bits.ev_type = value + def get_ev_type(self): value = self.bits.ev_type; return value + +class command_no_payload_t(Structure): + _fields_ = [ + ("cmd_code", c_uint32, 10), + ("must_be_zero0", c_uint32, 6), + ("param", c_uint32, 16), + ] + def valid(self): return must_be_zero0==0; + def get_cmd_code(self): return cmd_code + def set_cmd_code(self, value): cmd_code = value + def get_param(self): return param + def set_param(self, value): param = value + +class command_with_payload_t(Structure): + _fields_ = [ + ("cmd_code", c_uint32, 10), + ("must_be_zero", c_uint32, 4), + ("payload_size", c_uint32, 2), + ("param", c_uint32, 16), + ("data", c_uint32, 32), + ] + def valid(self): return must_be_zero==0 and payload_size>=1 and payload_size<=2; + def get_cmd_code(self): return cmd_code + def set_cmd_code(self, value): cmd_code = value + def get_data(self): return data + def set_data(self, value): data = value + def get_param(self): return param + def set_param(self, value): param = value + def get_payload_size(self): return payload_size + def 
set_payload_size(self, value): payload_size = value + +class npu_op_stop_t(Structure): + _fields_ = [ + ("cmd_code", c_uint32, 10), + ("must_be_zero0", c_uint32, 6), + ("mask", c_uint32, 16), + ] + def valid(self): return cmd_code==cmd0.NPU_OP_STOP and must_be_zero0==0; + def get_cmd_code(self): return cmd_code + def set_cmd_code(self, value): cmd_code = value + def get_mask(self): return mask + def set_mask(self, value): mask = value + +class npu_op_irq_t(Structure): + _fields_ = [ + ("cmd_code", c_uint32, 10), + ("must_be_zero0", c_uint32, 6), + ("mask", c_uint32, 16), + ] + def valid(self): return cmd_code==cmd0.NPU_OP_IRQ and must_be_zero0==0; + def get_cmd_code(self): return cmd_code + def set_cmd_code(self, value): cmd_code = value + def get_mask(self): return mask + def set_mask(self, value): mask = value + +class npu_op_conv_t(Structure): + _fields_ = [ + ("cmd_code", c_uint32, 10), + ("must_be_zero0", c_uint32, 6), + ("reserved0", c_uint32, 16), + ] + def valid(self): return cmd_code==cmd0.NPU_OP_CONV and must_be_zero0==0; + def get_cmd_code(self): return cmd_code + def set_cmd_code(self, value): cmd_code = value + +class npu_op_depthwise_t(Structure): + _fields_ = [ + ("cmd_code", c_uint32, 10), + ("must_be_zero0", c_uint32, 6), + ("reserved0", c_uint32, 16), + ] + def valid(self): return cmd_code==cmd0.NPU_OP_DEPTHWISE and must_be_zero0==0; + def get_cmd_code(self): return cmd_code + def set_cmd_code(self, value): cmd_code = value + +class npu_op_pool_t(Structure): + _fields_ = [ + ("cmd_code", c_uint32, 10), + ("must_be_zero0", c_uint32, 6), + ("mode", c_uint32, 16), + ] + def valid(self): return cmd_code==cmd0.NPU_OP_POOL and must_be_zero0==0; + def get_cmd_code(self): return cmd_code + def set_cmd_code(self, value): cmd_code = value + def get_mode(self): return mode + def set_mode(self, value): mode = value + +class npu_op_elementwise_t(Structure): + _fields_ = [ + ("cmd_code", c_uint32, 10), + ("must_be_zero0", c_uint32, 6), + ("mode", c_uint32, 16), + ] + def valid(self): return cmd_code==cmd0.NPU_OP_ELEMENTWISE and must_be_zero0==0; + def get_cmd_code(self): return cmd_code + def set_cmd_code(self, value): cmd_code = value + def get_mode(self): return mode + def set_mode(self, value): mode = value + +class npu_op_dma_start_t(Structure): + _fields_ = [ + ("cmd_code", c_uint32, 10), + ("must_be_zero0", c_uint32, 6), + ("channel_mode", c_uint32, 16), + ] + def valid(self): return cmd_code==cmd0.NPU_OP_DMA_START and must_be_zero0==0; + def get_channel_mode(self): return channel_mode + def set_channel_mode(self, value): channel_mode = value + def get_cmd_code(self): return cmd_code + def set_cmd_code(self, value): cmd_code = value + +class npu_op_dma_wait_t(Structure): + _fields_ = [ + ("cmd_code", c_uint32, 10), + ("must_be_zero0", c_uint32, 6), + ("reserved0", c_uint32, 16), + ] + def valid(self): return cmd_code==cmd0.NPU_OP_DMA_WAIT and must_be_zero0==0; + def get_cmd_code(self): return cmd_code + def set_cmd_code(self, value): cmd_code = value + +class npu_op_kernel_wait_t(Structure): + _fields_ = [ + ("cmd_code", c_uint32, 10), + ("must_be_zero0", c_uint32, 6), + ("param", c_uint32, 16), + ] + def valid(self): return cmd_code==cmd0.NPU_OP_KERNEL_WAIT and must_be_zero0==0; + def get_cmd_code(self): return cmd_code + def set_cmd_code(self, value): cmd_code = value + def get_param(self): return param + def set_param(self, value): param = value + +class npu_op_pmu_mask_t(Structure): + _fields_ = [ + ("cmd_code", c_uint32, 10), + ("must_be_zero0", c_uint32, 6), + ("param", 
c_uint32, 16), + ] + def valid(self): return cmd_code==cmd0.NPU_OP_PMU_MASK and must_be_zero0==0; + def get_cmd_code(self): return cmd_code + def set_cmd_code(self, value): cmd_code = value + def get_param(self): return param + def set_param(self, value): param = value + +class npu_set_ifm_pad_top_t(Structure): + _fields_ = [ + ("cmd_code", c_uint32, 10), + ("must_be_zero0", c_uint32, 6), + ("param", c_uint32, 16), + ] + def valid(self): return cmd_code==cmd0.NPU_SET_IFM_PAD_TOP and must_be_zero0==0; + def get_cmd_code(self): return cmd_code + def set_cmd_code(self, value): cmd_code = value + def get_param(self): return param + def set_param(self, value): param = value + +class npu_set_ifm_pad_left_t(Structure): + _fields_ = [ + ("cmd_code", c_uint32, 10), + ("must_be_zero0", c_uint32, 6), + ("param", c_uint32, 16), + ] + def valid(self): return cmd_code==cmd0.NPU_SET_IFM_PAD_LEFT and must_be_zero0==0; + def get_cmd_code(self): return cmd_code + def set_cmd_code(self, value): cmd_code = value + def get_param(self): return param + def set_param(self, value): param = value + +class npu_set_ifm_pad_right_t(Structure): + _fields_ = [ + ("cmd_code", c_uint32, 10), + ("must_be_zero0", c_uint32, 6), + ("param", c_uint32, 16), + ] + def valid(self): return cmd_code==cmd0.NPU_SET_IFM_PAD_RIGHT and must_be_zero0==0; + def get_cmd_code(self): return cmd_code + def set_cmd_code(self, value): cmd_code = value + def get_param(self): return param + def set_param(self, value): param = value + +class npu_set_ifm_pad_bottom_t(Structure): + _fields_ = [ + ("cmd_code", c_uint32, 10), + ("must_be_zero0", c_uint32, 6), + ("param", c_uint32, 16), + ] + def valid(self): return cmd_code==cmd0.NPU_SET_IFM_PAD_BOTTOM and must_be_zero0==0; + def get_cmd_code(self): return cmd_code + def set_cmd_code(self, value): cmd_code = value + def get_param(self): return param + def set_param(self, value): param = value + +class npu_set_ifm_depth_m1_t(Structure): + _fields_ = [ + ("cmd_code", c_uint32, 10), + ("must_be_zero0", c_uint32, 6), + ("param", c_uint32, 16), + ] + def valid(self): return cmd_code==cmd0.NPU_SET_IFM_DEPTH_M1 and must_be_zero0==0; + def get_cmd_code(self): return cmd_code + def set_cmd_code(self, value): cmd_code = value + def get_param(self): return param + def set_param(self, value): param = value + +class npu_set_ifm_precision_t(Structure): + _fields_ = [ + ("cmd_code", c_uint32, 10), + ("must_be_zero0", c_uint32, 6), + ("param", c_uint32, 4), + ("reserved0", c_uint32, 2), + ("format", c_uint32, 2), + ("scale_mode", c_uint32, 2), + ("reserved1", c_uint32, 4), + ("round_mode", c_uint32, 2), + ] + def valid(self): return cmd_code==cmd0.NPU_SET_IFM_PRECISION and must_be_zero0==0; + def get_cmd_code(self): return cmd_code + def set_cmd_code(self, value): cmd_code = value + def get_format(self): return format + def set_format(self, value): format = value + def get_param(self): return param + def set_param(self, value): param = value + def get_round_mode(self): return round_mode + def set_round_mode(self, value): round_mode = value + def get_scale_mode(self): return scale_mode + def set_scale_mode(self, value): scale_mode = value + +class npu_set_ifm_upscale_t(Structure): + _fields_ = [ + ("cmd_code", c_uint32, 10), + ("must_be_zero0", c_uint32, 6), + ("mode", c_uint32, 2), + ("reserved0", c_uint32, 14), + ] + def valid(self): return cmd_code==cmd0.NPU_SET_IFM_UPSCALE and must_be_zero0==0; + def get_cmd_code(self): return cmd_code + def set_cmd_code(self, value): cmd_code = value + def get_mode(self): return 
mode + def set_mode(self, value): mode = value + +class npu_set_ifm_zero_point_t(Structure): + _fields_ = [ + ("cmd_code", c_uint32, 10), + ("must_be_zero0", c_uint32, 6), + ("param", c_uint32, 16), + ] + def valid(self): return cmd_code==cmd0.NPU_SET_IFM_ZERO_POINT and must_be_zero0==0; + def get_cmd_code(self): return cmd_code + def set_cmd_code(self, value): cmd_code = value + def get_param(self): return param + def set_param(self, value): param = value + +class npu_set_ifm_width0_m1_t(Structure): + _fields_ = [ + ("cmd_code", c_uint32, 10), + ("must_be_zero0", c_uint32, 6), + ("param", c_uint32, 16), + ] + def valid(self): return cmd_code==cmd0.NPU_SET_IFM_WIDTH0_M1 and must_be_zero0==0; + def get_cmd_code(self): return cmd_code + def set_cmd_code(self, value): cmd_code = value + def get_param(self): return param + def set_param(self, value): param = value + +class npu_set_ifm_height0_m1_t(Structure): + _fields_ = [ + ("cmd_code", c_uint32, 10), + ("must_be_zero0", c_uint32, 6), + ("param", c_uint32, 16), + ] + def valid(self): return cmd_code==cmd0.NPU_SET_IFM_HEIGHT0_M1 and must_be_zero0==0; + def get_cmd_code(self): return cmd_code + def set_cmd_code(self, value): cmd_code = value + def get_param(self): return param + def set_param(self, value): param = value + +class npu_set_ifm_height1_m1_t(Structure): + _fields_ = [ + ("cmd_code", c_uint32, 10), + ("must_be_zero0", c_uint32, 6), + ("param", c_uint32, 16), + ] + def valid(self): return cmd_code==cmd0.NPU_SET_IFM_HEIGHT1_M1 and must_be_zero0==0; + def get_cmd_code(self): return cmd_code + def set_cmd_code(self, value): cmd_code = value + def get_param(self): return param + def set_param(self, value): param = value + +class npu_set_ifm_ib_end_t(Structure): + _fields_ = [ + ("cmd_code", c_uint32, 10), + ("must_be_zero0", c_uint32, 6), + ("param", c_uint32, 16), + ] + def valid(self): return cmd_code==cmd0.NPU_SET_IFM_IB_END and must_be_zero0==0; + def get_cmd_code(self): return cmd_code + def set_cmd_code(self, value): cmd_code = value + def get_param(self): return param + def set_param(self, value): param = value + +class npu_set_ifm_region_t(Structure): + _fields_ = [ + ("cmd_code", c_uint32, 10), + ("must_be_zero0", c_uint32, 6), + ("param", c_uint32, 16), + ] + def valid(self): return cmd_code==cmd0.NPU_SET_IFM_REGION and must_be_zero0==0; + def get_cmd_code(self): return cmd_code + def set_cmd_code(self, value): cmd_code = value + def get_param(self): return param + def set_param(self, value): param = value + +class npu_set_ofm_width_m1_t(Structure): + _fields_ = [ + ("cmd_code", c_uint32, 10), + ("must_be_zero0", c_uint32, 6), + ("param", c_uint32, 16), + ] + def valid(self): return cmd_code==cmd0.NPU_SET_OFM_WIDTH_M1 and must_be_zero0==0; + def get_cmd_code(self): return cmd_code + def set_cmd_code(self, value): cmd_code = value + def get_param(self): return param + def set_param(self, value): param = value + +class npu_set_ofm_height_m1_t(Structure): + _fields_ = [ + ("cmd_code", c_uint32, 10), + ("must_be_zero0", c_uint32, 6), + ("param", c_uint32, 16), + ] + def valid(self): return cmd_code==cmd0.NPU_SET_OFM_HEIGHT_M1 and must_be_zero0==0; + def get_cmd_code(self): return cmd_code + def set_cmd_code(self, value): cmd_code = value + def get_param(self): return param + def set_param(self, value): param = value + +class npu_set_ofm_depth_m1_t(Structure): + _fields_ = [ + ("cmd_code", c_uint32, 10), + ("must_be_zero0", c_uint32, 6), + ("param", c_uint32, 16), + ] + def valid(self): return cmd_code==cmd0.NPU_SET_OFM_DEPTH_M1 and 
must_be_zero0==0; + def get_cmd_code(self): return cmd_code + def set_cmd_code(self, value): cmd_code = value + def get_param(self): return param + def set_param(self, value): param = value + +class npu_set_ofm_precision_t(Structure): + _fields_ = [ + ("cmd_code", c_uint32, 10), + ("must_be_zero0", c_uint32, 6), + ("precision", c_uint32, 3), + ("reserved0", c_uint32, 3), + ("format", c_uint32, 2), + ("scaling", c_uint32, 1), + ("reserved1", c_uint32, 5), + ("rounding", c_uint32, 2), + ] + def valid(self): return cmd_code==cmd0.NPU_SET_OFM_PRECISION and must_be_zero0==0; + def get_cmd_code(self): return cmd_code + def set_cmd_code(self, value): cmd_code = value + def get_format(self): return format + def set_format(self, value): format = value + def get_precision(self): return precision + def set_precision(self, value): precision = value + def get_rounding(self): return rounding + def set_rounding(self, value): rounding = value + def get_scaling(self): return scaling + def set_scaling(self, value): scaling = value + +class npu_set_ofm_blk_width_m1_t(Structure): + _fields_ = [ + ("cmd_code", c_uint32, 10), + ("must_be_zero0", c_uint32, 6), + ("param", c_uint32, 16), + ] + def valid(self): return cmd_code==cmd0.NPU_SET_OFM_BLK_WIDTH_M1 and must_be_zero0==0; + def get_cmd_code(self): return cmd_code + def set_cmd_code(self, value): cmd_code = value + def get_param(self): return param + def set_param(self, value): param = value + +class npu_set_ofm_blk_height_m1_t(Structure): + _fields_ = [ + ("cmd_code", c_uint32, 10), + ("must_be_zero0", c_uint32, 6), + ("param", c_uint32, 16), + ] + def valid(self): return cmd_code==cmd0.NPU_SET_OFM_BLK_HEIGHT_M1 and must_be_zero0==0; + def get_cmd_code(self): return cmd_code + def set_cmd_code(self, value): cmd_code = value + def get_param(self): return param + def set_param(self, value): param = value + +class npu_set_ofm_blk_depth_m1_t(Structure): + _fields_ = [ + ("cmd_code", c_uint32, 10), + ("must_be_zero0", c_uint32, 6), + ("param", c_uint32, 16), + ] + def valid(self): return cmd_code==cmd0.NPU_SET_OFM_BLK_DEPTH_M1 and must_be_zero0==0; + def get_cmd_code(self): return cmd_code + def set_cmd_code(self, value): cmd_code = value + def get_param(self): return param + def set_param(self, value): param = value + +class npu_set_ofm_zero_point_t(Structure): + _fields_ = [ + ("cmd_code", c_uint32, 10), + ("must_be_zero0", c_uint32, 6), + ("param", c_uint32, 16), + ] + def valid(self): return cmd_code==cmd0.NPU_SET_OFM_ZERO_POINT and must_be_zero0==0; + def get_cmd_code(self): return cmd_code + def set_cmd_code(self, value): cmd_code = value + def get_param(self): return param + def set_param(self, value): param = value + +class npu_set_ofm_width0_m1_t(Structure): + _fields_ = [ + ("cmd_code", c_uint32, 10), + ("must_be_zero0", c_uint32, 6), + ("param", c_uint32, 16), + ] + def valid(self): return cmd_code==cmd0.NPU_SET_OFM_WIDTH0_M1 and must_be_zero0==0; + def get_cmd_code(self): return cmd_code + def set_cmd_code(self, value): cmd_code = value + def get_param(self): return param + def set_param(self, value): param = value + +class npu_set_ofm_height0_m1_t(Structure): + _fields_ = [ + ("cmd_code", c_uint32, 10), + ("must_be_zero0", c_uint32, 6), + ("param", c_uint32, 16), + ] + def valid(self): return cmd_code==cmd0.NPU_SET_OFM_HEIGHT0_M1 and must_be_zero0==0; + def get_cmd_code(self): return cmd_code + def set_cmd_code(self, value): cmd_code = value + def get_param(self): return param + def set_param(self, value): param = value + +class 
npu_set_ofm_height1_m1_t(Structure): + _fields_ = [ + ("cmd_code", c_uint32, 10), + ("must_be_zero0", c_uint32, 6), + ("param", c_uint32, 16), + ] + def valid(self): return cmd_code==cmd0.NPU_SET_OFM_HEIGHT1_M1 and must_be_zero0==0; + def get_cmd_code(self): return cmd_code + def set_cmd_code(self, value): cmd_code = value + def get_param(self): return param + def set_param(self, value): param = value + +class npu_set_ofm_region_t(Structure): + _fields_ = [ + ("cmd_code", c_uint32, 10), + ("must_be_zero0", c_uint32, 6), + ("param", c_uint32, 16), + ] + def valid(self): return cmd_code==cmd0.NPU_SET_OFM_REGION and must_be_zero0==0; + def get_cmd_code(self): return cmd_code + def set_cmd_code(self, value): cmd_code = value + def get_param(self): return param + def set_param(self, value): param = value + +class npu_set_kernel_width_m1_t(Structure): + _fields_ = [ + ("cmd_code", c_uint32, 10), + ("must_be_zero0", c_uint32, 6), + ("param", c_uint32, 16), + ] + def valid(self): return cmd_code==cmd0.NPU_SET_KERNEL_WIDTH_M1 and must_be_zero0==0; + def get_cmd_code(self): return cmd_code + def set_cmd_code(self, value): cmd_code = value + def get_param(self): return param + def set_param(self, value): param = value + +class npu_set_kernel_height_m1_t(Structure): + _fields_ = [ + ("cmd_code", c_uint32, 10), + ("must_be_zero0", c_uint32, 6), + ("param", c_uint32, 16), + ] + def valid(self): return cmd_code==cmd0.NPU_SET_KERNEL_HEIGHT_M1 and must_be_zero0==0; + def get_cmd_code(self): return cmd_code + def set_cmd_code(self, value): cmd_code = value + def get_param(self): return param + def set_param(self, value): param = value + +class npu_set_kernel_stride_t(Structure): + _fields_ = [ + ("cmd_code", c_uint32, 10), + ("must_be_zero0", c_uint32, 6), + ("param", c_uint32, 16), + ] + def valid(self): return cmd_code==cmd0.NPU_SET_KERNEL_STRIDE and must_be_zero0==0; + def get_cmd_code(self): return cmd_code + def set_cmd_code(self, value): cmd_code = value + def get_param(self): return param + def set_param(self, value): param = value + +class npu_set_parallel_mode_t(Structure): + _fields_ = [ + ("cmd_code", c_uint32, 10), + ("must_be_zero0", c_uint32, 6), + ("param", c_uint32, 16), + ] + def valid(self): return cmd_code==cmd0.NPU_SET_PARALLEL_MODE and must_be_zero0==0; + def get_cmd_code(self): return cmd_code + def set_cmd_code(self, value): cmd_code = value + def get_param(self): return param + def set_param(self, value): param = value + +class npu_set_acc_format_t(Structure): + _fields_ = [ + ("cmd_code", c_uint32, 10), + ("must_be_zero0", c_uint32, 6), + ("param", c_uint32, 16), + ] + def valid(self): return cmd_code==cmd0.NPU_SET_ACC_FORMAT and must_be_zero0==0; + def get_cmd_code(self): return cmd_code + def set_cmd_code(self, value): cmd_code = value + def get_param(self): return param + def set_param(self, value): param = value + +class npu_set_activation_t(Structure): + _fields_ = [ + ("cmd_code", c_uint32, 10), + ("must_be_zero0", c_uint32, 6), + ("type", c_uint32, 12), + ("act_clip_range", c_uint32, 4), + ] + def valid(self): return cmd_code==cmd0.NPU_SET_ACTIVATION and must_be_zero0==0; + def get_act_clip_range(self): return act_clip_range + def set_act_clip_range(self, value): act_clip_range = value + def get_cmd_code(self): return cmd_code + def set_cmd_code(self, value): cmd_code = value + def get_type(self): return type + def set_type(self, value): type = value + +class npu_set_activation_min_t(Structure): + _fields_ = [ + ("cmd_code", c_uint32, 10), + ("must_be_zero0", c_uint32, 6), + 
("param", c_uint32, 16), + ] + def valid(self): return cmd_code==cmd0.NPU_SET_ACTIVATION_MIN and must_be_zero0==0; + def get_cmd_code(self): return cmd_code + def set_cmd_code(self, value): cmd_code = value + def get_param(self): return param + def set_param(self, value): param = value + +class npu_set_activation_max_t(Structure): + _fields_ = [ + ("cmd_code", c_uint32, 10), + ("must_be_zero0", c_uint32, 6), + ("param", c_uint32, 16), + ] + def valid(self): return cmd_code==cmd0.NPU_SET_ACTIVATION_MAX and must_be_zero0==0; + def get_cmd_code(self): return cmd_code + def set_cmd_code(self, value): cmd_code = value + def get_param(self): return param + def set_param(self, value): param = value + +class npu_set_weight_region_t(Structure): + _fields_ = [ + ("cmd_code", c_uint32, 10), + ("must_be_zero0", c_uint32, 6), + ("param", c_uint32, 16), + ] + def valid(self): return cmd_code==cmd0.NPU_SET_WEIGHT_REGION and must_be_zero0==0; + def get_cmd_code(self): return cmd_code + def set_cmd_code(self, value): cmd_code = value + def get_param(self): return param + def set_param(self, value): param = value + +class npu_set_scale_region_t(Structure): + _fields_ = [ + ("cmd_code", c_uint32, 10), + ("must_be_zero0", c_uint32, 6), + ("param", c_uint32, 16), + ] + def valid(self): return cmd_code==cmd0.NPU_SET_SCALE_REGION and must_be_zero0==0; + def get_cmd_code(self): return cmd_code + def set_cmd_code(self, value): cmd_code = value + def get_param(self): return param + def set_param(self, value): param = value + +class npu_set_ab_start_t(Structure): + _fields_ = [ + ("cmd_code", c_uint32, 10), + ("must_be_zero0", c_uint32, 6), + ("param", c_uint32, 16), + ] + def valid(self): return cmd_code==cmd0.NPU_SET_AB_START and must_be_zero0==0; + def get_cmd_code(self): return cmd_code + def set_cmd_code(self, value): cmd_code = value + def get_param(self): return param + def set_param(self, value): param = value + +class npu_set_blockdep_t(Structure): + _fields_ = [ + ("cmd_code", c_uint32, 10), + ("must_be_zero0", c_uint32, 6), + ("param", c_uint32, 16), + ] + def valid(self): return cmd_code==cmd0.NPU_SET_BLOCKDEP and must_be_zero0==0; + def get_cmd_code(self): return cmd_code + def set_cmd_code(self, value): cmd_code = value + def get_param(self): return param + def set_param(self, value): param = value + +class npu_set_dma0_src_region_t(Structure): + _fields_ = [ + ("cmd_code", c_uint32, 10), + ("must_be_zero0", c_uint32, 6), + ("region", c_uint32, 8), + ("internal", c_uint32, 1), + ("stride_mode", c_uint32, 2), + ("reserved0", c_uint32, 5), + ] + def valid(self): return cmd_code==cmd0.NPU_SET_DMA0_SRC_REGION and must_be_zero0==0; + def get_cmd_code(self): return cmd_code + def set_cmd_code(self, value): cmd_code = value + def get_internal(self): return internal + def set_internal(self, value): internal = value + def get_region(self): return region + def set_region(self, value): region = value + def get_stride_mode(self): return stride_mode + def set_stride_mode(self, value): stride_mode = value + +class npu_set_dma0_dst_region_t(Structure): + _fields_ = [ + ("cmd_code", c_uint32, 10), + ("must_be_zero0", c_uint32, 6), + ("region", c_uint32, 8), + ("internal", c_uint32, 1), + ("stride_mode", c_uint32, 2), + ("reserved0", c_uint32, 5), + ] + def valid(self): return cmd_code==cmd0.NPU_SET_DMA0_DST_REGION and must_be_zero0==0; + def get_cmd_code(self): return cmd_code + def set_cmd_code(self, value): cmd_code = value + def get_internal(self): return internal + def set_internal(self, value): internal = value + 
def get_region(self): return region + def set_region(self, value): region = value + def get_stride_mode(self): return stride_mode + def set_stride_mode(self, value): stride_mode = value + +class npu_set_dma0_size0_t(Structure): + _fields_ = [ + ("cmd_code", c_uint32, 10), + ("must_be_zero0", c_uint32, 6), + ("param", c_uint32, 16), + ] + def valid(self): return cmd_code==cmd0.NPU_SET_DMA0_SIZE0 and must_be_zero0==0; + def get_cmd_code(self): return cmd_code + def set_cmd_code(self, value): cmd_code = value + def get_param(self): return param + def set_param(self, value): param = value + +class npu_set_dma0_size1_t(Structure): + _fields_ = [ + ("cmd_code", c_uint32, 10), + ("must_be_zero0", c_uint32, 6), + ("param", c_uint32, 16), + ] + def valid(self): return cmd_code==cmd0.NPU_SET_DMA0_SIZE1 and must_be_zero0==0; + def get_cmd_code(self): return cmd_code + def set_cmd_code(self, value): cmd_code = value + def get_param(self): return param + def set_param(self, value): param = value + +class npu_set_ifm2_broadcast_t(Structure): + _fields_ = [ + ("cmd_code", c_uint32, 10), + ("must_be_zero0", c_uint32, 6), + ("broadcast_height", c_uint32, 1), + ("broadcast_width", c_uint32, 1), + ("broadcast_depth", c_uint32, 1), + ("reserved0", c_uint32, 3), + ("operand_order", c_uint32, 1), + ("broadcast_scalar", c_uint32, 1), + ("reserved1", c_uint32, 8), + ] + def valid(self): return cmd_code==cmd0.NPU_SET_IFM2_BROADCAST and must_be_zero0==0; + def get_broadcast_depth(self): return broadcast_depth + def set_broadcast_depth(self, value): broadcast_depth = value + def get_broadcast_height(self): return broadcast_height + def set_broadcast_height(self, value): broadcast_height = value + def get_broadcast_scalar(self): return broadcast_scalar + def set_broadcast_scalar(self, value): broadcast_scalar = value + def get_broadcast_width(self): return broadcast_width + def set_broadcast_width(self, value): broadcast_width = value + def get_cmd_code(self): return cmd_code + def set_cmd_code(self, value): cmd_code = value + def get_operand_order(self): return operand_order + def set_operand_order(self, value): operand_order = value + +class npu_set_ifm2_scalar_t(Structure): + _fields_ = [ + ("cmd_code", c_uint32, 10), + ("must_be_zero0", c_uint32, 6), + ("param", c_uint32, 16), + ] + def valid(self): return cmd_code==cmd0.NPU_SET_IFM2_SCALAR and must_be_zero0==0; + def get_cmd_code(self): return cmd_code + def set_cmd_code(self, value): cmd_code = value + def get_param(self): return param + def set_param(self, value): param = value + +class npu_set_ifm2_precision_t(Structure): + _fields_ = [ + ("cmd_code", c_uint32, 10), + ("must_be_zero0", c_uint32, 6), + ("param", c_uint32, 4), + ("reserved0", c_uint32, 2), + ("format", c_uint32, 2), + ("reserved1", c_uint32, 8), + ] + def valid(self): return cmd_code==cmd0.NPU_SET_IFM2_PRECISION and must_be_zero0==0; + def get_cmd_code(self): return cmd_code + def set_cmd_code(self, value): cmd_code = value + def get_format(self): return format + def set_format(self, value): format = value + def get_param(self): return param + def set_param(self, value): param = value + +class npu_set_ifm2_zero_point_t(Structure): + _fields_ = [ + ("cmd_code", c_uint32, 10), + ("must_be_zero0", c_uint32, 6), + ("param", c_uint32, 16), + ] + def valid(self): return cmd_code==cmd0.NPU_SET_IFM2_ZERO_POINT and must_be_zero0==0; + def get_cmd_code(self): return cmd_code + def set_cmd_code(self, value): cmd_code = value + def get_param(self): return param + def set_param(self, value): param = value + 
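The generated register and command classes above all follow one ctypes pattern: a Union that overlays a bit-field Structure ("bits") on the raw 32-bit value ("word"), so a register read can be decoded field by field and a modified value written back as a single word. A minimal, self-contained sketch of that pattern follows; the class and field names are hypothetical stand-ins, not part of the generated file, and it assumes the usual LSB-first bit-field allocation that the generated layouts rely on. (The generated get_*/set_* helpers refer to their field names without self., so in practice fields are accessed through .bits.<field> and .word, as shown here.)

    from ctypes import Union, Structure, c_uint32

    class example_reg_r(Union):
        # Hypothetical register mirroring the generated layout: a bit-field
        # view ("bits") overlaid on the raw 32-bit register value ("word").
        class _bitfield(Structure):
            _fields_ = [
                ("enable", c_uint32, 1),      # bit 0
                ("mode", c_uint32, 3),        # bits 3:1
                ("reserved0", c_uint32, 28),  # remaining bits unused
            ]
        _fields_ = [("bits", _bitfield), ("word", c_uint32)]

    reg = example_reg_r()
    reg.word = 0x0000000B        # raw value as read from the register
    print(reg.bits.enable)       # 1  (bit 0)
    print(reg.bits.mode)         # 5  (bits 3:1)
    reg.bits.mode = 2            # change a single field
    print(hex(reg.word))         # 0x5: write-back value with only 'mode' changed
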
+class npu_set_ifm2_width0_m1_t(Structure): + _fields_ = [ + ("cmd_code", c_uint32, 10), + ("must_be_zero0", c_uint32, 6), + ("param", c_uint32, 16), + ] + def valid(self): return cmd_code==cmd0.NPU_SET_IFM2_WIDTH0_M1 and must_be_zero0==0; + def get_cmd_code(self): return cmd_code + def set_cmd_code(self, value): cmd_code = value + def get_param(self): return param + def set_param(self, value): param = value + +class npu_set_ifm2_height0_m1_t(Structure): + _fields_ = [ + ("cmd_code", c_uint32, 10), + ("must_be_zero0", c_uint32, 6), + ("param", c_uint32, 16), + ] + def valid(self): return cmd_code==cmd0.NPU_SET_IFM2_HEIGHT0_M1 and must_be_zero0==0; + def get_cmd_code(self): return cmd_code + def set_cmd_code(self, value): cmd_code = value + def get_param(self): return param + def set_param(self, value): param = value + +class npu_set_ifm2_height1_m1_t(Structure): + _fields_ = [ + ("cmd_code", c_uint32, 10), + ("must_be_zero0", c_uint32, 6), + ("param", c_uint32, 16), + ] + def valid(self): return cmd_code==cmd0.NPU_SET_IFM2_HEIGHT1_M1 and must_be_zero0==0; + def get_cmd_code(self): return cmd_code + def set_cmd_code(self, value): cmd_code = value + def get_param(self): return param + def set_param(self, value): param = value + +class npu_set_ifm2_ib_start_t(Structure): + _fields_ = [ + ("cmd_code", c_uint32, 10), + ("must_be_zero0", c_uint32, 6), + ("param", c_uint32, 16), + ] + def valid(self): return cmd_code==cmd0.NPU_SET_IFM2_IB_START and must_be_zero0==0; + def get_cmd_code(self): return cmd_code + def set_cmd_code(self, value): cmd_code = value + def get_param(self): return param + def set_param(self, value): param = value + +class npu_set_ifm2_region_t(Structure): + _fields_ = [ + ("cmd_code", c_uint32, 10), + ("must_be_zero0", c_uint32, 6), + ("param", c_uint32, 16), + ] + def valid(self): return cmd_code==cmd0.NPU_SET_IFM2_REGION and must_be_zero0==0; + def get_cmd_code(self): return cmd_code + def set_cmd_code(self, value): cmd_code = value + def get_param(self): return param + def set_param(self, value): param = value + +class npu_set_ifm_base0_t(Structure): + _fields_ = [ + ("cmd_code", c_uint32, 10), + ("must_be_zero", c_uint32, 4), + ("payload_size", c_uint32, 2), + ("reserved0", c_uint32, 16), + ("data", c_uint32, 32), + ] + def valid(self): return cmd_code==cmd1.NPU_SET_IFM_BASE0 and must_be_zero==0 and payload_size>=1 and payload_size<=2; + def get_cmd_code(self): return cmd_code + def set_cmd_code(self, value): cmd_code = value + def get_data(self): return data + def set_data(self, value): data = value + def get_payload_size(self): return payload_size + def set_payload_size(self, value): payload_size = value + +class npu_set_ifm_base1_t(Structure): + _fields_ = [ + ("cmd_code", c_uint32, 10), + ("must_be_zero", c_uint32, 4), + ("payload_size", c_uint32, 2), + ("reserved0", c_uint32, 16), + ("data", c_uint32, 32), + ] + def valid(self): return cmd_code==cmd1.NPU_SET_IFM_BASE1 and must_be_zero==0 and payload_size>=1 and payload_size<=2; + def get_cmd_code(self): return cmd_code + def set_cmd_code(self, value): cmd_code = value + def get_data(self): return data + def set_data(self, value): data = value + def get_payload_size(self): return payload_size + def set_payload_size(self, value): payload_size = value + +class npu_set_ifm_base2_t(Structure): + _fields_ = [ + ("cmd_code", c_uint32, 10), + ("must_be_zero", c_uint32, 4), + ("payload_size", c_uint32, 2), + ("reserved0", c_uint32, 16), + ("data", c_uint32, 32), + ] + def valid(self): return cmd_code==cmd1.NPU_SET_IFM_BASE2 
and must_be_zero==0 and payload_size>=1 and payload_size<=2; + def get_cmd_code(self): return cmd_code + def set_cmd_code(self, value): cmd_code = value + def get_data(self): return data + def set_data(self, value): data = value + def get_payload_size(self): return payload_size + def set_payload_size(self, value): payload_size = value + +class npu_set_ifm_base3_t(Structure): + _fields_ = [ + ("cmd_code", c_uint32, 10), + ("must_be_zero", c_uint32, 4), + ("payload_size", c_uint32, 2), + ("reserved0", c_uint32, 16), + ("data", c_uint32, 32), + ] + def valid(self): return cmd_code==cmd1.NPU_SET_IFM_BASE3 and must_be_zero==0 and payload_size>=1 and payload_size<=2; + def get_cmd_code(self): return cmd_code + def set_cmd_code(self, value): cmd_code = value + def get_data(self): return data + def set_data(self, value): data = value + def get_payload_size(self): return payload_size + def set_payload_size(self, value): payload_size = value + +class npu_set_ifm_stride_x_t(Structure): + _fields_ = [ + ("cmd_code", c_uint32, 10), + ("must_be_zero", c_uint32, 4), + ("payload_size", c_uint32, 2), + ("reserved0", c_uint32, 16), + ("data", c_uint32, 32), + ] + def valid(self): return cmd_code==cmd1.NPU_SET_IFM_STRIDE_X and must_be_zero==0 and payload_size>=1 and payload_size<=2; + def get_cmd_code(self): return cmd_code + def set_cmd_code(self, value): cmd_code = value + def get_data(self): return data + def set_data(self, value): data = value + def get_payload_size(self): return payload_size + def set_payload_size(self, value): payload_size = value + +class npu_set_ifm_stride_y_t(Structure): + _fields_ = [ + ("cmd_code", c_uint32, 10), + ("must_be_zero", c_uint32, 4), + ("payload_size", c_uint32, 2), + ("reserved0", c_uint32, 16), + ("data", c_uint32, 32), + ] + def valid(self): return cmd_code==cmd1.NPU_SET_IFM_STRIDE_Y and must_be_zero==0 and payload_size>=1 and payload_size<=2; + def get_cmd_code(self): return cmd_code + def set_cmd_code(self, value): cmd_code = value + def get_data(self): return data + def set_data(self, value): data = value + def get_payload_size(self): return payload_size + def set_payload_size(self, value): payload_size = value + +class npu_set_ifm_stride_c_t(Structure): + _fields_ = [ + ("cmd_code", c_uint32, 10), + ("must_be_zero", c_uint32, 4), + ("payload_size", c_uint32, 2), + ("reserved0", c_uint32, 16), + ("data", c_uint32, 32), + ] + def valid(self): return cmd_code==cmd1.NPU_SET_IFM_STRIDE_C and must_be_zero==0 and payload_size>=1 and payload_size<=2; + def get_cmd_code(self): return cmd_code + def set_cmd_code(self, value): cmd_code = value + def get_data(self): return data + def set_data(self, value): data = value + def get_payload_size(self): return payload_size + def set_payload_size(self, value): payload_size = value + +class npu_set_ofm_base0_t(Structure): + _fields_ = [ + ("cmd_code", c_uint32, 10), + ("must_be_zero", c_uint32, 4), + ("payload_size", c_uint32, 2), + ("reserved0", c_uint32, 16), + ("data", c_uint32, 32), + ] + def valid(self): return cmd_code==cmd1.NPU_SET_OFM_BASE0 and must_be_zero==0 and payload_size>=1 and payload_size<=2; + def get_cmd_code(self): return cmd_code + def set_cmd_code(self, value): cmd_code = value + def get_data(self): return data + def set_data(self, value): data = value + def get_payload_size(self): return payload_size + def set_payload_size(self, value): payload_size = value + +class npu_set_ofm_base1_t(Structure): + _fields_ = [ + ("cmd_code", c_uint32, 10), + ("must_be_zero", c_uint32, 4), + ("payload_size", c_uint32, 2), + 
("reserved0", c_uint32, 16), + ("data", c_uint32, 32), + ] + def valid(self): return cmd_code==cmd1.NPU_SET_OFM_BASE1 and must_be_zero==0 and payload_size>=1 and payload_size<=2; + def get_cmd_code(self): return cmd_code + def set_cmd_code(self, value): cmd_code = value + def get_data(self): return data + def set_data(self, value): data = value + def get_payload_size(self): return payload_size + def set_payload_size(self, value): payload_size = value + +class npu_set_ofm_base2_t(Structure): + _fields_ = [ + ("cmd_code", c_uint32, 10), + ("must_be_zero", c_uint32, 4), + ("payload_size", c_uint32, 2), + ("reserved0", c_uint32, 16), + ("data", c_uint32, 32), + ] + def valid(self): return cmd_code==cmd1.NPU_SET_OFM_BASE2 and must_be_zero==0 and payload_size>=1 and payload_size<=2; + def get_cmd_code(self): return cmd_code + def set_cmd_code(self, value): cmd_code = value + def get_data(self): return data + def set_data(self, value): data = value + def get_payload_size(self): return payload_size + def set_payload_size(self, value): payload_size = value + +class npu_set_ofm_base3_t(Structure): + _fields_ = [ + ("cmd_code", c_uint32, 10), + ("must_be_zero", c_uint32, 4), + ("payload_size", c_uint32, 2), + ("reserved0", c_uint32, 16), + ("data", c_uint32, 32), + ] + def valid(self): return cmd_code==cmd1.NPU_SET_OFM_BASE3 and must_be_zero==0 and payload_size>=1 and payload_size<=2; + def get_cmd_code(self): return cmd_code + def set_cmd_code(self, value): cmd_code = value + def get_data(self): return data + def set_data(self, value): data = value + def get_payload_size(self): return payload_size + def set_payload_size(self, value): payload_size = value + +class npu_set_ofm_stride_x_t(Structure): + _fields_ = [ + ("cmd_code", c_uint32, 10), + ("must_be_zero", c_uint32, 4), + ("payload_size", c_uint32, 2), + ("reserved0", c_uint32, 16), + ("data", c_uint32, 32), + ] + def valid(self): return cmd_code==cmd1.NPU_SET_OFM_STRIDE_X and must_be_zero==0 and payload_size>=1 and payload_size<=2; + def get_cmd_code(self): return cmd_code + def set_cmd_code(self, value): cmd_code = value + def get_data(self): return data + def set_data(self, value): data = value + def get_payload_size(self): return payload_size + def set_payload_size(self, value): payload_size = value + +class npu_set_ofm_stride_y_t(Structure): + _fields_ = [ + ("cmd_code", c_uint32, 10), + ("must_be_zero", c_uint32, 4), + ("payload_size", c_uint32, 2), + ("reserved0", c_uint32, 16), + ("data", c_uint32, 32), + ] + def valid(self): return cmd_code==cmd1.NPU_SET_OFM_STRIDE_Y and must_be_zero==0 and payload_size>=1 and payload_size<=2; + def get_cmd_code(self): return cmd_code + def set_cmd_code(self, value): cmd_code = value + def get_data(self): return data + def set_data(self, value): data = value + def get_payload_size(self): return payload_size + def set_payload_size(self, value): payload_size = value + +class npu_set_ofm_stride_c_t(Structure): + _fields_ = [ + ("cmd_code", c_uint32, 10), + ("must_be_zero", c_uint32, 4), + ("payload_size", c_uint32, 2), + ("reserved0", c_uint32, 16), + ("data", c_uint32, 32), + ] + def valid(self): return cmd_code==cmd1.NPU_SET_OFM_STRIDE_C and must_be_zero==0 and payload_size>=1 and payload_size<=2; + def get_cmd_code(self): return cmd_code + def set_cmd_code(self, value): cmd_code = value + def get_data(self): return data + def set_data(self, value): data = value + def get_payload_size(self): return payload_size + def set_payload_size(self, value): payload_size = value + +class 
npu_set_weight_base_t(Structure): + _fields_ = [ + ("cmd_code", c_uint32, 10), + ("must_be_zero", c_uint32, 4), + ("payload_size", c_uint32, 2), + ("reserved0", c_uint32, 16), + ("data", c_uint32, 32), + ] + def valid(self): return cmd_code==cmd1.NPU_SET_WEIGHT_BASE and must_be_zero==0 and payload_size>=1 and payload_size<=2; + def get_cmd_code(self): return cmd_code + def set_cmd_code(self, value): cmd_code = value + def get_data(self): return data + def set_data(self, value): data = value + def get_payload_size(self): return payload_size + def set_payload_size(self, value): payload_size = value + +class npu_set_weight_length_t(Structure): + _fields_ = [ + ("cmd_code", c_uint32, 10), + ("must_be_zero", c_uint32, 4), + ("payload_size", c_uint32, 2), + ("reserved0", c_uint32, 16), + ("data", c_uint32, 32), + ] + def valid(self): return cmd_code==cmd1.NPU_SET_WEIGHT_LENGTH and must_be_zero==0 and payload_size>=1 and payload_size<=2; + def get_cmd_code(self): return cmd_code + def set_cmd_code(self, value): cmd_code = value + def get_data(self): return data + def set_data(self, value): data = value + def get_payload_size(self): return payload_size + def set_payload_size(self, value): payload_size = value + +class npu_set_scale_base_t(Structure): + _fields_ = [ + ("cmd_code", c_uint32, 10), + ("must_be_zero", c_uint32, 4), + ("payload_size", c_uint32, 2), + ("reserved0", c_uint32, 16), + ("data", c_uint32, 32), + ] + def valid(self): return cmd_code==cmd1.NPU_SET_SCALE_BASE and must_be_zero==0 and payload_size>=1 and payload_size<=2; + def get_cmd_code(self): return cmd_code + def set_cmd_code(self, value): cmd_code = value + def get_data(self): return data + def set_data(self, value): data = value + def get_payload_size(self): return payload_size + def set_payload_size(self, value): payload_size = value + +class npu_set_scale_length_t(Structure): + _fields_ = [ + ("cmd_code", c_uint32, 10), + ("must_be_zero", c_uint32, 4), + ("payload_size", c_uint32, 2), + ("reserved0", c_uint32, 16), + ("data", c_uint32, 32), + ] + def valid(self): return cmd_code==cmd1.NPU_SET_SCALE_LENGTH and must_be_zero==0 and payload_size>=1 and payload_size<=2; + def get_cmd_code(self): return cmd_code + def set_cmd_code(self, value): cmd_code = value + def get_data(self): return data + def set_data(self, value): data = value + def get_payload_size(self): return payload_size + def set_payload_size(self, value): payload_size = value + +class npu_set_ofm_scale_t(Structure): + _fields_ = [ + ("cmd_code", c_uint32, 10), + ("must_be_zero", c_uint32, 4), + ("payload_size", c_uint32, 2), + ("shift", c_uint32, 16), + ("data", c_uint32, 32), + ] + def valid(self): return cmd_code==cmd1.NPU_SET_OFM_SCALE and must_be_zero==0 and payload_size>=1 and payload_size<=2; + def get_cmd_code(self): return cmd_code + def set_cmd_code(self, value): cmd_code = value + def get_data(self): return data + def set_data(self, value): data = value + def get_payload_size(self): return payload_size + def set_payload_size(self, value): payload_size = value + def get_shift(self): return shift + def set_shift(self, value): shift = value + +class npu_set_opa_scale_t(Structure): + _fields_ = [ + ("cmd_code", c_uint32, 10), + ("must_be_zero", c_uint32, 4), + ("payload_size", c_uint32, 2), + ("shift", c_uint32, 16), + ("data", c_uint32, 32), + ] + def valid(self): return cmd_code==cmd1.NPU_SET_OPA_SCALE and must_be_zero==0 and payload_size>=1 and payload_size<=2; + def get_cmd_code(self): return cmd_code + def set_cmd_code(self, value): cmd_code = value + 
def get_data(self): return data + def set_data(self, value): data = value + def get_payload_size(self): return payload_size + def set_payload_size(self, value): payload_size = value + def get_shift(self): return shift + def set_shift(self, value): shift = value + +class npu_set_opb_scale_t(Structure): + _fields_ = [ + ("cmd_code", c_uint32, 10), + ("must_be_zero", c_uint32, 4), + ("payload_size", c_uint32, 2), + ("reserved0", c_uint32, 16), + ("data", c_uint32, 32), + ] + def valid(self): return cmd_code==cmd1.NPU_SET_OPB_SCALE and must_be_zero==0 and payload_size>=1 and payload_size<=2; + def get_cmd_code(self): return cmd_code + def set_cmd_code(self, value): cmd_code = value + def get_data(self): return data + def set_data(self, value): data = value + def get_payload_size(self): return payload_size + def set_payload_size(self, value): payload_size = value + +class npu_set_dma0_src_t(Structure): + _fields_ = [ + ("cmd_code", c_uint32, 10), + ("must_be_zero", c_uint32, 4), + ("payload_size", c_uint32, 2), + ("reserved0", c_uint32, 16), + ("data", c_uint32, 32), + ] + def valid(self): return cmd_code==cmd1.NPU_SET_DMA0_SRC and must_be_zero==0 and payload_size>=1 and payload_size<=2; + def get_cmd_code(self): return cmd_code + def set_cmd_code(self, value): cmd_code = value + def get_data(self): return data + def set_data(self, value): data = value + def get_payload_size(self): return payload_size + def set_payload_size(self, value): payload_size = value + +class npu_set_dma0_dst_t(Structure): + _fields_ = [ + ("cmd_code", c_uint32, 10), + ("must_be_zero", c_uint32, 4), + ("payload_size", c_uint32, 2), + ("reserved0", c_uint32, 16), + ("data", c_uint32, 32), + ] + def valid(self): return cmd_code==cmd1.NPU_SET_DMA0_DST and must_be_zero==0 and payload_size>=1 and payload_size<=2; + def get_cmd_code(self): return cmd_code + def set_cmd_code(self, value): cmd_code = value + def get_data(self): return data + def set_data(self, value): data = value + def get_payload_size(self): return payload_size + def set_payload_size(self, value): payload_size = value + +class npu_set_dma0_len_t(Structure): + _fields_ = [ + ("cmd_code", c_uint32, 10), + ("must_be_zero", c_uint32, 4), + ("payload_size", c_uint32, 2), + ("reserved0", c_uint32, 16), + ("data", c_uint32, 32), + ] + def valid(self): return cmd_code==cmd1.NPU_SET_DMA0_LEN and must_be_zero==0 and payload_size>=1 and payload_size<=2; + def get_cmd_code(self): return cmd_code + def set_cmd_code(self, value): cmd_code = value + def get_data(self): return data + def set_data(self, value): data = value + def get_payload_size(self): return payload_size + def set_payload_size(self, value): payload_size = value + +class npu_set_dma0_skip0_t(Structure): + _fields_ = [ + ("cmd_code", c_uint32, 10), + ("must_be_zero", c_uint32, 4), + ("payload_size", c_uint32, 2), + ("param", c_uint32, 16), + ("data", c_uint32, 32), + ] + def valid(self): return cmd_code==cmd1.NPU_SET_DMA0_SKIP0 and must_be_zero==0 and payload_size>=1 and payload_size<=2; + def get_cmd_code(self): return cmd_code + def set_cmd_code(self, value): cmd_code = value + def get_data(self): return data + def set_data(self, value): data = value + def get_param(self): return param + def set_param(self, value): param = value + def get_payload_size(self): return payload_size + def set_payload_size(self, value): payload_size = value + +class npu_set_dma0_skip1_t(Structure): + _fields_ = [ + ("cmd_code", c_uint32, 10), + ("must_be_zero", c_uint32, 4), + ("payload_size", c_uint32, 2), + ("param", c_uint32, 
16), + ("data", c_uint32, 32), + ] + def valid(self): return cmd_code==cmd1.NPU_SET_DMA0_SKIP1 and must_be_zero==0 and payload_size>=1 and payload_size<=2; + def get_cmd_code(self): return cmd_code + def set_cmd_code(self, value): cmd_code = value + def get_data(self): return data + def set_data(self, value): data = value + def get_param(self): return param + def set_param(self, value): param = value + def get_payload_size(self): return payload_size + def set_payload_size(self, value): payload_size = value + +class npu_set_ifm2_base0_t(Structure): + _fields_ = [ + ("cmd_code", c_uint32, 10), + ("must_be_zero", c_uint32, 4), + ("payload_size", c_uint32, 2), + ("reserved0", c_uint32, 16), + ("data", c_uint32, 32), + ] + def valid(self): return cmd_code==cmd1.NPU_SET_IFM2_BASE0 and must_be_zero==0 and payload_size>=1 and payload_size<=2; + def get_cmd_code(self): return cmd_code + def set_cmd_code(self, value): cmd_code = value + def get_data(self): return data + def set_data(self, value): data = value + def get_payload_size(self): return payload_size + def set_payload_size(self, value): payload_size = value + +class npu_set_ifm2_base1_t(Structure): + _fields_ = [ + ("cmd_code", c_uint32, 10), + ("must_be_zero", c_uint32, 4), + ("payload_size", c_uint32, 2), + ("reserved0", c_uint32, 16), + ("data", c_uint32, 32), + ] + def valid(self): return cmd_code==cmd1.NPU_SET_IFM2_BASE1 and must_be_zero==0 and payload_size>=1 and payload_size<=2; + def get_cmd_code(self): return cmd_code + def set_cmd_code(self, value): cmd_code = value + def get_data(self): return data + def set_data(self, value): data = value + def get_payload_size(self): return payload_size + def set_payload_size(self, value): payload_size = value + +class npu_set_ifm2_base2_t(Structure): + _fields_ = [ + ("cmd_code", c_uint32, 10), + ("must_be_zero", c_uint32, 4), + ("payload_size", c_uint32, 2), + ("reserved0", c_uint32, 16), + ("data", c_uint32, 32), + ] + def valid(self): return cmd_code==cmd1.NPU_SET_IFM2_BASE2 and must_be_zero==0 and payload_size>=1 and payload_size<=2; + def get_cmd_code(self): return cmd_code + def set_cmd_code(self, value): cmd_code = value + def get_data(self): return data + def set_data(self, value): data = value + def get_payload_size(self): return payload_size + def set_payload_size(self, value): payload_size = value + +class npu_set_ifm2_base3_t(Structure): + _fields_ = [ + ("cmd_code", c_uint32, 10), + ("must_be_zero", c_uint32, 4), + ("payload_size", c_uint32, 2), + ("reserved0", c_uint32, 16), + ("data", c_uint32, 32), + ] + def valid(self): return cmd_code==cmd1.NPU_SET_IFM2_BASE3 and must_be_zero==0 and payload_size>=1 and payload_size<=2; + def get_cmd_code(self): return cmd_code + def set_cmd_code(self, value): cmd_code = value + def get_data(self): return data + def set_data(self, value): data = value + def get_payload_size(self): return payload_size + def set_payload_size(self, value): payload_size = value + +class npu_set_ifm2_stride_x_t(Structure): + _fields_ = [ + ("cmd_code", c_uint32, 10), + ("must_be_zero", c_uint32, 4), + ("payload_size", c_uint32, 2), + ("reserved0", c_uint32, 16), + ("data", c_uint32, 32), + ] + def valid(self): return cmd_code==cmd1.NPU_SET_IFM2_STRIDE_X and must_be_zero==0 and payload_size>=1 and payload_size<=2; + def get_cmd_code(self): return cmd_code + def set_cmd_code(self, value): cmd_code = value + def get_data(self): return data + def set_data(self, value): data = value + def get_payload_size(self): return payload_size + def set_payload_size(self, value): 
payload_size = value + +class npu_set_ifm2_stride_y_t(Structure): + _fields_ = [ + ("cmd_code", c_uint32, 10), + ("must_be_zero", c_uint32, 4), + ("payload_size", c_uint32, 2), + ("reserved0", c_uint32, 16), + ("data", c_uint32, 32), + ] + def valid(self): return cmd_code==cmd1.NPU_SET_IFM2_STRIDE_Y and must_be_zero==0 and payload_size>=1 and payload_size<=2; + def get_cmd_code(self): return cmd_code + def set_cmd_code(self, value): cmd_code = value + def get_data(self): return data + def set_data(self, value): data = value + def get_payload_size(self): return payload_size + def set_payload_size(self, value): payload_size = value + +class npu_set_ifm2_stride_c_t(Structure): + _fields_ = [ + ("cmd_code", c_uint32, 10), + ("must_be_zero", c_uint32, 4), + ("payload_size", c_uint32, 2), + ("reserved0", c_uint32, 16), + ("data", c_uint32, 32), + ] + def valid(self): return cmd_code==cmd1.NPU_SET_IFM2_STRIDE_C and must_be_zero==0 and payload_size>=1 and payload_size<=2; + def get_cmd_code(self): return cmd_code + def set_cmd_code(self, value): cmd_code = value + def get_data(self): return data + def set_data(self, value): data = value + def get_payload_size(self): return payload_size + def set_payload_size(self, value): payload_size = value + +class npu_set_weight1_base_t(Structure): + _fields_ = [ + ("cmd_code", c_uint32, 10), + ("must_be_zero", c_uint32, 4), + ("payload_size", c_uint32, 2), + ("param", c_uint32, 16), + ("data", c_uint32, 32), + ] + def valid(self): return cmd_code==cmd1.NPU_SET_WEIGHT1_BASE and must_be_zero==0 and payload_size>=1 and payload_size<=2; + def get_cmd_code(self): return cmd_code + def set_cmd_code(self, value): cmd_code = value + def get_data(self): return data + def set_data(self, value): data = value + def get_param(self): return param + def set_param(self, value): param = value + def get_payload_size(self): return payload_size + def set_payload_size(self, value): payload_size = value + +class npu_set_weight1_length_t(Structure): + _fields_ = [ + ("cmd_code", c_uint32, 10), + ("must_be_zero", c_uint32, 4), + ("payload_size", c_uint32, 2), + ("reserved0", c_uint32, 16), + ("data", c_uint32, 32), + ] + def valid(self): return cmd_code==cmd1.NPU_SET_WEIGHT1_LENGTH and must_be_zero==0 and payload_size>=1 and payload_size<=2; + def get_cmd_code(self): return cmd_code + def set_cmd_code(self, value): cmd_code = value + def get_data(self): return data + def set_data(self, value): data = value + def get_payload_size(self): return payload_size + def set_payload_size(self, value): payload_size = value + +class npu_set_scale1_base_t(Structure): + _fields_ = [ + ("cmd_code", c_uint32, 10), + ("must_be_zero", c_uint32, 4), + ("payload_size", c_uint32, 2), + ("param", c_uint32, 16), + ("data", c_uint32, 32), + ] + def valid(self): return cmd_code==cmd1.NPU_SET_SCALE1_BASE and must_be_zero==0 and payload_size>=1 and payload_size<=2; + def get_cmd_code(self): return cmd_code + def set_cmd_code(self, value): cmd_code = value + def get_data(self): return data + def set_data(self, value): data = value + def get_param(self): return param + def set_param(self, value): param = value + def get_payload_size(self): return payload_size + def set_payload_size(self, value): payload_size = value + +class npu_set_scale1_length_t(Structure): + _fields_ = [ + ("cmd_code", c_uint32, 10), + ("must_be_zero", c_uint32, 4), + ("payload_size", c_uint32, 2), + ("reserved0", c_uint32, 16), + ("data", c_uint32, 32), + ] + def valid(self): return cmd_code==cmd1.NPU_SET_SCALE1_LENGTH and must_be_zero==0 
and payload_size>=1 and payload_size<=2; + def get_cmd_code(self): return cmd_code + def set_cmd_code(self, value): cmd_code = value + def get_data(self): return data + def set_data(self, value): data = value + def get_payload_size(self): return payload_size + def set_payload_size(self, value): payload_size = value diff --git a/ethosu/vela/extract_npu_subgraphs.py b/ethosu/vela/extract_npu_subgraphs.py new file mode 100644 index 00000000..5b9ba8b0 --- /dev/null +++ b/ethosu/vela/extract_npu_subgraphs.py @@ -0,0 +1,253 @@ +# Copyright (C) 2020 Arm Limited or its affiliates. All rights reserved. +# +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +# Description: +# Vela separates CPU operations and NPU operations into separate internal subgraphs. The CPU operations are left +# untouched in the final output. +# +# Vela does this by identifying NPU passes and pulling them out from the main CPU graph into separate subgraphs, invoked +# by NpuOp operations. Later, Vela generates command streams and compressed weight streams for the NPU subgraphs and +# attaches them to the NpuOp. This encapsulates everything the NPU subgraph is supposed to do. + +from .nn_graph import Pass, PassPlacement, NpuBlockType, Subgraph +from .operation import Operation +import numpy as np + + +def make_npu_call_op_pass(npu_subgraph): + op = Operation("NpuOp", "call_" + npu_subgraph.name) + op.attrs["subgraph"] = npu_subgraph + ps = Pass(op.name, PassPlacement.MemoryOnly, False, NpuBlockType.Default) + ps.ops = [op] + ps.primary_op = op + op.attrs["npu_block_type"] = ps.npu_block_type + op.scheduled_pass = ps + + # Inputs and outputs filled in later as we cut the graphs + return ps + + +def switch_tensor_for_op(op, orig_tens, new_tens): + + op.inputs = [new_tens if tens == orig_tens else tens for tens in op.inputs] + op.outputs = [new_tens if tens == orig_tens else tens for tens in op.outputs] + + ps = op.scheduled_pass + if ps is None: + return + + ps.inputs = [new_tens if tens == orig_tens else tens for tens in ps.inputs] + ps.outputs = [new_tens if tens == orig_tens else tens for tens in ps.outputs] + + if ps.ifm_tensor == orig_tens: + ps.ifm_tensor = new_tens + if ps.ifm2_tensor == orig_tens: + ps.ifm2_tensor = new_tens + if ps.ofm_tensor == orig_tens: + ps.ofm_tensor = new_tens + if ps.weight_tensor == orig_tens: + ps.weight_tensor = new_tens + if ps.scale_tensor == orig_tens: + ps.scale_tensor = new_tens + + +def rewrite_tensor_cpu_producer_npu_consumers( + orig_tens, call_ps, startup_init_ps, npu_subgraph, cpu_subgraph, subgraph_for_pass +): + is_const = orig_tens.ops[0].type == "Const" + + new_tens = orig_tens.clone("_npu") + orig_tens.npu_tensor = new_tens + new_tens.cpu_tensor = orig_tens + + op_type = "SubgraphInput" + if is_const: + op_type = "Const" + op = Operation(op_type, orig_tens.name + "_input") + op.attrs["npu_block_type"] = NpuBlockType.Default + op.outputs = [new_tens] + op.scheduled_pass = startup_init_ps + new_tens.ops = [op] + startup_init_ps.ops.append(op) + 
startup_init_ps.outputs.append(new_tens) + + if not is_const: + call_ps.inputs.append(orig_tens) + call_ps.primary_op.inputs.append(orig_tens) + + for op in list(orig_tens.consumers()): + if op is None: + continue # Subgraph consumers handled separately. + ps = op.scheduled_pass + if subgraph_for_pass[ps] == npu_subgraph: + switch_tensor_for_op(op, orig_tens, new_tens) + orig_tens.consumer_list.remove(op) + new_tens.consumer_list.append(op) + + # Deal with output tensors for the NPU graph. These are special. + npu_subgraph.output_tensors = [new_tens if tens == orig_tens else tens for tens in npu_subgraph.output_tensors] + + +def rewrite_tensor_npu_producer_cpu_consumers( + orig_tens, call_ps, startup_init_ps, npu_subgraph, cpu_subgraph, subgraph_for_pass +): + + new_tens = orig_tens.clone("_cpu") + new_tens.npu_tensor = orig_tens + orig_tens.cpu_tensor = new_tens + + npu_subgraph.output_tensors.append(orig_tens) + + call_ps.outputs.append(new_tens) + call_ps.primary_op.outputs.append(new_tens) + new_tens.ops = [call_ps.primary_op] + + for op in list(orig_tens.consumers()): + if op is None: + continue # Subgraph consumers handled separately. + ps = op.scheduled_pass + if subgraph_for_pass[ps] != npu_subgraph: + switch_tensor_for_op(op, orig_tens, new_tens) + orig_tens.consumer_list.remove(op) + new_tens.consumer_list.append(op) + + # Deal with output tensors for the CPU graph. These are special. + cpu_subgraph.output_tensors = [new_tens if tens == orig_tens else tens for tens in cpu_subgraph.output_tensors] + + +def extract_subgraph(nng, orig_sg, arch): + assert orig_sg.placement == PassPlacement.Cpu + + passes = list(orig_sg.passes) + place_vec = np.array([ps.placement for ps in passes]) + place_vec[ + place_vec == PassPlacement.StartupInit + ] = PassPlacement.Cpu # Keep the startup init pass on the CPU, we'll make new ones to move onto NPU. + + # MemoryOnly passes that are either squeezed between NPU passes or on the boundary of NPU and CPU + # passes should be assigned to the NPU. + + # Forward, then backwards + for is_reversed in range(2): + last_place = PassPlacement.Cpu + seq = enumerate(place_vec) + if is_reversed: + seq = reversed(list(seq)) + for idx, place in seq: + if place == PassPlacement.MemoryOnly: + if last_place == PassPlacement.Npu: + place = PassPlacement.Npu + place_vec[idx] = place + + if place != PassPlacement.MemoryOnly: + last_place = place + + # Anything left, assign to the CPU. 
+ place_vec[place_vec == PassPlacement.MemoryOnly] = PassPlacement.Cpu + + if np.all(place_vec == PassPlacement.Cpu): + return [] # Nothing to do + + # Create the subgraphs and split passes between them + + new_subgraphs = [] + split_count = 0 + subgraph_for_pass = {} + orig_sg.passes = [] + call_pass = {} + startup_init_passes = {} + + last_place = PassPlacement.Cpu + curr_sg = orig_sg + + for idx, place in enumerate(place_vec): + if place != last_place: + if place == PassPlacement.Npu: + split_count += 1 + curr_sg = Subgraph("%s_split_%d" % (orig_sg.name, split_count), PassPlacement.Npu) + new_subgraphs.append(curr_sg) + call_ps = make_npu_call_op_pass(curr_sg) + subgraph_for_pass[call_ps] = orig_sg + orig_sg.passes.append(call_ps) + call_pass[curr_sg] = call_ps + + startup_init_ps = Pass( + curr_sg.name + "_startup_init", PassPlacement.StartupInit, False, NpuBlockType.Default + ) + curr_sg.passes.append(startup_init_ps) + startup_init_passes[curr_sg] = startup_init_ps + subgraph_for_pass[startup_init_ps] = curr_sg + + else: + curr_sg = orig_sg + last_place = place + ps = passes[idx] + subgraph_for_pass[ps] = curr_sg + curr_sg.passes.append(ps) + + # Rewrite tensors to fix up graphs. + + for curr_sg in new_subgraphs: + for ps in curr_sg.passes: + for tens in ps.inputs: + source_sgs = [subgraph_for_pass[op.scheduled_pass] for op in tens.ops] + assert len(source_sgs) >= 0 + producer_sg = source_sgs[0] + for sg in source_sgs: + assert sg == producer_sg # All need to be the same. + + if producer_sg != curr_sg: + assert ( + producer_sg == orig_sg + ) # Because we go in-order, all the producers must be the original graph. + rewrite_tensor_cpu_producer_npu_consumers( + tens, call_pass[curr_sg], startup_init_passes[curr_sg], curr_sg, orig_sg, subgraph_for_pass + ) + + for tens in ps.outputs: + + dest_sgs = [subgraph_for_pass[op.scheduled_pass] for op in tens.consumers() if op is not None] + need_rewrite = False + for sg in dest_sgs: + if sg != curr_sg: + need_rewrite = True + break + if tens in orig_sg.output_tensors: + need_rewrite = True + + if need_rewrite: + rewrite_tensor_npu_producer_cpu_consumers( + tens, call_pass[curr_sg], startup_init_passes[curr_sg], curr_sg, orig_sg, subgraph_for_pass + ) + + return new_subgraphs + + +def extract_npu_subgraphs(nng, arch): + + nng.refresh_after_modification() + + for sg in list(nng.subgraphs): + if sg.placement == PassPlacement.Cpu: + new_subgraphs = extract_subgraph(nng, sg, arch) + nng.subgraphs += new_subgraphs + + nng.refresh_after_modification() + nng.prune_startup_init_pass() + + for sg in nng.subgraphs: + sg.build_pass_links() diff --git a/ethosu/vela/graph_optimiser.py b/ethosu/vela/graph_optimiser.py new file mode 100644 index 00000000..f0afcf8f --- /dev/null +++ b/ethosu/vela/graph_optimiser.py @@ -0,0 +1,485 @@ +# Copyright (C) 2020 Arm Limited or its affiliates. All rights reserved. +# +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
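Before moving on to the graph optimiser, a closing note on the subgraph extraction above: the forward/backward placement sweep in extract_subgraph() is easy to lose in the place_vec bookkeeping. The following is a minimal standalone sketch of that rule only, using plain strings in place of PassPlacement values; the function name and the example are illustrative and not part of the patch.

def absorb_memory_only(placements):
    # MemoryOnly passes squeezed between, or bordering, NPU passes join the NPU region.
    placements = list(placements)
    for direction in (1, -1):  # forward sweep, then backward sweep
        last_place = "Cpu"
        for idx in range(len(placements))[::direction]:
            place = placements[idx]
            if place == "MemoryOnly" and last_place == "Npu":
                place = "Npu"
                placements[idx] = place
            if place != "MemoryOnly":
                last_place = place
    # Anything still MemoryOnly after both sweeps stays on the CPU.
    return ["Cpu" if p == "MemoryOnly" else p for p in placements]

# absorb_memory_only(["Cpu", "MemoryOnly", "Npu", "MemoryOnly", "Cpu"])
# -> ["Cpu", "Npu", "Npu", "Npu", "Cpu"]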
+ + +# Description: +# Early optimisation of the network graph, using the rewrite_graph module to do the traversal of the graph. These are +# split into two parts optimise_graph_a and optimise_graph_b. + +from .nn_graph import Operation, NpuBlockType, Tensor +from . import rewrite_graph +from .data_type import BaseType, DataType +import numpy as np +import math +from .numeric_util import round_up_divide + +passthrough_nodes = set(("Identity",)) + + +def remove_passthrough_tensor(tens, arch): + if len(tens.ops) == 1 and tens.ops[0].type in passthrough_nodes: + assert len(tens.ops[0].inputs) == 1 + tens = tens.ops[0].inputs[0] + return tens + + +def rewrite_concat(tens, arch): + if len(tens.ops) == 1 and tens.ops[0].is_concat_op(): + concat_op = tens.ops[0] + if tens != concat_op.outputs[0]: + return tens # don't attempt to rewrite the min/max outputs of QuantizedConcat + + # Not supported so leave it and run on CPU + if not concat_op.run_on_npu: + return tens + + inputs, axis = concat_op.get_concat_inputs_axis() + + tens.ops = [] + offset = 0 + for idx, inp in enumerate(inputs): + new_op = Operation("ConcatSliceWrite", concat_op.name + str(idx)) + new_op.inputs = [inp] + new_op.outputs = [tens] + new_op.attrs["concat_axis"] = axis + new_op.attrs["concat_start"] = offset + offset += inp.shape[axis] + new_op.attrs["concat_end"] = offset + new_op.run_on_npu = True + tens.ops.append(new_op) + assert tens.shape[axis] == offset + + return tens + + +def rewrite_split(tens, arch): + + if len(tens.ops) == 1 and tens.ops[0].is_split_op(): + split_op = tens.ops[0] + + # Not supported so leave it and run on CPU + if not split_op.run_on_npu: + return tens + + inp, outputs, axis, offset_start, offset_end = split_op.get_split_inputs_axis() + + tens.ops = [] + new_op = Operation("SplitSliceRead", split_op.name) + new_op.inputs = [inp] + new_op.outputs = [tens] + + # For Split the offset cannot be extracted from the tensor so it has to + # be calculated from the index of the output tensor + if axis != None: + # Get the start and end of the split + offset_start = [0] * len(tens.shape) + offset_end = [0] * len(tens.shape) + for out in outputs: + if out == tens: + break + offset_start[axis] += out.shape[axis] + + offset_end[axis] = offset_start[axis] + tens.shape[axis] + + new_op.attrs["split_start"] = offset_start + new_op.attrs["split_end"] = offset_end + new_op.run_on_npu = True + tens.ops.append(new_op) + + return tens + + +def needed_total_padding(input_size, stride, filter_size): + out_size = (input_size + stride - 1) // stride + needed_input = (out_size - 1) * stride + filter_size + total_padding = max(0, needed_input - input_size) + return total_padding + + +def calc_padding_and_skirt(padding_type, kernel_size, stride, input_dims): + ypad = needed_total_padding(int(input_dims[1]), int(stride[1]), int(kernel_size[0])) + xpad = needed_total_padding(int(input_dims[2]), int(stride[2]), int(kernel_size[1])) + if padding_type == b"SAME": + left_pad = (xpad + 0) // 2 + right_pad = (xpad + 1) // 2 + top_pad = (ypad + 0) // 2 + bottom_pad = (ypad + 1) // 2 + elif padding_type == b"VALID": + left_pad = 0 + right_pad = 0 + top_pad = 0 + bottom_pad = 0 + else: + assert 0, "Unknown padding" + padding = (top_pad, left_pad, bottom_pad, right_pad) + skirt = (top_pad, left_pad, ypad - top_pad, xpad - left_pad) + return padding, skirt + + +def fixup_conv2d_backprop(op, arch): + if op.type == "Conv2DBackpropInput": + # flip the inputs + op.inputs[0], op.inputs[2] = op.inputs[2], op.inputs[0] + op.type = 
"Conv2DBackpropInputSwitched" + + return op + + +def fixup_fully_connected_input(op, arch): + if op.type == "FullyConnectedAct": + inp = op.inputs[0] + weights = op.inputs[1] + + n_in_elems = weights.shape[-2] + elms = inp.elements() + batch_size = elms // n_in_elems + assert batch_size * n_in_elems == elms + + desired_shape = [batch_size, n_in_elems] + if inp.shape != desired_shape: + # mismatch, insert a reshape to fix this. + reshape_name = op.name + "_reshape" + new_shape_tens = Tensor([1], DataType.int32, reshape_name + "_shape") + new_shape_tens.values = np.array(desired_shape) + new_shape_tens_const = Operation("Const", new_shape_tens.name + "_const") + new_shape_tens.ops = [new_shape_tens_const] + new_shape_tens_const.outputs = [new_shape_tens] + + reshape_op = Operation("Reshape", reshape_name) + reshape_op.inputs = [inp, new_shape_tens] + reshape_op.attrs["new_shape"] = desired_shape + reshape_out = inp.clone("_reshaped") + reshape_out.shape = reshape_out.storage_shape = reshape_out.bandwidth_shape = desired_shape + reshape_out.ops = [reshape_op] + reshape_op.outputs = [reshape_out] + + op.inputs[0] = reshape_out + + return op + + +def fixup_pack_input(op, arch): + if op.type == "Pack": + # Pack is also referred to as Stack + # Requires the rewrite_concat function to be called on the op afterwards + axis = int(op.attrs["axis"]) + desired_shape = op.inputs[0].shape[:axis] + [1] + op.inputs[0].shape[axis:] + + # Construct 1 shape tensor to be used by all inserted reshape ops + new_shape_name = op.name + "_reshape_shape" + new_shape_tens = Tensor([1], DataType.int32, new_shape_name) + new_shape_tens.values = np.array(desired_shape) + new_shape_tens_const = Operation("Const", new_shape_tens.name + "_const") + new_shape_tens.ops = [new_shape_tens_const] + new_shape_tens_const.outputs = [new_shape_tens] + + for idx, inp in enumerate(op.inputs): + reshape_name = op.name + str(idx) + "_reshape" + reshape_op = Operation("Reshape", reshape_name) + reshape_op.inputs = [inp, new_shape_tens] + reshape_op.attrs["new_shape"] = desired_shape + reshape_out = inp.clone("_reshaped") + reshape_out.shape = reshape_out.storage_shape = reshape_out.bandwidth_shape = desired_shape + reshape_out.ops = [reshape_op] + reshape_op.outputs = [reshape_out] + + op.inputs[idx] = reshape_out + + op.type = "PackReshaped" + + return op + + +def fixup_unpack_output(tens, arch): + op = tens.ops[0] + if op.type in set(("Unpack", "StridedSlice")): + # Unpack is also referred to as Unstack + # Requires the rewrite_split function to be called on the op afterwards + if op.type == "StridedSlice": + shrink_axis_mask = op.attrs["shrink_axis_mask"] + if shrink_axis_mask == 0: + # Equal Rank StridedSlice, no need to insert reshape + return tens + + # Only allow shrinking 1 axis for now + assert shrink_axis_mask & (shrink_axis_mask - 1) == 0 + assert len(tens.shape) == (len(op.inputs[0].shape) - 1) + + axis = int(math.log2(shrink_axis_mask)) + op.attrs["shrink_axis_mask"] = 0 + else: + axis = int(op.attrs["axis"]) + op.type = "UnpackReshaped" + + desired_shape = tens.shape[:axis] + [1] + tens.shape[axis:] + + # Construct 1 shape tensor to be used by all inserted reshape ops + new_shape_name = op.name + "_reshape_shape" + new_shape_tens = Tensor([1], DataType.int32, new_shape_name) + new_shape_tens.values = np.array(tens.shape) + new_shape_tens_const = Operation("Const", new_shape_tens.name + "_const") + new_shape_tens.ops = [new_shape_tens_const] + new_shape_tens_const.outputs = [new_shape_tens] + + for idx, out_tens in 
enumerate(op.outputs): + reshape_name = op.name + str(idx) + "_reshape" + reshape_op = Operation("Reshape", reshape_name) + reshape_op.outputs = [out_tens] + reshape_in = out_tens.clone("_reshaped") + reshape_in.shape = reshape_in.storage_shape = reshape_in.bandwidth_shape = desired_shape + reshape_in.ops = [op] + out_tens.ops = [reshape_op] + reshape_op.inputs = [reshape_in, new_shape_tens] + + op.outputs[idx] = reshape_in + + return tens + + +def add_padding_fields(op, arch): + if "padding" in op.attrs: + if "Conv" in op.type: + kernel_size = op.inputs[1].shape[:2] + input_shape = op.inputs[0].shape + elif "Pool" in op.type: + kernel_size = op.attrs["ksize"][1:3] + input_shape = op.inputs[0].shape + elif op.type == "ExtractImagePatches": + kernel_size = op.attrs["ksizes"][1:3] + input_shape = op.inputs[0].shape + else: + assert 0, "Unknown operation that uses padding" + + padding, skirt = calc_padding_and_skirt(op.attrs["padding"], kernel_size, op.attrs["strides"], input_shape) + op.attrs["explicit_padding"] = padding + op.attrs["skirt"] = skirt + return op + + +conv_op = set(("Conv2D", "QuantizedConv2D", "Conv2DBackpropInputSwitched", "Conv2DBiasAct")) +fc_op = set( + ( + "MatMul", + "QuantizedMatMul", + "BlockLSTM", + "RnnAct", + "UnidirectionalSequenceRnnAct", + "BidirectionalSequenceRnnAct", + "LstmAct", + "UnidirectionalSequenceLstmAct", + "BidirectionalSequenceLstmAct", + "FullyConnectedAct", + ) +) +depthwise_op = set(("DepthwiseConv2dNative", "DepthwiseConv2dBiasAct",)) +pool_op = set(("AvgPool", "MaxPool", "QuantizedAvgPool", "QuantizedMaxPool", "AvgPoolAct", "MaxPoolAct")) +elementwise_op = set(("AddAct", "MulAct", "SubAct", "Maximum", "Minimum", "LeakyRelu", "Abs")) +activation_ops = set(("Relu", "Relu6", "ReluN1To1", "Sigmoid", "Tanh")) +memory_only_ops = set(("Reshape",)) + +# Check if the op can be reordered +def get_prepend_op(op): + inp = op.inputs[0] + # The op should be reordered between prev_op and prep_op + prev_op = inp.ops[-1] + prep_op = None + while prev_op.type in memory_only_ops and len(prev_op.outputs) == 1 and len(prev_op.outputs[0].consumers()) == 1: + prep_op = prev_op + inp = prev_op.inputs[0] + prev_op = inp.ops[-1] + if prev_op != None and len(prev_op.outputs) == 1 and len(prev_op.outputs[0].consumers()) == 1: + return prep_op + + return None + + +def mark_npu_block_type(op, arch): + npu_block_type = NpuBlockType.Default + if op.type in conv_op: + npu_block_type = NpuBlockType.ConvolutionMxN + elif op.type in fc_op: + npu_block_type = NpuBlockType.VectorProduct + elif op.type in depthwise_op: + npu_block_type = NpuBlockType.ConvolutionDepthWise + elif op.type in pool_op: + npu_block_type = NpuBlockType.Pooling + elif op.type in elementwise_op: + npu_block_type = NpuBlockType.ElementWise + + op.attrs["npu_block_type"] = npu_block_type + return op + + +def convert_depthwise_to_conv(op, arch): + # Depthwise is equivalent to a single conv2d if the ifm depth is 1 and + # the ofm depth equals the depth multipler. 
+ # If those conditions are true, then we can perform a simple + # switch of the operator type (and weight order) + + if ("DepthwiseConv2d" in op.type) and (op.attrs["depth_multiplier"] != 1): + ifm_tensor = op.inputs[0] + weight_tensor = op.inputs[1] + ofm_tensor = op.outputs[0] + if (ifm_tensor.shape[3] == 1) and (ofm_tensor.shape[3] == op.attrs["depth_multiplier"]): + # Change op type to Conv2d + op.type = op.type.replace("DepthwiseConv2d", "Conv2D") + del op.attrs["channel_multiplier"] + del op.attrs["depth_multiplier"] + + weight_tensor.quant_values = np.transpose(weight_tensor.quant_values, (0, 1, 3, 2)) + weight_tensor.shape = weight_tensor.storage_shape = weight_tensor.bandwidth_shape = list( + weight_tensor.quant_values.shape + ) + else: + print( + "Error: Unsupported DepthwiseConv2d with depth_multiplier = {0}, " + "ifm channels = {1}, ofm channels = {2}".format( + op.attrs["depth_multiplier"], ifm_tensor.shape[3], ofm_tensor.shape[3] + ) + ) + assert False + return op + + +# Reorder activation op if it's after the memory only operations +def fixup_act_reorder(op, arch): + if op.type in activation_ops: + prep_op = get_prepend_op(op) + if prep_op != None: + act_op = op.clone("_reordered") + act_op.inputs = [prep_op.inputs[0]] + act_op_out = act_op.inputs[0].clone("_acted") + act_op_out.quantization = op.outputs[0].quantization.clone() + act_op_out.ops = [act_op] + act_op.outputs = [act_op_out] + prep_op.inputs[0] = act_op_out + prep_op.outputs[0].quantization = act_op_out.quantization.clone() + + # Mark the op so that it will be removed as passthrough later on + op.type = "Identity" + return op + + +def convert_mul_max_to_abs_or_lrelu(op, arch): + """Whenever there is a subgraph with this topology: + + Input X For X = -1 or X > 0 + | \ / This subgraph can be replaced with either + | Mul an Abs (if X = -1) or a LeakyReLU (if X > 0) + | / + Max + """ + + if op.type == "Maximum": + # finds the Mul input(s) to the Max + muls = [i for i in op.inputs if i.ops[0].type == "MulAct"] + if len(muls) == 1: + mul = muls[0].ops[0] + elif len(muls) == 2: + # In the case both inputs are Muls, find the one with the same input as the Max + mul = [m for m in muls if len(set(op.inputs + m.ops[0].inputs)) == 1][0].ops[0] + else: + # No Mul inputs + return op + + # make sure the Mul doesn't have any other consumers + if len(mul.outputs[0].consumers()) != 1: + return op + # make sure the Mul doesn't have a faf + if mul.attrs["fused_activation_function"]: + return op + + # finds the branched input that goes to both the Max and the Mul + shared = set(op.inputs) & set(mul.inputs) + if len(shared) == 1: + shared_in = shared.pop() + # find the constant scalar input to the Mul + const_tens = (set(mul.inputs) - {shared_in}).pop() + # check that it is a scalar + if const_tens.shape != []: + return op + const = const_tens.ops[0] + # check that it is a constant + if const.type != "Const": + return op + else: + return op + + val = const.outputs[0].values + if val >= 0: + new_op = "LeakyRelu" + op.attrs["alpha"] = val + elif val == -1: + new_op = "Abs" + else: + return op + + op.type = op.type.replace("Maximum", new_op) + op.name = op.name.replace("Maximum", new_op) + op.outputs[0].name = op.outputs[0].name.replace("Maximum", new_op) + op.inputs = [shared_in] + return op + + +def supported_operator_check(op, arch): + op.run_on_npu = arch.supported_operators.is_operator_supported(op) + return op + + +def optimise_graph_a(nng, arch, verbose_graph=False): + if verbose_graph: + nng.print_graph() + + op_rewrite_list = [ 
+ # mark block type and check if the operations are supported + mark_npu_block_type, + supported_operator_check, + # then do any rewrites of supported operators + convert_depthwise_to_conv, + fixup_fully_connected_input, + fixup_pack_input, + fixup_conv2d_backprop, + fixup_act_reorder, + add_padding_fields, + mark_npu_block_type, + # convert_mul_max_to_abs_or_lrelu # TODO: enable optimisation once quantisation issues are resolved + ] + + for idx, sg in enumerate(nng.subgraphs): + # rewrite graph pass + nng.subgraphs[idx] = rewrite_graph.rewrite_graph_pre_order( + sg, arch, [fixup_unpack_output,], op_rewrite_list, rewrite_unsupported=False + ) + + for idx, sg in enumerate(nng.subgraphs): + # remove passthrough tensors + nng.subgraphs[idx] = rewrite_graph.rewrite_graph_pre_order(sg, arch, [remove_passthrough_tensor,], []) + + if verbose_graph: + nng.print_graph() + return nng + +def optimise_graph_b(nng, arch, verbose_graph=False): + if verbose_graph: + nng.print_graph() + + for idx, sg in enumerate(nng.subgraphs): + # combined rewrite graph pass + nng.subgraphs[idx] = rewrite_graph.rewrite_graph_pre_order(sg, arch, [rewrite_concat, rewrite_split,], []) + + if verbose_graph: + nng.print_graph() + return nng diff --git a/ethosu/vela/greedy_allocation.py b/ethosu/vela/greedy_allocation.py new file mode 100644 index 00000000..6b3d2c1e --- /dev/null +++ b/ethosu/vela/greedy_allocation.py @@ -0,0 +1,95 @@ +# Copyright (C) 2020 Arm Limited or its affiliates. All rights reserved. +# +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +# Description: +# Allocate tensor addresses using a greedy algorithm. + +from . 
import numeric_util + + +class GreedyAllocator: + def __init__(self, nng, arch, live_ranges, mem_area): + self.nng = nng + self.arch = arch + self.mem_area = mem_area + + self.live_ranges = live_ranges + self.memory_required = 0 + + self.current_allocs = [] + + def alloc(self, new_lr): + size = new_lr.size + current_top = 0 + if self.current_allocs: + current_top = max(start_addr + lr.size for start_addr, lr in self.current_allocs) + best_offset = numeric_util.round_up(current_top, new_lr.get_alignment()) + best_offset_fit = (1 << 64) - 1 + + current_offset = 0 + for start_addr, lr in self.current_allocs: + aligned_current_offset = numeric_util.round_up(current_offset, new_lr.get_alignment()) + if aligned_current_offset + size <= start_addr and start_addr - current_offset < best_offset_fit: + best_offset = current_offset + best_offset_fit = start_addr - current_offset + + current_offset = start_addr + lr.size + + self.memory_required = max(self.memory_required, best_offset + size) + new_lr.set_address(best_offset) + self.current_allocs.append((best_offset, new_lr)) + self.current_allocs = list(sorted(self.current_allocs)) + + def dealloc(self, lr_to_dealloc): + self.current_allocs = [(start_addr, lr) for start_addr, lr in self.current_allocs if lr != lr_to_dealloc] + + def allocate_live_ranges(self, verbose_allocation): + lrs = set() + for lr in self.live_ranges.ranges.values(): + lrs.add((lr.start_time, lr.end_time, lr)) + + lrs = sorted(lrs) + + for curr_time, _, new_lr in lrs: + for _, lr in list(self.current_allocs): + if lr.end_time < curr_time: + self.dealloc(lr) + + self.alloc(new_lr) + + assert self.verify_allocation() + return self.memory_required + + def verify_allocation(self): + lrs = list(self.live_ranges.ranges.values()) + for n in lrs: + for m in lrs: + if n != m and n.overlaps_ranges(m): + overlap, tens_n, tens_m = n.overlaps_address(m) + if overlap: + print("Solution failed, overlapping buffer!") + print(tens_n.address, tens_n.address + n.size, n.name) + print(tens_m.address, tens_m.address + m.size, m.name) + print() + return False + + return True + + +def allocate_live_ranges(nng, arch, live_ranges, mem_area, verbose_allocation=False): + g = GreedyAllocator(nng, arch, live_ranges, mem_area) + return g.allocate_live_ranges(verbose_allocation) diff --git a/ethosu/vela/high_level_command_stream.py b/ethosu/vela/high_level_command_stream.py new file mode 100644 index 00000000..952e2033 --- /dev/null +++ b/ethosu/vela/high_level_command_stream.py @@ -0,0 +1,365 @@ +# Copyright (C) 2020 Arm Limited or its affiliates. All rights reserved. +# +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +# Description: +# Contains classes that hold commands for the high-level command stream (one command per DMA or NPU stripe). 
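To make "one command per NPU stripe" concrete before the classes below: the generator cuts the output feature map into stripes (usually along the height axis, a block-config of rows at a time) and describes each stripe with a start/end coordinate pair, which is what the Box class holds. A minimal sketch of that striping, independent of this module; the function name, shape and step value are illustrative only.

def ofm_height_stripes(ofm_shape, y_step):
    # Yield (start_coord, end_coord) pairs covering the OFM in height stripes.
    y_dim = ofm_shape[-3] if len(ofm_shape) >= 3 else 1
    for y in range(0, y_dim, y_step):
        start = [0] * len(ofm_shape)
        end = list(ofm_shape)
        if len(ofm_shape) >= 3:
            start[-3] = y
            end[-3] = min(y + y_step, y_dim)
        yield start, end

# An NHWC OFM of shape [1, 8, 16, 32] with y_step=3 gives the row ranges
# [0, 3), [3, 6) and [6, 8); each range would become one NpuStripe command.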
+ +from enum import Enum, IntEnum +import numpy as np +from .operation import NpuBlockType +from .numeric_util import round_up_divide +from .range_set import MemoryAccessSet, AccessDirection + + +class Box: + def __init__(self, start_coord, end_coord): + self.start_coord = list(start_coord) + self.end_coord = list(end_coord) + assert len(self.start_coord) == len(end_coord) + for i in range(len(self.start_coord)): + assert self.start_coord[i] <= self.end_coord[i] + + def transform_with_strides_and_skirt( + self, strides, skirt, ifm_shape, npu_block_type, concat_axis=0, concat_offset=0, split_offset=None, k_height=1 + ): + new_start_coord = list(self.start_coord) + new_end_coord = list(self.end_coord) + + new_start_coord[concat_axis] -= concat_offset + new_end_coord[concat_axis] -= concat_offset + + if split_offset != None: + for idx in range(len(split_offset)): + new_start_coord[idx] += split_offset[idx] + new_end_coord[idx] += split_offset[idx] + + if split_offset == None and npu_block_type in set((NpuBlockType.ConvolutionMxN, NpuBlockType.VectorProduct)): + # these types of operations do a "dot product" over the entire IFM + new_start_coord[-1] = 0 + new_end_coord[-1] = ifm_shape[-1] + + if min(len(new_end_coord), len(ifm_shape)) >= 2: + new_end_coord[-2] = min(new_end_coord[-2], ifm_shape[-2]) + if min(len(new_end_coord), len(ifm_shape)) >= 3: + new_end_coord[-3] = min(new_end_coord[-3], ifm_shape[-3]) + + pad_top = 0 + pad_bottom = 0 + if strides is not None and skirt is not None: + if len(new_start_coord) >= 2: + stride = strides[2] + new_start_coord[-2] = max(new_start_coord[-2] * stride - skirt[1], 0) + new_end_coord[-2] = min(new_end_coord[-2] * stride + skirt[3], ifm_shape[-2]) + + if len(new_start_coord) >= 3: + stride = strides[1] + + total_stride = stride * (new_end_coord[-3] - new_start_coord[-3] - 1) + new_start_coord[-3] = new_start_coord[-3] * stride - skirt[0] + + pad_top = max(0, 0 - new_start_coord[-3]) + new_start_coord[-3] = max(new_start_coord[-3], 0) + + while len(ifm_shape) < 3: + ifm_shape = [1] + ifm_shape + if (new_end_coord[-3] * stride + skirt[2]) > ifm_shape[-3]: + # pad_bottom is calculated based the diff between the end position of the weight kernel, + # after last stride and the ifm height. 
+ k_start = new_start_coord[-3] - pad_top + pad_bottom = max(0, k_start + total_stride + k_height - ifm_shape[-3]) + + new_end_coord[-3] = min(new_end_coord[-3] * stride + skirt[2], ifm_shape[-3]) + + return Box(new_start_coord, new_end_coord), pad_top, pad_bottom + + def make_weight_box(weight_shape, npu_block_type, oc_range_start=None, oc_range_end=None, weights_transposed=False): + start = [0] * len(weight_shape) + end = list(weight_shape) + if oc_range_start is not None and oc_range_end is not None: + if npu_block_type == NpuBlockType.ConvolutionDepthWise: + # input range is output range divided by channel multiplier + if weights_transposed: + start[-1] = oc_range_start // weight_shape[-2] + end[-1] = oc_range_end // weight_shape[-2] + else: + start[-2] = oc_range_start // weight_shape[-1] + end[-2] = oc_range_end // weight_shape[-1] + else: + start[-1] = oc_range_start + end[-1] = oc_range_end + for i in range(len(end)): + assert 0 <= start[i] < weight_shape[i] + assert 0 < end[i] <= weight_shape[i] + + return Box(start, end) + + def get_size_shape(self): + return [int(self.end_coord[i] - self.start_coord[i]) for i in range(len(self.end_coord))] + + def get_size(self): + return int(np.prod(self.get_size_shape())) + + def __str__(self): + return "<Box %s - %s>" % (self.start_coord, self.end_coord) + + __repr__ = __str__ + + +class CommandType(IntEnum): + NpuStripe = 0 + DMA = 1 + Size = 2 + + +class Command: + def get_ofm_y_range_for_pass(self, ps_requested): + return None + + def is_npu_pass_command(self): + return False + + def get_memory_accesses(self): + return None + + def get_operation_count(self): + # returns numpy array of (DPU blocks, dma_ops). Should line up with the CommandType enum + return np.array((0, 0)) + + +class NpuStripe(Command): + def __init__( + self, + ps, + block_config, + is_first, + is_last, + is_first_h_stripe, + is_last_h_stripe, + ifm_tensor, + ifm_box, + ofm_tensor, + ofm_box, + weight_tensor=None, + weight_box=None, + scale_tensor=None, + concat_axis=0, + concat_offset=0, + ifm2_tensor=None, + ifm2_box=None, + pad_top=0, + pad_bottom=0, + ): + self.cmdtype = CommandType.NpuStripe + self.ps = ps + self.block_config = block_config + self.is_first = is_first + self.is_last = is_last + self.is_first_h_stripe = is_first_h_stripe + self.is_last_h_stripe = is_last_h_stripe + self.ifm_tensor = ifm_tensor + self.ifm_box = ifm_box + self.ifm2_tensor = ifm2_tensor + self.ifm2_box = ifm2_box + self.ofm_tensor = ofm_tensor + self.ofm_box = ofm_box + self.weight_tensor = weight_tensor + self.scale_tensor = scale_tensor + self.weight_box = weight_box + self.concat_axis = concat_axis + self.concat_offset = concat_offset + self.pad_top = pad_top + self.pad_bottom = pad_bottom + for i in range(len(self.ofm_box.end_coord)): + assert self.ofm_box.end_coord[i] <= self.ofm_tensor.shape[i] + + def get_memory_accesses(self): + res = MemoryAccessSet() + if self.ifm_tensor is not None and self.ifm_tensor.shape != []: + res.add( + self.ifm_tensor.get_address_ranges_for_coordinates(self.ifm_box.start_coord, self.ifm_box.end_coord), + AccessDirection.Read, + ) + if self.ifm2_tensor is not None and self.ifm2_tensor.shape != []: + res.add( + self.ifm2_tensor.get_address_ranges_for_coordinates(self.ifm2_box.start_coord, self.ifm2_box.end_coord), + AccessDirection.Read, + ) + if self.ofm_tensor is not None: + res.add( + self.ofm_tensor.get_address_ranges_for_coordinates(self.ofm_box.start_coord, self.ofm_box.end_coord), + AccessDirection.Write, + ) + if self.weight_tensor is not None: + res.add(
self.weight_tensor.get_address_ranges_for_coordinates( + self.weight_box.start_coord, self.weight_box.end_coord + ), + AccessDirection.Read, + ) + return res + + def is_npu_pass_command(self): + return True + + def __str__(self): + return "<NPUStripe: ps=%s, ifm_box=%s, ifm2_box=%s, ofm_box=%s, weight_box=%s, block_config=%s>" % ( + self.ps.name, + self.ifm_box, + self.ifm2_box, + self.ofm_box, + self.weight_box, + self.block_config, + ) + + __repr__ = __str__ + + def get_ofm_y_range_for_pass(self, ps_requested): + if ps_requested != self.ps: + return None + if len(self.ofm_box.start_coord) >= 3: + return (self.ofm_box.start_coord[-3], self.ofm_box.end_coord[-3]) + return None + + def get_block_dimensions(self): + ofm_box = self.ofm_box + block_config = self.block_config + + out_height = 1 + out_width = 1 + out_depth = ofm_box.end_coord[-1] - ofm_box.start_coord[-1] + if len(ofm_box.end_coord) >= 4: + out_width = ofm_box.end_coord[-2] - ofm_box.start_coord[-2] + out_height = ofm_box.end_coord[-3] - ofm_box.start_coord[-3] + + assert out_height >= 0 + assert out_width >= 0 + assert out_depth >= 0 + return ( + round_up_divide(out_height, block_config[0]), + round_up_divide(out_width, block_config[1]), + round_up_divide(out_depth, block_config[3]), + ) + + def get_operation_count(self): + # returns numpy array of (DPU blocks, dma_ops) + return np.array((self.get_n_blocks(), 0)) + + def get_n_blocks(self): + h, w, d = self.get_block_dimensions() + res = h * w * d + assert res >= 0 + return res + + def get_single_block_command(self, block_idx): + block_cfg = (self.block_config[0], self.block_config[1], self.block_config[3]) + dims = self.get_block_dimensions() + strides = dims[1] * dims[2], dims[2], 1 + coord = [] + idx_left = block_idx + for s in strides: + c = idx_left // s + idx_left -= c * s + coord.append(c) + + assert idx_left == 0 + + # put in dummy height/widths in case we're dealing with FC layers + ofm_start = list(self.ofm_box.start_coord) + ofm_end = list(self.ofm_box.end_coord) + + # cut out a nice block shape + for idx in (-1, -2, -3): + if len(ofm_start) >= -idx: + ofm_start[idx] += block_cfg[idx] * coord[idx] + ofm_end[idx] = min(ofm_end[idx], ofm_start[idx] + block_cfg[idx]) + + ps = self.ps + strides = None + skirt = None + if ps.primary_op is not None: + strides = ps.primary_op.attrs.get("strides", None) + skirt = ps.primary_op.attrs.get("skirt", None) + npu_block_type = ps.npu_block_type + + ofm_box = Box(ofm_start, ofm_end) + ifm_box, _, _ = ofm_box.transform_with_strides_and_skirt( + strides, skirt, self.ifm_tensor.shape, npu_block_type, self.concat_axis, self.concat_offset + ) + + weight_box = None + if self.weight_tensor is not None: + weight_oc_start = ofm_start[-1] + weight_oc_end = ofm_end[-1] + if self.concat_axis - len(self.weight_tensor.shape) == -1: + weight_oc_start -= self.concat_offset + weight_oc_end -= self.concat_offset + + weight_box = Box.make_weight_box( + self.weight_tensor.shape, + npu_block_type, + weight_oc_start, + weight_oc_end, + self.weight_tensor.weight_transpose_depthwise, + ) + + return NpuStripe( + self.ps, + self.block_config, + self.is_first, + self.is_last, + self.is_first_h_stripe, + self.is_last_h_stripe, + self.ifm_tensor, + ifm_box, + self.ofm_tensor, + ofm_box, + self.weight_tensor, + weight_box, + self.scale_tensor, + self.concat_axis, + self.concat_offset, + ) + + +class DMA(Command): + def __init__(self, in_tensor, out_tensor, box): + self.cmdtype = CommandType.DMA + self.in_tensor = in_tensor + self.out_tensor = out_tensor + self.box = box + + def __str__(self): + return "<DMA: in=%s, out=%s, box=%s>" % (self.in_tensor.name,
self.out_tensor.name, self.box) + + __repr__ = __str__ + + def get_memory_accesses(self): + res = MemoryAccessSet() + + res.add( + self.in_tensor.get_address_ranges_for_coordinates(self.box.start_coord, self.box.end_coord), + AccessDirection.Read, + ) + res.add( + self.out_tensor.get_address_ranges_for_coordinates(self.box.start_coord, self.box.end_coord), + AccessDirection.Write, + ) + return res + + def get_operation_count(self): + # returns numpy array of (DPU blocks, dma_ops) + return np.array((0, 1)) diff --git a/ethosu/vela/high_level_command_stream_generator.py b/ethosu/vela/high_level_command_stream_generator.py new file mode 100644 index 00000000..364df6f8 --- /dev/null +++ b/ethosu/vela/high_level_command_stream_generator.py @@ -0,0 +1,315 @@ +# Copyright (C) 2020 Arm Limited or its affiliates. All rights reserved. +# +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +# Description: +# Generate a high-level command stream from a scheduled subgraph with CascadedPasses. +# +# Also used during scheduling to work out allowable IFM/OFM overlap, this functionality can be accessed using +# calc_allowed_ofm_ifm_overlap_for_cascaded_pass(). + +from .nn_graph import SchedulingStrategy, PassPlacement +import numpy as np +from .operation import NpuBlockType +from .high_level_command_stream import Box, CommandType, Command, NpuStripe, DMA + + +def need_dma(tens): + return len(tens.ops) == 1 and tens.ops[0].type == "DMA" + + +def dma_weights_if_necessary(ps, box, weight_tensor): + if need_dma(weight_tensor): + dma_op = weight_tensor.ops[0] + in_tensor = dma_op.inputs[0] + yield DMA(in_tensor, weight_tensor, box) + + +def generate_high_level_command_stream_for_pass(strat, passes, block_configs, idx): + is_first = idx == 0 + is_last = idx == len(passes) - 1 + ps = passes[idx] + block_config = block_configs[idx] + + ifm_tensor = ps.ifm_tensor + ifm2_tensor = ps.ifm2_tensor + ofm_tensor = ps.ofm_tensor + weight_tensor = ps.weight_tensor + scale_tensor = ps.scale_tensor + + ofm_start = [0] * len(ofm_tensor.shape) + ofm_end = list(ofm_tensor.shape) + + strides = None + skirt = None + if ps.primary_op is not None: + strides = ps.primary_op.attrs.get("strides", None) + skirt = ps.primary_op.attrs.get("skirt", None) + + npu_block_type = ps.npu_block_type + + concat_axis = 0 + concat_offset = 0 + + split_offsets = [None, None] # offset for [ifm, ifm2] + + # Fusable activation functions + activation_ops = set(("Sigmoid", "Tanh", "Relu", "Relu6", "ReluN1To1")) + + for op in ps.ops: + if op.type == "ConcatSliceWrite": + concat_axis = op.attrs["concat_axis"] + concat_start = op.attrs["concat_start"] + concat_end = op.attrs["concat_end"] + + ofm_start[concat_axis] = concat_start + ofm_end[concat_axis] = concat_end + concat_offset = concat_start + ps.primary_op.attrs["fused_memory_function"] = op.type + elif op.type in activation_ops: + ps.primary_op.attrs["fused_activation_function"] = op.type + + # The ops list has to be reversed here since the Pass Packing is done in reverse + 
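+    # With the list walked in reverse, the first SplitSliceRead encountered supplies the IFM offset
+    # (split_offsets[0]) and a second one, if present, supplies the IFM2 offset (split_offsets[1]).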
ifm_idx = 0 + for op in reversed(ps.ops): + if op.type == "SplitSliceRead": + split_offsets[ifm_idx] = op.attrs["split_start"] + ps.primary_op.attrs["fused_memory_function"] = op.type + ifm_idx += 1 + + if strat == SchedulingStrategy.WeightStream: + ofm_step = block_config[-1] + ofm_stop = ofm_end[-1] + if weight_tensor is None or not need_dma(weight_tensor): + ofm_step = ofm_stop + for start in range(ofm_start[-1], ofm_stop, ofm_step): + end = min(start + ofm_step, ofm_stop) + ofm_start[-1] = start + ofm_end[-1] = end + ofm_box = Box(ofm_start, ofm_end) + ifm_box = None + ifm2_box = None + + if ifm_tensor.shape != []: + ifm_box, _, _ = ofm_box.transform_with_strides_and_skirt( + strides, skirt, ifm_tensor.shape, npu_block_type, concat_axis, concat_offset, split_offsets[0] + ) + else: + ifm_box = Box([], []) + if ifm2_tensor is not None and ifm2_tensor.shape != []: + ifm2_box, _, _ = ofm_box.transform_with_strides_and_skirt( + strides, skirt, ifm2_tensor.shape, npu_block_type, concat_axis, concat_offset, split_offsets[1] + ) + else: + ifm2_box = Box([], []) + + weight_box = None + if weight_tensor is not None: + weight_oc_start = start + weight_oc_end = end + if concat_axis - len(weight_tensor.shape) == -1: + weight_oc_start -= concat_offset + weight_oc_end -= concat_offset + + weight_box = Box.make_weight_box( + weight_tensor.shape, + npu_block_type, + weight_oc_start, + weight_oc_end, + weight_tensor.weight_transpose_depthwise, + ) + yield from dma_weights_if_necessary(ps, weight_box, weight_tensor) + + yield NpuStripe( + ps, + block_config, + is_first, + is_last, + True, + True, + ifm_tensor, + ifm_box, + ofm_tensor, + ofm_box, + weight_tensor, + weight_box, + scale_tensor, + concat_axis, + concat_offset, + ifm2_tensor=ifm2_tensor, + ifm2_box=ifm2_box, + ) + + elif strat == SchedulingStrategy.IfmStream: + y_step = block_config[0] + y_start = 0 + y_dim = 1 + if len(ofm_tensor.shape) >= 3: + y_start = ofm_start[-3] + y_dim = ofm_end[-3] + if idx > 0: + ifm_y_present = 0 + prev_pass = passes[idx - 1] + prev_pass_gen = generate_high_level_command_stream_for_pass(strat, passes, block_configs, idx - 1) + else: + ifm_y_present = 1 + if len(ifm_tensor.shape) >= 3: + ifm_y_present = ifm_tensor.shape[-3] + prev_pass_gen = [] + prev_pass = None + + if len(passes) == 1: + # no cascading, can just issue one big stripe + # but only if we've done allocation and OFM does not overlap IFM + if ifm_tensor.address != -1 and ofm_tensor.address != -1: + if ( + ifm_tensor.address + ifm_tensor.storage_size() <= ofm_tensor.address + or ofm_tensor.address + ofm_tensor.storage_size() <= ifm_tensor.address + ): + y_step = y_dim + + weight_box = None + + for start in range(y_start, y_dim, y_step): + end = min(start + y_step, y_dim) + if len(ofm_tensor.shape) >= 3: + ofm_start[-3] = start + ofm_end[-3] = end + ofm_box = Box(ofm_start, ofm_end) + + k_height = 1 + if npu_block_type == NpuBlockType.Pooling: + if ps.primary_op is not None: + k_height = ps.primary_op.attrs["ksize"][1] + else: + if weight_tensor is not None: + k_height = weight_tensor.shape[0] + + ifm_box, pad_top, pad_bottom = ofm_box.transform_with_strides_and_skirt( + strides, skirt, ifm_tensor.shape, npu_block_type, concat_axis, concat_offset, split_offsets[0], k_height + ) + + ifm_y_needed = 1 + if len(ifm_box.end_coord) >= 3: + ifm_y_needed = ifm_box.end_coord[-3] + if ifm_y_present < ifm_y_needed: + for prev_cmd in prev_pass_gen: + yield prev_cmd + rng = prev_cmd.get_ofm_y_range_for_pass(prev_pass) + if rng is not None: + ifm_y_present = 
max(ifm_y_present, rng[1]) + if ifm_y_present >= ifm_y_needed: + break + + if weight_tensor is not None and weight_box is None: + weight_box = Box.make_weight_box( + weight_tensor.shape, npu_block_type, weights_transposed=weight_tensor.weight_transpose_depthwise + ) + yield from dma_weights_if_necessary(ps, weight_box, weight_tensor) + + # Check if first/last stripe in pass + is_first_h_stripe = start == y_start + is_last_h_stripe = (start + y_step) >= y_dim + + stripe = NpuStripe( + ps, + block_config, + is_first, + is_last, + is_first_h_stripe, + is_last_h_stripe, + ifm_tensor, + ifm_box, + ofm_tensor, + ofm_box, + weight_tensor, + weight_box, + scale_tensor, + concat_axis, + concat_offset, + None, + None, + pad_top, + pad_bottom, + ) + yield stripe + else: + assert 0, "unknown scheduling strategy" + + +def generate_high_level_command_stream_for_pass_list(strat, passes, block_configs): + if strat == SchedulingStrategy.WeightStream: + for idx in range(len(passes)): + yield from generate_high_level_command_stream_for_pass(strat, passes, block_configs, idx) + elif strat == SchedulingStrategy.IfmStream: + yield from generate_high_level_command_stream_for_pass(strat, passes, block_configs, len(passes) - 1) + else: + assert 0, "Unknown streaming strategy" + + +def generate_high_level_command_stream_for_cascaded_pass(cps): + yield from generate_high_level_command_stream_for_pass_list( + cps.strategy, cps.passes, [ps.block_config for ps in cps.passes] + ) + + +def generate_high_level_command_stream(nng, sg, arch, verbose_high_level_command_stream): + res = [] + for cps in sg.cascaded_passes: + if cps.placement == PassPlacement.Npu: + res += list(generate_high_level_command_stream_for_cascaded_pass(cps)) + + sg.high_level_command_stream = res + if verbose_high_level_command_stream: + sg.print_high_level_command_stream() + + +def calc_allowed_ofm_ifm_overlap_for_pass_list(strat, passes, block_configs): + highest_ofm_write = 0 + if not passes[0].ifm_tensor or not passes[-1].ofm_tensor: + return 0 + + ifm_read = passes[0].ifm_tensor.storage_size + min_overlap = 999999999999999999999 + ofm_size = passes[-1].ofm_tensor.storage_size() + if strat == SchedulingStrategy.WeightStream: + return 0 + for cmd in generate_high_level_command_stream_for_pass_list(strat, passes, block_configs): + if cmd.is_npu_pass_command(): + if cmd.is_first: + ifm_read = cmd.ifm_tensor.address_offset_for_coordinate(cmd.ifm_box.start_coord, is_top_box=False) + if ifm_read is None: + return 0 + if cmd.is_last: + write_offset = cmd.ofm_tensor.address_offset_for_coordinate(cmd.ofm_box.end_coord, is_top_box=True) + if write_offset is None: + return 0 + highest_ofm_write = max(write_offset, highest_ofm_write) + + if cmd.is_first or cmd.is_last: + overlap_required = max(highest_ofm_write - min(ifm_read, ofm_size), 0) + can_overwrite = ofm_size - overlap_required + min_overlap = min(min_overlap, can_overwrite) + + if cmd.is_first: + ifm_read = cmd.ifm_tensor.address_offset_for_coordinate(cmd.ifm_box.end_coord, is_top_box=True) + + min_overlap = max(min_overlap, 0) + return min_overlap + + +def calc_allowed_ofm_ifm_overlap_for_cascaded_pass(cps): + return calc_allowed_ofm_ifm_overlap_for_pass_list(cps.strategy, cps.passes, [ps.block_config for ps in cps.passes]) diff --git a/ethosu/vela/insert_dma.py b/ethosu/vela/insert_dma.py new file mode 100644 index 00000000..b63c1ea1 --- /dev/null +++ b/ethosu/vela/insert_dma.py @@ -0,0 +1,60 @@ +# Copyright (C) 2020 Arm Limited or its affiliates. All rights reserved. 
+#
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the License); you may
+# not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an AS IS BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+# Description:
+# Insert DMA operations into the graph for transferring weights.
+
+from .nn_graph import Operation, MemArea, TensorPurpose, NpuBlockType
+from . import rewrite_graph
+
+
+def insert_dma_cmd(op, arch):
+    if op.type == "DMA":
+        return op  # Already rewritten
+    for idx, tens in enumerate(op.inputs):
+
+        if tens.mem_area in (MemArea.Dram, MemArea.OffChipFlash) and tens.mem_area != arch.fast_storage_mem_area:
+            if tens.purpose == TensorPurpose.Weights:
+                only_vector_product_consumers = True
+                for oper in tens.consumers():
+                    if oper is None or oper.attrs.get("npu_block_type") != NpuBlockType.VectorProduct:
+                        only_vector_product_consumers = False
+                        break
+
+                # Vector products have no need for DMA: their tensors are only read once and can stay in flash.
+                # Other operations re-read tensors, so those reads are better done from SRAM.
+                if not only_vector_product_consumers:
+                    # Insert a DMA command here, as well as a new tensor situated in SRAM of the same size.
+                    new_tens = tens.clone_into_fast_storage(arch)
+                    dma_cmd = Operation("DMA", tens.ops[0].name + "_dma")
+                    dma_cmd.inputs = [tens]
+                    dma_cmd.outputs = [new_tens]
+                    dma_cmd.attrs["source"] = tens.mem_area
+                    dma_cmd.attrs["destination"] = new_tens.mem_area
+                    dma_cmd.run_on_npu = True
+                    new_tens.ops = [dma_cmd]
+                    op.inputs[idx] = new_tens
+    return op
+
+
+def insert_dma_commands(nng, arch, verbose_graph=False):
+
+    for idx, sg in enumerate(nng.subgraphs):
+        nng.subgraphs[idx] = rewrite_graph.rewrite_graph_pre_order(sg, arch, [], [insert_dma_cmd])
+        if verbose_graph:
+            nng.print_graph()
+    return nng
diff --git a/ethosu/vela/live_range.py b/ethosu/vela/live_range.py
new file mode 100644
index 00000000..24f1f64c
--- /dev/null
+++ b/ethosu/vela/live_range.py
@@ -0,0 +1,324 @@
+# Copyright (C) 2020 Arm Limited or its affiliates. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the License); you may
+# not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an AS IS BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+# Description:
+# Build a live range graph for tensors in one or more subgraphs. Used for tensor allocation as well as in the scheduler.
+# Can work with either a pass-packed subgraph or a scheduled subgraph.
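Editorial note: the live-range model built by this module can be pictured as half-open intervals. Each tensor gets a [start_time, end_time) range, and the overlaps_ranges check further down uses max(start) < min(end) to decide whether two ranges are alive at the same time, the idea being that only non-overlapping ranges can safely share memory. A minimal, self-contained sketch of that test (illustrative only; the helper name and the times are made up and are not part of the patch):

def ranges_overlap(a, b):
    # a and b are (start_time, end_time) half-open intervals, as in LiveRange
    return max(a[0], b[0]) < min(a[1], b[1])

ifm = (0, 3)      # produced at time 0, last consumed before time 3
ofm = (2, 5)      # produced while the ifm is still live
scratch = (4, 6)  # only live after the ifm is dead

print(ranges_overlap(ifm, ofm))      # True  -> must not share an address
print(ranges_overlap(ifm, scratch))  # False -> the allocator may reuse this memory

The real class below additionally tracks the range's size and the set of tensors sharing it, which the allocation code relies on.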
+ +from .tensor import Tensor, MemArea +from .nn_graph import TensorPurpose, PassPlacement +from .high_level_command_stream_generator import calc_allowed_ofm_ifm_overlap_for_cascaded_pass + + +class LiveRange: + def __init__(self, tens): + self.tensors = [] # Tensors that are assigned to the same LiveRange will be allocated to the same address + self.start_time = 99999999999 + self.end_time = -1 + self.size = 0 + self.name = "" + + if tens: + self.add_tensor(tens) + + def __str__(self): + return "" % (self.name, self.start_time, self.end_time) + + __repr__ = __str__ + + def add_tensor(self, tens): + if self.size == 0: + self.size = tens.storage_size() + self.name = tens.name # LiveRange will be named after the first tensor added + else: + assert ( + self.size >= tens.storage_size() + ), "Tensors assigned to the same LiveRange need to fit the size of the LiveRange." + + self.tensors.append(tens) + + def mark_usage(self, op_time): + if op_time == -1: + return + op_time_start = op_time + op_time_end = op_time + 1 + + self.start_time = min(self.start_time, op_time_start) + self.end_time = max(self.end_time, op_time_end) + + def overlaps_ranges(self, other): + return max(self.start_time, other.start_time) < min(self.end_time, other.end_time) + + def overlaps_address(self, other): + # Returns the first pair of tensors in this LiveRange and 'other' which have + # overlapping addresses + for tens in self.tensors: + for other_tens in other.tensors: + if max(tens.address, other_tens.address) < min( + tens.address + self.size, other_tens.address + other.size + ): + return True, tens, other_tens + + return False, None, None + + def __lt__(self, other): + if self.start_time != other.start_time: + return self.start_time < other.start_time + if self.end_time != other.end_time: + return self.end_time < other.end_time + if self.size != other.size: + return self.size < other.size + return self.name < other.name + + def set_address(self, address): + # Set address of all unaddressed tensors in LiveRange + for tens in self.tensors: + if tens.address == 0: + tens.address = address + # Also need to set the address to the tensor's cpu/npu clones + if tens.cpu_tensor != None: + tens.cpu_tensor.address = address + if tens.npu_tensor != None: + tens.npu_tensor.address = address + + def get_alignment(self): + # Get max alignment of LiveRange's tensors + if self.tensors: + alignment = 0 + for tens in self.tensors: + alignment = max(alignment, tens.alignment) + + return alignment + + return Tensor.AllocationQuantum + + +def merge_memory_op_ranges(sg, lr_graph, tensor_should_be_ignored, target_mem_area): + for ps in sg.passes: + if ps.placement == PassPlacement.MemoryOnly: + # For memory only passes, e.g. Reshape. Add input and output tensor to the same LiveRange + input_tensor = ps.inputs[0] + output_tensor = ps.outputs[0] + # If the input or output tensor is tied to a Cpu tensor, i.e. a subgraph input + # or output, fuse the live-range with the Cpu tensors' live-range instead. 
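+            # Tensors in the same LiveRange are allocated to the same address (see LiveRange above), so
+            # fusing lets the memory-only pass alias its input and output rather than copy them.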
+ input_tensor = input_tensor.cpu_tensor if input_tensor.cpu_tensor != None else input_tensor + output_tensor = output_tensor.cpu_tensor if output_tensor.cpu_tensor != None else output_tensor + if not tensor_should_be_ignored(input_tensor, target_mem_area) and not tensor_should_be_ignored( + output_tensor, target_mem_area + ): + lr_graph.fuse_ranges(input_tensor, output_tensor) + + +class LiveRangeGraph: + def __init__(self): + self.ranges = {} # tens -> range + self.allowed_overlaps = {} # (tens,tens) -> overlap_int + self.ignore_tensors = set() + self.processed_subgraphs = set() + self.current_time = 0 + + def get_or_create_range(self, tens): + for rng in self.ranges.values(): + # Return the live range of the tensor (or it's cpu/npu clone) + if any(tensor in rng.tensors for tensor in [tens, tens.npu_tensor, tens.cpu_tensor]): + return rng + + # No live range found for the tensor, create a new one + rng = LiveRange(tens) + self.ranges[tens] = rng + return rng + + def fuse_ranges(self, in_tens, out_tens): + live_range = self.get_or_create_range(in_tens) + assert out_tens not in self.ranges, out_tens + live_range.add_tensor(out_tens) + self.ranges[out_tens] = live_range + return live_range + + +def extract_live_ranges_from_passes( + sg, + target_mem_area, + mark_output_tensors_overlapping_with_input_tensors=False, + ignore_subgraph_input_output_tensors=False, +): + lr_graph = LiveRangeGraph() + + if ignore_subgraph_input_output_tensors: + lr_graph.ignore_tensors.update(sg.input_tensors) + lr_graph.ignore_tensors.update(sg.output_tensors) + + def tensor_should_be_ignored(tens, target_mem_area): + if tens.mem_area != target_mem_area: + return True + if tens in lr_graph.ignore_tensors: + return True + if tens.name.endswith("reshape_shape_npu"): + # Reshape tensor, no need to allocate + lr_graph.ignore_tensors.add(tens) + return True + return False + + # Merge only memory operations in the NPU subgraphs + if sg.placement == PassPlacement.Npu: + merge_memory_op_ranges(sg, lr_graph, tensor_should_be_ignored, target_mem_area) + + for idx, ps in enumerate(sg.passes): + ps.time = 2 * idx + + time_for_pass = ps.time + + for tens in ps.inputs: + if tensor_should_be_ignored(tens, target_mem_area): + continue + rng = lr_graph.get_or_create_range(tens) + rng.mark_usage(time_for_pass) + + for tens in ps.intermediates: + if tensor_should_be_ignored(tens, target_mem_area): + continue + rng = lr_graph.get_or_create_range(tens) + rng.mark_usage(time_for_pass) + + for tens in ps.outputs: + if tensor_should_be_ignored(tens, target_mem_area): + continue + rng = lr_graph.get_or_create_range(tens) + output_time = time_for_pass + if not mark_output_tensors_overlapping_with_input_tensors and ps.is_element_wise: + output_time += 1 + rng.mark_usage(output_time) + + end_time = len(sg.passes) * 2 + for tens in sg.output_tensors: + if tensor_should_be_ignored(tens, target_mem_area): + continue + rng = lr_graph.get_or_create_range(tens) + rng.mark_usage(end_time) + + return lr_graph + + +def extract_live_ranges_from_cascaded_passes( + sg, + target_mem_area, + mark_output_tensors_overlapping_with_input_tensors=False, + use_ifm_ofm_overlap=True, + ignore_subgraph_input_output_tensors=False, + lr_graph=None, +): + if lr_graph == None: + lr_graph = LiveRangeGraph() + + if sg in lr_graph.processed_subgraphs: + # if subgraph has been processed already, return the lr_graph as is + return lr_graph + + if ignore_subgraph_input_output_tensors: + lr_graph.ignore_tensors.update(sg.input_tensors) + 
lr_graph.ignore_tensors.update(sg.output_tensors) + + def tensor_should_be_ignored(tens, target_mem_area): + if tens.mem_area != target_mem_area: + return True + if tens in lr_graph.ignore_tensors: + return True + if tens.name.endswith("reshape_shape_npu"): + # Reshape tensor, no need to allocate + lr_graph.ignore_tensors.add(tens) + return True + return False + + # Merge only memory operations in the NPU subgraphs + if sg.placement == PassPlacement.Npu: + merge_memory_op_ranges(sg, lr_graph, tensor_should_be_ignored, target_mem_area) + + for cps in sg.cascaded_passes: + cps.time = lr_graph.current_time + + time_for_pass = cps.time + + is_element_wise = cps.is_element_wise + + for tens in cps.inputs: + if tensor_should_be_ignored(tens, target_mem_area): + continue + rng = lr_graph.get_or_create_range(tens) + rng.mark_usage(time_for_pass) + + cps_primary_op = cps.passes[0].primary_op + if cps_primary_op and cps_primary_op.type == "NpuOp" and target_mem_area in set((MemArea.Sram, MemArea.Dram)): + # If the primary-op is an NpuOp that means this is where an Npu subgraph + # is called. Go into said subgraph and extract live ranges before continuing. + npu_sg = cps_primary_op.attrs["subgraph"] + lr_graph = extract_live_ranges_from_cascaded_passes( + npu_sg, + target_mem_area, + mark_output_tensors_overlapping_with_input_tensors, + use_ifm_ofm_overlap, + False, + lr_graph, + ) + # Set the new time after handling the Npu subgraph + time_for_pass = lr_graph.current_time + cps.time = time_for_pass + + for tens in cps.intermediates: + if tensor_should_be_ignored(tens, target_mem_area): + continue + rng = lr_graph.get_or_create_range(tens) + rng.mark_usage(time_for_pass) + + for tens in cps.outputs: + if tensor_should_be_ignored(tens, target_mem_area): + continue + rng = lr_graph.get_or_create_range(tens) + output_time = time_for_pass + if not mark_output_tensors_overlapping_with_input_tensors and is_element_wise: + output_time += 1 + rng.mark_usage(output_time) + + if use_ifm_ofm_overlap: + # fill allowed overlap for ifm and ofm tensor + ifm_tensor = cps.passes[0].ifm_tensor + ofm_tensor = cps.passes[-1].ofm_tensor + if ( + ifm_tensor is not None + and ofm_tensor is not None + and not tensor_should_be_ignored(ifm_tensor, target_mem_area) + and not tensor_should_be_ignored(ofm_tensor, target_mem_area) + ): + lr_graph.allowed_overlaps[(ifm_tensor, ofm_tensor)] = calc_allowed_ofm_ifm_overlap_for_cascaded_pass( + cps + ) + + lr_graph.current_time += 2 + + end_time = 0 + for rng in lr_graph.ranges.values(): + # Find the maximum end time of all live-ranges in the graph + end_time = max(end_time, rng.end_time) + + for tens in sg.output_tensors: + if tensor_should_be_ignored(tens, target_mem_area): + continue + rng = lr_graph.get_or_create_range(tens) + rng.mark_usage(end_time) + + # Add subgraph to set of processed subgraphs + lr_graph.processed_subgraphs.add(sg) + return lr_graph diff --git a/ethosu/vela/mark_tensors.py b/ethosu/vela/mark_tensors.py new file mode 100644 index 00000000..9b1824b5 --- /dev/null +++ b/ethosu/vela/mark_tensors.py @@ -0,0 +1,363 @@ +# Copyright (C) 2020 Arm Limited or its affiliates. All rights reserved. +# +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +# Description: +# Mark purpose and select formats for Tensors. Also compresses the weights. + +from . import rewrite_graph +from . import weight_compressor +from .architecture_features import Block +from .nn_graph import TensorPurpose, TensorFormat, PassPlacement +from .operation import NpuBlockType + + +def purpose_from_list(lst): + def purpose(op, idx): + return lst[idx] + + return purpose + + +def all_fm(op, idx): + return TensorPurpose.FeatureMap + + +def all_parameter(op, idx): + return TensorPurpose.FeatureMap + + +def input0_from_output_rest_parameter(op, idx): + if idx == 0: + res = op.outputs[0].purpose + if res == TensorPurpose.Unknown: + print("Warning: Propagating unknown tensor purpose", op) + return res + return TensorPurpose.FeatureMap + + +def inputs_from_output(op, idx): + res = op.outputs[0].purpose + if res == TensorPurpose.Unknown: + print("Warning: Propagating unknown tensor purpose", op) + return res + +tensor_purposes = [ # ops, input_purpose + ( + set( + ( + "Relu", + "Relu6", + "Mul", + "Add", + "Sub", + "Rsqrt", + "Abs", + "Cast", + "Exp", + "Floor", + "FloorDiv", + "FloorMod", + "SquaredDifference", + "AddN", + "BiasAdd", + "RealDiv", + "Maximum", + "Minimum", + "Sigmoid", + "Tanh", + "FusedBatchNorm", + "AvgPool", + "MaxPool", + "Squeeze", + "Softmax", + "LRN", + "Assign", + "BatchMatMul", + "ZerosLike", + "ExtractImagePatches", + "MulAct", + "AddAct", + "SubAct", + "DivAct", + "AvgPoolAct", + "MaxPoolAct", + "LeakyRelu", + ) + ), + all_fm, + ), + ( + set( + ( + "Conv2D", + "DepthwiseConv2dNative", + "MatMul", + "Conv2DBiasAct", + "DepthwiseConv2dBiasAct", + "FullyConnectedAct", + ) + ), + purpose_from_list([TensorPurpose.FeatureMap, TensorPurpose.Weights, TensorPurpose.FeatureMap]), + ), + ( + set(("Conv2DBackpropInputSwitched",)), + purpose_from_list([TensorPurpose.FeatureMap, TensorPurpose.Weights, TensorPurpose.FeatureMap]), + ), + ( + set(("QuantizedConv2D", "QuantizedMatMul")), + purpose_from_list( + [ + TensorPurpose.FeatureMap, + TensorPurpose.Weights, + TensorPurpose.FeatureMap, + TensorPurpose.FeatureMap, + TensorPurpose.FeatureMap, + TensorPurpose.FeatureMap, + ] + ), + ), + ( + set( + ( + "Reshape", + "Min", + "Max", + "Mean", + "Pad", + "MirrorPad", + "ArgMax", + "ArgMin", + "ExpandDims", + "ResizeNearestNeighbor", + "ResizeBilinear", + "Tile", + "Transpose", + "Mfcc", + ) + ), + purpose_from_list([TensorPurpose.FeatureMap, TensorPurpose.FeatureMap]), + ), + ( + set(("QuantizedReshape", "QuantizedResizeBilinear")), + purpose_from_list( + [TensorPurpose.FeatureMap, TensorPurpose.FeatureMap, TensorPurpose.FeatureMap, TensorPurpose.FeatureMap] + ), + ), + ( + set(("QuantizedBiasAdd", "QuantizedAdd", "QuantizedMul")), + purpose_from_list( + [ + TensorPurpose.FeatureMap, + TensorPurpose.FeatureMap, + TensorPurpose.FeatureMap, + TensorPurpose.FeatureMap, + TensorPurpose.FeatureMap, + TensorPurpose.FeatureMap, + ] + ), + ), + ( + set( + ( + "Dequantize", + "Quantize", + "QuantizeV2", + "QuantizedRelu", + "QuantizedRelu1", + "QuantizedRelu6", + "QuantizedAvgPool", + "QuantizedMaxPool", + "Slice", + "SplitV", + ) + ), + 
purpose_from_list([TensorPurpose.FeatureMap, TensorPurpose.FeatureMap, TensorPurpose.FeatureMap]), + ), + ( + set(("BatchToSpaceND", "SpaceToBatchND", "DepthToSpaceND", "SpaceToDepthND")), + purpose_from_list([TensorPurpose.FeatureMap, TensorPurpose.FeatureMap, TensorPurpose.FeatureMap]), + ), + ( + set(("BlockLSTM",)), + purpose_from_list( + [ + TensorPurpose.FeatureMap, + TensorPurpose.FeatureMap, + TensorPurpose.FeatureMap, + TensorPurpose.FeatureMap, + TensorPurpose.Weights, + TensorPurpose.FeatureMap, + TensorPurpose.FeatureMap, + TensorPurpose.FeatureMap, + TensorPurpose.FeatureMap, + ] + ), + ), + (set(("SplitSliceRead",)), purpose_from_list([TensorPurpose.FeatureMap, TensorPurpose.FeatureMap])), + (set(("Shape", "ConcatSliceWrite", "AudioSpectrogram")), purpose_from_list([TensorPurpose.FeatureMap])), + ( + set(("StridedSlice",)), + purpose_from_list( + [TensorPurpose.FeatureMap, TensorPurpose.FeatureMap, TensorPurpose.FeatureMap, TensorPurpose.FeatureMap] + ), + ), + (set(("Fill", "Pack", "Range")), all_parameter), + ( + set(("Requantize",)), + purpose_from_list( + [ + TensorPurpose.FeatureMap, + TensorPurpose.FeatureMap, + TensorPurpose.FeatureMap, + TensorPurpose.FeatureMap, + TensorPurpose.FeatureMap, + ] + ), + ), + (set(("Placeholder", "SubgraphInput", "Const", "VariableV2")), purpose_from_list([])), + (set(("FakeQuantWithMinMaxArgs", "FakeQuantWithMinMaxVars")), input0_from_output_rest_parameter), + ( + set(("Square", "Sqrt", "Log", "Less", "Enter", "Exit", "Identity", "StopGradient", "Merge", "Switch")), + inputs_from_output, + ), + (None, all_fm), +] + + +for ops, input_purpose in tensor_purposes: + if ops is None: + continue + for op in ops: + assert len(op) > 1, "string literal has been decomposed" + + +def mark_tensor_purpose(nng, arch, verbose_tensor_purpose=False): + def mark_tensor_helper(tens, purpose): + + if tens.purpose == TensorPurpose.Unknown or tens.purpose == purpose: + tens.purpose = purpose + else: + assert 0, "Cannot resolve tensor purpose %s and %s for tensor %s" % (tens.purpose, purpose, tens) + tens.mem_area = arch.tensor_storage_mem_area[tens.purpose] + + if len(tens.ops) == 1 and tens.ops[0].type == "Const": + tens.mem_area = ( + arch.permanent_storage_mem_area + ) # special case constants, as they must be in permanent storage + + def rewrite_mark_tensor_purpose(op, arch): + # find disconnected outputs and mark as parameters + for tens in op.outputs: + if not tens.consumers(): + mark_tensor_helper(tens, TensorPurpose.FeatureMap) + + for ops, input_purpose in tensor_purposes: + if ops is None or op.type in ops: + if ops is None: + print( + "warning: don't know how to mark up purpose for", + op.type, + op.inputs, + "triggering all feature map fallback", + ) + for idx, tens in enumerate(op.inputs): + purpose = input_purpose(op, idx) + mark_tensor_helper(tens, purpose) + break + return op + + for sg in nng.subgraphs: + sg = rewrite_graph.rewrite_graph_pre_order(sg, arch, [], [rewrite_mark_tensor_purpose]) + for tens in sg.output_tensors: + mark_tensor_helper(tens, TensorPurpose.FeatureMap) + + if verbose_tensor_purpose: + nng.print_graph_with_tensors() + + return nng + + +reshape_operations = set( + ( + "Reshape", + "QuantizedReshape", + "ExpandDims", + "Squeeze", + "BatchToSpaceND", + "SpaceToBatchND", + "DepthToSpaceND", + "SpaceToDepthND", + "Placeholder", + ) +) + + +def mark_tensor_format(nng, arch, verbose_tensor_format=False): + formats_for_tensor = {} + + def init_tens(tens): + if tens.purpose == TensorPurpose.FeatureMap: + fmt = 
arch.default_feature_map_format + elif tens.purpose == TensorPurpose.Weights: + fmt = arch.default_weight_format + else: + assert 0, "unknown tensor purpose %s" % (tens.purpose,) + return fmt + + def find_npu_usage_of_tensor(tens): + for op in tens.consumers(): + if op.type == "DMA": + return find_npu_usage_of_tensor(op.outputs[0]) + if "npu_block_type" in op.attrs: + return op.attrs["npu_block_type"] + return NpuBlockType.Default + + def visit_tens(tens, ps): + if not tens in formats_for_tensor: + fmt = init_tens(tens) + else: + fmt = formats_for_tensor[tens] + + formats_for_tensor[tens] = fmt + + for sg in nng.subgraphs: + for ps in sg.passes: + for tens in ps.outputs: + visit_tens(tens, ps) + for tens in ps.intermediates: + visit_tens(tens, ps) + for tens in ps.inputs: + visit_tens(tens, ps) + + for tens, fmt in formats_for_tensor.items(): + tens.set_format(fmt, arch) + if fmt == TensorFormat.WeightsCompressed and tens.values is not None: + npu_block_type = find_npu_usage_of_tensor(tens) + if len(tens.ops) == 1 and tens.ops[0].type == "DMA": + weight_compressor.compress_weights(tens, arch, npu_block_type, Block(32, 32, 32), 32) + # Alias compressed weights back into source tensor + src_tens = tens.ops[0].inputs[0] + src_tens.compressed_values = tens.compressed_values + src_tens.storage_shape = tens.storage_shape + src_tens.brick_size = tens.brick_size + src_tens.weight_compression_scales = tens.weight_compression_scales + src_tens.weight_compressed_offsets = tens.weight_compressed_offsets + src_tens.compression_scale_for_worst_weight_stream = tens.compression_scale_for_worst_weight_stream + src_tens.storage_compression_scale = tens.storage_compression_scale + + if verbose_tensor_format: + nng.print_passes_with_tensors() diff --git a/ethosu/vela/model_reader.py b/ethosu/vela/model_reader.py new file mode 100644 index 00000000..6d7a3a4f --- /dev/null +++ b/ethosu/vela/model_reader.py @@ -0,0 +1,45 @@ +# Copyright (C) 2020 Arm Limited or its affiliates. All rights reserved. +# +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +# Description: +# Dispatcher for reading a neural network model. + + +class ModelReaderOptions: + def __init__(self, batch_size=1): + self.batch_size = batch_size + + def __str__(self): + return type(self).__name__ + ": " + str(self.__dict__) + + __repr__ = __str__ + + +def read_model(fname, options, feed_dict={}, output_node_names=[], initialisation_nodes=[]): + if fname.endswith(".tflite"): + from . import tflite_reader + + nng = tflite_reader.read_tflite( + fname, + options.batch_size, + feed_dict=feed_dict, + output_node_names=output_node_names, + initialisation_nodes=initialisation_nodes, + ) + else: + assert 0, "Unknown model format" + return nng diff --git a/ethosu/vela/nn_graph.py b/ethosu/vela/nn_graph.py new file mode 100644 index 00000000..8d335bd8 --- /dev/null +++ b/ethosu/vela/nn_graph.py @@ -0,0 +1,548 @@ +# Copyright (C) 2020 Arm Limited or its affiliates. All rights reserved. 
+# +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +# Description: +# Neural network graph classes and enums. +# Pass - A packed pass containing one or more Operations. +# CascadedPass - A scheduled pass containing one or more Passes, as well as a scheduling strategy and block +# configurations. +# Subgraph - Holds a neural network subgraph, pointing at Tensors, Operations, Passes, and CascadedPasses. +# Graph - A full neural network graph with one or more Subgraphs. + +import enum +from .data_type import BaseType, DataType +from .tensor import MemArea, TensorPurpose, TensorSubPurpose, TensorFormat, Tensor +from .operation import Operation, NpuBlockType + + +class PassPlacement(enum.Enum): + Unknown = 0 + Cpu = 1 + Npu = 2 + MemoryOnly = 3 + StartupInit = 4 + + +class TensorAllocator(enum.Enum): + LinearAlloc = 1 + Greedy = 2 + + def __str__(self): + return self.name + + +class Pass: + def __init__(self, name, placement, is_element_wise, npu_block_type): + self.inputs = [] + self.intermediates = [] + self.outputs = [] + self.ops = [] + self.primary_op = None + self.ifm_tensor = None + self.ifm2_tensor = None + self.ofm_tensor = None + self.weight_tensor = None + self.scale_tensor = None + self.name = name + self.cascade = None + self.placement = placement + + # TODO: rename is_element_wise because it is not the same as an ElementWise operator. It is used by the tensor + # allocation and requires that the OFM and IFM has the exact same address. Essentially complete overlap. 
+ self.is_element_wise = is_element_wise + self.npu_block_type = npu_block_type + self.block_config = None # will be filled in by scheduler + self.shared_buffer = None # will be filled in by scheduler + + self.predecessors = [] + self.successors = [] + + def __str__(self): + return "" % (self.name, self.placement, [op.type for op in self.ops]) + + __repr__ = __str__ + + def get_primary_op_ifm_weights(self): + if not self.primary_op: + return None, None + return self.primary_op.get_ifm_ifm2_weights_ofm()[::2] + + def get_primary_op_ifm_ifm2_weights_ofm(self): + if not self.primary_op: + return None, None, None, None + return self.primary_op.get_ifm_ifm2_weights_ofm() + + def get_primary_op_ifm_weights_biases_ofm(self): + if not self.primary_op: + return None, None, None, None + return self.primary_op.get_ifm_weights_biases_ofm() + + +class SchedulingStrategy(enum.Enum): + Unknown = -1 + IfmStream = 0 + WeightStream = 1 + + +class SchedulerRewrite(enum.Enum): + Nop = 0 + ChangeTensorSubPurpose = 1 + + +class CascadedPass: + def __init__(self, name, strat, inputs, intermediates, outputs, passes, placement, is_element_wise): + self.name = name + self.strategy = strat + self.inputs = inputs + self.intermediates = intermediates + self.outputs = outputs + self.passes = passes + self.placement = placement + self.is_element_wise = is_element_wise + + self.predecessors = [] + self.successors = [] + + def __str__(self): + return "" % ( + self.strategy, + len(self.passes), + self.name, + [ps.name for ps in self.passes], + [ps.block_config for ps in self.passes], + ) + + __repr__ = __str__ + + +class Subgraph: + def __init__(self, name="", placement=PassPlacement.Cpu): + self.output_tensors = [] + self.input_tensors = [] + self.original_inputs = [] # Preserve the original input order + self.passes = [] + self.cascaded_passes = [] + self.name = name + self.high_level_command_stream = [] + self.placement = placement + self.command_stream_tensor = None + self.flash_tensor = None + + self.memory_used = {} + + def __str__(self): + return "" % ( + self.name, + len(self.passes), + len(self.cascaded_passes), + ) + + __repr__ = __str__ + + def update_consumers(self): + visit_op_set = set() + visit_tensor_set = set() + self.input_tensors = [] + + print_visit = False + + def visit_op(op): + if op in visit_op_set: + return + + visit_op_set.add(op) + for inp in op.inputs: + if print_visit: + print(inp, "adding consumer", op) + visit_tensor(inp) + inp.consumer_list.append(op) + + if op.type in set(("Placeholder", "SubgraphInput")): + assert len(op.outputs) == 1 + self.input_tensors.append(op.outputs[0]) + + for out in op.outputs: + if out not in visit_tensor_set: + out.consumer_list = [] # reset unvisited output, just in case + + def visit_tensor(tens): + if tens in visit_tensor_set: + return + visit_tensor_set.add(tens) + tens.consumer_list = [] + for op in tens.ops: + visit_op(op) + + for ps in self.passes: + for tens in ps.outputs + ps.inputs: + tens.consumer_list = [] # reset unvisited tensors to start with + + for tens in self.output_tensors: + visit_tensor(tens) + tens.consumer_list.append(None) # special op to indicate that the graph consumes the result + + print_visit = True + for ps in self.passes: + for op in ps.ops: + visit_op(op) + for tens in ps.inputs: + visit_tensor(tens) + + def build_pass_links(self): + for idx, ps in enumerate(self.passes): + ps.time = 2 * idx + ps.predecessors = [] + ps.successors = [] + + for ps in self.passes: + for tens in ps.inputs: + for op in tens.ops: + pred_pass = 
op.scheduled_pass + assert pred_pass.time < ps.time + if ps not in pred_pass.successors: + pred_pass.successors.append(ps) + + if pred_pass not in ps.predecessors: + ps.predecessors.append(pred_pass) + + assert tens in pred_pass.outputs + + def build_pass_dag_predecessors(self): + for ps in self.passes: + ps.dag_predecessors = [] + + class State(enum.Enum): + NotVisited = 0 + BeingVisited = 1 + Visited = 2 + + pass_visit_dict = {} + + def visit_pass(ps): + state = pass_visit_dict.get(ps, State.NotVisited) + if state == State.Visited: + return True + elif state == State.BeingVisited: + return False # this is a loop, need to remove this link + elif state == State.NotVisited: + pass_visit_dict[ps] = State.BeingVisited + + ps.dag_predecessors = [] + for pred in ps.predecessors: + if visit_pass(pred): + ps.dag_predecessors.append(pred) + + pass_visit_dict[ps] = State.Visited + return True + + for ps in self.passes: + if not ps.successors: + visit_pass(ps) + + def build_cascaded_pass_links(self): + for cps in self.cascaded_passes: + cps.predecessors = [] + cps.successors = [] + + for cps in self.cascaded_passes: + for tens in cps.inputs: + for op in tens.ops: + pred_cpass = op.scheduled_pass.cascade + if cps not in pred_cpass.successors: + pred_cpass.successors.append(cps) + + if pred_cpass not in cps.predecessors: + cps.predecessors.append(pred_cpass) + + assert tens in pred_cpass.outputs + + def refresh_after_modification(self): + self.update_consumers() + + def prune_startup_init_pass(self): + assert len(self.passes) >= 1 + ps = self.passes[0] + assert ps.placement == PassPlacement.StartupInit + + ps.outputs = [out_tens for out_tens in ps.outputs if len(out_tens.consumers()) > 0] + ps.ops = [op for op in ps.ops if op.outputs[0] in ps.outputs] + + def get_all_ops(self): + all_ops = [] + visit_op_set = set() + visit_tensor_set = set() + + def visit_op(op): + if op in visit_op_set: + return + visit_op_set.add(op) + for inp in op.inputs: + visit_tensor(inp) + + all_ops.append(op) + + def visit_tensor(tens): + if tens in visit_tensor_set: + return + visit_tensor_set.add(tens) + for op in tens.ops: + visit_op(op) + + for tens in self.output_tensors: + visit_tensor(tens) + + return all_ops + + def print_operators(self): + all_ops = self.get_all_ops() + unique_ops = [] + print("print_operators") + for op in all_ops: + if op.type in set(("Const", "Identity", "Placeholder")): + continue + + attrs = op.attrs + if ( + op.type == "Conv2D" + or op.type == "DepthwiseConv2dNative" + or op.type == "Conv2DBiasAct" + or op.type == "DepthwiseConv2dBiasAct" + ): + kshape = op.inputs[1].shape + attrs["kshape"] = [kshape[0], kshape[1]] + attrs["type"] = op.type + attrs.pop("use_cudnn_on_gpu", None) + if attrs not in unique_ops: + unique_ops.append(attrs) + # print attributes in human readable format + a = attrs.copy() + s = a.pop("type") + data_format = a.pop("data_format", None) + if data_format and data_format != b"NHWC": + s += " " + str(data_format) + t = a.pop("T", None) + if t: + s += " " + str(t)[9:-2] + srct = a.pop("SrcT", None) + if srct: + s += " " + str(srct)[9:-2] + dstt = a.pop("DstT", None) + if dstt: + s += "->" + str(dstt)[9:-2] + print(s + " " + str(a)) + + def print_graph(self): + all_ops = self.get_all_ops() + for idx, op in enumerate(all_ops): + print(idx, op.type, op.name) + + def print_graph_with_tensors(self): + all_ops = self.get_all_ops() + for idx, op in enumerate(all_ops): + print(idx, op.type, op.name) + for idx, tens in enumerate(op.inputs): + print(" Input %02d %20s %20s %s" % (idx, 
tens.purpose.name, tens.mem_area.name, tens)) + for idx, tens in enumerate(op.outputs): + print(" Output %02d %20s %20s %s" % (idx, tens.purpose.name, tens.mem_area.name, tens)) + print() + + def print_graph_with_tensor_quantization(self): + all_ops = self.get_all_ops() + for idx, op in enumerate(all_ops): + print(idx, op.type, op.name) + for idx, tens in enumerate(op.inputs): + q = tens.quantization + if q is None: + print(" Input %02d %10s NO QUANTIZATION INFO %s" % (idx, tens.dtype, tens.name)) + else: + print( + " Input %02d %10s min=%s max=%s scale=%s zero_point=%s %s" + % (idx, tens.dtype, q.min, q.max, q.scale_f32, q.zero_point, tens.name) + ) + for idx, tens in enumerate(op.outputs): + q = tens.quantization + if q is None: + print(" Output %02d %10s NO QUANTIZATION INFO %s" % (idx, tens.dtype, tens.name)) + else: + print( + " Output %02d %10s min=%s max=%s scale=%s zero_point=%s %s" + % (idx, tens.dtype, q.min, q.max, q.scale_f32, q.zero_point, tens.name) + ) + print() + + def print_passes(self): + for idx, ps in enumerate(self.passes): + print("%03d %s" % (idx * 2, ps)) + + def print_passes_with_tensors(self): + for idx, ps in enumerate(self.passes): + print("%3d %s" % (idx * 2, ps)) + for idx, tens in enumerate(ps.inputs): + print( + " Input %2d %-15s %-15s %-15s %s" + % (idx, tens.purpose.name, tens.mem_area.name, tens.format.name, tens.name) + ) + for idx, tens in enumerate(ps.intermediates): + print( + " Intermediate %2d %-15s %-15s %-15s %s" + % (idx, tens.purpose.name, tens.mem_area.name, tens.format.name, tens.name) + ) + for idx, tens in enumerate(ps.outputs): + print( + " Output %2d %-15s %-15s %-15s %s" + % (idx, tens.purpose.name, tens.mem_area.name, tens.format.name, tens.name) + ) + print() + + def print_cascaded_passes(self): + for idx, ps in enumerate(self.cascaded_passes): + print("%3d %s SRAM used %.1f KB" % (idx * 2, ps, ps.sram_used / 1024)) + + def print_cascaded_passes_with_tensors(self): + for idx, ps in enumerate(self.cascaded_passes): + print("%3d %s SRAM used %.1f KB" % (idx * 2, ps, ps.sram_used / 1024)) + for idx, tens in enumerate(ps.inputs): + print( + " Input %2d %-15s %-15s %-15s %s" + % (idx, tens.purpose.name, tens.mem_area.name, tens.format.name, tens.name) + ) + for idx, tens in enumerate(ps.intermediates): + print( + " Intermediate %2d %-15s %-15s %-15s %s" + % (idx, tens.purpose.name, tens.mem_area.name, tens.format.name, tens.name) + ) + for idx, tens in enumerate(ps.outputs): + print( + " Output %2d %-15s %-15s %-15s %s" + % (idx, tens.purpose.name, tens.mem_area.name, tens.format.name, tens.name) + ) + print() + + def print_cascaded_passes_with_tensor_sizes(self): + for idx, ps in enumerate(self.cascaded_passes): + print("%3d %s SRAM used %.1f KB" % (idx * 2, ps, ps.sram_used / 1024)) + for idx, tens in enumerate(ps.inputs): + print( + " Input %2d %7.1f KB %-24s %-15s %-15s %-20s %s" + % ( + idx, + tens.storage_size() / 1024, + tens.storage_shape, + tens.mem_area.name, + tens.purpose.name, + tens.format.name, + tens.name, + ) + ) + for idx, tens in enumerate(ps.intermediates): + print( + " Intermediate %2d %7.1f KB %-24s %-15s %-15s %-20s %s" + % ( + idx, + tens.storage_size() / 1024, + tens.storage_shape, + tens.mem_area.name, + tens.purpose.name, + tens.format.name, + tens.name, + ) + ) + for idx, tens in enumerate(ps.outputs): + print( + " Output %2d %7.1f KB %-24s %-15s %-15s %-20s %s" + % ( + idx, + tens.storage_size() / 1024, + tens.storage_shape, + tens.mem_area.name, + tens.purpose.name, + tens.format.name, + tens.name, + ) + ) + 
print() + + def print_high_level_command_stream(self): + for idx, cmd in enumerate(self.high_level_command_stream): + print("%3d %s" % (idx, cmd)) + + +class Graph: + def __init__(self, name="", batch_size=1): + self.name = name + self.batch_size = batch_size + self.subgraphs = [] + + self.memory_used = {} + self.bits_per_element = {} + self.total_size = {} + self.total_elements = {} + + def get_root_subgraph(self): + return self.subgraphs[0] + + def prune_startup_init_pass(self): + for sg in self.subgraphs: + sg.prune_startup_init_pass() + + def update_consumers(self): + for sg in self.subgraphs: + sg.update_consumers() + + def refresh_after_modification(self): + for sg in self.subgraphs: + sg.refresh_after_modification() + + def print_operators(self): + for sg in self.subgraphs: + sg.print_operators() + + def print_graph(self): + for sg in self.subgraphs: + sg.print_graph() + + def print_graph_with_tensors(self): + for sg in self.subgraphs: + sg.print_graph_with_tensors() + + def print_graph_with_tensor_quantization(self): + for sg in self.subgraphs: + sg.print_graph_with_tensor_quantization() + + def print_passes(self): + for sg in self.subgraphs: + sg.print_passes() + + def print_passes_with_tensors(self): + for sg in self.subgraphs: + sg.print_passes_with_tensors() + + def print_cascaded_passes(self): + for sg in self.subgraphs: + sg.print_cascaded_passes() + + def print_cascaded_passes_with_tensors(self): + for sg in self.subgraphs: + sg.print_cascaded_passes_with_tensors() + + def print_cascaded_passes_with_tensor_sizes(self): + for sg in self.subgraphs: + sg.print_cascaded_passes_with_tensor_sizes() + + def print_high_level_command_stream(self): + for sg in self.subgraphs: + sg.print_high_level_command_stream() diff --git a/ethosu/vela/npu_performance.py b/ethosu/vela/npu_performance.py new file mode 100644 index 00000000..84cc4931 --- /dev/null +++ b/ethosu/vela/npu_performance.py @@ -0,0 +1,516 @@ +# Copyright (C) 2020 Arm Limited or its affiliates. All rights reserved. +# +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +# Description: +# NPU performance estimation functions to estimate performance of a Pass and CascadedPass. Uses a model that takes the +# maximum of the 'cycles required for bandwidth' and 'cycles required for computing'. +# +# Called during scheduling to evaluate different proposals, as well as post-scheduling to provide a final performance +# estimate. + +import enum +from . 
import numeric_util +import numpy as np +from .tensor import TensorPurpose, MemArea, TensorFormat, shape_num_elements, Tensor, TensorBlockTraversal +from .operation import Operation +from .data_type import DataType, BaseType +from .nn_graph import PassPlacement, NpuBlockType, SchedulerRewrite, Pass +from .architecture_features import Block, Kernel + + +def rolling_buffer_dims_from_passes(arch, ps1, block_config_ps1, ps2, block_config_ps2): + ps2_strides = (1, 1, 1, 1) + ps2_dilation = (1, 1, 1, 1) + for op in ps2.ops: + if "strides" in op.attrs: + ps2_strides = op.attrs["strides"] + if "dilation" in op.attrs: + ps2_dilation = op.attrs["dilation"] + + ifm_idx, _, weight_idx, _, _ = op.get_ifm_ifm2_weight_bias_ofm_indices() + + rolling_buffer_sizes = [] + + weight_tensor = op.inputs[weight_idx] + + ofm_block = Block(block_config_ps2[-3], block_config_ps2[-4], block_config_ps2[-1]) + kernel = Kernel( + weight_tensor.shape[1], weight_tensor.shape[0], ps2_strides[2], ps2_strides[1], ps2_dilation[2], ps2_dilation[1] + ) + kernel_block = Block(weight_tensor.shape[1], weight_tensor.shape[0], 65536) + + if ps2.npu_block_type in set((NpuBlockType.ConvolutionMxN, NpuBlockType.VectorProduct)): + ifm_block_depth = arch.calc_ifm_block_depth( + op.inputs[ifm_idx].shape[-1], op.inputs[ifm_idx].dtype.size_in_bits() + ) + else: + ifm_block_depth = block_config_ps2[-1] + + ifm_block = arch.get_ifm_block_size(ifm_block_depth, ofm_block, kernel, kernel_block) + + # The performed height calculation is for worst case + height = numeric_util.round_up(ifm_block.height + block_config_ps1[0], block_config_ps1[0]) + width = ifm_block.width + + rolling_buffer_sizes.append(height) + rolling_buffer_sizes.append(width) + + return rolling_buffer_sizes + + +class PassCycles(enum.IntEnum): + Dpu = 0 + ElementWise = 1 + Cpu = 2 + SramAccess = 3 + TotalPerPass = 4 + DramAccess = 5 + OnChipFlashAccess = 6 + OffChipFlashAccess = 7 + Total = 8 + Size = 9 + + def display_name(self): + return ( + "DPU", + "Element wise", + "CPU", + "SRAM Access", + "Total per Pass", + "DRAM Access", + "On-chip Flash Access", + "Off-chip Flash Access", + "Total", + "Size", + )[self.value] + + def identifier_name(self): + return ( + "dpu", + "element_wise", + "cpu", + "sram_access", + "total_per_pass", + "dram_access", + "on_chip_flash_access", + "off_chip_flash_access", + "total", + "size", + )[self.value] + + @staticmethod + def all(): + return ( + PassCycles.Dpu, + PassCycles.ElementWise, + PassCycles.Cpu, + PassCycles.SramAccess, + PassCycles.DramAccess, + PassCycles.OnChipFlashAccess, + PassCycles.OffChipFlashAccess, + PassCycles.Total, + ) + + +class MacCount(enum.IntEnum): + NeuralNetworkMacs = 0 + HardwareMacs = 1 + Size = 2 + + def display_name(self): + return ("Neural Network Macs", "Hardware Macs", "Size")[self.value] + + def identifier_name(self): + return ("nn_macs", "hardware_macs", "size")[self.value] + + @staticmethod + def all(): + return (MacCount.NeuralNetworkMacs, MacCount.HardwareMacs) + + +class BandwidthDirection(enum.IntEnum): + Read = 0 + Write = 1 + Size = 2 + + def display_name(self): + return self.name + + def identifier_name(self): + return self.name.lower() + + @staticmethod + def all(): + return (BandwidthDirection.Read, BandwidthDirection.Write) + + +def make_bandwidth_array(): + return np.zeros((MemArea.Size, TensorPurpose.Size, BandwidthDirection.Size)) + + +def make_macs_array(): + return np.zeros(MacCount.Size, np.int) + + +def make_cycles_array(): + return np.zeros(PassCycles.Size) + + +def make_metrics_arrays(): 
+ return (make_bandwidth_array(), make_macs_array(), make_cycles_array()) + + +def get_n_blocks_and_area( + ifm_brick_size, ifm_height_width, orig_skirt, clamped_skirt, block_config, min_block_size, strides +): + + ifm_block_config = (block_config[0] * strides[1], block_config[1] * strides[2]) + + n_normal_blocks = [] + remainder_size = [] + for i in range(2): + non_skirt_dim = ifm_height_width[i] - orig_skirt[i] - orig_skirt[2 + i] + n_blocks = non_skirt_dim // ifm_block_config[i] + n_normal_blocks.append(n_blocks) + remainder_dim = numeric_util.round_up( + ((non_skirt_dim - n_blocks * ifm_block_config[i] - 1) // strides[i + 1]) + 1, min_block_size[i] + ) + remainder_size.append(remainder_dim) + + # this will actually calculate reads into the edge padding. + + # there are four cases in total, handling the edges that will not fill a complete block. + + # 0000000001 + # 0000000001 + # 0000000001 + # 0000000001 + # 0000000001 + # 0000000001 + # 2222222223 + total_blocks = 0 + total_area = 0 + + block_setup = ( + (n_normal_blocks[0] * n_normal_blocks[1], block_config), + (1 * n_normal_blocks[1], (remainder_size[0], block_config[1])), + (n_normal_blocks[0] * 1, (block_config[0], remainder_size[1])), + (1 * 1, remainder_size), + ) + + for n_blocks, block_size in block_setup: + if block_size[0] == 0 or block_size[1] == 0: + continue + read_dims = [0, 0] + for i in range(2): + read_dims[i] = ( + numeric_util.round_up(clamped_skirt[i], ifm_brick_size[i + 1]) + + block_size[i] * strides[i + 1] + + numeric_util.round_up(clamped_skirt[2 + i], ifm_brick_size[i + 1]) + ) + assert n_blocks >= 0 + total_blocks += n_blocks + total_area += n_blocks * read_dims[0] * read_dims[1] + assert total_blocks >= 1 + return total_blocks, total_area, block_setup + + +def performance_metrics_for_pass(arch, ps, block_config=None, rewrite_list=[], force_outputs_to_fast_storage=False): + if block_config is None: + block_config = ps.block_config + bws = make_bandwidth_array() + macs = make_macs_array() + cycles = make_cycles_array() + blocks = 0 + ifm_read_multiple = 1 + weight_read_multiple = 0 + + if ps.placement in set((PassPlacement.MemoryOnly, PassPlacement.StartupInit)): + return bws, macs, cycles, blocks, ifm_read_multiple, weight_read_multiple # nothing real happening in this pass + + min_block_size = arch.min_block_sizes[ps.npu_block_type] + + skirt = (0, 0, 0, 0) + explicit_padding = (0, 0, 0, 0) + primary_op = ps.primary_op + replacement_read_bws = {} + if primary_op: + skirt = primary_op.attrs.get("skirt", skirt) + explicit_padding = primary_op.attrs.get("explicit_padding", explicit_padding) + assert primary_op.attrs["npu_block_type"] == ps.npu_block_type + npu_block_type = primary_op.attrs["npu_block_type"] + + ifm_tensor, _, weight_tensor, ofm_tensor = ps.get_primary_op_ifm_ifm2_weights_ofm() + + npu_convolution_ops = set((NpuBlockType.ConvolutionMxN, NpuBlockType.ConvolutionDepthWise)) + if (npu_block_type == NpuBlockType.Pooling and len(ifm_tensor.shape) == 4) or ( + npu_block_type in npu_convolution_ops + ): + + batch_size = ifm_tensor.shape[0] + ifm_tensor_shape = list(ifm_tensor.shape) + ifm_depth = ifm_tensor.bandwidth_shape[3] + + # add in padding + ifm_tensor_shape[1] += explicit_padding[0] + explicit_padding[2] # height += top and bottom + ifm_tensor_shape[2] += explicit_padding[1] + explicit_padding[3] # width += left and right + + strides = primary_op.attrs["strides"] + if npu_block_type != NpuBlockType.Pooling: + weight_tensor_shape = weight_tensor.shape + weight_tensor_bandwidth_shape = 
weight_tensor.bandwidth_shape + weight_tensor_element_size = weight_tensor.element_size() + weight_tensor_bandwidth_compression_scale = weight_tensor.bandwidth_compression_scale + nn_ops = ( + int(ofm_tensor.shape[0]) + * int(ofm_tensor.shape[1]) + * int(ofm_tensor.shape[2]) + * int(weight_tensor_shape[0]) + * int(weight_tensor_shape[1]) + * int(weight_tensor_shape[2]) + * int(weight_tensor_shape[3]) + / int(strides[1]) + / int(strides[2]) + ) + else: + weight_tensor_shape = [ + primary_op.attrs["ksize"][1], + primary_op.attrs["ksize"][2], + 1, + ifm_tensor_shape[3], + ] + weight_tensor_bandwidth_shape = weight_tensor_shape + weight_tensor_element_size = 0 + weight_tensor_bandwidth_compression_scale = 0.0 + nn_ops = 0 # pooling doesn't count as NN ops + + kernel_dims = weight_tensor_shape[:2] + + sub_kernel_limits = arch.sub_kernel_limits[npu_block_type] + # count the sub kernels; the IFM block needs to be refetched for each of them + n_sub_kernels_y = numeric_util.round_up_divide(kernel_dims[0], sub_kernel_limits[0]) + n_sub_kernels_x = numeric_util.round_up_divide(kernel_dims[1], sub_kernel_limits[1]) + n_sub_kernels = n_sub_kernels_y * n_sub_kernels_x + + clamped_skirt = list(skirt) + clamped_skirt[2] = min(clamped_skirt[2], sub_kernel_limits[0] - 1 - clamped_skirt[0]) + clamped_skirt[3] = min(clamped_skirt[3], sub_kernel_limits[1] - 1 - clamped_skirt[1]) + n_blocks, area, block_setup = get_n_blocks_and_area( + ifm_tensor.brick_size, + ifm_tensor_shape[1:3], + skirt, + clamped_skirt, + block_config, + min_block_size, + strides, + ) + + blocks = n_blocks * numeric_util.round_up_divide(weight_tensor_shape[3], block_config[3]) + + n_weight_stages = numeric_util.round_up_divide(weight_tensor_bandwidth_shape[3], block_config[3]) + if npu_block_type == NpuBlockType.ConvolutionDepthWise or npu_block_type == NpuBlockType.Pooling: + n_weight_stages = 1 # force to no reread + + ifm_tensor_bw = ( + n_sub_kernels + * batch_size + * area + * ifm_depth + * n_weight_stages + * ifm_tensor.element_size() + * ifm_tensor.bandwidth_compression_scale + ) + replacement_read_bws[ifm_tensor] = ifm_tensor_bw + ifm_read_multiple = n_weight_stages + + replacement_read_bws[weight_tensor] = ( + batch_size + * shape_num_elements(weight_tensor_bandwidth_shape) + * weight_tensor_element_size + * weight_tensor_bandwidth_compression_scale + * n_blocks + ) # read once per block and batch + weight_read_multiple = n_blocks + + n_kernel_xy = kernel_dims[0] * kernel_dims[1] + n_input_channels_at_a_time = block_config[2] + + if npu_block_type == NpuBlockType.Pooling or weight_tensor.block_traversal in set( + (TensorBlockTraversal.PartKernelFirst, TensorBlockTraversal.DepthWise) + ): + n_input_channels_at_a_time = numeric_util.round_up_divide(n_input_channels_at_a_time, 4) + n_kernel_xy = max( + n_kernel_xy, 4 + ) # need at least 4, as this is the minimum duty cycle for secondary accumulator writes + if weight_tensor is not None: + n_kernel_xy = numeric_util.round_up( + n_kernel_xy, 4 + ) # weights need to be read in blocks of 4 + + num_mac_ops = 0 + for n_blocks_for_size, block_size in block_setup: + num_mac_ops += ( + batch_size + * n_blocks_for_size + * block_size[0] + * block_size[1] + * numeric_util.round_up(weight_tensor_shape[2], n_input_channels_at_a_time) + * numeric_util.round_up(weight_tensor_shape[3], block_config[3]) + * n_kernel_xy + ) + + if npu_block_type == NpuBlockType.Pooling: + # TODO: improve pooling estimation + cycles[PassCycles.Dpu] = num_mac_ops / arch.num_macs_per_cycle / 2 + else: + 
cycles[PassCycles.Dpu] = num_mac_ops / arch.num_macs_per_cycle + macs[MacCount.NeuralNetworkMacs] += nn_ops + macs[MacCount.HardwareMacs] += num_mac_ops + + elif npu_block_type == NpuBlockType.VectorProduct: + nn_macs = ( + ifm_tensor.shape[0] + * numeric_util.round_up(weight_tensor.shape[-2], block_config[2]) + * numeric_util.round_up(weight_tensor.shape[-1], block_config[3]) + ) + num_mac_ops = nn_macs + + cycles[PassCycles.Dpu] = num_mac_ops / arch.num_macs_per_cycle + macs[MacCount.NeuralNetworkMacs] += nn_macs + macs[MacCount.HardwareMacs] += num_mac_ops + + blocks = 1 * numeric_util.round_up_divide(weight_tensor.shape[-1], block_config[3]) + + non_zero_fraction = 1.0 + if ifm_tensor.values is not None: + nz_vector = np.amax(ifm_tensor.values != 0, axis=0) # max across batch axis + non_zero_fraction = np.average(nz_vector) + + replacement_read_bws[ifm_tensor] = ifm_tensor.bandwidth() + replacement_read_bws[weight_tensor] = weight_tensor.bandwidth() * non_zero_fraction + ifm_read_multiple = 1 + weight_read_multiple = non_zero_fraction + else: + if ps.placement == PassPlacement.Npu and len(ps.outputs): + # Assume element-wise operation going through the element pipelines. + # Work out how many elements we have and calculate performance. + out = ps.outputs[0] + elms = out.elements() + + cycles[PassCycles.ElementWise] = numeric_util.round_up_divide(elms, arch.num_elem_wise_units) + + if ps.placement == PassPlacement.Cpu: + cycles[PassCycles.Cpu] = arch.cpu_cycle_estimate(ps.ops[0]) + + # apply the desired rewrites + for rewrite_op, tens, _, _, _, ps_to_rewrite in rewrite_list: + if ps != ps_to_rewrite: + continue + if rewrite_op == SchedulerRewrite.Nop: + pass # these are fine, no bandwidth changes + elif rewrite_op in (SchedulerRewrite.ChangeTensorSubPurpose,): + bws[arch.fast_storage_mem_area][tens.purpose][BandwidthDirection.Read] += replacement_read_bws[tens] + replacement_read_bws[tens] = 0 + + for tens in ps.outputs: + if force_outputs_to_fast_storage: + bws[arch.fast_storage_mem_area][tens.purpose][BandwidthDirection.Write] += tens.bandwidth() + else: + bws[tens.mem_area][tens.purpose][BandwidthDirection.Write] += tens.bandwidth() + + for tens in ps.intermediates: + bws[tens.mem_area][tens.purpose][BandwidthDirection.Write] += tens.bandwidth() + + if tens in replacement_read_bws: + bw = replacement_read_bws[tens] + else: + bw = tens.bandwidth() + + bws[tens.mem_area][tens.purpose][BandwidthDirection.Read] += bw + + for tens in ps.inputs: + if tens in replacement_read_bws: + bw = replacement_read_bws[tens] + else: + bw = tens.bandwidth() + + bws[tens.mem_area][tens.purpose][BandwidthDirection.Read] += bw + + cycles[PassCycles.SramAccess] = np.sum(bws[MemArea.Sram]) / arch.memory_bandwidths_per_cycle[MemArea.Sram] + cycles[PassCycles.TotalPerPass] = np.max(cycles[: PassCycles.TotalPerPass]) + + # quick build access counts for only current pass, even though these aren't the final numbers + update_summary_cycles(arch, bws, macs, cycles) + + return bws, macs, cycles, blocks, ifm_read_multiple, weight_read_multiple + + +def update_summary_cycles(arch, bws, macs, cycles): + cycles[PassCycles.DramAccess] = np.sum(bws[MemArea.Dram]) / arch.memory_bandwidths_per_cycle[MemArea.Dram] + cycles[PassCycles.OnChipFlashAccess] = ( + np.sum(bws[MemArea.OnChipFlash]) / arch.memory_bandwidths_per_cycle[MemArea.OnChipFlash] + ) + cycles[PassCycles.OffChipFlashAccess] = ( + np.sum(bws[MemArea.OffChipFlash]) / arch.memory_bandwidths_per_cycle[MemArea.OffChipFlash] + ) + + cycles[PassCycles.Total] = 
np.max(cycles[: PassCycles.Total]) + return cycles + + +def collate_stats_for_cascaded_pass(arch, bws, macs, cycles): + return bws, macs, cycles + + +def performance_for_cascaded_pass(arch, cps): + total_bws = make_bandwidth_array() + total_macs = make_macs_array() + total_cycles = make_cycles_array() + + for ps in cps.passes: + bws, macs, cycles, blocks, _, _ = performance_metrics_for_pass(arch, ps) + ps.bandwidths = bws + ps.macs = macs + ps.cycles = cycles + ps.n_blocks = blocks + total_bws += bws + total_macs += macs + total_cycles += cycles + + bws, macs, cycles = collate_stats_for_cascaded_pass(arch, total_bws, total_macs, total_cycles) + cps.bandwidths = bws + cps.macs = macs + cps.cycles = cycles + return bws, macs, cycles + + +def calc_performance_for_network(nng, arch): + total_bws = make_bandwidth_array() + total_macs = np.zeros(MacCount.Size) + total_cycles = np.zeros(PassCycles.Size) + + for sg in nng.subgraphs: + for cps in sg.cascaded_passes: + bws, macs, cycles = performance_for_cascaded_pass(arch, cps) + total_bws += bws + total_macs += macs + total_cycles += cycles + total_cycles += arch.inter_pass_cycle_delay + + nng.bandwidths = total_bws + nng.macs = total_macs + nng.cycles = total_cycles diff --git a/ethosu/vela/npu_serialisation.py b/ethosu/vela/npu_serialisation.py new file mode 100644 index 00000000..4542c25b --- /dev/null +++ b/ethosu/vela/npu_serialisation.py @@ -0,0 +1,145 @@ +# Copyright (C) 2020 Arm Limited or its affiliates. All rights reserved. +# +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +# Description: +# Serialises and packs an NPU subgraph into tensors. + +from .nn_graph import PassPlacement +from .tensor import MemArea, Tensor, TensorPurpose, TensorFormat +from .operation import Operation +from .data_type import DataType +import numpy as np +from . 
import driver_actions +import struct + + +def make_memory_tensor(name, mem_area, sz, want_values, arch): + tens = Tensor([sz], DataType.uint8, name) + tens.mem_area = mem_area + tens.purpose = TensorPurpose.FeatureMap + tens.set_format(TensorFormat.NHWC, arch) + if want_values: + tens.values = np.zeros(tens.shape, np.uint8) + return tens + + +def copy_compressed_values_to_memory_tensor(memory_tensor, src_tensor): + start_addr = src_tensor.address + for compressed_values in src_tensor.compressed_values: + end_addr = start_addr + len(compressed_values) + memory_tensor.values[start_addr:end_addr] = compressed_values + start_addr = end_addr + + +def serialise_npu_subgraph_into_tensors(nng, sg, arch, scratch_tens, flash_tens): + if sg.placement != PassPlacement.Npu: + return scratch_tens, flash_tens + + flash_area = arch.permanent_storage_mem_area + scratch_area = MemArea.Sram + + flash_size = sg.memory_used.get(flash_area, 0) + scratch_size = sg.memory_used.get(scratch_area, 0) + + # Prepare driver actions for this command tensor + da_list = [] + driver_actions.emit_fourcc(da_list, "COP1") + driver_actions.emit_config(da_list, 0, 1, arch) + driver_actions.emit_cmd_stream_header(da_list, len(sg.register_command_stream)) + + # Append command stream words + da_list.extend(sg.register_command_stream) + + # Convert to bytes + payload_bytes = struct.pack("<{0}I".format(len(da_list)), *da_list) + + command_stream_size_bytes = len(payload_bytes) + + # Adjust the bits per element calculation to exclude metadata generated by Vela + nng.total_size[flash_area] = nng.total_size.get(flash_area, 0) - flash_size - command_stream_size_bytes + nng.total_elements[flash_area] = nng.total_elements.get(flash_area, 0) - flash_size - command_stream_size_bytes + nng.total_size[scratch_area] = nng.total_size.get(scratch_area, 0) - scratch_size + nng.total_elements[scratch_area] = nng.total_elements.get(scratch_area, 0) - scratch_size + + if flash_tens == scratch_tens == None: + # First Npu subgraph, create scratch and flash tensors + sg.scratch_tensor = make_memory_tensor(sg.name + "_scratch", scratch_area, scratch_size, False, arch) + sg.scratch_tensor.purpose = TensorPurpose.Scratch + sg.flash_tensor = make_memory_tensor(sg.name + "_flash", flash_area, flash_size, True, arch) + else: + sg.scratch_tensor = scratch_tens + sg.scratch_tensor.shape[0] += scratch_size + sg.flash_tensor = flash_tens + sg.flash_tensor.shape[0] += flash_size + + for cps in sg.cascaded_passes: + for ps in cps.passes: + if ps.placement == PassPlacement.Npu and ps.weight_tensor != None: + # For DMA ops, ps.weight_tensor is referring to the SRAM weight tensor and therefore the address + # is pointing at the destination address of where the weights should be placed in SRAM. + # This ensures that the Flash weight tensor is used instead and thus gets the correct address. 
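
The serialisation step above packs the driver-action words and the register command stream into a little-endian byte payload before wrapping it in a flash tensor. As an illustrative aside (a minimal standalone sketch, not code from this patch; the demo words are stand-in values), the same struct.pack / np.frombuffer round trip looks like this:

import struct

import numpy as np


def pack_words_to_uint8(words):
    # Pack 32-bit unsigned words little-endian, then reinterpret the payload as bytes,
    # mirroring the struct.pack(...) / np.frombuffer(...) pair used by the serialiser.
    payload = struct.pack("<{0}I".format(len(words)), *words)
    return np.frombuffer(payload, dtype=np.uint8)


demo_words = [0x31504F43, 0x00000002]  # stand-in values: a "COP1" FourCC word and one header word
flat = pack_words_to_uint8(demo_words)
assert flat.size == 4 * len(demo_words)
print(bytes(flat[:4]))  # b'COP1'
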
+ if ps.weight_tensor.ops[0].type == "DMA": + copy_compressed_values_to_memory_tensor(sg.flash_tensor, ps.weight_tensor.ops[0].inputs[0]) + else: + copy_compressed_values_to_memory_tensor(sg.flash_tensor, ps.weight_tensor) + + copy_compressed_values_to_memory_tensor(sg.flash_tensor, ps.scale_tensor) + + sg.command_stream_tensor = make_memory_tensor( + sg.name + "_command_stream", flash_area, command_stream_size_bytes, True, arch + ) + sg.command_stream_tensor.values = np.frombuffer(payload_bytes, dtype=np.uint8) + + return sg.scratch_tensor, sg.flash_tensor + + +def add_const_tens_to_startup_cascaded_pass(startup_cps, tens): + op = Operation("Const", tens.name + "_const") + op.outputs = [tens] + tens.ops = [op] + startup_cps.passes[0].ops.insert(0, op) + startup_cps.passes[0].outputs.insert(0, tens) + startup_cps.outputs.insert(0, tens) + + +def rewrite_npu_call_ops(nng, sg, arch): + if sg.placement != PassPlacement.Cpu: + return + + startup_cps = sg.cascaded_passes[0] + + for idx, cps in enumerate(sg.cascaded_passes): + for ps in cps.passes: + for op in ps.ops: + if op.type == "NpuOp": + callee = op.attrs["subgraph"] + op.attrs["custom_options"] = {"type": op.type} + + sz = 0 + for tens in [callee.scratch_tensor, callee.flash_tensor, callee.command_stream_tensor]: + op.inputs.insert(0, tens) + ps.inputs.insert(0, tens) + cps.inputs.insert(0, tens) + if tens != callee.scratch_tensor: + add_const_tens_to_startup_cascaded_pass(startup_cps, tens) + sz += tens.storage_size() + + for prev_cps in sg.cascaded_passes[: idx + 1]: + prev_cps.sram_used += sz + + if callee.scratch_tensor is not None: + cps.sram_used += callee.scratch_tensor.storage_size() diff --git a/ethosu/vela/numeric_util.py b/ethosu/vela/numeric_util.py new file mode 100644 index 00000000..e5bc88b8 --- /dev/null +++ b/ethosu/vela/numeric_util.py @@ -0,0 +1,89 @@ +# Copyright (C) 2020 Arm Limited or its affiliates. All rights reserved. +# +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +# Description: +# Numerical utilities for various types of rounding etc. + +import math +import numpy as np + + +def round_up(a, b): + return ((a + b - 1) // b) * b + + +def round_up_divide(a, b): + return (a + b - 1) // b + + +def round_up_to_int(v): + return int(math.ceil(v)) + + +def round_down_to_power_of_two(v): + assert v > 0 + while v & (v - 1): + v &= v - 1 + + return v + + +def round_up_to_power_of_two(v): + return round_down_to_power_of_two(2 * v - 1) + + +def round_down_log2(v): + return int(math.floor(np.log2(v))) + + +def round_up_log2(v): + return int(math.ceil(np.log2(v))) + + +def round_to_int(v): + return np.rint(v).astype(np.int64) + + +# Performs rounding away from zero. +# n.b. 
This is identical to C++11 std::round() +def round_away_zero(f): + r = -0.5 if (f < 0) else 0.5 + return np.trunc(f + r) + + +def quantise_float32(f, scale=1.0, zero_point=0): + return zero_point + int(round_away_zero(np.float32(f) / np.float32(scale))) + + +def clamp_tanh(x): + if x <= -4: + y = -1.0 + elif x >= 4: + y = 1.0 + else: + y = math.tanh(x) + return y + + +def clamp_sigmoid(x): + if x <= -8: + y = 0.0 + elif x >= 8: + y = 1.0 + else: + y = 1 / (1 + math.exp(-x)) + return y diff --git a/ethosu/vela/operation.py b/ethosu/vela/operation.py new file mode 100644 index 00000000..d2f2806a --- /dev/null +++ b/ethosu/vela/operation.py @@ -0,0 +1,285 @@ +# Copyright (C) 2020 Arm Limited or its affiliates. All rights reserved. +# +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +# Description: +# Internal representation of a Neural Network Operation. + +import enum + + +class NpuBlockType(enum.Enum): + Default = 0 + ConvolutionMxN = 1 + VectorProduct = 2 + Pooling = 3 + ConvolutionDepthWise = 4 + ElementWise = 5 + + +class Operation: + """Class representing a Neural Network operation. Has a name, a type, +input and output tensors, as well as an attribute dictionary.""" + + __slots__ = "type", "name", "attrs", "inputs", "outputs", "flops", "scheduled_pass", "run_on_npu" + + def __init__(self, op_type, name): + self.type = op_type + self.name = name + self.attrs = {} + self.inputs = [] + self.outputs = [] + self.flops = 0 + self.run_on_npu = True + self.scheduled_pass = None + + def clone(self, suffix="_clone"): + res = Operation(self.type, self.name + suffix) + + res.attrs = dict(self.attrs) + res.inputs = list(self.inputs) + res.outputs = list(self.outputs) + res.flops = self.flops + res.scheduled_pass = self.scheduled_pass + + return res + + def __str__(self): + return "" % (self.name, self.type) + + __repr__ = __str__ + + def get_ifm_ifm2_weight_bias_ofm_indices(self): + ifm_idx = -1 + ifm2_idx = -1 + weight_idx = -1 + bias_idx = -1 + ofm_idx = -1 + npu_block_type = self.attrs.get("npu_block_type", NpuBlockType.Default) + if npu_block_type in set((NpuBlockType.ConvolutionMxN, NpuBlockType.ConvolutionDepthWise)): + ifm_idx = 0 + weight_idx = 1 + ofm_idx = 0 + + if self.type in set(("Conv2DBiasAct", "DepthwiseConv2dBiasAct", "TransposeConvAct")): + if len(self.inputs) >= 3: + bias_idx = 2 + + elif npu_block_type == NpuBlockType.Pooling: + ifm_idx = 0 + ofm_idx = 0 + elif npu_block_type == NpuBlockType.VectorProduct: + ifm_idx = 0 + weight_idx = 1 + ofm_idx = 0 + + if self.type in set(("FullyConnectedAct",)): + if len(self.inputs) >= 3: + bias_idx = 2 + + if self.type == "BlockLSTM": + ifm_idx = 3 + weight_idx = 4 + ofm_idx = 6 + + elif npu_block_type == NpuBlockType.ElementWise: + ifm_idx = 0 + ifm2_idx = 1 + ofm_idx = 0 + + # LeakyRelu and Abs have a single IFM + if self.type in set(("LeakyRelu", "Abs")): + ifm2_idx = -1 + + elif self.type == "Conv2DBackpropInput": + ifm_idx = 2 + weight_idx = 1 + ofm_idx = 0 + + elif self.type in set(("Squeeze", "Reshape", 
"QuantizedReshape", "ExpandDims")): + ifm_idx = 0 + ofm_idx = 0 + + elif self.is_split_op(): + ifm_idx = 0 + ofm_idx = 0 + if self.type == "Split": + ifm_idx = 1 + + elif self.is_concat_op(): + ifms, _ = self.get_concat_inputs_axis() + ifm_idx = self.inputs.index(ifms[0]) + if len(ifms) > 1: + ifm2_idx = self.inputs.index(ifms[1]) + ofm_idx = 0 + + return ifm_idx, ifm2_idx, weight_idx, bias_idx, ofm_idx + + def get_ifm_ifm2_weights_ofm(self): + ifm_tensor = None + ifm2_tensor = None + weight_tensor = None + ofm_tensor = None + + ifm_idx, ifm2_idx, weight_idx, bias_idx, ofm_idx = self.get_ifm_ifm2_weight_bias_ofm_indices() + if ifm_idx != -1: + ifm_tensor = self.inputs[ifm_idx] + if ifm2_idx != -1: + ifm2_tensor = self.inputs[ifm2_idx] + if weight_idx != -1: + weight_tensor = self.inputs[weight_idx] + if ofm_idx != -1: + ofm_tensor = self.outputs[ofm_idx] + + return ifm_tensor, ifm2_tensor, weight_tensor, ofm_tensor + + def get_ifm_weights_biases_ofm(self): + ifm_tensor = None + weight_tensor = None + bias_tensor = None + ofm_tensor = None + + ifm_idx, _, weight_idx, bias_idx, ofm_idx = self.get_ifm_ifm2_weight_bias_ofm_indices() + if ifm_idx != -1: + ifm_tensor = self.inputs[ifm_idx] + if weight_idx != -1: + weight_tensor = self.inputs[weight_idx] + if bias_idx != -1: + bias_tensor = self.inputs[bias_idx] + if ofm_idx != -1: + ofm_tensor = self.outputs[ofm_idx] + + return ifm_tensor, weight_tensor, bias_tensor, ofm_tensor + + concat_ops = set(("Concat", "ConcatV2", "QuantizedConcat", "ConcatTFLite", "PackReshaped")) + + def is_concat_op(self): + return self.type in Operation.concat_ops + + def get_concat_inputs_axis(self): + assert self.is_concat_op() + + if self.type == "ConcatV2": + axis_tensor = self.inputs[-1] + inputs = self.inputs[:-1] + elif self.type == "Concat": + axis_tensor = self.inputs[0] + inputs = self.inputs[1:] + elif self.type == "QuantizedConcat": + axis_tensor = self.inputs[0] + inputs = self.inputs[1:] + inputs = inputs[: len(inputs) // 3] # Skip min/max + + if self.type == "ConcatTFLite": + inputs = self.inputs + axis = self.attrs["axis"] + elif self.type == "PackReshaped": + # Requires fixup_pack_input to be called before this point + inputs = self.inputs + axis = self.attrs["axis"] + assert len(self.inputs) == self.attrs["values_count"] + else: + assert len(axis_tensor.ops) == 1 and axis_tensor.ops[0].type == "Const" + axis = int(axis_tensor.values) + + return inputs, axis + + split_ops = set(("Split", "StridedSlice", "Slice", "UnpackReshaped")) + + def is_split_op(self): + return self.type in Operation.split_ops + + def get_split_inputs_axis(self): + assert self.is_split_op() + + offset_start = None + offset_end = None + axis = None + if self.type == "Split": + # TODO: Extend split capabilities + # If num_or_size_splits is an integer, then value is split along dimension axis into num_split smaller + # tensors. This requires that num_split evenly divides value.shape[axis]. + # If num_or_size_splits is a 1-D Tensor (or list), we call it size_splits and value is split into + # len(size_splits) elements. The shape of the i-th element has the same size as the value except along + # dimension axis where the size is size_splits[i]. 
+ num_splits = self.attrs.get("num_splits") + axis_tens = self.inputs[0] + assert len(axis_tens.ops) == 1 and axis_tens.ops[0].type == "Const" + axis = int(axis_tens.values) + input_tens = self.inputs[1] + outputs = self.outputs + assert num_splits == len(outputs) + + elif self.type == "Slice": + input_tens, begin_tens, size_tens = self.inputs + outputs = self.outputs + offset_start = [0] * len(input_tens.shape) + offset_end = [0] * len(input_tens.shape) + + for idx in range(len(begin_tens.values)): + # Check if the op should slice in dimension idx + if size_tens.values[idx] != input_tens.shape[idx]: + offset_start[idx] = begin_tens.values[idx] + offset_end[idx] = size_tens.values[idx] + offset_start[idx] + + elif self.type == "StridedSlice": + input_tens, begin_tens, end_tens, strides_tens = self.inputs + outputs = self.outputs + out_tens = outputs[0] + offset_start = [0] * len(outputs[0].shape) + offset_end = [0] * len(outputs[0].shape) + + # Extract masks + begin_mask = self.attrs["begin_mask"] + ellipsis_mask = self.attrs["ellipsis_mask"] + end_mask = self.attrs["end_mask"] + new_axis_mask = self.attrs["new_axis_mask"] + shrink_axis_mask = self.attrs["shrink_axis_mask"] + # TODO: Either extend this to support these different masks or check + # for this at an earlier stage and place the op on Cpu if needed + assert begin_mask == end_mask + assert new_axis_mask == ellipsis_mask == 0 + # shrink_axis_mask is not supported by the Operation class but the operation + # may have the attribute modified and handled in the graph optimization phase. + assert shrink_axis_mask == 0 + assert len(input_tens.shape) == len(out_tens.shape) + + for idx in range(len(input_tens.shape)): + # If the i:th bit in begin_mask is set then the value on begin[i] should be ignored + if (begin_mask & (1 << idx)) == 0: + # Check if the op should slice in dimension idx + if end_tens.values[idx] != input_tens.shape[idx] or ( + end_tens.values[idx] == input_tens.shape[idx] and begin_tens.values[idx] != 0 + ): + offset_start[idx] = begin_tens.values[idx] + offset_end[idx] = end_tens.values[idx] + + else: + # Don't slice in this axis, instead use fullest possible range + continue + + elif self.type == "UnpackReshaped": + # Requires fixup_unpack_output to be called before this point + input_tens = self.inputs[0] + outputs = self.outputs + axis = self.attrs["axis"] + num_splits = self.attrs["num"] + # Number of outputs have to equal the value of the dimension to unpack + assert num_splits == len(outputs) == input_tens.shape[axis] + else: + assert False + + return input_tens, outputs, axis, offset_start, offset_end diff --git a/ethosu/vela/pass_packing.py b/ethosu/vela/pass_packing.py new file mode 100644 index 00000000..663520fc --- /dev/null +++ b/ethosu/vela/pass_packing.py @@ -0,0 +1,489 @@ +# Copyright (C) 2020 Arm Limited or its affiliates. All rights reserved. +# +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +# Description: +# Packs a subgraph with Neural Network Operations into Passes. 
Each Pass has one or more Operations. + +from .nn_graph import Operation, Pass, PassPlacement, TensorPurpose, NpuBlockType, Tensor +import collections +import enum +from .data_type import BaseType, DataType + + +class PassFlags(enum.Flag): + Empty = 0 + Pre = 1 + Main = 2 + Post = 4 + Mac = 8 + Dma = 32 + ElementWise = 256 + Npu = 512 + Cpu = 1024 + StartupInit = 2048 + MemoryOnly = 4096 + PostFusingLimited = 8192 + + +npu_pre_ops = set(("QuantizedResizeBilinear", "SplitSliceRead",)) + +mac_main_ops = set( + ( + # convolutions + "Conv2DBiasAct", + "Conv2D", + "QuantizedConv2D", + "Conv2DBackpropInputSwitched", + # depth-wise convolutions + "DepthwiseConv2dBiasAct", + "DepthwiseConv2dNative", + "QuantizedDepthwiseConv2D", + # FC layers + "QuantizedMatMul", + "MatMul", + "FullyConnectedAct", + # RNN/LSTM/GRU + "BlockLSTM", + # pooling + "QuantizedMaxPool", + "QuantizedAvgPool", + "AvgPool", + "MaxPool", + "AvgPoolAct", + "MaxPoolAct", + ) +) + +binary_elem_wise_main_ops = set( + ( + # binary element-wise + "AddAct", + "MulAct", + "SubAct", + "QuantizedAdd", + "QuantizedSub", + "QuantizedMul", + "Mul", + "Add", + "Sub", + "Minimum", + "Maximum", + ) +) + +unary_elem_wise_main_ops = set(("LeakyRelu", "Abs")) # Unary element-wise operations + +elem_wise_main_ops = binary_elem_wise_main_ops | unary_elem_wise_main_ops + +activation_ops = set(("QuantizedRelu", "QuantizedRelu1", "QuantizedRelu6", "Relu", "Relu6", "ReluN1To1")) +npu_post_ops = activation_ops | set( + # Bias-add operations: Get rid of these once we have rewrites from Conv2D + BiasAdd + Activation to Conv2DBiasAct. + ("Mul", "Add", "QuantizedBiasAdd", "Requantize", "QuantizedBatchNorm", "BiasAdd", "FusedBatchNorm") +) + +npu_post_fuse_limited_ops = set( + # Set of post operators that should not be fused with main/elementwise ops + ("ConcatSliceWrite", "Sigmoid", "Tanh") +) + +elem_wise_ops = elem_wise_main_ops | activation_ops | set(("Sigmoid", "Tanh")) + + +quantization_ops = set(("Dequantize", "QuantizeV2", "Max", "Min")) +cpu_ops = ( + set(("Softmax", "QuantizedSoftmax", "LRN", "Shape", "QuantizedPad", "Pad", "AddN")) + | quantization_ops +) + +npu_dma_ops = set(("DMA",)) +startup_init_ops = set(("Const", "VariableV2", "Placeholder", "SubgraphInput")) +memory_only_ops = set(("Squeeze", "Reshape", "QuantizedReshape", "ExpandDims",)) + + +test_sequence = [ + ( + # ops_set + npu_post_ops, + # incompatible_pack_flags + PassFlags.Cpu | PassFlags.MemoryOnly | PassFlags.Pre | PassFlags.Main, + # flags_to_set + PassFlags.Npu | PassFlags.Post, + # flags_to_clear + PassFlags.Empty, + ), + ( + # ops_set + npu_post_fuse_limited_ops, + # incompatible_pack_flags + PassFlags.Cpu | PassFlags.MemoryOnly | PassFlags.Pre | PassFlags.Main, + # flags_to_set + PassFlags.Npu | PassFlags.PostFusingLimited, + # flags_to_clear + PassFlags.Empty, + ), + ( + # ops_set + mac_main_ops, + # incompatible_pack_flags + PassFlags.Cpu + | PassFlags.MemoryOnly + | PassFlags.ElementWise + | PassFlags.Pre + | PassFlags.Main + | PassFlags.PostFusingLimited, + # flags_to_set + PassFlags.Npu | PassFlags.Mac | PassFlags.Main, + # flags_to_clear + PassFlags.Empty, + ), + ( + # ops_set + elem_wise_main_ops, + # incompatible_pack_flags + PassFlags.Cpu + | PassFlags.MemoryOnly + | PassFlags.Mac + | PassFlags.Pre + | PassFlags.Main + | PassFlags.PostFusingLimited, + # flags_to_set + PassFlags.Npu | PassFlags.ElementWise | PassFlags.Main, + # flags_to_clear + PassFlags.Empty, + ), + ( + # ops_set + npu_pre_ops, + # incompatible_pack_flags + PassFlags.Cpu | PassFlags.MemoryOnly, + 
# flags_to_set + PassFlags.Npu | PassFlags.Mac | PassFlags.Pre | PassFlags.ElementWise, + # flags_to_clear + PassFlags.Empty, + ), + ( + # ops_set + npu_dma_ops, + # incompatible_pack_flags + PassFlags.Cpu | PassFlags.MemoryOnly, + # flags_to_set + PassFlags.Npu | PassFlags.Dma, + # flags_to_clear + PassFlags.Empty + ), + ( + # ops_set + startup_init_ops, + # incompatible_pack_flags + PassFlags.Npu | PassFlags.Cpu | PassFlags.MemoryOnly, + # flags_to_set + PassFlags.StartupInit | PassFlags.Main, + # flags_to_clear + PassFlags.Empty, + ), + ( + # ops_set + memory_only_ops, + # incompatible_pack_flags + PassFlags.Npu | PassFlags.Cpu, + # flags_to_set + PassFlags.MemoryOnly | PassFlags.Main, + # flags_to_clear + PassFlags.Empty + ), + ( + # ops_set + cpu_ops, + # incompatible_pack_flags + PassFlags.Npu | PassFlags.MemoryOnly | PassFlags.Main, + # flags_to_set + PassFlags.Cpu | PassFlags.Main, + # flags_to_clear + PassFlags.Empty + ), + ( # This last one is a fallback for unrecognised operations + # ops_set + None, + # incompatible_pack_flags + PassFlags.Npu | PassFlags.MemoryOnly | PassFlags.Main, + # flags_to_set + PassFlags.Cpu | PassFlags.Main, + # flags_to_clear + PassFlags.Empty + ), +] + +# Some sanity checking +for (operation_set, incompatible_pack_flags, flags_to_set, flags_to_clear) in test_sequence: + assert not flags_to_clear & flags_to_set + + if operation_set is not None: + for op in operation_set: + assert len(op) > 1 # This is to avoid string literals being decomposed + + +def pack_into_passes(nng, arch, verbose_packing=False): + def visit_op(op, ignored): + visit_op_refcount[op] += 1 + + if visit_op_refcount[op] == 1: # First-time visit, go and fix up unused output tensors + for tens in op.outputs: + if len(tens.consumers()) == 0: + visit_op_refcount[op] += 1 + + assert visit_op_refcount[op] <= len(op.outputs) + if visit_op_refcount[op] == len(op.outputs): + + if op.type in startup_init_ops: + startup_list.append(op) + else: + _, _, _, ofm_tensor = op.get_ifm_ifm2_weights_ofm() + if ofm_tensor is None: + ofm_tensor = op.outputs[0] + build_pass((op,), ofm_tensor) + + def build_pass(start_ops_to_process, ofm_tensor=None): + reverse_ops_list = [] + curr_flags = PassFlags.Empty + npu_block_type = NpuBlockType.Default + + reverse_intermediates = [] + input_set = set() + ifm_tensor = None + primary_op = None + + to_process = collections.deque() + for start_op in start_ops_to_process: + to_process.append((start_op, None)) + + while to_process: + curr_op, tens = to_process.popleft() + + if curr_op in reverse_ops_list: + continue + + for operation_set, incompatible_pack_flags, flags_to_set, flags_to_clear in test_sequence: + if operation_set is None or curr_op.type in operation_set: + if not (curr_flags & incompatible_pack_flags): + if flags_to_set & PassFlags.Npu: + if not curr_op.run_on_npu: + continue + + reverse_ops_list.append(curr_op) + new_block_type = curr_op.attrs.get("npu_block_type", NpuBlockType.Default) + if new_block_type != NpuBlockType.Default: + assert npu_block_type == NpuBlockType.Default + npu_block_type = new_block_type # Only one major block type per pass + assert primary_op is None + primary_op = curr_op + + curr_flags &= ~flags_to_clear + curr_flags |= flags_to_set + + if flags_to_set & PassFlags.Npu: + if flags_to_set & ( + PassFlags.Mac | PassFlags.ElementWise | PassFlags.Post | PassFlags.PostFusingLimited + ): + assert len(curr_op.inputs) >= 1 + if curr_op.type == "BlockLSTM": + ifm_tensor = curr_op.inputs[3] + else: + ifm_tensor = curr_op.inputs[0] + assert 
ifm_tensor.purpose == TensorPurpose.FeatureMap + + if flags_to_set & PassFlags.Dma: + # DMAs are special - Output buffers need to be preserved as intermediates, + # if the pass consumes the results + if tens is not None: + reverse_intermediates.append(tens) + + if operation_set is None: + print("Warning:", curr_op.type, "operation is unknown or unsupported, placing on CPU") + + for inp in curr_op.inputs: + can_pack = True + if len(inp.ops) == 1: + next_op = inp.ops[0] + for outp in next_op.outputs: + consumers = outp.consumers() + if len(consumers) > 1 or (len(consumers) == 1 and consumers[0] != curr_op): + can_pack = False + break + else: + can_pack = False + + if can_pack: + to_process.append((next_op, inp)) + else: + assert inp is not None + input_set.add(inp) + + break + + else: + # This operation is not compatible with already packed operations, just register the tensor as an input + assert tens is not None + input_set.add(tens) + + if curr_flags & PassFlags.Npu and not curr_flags & (PassFlags.ElementWise | PassFlags.Mac): + # Make the choice that if we don't have a mac operation, the ambidextrous operations go on the + # element wise unit + curr_flags |= PassFlags.ElementWise + + is_element_wise = True + for op in reverse_ops_list: + if not op.type in elem_wise_ops and not op.type in npu_dma_ops: + is_element_wise = False + break + + placement = PassPlacement.Unknown + if curr_flags & PassFlags.Npu: + assert placement == PassPlacement.Unknown + placement = PassPlacement.Npu + if curr_flags & PassFlags.Cpu: + assert placement == PassPlacement.Unknown + placement = PassPlacement.Cpu + if curr_flags & PassFlags.MemoryOnly: + assert placement == PassPlacement.Unknown + placement = PassPlacement.MemoryOnly + if curr_flags & PassFlags.StartupInit: + assert placement == PassPlacement.Unknown + placement = PassPlacement.StartupInit + assert placement != PassPlacement.Unknown + + ops_list = list(reversed(reverse_ops_list)) + intermediates = list(reversed(reverse_intermediates)) + + if primary_op == None: + primary_op = create_primary_op(ops_list) + if primary_op != None: + visit_tensor_refcount[primary_op.inputs[0]] += 1 + npu_block_type = primary_op.attrs["npu_block_type"] + for input_tens in primary_op.inputs: + if input_tens not in input_set: + input_set.add(input_tens) + + ordered_input_list = [] + input_refcounts = collections.defaultdict(int) + for op in ops_list: + for inp in op.inputs: + if inp in input_set: + if input_refcounts[inp] == 0: + ordered_input_list.append(inp) + input_refcounts[inp] += 1 + + name = ops_list[0].name + non_dma_ops = [op for op in ops_list if op.type != "DMA"] + if non_dma_ops: + name = non_dma_ops[0].name + ps = Pass(name, placement, is_element_wise, npu_block_type) + ps.ops = ops_list + ps.primary_op = primary_op + ps.inputs = ordered_input_list + ps.intermediates = intermediates + ps.outputs = list(ops_list[-1].outputs) + ps.ifm_tensor = ifm_tensor + + # ElementWise operation, 2 IFMs + if ps.primary_op and ps.primary_op.type in binary_elem_wise_main_ops: + ps.ifm_tensor = ps.inputs[0] + + if len(ps.inputs) == 1: + # Only 1 input, IFM and IFM2 are the same tensor + ps.ifm2_tensor = ps.inputs[0] + else: + ps.ifm2_tensor = ps.inputs[1] + else: + ps.ifm_tensor = ifm_tensor + ps.ifm2_tensor = None + + ps.ofm_tensor = ofm_tensor + assert ps.placement != PassPlacement.Npu or ps.ofm_tensor is not None + ps.weight_tensor = ps.get_primary_op_ifm_weights()[1] + ps.scale_tensor = ps.get_primary_op_ifm_weights_biases_ofm()[2] + + for op in ps.ops: + op.scheduled_pass = 
ps + + reverse_pass_list.append(ps) + + for inp, refcount in input_refcounts.items(): + for _ in range(refcount): + visit_tensor(inp) + + return ps + + def visit_tensor(tens): + visit_tensor_refcount[tens] += 1 + assert visit_tensor_refcount[tens] <= len(tens.consumers()) + if visit_tensor_refcount[tens] == len(tens.consumers()): + for op in reversed(tens.ops): + visit_op(op, tens) + + def create_primary_op(ops_list): + if any(op.type in (npu_pre_ops | npu_post_ops | npu_post_fuse_limited_ops) for op in ops_list): + # Configure a 1x1 AvgPool and attach the op onto it + op = ops_list[0] + inp = op.inputs[0] + avgpool_name = op.name + "_avgpool" + avgpool_op = Operation("AvgPool", avgpool_name) + avgpool_op.inputs = [inp] + avgpool_op.inputs[0].consumer_list.append(avgpool_op) + avgpool_op.attrs["padding"] = b"VALID" + avgpool_op.attrs["npu_block_type"] = NpuBlockType.Pooling + avgpool_op.attrs["stride_w"] = 1 + avgpool_op.attrs["stride_h"] = 1 + avgpool_op.attrs["filter_width"] = 1 + avgpool_op.attrs["filter_height"] = 1 + avgpool_op.attrs["strides"] = [1, 1, 1, 1] + avgpool_op.attrs["ksize"] = [1, 1, 1, 1] + avgpool_op.attrs["skirt"] = [0, 0, 0, 0] + avgpool_op.attrs["explicit_padding"] = [0, 0, 0, 0] + avgpool_out = inp.clone("_avgpooled") + avgpool_out.consumer_list.append(op) + avgpool_out.ops = [avgpool_op] + avgpool_op.outputs = [avgpool_out] + + op.inputs[0] = avgpool_out + ops_list.insert(0, avgpool_op) + + return avgpool_op + + return None + + for sg in nng.subgraphs: + reverse_pass_list = [] + visit_op_refcount = collections.defaultdict(int) + visit_tensor_refcount = collections.defaultdict(int) + + startup_list = [] + + for tens in sg.output_tensors: + visit_tensor(tens) + + if startup_list: + startup_ps = build_pass(startup_list) + startup_ps.outputs = [op.outputs[0] for op in startup_list] # Need to fixup the outputs + startup_ps.name = "startup_weight_initialisation" + + sg.passes = list(reversed(reverse_pass_list)) + sg.build_pass_links() + + if verbose_packing: + nng.print_passes() + + return nng diff --git a/ethosu/vela/range_set.py b/ethosu/vela/range_set.py new file mode 100644 index 00000000..64de9709 --- /dev/null +++ b/ethosu/vela/range_set.py @@ -0,0 +1,154 @@ +# Copyright (C) 2020 Arm Limited or its affiliates. All rights reserved. +# +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +# Description: +# Helper classes to track memory accesses for calculating dependencies between Commands. + +from enum import IntEnum +from collections import defaultdict +from functools import lru_cache + + +class RangeSet: + """A Range set class to track ranges and whether they intersect. +Intended for e.g. tracking sets of memory ranges and whether two commands use the same memory areas.""" + + def __init__(self, start=None, end=None, ranges=None): + if ranges is None: + ranges = [] + + self.ranges = ranges # track a list of (start, end) tuples, always in ascending order sorted by start. 
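
Because the (start, end) tuples are kept sorted, the intersects() method defined below can decide overlap with a single linear sweep over both lists. A self-contained sketch of that check, treating ranges as half-open intervals (illustration only, not code from this patch):

def ranges_intersect(a_ranges, b_ranges):
    # Two ascending lists of half-open (start, end) ranges overlap iff some pair
    # satisfies max(starts) < min(ends); otherwise advance whichever range starts first.
    a_idx = b_idx = 0
    while a_idx < len(a_ranges) and b_idx < len(b_ranges):
        ar, br = a_ranges[a_idx], b_ranges[b_idx]
        if max(ar[0], br[0]) < min(ar[1], br[1]):
            return True
        if ar[0] < br[0]:
            a_idx += 1
        else:
            b_idx += 1
    return False


print(ranges_intersect([(0x0, 0x100)], [(0x100, 0x200)]))  # False: adjacent, not overlapping
print(ranges_intersect([(0x0, 0x100)], [(0x80, 0x200)]))   # True
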
+ + if start is not None and start != end: + assert start < end + self.ranges.append((start, end)) + + def __or__(self, other): + combined_ranges = list(sorted(self.ranges + other.ranges)) + return RangeSet(ranges=combined_ranges) + + def __ior__(self, other): + self.ranges = list(sorted(self.ranges + other.ranges)) + return self + + def intersects(self, other): + a_ranges = self.ranges + b_ranges = other.ranges + + a_idx = 0 + b_idx = 0 + + while a_idx < len(a_ranges) and b_idx < len(b_ranges): + ar = a_ranges[a_idx] + br = b_ranges[b_idx] + if max(ar[0], br[0]) < min(ar[1], br[1]): + return True # intersection + + # advance one of the two upwards + if ar[0] < br[0]: + a_idx += 1 + else: + assert ar[0] != br[0] + # note ar[0] == br[0] cannot happen, then we'd have an intersection + b_idx += 1 + + return False + + def __str__(self): + return "" % (["%#x:%#x" % (int(start), int(end)) for start, end in self.ranges],) + + __repr__ = __str__ + + +class MemoryRangeSet: + """Extended version of the RangeSet class that handles having different memory areas""" + + def __init__(self, mem_area=None, start=None, end=None, regions=None): + + if regions is None: + regions = {} + self.regions = regions + + if mem_area is not None: + self.regions[mem_area] = RangeSet(start, end) + + def __or__(self, other): + combined_regions = { + mem_area: (self.regions.get(mem_area, RangeSet()) | other.regions.get(mem_area, RangeSet())) + for mem_area in (self.regions.keys() | other.regions.keys()) + } + return MemoryRangeSet(regions=combined_regions) + + def __ior__(self, other): + self.regions = { + mem_area: (self.regions.get(mem_area, RangeSet()) | other.regions.get(mem_area, RangeSet())) + for mem_area in (self.regions.keys() | other.regions.keys()) + } + return self + + def intersects(self, other): + for mem_area in self.regions.keys() & other.regions.keys(): + if self.regions[mem_area].intersects(other.regions[mem_area]): + return True + return False + + def __str__(self): + s = "" + for mem_area, rng in self.regions.items(): + s += "%s: %s\t" % (mem_area, rng) + return s + + __repr__ = __str__ + + +class AccessDirection(IntEnum): + Read = 0 + Write = 1 + Size = 2 + + +class MemoryAccessSet: + """Tracks memory ranges, but also access patterns to know which accesses actually are in conflict""" + + def __init__(self): + self.accesses = [MemoryRangeSet() for i in range(AccessDirection.Size)] + + def add(self, memory_range_set, access): + self.accesses[access] |= memory_range_set + + @lru_cache(maxsize=None) + def conflicts(self, other): + + # True dependencies, or write -> read + if self.accesses[AccessDirection.Write].intersects(other.accesses[AccessDirection.Read]): + return True + + # Anti-dependencies, or read -> write + if self.accesses[AccessDirection.Read].intersects(other.accesses[AccessDirection.Write]): + return True + + # Output dependencies, or write -> write + if self.accesses[AccessDirection.Write].intersects(other.accesses[AccessDirection.Write]): + return True + + # read -> read does not cause a conflict + return False + + def __str__(self): + return "Read: %s\nWrite: %s\n\n" % (self.accesses[AccessDirection.Read], self.accesses[AccessDirection.Write]) + + __repr__ = __str__ diff --git a/ethosu/vela/register_command_stream_generator.py b/ethosu/vela/register_command_stream_generator.py new file mode 100644 index 00000000..5563b969 --- /dev/null +++ b/ethosu/vela/register_command_stream_generator.py @@ -0,0 +1,945 @@ +# Copyright (C) 2020 Arm Limited or its affiliates. All rights reserved. 
+# +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +# Description: +# Register level (low-level) command stream generation for Ethos-U55. Takes a high-level command stream and generates +# all the register settings. Calculates dependencies between commands and inserts wait operations. And generates a bit +# stream suitable for interpretation by the Ethos-U55 processor. + +from collections import defaultdict +from enum import Enum, IntEnum +from .high_level_command_stream import CommandType +from .ethos_u55_regs.ethos_u55_regs import * +from .tensor import MemArea, TensorBlockTraversal +from .operation import NpuBlockType +from .numeric_util import quantise_float32, round_up, round_away_zero, round_up_to_int, clamp_sigmoid, clamp_tanh +from .data_type import BaseType +import numpy as np +from .shared_buffer_allocation import SharedBufferAllocation +from .architecture_features import SharedBufferArea, SHRAMElements, ArchitectureFeatures +from .nn_graph import TensorFormat, SchedulingStrategy +from .range_set import ( + MemoryAccessSet, + AccessDirection, +) +from .mark_tensors import ( + reshape_operations, +) +from .architecture_features import Block, Kernel, Rect +from . import scaling + + +class RegisterMachine: + def __init__(self): + self.n_banks = 1 + self.registers = [defaultdict(lambda: None) for _ in range(self.n_banks)] + self.bank_idx = 0 + + def set_register(self, reg, value): + is_changed = self.registers[self.bank_idx][reg] != value + self.registers[self.bank_idx][reg] = value + # is_changed = True # force command + return is_changed + + def switch_bank(self): + self.bank_idx = (self.bank_idx + 1) % self.n_banks + + +class CmdMode(IntEnum): + NoPayload = 0x0000 + Payload32 = 0x4000 + Mask = 0xC000 + CmdOpMask = 0x03FF + + +class BasePointerIndex(IntEnum): + ReadOnly = 0 # base address slot index for weights and scaling + Scratch = 1 # base address slot index for scratch memory area + + +# TODO: Replace with definitions from ethos_u55_regs +class IFM2Broadcast(IntEnum): + BroadcastHdim = 1 << 0 + BroadcastWdim = 1 << 1 + BroadcastCdim = 1 << 2 + ReverseOperandOrder = 1 << 6 + UseIFM2Scalar = 1 << 7 + + +class CommandStreamEmitter: + def __init__(self): + self.cmd_stream = [] + self.reg_machine = [RegisterMachine(), RegisterMachine()] + self.last_absolute_wait = defaultdict(int) + + def get_reg_machine(self, cmd): + if "DMA" in cmd.name: + return self.reg_machine[1] + else: + return self.reg_machine[0] + + def size_in_bytes(self): + sz = 0 + for cmd in self.cmd_stream: + sz += len(cmd) * 4 + return sz + + def to_list(self): + return [elem for cmd in self.cmd_stream for elem in cmd] + + def print_cmds(self): + print("Code: Command: Param: Payload:") + for words_for_one_command in self.cmd_stream: + code = words_for_one_command[0] & 0x0000FFFF # lower 16 bits + param = words_for_one_command[0] >> 16 # higher 16 bits + + payload_mode = CmdMode(code & CmdMode.Mask) + + # code and command + s = " 0x%04x " % code + if payload_mode == CmdMode.NoPayload: + s 
+= str(cmd0(code & CmdMode.CmdOpMask)) + else: + s += str(cmd1(code & CmdMode.CmdOpMask)) + + s = s.ljust(40) + s += "%5d" % param + + # payload + if payload_mode == CmdMode.Payload32: + s += " 0x%08x (%d)" % (words_for_one_command[1], words_for_one_command[1]) + else: + s += " -" + + print(s) + + def cmd0_with_param(self, cmd, param): + if isinstance(param, Enum): + param = int(param.value) + else: + param = int(param) + param = param & 0xFFFF + command = cmd.value | (param << 16) + if not self.get_reg_machine(cmd).set_register(cmd, (command, param)): + return + + # This is not a redundant command, actually write it + self.cmd_stream.append((command,)) + + def cmd1_with_offset(self, cmd, offset, param=0x0): + offset = int(offset) & 0xFFFFFFFFF + command = cmd.value | CmdMode.Payload32.value | (param << 16) + + if not self.get_reg_machine(cmd).set_register(cmd, (command, offset)): + return + + # This is not a redundant command, actually write it + self.cmd_stream.append((command, offset)) + + def cmd_wait(self, cmd, param, absolute_wait_time): + if absolute_wait_time <= self.last_absolute_wait[cmd]: + return + + self.last_absolute_wait[cmd] = absolute_wait_time + param = int(param) + command = ((param & 0xFFFF) << 16) | cmd.value + self.cmd_stream.append((command,)) + + def cmd_do_operation(self, cmd, param=0): + param = int(param) + command = ((param & 0xFFFF) << 16) | cmd.value + + self.cmd_stream.append((command,)) + self.get_reg_machine(cmd).switch_bank() + + +def calc_command_dependencies(cmd_stream, arch): + cmd_starts = {} + cmd_ends = {} + memory_accesses = {} + + # Keep track of accumulated number of commands in command stream. + # First element kernel ops: (# of blocks, # of commands) + # Second element DMA ops: (# of commands) + pos = np.array((np.array((0, 0)), np.array([0]))) + + dependencies = {} + + for cmd in cmd_stream: + cmd_starts[cmd] = pos + op_count = cmd.get_operation_count() + # Keep track of both num blocks and commands + cmd_add = 0 if (op_count[0] == 0) else 1 + pos = np.array((pos[0] + np.array((op_count[0], cmd_add)), pos[1] + np.array([op_count[1]]))) + cmd_ends[cmd] = np.array((pos[0], pos[1])) + memory_accesses[cmd] = cmd.get_memory_accesses() + + for idx, cmd in enumerate(cmd_stream): + curr_accesses = memory_accesses[cmd] + # Keep track of command dependency. 
+ # First element kernel ops: (# of blocks, # of commands) + # Second element DMA ops: (# of commands) + dep_offsets = np.array((np.array((-1, -1)), np.array([-1]))) + dep_cmds = [None] * CommandType.Size.value + if idx > 0: + # Look at the previous commands in backwards order + for prev_cmd in cmd_stream[idx - 1 :: -1]: + assert prev_cmd is not cmd + if dep_cmds[prev_cmd.cmdtype] is None: + is_dependency = False + if cmd.cmdtype == CommandType.NpuStripe and prev_cmd.cmdtype == CommandType.NpuStripe: + # Special handling here, as dpu -> dpu operations require additional care + if not SharedBufferAllocation.is_compatible(prev_cmd.ps.shared_buffer, cmd.ps.shared_buffer): + is_dependency = True + elif memory_accesses[prev_cmd].conflicts(curr_accesses): + is_dependency = True + else: + if memory_accesses[prev_cmd].conflicts(curr_accesses): + is_dependency = True + + if is_dependency: + new_offset = cmd_ends[prev_cmd][prev_cmd.cmdtype] + if new_offset[0] > dep_offsets[prev_cmd.cmdtype][0]: + dep_cmds[prev_cmd.cmdtype] = prev_cmd + dep_offsets[prev_cmd.cmdtype] = new_offset + + # Check if we've got dependencies for all commands, in which case we can early out + for dep in dep_cmds: + if dep is None: + break + else: + break # all handled + + # Convert absolute to relative dependencies, using None to signal the special case of no + # dependency of this kind + res = [None] * CommandType.Size.value + for i in range(CommandType.Size.value): + if dep_cmds[i] is not None: + res[i] = cmd_starts[cmd][i] - dep_offsets[i] + + dependencies[cmd] = cmd_starts[cmd], res + + return dependencies + + +def get_op_kernel(ps): + if ps.primary_op is None: + return None + + strides = ps.primary_op.attrs.get("strides", (1, 1, 1, 1)) + dilation = ps.primary_op.attrs.get("dilation", (1, 1, 1, 1)) + if ps.weight_tensor: + if ps.npu_block_type in set((NpuBlockType.VectorProduct, NpuBlockType.ElementWise)): + k_h = 1 + k_w = 1 + else: + k_h = ps.weight_tensor.shape[0] + k_w = ps.weight_tensor.shape[1] + else: + k_h = ps.primary_op.attrs.get("filter_height", 1) + k_w = ps.primary_op.attrs.get("filter_width", 1) + + return Kernel(k_w, k_h, strides[2], strides[1], dilation[2], dilation[1]) + + +def full_shape(shape, fill): + return ([fill] * (4 - len(shape))) + shape + + +def has_prev_op_dependency(prev_cmd, cmd): + if prev_cmd is None: + return False + if (prev_cmd.cmdtype == cmd.cmdtype == CommandType.NpuStripe) and (prev_cmd.ps != cmd.ps): + if prev_cmd.ofm_tensor == cmd.ifm_tensor: + return True + else: + return prev_cmd.ofm_tensor.equivalence_id == cmd.ifm_tensor.equivalence_id + return False + + +def get_op_ofm_rect(cmd): + start = full_shape(cmd.ofm_box.start_coord, 0) + end = full_shape(cmd.ofm_box.end_coord, 1) + return Rect(start[-2], start[-3], start[-1], end[-2] - 1, end[-3] - 1, end[-1] - 1) + + +def get_op_ifm_rect(cmd): + start = full_shape(cmd.ifm_box.start_coord, 0) + end = full_shape(cmd.ifm_box.end_coord, 1) + return Rect(start[-2], start[-3], start[-1], end[-2] - 1, end[-3] - 1, end[-1] - 1) + + +def get_op_ifmofm_block_depth(arch, cmd): + # Note: NOT equivalent to the normal ifm block depth calculation since + # it takes into account 'depthless' block operations by returning full + # depth + if cmd.ps.npu_block_type in (NpuBlockType.ConvolutionDepthWise, NpuBlockType.Pooling, NpuBlockType.ElementWise): + return cmd.ofm_box.get_size_shape()[-1] + + return arch.calc_ifm_block_depth(cmd.ifm_box.get_size_shape()[-1], cmd.ifm_tensor.dtype.bits) + + +def get_op_padding_lt(cmd): + if cmd.ps.npu_block_type not in 
( + NpuBlockType.ConvolutionDepthWise, + NpuBlockType.Pooling, + NpuBlockType.ConvolutionMxN, + ): + return (0, 0) + + explicit_padding = list(cmd.ps.primary_op.attrs["explicit_padding"]) # (top, left, bottom, right) + + # Check if this is for horizontal ifm streaming + if not (cmd.is_first_h_stripe and cmd.is_last_h_stripe): + explicit_padding[0] = cmd.pad_top + explicit_padding[2] = cmd.pad_bottom + + return (explicit_padding[1], explicit_padding[0]) + + +def generate_register_command_stream(nng, sg, arch, verbose=False): + emit = CommandStreamEmitter() + + base_ptr_idx_map = { + MemArea.Sram: BasePointerIndex.Scratch, + MemArea.OnChipFlash: BasePointerIndex.ReadOnly, + MemArea.OffChipFlash: BasePointerIndex.ReadOnly, + MemArea.Dram: BasePointerIndex.ReadOnly, + } + + # Maps an AccumulatorType enum to the corresponding acc_format value + acc_format_map = { + SHRAMElements.Acc16: acc_format.FP_S5_10.value, + SHRAMElements.Acc32: acc_format.INT_32BIT.value, + SHRAMElements.Acc40: acc_format.INT_40BIT.value, + } + + # Maps an elementwise op type to an elementwise_mode enum value used by NPU_OP_ELEMENTWISE + elementwise_mode_map = { + "MulAct": elementwise_mode.MUL.value, + "AddAct": elementwise_mode.ADD.value, + "SubAct": elementwise_mode.SUB.value, + "Minimum": elementwise_mode.MIN.value, + "Maximum": elementwise_mode.MAX.value, + "LeakyRelu": elementwise_mode.LRELU.value, + "Abs": elementwise_mode.ABS.value, + } + + cmd_stream = [] + for cmd in sg.high_level_command_stream: + if cmd.cmdtype == CommandType.NpuStripe and cmd.ps.npu_block_type == NpuBlockType.Default: + print("Warning: Skipping register command stream generation for", cmd.ps) + else: + cmd_stream.append(cmd) + + dependencies = calc_command_dependencies(cmd_stream, arch) + + # Initialise operator dependency state + prev_ifm_rect = cur_ifm_rect = None + prev_ifm_block_depth = cur_ifm_block_depth = None + prev_ofm_rect = cur_ofm_rect = None + prev_ofm_block = cur_ofm_block = None + prev_kernel = cur_kernel = None + prev_cmd = None + + def emit_wait_commands(cmd): + # The command is fully set up, emit whatever wait commands we need + absolute_dep, relative_dep = dependencies[cmd] + if relative_dep[CommandType.NpuStripe] is not None: + if cmd.cmdtype == CommandType.DMA: + param = relative_dep[CommandType.NpuStripe][1] + if param <= 3: + emit.cmd_wait(cmd0.NPU_OP_KERNEL_WAIT, param, absolute_dep[CommandType.NpuStripe][1]) + else: + param = relative_dep[CommandType.NpuStripe][0] + param = min(param, 0xFFFF) # Clamp to allowable wait amount + + if relative_dep[CommandType.DMA] is not None: + param = relative_dep[CommandType.DMA][0] + param = min(param, 0xF) # Clamp to allowable wait amount + emit.cmd_wait(cmd0.NPU_OP_DMA_WAIT, param, absolute_dep[CommandType.DMA][0]) + prev_cmd = None # Clear any dependency + + # Start by issuing REGION commands since they remain the same + emit.cmd0_with_param(cmd0.NPU_SET_IFM_REGION, BasePointerIndex.Scratch) + emit.cmd0_with_param(cmd0.NPU_SET_IFM2_REGION, BasePointerIndex.Scratch) + emit.cmd0_with_param(cmd0.NPU_SET_OFM_REGION, BasePointerIndex.Scratch) + for cmd in cmd_stream: + if cmd.cmdtype == CommandType.DMA: + start_coord = cmd.box.start_coord + + src_addr = cmd.in_tensor.address_for_coordinate(start_coord) + dst_addr = cmd.out_tensor.address_for_coordinate(start_coord) + + if cmd.in_tensor.compressed_values is not None: + stream_index = cmd.in_tensor.compressed_stream_index_from_coord(start_coord) + sz = cmd.in_tensor.size_of_compressed_stream(stream_index) + else: + sz = 
cmd.in_tensor.address_for_coordinate(cmd.box.end_coord, is_top_box=True) - src_addr + + # TODO: Yoda support needs to use feature_maps_not_in_fast_storage and force_outputs_to_fast_storage + emit.cmd0_with_param(cmd0.NPU_SET_DMA0_SRC_REGION, base_ptr_idx_map[cmd.in_tensor.mem_area]) + emit.cmd1_with_offset(cmd1.NPU_SET_DMA0_SRC, src_addr) + emit.cmd0_with_param(cmd0.NPU_SET_DMA0_DST_REGION, base_ptr_idx_map[cmd.out_tensor.mem_area]) + emit.cmd1_with_offset(cmd1.NPU_SET_DMA0_DST, dst_addr) + emit.cmd1_with_offset(cmd1.NPU_SET_DMA0_LEN, sz) + dma_channel = 0 + mode = 0 # From external to external + + emit_wait_commands(cmd) + emit.cmd_do_operation(cmd0.NPU_OP_DMA_START, dma_channel * 16 + mode) + + elif cmd.cmdtype == CommandType.NpuStripe: + + ps = cmd.ps + primary_op = ps.primary_op + npu_block_type = ps.npu_block_type + # Specifies if global scale from the NPU_SET_OFM_SCALE register should be used instead of per-channel scale + use_global_scale = False + # Specifies type of rounding to be used. + rounding_mode = rounding.TFL + fmf = primary_op.attrs.get("fused_memory_function", None) + faf = primary_op.attrs.get("fused_activation_function", None) + + # Specifies which operand to apply scaling to in bitexact elementwise ADD/SUB + op_to_scale = 0 + + # Update state history + prev_ifm_rect = cur_ifm_rect + prev_ifm_block_depth = cur_ifm_block_depth + prev_ofm_rect = cur_ofm_rect + prev_ofm_block = cur_ofm_block + prev_kernel = cur_kernel + + block_config = ps.block_config + emit.cmd0_with_param(cmd0.NPU_SET_OFM_BLK_HEIGHT_M1, block_config[0] - 1) + emit.cmd0_with_param(cmd0.NPU_SET_OFM_BLK_WIDTH_M1, block_config[1] - 1) + emit.cmd0_with_param(cmd0.NPU_SET_OFM_BLK_DEPTH_M1, block_config[3] - 1) + + shared_buffer = ps.shared_buffer + + if npu_block_type == NpuBlockType.ElementWise: + ifm2_broadcast = 0 + + if cmd.ifm_tensor.shape == []: + # The scalar has to be the ifm2 tensor so switch the ifms + cmd.ifm_tensor, cmd.ifm2_tensor = cmd.ifm2_tensor, cmd.ifm_tensor + cmd.ifm_box, cmd.ifm2_box = cmd.ifm2_box, cmd.ifm_box + + # Set ReverseOperandOrder bit to IFM2_BROADCAST + ifm2_broadcast |= IFM2Broadcast.ReverseOperandOrder + + # Calculate scales needed for arithmetic elementwise operators + if primary_op.type in set(("AddAct", "MulAct", "SubAct",)): + input_scale = cmd.ifm_tensor.quantization.scale_f32 + input2_scale = cmd.ifm2_tensor.quantization.scale_f32 + output_scale = cmd.ofm_tensor.quantization.scale_f32 + use_global_scale = True + + if primary_op.type == "MulAct": + if (faf == "Sigmoid") or (faf == "Tanh"): + output_scale = 1 / 0x3000 + + ofm_scale, shift = scaling.elementwise_mul_scale(input_scale, input2_scale, output_scale) + emit.cmd1_with_offset(cmd1.NPU_SET_OFM_SCALE, ofm_scale, shift) + else: # AddAct/SubAct + if (faf == "Sigmoid") or (faf == "Tanh"): + output_scale = 1 / 0x3000 + + if input_scale == input2_scale: + opa_scale, opb_scale, ofm_scale, shift = scaling.simplified_elementwise_add_sub_scale( + input_scale, input2_scale, output_scale + ) + opa_shift = 0 # Unused for this case + else: + # Use advanced implementation only when input scales differ + bitdepth = cmd.ifm_tensor.dtype.bits + ( + opa_scale, + opa_shift, + ofm_scale, + shift, + op_to_scale, + ) = scaling.advanced_elementwise_add_sub_scale( + input_scale, input2_scale, output_scale, bitdepth + ) + opb_scale = 0 # Unused for this case + if ifm2_broadcast & IFM2Broadcast.ReverseOperandOrder: + # If the operand order is reversed we also have to swap which operand is scaled + if op_to_scale == 
scaling.OperandToScale.OPa: + op_to_scale = scaling.OperandToScale.OPb + else: + op_to_scale = scaling.OperandToScale.OPa + + emit.cmd1_with_offset(cmd1.NPU_SET_OPA_SCALE, opa_scale, opa_shift) + emit.cmd1_with_offset(cmd1.NPU_SET_OPB_SCALE, opb_scale) + emit.cmd1_with_offset(cmd1.NPU_SET_OFM_SCALE, ofm_scale, shift) + + if primary_op.type in set(("LeakyRelu", "Abs",)): + output_scale = cmd.ofm_tensor.quantization.scale_f32 + use_global_scale = True + + if primary_op.type == "LeakyRelu": + output_scale *= primary_op.attrs["alpha"] + + ofm_scale, shift = scaling.quantise_scale(output_scale) + emit.cmd1_with_offset(cmd1.NPU_SET_OFM_SCALE, ofm_scale, shift) + + # For elementwise set the required SHRAM to be equal to the total size of SHRAM + shram_required = arch.shram_total_banks + emit.cmd0_with_param(cmd0.NPU_SET_IFM_IB_END, shram_required) + + # Acc buffers not needed so set AB_START to size of SHRAM + emit.cmd0_with_param(cmd0.NPU_SET_AB_START, arch.shram_total_banks) + + # Is not a unary operator + if cmd.ifm2_tensor is not None: + if cmd.ifm2_tensor.shape == []: + # IFM2 is a constant, set UseIFM2Scalar bit to IFM2_BROADCAST + ifm2_broadcast |= IFM2Broadcast.UseIFM2Scalar + else: + ifm_box_shape = cmd.ifm_box.get_size_shape() + ifm2_box_shape = cmd.ifm2_box.get_size_shape() + + if len(cmd.ifm_tensor.shape) > 1 and ifm_box_shape[1] != ifm2_box_shape[1]: + # Broadcast in 'H' dimension + assert cmd.ifm2_tensor.shape[1] == 1 + ifm2_broadcast |= IFM2Broadcast.BroadcastHdim + + if len(cmd.ifm_tensor.shape) > 2 and ifm_box_shape[2] != ifm2_box_shape[2]: + # Broadcast in 'W' dimension + assert cmd.ifm2_tensor.shape[2] == 1 + ifm2_broadcast |= IFM2Broadcast.BroadcastWdim + + if len(cmd.ifm_tensor.shape) > 3 and ifm_box_shape[3] != ifm2_box_shape[3]: + # Broadcast in 'C' dimension + assert cmd.ifm2_tensor.shape[3] == 1 + ifm2_broadcast |= IFM2Broadcast.BroadcastCdim + + # Set IFM2_IB_START to the latter half of the IB space + ifm_ib_start = shared_buffer.bank_locations[SharedBufferArea.IFM] + emit.cmd0_with_param( + cmd0.NPU_SET_IFM2_IB_START, (shram_required - ifm_ib_start) / 2 + ifm_ib_start + ) + + emit.cmd0_with_param(cmd0.NPU_SET_IFM2_BROADCAST, ifm2_broadcast) + + else: + emit.cmd0_with_param( + cmd0.NPU_SET_IFM_IB_END, + shared_buffer.bank_locations[SharedBufferArea.IFM] + + shared_buffer.banks_required[SharedBufferArea.IFM], + ) + emit.cmd0_with_param(cmd0.NPU_SET_AB_START, shared_buffer.bank_locations[SharedBufferArea.Accumulators]) + + emit.cmd0_with_param(cmd0.NPU_SET_ACC_FORMAT, acc_format_map[shared_buffer.use_accumulator_element]) + + emit.cmd0_with_param(cmd0.NPU_SET_IFM_UPSCALE, 0) + + if npu_block_type in set( + (NpuBlockType.ConvolutionMxN, NpuBlockType.ConvolutionDepthWise, NpuBlockType.Pooling) + ): + # Set up padding + explicit_padding = list(primary_op.attrs["explicit_padding"]) # (top, left, bottom, right) + + # Check if this is for horizontal ifm streaming + if not (cmd.is_first_h_stripe and cmd.is_last_h_stripe): + explicit_padding[0] = cmd.pad_top + explicit_padding[2] = cmd.pad_bottom + + # Indexing from end since a 1x1 Avgpool might have been added with non 4-dimensional input/output, + # because of activation function needed to be fused. 
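The per-stripe IFM padding adjustment performed just above and continued in the lines below can be pictured with a minimal standalone sketch. It is illustrative only, not code from the patch; the helper name and the plain-tuple box coordinates are assumptions, with NHWC ordering and the same negative indexing used here.

def stripe_ifm_padding(explicit_padding, box_start, box_end, ifm_shape,
                       is_first_h_stripe, is_last_h_stripe, pad_top, pad_bottom):
    # explicit_padding is (top, left, bottom, right) from the operator attributes
    top, left, bottom, right = explicit_padding
    # A middle stripe of a horizontally streamed IFM takes its top/bottom
    # padding from the stripe itself rather than from the operator
    if not (is_first_h_stripe and is_last_h_stripe):
        top, bottom = pad_top, pad_bottom
    # Boxes that do not touch the left/right edge of the IFM need no W padding;
    # index from the end because shapes are not always 4-D
    if box_start[-2] > 0:
        left = 0
    if box_end[-2] < ifm_shape[-2]:
        right = 0
    return (top, left, bottom, right)

# A middle stripe of a 3x3 SAME convolution over a 1x224x224x3 IFM keeps its
# left/right padding but drops the top/bottom padding:
assert stripe_ifm_padding((1, 1, 1, 1), (0, 8, 0, 0), (1, 16, 224, 3),
                          (1, 224, 224, 3), False, False, 0, 0) == (0, 1, 0, 1)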
+ if cmd.ifm_box.start_coord[-2] > 0: + explicit_padding[1] = 0 + if cmd.ifm_box.end_coord[-2] < cmd.ifm_tensor.shape[-2]: + explicit_padding[3] = 0 + + emit.cmd0_with_param(cmd0.NPU_SET_IFM_PAD_TOP, explicit_padding[0]) + emit.cmd0_with_param(cmd0.NPU_SET_IFM_PAD_LEFT, explicit_padding[1]) + emit.cmd0_with_param(cmd0.NPU_SET_IFM_PAD_BOTTOM, explicit_padding[2]) + emit.cmd0_with_param(cmd0.NPU_SET_IFM_PAD_RIGHT, explicit_padding[3]) + + stride = primary_op.attrs["strides"][2] - 1 + stride |= (primary_op.attrs["strides"][1] - 1) << 1 + + if npu_block_type == NpuBlockType.Pooling: + k_height, k_width = primary_op.attrs["ksize"][1:3] + emit.cmd0_with_param(cmd0.NPU_SET_KERNEL_HEIGHT_M1, k_height - 1) + emit.cmd0_with_param(cmd0.NPU_SET_KERNEL_WIDTH_M1, k_width - 1) + + valid_padding = sum(explicit_padding) == 0 + + if primary_op.type in set(("AvgPool", "AvgPoolAct")) and valid_padding: + # For valid padding Vela has to output scaling values + if faf == "Sigmoid" or faf == "Tanh": + rescale = 0x3000 * cmd.ifm_tensor.quantization.scale_f32 + rescale_bits = len(bin(round_up_to_int(rescale))) - 2 + 1 + + scale, shift = scaling.quantise_pooling_scale(k_height * k_width, rescale_bits) + scale = int(round_away_zero(scale * rescale)) + else: + # In case the avg pool is fused with a concat or other memory operation, rescaling might be needed. + # k_height == k_width == 1 is always true in this case + # Normally the scale is maximised, to get maximum precision, which means that + # if rescale != 1, the scale needs to account for the number of bits needed for rescaling + rescale = cmd.ifm_tensor.quantization.scale_f32 / cmd.ofm_tensor.quantization.scale_f32 + rescale_bits = 0 + if k_height == k_width == 1: + if fmf == "ConcatSliceWrite": + rounding_mode = rounding.NATURAL + if rescale > 1: + rescale_bits = len(bin(round_up_to_int(rescale))) - 2 + 1 + elif rescale < 1: + rescale_bits = -(len(bin(round_up_to_int(1 / rescale))) - 2 - 1) + scale, shift = scaling.quantise_pooling_scale(k_height * k_width, rescale_bits) + scale = int(round_away_zero(scale * rescale)) + + emit.cmd1_with_offset(cmd1.NPU_SET_OFM_SCALE, scale, shift) + # A valid-padded average pool should use the global scale from the + # NPU_SET_OFM_SCALE register, which is set above.
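The average-pool rescaling path above leans on quantise_pooling_scale() from the scaling module added later in this patch. A rough standalone sketch of one branch follows; it is illustrative only, and it uses Python's built-in round() where the code above uses the round_up_to_int/round_away_zero helpers from numeric_util.

import math

def quantise_pooling_scale(nr_kernel_elements, rescale_bits=0):
    # Same arithmetic as scaling.quantise_pooling_scale() in this patch
    _, k = math.frexp(nr_kernel_elements - 1)
    n = 31 - rescale_bits
    scale = ((1 << (n + k)) + (1 << k)) // nr_kernel_elements
    return scale, n + k

# e.g. a 3x3 average pool whose IFM scale is twice its OFM scale
rescale = 2.0
rescale_bits = len(bin(round(rescale))) - 2 + 1   # bits in the integer part, plus one
scale, shift = quantise_pooling_scale(3 * 3, rescale_bits)
scale = int(round(scale * rescale))
print(scale, shift)   # the global scale/shift pair written to NPU_SET_OFM_SCALE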
+ use_global_scale = True + + else: # Convolution + assert cmd.weight_tensor.block_traversal != TensorBlockTraversal.Default + emit.cmd0_with_param(cmd0.NPU_SET_KERNEL_HEIGHT_M1, cmd.weight_tensor.shape[0] - 1) + emit.cmd0_with_param(cmd0.NPU_SET_KERNEL_WIDTH_M1, cmd.weight_tensor.shape[1] - 1) + if cmd.weight_tensor.block_traversal == TensorBlockTraversal.PartKernelFirst: + # Part-kernel-first weight ordering + assert npu_block_type == NpuBlockType.ConvolutionMxN + stride |= 1 << 2 + + emit.cmd0_with_param(cmd0.NPU_SET_KERNEL_STRIDE, stride) + + elif npu_block_type in set((NpuBlockType.VectorProduct,)): + # Vector product is implemented using a 1x1 convolution so need + # to setup the appropriate padding and kernel info + emit.cmd0_with_param(cmd0.NPU_SET_IFM_PAD_TOP, 0) + emit.cmd0_with_param(cmd0.NPU_SET_IFM_PAD_LEFT, 0) + emit.cmd0_with_param(cmd0.NPU_SET_IFM_PAD_BOTTOM, 0) + emit.cmd0_with_param(cmd0.NPU_SET_IFM_PAD_RIGHT, 0) + + # kernel stride reg = 0 means stride(1,1) + depth first weight + # order + dilation(0,0) + kernel_split_size=8 + emit.cmd0_with_param(cmd0.NPU_SET_KERNEL_STRIDE, 0) + + emit.cmd0_with_param(cmd0.NPU_SET_KERNEL_HEIGHT_M1, 0) + emit.cmd0_with_param(cmd0.NPU_SET_KERNEL_WIDTH_M1, 0) + + if npu_block_type in set( + (NpuBlockType.ConvolutionMxN, NpuBlockType.ConvolutionDepthWise, NpuBlockType.VectorProduct) + ): + # Emit Weight base address commands, only maps the area required for + # this command's weights from the larger tensor. + stream_index = cmd.weight_tensor.compressed_stream_index_from_coord(cmd.weight_box.start_coord) + weight_addr = cmd.weight_tensor.address_for_coordinate(cmd.weight_box.start_coord) + weight_len = cmd.weight_tensor.size_of_compressed_stream(stream_index) + # Select weight/scale region depending on where permanent storage was defined + weight_region = base_ptr_idx_map[cmd.weight_tensor.mem_area] + if arch.permanent_storage_mem_area == MemArea.Sram: + weight_region = BasePointerIndex.ReadOnly + emit.cmd0_with_param(cmd0.NPU_SET_WEIGHT_REGION, weight_region) + emit.cmd1_with_offset(cmd1.NPU_SET_WEIGHT_BASE, weight_addr) + emit.cmd1_with_offset(cmd1.NPU_SET_WEIGHT_LENGTH, weight_len) + + # Emit Scale & Bias base address commands, with length matching the amount required by + # the weight tensors. 
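The kernel stride word assembled a few lines above packs several sub-fields into one register value. A small sketch of just the fields that this code sets follows; it is illustrative only (the helper name is invented), and it ignores the dilation and kernel-decomposition fields mentioned in the comment next to NPU_SET_KERNEL_STRIDE.

def pack_kernel_stride(stride_x, stride_y, part_kernel_first=False):
    # Mirrors the bit twiddling above: X stride minus one in bit 0,
    # Y stride minus one in bit 1, part-kernel-first weight ordering in bit 2
    word = (stride_x - 1)
    word |= (stride_y - 1) << 1
    if part_kernel_first:
        word |= 1 << 2
    return word

assert pack_kernel_stride(1, 1) == 0           # stride (1,1), depth-first weights
assert pack_kernel_stride(2, 2, True) == 0b111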
+ if cmd.scale_tensor is not None: + # Get address and size of the scale/bias data area + scale_addr = cmd.scale_tensor.address_for_coordinate(cmd.weight_box.start_coord[-1:]) + scale_len = ( + cmd.scale_tensor.address_for_coordinate(cmd.weight_box.end_coord[-1:], True) - scale_addr + ) + # Emit base address for NPU to access scale & bias data + scale_region = base_ptr_idx_map[cmd.scale_tensor.mem_area] + if arch.permanent_storage_mem_area == MemArea.Sram: + scale_region = BasePointerIndex.ReadOnly + emit.cmd0_with_param(cmd0.NPU_SET_SCALE_REGION, scale_region) + emit.cmd1_with_offset(cmd1.NPU_SET_SCALE_BASE, scale_addr) + emit.cmd1_with_offset(cmd1.NPU_SET_SCALE_LENGTH, round_up(scale_len, 16)) + + ofm_quant = cmd.ofm_tensor.quantization + ofm_quant_qmin = cmd.ofm_tensor.quantization.quant_min + ofm_quant_qmax = cmd.ofm_tensor.quantization.quant_max + ifm_min = cmd.ifm_tensor.quantization.min + ifm_max = cmd.ifm_tensor.quantization.max + + # Emit commands for any fused activation function + if faf == None: + emit.cmd0_with_param(cmd0.NPU_SET_ACTIVATION, activation.NONE) + # Even if no activation function, values need to be set to override previous values + faf_min = ofm_quant_qmin + faf_max = ofm_quant_qmax + elif faf == "Relu": + emit.cmd0_with_param(cmd0.NPU_SET_ACTIVATION, activation.NONE) + faf_min = quantise_float32(0.0, ofm_quant.scale_f32, ofm_quant.zero_point) + faf_max = ofm_quant_qmax + elif faf == "Relu6": + emit.cmd0_with_param(cmd0.NPU_SET_ACTIVATION, activation.NONE) + faf_min = quantise_float32(0.0, ofm_quant.scale_f32, ofm_quant.zero_point) + faf_max = quantise_float32(6.0, ofm_quant.scale_f32, ofm_quant.zero_point) + elif faf == "ReluN1To1": + emit.cmd0_with_param(cmd0.NPU_SET_ACTIVATION, activation.NONE) + faf_min = quantise_float32(-1.0, ofm_quant.scale_f32, ofm_quant.zero_point) + faf_max = quantise_float32(1.0, ofm_quant.scale_f32, ofm_quant.zero_point) + elif faf == "Tanh": + emit.cmd0_with_param(cmd0.NPU_SET_ACTIVATION, activation.TANH) + faf_min = quantise_float32(clamp_tanh(ifm_min), ofm_quant.scale_f32, ofm_quant.zero_point) + faf_max = quantise_float32(clamp_tanh(ifm_max), ofm_quant.scale_f32, ofm_quant.zero_point) + elif faf == "Sigmoid": + emit.cmd0_with_param(cmd0.NPU_SET_ACTIVATION, activation.SIGMOID) + faf_min = quantise_float32(clamp_sigmoid(ifm_min), ofm_quant.scale_f32, ofm_quant.zero_point) + faf_max = quantise_float32(clamp_sigmoid(ifm_max), ofm_quant.scale_f32, ofm_quant.zero_point) + else: + raise Exception("Unsupported fused_activation_function = " + faf) + + # Activation range needs to be set based upon the quantisation range and the fused activation range + emit.cmd0_with_param(cmd0.NPU_SET_ACTIVATION_MIN, max(ofm_quant_qmin, faf_min)) + emit.cmd0_with_param(cmd0.NPU_SET_ACTIVATION_MAX, min(ofm_quant_qmax, faf_max)) + + out_shape = cmd.ofm_box.get_size_shape() + if len(out_shape) >= 4: + emit.cmd0_with_param(cmd0.NPU_SET_OFM_HEIGHT_M1, out_shape[-3] - 1) + else: + emit.cmd0_with_param(cmd0.NPU_SET_OFM_HEIGHT_M1, 0) + if len(out_shape) >= 2: + emit.cmd0_with_param(cmd0.NPU_SET_OFM_WIDTH_M1, out_shape[-2] - 1) + else: + emit.cmd0_with_param(cmd0.NPU_SET_OFM_WIDTH_M1, 0) + emit.cmd0_with_param(cmd0.NPU_SET_OFM_DEPTH_M1, out_shape[-1] - 1) + + if npu_block_type in set((NpuBlockType.ConvolutionMxN, NpuBlockType.VectorProduct)): + in_shape = cmd.ifm_box.get_size_shape() + emit.cmd0_with_param(cmd0.NPU_SET_IFM_DEPTH_M1, in_shape[-1] - 1) + else: + emit.cmd0_with_param(cmd0.NPU_SET_IFM_DEPTH_M1, out_shape[-1] - 1) + + for tens, box, ptr_ops, stride_ops, 
zero_point_op in ( + ( + cmd.ifm_tensor, + cmd.ifm_box, + (cmd1.NPU_SET_IFM_BASE0, cmd1.NPU_SET_IFM_BASE1, cmd1.NPU_SET_IFM_BASE2, cmd1.NPU_SET_IFM_BASE3), + (cmd1.NPU_SET_IFM_STRIDE_C, cmd1.NPU_SET_IFM_STRIDE_Y, cmd1.NPU_SET_IFM_STRIDE_X), + cmd0.NPU_SET_IFM_ZERO_POINT, + ), + ( + cmd.ifm2_tensor, + cmd.ifm2_box, + ( + cmd1.NPU_SET_IFM2_BASE0, + cmd1.NPU_SET_IFM2_BASE1, + cmd1.NPU_SET_IFM2_BASE2, + cmd1.NPU_SET_IFM2_BASE3, + ), + (cmd1.NPU_SET_IFM2_STRIDE_C, cmd1.NPU_SET_IFM2_STRIDE_Y, cmd1.NPU_SET_IFM2_STRIDE_X), + cmd0.NPU_SET_IFM2_ZERO_POINT, + ), + ( + cmd.ofm_tensor, + cmd.ofm_box, + (cmd1.NPU_SET_OFM_BASE0, cmd1.NPU_SET_OFM_BASE1, cmd1.NPU_SET_OFM_BASE2, cmd1.NPU_SET_OFM_BASE3), + (cmd1.NPU_SET_OFM_STRIDE_C, cmd1.NPU_SET_OFM_STRIDE_Y, cmd1.NPU_SET_OFM_STRIDE_X), + cmd0.NPU_SET_OFM_ZERO_POINT, + ), + ): + + if tens == None: + continue + + need_zero_point = (faf != None) or (fmf == "ConcatSliceWrite") + if ( + primary_op.type in set(("AvgPool", "AvgPoolAct")) and not need_zero_point + ) or tens.quantization == None: + # Actual integer operation, just set scale to 1 and zero point to 0 + emit.cmd0_with_param(zero_point_op, 0) + else: + assert tens.quantization.zero_point is not None, "need an actual zero point set" + emit.cmd0_with_param(zero_point_op, int(tens.quantization.zero_point)) + + if tens.shape == []: + # Empty shape, elementwise constant + ifm2_scalar = tens.quant_values.astype(np.uint8) + assert ifm2_scalar.size == 1 + emit.cmd0_with_param(cmd0.NPU_SET_IFM2_SCALAR, ifm2_scalar.item(0)) + continue + + height_0, height_1, width_0, addresses = tens.addresses_for_rolling_buffer( + box.start_coord, box.end_coord + ) + if npu_block_type != NpuBlockType.VectorProduct: + if tens == cmd.ifm_tensor: + emit.cmd0_with_param(cmd0.NPU_SET_IFM_HEIGHT0_M1, height_0 - 1) + emit.cmd0_with_param(cmd0.NPU_SET_IFM_HEIGHT1_M1, height_1 - 1) + emit.cmd0_with_param(cmd0.NPU_SET_IFM_WIDTH0_M1, width_0 - 1) + elif tens == cmd.ofm_tensor: + emit.cmd0_with_param(cmd0.NPU_SET_OFM_HEIGHT0_M1, height_0 - 1) + emit.cmd0_with_param(cmd0.NPU_SET_OFM_HEIGHT1_M1, height_1 - 1) + emit.cmd0_with_param(cmd0.NPU_SET_OFM_WIDTH0_M1, width_0 - 1) + elif tens == cmd.ifm2_tensor: + emit.cmd0_with_param(cmd0.NPU_SET_IFM2_HEIGHT0_M1, height_0 - 1) + emit.cmd0_with_param(cmd0.NPU_SET_IFM2_HEIGHT1_M1, height_1 - 1) + emit.cmd0_with_param(cmd0.NPU_SET_IFM2_WIDTH0_M1, width_0 - 1) + else: + if len(out_shape) == 2: + # TODO: N is put in W-dimension for now + # Should be spread over H and W, but then block size selectetion, + # and stride calculation should be changed + if tens == cmd.ifm_tensor: + emit.cmd0_with_param(cmd0.NPU_SET_IFM_WIDTH0_M1, out_shape[-2] - 1) + elif tens == cmd.ofm_tensor: + emit.cmd0_with_param(cmd0.NPU_SET_OFM_WIDTH0_M1, out_shape[-2] - 1) + else: + assert False + + for idx, addr in enumerate(addresses): + if addr is None: + addresses[idx] = 0 + + emit.cmd1_with_offset(ptr_ops[0], addresses[0]) + emit.cmd1_with_offset(ptr_ops[1], addresses[1]) + emit.cmd1_with_offset(ptr_ops[2], addresses[2]) + emit.cmd1_with_offset(ptr_ops[3], addresses[3]) + + strides = tens.get_strides() + emit.cmd1_with_offset(stride_ops[0], strides[1]) # stride between 16-byte channel blocks (C) + emit.cmd1_with_offset(stride_ops[2], strides[3]) # stride between horisontal values (W) + emit.cmd1_with_offset(stride_ops[1], strides[2]) # stride between vertical values (H) + + if tens.format == TensorFormat.NHCWB16: + # Check that all BasePointer addresses are aligned to 16 bytes + assert (int(addresses[0]) % 16) == 0 + assert 
(int(addresses[1]) % 16) == 0 + assert (int(addresses[2]) % 16) == 0 + assert (int(addresses[3]) % 16) == 0 + + ofm_dtype = cmd.ofm_tensor.dtype + assert ofm_dtype.type & BaseType.Int + prec = 0 + if ofm_dtype.size_in_bits() == 8: + prec = 0 + elif ofm_dtype.size_in_bits() == 16: + prec = 2 + else: + assert 0 + + if ofm_dtype.type & BaseType.Signed: + prec += 1 + + if use_global_scale: + # Set global scale bit, as opposed to using per channel scale + prec |= 1 << 8 + + if cmd.ofm_tensor.format == TensorFormat.NHCWB16: + prec |= 1 << 6 + + prec |= rounding_mode.value << 14 + + emit.cmd0_with_param(cmd0.NPU_SET_OFM_PRECISION, prec) + + prec = None + weight_bits = 8 + if cmd.weight_tensor is not None: + weight_bits = cmd.weight_tensor.dtype.size_in_bits() + + ifm_dtype = cmd.ifm_tensor.dtype + + assert weight_bits == 8, "Unsupported weight bit depth" + assert ifm_dtype.size_in_bits() in {8, 16} + + if ifm_dtype.size_in_bits() == 8: + if ifm_dtype.type & BaseType.Signed: + prec = ifm_precision.W8_S8 + else: + prec = ifm_precision.W8_U8 + elif ifm_dtype.size_in_bits() == 16: + if ifm_dtype.type & BaseType.Signed: + prec = ifm_precision.W8_S16 + else: + prec = ifm_precision.W8_U16 + + ifm_prec = prec.value + ifm2_prec = ifm_prec + + if cmd.ifm_tensor.format == TensorFormat.NHCWB16: + ifm_prec |= 1 << 6 + + ifm_prec |= op_to_scale << 8 + + emit.cmd0_with_param(cmd0.NPU_SET_IFM_PRECISION, ifm_prec) + + if cmd.ifm2_tensor is not None: + if cmd.ifm2_tensor.format == TensorFormat.NHCWB16: + ifm2_prec |= 1 << 6 + emit.cmd0_with_param(cmd0.NPU_SET_IFM2_PRECISION, ifm2_prec) + + emit_wait_commands(cmd) + + # Get op parameters + cur_ifm_block_depth = get_op_ifmofm_block_depth(arch, cmd) + cur_ofm_block = Block(ps.block_config[1], ps.block_config[0], ps.block_config[3]) + cur_ofm_rect = get_op_ofm_rect(cmd) + cur_ifm_rect = get_op_ifm_rect(cmd) + cur_kernel = get_op_kernel(cmd.ps) + cur_padLT = get_op_padding_lt(cmd) + if (prev_kernel is not None) and (cur_kernel is not None) and has_prev_op_dependency(prev_cmd, cmd): + if cmd.ifm_tensor.shape == prev_cmd.ofm_tensor.shape: + blockdep = arch.calc_block_dep( + prev_ifm_rect, + prev_ofm_rect, + prev_ifm_block_depth, + prev_ofm_block, + prev_kernel, + cur_ifm_rect, + cur_ofm_rect, + cur_ifm_block_depth, + cur_ofm_block, + cur_kernel, + cur_padLT, + ) + else: + blockdep = 0 + else: + blockdep = ArchitectureFeatures.MAX_BLOCKDEP + + # Set between every op (dependent or not) + blockdep = min(blockdep, arch.max_blockdep) + emit.cmd0_with_param(cmd0.NPU_SET_BLOCKDEP, blockdep) + prev_cmd = cmd + + if npu_block_type == NpuBlockType.ConvolutionMxN: + emit.cmd_do_operation(cmd0.NPU_OP_CONV) + elif npu_block_type == NpuBlockType.ConvolutionDepthWise: + emit.cmd_do_operation(cmd0.NPU_OP_DEPTHWISE) + elif npu_block_type == NpuBlockType.VectorProduct: + # Vector product is implemented using a 1x1 convolution + emit.cmd_do_operation(cmd0.NPU_OP_CONV) + elif npu_block_type == NpuBlockType.Pooling: + param = "Max" not in primary_op.type + emit.cmd_do_operation(cmd0.NPU_OP_POOL, param=param) + elif npu_block_type == NpuBlockType.ElementWise: + param = elementwise_mode_map[primary_op.type] + emit.cmd_do_operation(cmd0.NPU_OP_ELEMENTWISE, param) + else: + print("Warning: Skipping register command stream generation for", ps) + + # Fill in final part of command stream: + emit.cmd_do_operation(cmd0.NPU_OP_STOP, param=0xFFFF) + + sg.register_command_stream = emit.to_list() + if verbose: + emit.print_cmds() + print("number of commands", len(emit.cmd_stream)) + print("command stream 
length in words", len(sg.register_command_stream)) diff --git a/ethosu/vela/rewrite_graph.py b/ethosu/vela/rewrite_graph.py new file mode 100644 index 00000000..e6e24e62 --- /dev/null +++ b/ethosu/vela/rewrite_graph.py @@ -0,0 +1,171 @@ +# Copyright (C) 2020 Arm Limited or its affiliates. All rights reserved. +# +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +# Description: +# Functions for abstracting out the traversal and rewriting of graphs so that the optimisation passes can focus on the +# correct operation. +# +# Requires two lists, one of functions that rewrite Tensors, and one of functions that rewrite Operations. +# +# Pre-order traversal, this supports rewrites. Therefore, functions can return something other than the original value. +# +# Post-order traversal, this does not support rewrites. Therefore, functions must return the original value. + + +def rewrite_graph_pre_order(sg, arch, tensor_rewrite_list, op_rewrite_list, rewrite_unsupported=True): + + op_visit_dict = dict() + tens_visit_dict = dict() + + def visit_op(op): + if op in op_visit_dict: + return op_visit_dict[op] + res = op + prev_res = None + while prev_res != res: + prev_res = res + for rewrite in op_rewrite_list: + if res.run_on_npu or rewrite_unsupported: + res = rewrite(res, arch) + + op_visit_dict[op] = res + op_visit_dict[res] = res + + inputs = res.inputs + res.inputs = [] + for tens in inputs: + res.inputs.append(visit_tens(tens)) + + outputs = res.outputs + res.outputs = [] + for tens in outputs: + res.outputs.append(visit_tens(tens)) + + return res + + def visit_tens(tens): + if tens in tens_visit_dict: + return tens_visit_dict[tens] + + res = tens + prev_res = None + while prev_res != res: + prev_res = res + for rewrite in tensor_rewrite_list: + res = rewrite(res, arch) + + tens_visit_dict[tens] = res + tens_visit_dict[res] = res + + ops = res.ops + res.ops = [] + for op in ops: + res.ops.append(visit_op(op)) + return res + + sg.output_tensors = [visit_tens(tens) for tens in sg.output_tensors] + sg.refresh_after_modification() + + return sg + + +def visit_graph_post_order(sg, arch, tensor_visit_list, op_visit_list): + + op_visit_dict = dict() + tens_visit_dict = dict() + + def visit_op(op): + if op in op_visit_dict: + return op_visit_dict[op] + op_visit_dict[op] = op + + for tens in op.inputs: + visit_tens(tens) + + for visit in op_visit_list: + visit(op, arch) + + for tens in op.outputs: + visit_tens(tens) + + return op + + def visit_tens(tens): + if tens in tens_visit_dict: + return tens_visit_dict[tens] + + tens_visit_dict[tens] = tens + + for op in tens.ops: + visit_op(op) + + for visit in tensor_visit_list: + visit(tens, arch) + + return tens + + for tens in sg.output_tensors: + visit_tens(tens) + + sg.refresh_after_modification() + + return sg + + +def verify_graph_health(nng): + + for sg in nng.subgraphs: + verify_subgraph_health(sg) + + return True + + +def verify_subgraph_health(sg): + op_visit_dict = dict() + tens_visit_dict = dict() + + def visit_op(op): + if op in 
op_visit_dict: + return op_visit_dict[op] + op_visit_dict[op] = op + + for tens in op.inputs: + assert op in tens.consumers() + visit_tens(tens) + + for tens in op.outputs: + assert op in tens.ops + visit_tens(tens) + + return op + + def visit_tens(tens): + if tens in tens_visit_dict: + return tens_visit_dict[tens] + + tens_visit_dict[tens] = tens + + for op in tens.ops: + assert tens in op.outputs + visit_op(op) + + return tens + + for tens in sg.output_tensors: + visit_tens(tens) + + return True diff --git a/ethosu/vela/scaling.py b/ethosu/vela/scaling.py new file mode 100644 index 00000000..b255f938 --- /dev/null +++ b/ethosu/vela/scaling.py @@ -0,0 +1,91 @@ +# Copyright (C) 2020 Arm Limited or its affiliates. All rights reserved. +# +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +# Description: +# Contains various scaling calculations for weights, elementwise operations, pooling etc. + +import math +from .numeric_util import round_away_zero +from enum import IntEnum + + +class OperandToScale(IntEnum): + OPa = 1 + OPb = 2 + + +# Quantise floating point scale value into 32-bit int scale and 6-bit shift +def quantise_scale(scale): + significand, exponent = math.frexp(scale) + significand_q31 = int(round_away_zero(significand * (1 << 31))) + exponent_q31 = exponent - 31 + shift = exponent_q31 * -1 + + if shift >= (1 << 6): + # Shift outside of valid range, set scale to 0 + return 0, 16 + + return significand_q31, shift + + +# Calculate global OFM scale for Average Pooling +def quantise_pooling_scale(nr_kernel_elements, rescale_bits=0): + _, k = math.frexp(nr_kernel_elements - 1) + N = 31 - rescale_bits + scale = ((1 << (N + k)) + (1 << k)) // nr_kernel_elements + shift = N + k + + assert shift < (1 << 6) + + return scale, shift + + +# Calculate elementwise Mul OFM scale+shift +def elementwise_mul_scale(input_scale, input2_scale, output_scale): + output_rescale = (input_scale * input2_scale) / output_scale + out_scale, out_shift = quantise_scale(output_rescale) + return out_scale, out_shift + + +# Simplified version of calculating elementwise Add/Sub scales +def simplified_elementwise_add_sub_scale(input1_scale, input2_scale, output_scale, input_shift=16): + max_input_scale = max(input1_scale, input2_scale) + + input1_rescale = input1_scale * (1 << input_shift) / (2 * max_input_scale) + input2_rescale = input2_scale * (1 << input_shift) / (2 * max_input_scale) + output_rescale = (2 * max_input_scale) / (output_scale * (1 << input_shift)) + + out_scale, out_shift = quantise_scale(output_rescale) + + return input1_rescale, input2_rescale, out_scale, out_shift + + +# Advanced version of calculating elementwise Add/Sub scales +def advanced_elementwise_add_sub_scale(input1_scale, input2_scale, output_scale, bitdepth): + # Always scale the smaller of the input scales + max_input_scale = max(input1_scale, input2_scale) + min_input_scale = min(input1_scale, input2_scale) + input_shift = 20 if bitdepth == 8 else 14 + op_to_scale = OperandToScale.OPa if input1_scale < 
input2_scale else OperandToScale.OPb + + input1_rescale, _, out_scale, out_shift = simplified_elementwise_add_sub_scale( + min_input_scale, max_input_scale, output_scale, input_shift + ) + + in_scale, in_shift = quantise_scale(input1_rescale) + + return in_scale, in_shift, out_scale, out_shift, op_to_scale diff --git a/ethosu/vela/scheduler.py b/ethosu/vela/scheduler.py new file mode 100644 index 00000000..c35c1566 --- /dev/null +++ b/ethosu/vela/scheduler.py @@ -0,0 +1,949 @@ +# Copyright (C) 2020 Arm Limited or its affiliates. All rights reserved. +# +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +# Description: +# The scheduler costs various strategies for scheduling the network in order to select the block configuration. + +import enum +from .nn_graph import ( + TensorPurpose, + TensorSubPurpose, + TensorFormat, + MemArea, + SchedulingStrategy, + CascadedPass, + PassPlacement, + SchedulerRewrite, + Operation, + NpuBlockType, +) +from . import live_range +import numpy as np +from . import npu_performance +from . import stats_writer +from .npu_performance import make_bandwidth_array, make_macs_array, make_cycles_array, make_metrics_arrays, PassCycles +import time, copy +from .high_level_command_stream_generator import calc_allowed_ofm_ifm_overlap_for_pass_list +from .shared_buffer_allocation import ( + find_block_configs_suitable_for_pass_and_shared_buffer, + shared_buffer_allocation_for_pass_and_block_config, +) +from functools import lru_cache + + +class ParetoMetric(enum.Enum): + BwCycMem = 1 + BwCycMemBlkH = 2 + + def __str__(self): + return self.name + + +class SchedulerOptions: + def __init__( + self, + use_cascading=True, + use_ifm_ofm_overlap=True, + verbose_schedule=False, + verbose_pareto_frontier_schedules=False, + use_ifm_streaming=True, + pareto_metric=ParetoMetric.BwCycMem, + ): + self.use_cascading = use_cascading + self.use_ifm_ofm_overlap = use_ifm_ofm_overlap + self.verbose_schedule = verbose_schedule + self.verbose_pareto_frontier_schedules = verbose_pareto_frontier_schedules + self.use_ifm_streaming = use_ifm_streaming + self.pareto_metric = pareto_metric + + def __str__(self): + return type(self).__name__ + ": " + str(self.__dict__) + + __repr__ = __str__ + + +class Strategy: + __slots__ = "strat", "param", "passes", "block_configs", "rewrite_list", "bws", "macs", "cycles", "sram_used" + + def __init__(self, strat, param, passes, block_configs, rewrite_list, bws, macs, cycles, sram_used): + self.strat = strat + self.param = param + self.passes = passes + self.block_configs = block_configs + self.rewrite_list = ( + rewrite_list # list of (SchedulerRewrite, Tensor, new sub purpose, purpose param a, purpose param b, pass) + ) + self.bws = bws + self.macs = macs + self.cycles = cycles + self.sram_used = sram_used + + def __eq__(self, other): + if self.strat != other.strat: + return False + if self.param != other.param: + return False + if self.block_configs != other.block_configs: + return False + if self.passes != other.passes: + return 
False + if (self.bws != other.bws).any(): + return False + if (self.macs != other.macs).any(): + return False + if (self.cycles != other.cycles).any(): + return False + if self.sram_used != other.sram_used: + return False + return True + + def empty(self): + return not self.passes + + def key(self): + return self.passes[-1] + + def clone(self): + return Strategy( + self.strat, + self.param, + self.passes, + self.block_configs, + self.rewrite_list, + self.bws, + self.macs, + self.cycles, + self.sram_used, + ) + + def __str__(self): + return "<scheduler.Strategy: %s %s %s %s %s %s %s>" % ( + self.strat, + self.passes, + self.rewrite_list, + self.bws, + self.macs, + self.cycles, + self.sram_used, + ) + + __repr__ = __str__ + + +class StrategySet: + __slots__ = "strats", "bws", "macs", "cycles", "max_sram_used", "total_sram_used" + + def __init__(self, strats=None): + if strats is None: + strats = dict() + self.strats = strats # final pass in packed pass -> Strategy + self.bws, self.macs, self.cycles = make_metrics_arrays() + self.max_sram_used = 0 + self.total_sram_used = 0 + + def update_statistics(self): + self.bws = make_bandwidth_array() + self.max_sram_used = 0 + for ps, strat in self.strats.items(): + self.bws += strat.bws + self.macs += strat.macs + self.cycles += strat.cycles + self.max_sram_used = max(self.max_sram_used, strat.sram_used) + self.total_sram_used += strat.sram_used + + def clone_add_strategy(self, new_strat): + key = new_strat.key() + if key in self.strats: + assert new_strat == self.strats[key] + return self + else: + new_strats = dict(self.strats) + new_strats[key] = new_strat + new_set = StrategySet(new_strats) + new_set.bws = self.bws + new_strat.bws + new_set.macs = self.macs + new_strat.macs + new_set.cycles = self.cycles + new_strat.cycles + new_set.max_sram_used = max(self.max_sram_used, new_strat.sram_used) + new_set.total_sram_used = self.total_sram_used + new_strat.sram_used + return new_set + + def __eq__(self, other): + if (self.bws != other.bws).any(): + return False + if (self.macs != other.macs).any(): + return False + if (self.cycles != other.cycles).any(): + return False + if self.max_sram_used != other.max_sram_used: + return False + if self.total_sram_used != other.total_sram_used: + return False + if self.strats != other.strats: + return False + return True + + def __str__(self): + return "<scheduler.StrategySet: max_sram_used=%s passes_covered=%s>" % ( + self.max_sram_used, + list(ps.name for ps in self.strats), + ) + + __repr__ = __str__ + + +empty_strategy = Strategy( + SchedulingStrategy.Unknown, None, [], [], [], make_bandwidth_array(), make_macs_array(), make_cycles_array(), 0 +) +INFINITY = 1e30 + +ABORT_SEARCH = [] + + +def flatten_list_of_lists(lstlst): + lst = [] + for v in lstlst: + lst.extend(v) + return lst + + +class DynamicProgrammingScheduler: + def __init__(self, nng, sg, arch, sram_limit, options: SchedulerOptions): + self.nng = nng + self.sg = sg + self.arch = arch + self.sram_limit = sram_limit + self.options = copy.copy(options) + self.use_cascading = options.use_cascading + + if self.arch.feature_map_storage_mem_area != MemArea.Sram: + self.use_ifm_ofm_overlap = False # force off IFM/OFM overlap if IFMs and OFMs are not in the SRAM + self.use_ifm_ofm_overlap = options.use_ifm_ofm_overlap + + self.verbose_schedule = options.verbose_schedule + self.verbose_pareto_frontier_schedules = options.verbose_pareto_frontier_schedules + self.mem_area = MemArea.Sram + + self.bandwidth_weights = arch.bandwidth_weights + self.cycles_weight = arch.cycles_weight + self.max_sram_used_weight = arch.max_sram_used_weight +
self.n_combinations_searched = 0 + + self.feature_maps_not_in_fast_storage = ( + arch.tensor_storage_mem_area[TensorPurpose.FeatureMap] != arch.fast_storage_mem_area + ) + + self.pareto_max_candidates = 16 + + self.ifm_stream_npu_blocks = set( + (NpuBlockType.ConvolutionMxN, NpuBlockType.ConvolutionDepthWise, NpuBlockType.Pooling,) + ) + + num_pareto_metrics = 4 + view_values = ",".join(["d"] * num_pareto_metrics) + order_values = ["f%d" % (idx,) for idx in range(num_pareto_metrics)] + + def pareto_metric(self, candidate): + strat, strat_set = candidate + total_cycles = strat.cycles[PassCycles.Total] + strat_set.cycles[PassCycles.Total] + bws = strat.bws + strat_set.bws + last_block_height = 0 + if self.options.pareto_metric == ParetoMetric.BwCycMemBlkH and len(strat.block_configs) > 0: + last_block_height = strat.block_configs[-1][0] + + return ( + np.tensordot(bws, self.bandwidth_weights, axes=3) + total_cycles * self.cycles_weight, + strat_set.max_sram_used, + strat.sram_used, + last_block_height, + ) + + def filter_pareto_frontier(self, candidates, remove_equally_good_candidates): + + candidates = [cand for cand in candidates if max(cand[0].sram_used, cand[1].max_sram_used) <= self.sram_limit] + + if len(candidates) <= 1: + return candidates + assert remove_equally_good_candidates + start = time.time() + pareto_vals = np.zeros((len(candidates), DynamicProgrammingScheduler.num_pareto_metrics)) + ids = np.arange(len(candidates), dtype=np.int32) + for idx, cand in enumerate(candidates): + pareto_vals[idx] = self.pareto_metric(cand) + + sort_order = np.argsort( + pareto_vals.view(DynamicProgrammingScheduler.view_values), + order=DynamicProgrammingScheduler.order_values, + axis=0, + kind="stable", + ).flatten() + pareto_vals = pareto_vals[sort_order] + ids = ids[sort_order] + + pareto_frontier = [] + while len(ids) > 0: + pareto_frontier.append(candidates[ids[0]]) + not_dominated_by_first = (pareto_vals < pareto_vals[0]).any(axis=1) + ids = ids[not_dominated_by_first] + pareto_vals = pareto_vals[not_dominated_by_first] + + if len(pareto_frontier) > self.pareto_max_candidates: + pareto_frontier = self.sort_by_candidate_metric(pareto_frontier) + pareto_frontier = pareto_frontier[: self.pareto_max_candidates] + + return pareto_frontier + + def candidate_metric(self, candidate): + strat, strat_set = candidate + max_sram_used = max(strat_set.max_sram_used, strat.sram_used) + bws = strat.bws + strat_set.bws + total_cycles = strat.cycles[PassCycles.Total] + strat_set.cycles[PassCycles.Total] + + return ( + max_sram_used * self.max_sram_used_weight + + np.tensordot(bws, self.bandwidth_weights, axes=3) + + total_cycles * self.cycles_weight + ) + + def sort_by_candidate_metric(self, candidate_list): + sorted_list = list(sorted(candidate_list, key=self.candidate_metric)) + return sorted_list + + def best_candidate(self, candidate_list): + if len(candidate_list) == 0: + return ABORT_SEARCH + if len(candidate_list) == 1: + return candidate_list[0] + sorted_list = self.sort_by_candidate_metric(candidate_list) + return sorted_list[0] + + def graduate_strat(self, strat_type, sram_used, old_strat_data): + res = [] + for old_strat, old_strat_set in old_strat_data: + if old_strat.sram_used + sram_used > self.sram_limit: + continue # This strategy is bad, drop it + if old_strat_set.max_sram_used > self.sram_limit: + continue # This strategy is bad, drop it + assert old_strat.strat == SchedulingStrategy.Unknown + + new_strat = old_strat.clone() + new_strat.strat = strat_type + new_strat.sram_used = 
old_strat.sram_used + sram_used + + if self.use_ifm_ofm_overlap: + overlap = calc_allowed_ofm_ifm_overlap_for_pass_list( + new_strat.strat, new_strat.passes, new_strat.block_configs + ) + new_strat.sram_used -= overlap + + new_strat_set = old_strat_set.clone_add_strategy(new_strat) + res.append((empty_strategy, new_strat_set)) + return self.filter_pareto_frontier(res, remove_equally_good_candidates=True) + + def append_sram(self, sram_used, old_strat_data): + res = [] + for old_strat, strat_set in old_strat_data: + assert old_strat.strat == SchedulingStrategy.Unknown + assert old_strat.sram_used == 0 + new_strat = old_strat.clone() + new_strat.sram_used = old_strat.sram_used + sram_used + + res.append((new_strat, strat_set)) + return res + + def append_sram_block_config_performance_metrics(self, sram_used, block_config, metrics, old_strat_data): + res = [] + for old_strat, strat_set in old_strat_data: + assert old_strat.strat == SchedulingStrategy.Unknown + new_strat = old_strat.clone() + bws, macs, cycles = metrics[:3] + + new_strat.sram_used = old_strat.sram_used + sram_used + new_strat.block_configs = old_strat.block_configs + [block_config] + new_strat.bws = old_strat.bws + bws + new_strat.macs = old_strat.macs + macs + new_strat.cycles = old_strat.cycles + cycles + new_strat.bws, new_strat.macs, new_strat.cycles = npu_performance.collate_stats_for_cascaded_pass( + self.arch, new_strat.bws, new_strat.macs, new_strat.cycles + ) + + res.append((new_strat, strat_set)) + return res + + def append_sram_pass_block_config_performance_metrics_rewrite_list( + self, sram_used, new_pass, block_config, metrics, rewrite_list, old_strat_data + ): + res = [] + for old_strat, strat_set in old_strat_data: + assert old_strat.strat == SchedulingStrategy.Unknown + new_strat = old_strat.clone() + bws, macs, cycles = metrics[:3] + new_strat.sram_used = old_strat.sram_used + sram_used + new_strat.block_configs = old_strat.block_configs + [block_config] + new_strat.bws = old_strat.bws + bws + new_strat.macs = old_strat.macs + macs + new_strat.cycles = old_strat.cycles + cycles + new_strat.passes = old_strat.passes + [new_pass] + new_strat.bws, new_strat.macs, new_strat.cycles = npu_performance.collate_stats_for_cascaded_pass( + self.arch, new_strat.bws, new_strat.macs, new_strat.cycles + ) + new_strat.rewrite_list = old_strat.rewrite_list + rewrite_list + res.append((new_strat, strat_set)) + return res + + def append_sram_rewrite_list(self, sram_used, rewrite_list, old_strat_data): + res = [] + for old_strat, strat_set in old_strat_data: + assert old_strat.strat == SchedulingStrategy.Unknown + new_strat = old_strat.clone() + new_strat.sram_used = old_strat.sram_used + sram_used + new_strat.rewrite_list = old_strat.rewrite_list + rewrite_list + res.append((new_strat, strat_set)) + return res + + def pass_to_strat(self, strat_data): + res = {} + for strat in strat_data[1].strats.values(): + for ps in strat.passes: + res[ps] = strat + return res + + def compatible_strats(self, a, b): + intersection = a.keys() & b.keys() + for k in intersection: + if a[k] != b[k]: + return False + return True + + def collate_strats_for_passes(self, all_passes): + if len(all_passes) == 0: + return [(empty_strategy, StrategySet(dict()))] + if len(all_passes) == 1: + return all_passes[0] # save some space in the common case + all_strands = [[self.pass_to_strat(strat_data) for strat_data in strand] for strand in all_passes] + prev_combos = [dict()] + for j, strand in enumerate(all_strands): + new_combos = [] + for i, alt in 
enumerate(strand): + for prev in prev_combos: + if self.compatible_strats(prev, alt): + cmb = dict(prev) + cmb.update(all_passes[j][i][1].strats) + new_combos.append(cmb) + prev_combos = new_combos + + res = [] + for d in prev_combos: + s = StrategySet(d) + s.update_statistics() + res.append((empty_strategy, s)) + return res + + def search_all_but_one_predecessor(self, ps, pred_pass, pred_pass_data): + # get the rest of the predecessors + other_predecessors = [pred for pred in ps.dag_predecessors if pred != pred_pass] + other_predecessor_data = self.search_pass_list(other_predecessors) + + # pred strat data has an incomplete strategy, which we need + # to continue on, whereas the other ones have completed strategies. + # we need to merge these, but keep the incomplete strategy too. + + res = [] + for pred_pass_strat, pred_pass_strat_set in pred_pass_data: + all_strats = [ + [(empty_strategy, pred_pass_strat_set)], # pred strat data but with a dummy empty strategy + other_predecessor_data, # this one is fine to use as-is + ] + collated_strat_data = self.collate_strats_for_passes(all_strats) + strat_data = [(pred_pass_strat, strat_set) for _, strat_set in collated_strat_data] + res.extend(strat_data) + return res + + def calc_non_local_mem_usage(self): + ignore_subgraph_input_output_tensors = self.sg.placement == PassPlacement.Cpu + range_set = live_range.extract_live_ranges_from_passes( + self.sg, + self.mem_area, + mark_output_tensors_overlapping_with_input_tensors=True, + ignore_subgraph_input_output_tensors=ignore_subgraph_input_output_tensors, + ) + range_dict = range_set.ranges + + # find which ranges overlap passes but aren't input/outputs of the passes. + # these won't be counted by the dynamic programming search and must be counted in manually. 
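The non-local memory bookkeeping described in the comment above can be sketched with made-up numbers. This is not code from the patch: the tensor sizes and live ranges are invented, and plain tuples stand in for the live-range objects that the implementation below gets from live_range.extract_live_ranges_from_passes().

import numpy as np

n_timesteps = 4
mem_usage = np.zeros(n_timesteps, dtype=np.int64)

# (storage size in bytes, start time, end time) for three SRAM tensors
live_ranges = [(1024, 0, 2), (4096, 1, 4), (2048, 0, 3)]
for size, start, end in live_ranges:
    mem_usage[start:end] += size

# Say the pass at time 1 reads the 1024-byte tensor and writes the 4096-byte one.
# Whatever else is alive at time 1 is "non-local" and must be added to that
# pass's SRAM budget by hand, since the search never sees those tensors.
local_mem_usage = 1024 + 4096
non_local_mem_usage = mem_usage[1] - local_mem_usage
print(int(mem_usage[1]), int(non_local_mem_usage))   # 7168 2048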
+ end_pos = max(ps.time for ps in self.sg.passes) + 2 + mem_usage = np.zeros(end_pos) + self.sg.base_sram_used + non_local_mem_usage = np.zeros(end_pos, dtype=np.int64) + + for tens, rng in range_dict.items(): + storage_size = tens.storage_size() + assert tens.mem_area == self.mem_area + mem_usage[rng.start_time : rng.end_time] += storage_size + + for ps in self.sg.passes: + local_mem_usage = 0 + for tens in ps.inputs + ps.outputs + ps.intermediates: + if tens.mem_area != self.mem_area: + continue + + local_mem_usage += tens.storage_size() + + non_local_mem_usage[ps.time] = mem_usage[ps.time] - local_mem_usage + + self.non_local_mem_usage = non_local_mem_usage + + def search(self): + self.calc_non_local_mem_usage() + starting_passes = [ps for ps in self.sg.passes if not ps.successors] + strat_data = self.search_pass_list(starting_passes) + + _, best_set = self.best_candidate(strat_data) + + if self.verbose_pareto_frontier_schedules: + print( + "Scheduler searched %d combinations and found %d candidate schedules along the pareto frontier" + % (self.n_combinations_searched, len(strat_data,)) + ) + for idx, (_, strat_set) in enumerate(strat_data): + extra = "" + if strat_set == best_set: + extra = "(Best candidate)" + print("Candidate", idx, extra) + memory_used = {MemArea.Sram: strat_set.max_sram_used} + stats_writer.print_performance_metrics_for_strat( + self.arch, + "", + strat_set.cycles, + strat_set.macs, + strat_set.bws, + self.nng.batch_size, + memory_used, + len(self.sg.passes), + len(strat_set.strats), + ) + + return best_set + + def search_pass_list(self, pass_list): + all_strats = [] + for ps in pass_list: + strat = self.search_output(ps) + all_strats.append(strat) + strat_data = self.collate_strats_for_passes(all_strats) + for strd in strat_data: + for ps in pass_list: + assert ps in strd[1].strats # should have strategies for everything we asked to search + return strat_data + + def search_predecessors(self, ps): + + # protect against graphs with loops. 
collate_strats_for_passes will sort this out later so that + # we have strats for all passes + + pass_list = ps.dag_predecessors + strat_data = self.search_pass_list(pass_list) + + return strat_data + + @lru_cache(maxsize=None) + def search_output(self, ps): + + assert ps in self.sg.passes + candidate_list = [] + + candidate_list.extend(self.search_weight_streaming_output(ps)) + + if self.options.use_ifm_streaming: + candidate_list.extend(self.search_ifm_streaming_output(ps)) + + best = self.filter_pareto_frontier(candidate_list, remove_equally_good_candidates=True) + + if not best: + print( + "Warning: Dynamic search programming algorithm failed for pass %s, invoking fallback strategy" + % (ps.name,) + ) + return self.search_predecessors(ps) + + return best + + def search_ifm_streaming_output(self, ps): + if ps.placement != PassPlacement.Npu: + return ABORT_SEARCH + if ps.npu_block_type not in self.ifm_stream_npu_blocks: + return ABORT_SEARCH + strat_data = self.search_ifm_streaming_body(ps, False) + + sram_used = self.non_local_mem_usage[ps.time] + for tens in ps.outputs: + if tens.mem_area == self.mem_area: + sram_used += tens.storage_size() + + return self.graduate_strat(SchedulingStrategy.IfmStream, sram_used, strat_data) + + @lru_cache(maxsize=None) + def search_ifm_streaming_body(self, ps, force_outputs_to_fast_storage): + if ps.placement != PassPlacement.Npu: + return ABORT_SEARCH + if ps.npu_block_type not in self.ifm_stream_npu_blocks: + return ABORT_SEARCH + ifm_input_search_resuls = self.search_ifm_streaming_input(ps) + res = [] + + base_sram_used = 0 + for tens in ps.intermediates: + if tens.mem_area == self.mem_area: + base_sram_used += tens.storage_size() + + all_block_configs = self.get_block_configs(ps) + for block_config in all_block_configs: + all_strats = [] + + if self.use_cascading: + all_strats.extend(self.search_ifm_streaming_partial(ps, block_config)) + + all_strats.extend(ifm_input_search_resuls) + + rewrite_list = [] + sram_used = base_sram_used + + metrics = npu_performance.performance_metrics_for_pass( + self.arch, + ps, + block_config, + rewrite_list=rewrite_list, + force_outputs_to_fast_storage=force_outputs_to_fast_storage, + ) + + res.extend( + self.append_sram_pass_block_config_performance_metrics_rewrite_list( + sram_used, ps, block_config, metrics, rewrite_list, all_strats + ) + ) + + self.n_combinations_searched += len(res) + res = self.filter_pareto_frontier(res, remove_equally_good_candidates=True) + return res + + def search_ifm_streaming_partial(self, ps, block_config): + if ps.placement != PassPlacement.Npu: + return ABORT_SEARCH + + if len(ps.inputs) < 1: + return ABORT_SEARCH + + ifm_tensor = ps.ifm_tensor + + if ifm_tensor is None: + return ABORT_SEARCH + if ifm_tensor.purpose != TensorPurpose.FeatureMap: + return ABORT_SEARCH + if not ifm_tensor.storage_shape or len(ifm_tensor.storage_shape) != 4: + return ABORT_SEARCH + + pred_pass_list = [] + for pred_candidate in ps.dag_predecessors: + if len(pred_candidate.outputs) == 1 and pred_candidate.outputs[0] == ifm_tensor: + # we found a predecessor that produces this IFM tensor + if len(pred_candidate.successors) == 1 and pred_candidate.successors[0] == ps: + # and it only has one successor, namely us + if pred_candidate.placement == PassPlacement.Npu: + if pred_candidate.npu_block_type in self.ifm_stream_npu_blocks: + # and it is on the Npu and fusable - it's a candidate + pred_pass_list.append(pred_candidate) + + if not pred_pass_list: + return ABORT_SEARCH + + all_candidates = [] + for pred_pass 
in pred_pass_list: + # recurse into the next pass + ifm_strat_data = self.search_ifm_streaming_body(pred_pass, self.feature_maps_not_in_fast_storage) + + strat_data = self.search_all_but_one_predecessor(ps, pred_pass, ifm_strat_data) + for strat_opt in strat_data: + + pred_pass_block_config = strat_opt[0].block_configs[-1] + rolling_buffer_dims = npu_performance.rolling_buffer_dims_from_passes( + self.arch, pred_pass, pred_pass_block_config, ps, block_config + ) + if rolling_buffer_dims is None: + continue # this does not pack properly, skip it. + + sram_used = 0 + for tens in ps.inputs: + if tens != ifm_tensor: + if tens.mem_area == self.mem_area: + sram_used += tens.storage_size() + + rolling_buffer_y, rolling_buffer_x = rolling_buffer_dims + + rewrite_list = [ + ( + SchedulerRewrite.ChangeTensorSubPurpose, + ifm_tensor, + TensorSubPurpose.RollingBufferY, + rolling_buffer_y, + None, + ps, + ) + ] + sram_used += ifm_tensor.storage_size_for_sub_purpose( + TensorSubPurpose.RollingBufferY, rolling_buffer_y, None + ) + + all_candidates.extend(self.append_sram_rewrite_list(sram_used, rewrite_list, [strat_opt])) + + self.n_combinations_searched += len(all_candidates) + return all_candidates + + def get_block_configs(self, ps): + if ps.placement != PassPlacement.Npu: + return [(1, 1, 1, 1)] # default + + block_configs = find_block_configs_suitable_for_pass_and_shared_buffer(self.arch, ps) + + # Take a limited number of the largest blocks + if self.arch.block_config_limit > 0: + # Sort by block area, followed by depth + block_configs.sort(key=lambda cfg: (cfg[0] * cfg[1]) << 8 | cfg[3], reverse=True) + bound = min(len(block_configs), self.arch.block_config_limit) + # We take 'n' from the fat end of the list, and 'n' from the thin end of the list. + tmp = block_configs[:bound] + tmp.extend(block_configs[max(bound, len(block_configs) - bound) :]) + block_configs = tmp + + return block_configs + + def search_ifm_streaming_input(self, ps): + sram_used = 0 + for tens in ps.inputs: + if tens.mem_area == self.mem_area: + sram_used += tens.storage_size() + + return self.append_sram(sram_used, self.search_predecessors(ps)) + + def search_weight_streaming_output(self, ps): + strat_data = self.search_weight_streaming_body(ps) + + sram_used = self.non_local_mem_usage[ps.time] + for tens in ps.outputs: + if tens.mem_area == self.mem_area: + sram_used += tens.storage_size() + + return self.graduate_strat(SchedulingStrategy.WeightStream, sram_used, strat_data) + + @lru_cache(maxsize=None) + def search_weight_streaming_body(self, ps): + + strat_data = self.search_weight_streaming_input(ps) + + res = [] + + all_block_configs = self.get_block_configs(ps) + + for block_config in all_block_configs: + + sram_used = 0 + rewrite_list = [] + + for tens in ps.intermediates: + if tens.mem_area == self.mem_area: + if tens.purpose == TensorPurpose.Weights: + sram_used += tens.storage_size_for_sub_purpose( + TensorSubPurpose.DoubleBuffer, block_config[3] + ) + rewrite_list.append( + ( + SchedulerRewrite.ChangeTensorSubPurpose, + tens, + TensorSubPurpose.DoubleBuffer, + block_config[3], + None, + ps, + ) + ) + else: + sram_used += tens.storage_size() + + metrics = npu_performance.performance_metrics_for_pass( + self.arch, ps, block_config, rewrite_list=rewrite_list + ) + + res.extend( + self.append_sram_pass_block_config_performance_metrics_rewrite_list( + sram_used, ps, block_config, metrics, rewrite_list, strat_data + ) + ) + + self.n_combinations_searched += len(res) + res = self.filter_pareto_frontier(res, 
remove_equally_good_candidates=True) + return res + + def search_weight_streaming_input(self, ps): + sram_used = 0 + for tens in ps.inputs: + if tens.mem_area == self.mem_area: + sram_used += tens.storage_size() + + return self.append_sram(sram_used, self.search_predecessors(ps)) + + def apply_result(self, strat_set, arch): + pass_to_cascaded_pass = dict() + for _, strat in strat_set.strats.items(): + # rewrite the tensors that need this first. e.g. make rolling buffers + inputs = [] + intermediates = [] + outputs = [] + + for ps in strat.passes: + inputs += ps.inputs + intermediates += ps.intermediates + outputs += ps.outputs + + for tens in set(inputs) & set(outputs): + # tensors that are in both sets are intermediates + + # find pass with input/output tensor, and check if they are both placed on NPU + input_placement = None + output_placement = None + for ps in strat.passes: + if tens in ps.inputs: + input_placement = ps.placement + if tens in ps.outputs: + output_placement = ps.placement + if input_placement == output_placement == PassPlacement.Npu: + tens.set_format(TensorFormat.NHCWB16, arch) + + intermediates.append(tens) + inputs.remove(tens) + outputs.remove(tens) + + for rewrite_op, tens, sub_purpose, param_a, param_b, ps in strat.rewrite_list: + if rewrite_op == SchedulerRewrite.ChangeTensorSubPurpose: + tens.mem_area = self.arch.fast_storage_mem_area + tens.set_new_sub_purpose(sub_purpose, param_a, param_b) + else: + assert 0, "unknown rewrite_op " + str(rewrite_op) + + is_element_wise = True + for ps in strat.passes: + assert ps.placement == strat.passes[0].placement + if not ps.is_element_wise: + is_element_wise = False + break + + cascaded_pass = CascadedPass( + strat.passes[0].name, + strat.strat, + inputs, + intermediates, + outputs, + strat.passes, + strat.passes[0].placement, + is_element_wise, + ) + assert strat.sram_used >= 0 + cascaded_pass.sram_used = strat.sram_used + + for idx, ps in enumerate(strat.passes): + assert ps not in pass_to_cascaded_pass + pass_to_cascaded_pass[ps] = cascaded_pass + ps.cascade = cascaded_pass + ps.block_config = strat.block_configs[idx] + + if ps.placement == PassPlacement.Npu: + ps.shared_buffer = shared_buffer_allocation_for_pass_and_block_config( + self.arch, ps, ps.block_config + ) + assert ps.shared_buffer is not None + + for op in ps.ops: + subgraph = op.attrs.get("subgraph") + if subgraph: + subgraph.base_sram_used = cascaded_pass.sram_used + + # all passes should have a cascaded pass now + if len(pass_to_cascaded_pass) != len(self.sg.passes): + print( + "mismatch: we have %d passes, but only %d have cascaded passes associated" + % (len(self.sg.passes), len(pass_to_cascaded_pass)) + ) + for ps in self.sg.passes: + if not ps in pass_to_cascaded_pass: + print("%3d pass missing cascaded pass %s" % (ps.time, ps)) + + assert len(pass_to_cascaded_pass) == len(self.sg.passes) + # we have all the passes, but we need to put them in order and build predecessor/successor links. 
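The input/intermediate/output bookkeeping performed in apply_result() above can be reduced to a small sketch. It is illustrative only: strings stand in for Tensor objects, the function name is invented, and the real code also checks pass placement before switching intermediates to NHCWB16.

def classify_cascade_tensors(per_pass_inputs, per_pass_outputs):
    # Tensors both produced and consumed inside the cascade are intermediates;
    # the rest stay as the cascaded pass's external inputs/outputs
    inputs = [t for ps_in in per_pass_inputs for t in ps_in]
    outputs = [t for ps_out in per_pass_outputs for t in ps_out]
    intermediates = set(inputs) & set(outputs)
    external_inputs = [t for t in inputs if t not in intermediates]
    external_outputs = [t for t in outputs if t not in intermediates]
    return external_inputs, sorted(intermediates), external_outputs

# Two fused passes: the first produces "t1", the second consumes it and produces "t2"
assert classify_cascade_tensors([["ifm"], ["t1"]], [["t1"], ["t2"]]) == (
    ["ifm"], ["t1"], ["t2"]
)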
+ + visit_pass_set = set() + cascaded_passes = [] + + def visit_pass(ps): + if ps in visit_pass_set: + return + visit_pass_set.add(ps) + + cps = ps.cascade + dont_traverse = set(cps.passes) + + for ps in cps.passes: + for pred in ps.predecessors: + if pred in dont_traverse: + continue + visit_pass(pred) + + cascaded_passes.append(cps) + + starting_passes = [ps for ps in self.sg.passes if not ps.successors] + for ps in starting_passes: + visit_pass(ps) + + # reorder so startup init cascaded passes come first + def is_startup_cascaded_pass(cps): + if not cps.passes: + return False + return cps.placement == PassPlacement.StartupInit + + cascaded_passes = [cps for cps in cascaded_passes if is_startup_cascaded_pass(cps)] + [ + cps for cps in cascaded_passes if not is_startup_cascaded_pass(cps) + ] + + self.sg.cascaded_passes = cascaded_passes + self.sg.build_cascaded_pass_links() + + +def schedule_passes(nng, arch, options: SchedulerOptions): + + for sg in nng.subgraphs: + sg.base_sram_used = 0 + + for sg in nng.subgraphs: + # re-entering the same nodes from different contexts requires us to + # build a simplified directed acyclic (DAG) version of the graph to + # use for traversal, rather than using a visit dictionary. this avoids + # recursing infinitely due to loops. + sg.build_pass_dag_predecessors() + + dps = DynamicProgrammingScheduler(nng, sg, arch, arch.sram_size, options) + + strat_set = dps.search() + + dps.apply_result(strat_set, arch) + + if options.verbose_schedule: + sg.print_cascaded_passes() diff --git a/ethosu/vela/shared_buffer_allocation.py b/ethosu/vela/shared_buffer_allocation.py new file mode 100644 index 00000000..b5408d19 --- /dev/null +++ b/ethosu/vela/shared_buffer_allocation.py @@ -0,0 +1,199 @@ +# Copyright (C) 2020 Arm Limited or its affiliates. All rights reserved. +# +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +# Description: +# Shared buffer allocation works out how to allocate the Ethos-U55 shared buffer for a given pass. + +import numpy as np +from .nn_graph import NpuBlockType +from .numeric_util import round_up_divide, round_up +from .architecture_features import Block, Kernel, SHRAMElements, SharedBufferArea, ArchitectureFeatures +from . 
import pass_packing + + +class SharedBufferAllocation: + def __init__(self, arch, ps): + self.arch = arch + + self.bank_locations = np.zeros(SharedBufferArea.Size) + self.banks_required = np.zeros(SharedBufferArea.Size) + + ifm_tensor, ifm2_tensor, weight_tensor, ofm_tensor = ps.get_primary_op_ifm_ifm2_weights_ofm() + + strides = (1, 1, 1, 1) + dilation = (1, 1, 1, 1) + self.kernel = Kernel(1, 1) + is_elementwise = ps.npu_block_type == NpuBlockType.ElementWise + + if ps.primary_op: + strides = ps.primary_op.attrs.get("strides", strides) + dilation = ps.primary_op.attrs.get("dilation", dilation) + k_h = 1 + k_w = 1 + if weight_tensor: + if ps.primary_op.type != "FullyConnectedAct": + k_h = weight_tensor.shape[0] + k_w = weight_tensor.shape[1] + else: + k_h = ps.primary_op.attrs.get("filter_height", 1) + k_w = ps.primary_op.attrs.get("filter_width", 1) + + self.kernel = Kernel(k_w, k_h, strides[2], strides[1], dilation[2], dilation[1]) + + self.is_equal_depth_op = is_elementwise or ps.npu_block_type in ( + NpuBlockType.ConvolutionDepthWise, + NpuBlockType.Pooling, + ) + self.strides = strides + + self.use_accumulator_element = SHRAMElements.Acc32 + if is_elementwise: + self.use_ifm_element = SHRAMElements.IFM8_Elementwise + else: + self.use_ifm_element = SHRAMElements.IFM8 + + self.ifm_bits = 0 + self.ifm_depth = 0 + if ifm_tensor: + self.ifm_bits = ifm_tensor.dtype.size_in_bits() + if ifm_tensor.shape == [] and is_elementwise: + # Elementwise operator with scalar in ifm, use ifm2 depth + self.ifm_depth = ifm2_tensor.shape[-1] + else: + self.ifm_depth = ifm_tensor.shape[-1] + if self.ifm_bits == 16: + self.use_accumulator_element = SHRAMElements.Acc40 + self.use_ifm_element = self.use_ifm_element + 1 + assert (self.use_ifm_element == SHRAMElements.IFM16) or ( + self.use_ifm_element == SHRAMElements.IFM16_Elementwise + ) + else: + assert self.ifm_bits == 8, "Unexpected IFM bitdepth" + + self.ifm_block_depth = arch.calc_ifm_block_depth(self.ifm_depth, self.ifm_bits) + self.ofm_tensor = ofm_tensor + + self.banks_required[SharedBufferArea.Weights] = arch.shram_reserved_weight_banks + self.banks_required[SharedBufferArea.OFM] = arch.shram_reserved_output_banks + + def is_valid(self): + # Assign zero-based bank starts (first element remains zero) + self.bank_locations[1:] = np.cumsum(self.banks_required)[:-1] + + # Accumulator area is measured from the end of the buffer + self.bank_locations[SharedBufferArea.Accumulators] = ( + self.arch.shram_total_banks - self.banks_required[SharedBufferArea.Accumulators] + ) + ifm_end = self.bank_locations[SharedBufferArea.IFM] + self.banks_required[SharedBufferArea.IFM] + return ifm_end <= self.bank_locations[SharedBufferArea.Accumulators] + + def try_block(self, ofm_block: Block): + # Get IFM block configuration + ifm_block_depth = ofm_block.depth if self.is_equal_depth_op else self.ifm_block_depth + ifm_block = self.arch.get_ifm_block_size(ifm_block_depth, ofm_block, self.kernel) + ifm_config = self.arch.get_block_config(ifm_block.width, ifm_block.height, ifm_block.depth) + if ifm_config is None: + return None + + # Get OFM block configuration + ofm_config = self.arch.get_block_config(ofm_block.width, ofm_block.height, ofm_block.depth) + if ofm_config is None: + return None + + # Update bank counts for IFM and Accumulator + self.banks_required[SharedBufferArea.IFM] = ifm_config.banks[self.use_ifm_element] + self.banks_required[SharedBufferArea.Accumulators] = ofm_config.banks[self.use_accumulator_element] + + # Validating calculates bank layout and returns 
validity + if not self.is_valid(): + return None + + return (ofm_block.height, ofm_block.width, ifm_block.depth, ofm_block.depth) + + def generate_used_mask(self, active_set): + res = np.zeros(self.arch.shram_total_banks, dtype=np.int64) + for kind in active_set: + start = int(self.bank_locations[kind]) + end = start + int(self.banks_required[kind]) + res[start:end] = 1 + return res + + def is_compatible(first, second): + """See if the bank allocations of two convolutions are compatible, + so that they can run back-to-back without a fence in between""" + + first_set = set((SharedBufferArea.OFM, SharedBufferArea.Accumulators)) + second_set = set((SharedBufferArea.IFM, SharedBufferArea.Weights)) + + first_mask = first.generate_used_mask(first_set) + second_mask = second.generate_used_mask(second_set) + + if np.sum(first_mask & second_mask): + # overlap + return False + + return True + + +def shared_buffer_allocation_for_pass_and_block_config(arch, ps, block_config): + alloc = SharedBufferAllocation(arch, ps) + assert (alloc.ifm_block_depth == block_config[2]) or alloc.is_equal_depth_op + if alloc.try_block(Block(block_config[1], block_config[0], block_config[3])): + return alloc + + return None + + +def find_block_configs_suitable_for_pass_and_shared_buffer(arch, ps): + alloc = SharedBufferAllocation(arch, ps) + + if arch.override_block_config: + config = alloc.try_block(arch.override_block_config) + assert config, "Block config override cannot be used" + return [config] + + # Constrain the search space if the OFM is smaller than the max block size + # - Add other block search constraints here if required + if len(alloc.ofm_tensor.shape) == 2: + max_block_height = max_block_width = alloc.ofm_tensor.shape[0] + else: + max_block_width = alloc.ofm_tensor.shape[-2] + max_block_height = alloc.ofm_tensor.shape[-3] + + # Common block depth + max_block_depth = alloc.ofm_tensor.shape[-1] + + # Constrain to valid ranges before search + max_block_width = min(arch.ofm_block_max.width, max_block_width) + max_block_height = min(arch.ofm_block_max.height, max_block_height) + max_block_depth = min(arch.ofm_block_max.depth, max_block_depth) + + valid_block_configs = [] + # Try a range of block shapes against this pass + for w in range(arch.ofm_ublock.width, max_block_width + arch.ofm_ublock.width, arch.ofm_ublock.width): + for h in range(arch.ofm_ublock.height, max_block_height + arch.ofm_ublock.height, arch.ofm_ublock.height): + # Try valid OFM block depths + for c in range(arch.ofm_ublock.depth, max_block_depth + arch.ofm_ublock.depth, arch.ofm_ublock.depth): + # OFM block depth has the constraint that if it causes the OFM to be + # split, it must be a multiple of the OFM split size + if (c >= max_block_depth) or (c < max_block_depth and (c % ArchitectureFeatures.OFMSplitDepth) == 0): + config = alloc.try_block(Block(w, h, c)) + if config: + valid_block_configs.append(config) + + assert len(valid_block_configs) > 0 + return valid_block_configs diff --git a/ethosu/vela/stats_writer.py b/ethosu/vela/stats_writer.py new file mode 100644 index 00000000..c4b4cd9e --- /dev/null +++ b/ethosu/vela/stats_writer.py @@ -0,0 +1,367 @@ +# Copyright (C) 2020 Arm Limited or its affiliates. All rights reserved. +# +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +# Description: +# Writes out per-pass and summary performance statistics to CSV files. + +import numpy as np +from .nn_graph import MemArea, TensorPurpose, PassPlacement +from .npu_performance import PassCycles, MacCount, BandwidthDirection +import csv +from .numeric_util import round_up_to_int +import sys + + +def write_summary_metrics_csv(nng, summary_filename, arch): + with open(summary_filename, "w") as f: + writer = csv.writer(f) + + labels = [ + "experiment", + "network", + ] + + labels += ( + ["accelerator_configuration", "system_config", "npu_clock", "sram_size"] + + [area.identifier_name() + "_bandwidth" for area in MemArea.all()] + + ["weights_storage_area", "feature_map_storage_area"] + ) + + labels += [ + "inferences_per_second", + "batch_size", + "inference_time", + "passes_before_fusing", + "passes_after_fusing", + ] + labels += [area.identifier_name() + "_memory_used" for area in MemArea.all()] + labels += ["on_chip_flash_bits_per_element", "off_chip_flash_bits_per_element"] + + for mem_area in MemArea.all(): + labels += [ + mem_area.identifier_name() + "_feature_map_read_bytes", + mem_area.identifier_name() + "_feature_map_write_bytes", + mem_area.identifier_name() + "_weight_read_bytes", + mem_area.identifier_name() + "_weight_write_bytes", + mem_area.identifier_name() + "_total_bytes", + ] + + labels += ["nn_macs", "hardware_macs", "nn_tops", "hardware_tops"] + + labels += ["cycles_" + kind.identifier_name() for kind in PassCycles.all()] + + writer.writerow(labels) + + data_items = [ + "default", + nng.name, + ] + + if arch: + data_items += ( + [arch.accelerator_config, arch.system_config, arch.npu_clock, arch.sram_size / 1024] + + [arch.memory_bandwidths_per_second[mem_area] / 1000.0 / 1000 / 1000 for mem_area in MemArea.all()] + + [ + arch.tensor_storage_mem_area[TensorPurpose.Weights].display_name(), + arch.tensor_storage_mem_area[TensorPurpose.FeatureMap].display_name(), + ] + ) + + midpoint_inference_time = nng.cycles[PassCycles.Total] / arch.npu_clock + midpoint_fps = 1 / midpoint_inference_time + + n_passes = sum(len(sg.passes) for sg in nng.subgraphs) + n_cascaded_passes = sum(len(sg.cascaded_passes) for sg in nng.subgraphs) + + data_items += [midpoint_fps, nng.batch_size, midpoint_inference_time, n_passes, n_cascaded_passes] + data_items += [nng.memory_used.get(mem_area, 0) / 1024.0 for mem_area in MemArea.all()] + + data_items += [ + nng.bits_per_element.get(MemArea.OnChipFlash, 0.0), + nng.bits_per_element.get(MemArea.OffChipFlash, 0.0), + ] + + for mem_area in MemArea.all(): + bws = nng.bandwidths[mem_area] + total_bw = np.sum(bws) + weight_bws = bws[TensorPurpose.Weights] + fm_bws = bws[TensorPurpose.FeatureMap] + data_items += [ + fm_bws[BandwidthDirection.Read], + fm_bws[BandwidthDirection.Write], + weight_bws[BandwidthDirection.Read], + weight_bws[BandwidthDirection.Write], + total_bw, + ] + + data_items += [ + nng.macs[MacCount.NeuralNetworkMacs], + nng.macs[MacCount.HardwareMacs], + nng.macs[MacCount.NeuralNetworkMacs] * 2 * midpoint_fps / 1e12, + nng.macs[MacCount.HardwareMacs] * 2 * midpoint_fps / 1e12, + ] + + data_items += [nng.cycles[kind] for 
kind in PassCycles.all()] + + writer.writerow(data_items) + + +def write_pass_metrics_csv(nng, pass_filename): + + with open(pass_filename, "w") as f: + writer = csv.writer(f) + + purpose_list = ( + ("total", (TensorPurpose.Weights, TensorPurpose.FeatureMap)), + ("weights", (TensorPurpose.Weights,)), + ("feature_map", (TensorPurpose.FeatureMap,)), + ) + + direction_list = ( + ("total", (BandwidthDirection.Read, BandwidthDirection.Write)), + ("read", (BandwidthDirection.Read,)), + ("write", (BandwidthDirection.Write,)), + ) + bandwidth_names = [] + bandwidth_indices = [] + for mem_area in MemArea.all(): + for purpose, purpose_candidates in purpose_list: + for direction, direction_candidates in direction_list: + label = "bytes_%s_%s_%s" % (mem_area.identifier_name(), purpose, direction) + bandwidth_names.append(label) + bandwidth_indices.append((mem_area, purpose_candidates, direction_candidates)) + + all_macs = MacCount.all() + all_cycles = ( + PassCycles.Total, + PassCycles.Dpu, + PassCycles.ElementWise, + PassCycles.Cpu, + PassCycles.SramAccess, + PassCycles.DramAccess, + PassCycles.OnChipFlashAccess, + PassCycles.OffChipFlashAccess, + ) + writer.writerow( + [ + "name", + "operators", + "placement", + "streaming_strategy", + "block_config_height", + "block_config_width", + "block_config_input_channels", + "block_config_output_channels", + "n_blocks_in_pass", + ] + + ["cycles_" + v.identifier_name() for v in all_cycles] + + [v.identifier_name() for v in all_macs] + + bandwidth_names + + ["sram_used"] + ) + + def write_subgraph(sg): + for cps in sg.cascaded_passes: + if cps.placement == PassPlacement.StartupInit: + continue # skip the dummy init pass + + for ps in cps.passes: + if len(ps.ops) == 1 and ps.ops[0].type == "NpuOp": + # just treat this as a call, unroll it + write_subgraph(ps.ops[0].attrs["subgraph"]) + continue + stats = [ps.name, " ".join(op.type for op in ps.ops)] + stats += [ps.placement.name] + stats += [cps.strategy.name] + stats += list(ps.block_config) + stats += [ps.n_blocks] + stats += [round_up_to_int(ps.cycles[v]) for v in all_cycles] + stats += [round_up_to_int(ps.macs[v]) for v in all_macs] + for indices in bandwidth_indices: + res = 0 + i = indices[0] + for j in indices[1]: + for k in indices[2]: + res += round_up_to_int(ps.bandwidths[i, j, k]) + stats.append(res) + stats += [ps.sram_used] + + writer.writerow(stats) + + write_subgraph(nng.get_root_subgraph()) + + +def print_performance_metrics_for_strat( + arch, + name, + cycles, + macs, + bandwidths, + batch_size, + memory_used, + num_passes, + num_cascaded_passes, + n_operations=0, + cpu_operations=[], + bits_per_element=None, + show_cpu_operations=False, + f=sys.stdout, +): + + orig_mem_areas_labels = [(v, v.display_name()) for v in MemArea.all()] + + midpoint_inference_time = cycles[PassCycles.Total] / arch.npu_clock + midpoint_fps = 1 / midpoint_inference_time + + mem_area_labels = [ + (mem_area, label) for mem_area, label in orig_mem_areas_labels if np.sum(bandwidths[mem_area]) > 0 + ] + + if name: + print("", file=f) + print("Network summary for", name, file=f) + print("Accelerator configuration %20s" % (arch.accelerator_config,), file=f) + print("System configuration %20s" % (arch.system_config,), file=f) + print("Accelerator clock %12d MHz" % (arch.npu_clock / 1e6,), file=f) + for mem_area, label in mem_area_labels: + print( + "Design peak %-25s %12.2f GB/s" + % (label + " bandwidth", arch.memory_bandwidths_per_second[mem_area] / 1000.0 / 1000 / 1000,), + file=f, + ) + + print(file=f) + for mem_area, 
label in mem_area_labels: + if not mem_area in memory_used: + continue + + aug_label = label + " used" + + extra = "" + if (mem_area == MemArea.OnChipFlash or mem_area == MemArea.OffChipFlash) and bits_per_element is not None: + extra = " (%.2f bits per element)" % (bits_per_element[mem_area],) + + print("Total %-25s %12.2f KiB%s" % (aug_label, memory_used[mem_area] / 1024.0, extra), file=f) + + print(file=f) + print("%d passes fused into %d" % (num_passes, num_cascaded_passes), file=f) + + n_cpu_operations = len(cpu_operations) + if n_operations > 0: + print( + "%d/%d (%4.1f %%) operations falling back to the CPU" + % (n_cpu_operations, n_operations, n_cpu_operations / n_operations * 100), + file=f, + ) + + if show_cpu_operations: + for op in cpu_operations: + + def format_tens_list(lst): + return " ".join(str(list(tens.shape)) for tens in lst) + + print( + "CPU operation: %s, inputs %s, outputs %s" + % (op.type, format_tens_list(op.inputs), format_tens_list(op.outputs)), + file=f, + ) + + print("", file=f) + + for mem_area, label in mem_area_labels: + bws = bandwidths[mem_area] + total_bw = np.sum(bws) + weight_bws = bws[TensorPurpose.Weights] + fm_bws = bws[TensorPurpose.FeatureMap] + aug_label = label + " bandwidth" + print( + "Average %-25s %12.2f GB/s" % (aug_label, total_bw * midpoint_fps / 1000.0 / 1000.0 / 1000.0,), + file=f, + ) + print( + "Input %-25s %12.2f MB/batch" + % (aug_label, np.sum(fm_bws[BandwidthDirection.Read]) / 1000.0 / 1000.0,), + file=f, + ) + print("Weight %-25s %12.2f MB/batch" % (aug_label, np.sum(weight_bws) / 1000.0 / 1000.0,), file=f) + print( + "Output %-25s %12.2f MB/batch" + % (aug_label, np.sum(fm_bws[BandwidthDirection.Write]) / 1000.0 / 1000.0,), + file=f, + ) + print("Total %-25s %12.2f MB/batch" % (aug_label, total_bw / 1000.0 / 1000.0,), file=f) + print( + "Total %-25s per input %9.2f MB/inference (batch size %d)" + % (aug_label, total_bw / 1000.0 / 1000.0 / batch_size, batch_size), + file=f, + ) + print(file=f) + + print("Neural network macs %12d MACs/batch" % (macs[MacCount.NeuralNetworkMacs],), file=f) + print("Hardware macs %12d MACs/batch" % (macs[MacCount.HardwareMacs],), file=f) + print( + "Network Tops/s %12.2f Tops/s" + % (macs[MacCount.NeuralNetworkMacs] * 2 * midpoint_fps / 1e12), + file=f, + ) + print( + "Hardware Tops/s %12.2f Tops/s" + % (macs[MacCount.HardwareMacs] * 2 * midpoint_fps / 1e12), + file=f, + ) + print(file=f) + + for kind in PassCycles.all(): + aug_label = kind.display_name() + " cycles" + cyc = cycles[kind] + print("%-30s %12d cycles/batch" % (aug_label, cyc,), file=f) + print(file=f) + + print( + "Batch Inference time %7.2f ms, %7.2f inferences/s (batch size %d)" + % (midpoint_inference_time * 1000, midpoint_fps, batch_size), + file=f, + ) + print(file=f) + + +def print_performance_metrics(nng, arch, show_cpu_operations=False, f=sys.stdout): + n_passes = sum(len(sg.passes) for sg in nng.subgraphs) + n_cascaded_passes = sum(len(sg.cascaded_passes) for sg in nng.subgraphs) + n_operations = sum(len(ps.ops) for sg in nng.subgraphs for ps in sg.passes) + cpu_operations = sum((ps.ops for sg in nng.subgraphs for ps in sg.passes if ps.placement == PassPlacement.Cpu), []) + return print_performance_metrics_for_strat( + arch, + nng.name, + nng.cycles, + nng.macs, + nng.bandwidths, + nng.batch_size, + nng.memory_used, + n_passes, + n_cascaded_passes, + n_operations, + cpu_operations, + nng.bits_per_element, + show_cpu_operations, + f, + ) + + +def write_human_friendly_metrics(nng, arch, filename): + f = open(filename, "w") + 
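+    # Reuse print_performance_metrics() below, but direct its output into the
+    # report file opened above instead of the default sys.stdout.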
print_performance_metrics(nng, arch, f=f) diff --git a/ethosu/vela/supported_operators.py b/ethosu/vela/supported_operators.py new file mode 100644 index 00000000..23135f8a --- /dev/null +++ b/ethosu/vela/supported_operators.py @@ -0,0 +1,243 @@ +# Copyright (C) 2020 Arm Limited or its affiliates. All rights reserved. +# +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +# Description: +# The SupportedOperators class which is a collection of all supported operators and parameter checks. + +from .data_type import BaseType + + +class SupportedOperators: + def __init__(self): + # Categorised lists of supported operators + self.npu_pre_ops = set(("QuantizedResizeBilinear", "SplitSliceRead")) + self.convolution_ops = set(("Conv2DBiasAct", "Conv2D", "QuantizedConv2D", "Conv2DBackpropInputSwitched")) + self.depthwise_convolution_ops = set( + ("DepthwiseConv2dBiasAct", "DepthwiseConv2dNative", "QuantizedDepthwiseConv2D") + ) + self.max_pooling_ops = set(("QuantizedMaxPool", "MaxPool", "MaxPoolAct")) + self.avg_pooling_ops = set(("QuantizedAvgPool", "AvgPool", "AvgPoolAct")) + self.pooling_ops = self.max_pooling_ops | self.avg_pooling_ops + self.fc_vector_products = set(("QuantizedMatMul", "MatMul", "FullyConnectedAct")) + self.mac_main_ops = ( + # convolutions + self.convolution_ops + # depth-wise convolutions + | self.depthwise_convolution_ops + # pooling + | self.pooling_ops + # FC layers + | self.fc_vector_products + # RNN/LSTM/GRU + | set(("BlockLSTM")) + ) + self.elem_wise_main_ops = set( + ( + # element-wise + "AddAct", + "MulAct", + "SubAct", + "QuantizedAdd", + "QuantizedSub", + "QuantizedMul", + "Mul", + "Add", + "Sub", + "Minimum", + "Maximum", + ) + ) + self.activation_ops = set( + ("QuantizedRelu", "QuantizedRelu1", "QuantizedRelu6", "Relu", "Relu6", "ReluN1To1", "Sigmoid", "Tanh") + ) + self.npu_post_ops = ( + # activation functions + self.activation_ops + # concatenation write direction + | set(("ConcatSliceWrite")) + # bias add and batch norm + | set(("QuantizedBiasAdd", "Requantize", "QuantizedBatchNorm", "BiasAdd", "FusedBatchNorm")) + ) + self.split_ops = set(("Split", "StridedSlice", "Slice", "UnpackReshaped", "Unpack")) + self.concat_ops = set(("Concat", "ConcatV2", "QuantizedConcat", "ConcatTFLite", "PackReshaped", "Pack")) + self.memory_only_ops = ( + set(("Squeeze", "Reshape", "QuantizedReshape", "ExpandDims")) | self.concat_ops | self.split_ops + ) + self.supported_fused_activations = set(("Relu", "Relu6", "ReluN1To1", "Tanh", "Sigmoid")) + self.supported_operators = ( + self.npu_pre_ops | self.mac_main_ops | self.elem_wise_main_ops | self.npu_post_ops | self.memory_only_ops + ) + # Setup supported operator restriction checkers + self.supported_operator_restrictions = {} + self.supported_operator_restrictions.update( + {op: self.check_convolution_restrictions for op in self.convolution_ops} + ) + self.supported_operator_restrictions.update( + {op: self.check_depthwise_convolution_restrictions for op in self.depthwise_convolution_ops} + ) + 
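+        # Each update() call above and below registers a per-category restriction
+        # checker; is_operator_supported() runs the generic checks first and then
+        # dispatches to the matching checker for the operator type.
+        #
+        # Usage sketch (hypothetical caller, not part of this module):
+        #     sup = SupportedOperators()
+        #     npu_ops = [op for op in all_ops if sup.is_operator_supported(op)]
+        # Operators that fail a check are expected to stay on the CPU.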
self.supported_operator_restrictions.update({op: self.check_pooling_restrictions for op in self.pooling_ops}) + self.supported_operator_restrictions.update( + {op: self.check_vector_product_restrictions for op in self.fc_vector_products} + ) + self.supported_operator_restrictions.update( + {op: self.check_element_wise_restrictions for op in self.elem_wise_main_ops} + ) + self.supported_operator_restrictions.update( + {op: self.check_memory_only_restrictions for op in self.memory_only_ops} + ) + + def is_operator_supported(self, op): + if op.type not in self.supported_operators: + return False + if not self.check_generic_restrictions(op): + return False + if op.type in self.supported_operator_restrictions: + return self.supported_operator_restrictions[op.type](op) + return True + + def check_generic_restrictions(self, op): + # check fully defined shapes + for t in op.inputs + op.outputs: + if not t.has_fully_defined_shape(): + print("Warning:", op, "has inputs/outputs of undefined shape, placing on CPU") + return False + + # check data type + tensors = [t for t in op.get_ifm_ifm2_weights_ofm() if t is not None] + if not tensors: + tensors = op.inputs + for t in tensors: + if not (t.dtype.type & BaseType.Int): + return False + if t.element_size() > 2 and op.type != "Requantize": + return False + # check size + if any(dim > 65536 for dim in t.shape): + return False + + # check fused activations + if ( + "fused_activation_function" in op.attrs + and op.attrs["fused_activation_function"] is not None + and op.attrs["fused_activation_function"] not in self.supported_fused_activations + ): + return False + return True + + def check_convolution_restrictions(self, op): + # check stride + if op.attrs["stride_w"] > 2 or op.attrs["stride_h"] > 2: + return False + + # check dilation + dilation_w_factor = op.attrs.get("dilation_w_factor", 1) + dilation_h_factor = op.attrs.get("dilation_h_factor", 1) + if dilation_w_factor > 2 or dilation_h_factor > 2: + return False + + # check data type + ifm_tensor, _, weight_tensor, _ = op.get_ifm_ifm2_weights_ofm() + if weight_tensor.element_size() > 1: + return False + + # check kernel size + dilated_weight_w = weight_tensor.shape[0] + (weight_tensor.shape[0] - 1) * (dilation_w_factor - 1) + dilated_weight_h = weight_tensor.shape[1] + (weight_tensor.shape[1] - 1) * (dilation_h_factor - 1) + if ( + dilated_weight_w > 64 + or dilated_weight_h > 64 + or dilated_weight_w * dilated_weight_h * weight_tensor.shape[2] > 127 * 65536 + ): + return False + + # check batch size + if ifm_tensor.shape[0] != 1: + return False + return True + + def check_depthwise_convolution_restrictions(self, op): + # check depth + ifm_tensor, _, _, ofm_tensor = op.get_ifm_ifm2_weights_ofm() + if op.attrs["depth_multiplier"] > 1 and not ( + (ifm_tensor.shape[3] == 1) and (ofm_tensor.shape[3] == op.attrs["depth_multiplier"]) + ): + return False + return self.check_convolution_restrictions(op) + + def check_pooling_restrictions(self, op): + # check stride + if op.attrs["stride_w"] > 2 or op.attrs["stride_h"] > 2: + return False + + # check data type + ifm_tensor, _, _, ofm_tensor = op.get_ifm_ifm2_weights_ofm() + if ifm_tensor.dtype != ofm_tensor.dtype: + return False + + # check batch size + if ifm_tensor.shape[0] != 1: + return False + + if op.type in self.avg_pooling_ops: + # check kernel size + if op.attrs["padding"] == b"SAME" and (op.attrs["filter_width"] > 8 or op.attrs["filter_height"] > 8): + return False + if op.attrs["padding"] == b"VALID" and (op.attrs["filter_width"] > 256 or 
op.attrs["filter_height"] > 256): + return False + + if op.type in self.max_pooling_ops: + # check data type + if not ifm_tensor.dtype == ofm_tensor.dtype: + return False + # check kernel size + if op.attrs["filter_width"] > 256 or op.attrs["filter_height"] > 256: # any padding + return False + return True + + def check_vector_product_restrictions(self, op): + # check data type + ifm_tensor, _, weight_tensor, _ = op.get_ifm_ifm2_weights_ofm() + if weight_tensor.element_size() > 1: + return False + + return True + + def check_element_wise_restrictions(self, op): + # check data type + ifm_tensor, ifm2_tensor, _, ofm_tensor = op.get_ifm_ifm2_weights_ofm() + if op.type in ("Minimum", "Maximum") and ifm_tensor.dtype != ofm_tensor.dtype: + return False + + # check batch size + if (len(ifm_tensor.shape) > 2 and ifm_tensor.shape[0] != 1) or ( + len(ifm2_tensor.shape) > 2 and ifm2_tensor.shape[0] != 1 + ): + return False + + # check scalar size + if (hasattr(ifm_tensor.values, "__len__") and len(ifm_tensor.values) > 1) or ( + hasattr(ifm2_tensor.values, "__len__") and len(ifm2_tensor.values) > 1 + ): + return False + return True + + def check_memory_only_restrictions(self, op): + # check stride size + if op.type == "StridedSlice": + if len(op.inputs) > 3 and any(stride != 1 for stride in op.inputs[3].values): + return False + return True diff --git a/ethosu/vela/tensor.py b/ethosu/vela/tensor.py new file mode 100644 index 00000000..46040a46 --- /dev/null +++ b/ethosu/vela/tensor.py @@ -0,0 +1,629 @@ +# Copyright (C) 2020 Arm Limited or its affiliates. All rights reserved. +# +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +# Description: +# Internal representation of a Neural Network Tensor. + +import enum +from . import numeric_util +import numpy as np +from . 
import data_type
+import uuid
+from .range_set import MemoryRangeSet
+from .numeric_util import round_up_divide
+
+
+class MemArea(enum.IntFlag):
+    Unknown = 0
+    Sram = 1
+    Dram = 2
+    OnChipFlash = 3
+    OffChipFlash = 4
+    Size = OffChipFlash + 1
+
+    def display_name(self):
+        return ("Unknown", "SRAM", "DRAM", "On-chip Flash", "Off-chip Flash", "Size")[self.value]
+
+    def identifier_name(self):
+        return ("unknown", "sram", "dram", "on_chip_flash", "off_chip_flash", "size")[self.value]
+
+    def all():
+        return (MemArea.Sram, MemArea.Dram, MemArea.OnChipFlash, MemArea.OffChipFlash)
+
+    def __str__(self):
+        return self.name
+
+
+class TensorPurpose(enum.IntFlag):
+    Unknown = 0
+    Weights = 1
+    FeatureMap = 2
+    Scratch = 3
+    Size = 4
+
+    def display_name(self):
+        return ("Unknown", "Weights", "FeatureMap", "Scratch", "Size")[self.value]
+
+    def identifier_name(self):
+        return ("unknown", "weights", "feature_map", "scratch", "size")[self.value]
+
+    def all():
+        return (TensorPurpose.Weights, TensorPurpose.FeatureMap)
+
+
+class TensorSubPurpose(enum.Enum):
+    Standard = 0
+    DoubleBuffer = 1
+    RollingBufferX = 2
+    RollingBufferY = 3
+    RollingBufferXY = 4
+
+    def display_name(self):
+        return ("Standard", "Double Buffer", "Rolling Buffer X", "Rolling Buffer Y", "Rolling Buffer XY")[self.value]
+
+    def identifier_name(self):
+        return ("standard", "double_buffer", "rolling_buffer_x", "rolling_buffer_y", "rolling_buffer_xy")[self.value]
+
+    def all():
+        return (
+            TensorSubPurpose.Standard,
+            TensorSubPurpose.DoubleBuffer,
+            TensorSubPurpose.RollingBufferX,
+            TensorSubPurpose.RollingBufferY,
+            TensorSubPurpose.RollingBufferXY,
+        )
+
+
+class TensorFormat(enum.Flag):
+    Unknown = 0
+    WeightsCompressed = 1
+    NHWC = 2
+    NHCWB16 = 3
+
+    def __str__(self):
+        return self.name
+
+
+class TensorBlockTraversal(enum.Enum):
+    Default = 0
+    DepthWise = 1
+    DepthFirst = 2
+    PartKernelFirst = 3
+
+
+def shape_num_elements(shp):
+    elems = 1
+    if shp is None:
+        return None
+    for d in shp:
+        if d is None:
+            return None
+        elems *= d
+    return elems
+
+
+def shape_fully_defined(shp):
+    if shp is None:
+        return False
+    for d in shp:
+        if d is None:
+            return False
+    return True
+
+
+def shape_round_to_quantum(shp, quantum):
+    new_shp = list(shp)
+
+    # Traverse backwards using length of shape since there may be more rounding quantums than shape elements
+    for i in range(-1, -len(shp) - 1, -1):
+        if new_shp[i] is not None:
+            new_shp[i] = numeric_util.round_up(new_shp[i], quantum[i])
+    return new_shp
+
+
+class QuantizationParameters:
+    __slots__ = "min", "max", "num_bits", "narrow_range", "scale_f32", "zero_point", "quant_min", "quant_max"
+
+    def __init__(self, min=None, max=None, num_bits=None, narrow_range=None):
+        self.min = min
+        self.max = max
+
+        self.num_bits = num_bits
+        self.narrow_range = narrow_range
+
+        self.scale_f32 = None
+        self.zero_point = None
+        self.quant_min = None
+        self.quant_max = None
+
+    def __str__(self):
+        return "<QuantizationParameters min=%s max=%s, num_bits=%s, scale=%s, zero_point=%s>" % (
+            self.min,
+            self.max,
+            self.num_bits,
+            self.scale_f32,
+            self.zero_point,
+        )
+
+    __repr__ = __str__
+
+    def clone(self):
+        res = QuantizationParameters()
+        res.min = self.min
+        res.max = self.max
+
+        res.num_bits = self.num_bits
+        res.narrow_range = self.narrow_range
+
+        res.scale_f32 = self.scale_f32
+        res.zero_point = self.zero_point
+        res.quant_min = self.quant_min
+        res.quant_max = self.quant_max
+        return res
+
+    def dequantize(self, values):
+        if self.zero_point.size == 1 and self.scale_f32.size == 1:
+            # same scale is used for all values
+            res = (values.astype(np.float64) - 
self.zero_point) * self.scale_f32 + else: + # a different scale is used for different sets of values + values_as_float = values.astype(np.float64) + + # this is not compatible with the format of depthwise weights, + # where input is at index 3 (Output, Kh, Kw, Input) + # return the quantized values + return np.ndarray((values_as_float.shape)) + + shape = values_as_float.shape[0] + assert self.zero_point.size == self.scale_f32.size == shape + res = np.ndarray(values_as_float.shape) + for i in range(shape): + res[i] = (values_as_float[i] - self.zero_point[i]) * self.scale_f32[i] + + return res + + +class Tensor: + __slots__ = ( + "shape", + "storage_shape", + "bandwidth_shape", + "dtype", + "name", + "ops", + "consumer_list", + "values", + "quant_values", + "compressed_values", + "mem_area", + "format", + "purpose", + "sub_purpose", + "alignment", + "weight_transpose_depthwise", + "storage_compression_scale", + "bandwidth_compression_scale", + "compression_scale_for_worst_weight_stream", + "weight_compression_scales", + "weight_compression_config", + "storage_rounding_quantum", + "brick_size", + "address", + "quantization", + "weight_compressed_offsets", + "element_size_bytes", + "reshaped", + "block_traversal", + "offset", + "cpu_tensor", + "npu_tensor", + "equivalence_id", + ) + AllocationQuantum = 16 + + def __init__(self, shape, dtype, name): + self.shape = shape + self.storage_shape = shape + self.bandwidth_shape = shape + self.dtype = dtype + self.name = name + self.equivalence_id = uuid.uuid4() + + self.ops = [] + self.consumer_list = [] + # Below attributes are only set if a tensor has been cloned, + # either from Cpu -> Npu or vice versa. Needed for offline allocation + self.cpu_tensor = None # reference to the corresponding Cpu tensor + self.npu_tensor = None # reference to the corresponding Npu tensor + + self.values = None + self.quant_values = None + self.compressed_values = None + self.mem_area = MemArea.Unknown + self.format = TensorFormat.Unknown + self.purpose = TensorPurpose.Unknown + self.sub_purpose = TensorSubPurpose.Standard + self.alignment = Tensor.AllocationQuantum + self.weight_transpose_depthwise = False + + self.storage_compression_scale = 1.0 + self.bandwidth_compression_scale = 1.0 + self.compression_scale_for_worst_weight_stream = 1.0 + self.weight_compression_scales = None + self.weight_compression_config = None + self.weight_compressed_offsets = [] + self.storage_rounding_quantum = (1, 1, 1, 1) + self.brick_size = (1, 1, 1, 1) + self.address = 0 # start address of tensor. 
will be filled in by tensor allocator + self.element_size_bytes = 0 + + # quantization parameters + self.quantization = None + + self.reshaped = False + self.block_traversal = TensorBlockTraversal.Default + + def element_size(self): + if self.element_size_bytes == 0: + return self.dtype.size_in_bits() / 8 + return self.element_size_bytes + + def clone(self, suffix="_clone"): + res = Tensor(self.shape, self.dtype, self.name + suffix) + res.storage_shape = list(self.storage_shape) + res.bandwidth_shape = list(self.bandwidth_shape) + + res.ops = [] + res.consumer_list = [] + res.equivalence_id = self.equivalence_id + + res.values = self.values + res.quant_values = self.quant_values + res.compressed_values = self.compressed_values + res.mem_area = self.mem_area + res.format = self.format + res.purpose = self.purpose + res.sub_purpose = self.sub_purpose + res.alignment = self.alignment + res.weight_transpose_depthwise = self.weight_transpose_depthwise + + res.storage_compression_scale = self.storage_compression_scale + res.bandwidth_compression_scale = self.bandwidth_compression_scale + res.compression_scale_for_worst_weight_stream = self.compression_scale_for_worst_weight_stream + res.weight_compression_scales = self.weight_compression_scales + res.storage_rounding_quantum = self.storage_rounding_quantum + res.brick_size = self.brick_size + res.address = 0 + + if self.quantization is not None: + res.quantization = self.quantization.clone() + else: + res.quantization = None + + return res + + def clone_into_fast_storage(self, arch): + res = self.clone(suffix="_fast_storage") + res.mem_area = arch.fast_storage_mem_area + return res + + def set_format(self, fmt, arch): + self.format = fmt + shape_len = 0 + try: + shape_len = len(self.shape) + except TypeError: + pass + + self.storage_rounding_quantum = arch.storage_rounding_quantums[self.format] + self.storage_rounding_quantum = self.storage_rounding_quantum[-shape_len:] + if self.format == TensorFormat.NHCWB16: + self.storage_rounding_quantum = self.storage_rounding_quantum[:-1] + ( + int(self.storage_rounding_quantum[-1] / self.dtype.size_in_bytes()), + ) + self.brick_size = arch.brick_sizes[self.format] + self.brick_size = self.brick_size[-shape_len:] + if self.shape is None: + return + + self.bandwidth_shape = shape_round_to_quantum(self.shape, self.brick_size) + self.storage_shape = shape_round_to_quantum(self.shape, self.storage_rounding_quantum) + + if fmt == TensorFormat.WeightsCompressed: + compression_ratio = 5 / 8 + self.storage_compression_scale = compression_ratio + self.bandwidth_compression_scale = compression_ratio + self.compression_scale_for_worst_weight_stream = compression_ratio + + def storage_elements(self): + elems = shape_num_elements(self.storage_shape) + if elems is None: + return 0 + return elems + + def elements(self): + elems = shape_num_elements(self.shape) + if elems is None: + return 0 + return elems + + def has_fully_defined_shape(self): + return shape_fully_defined(self.shape) + + def storage_size(self): + raw_size = self.storage_elements() * self.element_size() + if raw_size == 0: + raw_size = 1 # force it to take up space + rounded_size = numeric_util.round_up(numeric_util.round_up_to_int(raw_size), self.alignment) + return rounded_size + + def storage_size_for_sub_purpose(self, sub_purpose, param_a=None, param_b=None): + alt_shape = self.storage_shape_for_sub_purpose(sub_purpose, param_a, param_b) + elems = shape_num_elements(alt_shape) + if elems is None: + return 0 + if sub_purpose == 
TensorSubPurpose.DoubleBuffer: + raw_size = elems * self.element_size() * self.compression_scale_for_worst_weight_stream + else: + raw_size = elems * self.element_size() * self.storage_compression_scale + rounded_size = numeric_util.round_up(numeric_util.round_up_to_int(raw_size), self.alignment) + return rounded_size + + def storage_shape_for_sub_purpose(self, sub_purpose, param_a, param_b): + shp = list(self.storage_shape) + if sub_purpose == TensorSubPurpose.DoubleBuffer: + assert len(shp) >= 2 + shp[-1] = min(shp[-1], param_a * 2) + elif sub_purpose == TensorSubPurpose.RollingBufferX: + assert len(shp) == 4 + shp[0] = 1 + shp[2] = min(shp[2], param_a) + elif sub_purpose == TensorSubPurpose.RollingBufferY: + assert len(shp) == 4 + shp[0] = 1 + shp[1] = min(shp[1], param_a) + elif sub_purpose == TensorSubPurpose.RollingBufferXY: + assert len(shp) == 4 + shp[0] = 1 + shp[2] = min(shp[2], param_a) + shp[1] = min(shp[1], param_b) + elif sub_purpose == TensorSubPurpose.Standard: + pass + else: + assert 0, "did not expect new sub purpose %s" % (sub_purpose,) + return shp + + def set_new_sub_purpose(self, sub_purpose, param_a=None, param_b=None): + self.storage_shape = self.storage_shape_for_sub_purpose(sub_purpose, param_a, param_b) + self.sub_purpose = sub_purpose + if sub_purpose == TensorSubPurpose.DoubleBuffer: + self.storage_compression_scale = self.compression_scale_for_worst_weight_stream + + def bandwidth(self): + elems = shape_num_elements(self.bandwidth_shape) + if elems is None: + return 0 + return elems * self.element_size() * self.bandwidth_compression_scale + + def consumers(self): + return self.consumer_list + + def get_address_ranges_for_coordinates(self, start_coord, end_coord): + if self.sub_purpose in set( + (TensorSubPurpose.RollingBufferX, TensorSubPurpose.RollingBufferY, TensorSubPurpose.RollingBufferXY) + ): + # build dummy coordinates that cover the entire buffer + start_coord = [0] * len(start_coord) + end_coord = [min(self.storage_shape[i], self.shape[i]) for i in range(len(end_coord))] + + start = self.address_for_coordinate(start_coord, is_top_box=False) + end = self.address_for_coordinate(end_coord, is_top_box=True) + return MemoryRangeSet(self.mem_area, start, end) + + def addresses_for_rolling_buffer(self, start_coord, end_coord): + # returns ( box_height0, box_height1, box_width, [address_tl, address_tr, address_bl, address_br] ) + + if len(start_coord) < 4: + box_height0 = 1 + box_width = 1 + + if len(start_coord) >= 2: + box_width = end_coord[-2] - start_coord[-2] + + return box_height0, box_height0, box_width, [self.address_for_coordinate(start_coord), None, None, None] + + crossing_y = numeric_util.round_up(start_coord[1] + 1, self.storage_shape[1]) + crossing_x = numeric_util.round_up(start_coord[2] + 1, self.storage_shape[2]) + + crossing_y = min(crossing_y, end_coord[1]) + crossing_x = min(crossing_x, end_coord[2]) + + box_height0 = crossing_y - start_coord[1] + box_width = crossing_x - start_coord[2] + + addresses = [None] * 4 + addresses[0] = self.address_for_coordinate(start_coord) + + if end_coord[2] > crossing_x: + addresses[1] = self.address_for_coordinate([start_coord[0], start_coord[1], crossing_x, start_coord[3]]) + raise Exception("Striping in vertical direction is not supported") + if end_coord[1] > crossing_y: + addresses[2] = self.address_for_coordinate([start_coord[0], crossing_y, start_coord[2], start_coord[3]]) + if end_coord[1] > crossing_y and end_coord[2] > crossing_x: + addresses[3] = self.address_for_coordinate([start_coord[0], 
crossing_y, crossing_x, start_coord[3]]) + + return box_height0, box_height0, box_width, addresses + + def address_for_coordinate(self, coord, is_top_box=False): + return self.address + self.address_offset_for_coordinate(coord, is_top_box) + + def get_strides_and_coord(self, coord=None): + if coord is None: + coord = [0] * len(self.storage_shape) + + augmented_coord = coord + augmented_shape = self.storage_shape + while len(augmented_shape) < 4: + augmented_shape = [1] + augmented_shape + + while len(augmented_coord) < 4: + augmented_coord = [0] + augmented_coord + + assert len(augmented_coord) == len(augmented_shape) + + if self.format == TensorFormat.NHWC: + augmented_shape = [augmented_shape[0], augmented_shape[3]] + augmented_shape[1:3] + [1] + augmented_coord = [augmented_coord[0], augmented_coord[3]] + augmented_coord[1:3] + [0] + stride_order = [4, 1, 3, 2, 0] + + elif self.format == TensorFormat.NHCWB16: + channel_divisor = int(16 / self.element_size()) + augmented_shape = augmented_shape[0:4] + [1] + augmented_coord = ( + [augmented_coord[0], augmented_coord[3] // channel_divisor] + + augmented_coord[1:3] + + [augmented_coord[3] % channel_divisor] + ) + + if augmented_shape[1] == 0: + augmented_shape[1] = 1 + + else: + assert self.format in set((TensorFormat.Unknown, TensorFormat.WeightsCompressed)) + return None, None + + strides = [0] * len(augmented_shape) + stride = self.element_size() * self.storage_compression_scale + + if self.format != TensorFormat.NHCWB16: + for i in stride_order: + strides[i] = stride + stride *= augmented_shape[i] + else: + assert len(strides) == 5 + channel_divisor = int(16 / self.element_size()) + strides[4] = stride + strides[3] = channel_divisor # STRIDE_X + strides[1] = strides[3] * augmented_shape[2] # STRIDE_C + strides[2] = augmented_shape[2] * augmented_shape[3] # STRIDE_Y + strides[0] = strides[2] * augmented_shape[1] # STRIDE_N + + return strides, augmented_coord + + def get_strides(self): + strides, _ = self.get_strides_and_coord() + + return strides + + def compressed_stream_index_from_coord(self, coord): + assert self.format == TensorFormat.WeightsCompressed + assert len(self.compressed_values) > 0 + assert len(self.compressed_values) + 1 == len(self.weight_compressed_offsets) + + depth = coord[-1] + brick_depth = self.brick_size[-1] + # Clamp position at final element index + if depth > self.shape[-1]: + depth = self.shape[-1] + + # Always round up to next boundary + index = round_up_divide(depth, brick_depth) + + # Check boundaries on all but last weight set (which may be shorter + # than the brick we divided it up into) + if index < len(self.weight_compressed_offsets) - 1: + # There are no half-way points in the weights + if (depth % brick_depth) != 0: + raise Exception("Offset into weights must be aligned to a brick") + + return index + + def size_of_compressed_stream(self, index): + assert 0 <= index < len(self.compressed_values) + return len(self.compressed_values[index]) + + def is_last_index_in_compressed_stream(self, index): + assert 0 <= index < len(self.compressed_values) + return index == len(self.compressed_values) - 1 + + def address_offset_for_coordinate(self, orig_coord, is_top_box=False): + address_offset = 0 + coord = orig_coord + + coord = coord[-len(self.storage_shape) :] + + if self.sub_purpose == TensorSubPurpose.Standard: + for idx, c in enumerate(coord): + if is_top_box: + assert c > 0 and c <= self.shape[idx] + else: + assert c >= 0 and c < self.shape[idx] + + if self.format == TensorFormat.WeightsCompressed: + if 
len(self.weight_compressed_offsets) == 0:
+                return 0
+
+            if len(self.ops) == 1 and self.ops[0].type == "DMA" and self.sub_purpose == TensorSubPurpose.DoubleBuffer:
+                depth = orig_coord[-1]
+                brick_depth = self.brick_size[-1]
+                # Clamp position at final element index
+                if depth > self.shape[-1]:
+                    depth = self.shape[-1]
+
+                # Always round up to next boundary
+                index = round_up_divide(depth, brick_depth)
+                index = index % 2
+
+                if len(self.compressed_values) <= 2:
+                    if is_top_box and index == 0:
+                        for cv in self.compressed_values:
+                            address_offset += len(cv)
+                    else:
+                        address_offset = index * len(self.compressed_values[0])
+                else:
+                    if is_top_box and index == 0:
+                        address_offset = self.storage_shape[-1]
+                    else:
+                        address_offset = index * (self.storage_shape[-1] // 2)
+            else:
+                index = self.compressed_stream_index_from_coord(orig_coord)
+                assert index < len(self.weight_compressed_offsets)
+                address_offset = self.weight_compressed_offsets[index]
+        else:
+            if is_top_box:
+                coord = [c - 1 for c in coord]
+
+            # handle wraparound for partial buffers. make sure to do this after subtracting top box:
+            coord = [c % self.storage_shape[idx] for idx, c in enumerate(coord)]
+
+            strides, augmented_coord = self.get_strides_and_coord(coord)
+            if strides is None:
+                return None
+
+            if is_top_box:
+                address_offset += 1 * strides[-1]  # one element
+
+            address_offset += np.dot(augmented_coord, strides)
+
+        assert address_offset >= 0
+        assert address_offset <= self.storage_size()
+        return address_offset
+
+    def __str__(self):
+        return "<Tensor %s shape=%s dtype=%s>" % (self.name, self.shape, self.dtype)
+
+    __repr__ = __str__
diff --git a/ethosu/vela/tensor_allocation.py b/ethosu/vela/tensor_allocation.py
new file mode 100644
index 00000000..94aa6088
--- /dev/null
+++ b/ethosu/vela/tensor_allocation.py
@@ -0,0 +1,139 @@
+# Copyright (C) 2020 Arm Limited or its affiliates. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the License); you may
+# not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an AS IS BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+# Description:
+# Wrapping function to do tensor address allocation. That is, assigning addresses to tensors based on what has been
+# worked out from the allowable overlaps that are calculated by the live range analysis.
+
+from . import live_range
+from .tensor import MemArea
+import math
+from . 
import numeric_util +import numpy as np +from .nn_graph import TensorAllocator, PassPlacement + +from .greedy_allocation import allocate_live_ranges as greedy_allocate_live_ranges + + +def linear_allocate_live_ranges(live_ranges, alloc_granularity=256): + total_sz = 0 + allocated_tensors = [] + + # just assign increasing addresses + for tens, lr in live_ranges.ranges.items(): + if tens in allocated_tensors: + continue + + lr.set_address(total_sz) + allocated_tensors += lr.tensors + total_sz += numeric_util.round_up(int(math.ceil(lr.size)), alloc_granularity) + + return total_sz + + +def mark_sram_used_for_cascaded_passes(sg, lrs): + end_pos = max(ps.time for ps in sg.cascaded_passes) + 2 + mem_usage = np.zeros(end_pos, dtype=np.int64) + + for tens, rng in lrs.ranges.items(): + storage_size = tens.storage_size() + mem_usage[rng.start_time : rng.end_time] += storage_size + + for cps in sg.cascaded_passes: + sram_used = max(mem_usage[cps.time], mem_usage[cps.time + 1]) + cps.sram_used = sram_used + for ps in cps.passes: + ps.sram_used = sram_used + + +def print_allocation(lrs, mem_area, sg, verbose_allocation, show_minimum_possible_allocation): + if verbose_allocation: + if mem_area == MemArea.Sram: + print("allocation for", mem_area, "- non-constant tensors in Cpu and Npu subgraphs") + else: + print("allocation for", mem_area, "- constant tensors in", sg.placement.name, "subgraph(s)") + for start_time, start, end, name, end_time in sorted( + ( + lr.start_time, + tens.address, + tens.address + int(math.ceil(tens.storage_size())), + tens.name + " " + str(tens.purpose), + lr.end_time, + ) + for tens, lr in lrs.ranges.items() + ): + name = name.replace("\x00", "") + print("%9d: %#12x - %#12x: %3d - %3d %s" % ((end - start), start, end, start_time, end_time, name)) + print() + + if show_minimum_possible_allocation and mem_area == MemArea.Sram: + min_possible_allocation = max(cps.sram_used for cps in sg.cascaded_passes) + print( + "Min possible allocation %d bytes / %.1f KB / %.1f MB" + % (min_possible_allocation, min_possible_allocation / 1024, min_possible_allocation / 1024 / 1024) + ) + + +def allocate_tensors( + nng, + sg, + arch, + mem_area, + use_ifm_ofm_overlap=True, + tensor_allocator=TensorAllocator.Greedy, + verbose_allocation=False, + show_minimum_possible_allocation=False, + lr_graph=None, +): + ignore_subgraph_input_output_tensors = False + lrs = live_range.extract_live_ranges_from_cascaded_passes( + sg, + mem_area, + mark_output_tensors_overlapping_with_input_tensors=False, + use_ifm_ofm_overlap=use_ifm_ofm_overlap, + ignore_subgraph_input_output_tensors=ignore_subgraph_input_output_tensors, + lr_graph=lr_graph, + ) + + if lrs.ranges: + tens_alloc = tensor_allocator + if tens_alloc == TensorAllocator.Greedy: + total_sz = greedy_allocate_live_ranges(sg, arch, lrs, mem_area, verbose_allocation) + elif tens_alloc == TensorAllocator.LinearAlloc: + total_sz = linear_allocate_live_ranges(lrs) + else: + assert 0 + + sg.memory_used[mem_area] = total_sz + + nng.total_size[mem_area] = nng.total_size.get(mem_area, 0) + sum(tens.storage_size() for tens in lrs.ranges) + nng.total_elements[mem_area] = nng.total_elements.get(mem_area, 0) + sum(tens.elements() for tens in lrs.ranges) + + print_allocation(lrs, mem_area, sg, verbose_allocation, show_minimum_possible_allocation) + + if mem_area == MemArea.Sram: + # Mark Sram usage for all subgraphs + for sg_ in nng.subgraphs: + mark_sram_used_for_cascaded_passes(sg_, lrs) + + if sg == nng.get_root_subgraph(): + nng.memory_used = sg.memory_used + for 
mem_area in nng.total_elements.keys(): + try: + nng.bits_per_element[mem_area] = nng.total_size[mem_area] * 8 / nng.total_elements[mem_area] + except ZeroDivisionError: + nng.bits_per_element[mem_area] = 0.0 diff --git a/ethosu/vela/tflite/AbsOptions.py b/ethosu/vela/tflite/AbsOptions.py new file mode 100644 index 00000000..0cbfb8c0 --- /dev/null +++ b/ethosu/vela/tflite/AbsOptions.py @@ -0,0 +1,22 @@ +# automatically generated by the FlatBuffers compiler, do not modify + +# namespace: tflite + +import flatbuffers + +class AbsOptions(object): + __slots__ = ['_tab'] + + @classmethod + def GetRootAsAbsOptions(cls, buf, offset): + n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset) + x = AbsOptions() + x.Init(buf, n + offset) + return x + + # AbsOptions + def Init(self, buf, pos): + self._tab = flatbuffers.table.Table(buf, pos) + +def AbsOptionsStart(builder): builder.StartObject(0) +def AbsOptionsEnd(builder): return builder.EndObject() diff --git a/ethosu/vela/tflite/ActivationFunctionType.py b/ethosu/vela/tflite/ActivationFunctionType.py new file mode 100644 index 00000000..6d8ec952 --- /dev/null +++ b/ethosu/vela/tflite/ActivationFunctionType.py @@ -0,0 +1,11 @@ +# automatically generated by the FlatBuffers compiler, do not modify + +# namespace: tflite + +class ActivationFunctionType(object): + NONE = 0 + RELU = 1 + RELU_N1_TO_1 = 2 + RELU6 = 3 + TANH = 4 + SIGN_BIT = 5 diff --git a/ethosu/vela/tflite/AddNOptions.py b/ethosu/vela/tflite/AddNOptions.py new file mode 100644 index 00000000..b5c2ddb7 --- /dev/null +++ b/ethosu/vela/tflite/AddNOptions.py @@ -0,0 +1,22 @@ +# automatically generated by the FlatBuffers compiler, do not modify + +# namespace: tflite + +import flatbuffers + +class AddNOptions(object): + __slots__ = ['_tab'] + + @classmethod + def GetRootAsAddNOptions(cls, buf, offset): + n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset) + x = AddNOptions() + x.Init(buf, n + offset) + return x + + # AddNOptions + def Init(self, buf, pos): + self._tab = flatbuffers.table.Table(buf, pos) + +def AddNOptionsStart(builder): builder.StartObject(0) +def AddNOptionsEnd(builder): return builder.EndObject() diff --git a/ethosu/vela/tflite/AddOptions.py b/ethosu/vela/tflite/AddOptions.py new file mode 100644 index 00000000..d6cbfcf5 --- /dev/null +++ b/ethosu/vela/tflite/AddOptions.py @@ -0,0 +1,30 @@ +# automatically generated by the FlatBuffers compiler, do not modify + +# namespace: tflite + +import flatbuffers + +class AddOptions(object): + __slots__ = ['_tab'] + + @classmethod + def GetRootAsAddOptions(cls, buf, offset): + n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset) + x = AddOptions() + x.Init(buf, n + offset) + return x + + # AddOptions + def Init(self, buf, pos): + self._tab = flatbuffers.table.Table(buf, pos) + + # AddOptions + def FusedActivationFunction(self): + o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4)) + if o != 0: + return self._tab.Get(flatbuffers.number_types.Int8Flags, o + self._tab.Pos) + return 0 + +def AddOptionsStart(builder): builder.StartObject(1) +def AddOptionsAddFusedActivationFunction(builder, fusedActivationFunction): builder.PrependInt8Slot(0, fusedActivationFunction, 0) +def AddOptionsEnd(builder): return builder.EndObject() diff --git a/ethosu/vela/tflite/ArgMaxOptions.py b/ethosu/vela/tflite/ArgMaxOptions.py new file mode 100644 index 00000000..fbf1415e --- /dev/null +++ b/ethosu/vela/tflite/ArgMaxOptions.py @@ -0,0 +1,30 @@ +# automatically generated by the FlatBuffers 
compiler, do not modify + +# namespace: tflite + +import flatbuffers + +class ArgMaxOptions(object): + __slots__ = ['_tab'] + + @classmethod + def GetRootAsArgMaxOptions(cls, buf, offset): + n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset) + x = ArgMaxOptions() + x.Init(buf, n + offset) + return x + + # ArgMaxOptions + def Init(self, buf, pos): + self._tab = flatbuffers.table.Table(buf, pos) + + # ArgMaxOptions + def OutputType(self): + o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4)) + if o != 0: + return self._tab.Get(flatbuffers.number_types.Int8Flags, o + self._tab.Pos) + return 0 + +def ArgMaxOptionsStart(builder): builder.StartObject(1) +def ArgMaxOptionsAddOutputType(builder, outputType): builder.PrependInt8Slot(0, outputType, 0) +def ArgMaxOptionsEnd(builder): return builder.EndObject() diff --git a/ethosu/vela/tflite/ArgMinOptions.py b/ethosu/vela/tflite/ArgMinOptions.py new file mode 100644 index 00000000..120fdca2 --- /dev/null +++ b/ethosu/vela/tflite/ArgMinOptions.py @@ -0,0 +1,30 @@ +# automatically generated by the FlatBuffers compiler, do not modify + +# namespace: tflite + +import flatbuffers + +class ArgMinOptions(object): + __slots__ = ['_tab'] + + @classmethod + def GetRootAsArgMinOptions(cls, buf, offset): + n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset) + x = ArgMinOptions() + x.Init(buf, n + offset) + return x + + # ArgMinOptions + def Init(self, buf, pos): + self._tab = flatbuffers.table.Table(buf, pos) + + # ArgMinOptions + def OutputType(self): + o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4)) + if o != 0: + return self._tab.Get(flatbuffers.number_types.Int8Flags, o + self._tab.Pos) + return 0 + +def ArgMinOptionsStart(builder): builder.StartObject(1) +def ArgMinOptionsAddOutputType(builder, outputType): builder.PrependInt8Slot(0, outputType, 0) +def ArgMinOptionsEnd(builder): return builder.EndObject() diff --git a/ethosu/vela/tflite/BatchToSpaceNDOptions.py b/ethosu/vela/tflite/BatchToSpaceNDOptions.py new file mode 100644 index 00000000..3ddcfd3f --- /dev/null +++ b/ethosu/vela/tflite/BatchToSpaceNDOptions.py @@ -0,0 +1,22 @@ +# automatically generated by the FlatBuffers compiler, do not modify + +# namespace: tflite + +import flatbuffers + +class BatchToSpaceNDOptions(object): + __slots__ = ['_tab'] + + @classmethod + def GetRootAsBatchToSpaceNDOptions(cls, buf, offset): + n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset) + x = BatchToSpaceNDOptions() + x.Init(buf, n + offset) + return x + + # BatchToSpaceNDOptions + def Init(self, buf, pos): + self._tab = flatbuffers.table.Table(buf, pos) + +def BatchToSpaceNDOptionsStart(builder): builder.StartObject(0) +def BatchToSpaceNDOptionsEnd(builder): return builder.EndObject() diff --git a/ethosu/vela/tflite/BidirectionalSequenceLSTMOptions.py b/ethosu/vela/tflite/BidirectionalSequenceLSTMOptions.py new file mode 100644 index 00000000..8d8b7bea --- /dev/null +++ b/ethosu/vela/tflite/BidirectionalSequenceLSTMOptions.py @@ -0,0 +1,62 @@ +# automatically generated by the FlatBuffers compiler, do not modify + +# namespace: tflite + +import flatbuffers + +class BidirectionalSequenceLSTMOptions(object): + __slots__ = ['_tab'] + + @classmethod + def GetRootAsBidirectionalSequenceLSTMOptions(cls, buf, offset): + n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset) + x = BidirectionalSequenceLSTMOptions() + x.Init(buf, n + offset) + return x + + # BidirectionalSequenceLSTMOptions + def Init(self, buf, pos): 
+ self._tab = flatbuffers.table.Table(buf, pos) + + # BidirectionalSequenceLSTMOptions + def FusedActivationFunction(self): + o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4)) + if o != 0: + return self._tab.Get(flatbuffers.number_types.Int8Flags, o + self._tab.Pos) + return 0 + + # BidirectionalSequenceLSTMOptions + def CellClip(self): + o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(6)) + if o != 0: + return self._tab.Get(flatbuffers.number_types.Float32Flags, o + self._tab.Pos) + return 0.0 + + # BidirectionalSequenceLSTMOptions + def ProjClip(self): + o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(8)) + if o != 0: + return self._tab.Get(flatbuffers.number_types.Float32Flags, o + self._tab.Pos) + return 0.0 + + # BidirectionalSequenceLSTMOptions + def MergeOutputs(self): + o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(10)) + if o != 0: + return bool(self._tab.Get(flatbuffers.number_types.BoolFlags, o + self._tab.Pos)) + return False + + # BidirectionalSequenceLSTMOptions + def TimeMajor(self): + o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(12)) + if o != 0: + return bool(self._tab.Get(flatbuffers.number_types.BoolFlags, o + self._tab.Pos)) + return True + +def BidirectionalSequenceLSTMOptionsStart(builder): builder.StartObject(5) +def BidirectionalSequenceLSTMOptionsAddFusedActivationFunction(builder, fusedActivationFunction): builder.PrependInt8Slot(0, fusedActivationFunction, 0) +def BidirectionalSequenceLSTMOptionsAddCellClip(builder, cellClip): builder.PrependFloat32Slot(1, cellClip, 0.0) +def BidirectionalSequenceLSTMOptionsAddProjClip(builder, projClip): builder.PrependFloat32Slot(2, projClip, 0.0) +def BidirectionalSequenceLSTMOptionsAddMergeOutputs(builder, mergeOutputs): builder.PrependBoolSlot(3, mergeOutputs, 0) +def BidirectionalSequenceLSTMOptionsAddTimeMajor(builder, timeMajor): builder.PrependBoolSlot(4, timeMajor, 1) +def BidirectionalSequenceLSTMOptionsEnd(builder): return builder.EndObject() diff --git a/ethosu/vela/tflite/BidirectionalSequenceRNNOptions.py b/ethosu/vela/tflite/BidirectionalSequenceRNNOptions.py new file mode 100644 index 00000000..673af6b9 --- /dev/null +++ b/ethosu/vela/tflite/BidirectionalSequenceRNNOptions.py @@ -0,0 +1,46 @@ +# automatically generated by the FlatBuffers compiler, do not modify + +# namespace: tflite + +import flatbuffers + +class BidirectionalSequenceRNNOptions(object): + __slots__ = ['_tab'] + + @classmethod + def GetRootAsBidirectionalSequenceRNNOptions(cls, buf, offset): + n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset) + x = BidirectionalSequenceRNNOptions() + x.Init(buf, n + offset) + return x + + # BidirectionalSequenceRNNOptions + def Init(self, buf, pos): + self._tab = flatbuffers.table.Table(buf, pos) + + # BidirectionalSequenceRNNOptions + def TimeMajor(self): + o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4)) + if o != 0: + return bool(self._tab.Get(flatbuffers.number_types.BoolFlags, o + self._tab.Pos)) + return False + + # BidirectionalSequenceRNNOptions + def FusedActivationFunction(self): + o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(6)) + if o != 0: + return self._tab.Get(flatbuffers.number_types.Int8Flags, o + self._tab.Pos) + return 0 + + # BidirectionalSequenceRNNOptions + def MergeOutputs(self): + o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(8)) + if o != 0: + return 
bool(self._tab.Get(flatbuffers.number_types.BoolFlags, o + self._tab.Pos)) + return False + +def BidirectionalSequenceRNNOptionsStart(builder): builder.StartObject(3) +def BidirectionalSequenceRNNOptionsAddTimeMajor(builder, timeMajor): builder.PrependBoolSlot(0, timeMajor, 0) +def BidirectionalSequenceRNNOptionsAddFusedActivationFunction(builder, fusedActivationFunction): builder.PrependInt8Slot(1, fusedActivationFunction, 0) +def BidirectionalSequenceRNNOptionsAddMergeOutputs(builder, mergeOutputs): builder.PrependBoolSlot(2, mergeOutputs, 0) +def BidirectionalSequenceRNNOptionsEnd(builder): return builder.EndObject() diff --git a/ethosu/vela/tflite/Buffer.py b/ethosu/vela/tflite/Buffer.py new file mode 100644 index 00000000..754dee3b --- /dev/null +++ b/ethosu/vela/tflite/Buffer.py @@ -0,0 +1,46 @@ +# automatically generated by the FlatBuffers compiler, do not modify + +# namespace: tflite + +import flatbuffers + +class Buffer(object): + __slots__ = ['_tab'] + + @classmethod + def GetRootAsBuffer(cls, buf, offset): + n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset) + x = Buffer() + x.Init(buf, n + offset) + return x + + # Buffer + def Init(self, buf, pos): + self._tab = flatbuffers.table.Table(buf, pos) + + # Buffer + def Data(self, j): + o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4)) + if o != 0: + a = self._tab.Vector(o) + return self._tab.Get(flatbuffers.number_types.Uint8Flags, a + flatbuffers.number_types.UOffsetTFlags.py_type(j * 1)) + return 0 + + # Buffer + def DataAsNumpy(self): + o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4)) + if o != 0: + return self._tab.GetVectorAsNumpy(flatbuffers.number_types.Uint8Flags, o) + return 0 + + # Buffer + def DataLength(self): + o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4)) + if o != 0: + return self._tab.VectorLen(o) + return 0 + +def BufferStart(builder): builder.StartObject(1) +def BufferAddData(builder, data): builder.PrependUOffsetTRelativeSlot(0, flatbuffers.number_types.UOffsetTFlags.py_type(data), 0) +def BufferStartDataVector(builder, numElems): return builder.StartVector(1, numElems, 1) +def BufferEnd(builder): return builder.EndObject() diff --git a/ethosu/vela/tflite/BuiltinOperator.py b/ethosu/vela/tflite/BuiltinOperator.py new file mode 100644 index 00000000..27136538 --- /dev/null +++ b/ethosu/vela/tflite/BuiltinOperator.py @@ -0,0 +1,131 @@ +# automatically generated by the FlatBuffers compiler, do not modify + +# namespace: tflite + +class BuiltinOperator(object): + ADD = 0 + AVERAGE_POOL_2D = 1 + CONCATENATION = 2 + CONV_2D = 3 + DEPTHWISE_CONV_2D = 4 + DEPTH_TO_SPACE = 5 + DEQUANTIZE = 6 + EMBEDDING_LOOKUP = 7 + FLOOR = 8 + FULLY_CONNECTED = 9 + HASHTABLE_LOOKUP = 10 + L2_NORMALIZATION = 11 + L2_POOL_2D = 12 + LOCAL_RESPONSE_NORMALIZATION = 13 + LOGISTIC = 14 + LSH_PROJECTION = 15 + LSTM = 16 + MAX_POOL_2D = 17 + MUL = 18 + RELU = 19 + RELU_N1_TO_1 = 20 + RELU6 = 21 + RESHAPE = 22 + RESIZE_BILINEAR = 23 + RNN = 24 + SOFTMAX = 25 + SPACE_TO_DEPTH = 26 + SVDF = 27 + TANH = 28 + CONCAT_EMBEDDINGS = 29 + SKIP_GRAM = 30 + CALL = 31 + CUSTOM = 32 + EMBEDDING_LOOKUP_SPARSE = 33 + PAD = 34 + UNIDIRECTIONAL_SEQUENCE_RNN = 35 + GATHER = 36 + BATCH_TO_SPACE_ND = 37 + SPACE_TO_BATCH_ND = 38 + TRANSPOSE = 39 + MEAN = 40 + SUB = 41 + DIV = 42 + SQUEEZE = 43 + UNIDIRECTIONAL_SEQUENCE_LSTM = 44 + STRIDED_SLICE = 45 + BIDIRECTIONAL_SEQUENCE_RNN = 46 + EXP = 47 + TOPK_V2 = 48 + SPLIT = 49 + LOG_SOFTMAX = 50 + DELEGATE = 51 + 
BIDIRECTIONAL_SEQUENCE_LSTM = 52 + CAST = 53 + PRELU = 54 + MAXIMUM = 55 + ARG_MAX = 56 + MINIMUM = 57 + LESS = 58 + NEG = 59 + PADV2 = 60 + GREATER = 61 + GREATER_EQUAL = 62 + LESS_EQUAL = 63 + SELECT = 64 + SLICE = 65 + SIN = 66 + TRANSPOSE_CONV = 67 + SPARSE_TO_DENSE = 68 + TILE = 69 + EXPAND_DIMS = 70 + EQUAL = 71 + NOT_EQUAL = 72 + LOG = 73 + SUM = 74 + SQRT = 75 + RSQRT = 76 + SHAPE = 77 + POW = 78 + ARG_MIN = 79 + FAKE_QUANT = 80 + REDUCE_PROD = 81 + REDUCE_MAX = 82 + PACK = 83 + LOGICAL_OR = 84 + ONE_HOT = 85 + LOGICAL_AND = 86 + LOGICAL_NOT = 87 + UNPACK = 88 + REDUCE_MIN = 89 + FLOOR_DIV = 90 + REDUCE_ANY = 91 + SQUARE = 92 + ZEROS_LIKE = 93 + FILL = 94 + FLOOR_MOD = 95 + RANGE = 96 + RESIZE_NEAREST_NEIGHBOR = 97 + LEAKY_RELU = 98 + SQUARED_DIFFERENCE = 99 + MIRROR_PAD = 100 + ABS = 101 + SPLIT_V = 102 + UNIQUE = 103 + CEIL = 104 + REVERSE_V2 = 105 + ADD_N = 106 + GATHER_ND = 107 + COS = 108 + WHERE = 109 + RANK = 110 + ELU = 111 + REVERSE_SEQUENCE = 112 + MATRIX_DIAG = 113 + QUANTIZE = 114 + MATRIX_SET_DIAG = 115 + ROUND = 116 + HARD_SWISH = 117 + IF = 118 + WHILE = 119 + NON_MAX_SUPPRESSION_V4 = 120 + NON_MAX_SUPPRESSION_V5 = 121 + SCATTER_ND = 122 + SELECT_V2 = 123 + DENSIFY = 124 + SEGMENT_SUM = 125 diff --git a/ethosu/vela/tflite/BuiltinOptions.py b/ethosu/vela/tflite/BuiltinOptions.py new file mode 100644 index 00000000..babbcb15 --- /dev/null +++ b/ethosu/vela/tflite/BuiltinOptions.py @@ -0,0 +1,106 @@ +# automatically generated by the FlatBuffers compiler, do not modify + +# namespace: tflite + +class BuiltinOptions(object): + NONE = 0 + Conv2DOptions = 1 + DepthwiseConv2DOptions = 2 + ConcatEmbeddingsOptions = 3 + LSHProjectionOptions = 4 + Pool2DOptions = 5 + SVDFOptions = 6 + RNNOptions = 7 + FullyConnectedOptions = 8 + SoftmaxOptions = 9 + ConcatenationOptions = 10 + AddOptions = 11 + L2NormOptions = 12 + LocalResponseNormalizationOptions = 13 + LSTMOptions = 14 + ResizeBilinearOptions = 15 + CallOptions = 16 + ReshapeOptions = 17 + SkipGramOptions = 18 + SpaceToDepthOptions = 19 + EmbeddingLookupSparseOptions = 20 + MulOptions = 21 + PadOptions = 22 + GatherOptions = 23 + BatchToSpaceNDOptions = 24 + SpaceToBatchNDOptions = 25 + TransposeOptions = 26 + ReducerOptions = 27 + SubOptions = 28 + DivOptions = 29 + SqueezeOptions = 30 + SequenceRNNOptions = 31 + StridedSliceOptions = 32 + ExpOptions = 33 + TopKV2Options = 34 + SplitOptions = 35 + LogSoftmaxOptions = 36 + CastOptions = 37 + DequantizeOptions = 38 + MaximumMinimumOptions = 39 + ArgMaxOptions = 40 + LessOptions = 41 + NegOptions = 42 + PadV2Options = 43 + GreaterOptions = 44 + GreaterEqualOptions = 45 + LessEqualOptions = 46 + SelectOptions = 47 + SliceOptions = 48 + TransposeConvOptions = 49 + SparseToDenseOptions = 50 + TileOptions = 51 + ExpandDimsOptions = 52 + EqualOptions = 53 + NotEqualOptions = 54 + ShapeOptions = 55 + PowOptions = 56 + ArgMinOptions = 57 + FakeQuantOptions = 58 + PackOptions = 59 + LogicalOrOptions = 60 + OneHotOptions = 61 + LogicalAndOptions = 62 + LogicalNotOptions = 63 + UnpackOptions = 64 + FloorDivOptions = 65 + SquareOptions = 66 + ZerosLikeOptions = 67 + FillOptions = 68 + BidirectionalSequenceLSTMOptions = 69 + BidirectionalSequenceRNNOptions = 70 + UnidirectionalSequenceLSTMOptions = 71 + FloorModOptions = 72 + RangeOptions = 73 + ResizeNearestNeighborOptions = 74 + LeakyReluOptions = 75 + SquaredDifferenceOptions = 76 + MirrorPadOptions = 77 + AbsOptions = 78 + SplitVOptions = 79 + UniqueOptions = 80 + ReverseV2Options = 81 + AddNOptions = 82 + GatherNdOptions = 83 + 
CosOptions = 84 + WhereOptions = 85 + RankOptions = 86 + ReverseSequenceOptions = 87 + MatrixDiagOptions = 88 + QuantizeOptions = 89 + MatrixSetDiagOptions = 90 + HardSwishOptions = 91 + IfOptions = 92 + WhileOptions = 93 + DepthToSpaceOptions = 94 + NonMaxSuppressionV4Options = 95 + NonMaxSuppressionV5Options = 96 + ScatterNdOptions = 97 + SelectV2Options = 98 + DensifyOptions = 99 + SegmentSumOptions = 100 diff --git a/ethosu/vela/tflite/CallOptions.py b/ethosu/vela/tflite/CallOptions.py new file mode 100644 index 00000000..5ae2eeae --- /dev/null +++ b/ethosu/vela/tflite/CallOptions.py @@ -0,0 +1,30 @@ +# automatically generated by the FlatBuffers compiler, do not modify + +# namespace: tflite + +import flatbuffers + +class CallOptions(object): + __slots__ = ['_tab'] + + @classmethod + def GetRootAsCallOptions(cls, buf, offset): + n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset) + x = CallOptions() + x.Init(buf, n + offset) + return x + + # CallOptions + def Init(self, buf, pos): + self._tab = flatbuffers.table.Table(buf, pos) + + # CallOptions + def Subgraph(self): + o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4)) + if o != 0: + return self._tab.Get(flatbuffers.number_types.Uint32Flags, o + self._tab.Pos) + return 0 + +def CallOptionsStart(builder): builder.StartObject(1) +def CallOptionsAddSubgraph(builder, subgraph): builder.PrependUint32Slot(0, subgraph, 0) +def CallOptionsEnd(builder): return builder.EndObject() diff --git a/ethosu/vela/tflite/CastOptions.py b/ethosu/vela/tflite/CastOptions.py new file mode 100644 index 00000000..70ae2e37 --- /dev/null +++ b/ethosu/vela/tflite/CastOptions.py @@ -0,0 +1,38 @@ +# automatically generated by the FlatBuffers compiler, do not modify + +# namespace: tflite + +import flatbuffers + +class CastOptions(object): + __slots__ = ['_tab'] + + @classmethod + def GetRootAsCastOptions(cls, buf, offset): + n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset) + x = CastOptions() + x.Init(buf, n + offset) + return x + + # CastOptions + def Init(self, buf, pos): + self._tab = flatbuffers.table.Table(buf, pos) + + # CastOptions + def InDataType(self): + o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4)) + if o != 0: + return self._tab.Get(flatbuffers.number_types.Int8Flags, o + self._tab.Pos) + return 0 + + # CastOptions + def OutDataType(self): + o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(6)) + if o != 0: + return self._tab.Get(flatbuffers.number_types.Int8Flags, o + self._tab.Pos) + return 0 + +def CastOptionsStart(builder): builder.StartObject(2) +def CastOptionsAddInDataType(builder, inDataType): builder.PrependInt8Slot(0, inDataType, 0) +def CastOptionsAddOutDataType(builder, outDataType): builder.PrependInt8Slot(1, outDataType, 0) +def CastOptionsEnd(builder): return builder.EndObject() diff --git a/ethosu/vela/tflite/CombinerType.py b/ethosu/vela/tflite/CombinerType.py new file mode 100644 index 00000000..1e3a61f3 --- /dev/null +++ b/ethosu/vela/tflite/CombinerType.py @@ -0,0 +1,8 @@ +# automatically generated by the FlatBuffers compiler, do not modify + +# namespace: tflite + +class CombinerType(object): + SUM = 0 + MEAN = 1 + SQRTN = 2 diff --git a/ethosu/vela/tflite/ConcatEmbeddingsOptions.py b/ethosu/vela/tflite/ConcatEmbeddingsOptions.py new file mode 100644 index 00000000..9d26c510 --- /dev/null +++ b/ethosu/vela/tflite/ConcatEmbeddingsOptions.py @@ -0,0 +1,78 @@ +# automatically generated by the FlatBuffers compiler, do not modify + +# 
namespace: tflite + +import flatbuffers + +class ConcatEmbeddingsOptions(object): + __slots__ = ['_tab'] + + @classmethod + def GetRootAsConcatEmbeddingsOptions(cls, buf, offset): + n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset) + x = ConcatEmbeddingsOptions() + x.Init(buf, n + offset) + return x + + # ConcatEmbeddingsOptions + def Init(self, buf, pos): + self._tab = flatbuffers.table.Table(buf, pos) + + # ConcatEmbeddingsOptions + def NumChannels(self): + o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4)) + if o != 0: + return self._tab.Get(flatbuffers.number_types.Int32Flags, o + self._tab.Pos) + return 0 + + # ConcatEmbeddingsOptions + def NumColumnsPerChannel(self, j): + o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(6)) + if o != 0: + a = self._tab.Vector(o) + return self._tab.Get(flatbuffers.number_types.Int32Flags, a + flatbuffers.number_types.UOffsetTFlags.py_type(j * 4)) + return 0 + + # ConcatEmbeddingsOptions + def NumColumnsPerChannelAsNumpy(self): + o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(6)) + if o != 0: + return self._tab.GetVectorAsNumpy(flatbuffers.number_types.Int32Flags, o) + return 0 + + # ConcatEmbeddingsOptions + def NumColumnsPerChannelLength(self): + o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(6)) + if o != 0: + return self._tab.VectorLen(o) + return 0 + + # ConcatEmbeddingsOptions + def EmbeddingDimPerChannel(self, j): + o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(8)) + if o != 0: + a = self._tab.Vector(o) + return self._tab.Get(flatbuffers.number_types.Int32Flags, a + flatbuffers.number_types.UOffsetTFlags.py_type(j * 4)) + return 0 + + # ConcatEmbeddingsOptions + def EmbeddingDimPerChannelAsNumpy(self): + o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(8)) + if o != 0: + return self._tab.GetVectorAsNumpy(flatbuffers.number_types.Int32Flags, o) + return 0 + + # ConcatEmbeddingsOptions + def EmbeddingDimPerChannelLength(self): + o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(8)) + if o != 0: + return self._tab.VectorLen(o) + return 0 + +def ConcatEmbeddingsOptionsStart(builder): builder.StartObject(3) +def ConcatEmbeddingsOptionsAddNumChannels(builder, numChannels): builder.PrependInt32Slot(0, numChannels, 0) +def ConcatEmbeddingsOptionsAddNumColumnsPerChannel(builder, numColumnsPerChannel): builder.PrependUOffsetTRelativeSlot(1, flatbuffers.number_types.UOffsetTFlags.py_type(numColumnsPerChannel), 0) +def ConcatEmbeddingsOptionsStartNumColumnsPerChannelVector(builder, numElems): return builder.StartVector(4, numElems, 4) +def ConcatEmbeddingsOptionsAddEmbeddingDimPerChannel(builder, embeddingDimPerChannel): builder.PrependUOffsetTRelativeSlot(2, flatbuffers.number_types.UOffsetTFlags.py_type(embeddingDimPerChannel), 0) +def ConcatEmbeddingsOptionsStartEmbeddingDimPerChannelVector(builder, numElems): return builder.StartVector(4, numElems, 4) +def ConcatEmbeddingsOptionsEnd(builder): return builder.EndObject() diff --git a/ethosu/vela/tflite/ConcatenationOptions.py b/ethosu/vela/tflite/ConcatenationOptions.py new file mode 100644 index 00000000..c8e0b6ab --- /dev/null +++ b/ethosu/vela/tflite/ConcatenationOptions.py @@ -0,0 +1,38 @@ +# automatically generated by the FlatBuffers compiler, do not modify + +# namespace: tflite + +import flatbuffers + +class ConcatenationOptions(object): + __slots__ = ['_tab'] + + @classmethod + def GetRootAsConcatenationOptions(cls, buf, offset): + n = 
flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset) + x = ConcatenationOptions() + x.Init(buf, n + offset) + return x + + # ConcatenationOptions + def Init(self, buf, pos): + self._tab = flatbuffers.table.Table(buf, pos) + + # ConcatenationOptions + def Axis(self): + o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4)) + if o != 0: + return self._tab.Get(flatbuffers.number_types.Int32Flags, o + self._tab.Pos) + return 0 + + # ConcatenationOptions + def FusedActivationFunction(self): + o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(6)) + if o != 0: + return self._tab.Get(flatbuffers.number_types.Int8Flags, o + self._tab.Pos) + return 0 + +def ConcatenationOptionsStart(builder): builder.StartObject(2) +def ConcatenationOptionsAddAxis(builder, axis): builder.PrependInt32Slot(0, axis, 0) +def ConcatenationOptionsAddFusedActivationFunction(builder, fusedActivationFunction): builder.PrependInt8Slot(1, fusedActivationFunction, 0) +def ConcatenationOptionsEnd(builder): return builder.EndObject() diff --git a/ethosu/vela/tflite/Conv2DOptions.py b/ethosu/vela/tflite/Conv2DOptions.py new file mode 100644 index 00000000..ef49f751 --- /dev/null +++ b/ethosu/vela/tflite/Conv2DOptions.py @@ -0,0 +1,70 @@ +# automatically generated by the FlatBuffers compiler, do not modify + +# namespace: tflite + +import flatbuffers + +class Conv2DOptions(object): + __slots__ = ['_tab'] + + @classmethod + def GetRootAsConv2DOptions(cls, buf, offset): + n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset) + x = Conv2DOptions() + x.Init(buf, n + offset) + return x + + # Conv2DOptions + def Init(self, buf, pos): + self._tab = flatbuffers.table.Table(buf, pos) + + # Conv2DOptions + def Padding(self): + o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4)) + if o != 0: + return self._tab.Get(flatbuffers.number_types.Int8Flags, o + self._tab.Pos) + return 0 + + # Conv2DOptions + def StrideW(self): + o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(6)) + if o != 0: + return self._tab.Get(flatbuffers.number_types.Int32Flags, o + self._tab.Pos) + return 0 + + # Conv2DOptions + def StrideH(self): + o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(8)) + if o != 0: + return self._tab.Get(flatbuffers.number_types.Int32Flags, o + self._tab.Pos) + return 0 + + # Conv2DOptions + def FusedActivationFunction(self): + o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(10)) + if o != 0: + return self._tab.Get(flatbuffers.number_types.Int8Flags, o + self._tab.Pos) + return 0 + + # Conv2DOptions + def DilationWFactor(self): + o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(12)) + if o != 0: + return self._tab.Get(flatbuffers.number_types.Int32Flags, o + self._tab.Pos) + return 1 + + # Conv2DOptions + def DilationHFactor(self): + o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(14)) + if o != 0: + return self._tab.Get(flatbuffers.number_types.Int32Flags, o + self._tab.Pos) + return 1 + +def Conv2DOptionsStart(builder): builder.StartObject(6) +def Conv2DOptionsAddPadding(builder, padding): builder.PrependInt8Slot(0, padding, 0) +def Conv2DOptionsAddStrideW(builder, strideW): builder.PrependInt32Slot(1, strideW, 0) +def Conv2DOptionsAddStrideH(builder, strideH): builder.PrependInt32Slot(2, strideH, 0) +def Conv2DOptionsAddFusedActivationFunction(builder, fusedActivationFunction): builder.PrependInt8Slot(3, fusedActivationFunction, 0) +def 
Conv2DOptionsAddDilationWFactor(builder, dilationWFactor): builder.PrependInt32Slot(4, dilationWFactor, 1) +def Conv2DOptionsAddDilationHFactor(builder, dilationHFactor): builder.PrependInt32Slot(5, dilationHFactor, 1) +def Conv2DOptionsEnd(builder): return builder.EndObject() diff --git a/ethosu/vela/tflite/CosOptions.py b/ethosu/vela/tflite/CosOptions.py new file mode 100644 index 00000000..7fbf8487 --- /dev/null +++ b/ethosu/vela/tflite/CosOptions.py @@ -0,0 +1,22 @@ +# automatically generated by the FlatBuffers compiler, do not modify + +# namespace: tflite + +import flatbuffers + +class CosOptions(object): + __slots__ = ['_tab'] + + @classmethod + def GetRootAsCosOptions(cls, buf, offset): + n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset) + x = CosOptions() + x.Init(buf, n + offset) + return x + + # CosOptions + def Init(self, buf, pos): + self._tab = flatbuffers.table.Table(buf, pos) + +def CosOptionsStart(builder): builder.StartObject(0) +def CosOptionsEnd(builder): return builder.EndObject() diff --git a/ethosu/vela/tflite/CustomOptionsFormat.py b/ethosu/vela/tflite/CustomOptionsFormat.py new file mode 100644 index 00000000..c2fc07c2 --- /dev/null +++ b/ethosu/vela/tflite/CustomOptionsFormat.py @@ -0,0 +1,6 @@ +# automatically generated by the FlatBuffers compiler, do not modify + +# namespace: tflite + +class CustomOptionsFormat(object): + FLEXBUFFERS = 0 diff --git a/ethosu/vela/tflite/CustomQuantization.py b/ethosu/vela/tflite/CustomQuantization.py new file mode 100644 index 00000000..21ec0da4 --- /dev/null +++ b/ethosu/vela/tflite/CustomQuantization.py @@ -0,0 +1,46 @@ +# automatically generated by the FlatBuffers compiler, do not modify + +# namespace: tflite + +import flatbuffers + +class CustomQuantization(object): + __slots__ = ['_tab'] + + @classmethod + def GetRootAsCustomQuantization(cls, buf, offset): + n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset) + x = CustomQuantization() + x.Init(buf, n + offset) + return x + + # CustomQuantization + def Init(self, buf, pos): + self._tab = flatbuffers.table.Table(buf, pos) + + # CustomQuantization + def Custom(self, j): + o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4)) + if o != 0: + a = self._tab.Vector(o) + return self._tab.Get(flatbuffers.number_types.Uint8Flags, a + flatbuffers.number_types.UOffsetTFlags.py_type(j * 1)) + return 0 + + # CustomQuantization + def CustomAsNumpy(self): + o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4)) + if o != 0: + return self._tab.GetVectorAsNumpy(flatbuffers.number_types.Uint8Flags, o) + return 0 + + # CustomQuantization + def CustomLength(self): + o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4)) + if o != 0: + return self._tab.VectorLen(o) + return 0 + +def CustomQuantizationStart(builder): builder.StartObject(1) +def CustomQuantizationAddCustom(builder, custom): builder.PrependUOffsetTRelativeSlot(0, flatbuffers.number_types.UOffsetTFlags.py_type(custom), 0) +def CustomQuantizationStartCustomVector(builder, numElems): return builder.StartVector(1, numElems, 1) +def CustomQuantizationEnd(builder): return builder.EndObject() diff --git a/ethosu/vela/tflite/DensifyOptions.py b/ethosu/vela/tflite/DensifyOptions.py new file mode 100644 index 00000000..12cbfb29 --- /dev/null +++ b/ethosu/vela/tflite/DensifyOptions.py @@ -0,0 +1,22 @@ +# automatically generated by the FlatBuffers compiler, do not modify + +# namespace: tflite + +import flatbuffers + +class DensifyOptions(object): + 
__slots__ = ['_tab'] + + @classmethod + def GetRootAsDensifyOptions(cls, buf, offset): + n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset) + x = DensifyOptions() + x.Init(buf, n + offset) + return x + + # DensifyOptions + def Init(self, buf, pos): + self._tab = flatbuffers.table.Table(buf, pos) + +def DensifyOptionsStart(builder): builder.StartObject(0) +def DensifyOptionsEnd(builder): return builder.EndObject() diff --git a/ethosu/vela/tflite/DepthToSpaceOptions.py b/ethosu/vela/tflite/DepthToSpaceOptions.py new file mode 100644 index 00000000..97b93aa7 --- /dev/null +++ b/ethosu/vela/tflite/DepthToSpaceOptions.py @@ -0,0 +1,30 @@ +# automatically generated by the FlatBuffers compiler, do not modify + +# namespace: tflite + +import flatbuffers + +class DepthToSpaceOptions(object): + __slots__ = ['_tab'] + + @classmethod + def GetRootAsDepthToSpaceOptions(cls, buf, offset): + n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset) + x = DepthToSpaceOptions() + x.Init(buf, n + offset) + return x + + # DepthToSpaceOptions + def Init(self, buf, pos): + self._tab = flatbuffers.table.Table(buf, pos) + + # DepthToSpaceOptions + def BlockSize(self): + o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4)) + if o != 0: + return self._tab.Get(flatbuffers.number_types.Int32Flags, o + self._tab.Pos) + return 0 + +def DepthToSpaceOptionsStart(builder): builder.StartObject(1) +def DepthToSpaceOptionsAddBlockSize(builder, blockSize): builder.PrependInt32Slot(0, blockSize, 0) +def DepthToSpaceOptionsEnd(builder): return builder.EndObject() diff --git a/ethosu/vela/tflite/DepthwiseConv2DOptions.py b/ethosu/vela/tflite/DepthwiseConv2DOptions.py new file mode 100644 index 00000000..9689383b --- /dev/null +++ b/ethosu/vela/tflite/DepthwiseConv2DOptions.py @@ -0,0 +1,78 @@ +# automatically generated by the FlatBuffers compiler, do not modify + +# namespace: tflite + +import flatbuffers + +class DepthwiseConv2DOptions(object): + __slots__ = ['_tab'] + + @classmethod + def GetRootAsDepthwiseConv2DOptions(cls, buf, offset): + n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset) + x = DepthwiseConv2DOptions() + x.Init(buf, n + offset) + return x + + # DepthwiseConv2DOptions + def Init(self, buf, pos): + self._tab = flatbuffers.table.Table(buf, pos) + + # DepthwiseConv2DOptions + def Padding(self): + o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4)) + if o != 0: + return self._tab.Get(flatbuffers.number_types.Int8Flags, o + self._tab.Pos) + return 0 + + # DepthwiseConv2DOptions + def StrideW(self): + o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(6)) + if o != 0: + return self._tab.Get(flatbuffers.number_types.Int32Flags, o + self._tab.Pos) + return 0 + + # DepthwiseConv2DOptions + def StrideH(self): + o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(8)) + if o != 0: + return self._tab.Get(flatbuffers.number_types.Int32Flags, o + self._tab.Pos) + return 0 + + # DepthwiseConv2DOptions + def DepthMultiplier(self): + o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(10)) + if o != 0: + return self._tab.Get(flatbuffers.number_types.Int32Flags, o + self._tab.Pos) + return 0 + + # DepthwiseConv2DOptions + def FusedActivationFunction(self): + o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(12)) + if o != 0: + return self._tab.Get(flatbuffers.number_types.Int8Flags, o + self._tab.Pos) + return 0 + + # DepthwiseConv2DOptions + def DilationWFactor(self): + o = 
flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(14)) + if o != 0: + return self._tab.Get(flatbuffers.number_types.Int32Flags, o + self._tab.Pos) + return 1 + + # DepthwiseConv2DOptions + def DilationHFactor(self): + o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(16)) + if o != 0: + return self._tab.Get(flatbuffers.number_types.Int32Flags, o + self._tab.Pos) + return 1 + +def DepthwiseConv2DOptionsStart(builder): builder.StartObject(7) +def DepthwiseConv2DOptionsAddPadding(builder, padding): builder.PrependInt8Slot(0, padding, 0) +def DepthwiseConv2DOptionsAddStrideW(builder, strideW): builder.PrependInt32Slot(1, strideW, 0) +def DepthwiseConv2DOptionsAddStrideH(builder, strideH): builder.PrependInt32Slot(2, strideH, 0) +def DepthwiseConv2DOptionsAddDepthMultiplier(builder, depthMultiplier): builder.PrependInt32Slot(3, depthMultiplier, 0) +def DepthwiseConv2DOptionsAddFusedActivationFunction(builder, fusedActivationFunction): builder.PrependInt8Slot(4, fusedActivationFunction, 0) +def DepthwiseConv2DOptionsAddDilationWFactor(builder, dilationWFactor): builder.PrependInt32Slot(5, dilationWFactor, 1) +def DepthwiseConv2DOptionsAddDilationHFactor(builder, dilationHFactor): builder.PrependInt32Slot(6, dilationHFactor, 1) +def DepthwiseConv2DOptionsEnd(builder): return builder.EndObject() diff --git a/ethosu/vela/tflite/DequantizeOptions.py b/ethosu/vela/tflite/DequantizeOptions.py new file mode 100644 index 00000000..5ef8b8dd --- /dev/null +++ b/ethosu/vela/tflite/DequantizeOptions.py @@ -0,0 +1,22 @@ +# automatically generated by the FlatBuffers compiler, do not modify + +# namespace: tflite + +import flatbuffers + +class DequantizeOptions(object): + __slots__ = ['_tab'] + + @classmethod + def GetRootAsDequantizeOptions(cls, buf, offset): + n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset) + x = DequantizeOptions() + x.Init(buf, n + offset) + return x + + # DequantizeOptions + def Init(self, buf, pos): + self._tab = flatbuffers.table.Table(buf, pos) + +def DequantizeOptionsStart(builder): builder.StartObject(0) +def DequantizeOptionsEnd(builder): return builder.EndObject() diff --git a/ethosu/vela/tflite/DimensionMetadata.py b/ethosu/vela/tflite/DimensionMetadata.py new file mode 100644 index 00000000..c9fe7cd6 --- /dev/null +++ b/ethosu/vela/tflite/DimensionMetadata.py @@ -0,0 +1,76 @@ +# automatically generated by the FlatBuffers compiler, do not modify + +# namespace: tflite + +import flatbuffers + +class DimensionMetadata(object): + __slots__ = ['_tab'] + + @classmethod + def GetRootAsDimensionMetadata(cls, buf, offset): + n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset) + x = DimensionMetadata() + x.Init(buf, n + offset) + return x + + # DimensionMetadata + def Init(self, buf, pos): + self._tab = flatbuffers.table.Table(buf, pos) + + # DimensionMetadata + def Format(self): + o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4)) + if o != 0: + return self._tab.Get(flatbuffers.number_types.Int8Flags, o + self._tab.Pos) + return 0 + + # DimensionMetadata + def DenseSize(self): + o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(6)) + if o != 0: + return self._tab.Get(flatbuffers.number_types.Int32Flags, o + self._tab.Pos) + return 0 + + # DimensionMetadata + def ArraySegmentsType(self): + o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(8)) + if o != 0: + return self._tab.Get(flatbuffers.number_types.Uint8Flags, o + self._tab.Pos) + return 0 + + # DimensionMetadata 
+ def ArraySegments(self): + o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(10)) + if o != 0: + from flatbuffers.table import Table + obj = Table(bytearray(), 0) + self._tab.Union(obj, o) + return obj + return None + + # DimensionMetadata + def ArrayIndicesType(self): + o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(12)) + if o != 0: + return self._tab.Get(flatbuffers.number_types.Uint8Flags, o + self._tab.Pos) + return 0 + + # DimensionMetadata + def ArrayIndices(self): + o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(14)) + if o != 0: + from flatbuffers.table import Table + obj = Table(bytearray(), 0) + self._tab.Union(obj, o) + return obj + return None + +def DimensionMetadataStart(builder): builder.StartObject(6) +def DimensionMetadataAddFormat(builder, format): builder.PrependInt8Slot(0, format, 0) +def DimensionMetadataAddDenseSize(builder, denseSize): builder.PrependInt32Slot(1, denseSize, 0) +def DimensionMetadataAddArraySegmentsType(builder, arraySegmentsType): builder.PrependUint8Slot(2, arraySegmentsType, 0) +def DimensionMetadataAddArraySegments(builder, arraySegments): builder.PrependUOffsetTRelativeSlot(3, flatbuffers.number_types.UOffsetTFlags.py_type(arraySegments), 0) +def DimensionMetadataAddArrayIndicesType(builder, arrayIndicesType): builder.PrependUint8Slot(4, arrayIndicesType, 0) +def DimensionMetadataAddArrayIndices(builder, arrayIndices): builder.PrependUOffsetTRelativeSlot(5, flatbuffers.number_types.UOffsetTFlags.py_type(arrayIndices), 0) +def DimensionMetadataEnd(builder): return builder.EndObject() diff --git a/ethosu/vela/tflite/DimensionType.py b/ethosu/vela/tflite/DimensionType.py new file mode 100644 index 00000000..310d8eed --- /dev/null +++ b/ethosu/vela/tflite/DimensionType.py @@ -0,0 +1,7 @@ +# automatically generated by the FlatBuffers compiler, do not modify + +# namespace: tflite + +class DimensionType(object): + DENSE = 0 + SPARSE_CSR = 1 diff --git a/ethosu/vela/tflite/DivOptions.py b/ethosu/vela/tflite/DivOptions.py new file mode 100644 index 00000000..905a3be0 --- /dev/null +++ b/ethosu/vela/tflite/DivOptions.py @@ -0,0 +1,30 @@ +# automatically generated by the FlatBuffers compiler, do not modify + +# namespace: tflite + +import flatbuffers + +class DivOptions(object): + __slots__ = ['_tab'] + + @classmethod + def GetRootAsDivOptions(cls, buf, offset): + n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset) + x = DivOptions() + x.Init(buf, n + offset) + return x + + # DivOptions + def Init(self, buf, pos): + self._tab = flatbuffers.table.Table(buf, pos) + + # DivOptions + def FusedActivationFunction(self): + o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4)) + if o != 0: + return self._tab.Get(flatbuffers.number_types.Int8Flags, o + self._tab.Pos) + return 0 + +def DivOptionsStart(builder): builder.StartObject(1) +def DivOptionsAddFusedActivationFunction(builder, fusedActivationFunction): builder.PrependInt8Slot(0, fusedActivationFunction, 0) +def DivOptionsEnd(builder): return builder.EndObject() diff --git a/ethosu/vela/tflite/EmbeddingLookupSparseOptions.py b/ethosu/vela/tflite/EmbeddingLookupSparseOptions.py new file mode 100644 index 00000000..7d9c1442 --- /dev/null +++ b/ethosu/vela/tflite/EmbeddingLookupSparseOptions.py @@ -0,0 +1,30 @@ +# automatically generated by the FlatBuffers compiler, do not modify + +# namespace: tflite + +import flatbuffers + +class EmbeddingLookupSparseOptions(object): + __slots__ = ['_tab'] + + @classmethod + def 
GetRootAsEmbeddingLookupSparseOptions(cls, buf, offset): + n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset) + x = EmbeddingLookupSparseOptions() + x.Init(buf, n + offset) + return x + + # EmbeddingLookupSparseOptions + def Init(self, buf, pos): + self._tab = flatbuffers.table.Table(buf, pos) + + # EmbeddingLookupSparseOptions + def Combiner(self): + o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4)) + if o != 0: + return self._tab.Get(flatbuffers.number_types.Int8Flags, o + self._tab.Pos) + return 0 + +def EmbeddingLookupSparseOptionsStart(builder): builder.StartObject(1) +def EmbeddingLookupSparseOptionsAddCombiner(builder, combiner): builder.PrependInt8Slot(0, combiner, 0) +def EmbeddingLookupSparseOptionsEnd(builder): return builder.EndObject() diff --git a/ethosu/vela/tflite/EqualOptions.py b/ethosu/vela/tflite/EqualOptions.py new file mode 100644 index 00000000..f787ef85 --- /dev/null +++ b/ethosu/vela/tflite/EqualOptions.py @@ -0,0 +1,22 @@ +# automatically generated by the FlatBuffers compiler, do not modify + +# namespace: tflite + +import flatbuffers + +class EqualOptions(object): + __slots__ = ['_tab'] + + @classmethod + def GetRootAsEqualOptions(cls, buf, offset): + n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset) + x = EqualOptions() + x.Init(buf, n + offset) + return x + + # EqualOptions + def Init(self, buf, pos): + self._tab = flatbuffers.table.Table(buf, pos) + +def EqualOptionsStart(builder): builder.StartObject(0) +def EqualOptionsEnd(builder): return builder.EndObject() diff --git a/ethosu/vela/tflite/ExpOptions.py b/ethosu/vela/tflite/ExpOptions.py new file mode 100644 index 00000000..eac1456e --- /dev/null +++ b/ethosu/vela/tflite/ExpOptions.py @@ -0,0 +1,22 @@ +# automatically generated by the FlatBuffers compiler, do not modify + +# namespace: tflite + +import flatbuffers + +class ExpOptions(object): + __slots__ = ['_tab'] + + @classmethod + def GetRootAsExpOptions(cls, buf, offset): + n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset) + x = ExpOptions() + x.Init(buf, n + offset) + return x + + # ExpOptions + def Init(self, buf, pos): + self._tab = flatbuffers.table.Table(buf, pos) + +def ExpOptionsStart(builder): builder.StartObject(0) +def ExpOptionsEnd(builder): return builder.EndObject() diff --git a/ethosu/vela/tflite/ExpandDimsOptions.py b/ethosu/vela/tflite/ExpandDimsOptions.py new file mode 100644 index 00000000..69d63665 --- /dev/null +++ b/ethosu/vela/tflite/ExpandDimsOptions.py @@ -0,0 +1,22 @@ +# automatically generated by the FlatBuffers compiler, do not modify + +# namespace: tflite + +import flatbuffers + +class ExpandDimsOptions(object): + __slots__ = ['_tab'] + + @classmethod + def GetRootAsExpandDimsOptions(cls, buf, offset): + n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset) + x = ExpandDimsOptions() + x.Init(buf, n + offset) + return x + + # ExpandDimsOptions + def Init(self, buf, pos): + self._tab = flatbuffers.table.Table(buf, pos) + +def ExpandDimsOptionsStart(builder): builder.StartObject(0) +def ExpandDimsOptionsEnd(builder): return builder.EndObject() diff --git a/ethosu/vela/tflite/FakeQuantOptions.py b/ethosu/vela/tflite/FakeQuantOptions.py new file mode 100644 index 00000000..46c371c3 --- /dev/null +++ b/ethosu/vela/tflite/FakeQuantOptions.py @@ -0,0 +1,54 @@ +# automatically generated by the FlatBuffers compiler, do not modify + +# namespace: tflite + +import flatbuffers + +class FakeQuantOptions(object): + __slots__ = ['_tab'] + + @classmethod 
+ def GetRootAsFakeQuantOptions(cls, buf, offset): + n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset) + x = FakeQuantOptions() + x.Init(buf, n + offset) + return x + + # FakeQuantOptions + def Init(self, buf, pos): + self._tab = flatbuffers.table.Table(buf, pos) + + # FakeQuantOptions + def Min(self): + o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4)) + if o != 0: + return self._tab.Get(flatbuffers.number_types.Float32Flags, o + self._tab.Pos) + return 0.0 + + # FakeQuantOptions + def Max(self): + o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(6)) + if o != 0: + return self._tab.Get(flatbuffers.number_types.Float32Flags, o + self._tab.Pos) + return 0.0 + + # FakeQuantOptions + def NumBits(self): + o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(8)) + if o != 0: + return self._tab.Get(flatbuffers.number_types.Int32Flags, o + self._tab.Pos) + return 0 + + # FakeQuantOptions + def NarrowRange(self): + o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(10)) + if o != 0: + return bool(self._tab.Get(flatbuffers.number_types.BoolFlags, o + self._tab.Pos)) + return False + +def FakeQuantOptionsStart(builder): builder.StartObject(4) +def FakeQuantOptionsAddMin(builder, min): builder.PrependFloat32Slot(0, min, 0.0) +def FakeQuantOptionsAddMax(builder, max): builder.PrependFloat32Slot(1, max, 0.0) +def FakeQuantOptionsAddNumBits(builder, numBits): builder.PrependInt32Slot(2, numBits, 0) +def FakeQuantOptionsAddNarrowRange(builder, narrowRange): builder.PrependBoolSlot(3, narrowRange, 0) +def FakeQuantOptionsEnd(builder): return builder.EndObject() diff --git a/ethosu/vela/tflite/FillOptions.py b/ethosu/vela/tflite/FillOptions.py new file mode 100644 index 00000000..5a1e651a --- /dev/null +++ b/ethosu/vela/tflite/FillOptions.py @@ -0,0 +1,22 @@ +# automatically generated by the FlatBuffers compiler, do not modify + +# namespace: tflite + +import flatbuffers + +class FillOptions(object): + __slots__ = ['_tab'] + + @classmethod + def GetRootAsFillOptions(cls, buf, offset): + n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset) + x = FillOptions() + x.Init(buf, n + offset) + return x + + # FillOptions + def Init(self, buf, pos): + self._tab = flatbuffers.table.Table(buf, pos) + +def FillOptionsStart(builder): builder.StartObject(0) +def FillOptionsEnd(builder): return builder.EndObject() diff --git a/ethosu/vela/tflite/FloorDivOptions.py b/ethosu/vela/tflite/FloorDivOptions.py new file mode 100644 index 00000000..64b474fb --- /dev/null +++ b/ethosu/vela/tflite/FloorDivOptions.py @@ -0,0 +1,22 @@ +# automatically generated by the FlatBuffers compiler, do not modify + +# namespace: tflite + +import flatbuffers + +class FloorDivOptions(object): + __slots__ = ['_tab'] + + @classmethod + def GetRootAsFloorDivOptions(cls, buf, offset): + n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset) + x = FloorDivOptions() + x.Init(buf, n + offset) + return x + + # FloorDivOptions + def Init(self, buf, pos): + self._tab = flatbuffers.table.Table(buf, pos) + +def FloorDivOptionsStart(builder): builder.StartObject(0) +def FloorDivOptionsEnd(builder): return builder.EndObject() diff --git a/ethosu/vela/tflite/FloorModOptions.py b/ethosu/vela/tflite/FloorModOptions.py new file mode 100644 index 00000000..37c8e5a5 --- /dev/null +++ b/ethosu/vela/tflite/FloorModOptions.py @@ -0,0 +1,22 @@ +# automatically generated by the FlatBuffers compiler, do not modify + +# namespace: tflite + +import 
flatbuffers + +class FloorModOptions(object): + __slots__ = ['_tab'] + + @classmethod + def GetRootAsFloorModOptions(cls, buf, offset): + n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset) + x = FloorModOptions() + x.Init(buf, n + offset) + return x + + # FloorModOptions + def Init(self, buf, pos): + self._tab = flatbuffers.table.Table(buf, pos) + +def FloorModOptionsStart(builder): builder.StartObject(0) +def FloorModOptionsEnd(builder): return builder.EndObject() diff --git a/ethosu/vela/tflite/FullyConnectedOptions.py b/ethosu/vela/tflite/FullyConnectedOptions.py new file mode 100644 index 00000000..a6b4e40f --- /dev/null +++ b/ethosu/vela/tflite/FullyConnectedOptions.py @@ -0,0 +1,46 @@ +# automatically generated by the FlatBuffers compiler, do not modify + +# namespace: tflite + +import flatbuffers + +class FullyConnectedOptions(object): + __slots__ = ['_tab'] + + @classmethod + def GetRootAsFullyConnectedOptions(cls, buf, offset): + n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset) + x = FullyConnectedOptions() + x.Init(buf, n + offset) + return x + + # FullyConnectedOptions + def Init(self, buf, pos): + self._tab = flatbuffers.table.Table(buf, pos) + + # FullyConnectedOptions + def FusedActivationFunction(self): + o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4)) + if o != 0: + return self._tab.Get(flatbuffers.number_types.Int8Flags, o + self._tab.Pos) + return 0 + + # FullyConnectedOptions + def WeightsFormat(self): + o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(6)) + if o != 0: + return self._tab.Get(flatbuffers.number_types.Int8Flags, o + self._tab.Pos) + return 0 + + # FullyConnectedOptions + def KeepNumDims(self): + o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(8)) + if o != 0: + return bool(self._tab.Get(flatbuffers.number_types.BoolFlags, o + self._tab.Pos)) + return False + +def FullyConnectedOptionsStart(builder): builder.StartObject(3) +def FullyConnectedOptionsAddFusedActivationFunction(builder, fusedActivationFunction): builder.PrependInt8Slot(0, fusedActivationFunction, 0) +def FullyConnectedOptionsAddWeightsFormat(builder, weightsFormat): builder.PrependInt8Slot(1, weightsFormat, 0) +def FullyConnectedOptionsAddKeepNumDims(builder, keepNumDims): builder.PrependBoolSlot(2, keepNumDims, 0) +def FullyConnectedOptionsEnd(builder): return builder.EndObject() diff --git a/ethosu/vela/tflite/FullyConnectedOptionsWeightsFormat.py b/ethosu/vela/tflite/FullyConnectedOptionsWeightsFormat.py new file mode 100644 index 00000000..d9a53887 --- /dev/null +++ b/ethosu/vela/tflite/FullyConnectedOptionsWeightsFormat.py @@ -0,0 +1,7 @@ +# automatically generated by the FlatBuffers compiler, do not modify + +# namespace: tflite + +class FullyConnectedOptionsWeightsFormat(object): + DEFAULT = 0 + SHUFFLED4x16INT8 = 1 diff --git a/ethosu/vela/tflite/GatherNdOptions.py b/ethosu/vela/tflite/GatherNdOptions.py new file mode 100644 index 00000000..f515eb5c --- /dev/null +++ b/ethosu/vela/tflite/GatherNdOptions.py @@ -0,0 +1,22 @@ +# automatically generated by the FlatBuffers compiler, do not modify + +# namespace: tflite + +import flatbuffers + +class GatherNdOptions(object): + __slots__ = ['_tab'] + + @classmethod + def GetRootAsGatherNdOptions(cls, buf, offset): + n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset) + x = GatherNdOptions() + x.Init(buf, n + offset) + return x + + # GatherNdOptions + def Init(self, buf, pos): + self._tab = flatbuffers.table.Table(buf, pos) + +def 
GatherNdOptionsStart(builder): builder.StartObject(0) +def GatherNdOptionsEnd(builder): return builder.EndObject() diff --git a/ethosu/vela/tflite/GatherOptions.py b/ethosu/vela/tflite/GatherOptions.py new file mode 100644 index 00000000..9fbc3e40 --- /dev/null +++ b/ethosu/vela/tflite/GatherOptions.py @@ -0,0 +1,30 @@ +# automatically generated by the FlatBuffers compiler, do not modify + +# namespace: tflite + +import flatbuffers + +class GatherOptions(object): + __slots__ = ['_tab'] + + @classmethod + def GetRootAsGatherOptions(cls, buf, offset): + n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset) + x = GatherOptions() + x.Init(buf, n + offset) + return x + + # GatherOptions + def Init(self, buf, pos): + self._tab = flatbuffers.table.Table(buf, pos) + + # GatherOptions + def Axis(self): + o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4)) + if o != 0: + return self._tab.Get(flatbuffers.number_types.Int32Flags, o + self._tab.Pos) + return 0 + +def GatherOptionsStart(builder): builder.StartObject(1) +def GatherOptionsAddAxis(builder, axis): builder.PrependInt32Slot(0, axis, 0) +def GatherOptionsEnd(builder): return builder.EndObject() diff --git a/ethosu/vela/tflite/GreaterEqualOptions.py b/ethosu/vela/tflite/GreaterEqualOptions.py new file mode 100644 index 00000000..a29e200a --- /dev/null +++ b/ethosu/vela/tflite/GreaterEqualOptions.py @@ -0,0 +1,22 @@ +# automatically generated by the FlatBuffers compiler, do not modify + +# namespace: tflite + +import flatbuffers + +class GreaterEqualOptions(object): + __slots__ = ['_tab'] + + @classmethod + def GetRootAsGreaterEqualOptions(cls, buf, offset): + n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset) + x = GreaterEqualOptions() + x.Init(buf, n + offset) + return x + + # GreaterEqualOptions + def Init(self, buf, pos): + self._tab = flatbuffers.table.Table(buf, pos) + +def GreaterEqualOptionsStart(builder): builder.StartObject(0) +def GreaterEqualOptionsEnd(builder): return builder.EndObject() diff --git a/ethosu/vela/tflite/GreaterOptions.py b/ethosu/vela/tflite/GreaterOptions.py new file mode 100644 index 00000000..59d63501 --- /dev/null +++ b/ethosu/vela/tflite/GreaterOptions.py @@ -0,0 +1,22 @@ +# automatically generated by the FlatBuffers compiler, do not modify + +# namespace: tflite + +import flatbuffers + +class GreaterOptions(object): + __slots__ = ['_tab'] + + @classmethod + def GetRootAsGreaterOptions(cls, buf, offset): + n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset) + x = GreaterOptions() + x.Init(buf, n + offset) + return x + + # GreaterOptions + def Init(self, buf, pos): + self._tab = flatbuffers.table.Table(buf, pos) + +def GreaterOptionsStart(builder): builder.StartObject(0) +def GreaterOptionsEnd(builder): return builder.EndObject() diff --git a/ethosu/vela/tflite/HardSwishOptions.py b/ethosu/vela/tflite/HardSwishOptions.py new file mode 100644 index 00000000..4f6a5200 --- /dev/null +++ b/ethosu/vela/tflite/HardSwishOptions.py @@ -0,0 +1,22 @@ +# automatically generated by the FlatBuffers compiler, do not modify + +# namespace: tflite + +import flatbuffers + +class HardSwishOptions(object): + __slots__ = ['_tab'] + + @classmethod + def GetRootAsHardSwishOptions(cls, buf, offset): + n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset) + x = HardSwishOptions() + x.Init(buf, n + offset) + return x + + # HardSwishOptions + def Init(self, buf, pos): + self._tab = flatbuffers.table.Table(buf, pos) + +def HardSwishOptionsStart(builder): 
builder.StartObject(0) +def HardSwishOptionsEnd(builder): return builder.EndObject() diff --git a/ethosu/vela/tflite/IfOptions.py b/ethosu/vela/tflite/IfOptions.py new file mode 100644 index 00000000..13f4e697 --- /dev/null +++ b/ethosu/vela/tflite/IfOptions.py @@ -0,0 +1,38 @@ +# automatically generated by the FlatBuffers compiler, do not modify + +# namespace: tflite + +import flatbuffers + +class IfOptions(object): + __slots__ = ['_tab'] + + @classmethod + def GetRootAsIfOptions(cls, buf, offset): + n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset) + x = IfOptions() + x.Init(buf, n + offset) + return x + + # IfOptions + def Init(self, buf, pos): + self._tab = flatbuffers.table.Table(buf, pos) + + # IfOptions + def ThenSubgraphIndex(self): + o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4)) + if o != 0: + return self._tab.Get(flatbuffers.number_types.Int32Flags, o + self._tab.Pos) + return 0 + + # IfOptions + def ElseSubgraphIndex(self): + o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(6)) + if o != 0: + return self._tab.Get(flatbuffers.number_types.Int32Flags, o + self._tab.Pos) + return 0 + +def IfOptionsStart(builder): builder.StartObject(2) +def IfOptionsAddThenSubgraphIndex(builder, thenSubgraphIndex): builder.PrependInt32Slot(0, thenSubgraphIndex, 0) +def IfOptionsAddElseSubgraphIndex(builder, elseSubgraphIndex): builder.PrependInt32Slot(1, elseSubgraphIndex, 0) +def IfOptionsEnd(builder): return builder.EndObject() diff --git a/ethosu/vela/tflite/Int32Vector.py b/ethosu/vela/tflite/Int32Vector.py new file mode 100644 index 00000000..e70851b2 --- /dev/null +++ b/ethosu/vela/tflite/Int32Vector.py @@ -0,0 +1,46 @@ +# automatically generated by the FlatBuffers compiler, do not modify + +# namespace: tflite + +import flatbuffers + +class Int32Vector(object): + __slots__ = ['_tab'] + + @classmethod + def GetRootAsInt32Vector(cls, buf, offset): + n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset) + x = Int32Vector() + x.Init(buf, n + offset) + return x + + # Int32Vector + def Init(self, buf, pos): + self._tab = flatbuffers.table.Table(buf, pos) + + # Int32Vector + def Values(self, j): + o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4)) + if o != 0: + a = self._tab.Vector(o) + return self._tab.Get(flatbuffers.number_types.Int32Flags, a + flatbuffers.number_types.UOffsetTFlags.py_type(j * 4)) + return 0 + + # Int32Vector + def ValuesAsNumpy(self): + o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4)) + if o != 0: + return self._tab.GetVectorAsNumpy(flatbuffers.number_types.Int32Flags, o) + return 0 + + # Int32Vector + def ValuesLength(self): + o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4)) + if o != 0: + return self._tab.VectorLen(o) + return 0 + +def Int32VectorStart(builder): builder.StartObject(1) +def Int32VectorAddValues(builder, values): builder.PrependUOffsetTRelativeSlot(0, flatbuffers.number_types.UOffsetTFlags.py_type(values), 0) +def Int32VectorStartValuesVector(builder, numElems): return builder.StartVector(4, numElems, 4) +def Int32VectorEnd(builder): return builder.EndObject() diff --git a/ethosu/vela/tflite/L2NormOptions.py b/ethosu/vela/tflite/L2NormOptions.py new file mode 100644 index 00000000..38bdf573 --- /dev/null +++ b/ethosu/vela/tflite/L2NormOptions.py @@ -0,0 +1,30 @@ +# automatically generated by the FlatBuffers compiler, do not modify + +# namespace: tflite + +import flatbuffers + +class L2NormOptions(object): + 
__slots__ = ['_tab'] + + @classmethod + def GetRootAsL2NormOptions(cls, buf, offset): + n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset) + x = L2NormOptions() + x.Init(buf, n + offset) + return x + + # L2NormOptions + def Init(self, buf, pos): + self._tab = flatbuffers.table.Table(buf, pos) + + # L2NormOptions + def FusedActivationFunction(self): + o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4)) + if o != 0: + return self._tab.Get(flatbuffers.number_types.Int8Flags, o + self._tab.Pos) + return 0 + +def L2NormOptionsStart(builder): builder.StartObject(1) +def L2NormOptionsAddFusedActivationFunction(builder, fusedActivationFunction): builder.PrependInt8Slot(0, fusedActivationFunction, 0) +def L2NormOptionsEnd(builder): return builder.EndObject() diff --git a/ethosu/vela/tflite/LSHProjectionOptions.py b/ethosu/vela/tflite/LSHProjectionOptions.py new file mode 100644 index 00000000..ad550be2 --- /dev/null +++ b/ethosu/vela/tflite/LSHProjectionOptions.py @@ -0,0 +1,30 @@ +# automatically generated by the FlatBuffers compiler, do not modify + +# namespace: tflite + +import flatbuffers + +class LSHProjectionOptions(object): + __slots__ = ['_tab'] + + @classmethod + def GetRootAsLSHProjectionOptions(cls, buf, offset): + n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset) + x = LSHProjectionOptions() + x.Init(buf, n + offset) + return x + + # LSHProjectionOptions + def Init(self, buf, pos): + self._tab = flatbuffers.table.Table(buf, pos) + + # LSHProjectionOptions + def Type(self): + o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4)) + if o != 0: + return self._tab.Get(flatbuffers.number_types.Int8Flags, o + self._tab.Pos) + return 0 + +def LSHProjectionOptionsStart(builder): builder.StartObject(1) +def LSHProjectionOptionsAddType(builder, type): builder.PrependInt8Slot(0, type, 0) +def LSHProjectionOptionsEnd(builder): return builder.EndObject() diff --git a/ethosu/vela/tflite/LSHProjectionType.py b/ethosu/vela/tflite/LSHProjectionType.py new file mode 100644 index 00000000..a7d6a313 --- /dev/null +++ b/ethosu/vela/tflite/LSHProjectionType.py @@ -0,0 +1,8 @@ +# automatically generated by the FlatBuffers compiler, do not modify + +# namespace: tflite + +class LSHProjectionType(object): + UNKNOWN = 0 + SPARSE = 1 + DENSE = 2 diff --git a/ethosu/vela/tflite/LSTMKernelType.py b/ethosu/vela/tflite/LSTMKernelType.py new file mode 100644 index 00000000..fd657998 --- /dev/null +++ b/ethosu/vela/tflite/LSTMKernelType.py @@ -0,0 +1,7 @@ +# automatically generated by the FlatBuffers compiler, do not modify + +# namespace: tflite + +class LSTMKernelType(object): + FULL = 0 + BASIC = 1 diff --git a/ethosu/vela/tflite/LSTMOptions.py b/ethosu/vela/tflite/LSTMOptions.py new file mode 100644 index 00000000..93a83093 --- /dev/null +++ b/ethosu/vela/tflite/LSTMOptions.py @@ -0,0 +1,54 @@ +# automatically generated by the FlatBuffers compiler, do not modify + +# namespace: tflite + +import flatbuffers + +class LSTMOptions(object): + __slots__ = ['_tab'] + + @classmethod + def GetRootAsLSTMOptions(cls, buf, offset): + n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset) + x = LSTMOptions() + x.Init(buf, n + offset) + return x + + # LSTMOptions + def Init(self, buf, pos): + self._tab = flatbuffers.table.Table(buf, pos) + + # LSTMOptions + def FusedActivationFunction(self): + o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4)) + if o != 0: + return self._tab.Get(flatbuffers.number_types.Int8Flags, o + 
self._tab.Pos) + return 0 + + # LSTMOptions + def CellClip(self): + o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(6)) + if o != 0: + return self._tab.Get(flatbuffers.number_types.Float32Flags, o + self._tab.Pos) + return 0.0 + + # LSTMOptions + def ProjClip(self): + o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(8)) + if o != 0: + return self._tab.Get(flatbuffers.number_types.Float32Flags, o + self._tab.Pos) + return 0.0 + + # LSTMOptions + def KernelType(self): + o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(10)) + if o != 0: + return self._tab.Get(flatbuffers.number_types.Int8Flags, o + self._tab.Pos) + return 0 + +def LSTMOptionsStart(builder): builder.StartObject(4) +def LSTMOptionsAddFusedActivationFunction(builder, fusedActivationFunction): builder.PrependInt8Slot(0, fusedActivationFunction, 0) +def LSTMOptionsAddCellClip(builder, cellClip): builder.PrependFloat32Slot(1, cellClip, 0.0) +def LSTMOptionsAddProjClip(builder, projClip): builder.PrependFloat32Slot(2, projClip, 0.0) +def LSTMOptionsAddKernelType(builder, kernelType): builder.PrependInt8Slot(3, kernelType, 0) +def LSTMOptionsEnd(builder): return builder.EndObject() diff --git a/ethosu/vela/tflite/LeakyReluOptions.py b/ethosu/vela/tflite/LeakyReluOptions.py new file mode 100644 index 00000000..b61b21d5 --- /dev/null +++ b/ethosu/vela/tflite/LeakyReluOptions.py @@ -0,0 +1,30 @@ +# automatically generated by the FlatBuffers compiler, do not modify + +# namespace: tflite + +import flatbuffers + +class LeakyReluOptions(object): + __slots__ = ['_tab'] + + @classmethod + def GetRootAsLeakyReluOptions(cls, buf, offset): + n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset) + x = LeakyReluOptions() + x.Init(buf, n + offset) + return x + + # LeakyReluOptions + def Init(self, buf, pos): + self._tab = flatbuffers.table.Table(buf, pos) + + # LeakyReluOptions + def Alpha(self): + o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4)) + if o != 0: + return self._tab.Get(flatbuffers.number_types.Float32Flags, o + self._tab.Pos) + return 0.0 + +def LeakyReluOptionsStart(builder): builder.StartObject(1) +def LeakyReluOptionsAddAlpha(builder, alpha): builder.PrependFloat32Slot(0, alpha, 0.0) +def LeakyReluOptionsEnd(builder): return builder.EndObject() diff --git a/ethosu/vela/tflite/LessEqualOptions.py b/ethosu/vela/tflite/LessEqualOptions.py new file mode 100644 index 00000000..d49b7289 --- /dev/null +++ b/ethosu/vela/tflite/LessEqualOptions.py @@ -0,0 +1,22 @@ +# automatically generated by the FlatBuffers compiler, do not modify + +# namespace: tflite + +import flatbuffers + +class LessEqualOptions(object): + __slots__ = ['_tab'] + + @classmethod + def GetRootAsLessEqualOptions(cls, buf, offset): + n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset) + x = LessEqualOptions() + x.Init(buf, n + offset) + return x + + # LessEqualOptions + def Init(self, buf, pos): + self._tab = flatbuffers.table.Table(buf, pos) + +def LessEqualOptionsStart(builder): builder.StartObject(0) +def LessEqualOptionsEnd(builder): return builder.EndObject() diff --git a/ethosu/vela/tflite/LessOptions.py b/ethosu/vela/tflite/LessOptions.py new file mode 100644 index 00000000..469cb0b0 --- /dev/null +++ b/ethosu/vela/tflite/LessOptions.py @@ -0,0 +1,22 @@ +# automatically generated by the FlatBuffers compiler, do not modify + +# namespace: tflite + +import flatbuffers + +class LessOptions(object): + __slots__ = ['_tab'] + + @classmethod + def 
GetRootAsLessOptions(cls, buf, offset): + n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset) + x = LessOptions() + x.Init(buf, n + offset) + return x + + # LessOptions + def Init(self, buf, pos): + self._tab = flatbuffers.table.Table(buf, pos) + +def LessOptionsStart(builder): builder.StartObject(0) +def LessOptionsEnd(builder): return builder.EndObject() diff --git a/ethosu/vela/tflite/LocalResponseNormalizationOptions.py b/ethosu/vela/tflite/LocalResponseNormalizationOptions.py new file mode 100644 index 00000000..db875603 --- /dev/null +++ b/ethosu/vela/tflite/LocalResponseNormalizationOptions.py @@ -0,0 +1,54 @@ +# automatically generated by the FlatBuffers compiler, do not modify + +# namespace: tflite + +import flatbuffers + +class LocalResponseNormalizationOptions(object): + __slots__ = ['_tab'] + + @classmethod + def GetRootAsLocalResponseNormalizationOptions(cls, buf, offset): + n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset) + x = LocalResponseNormalizationOptions() + x.Init(buf, n + offset) + return x + + # LocalResponseNormalizationOptions + def Init(self, buf, pos): + self._tab = flatbuffers.table.Table(buf, pos) + + # LocalResponseNormalizationOptions + def Radius(self): + o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4)) + if o != 0: + return self._tab.Get(flatbuffers.number_types.Int32Flags, o + self._tab.Pos) + return 0 + + # LocalResponseNormalizationOptions + def Bias(self): + o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(6)) + if o != 0: + return self._tab.Get(flatbuffers.number_types.Float32Flags, o + self._tab.Pos) + return 0.0 + + # LocalResponseNormalizationOptions + def Alpha(self): + o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(8)) + if o != 0: + return self._tab.Get(flatbuffers.number_types.Float32Flags, o + self._tab.Pos) + return 0.0 + + # LocalResponseNormalizationOptions + def Beta(self): + o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(10)) + if o != 0: + return self._tab.Get(flatbuffers.number_types.Float32Flags, o + self._tab.Pos) + return 0.0 + +def LocalResponseNormalizationOptionsStart(builder): builder.StartObject(4) +def LocalResponseNormalizationOptionsAddRadius(builder, radius): builder.PrependInt32Slot(0, radius, 0) +def LocalResponseNormalizationOptionsAddBias(builder, bias): builder.PrependFloat32Slot(1, bias, 0.0) +def LocalResponseNormalizationOptionsAddAlpha(builder, alpha): builder.PrependFloat32Slot(2, alpha, 0.0) +def LocalResponseNormalizationOptionsAddBeta(builder, beta): builder.PrependFloat32Slot(3, beta, 0.0) +def LocalResponseNormalizationOptionsEnd(builder): return builder.EndObject() diff --git a/ethosu/vela/tflite/LogSoftmaxOptions.py b/ethosu/vela/tflite/LogSoftmaxOptions.py new file mode 100644 index 00000000..47893855 --- /dev/null +++ b/ethosu/vela/tflite/LogSoftmaxOptions.py @@ -0,0 +1,22 @@ +# automatically generated by the FlatBuffers compiler, do not modify + +# namespace: tflite + +import flatbuffers + +class LogSoftmaxOptions(object): + __slots__ = ['_tab'] + + @classmethod + def GetRootAsLogSoftmaxOptions(cls, buf, offset): + n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset) + x = LogSoftmaxOptions() + x.Init(buf, n + offset) + return x + + # LogSoftmaxOptions + def Init(self, buf, pos): + self._tab = flatbuffers.table.Table(buf, pos) + +def LogSoftmaxOptionsStart(builder): builder.StartObject(0) +def LogSoftmaxOptionsEnd(builder): return builder.EndObject() diff --git 
a/ethosu/vela/tflite/LogicalAndOptions.py b/ethosu/vela/tflite/LogicalAndOptions.py new file mode 100644 index 00000000..cee1cdb4 --- /dev/null +++ b/ethosu/vela/tflite/LogicalAndOptions.py @@ -0,0 +1,22 @@ +# automatically generated by the FlatBuffers compiler, do not modify + +# namespace: tflite + +import flatbuffers + +class LogicalAndOptions(object): + __slots__ = ['_tab'] + + @classmethod + def GetRootAsLogicalAndOptions(cls, buf, offset): + n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset) + x = LogicalAndOptions() + x.Init(buf, n + offset) + return x + + # LogicalAndOptions + def Init(self, buf, pos): + self._tab = flatbuffers.table.Table(buf, pos) + +def LogicalAndOptionsStart(builder): builder.StartObject(0) +def LogicalAndOptionsEnd(builder): return builder.EndObject() diff --git a/ethosu/vela/tflite/LogicalNotOptions.py b/ethosu/vela/tflite/LogicalNotOptions.py new file mode 100644 index 00000000..9971450c --- /dev/null +++ b/ethosu/vela/tflite/LogicalNotOptions.py @@ -0,0 +1,22 @@ +# automatically generated by the FlatBuffers compiler, do not modify + +# namespace: tflite + +import flatbuffers + +class LogicalNotOptions(object): + __slots__ = ['_tab'] + + @classmethod + def GetRootAsLogicalNotOptions(cls, buf, offset): + n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset) + x = LogicalNotOptions() + x.Init(buf, n + offset) + return x + + # LogicalNotOptions + def Init(self, buf, pos): + self._tab = flatbuffers.table.Table(buf, pos) + +def LogicalNotOptionsStart(builder): builder.StartObject(0) +def LogicalNotOptionsEnd(builder): return builder.EndObject() diff --git a/ethosu/vela/tflite/LogicalOrOptions.py b/ethosu/vela/tflite/LogicalOrOptions.py new file mode 100644 index 00000000..e94a5dec --- /dev/null +++ b/ethosu/vela/tflite/LogicalOrOptions.py @@ -0,0 +1,22 @@ +# automatically generated by the FlatBuffers compiler, do not modify + +# namespace: tflite + +import flatbuffers + +class LogicalOrOptions(object): + __slots__ = ['_tab'] + + @classmethod + def GetRootAsLogicalOrOptions(cls, buf, offset): + n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset) + x = LogicalOrOptions() + x.Init(buf, n + offset) + return x + + # LogicalOrOptions + def Init(self, buf, pos): + self._tab = flatbuffers.table.Table(buf, pos) + +def LogicalOrOptionsStart(builder): builder.StartObject(0) +def LogicalOrOptionsEnd(builder): return builder.EndObject() diff --git a/ethosu/vela/tflite/MatrixDiagOptions.py b/ethosu/vela/tflite/MatrixDiagOptions.py new file mode 100644 index 00000000..0f64e657 --- /dev/null +++ b/ethosu/vela/tflite/MatrixDiagOptions.py @@ -0,0 +1,22 @@ +# automatically generated by the FlatBuffers compiler, do not modify + +# namespace: tflite + +import flatbuffers + +class MatrixDiagOptions(object): + __slots__ = ['_tab'] + + @classmethod + def GetRootAsMatrixDiagOptions(cls, buf, offset): + n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset) + x = MatrixDiagOptions() + x.Init(buf, n + offset) + return x + + # MatrixDiagOptions + def Init(self, buf, pos): + self._tab = flatbuffers.table.Table(buf, pos) + +def MatrixDiagOptionsStart(builder): builder.StartObject(0) +def MatrixDiagOptionsEnd(builder): return builder.EndObject() diff --git a/ethosu/vela/tflite/MatrixSetDiagOptions.py b/ethosu/vela/tflite/MatrixSetDiagOptions.py new file mode 100644 index 00000000..14178cf8 --- /dev/null +++ b/ethosu/vela/tflite/MatrixSetDiagOptions.py @@ -0,0 +1,22 @@ +# automatically generated by the FlatBuffers compiler, do not 
modify + +# namespace: tflite + +import flatbuffers + +class MatrixSetDiagOptions(object): + __slots__ = ['_tab'] + + @classmethod + def GetRootAsMatrixSetDiagOptions(cls, buf, offset): + n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset) + x = MatrixSetDiagOptions() + x.Init(buf, n + offset) + return x + + # MatrixSetDiagOptions + def Init(self, buf, pos): + self._tab = flatbuffers.table.Table(buf, pos) + +def MatrixSetDiagOptionsStart(builder): builder.StartObject(0) +def MatrixSetDiagOptionsEnd(builder): return builder.EndObject() diff --git a/ethosu/vela/tflite/MaximumMinimumOptions.py b/ethosu/vela/tflite/MaximumMinimumOptions.py new file mode 100644 index 00000000..f0806e2d --- /dev/null +++ b/ethosu/vela/tflite/MaximumMinimumOptions.py @@ -0,0 +1,22 @@ +# automatically generated by the FlatBuffers compiler, do not modify + +# namespace: tflite + +import flatbuffers + +class MaximumMinimumOptions(object): + __slots__ = ['_tab'] + + @classmethod + def GetRootAsMaximumMinimumOptions(cls, buf, offset): + n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset) + x = MaximumMinimumOptions() + x.Init(buf, n + offset) + return x + + # MaximumMinimumOptions + def Init(self, buf, pos): + self._tab = flatbuffers.table.Table(buf, pos) + +def MaximumMinimumOptionsStart(builder): builder.StartObject(0) +def MaximumMinimumOptionsEnd(builder): return builder.EndObject() diff --git a/ethosu/vela/tflite/Metadata.py b/ethosu/vela/tflite/Metadata.py new file mode 100644 index 00000000..273e51ee --- /dev/null +++ b/ethosu/vela/tflite/Metadata.py @@ -0,0 +1,38 @@ +# automatically generated by the FlatBuffers compiler, do not modify + +# namespace: tflite + +import flatbuffers + +class Metadata(object): + __slots__ = ['_tab'] + + @classmethod + def GetRootAsMetadata(cls, buf, offset): + n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset) + x = Metadata() + x.Init(buf, n + offset) + return x + + # Metadata + def Init(self, buf, pos): + self._tab = flatbuffers.table.Table(buf, pos) + + # Metadata + def Name(self): + o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4)) + if o != 0: + return self._tab.String(o + self._tab.Pos) + return None + + # Metadata + def Buffer(self): + o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(6)) + if o != 0: + return self._tab.Get(flatbuffers.number_types.Uint32Flags, o + self._tab.Pos) + return 0 + +def MetadataStart(builder): builder.StartObject(2) +def MetadataAddName(builder, name): builder.PrependUOffsetTRelativeSlot(0, flatbuffers.number_types.UOffsetTFlags.py_type(name), 0) +def MetadataAddBuffer(builder, buffer): builder.PrependUint32Slot(1, buffer, 0) +def MetadataEnd(builder): return builder.EndObject() diff --git a/ethosu/vela/tflite/MirrorPadMode.py b/ethosu/vela/tflite/MirrorPadMode.py new file mode 100644 index 00000000..8fb6396f --- /dev/null +++ b/ethosu/vela/tflite/MirrorPadMode.py @@ -0,0 +1,7 @@ +# automatically generated by the FlatBuffers compiler, do not modify + +# namespace: tflite + +class MirrorPadMode(object): + REFLECT = 0 + SYMMETRIC = 1 diff --git a/ethosu/vela/tflite/MirrorPadOptions.py b/ethosu/vela/tflite/MirrorPadOptions.py new file mode 100644 index 00000000..254ae217 --- /dev/null +++ b/ethosu/vela/tflite/MirrorPadOptions.py @@ -0,0 +1,30 @@ +# automatically generated by the FlatBuffers compiler, do not modify + +# namespace: tflite + +import flatbuffers + +class MirrorPadOptions(object): + __slots__ = ['_tab'] + + @classmethod + def 
GetRootAsMirrorPadOptions(cls, buf, offset): + n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset) + x = MirrorPadOptions() + x.Init(buf, n + offset) + return x + + # MirrorPadOptions + def Init(self, buf, pos): + self._tab = flatbuffers.table.Table(buf, pos) + + # MirrorPadOptions + def Mode(self): + o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4)) + if o != 0: + return self._tab.Get(flatbuffers.number_types.Int8Flags, o + self._tab.Pos) + return 0 + +def MirrorPadOptionsStart(builder): builder.StartObject(1) +def MirrorPadOptionsAddMode(builder, mode): builder.PrependInt8Slot(0, mode, 0) +def MirrorPadOptionsEnd(builder): return builder.EndObject() diff --git a/ethosu/vela/tflite/Model.py b/ethosu/vela/tflite/Model.py new file mode 100644 index 00000000..cc9991ba --- /dev/null +++ b/ethosu/vela/tflite/Model.py @@ -0,0 +1,150 @@ +# automatically generated by the FlatBuffers compiler, do not modify + +# namespace: tflite + +import flatbuffers + +class Model(object): + __slots__ = ['_tab'] + + @classmethod + def GetRootAsModel(cls, buf, offset): + n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset) + x = Model() + x.Init(buf, n + offset) + return x + + # Model + def Init(self, buf, pos): + self._tab = flatbuffers.table.Table(buf, pos) + + # Model + def Version(self): + o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4)) + if o != 0: + return self._tab.Get(flatbuffers.number_types.Uint32Flags, o + self._tab.Pos) + return 0 + + # Model + def OperatorCodes(self, j): + o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(6)) + if o != 0: + x = self._tab.Vector(o) + x += flatbuffers.number_types.UOffsetTFlags.py_type(j) * 4 + x = self._tab.Indirect(x) + from .OperatorCode import OperatorCode + obj = OperatorCode() + obj.Init(self._tab.Bytes, x) + return obj + return None + + # Model + def OperatorCodesLength(self): + o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(6)) + if o != 0: + return self._tab.VectorLen(o) + return 0 + + # Model + def Subgraphs(self, j): + o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(8)) + if o != 0: + x = self._tab.Vector(o) + x += flatbuffers.number_types.UOffsetTFlags.py_type(j) * 4 + x = self._tab.Indirect(x) + from .SubGraph import SubGraph + obj = SubGraph() + obj.Init(self._tab.Bytes, x) + return obj + return None + + # Model + def SubgraphsLength(self): + o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(8)) + if o != 0: + return self._tab.VectorLen(o) + return 0 + + # Model + def Description(self): + o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(10)) + if o != 0: + return self._tab.String(o + self._tab.Pos) + return None + + # Model + def Buffers(self, j): + o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(12)) + if o != 0: + x = self._tab.Vector(o) + x += flatbuffers.number_types.UOffsetTFlags.py_type(j) * 4 + x = self._tab.Indirect(x) + from .Buffer import Buffer + obj = Buffer() + obj.Init(self._tab.Bytes, x) + return obj + return None + + # Model + def BuffersLength(self): + o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(12)) + if o != 0: + return self._tab.VectorLen(o) + return 0 + + # Model + def MetadataBuffer(self, j): + o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(14)) + if o != 0: + a = self._tab.Vector(o) + return self._tab.Get(flatbuffers.number_types.Int32Flags, a + flatbuffers.number_types.UOffsetTFlags.py_type(j * 4)) 
+ return 0 + + # Model + def MetadataBufferAsNumpy(self): + o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(14)) + if o != 0: + return self._tab.GetVectorAsNumpy(flatbuffers.number_types.Int32Flags, o) + return 0 + + # Model + def MetadataBufferLength(self): + o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(14)) + if o != 0: + return self._tab.VectorLen(o) + return 0 + + # Model + def Metadata(self, j): + o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(16)) + if o != 0: + x = self._tab.Vector(o) + x += flatbuffers.number_types.UOffsetTFlags.py_type(j) * 4 + x = self._tab.Indirect(x) + from .Metadata import Metadata + obj = Metadata() + obj.Init(self._tab.Bytes, x) + return obj + return None + + # Model + def MetadataLength(self): + o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(16)) + if o != 0: + return self._tab.VectorLen(o) + return 0 + +def ModelStart(builder): builder.StartObject(7) +def ModelAddVersion(builder, version): builder.PrependUint32Slot(0, version, 0) +def ModelAddOperatorCodes(builder, operatorCodes): builder.PrependUOffsetTRelativeSlot(1, flatbuffers.number_types.UOffsetTFlags.py_type(operatorCodes), 0) +def ModelStartOperatorCodesVector(builder, numElems): return builder.StartVector(4, numElems, 4) +def ModelAddSubgraphs(builder, subgraphs): builder.PrependUOffsetTRelativeSlot(2, flatbuffers.number_types.UOffsetTFlags.py_type(subgraphs), 0) +def ModelStartSubgraphsVector(builder, numElems): return builder.StartVector(4, numElems, 4) +def ModelAddDescription(builder, description): builder.PrependUOffsetTRelativeSlot(3, flatbuffers.number_types.UOffsetTFlags.py_type(description), 0) +def ModelAddBuffers(builder, buffers): builder.PrependUOffsetTRelativeSlot(4, flatbuffers.number_types.UOffsetTFlags.py_type(buffers), 0) +def ModelStartBuffersVector(builder, numElems): return builder.StartVector(4, numElems, 4) +def ModelAddMetadataBuffer(builder, metadataBuffer): builder.PrependUOffsetTRelativeSlot(5, flatbuffers.number_types.UOffsetTFlags.py_type(metadataBuffer), 0) +def ModelStartMetadataBufferVector(builder, numElems): return builder.StartVector(4, numElems, 4) +def ModelAddMetadata(builder, metadata): builder.PrependUOffsetTRelativeSlot(6, flatbuffers.number_types.UOffsetTFlags.py_type(metadata), 0) +def ModelStartMetadataVector(builder, numElems): return builder.StartVector(4, numElems, 4) +def ModelEnd(builder): return builder.EndObject() diff --git a/ethosu/vela/tflite/MulOptions.py b/ethosu/vela/tflite/MulOptions.py new file mode 100644 index 00000000..55b9506f --- /dev/null +++ b/ethosu/vela/tflite/MulOptions.py @@ -0,0 +1,30 @@ +# automatically generated by the FlatBuffers compiler, do not modify + +# namespace: tflite + +import flatbuffers + +class MulOptions(object): + __slots__ = ['_tab'] + + @classmethod + def GetRootAsMulOptions(cls, buf, offset): + n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset) + x = MulOptions() + x.Init(buf, n + offset) + return x + + # MulOptions + def Init(self, buf, pos): + self._tab = flatbuffers.table.Table(buf, pos) + + # MulOptions + def FusedActivationFunction(self): + o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4)) + if o != 0: + return self._tab.Get(flatbuffers.number_types.Int8Flags, o + self._tab.Pos) + return 0 + +def MulOptionsStart(builder): builder.StartObject(1) +def MulOptionsAddFusedActivationFunction(builder, fusedActivationFunction): builder.PrependInt8Slot(0, fusedActivationFunction, 0) +def 
MulOptionsEnd(builder): return builder.EndObject() diff --git a/ethosu/vela/tflite/NegOptions.py b/ethosu/vela/tflite/NegOptions.py new file mode 100644 index 00000000..05d55c26 --- /dev/null +++ b/ethosu/vela/tflite/NegOptions.py @@ -0,0 +1,22 @@ +# automatically generated by the FlatBuffers compiler, do not modify + +# namespace: tflite + +import flatbuffers + +class NegOptions(object): + __slots__ = ['_tab'] + + @classmethod + def GetRootAsNegOptions(cls, buf, offset): + n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset) + x = NegOptions() + x.Init(buf, n + offset) + return x + + # NegOptions + def Init(self, buf, pos): + self._tab = flatbuffers.table.Table(buf, pos) + +def NegOptionsStart(builder): builder.StartObject(0) +def NegOptionsEnd(builder): return builder.EndObject() diff --git a/ethosu/vela/tflite/NonMaxSuppressionV4Options.py b/ethosu/vela/tflite/NonMaxSuppressionV4Options.py new file mode 100644 index 00000000..6ad10a2e --- /dev/null +++ b/ethosu/vela/tflite/NonMaxSuppressionV4Options.py @@ -0,0 +1,22 @@ +# automatically generated by the FlatBuffers compiler, do not modify + +# namespace: tflite + +import flatbuffers + +class NonMaxSuppressionV4Options(object): + __slots__ = ['_tab'] + + @classmethod + def GetRootAsNonMaxSuppressionV4Options(cls, buf, offset): + n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset) + x = NonMaxSuppressionV4Options() + x.Init(buf, n + offset) + return x + + # NonMaxSuppressionV4Options + def Init(self, buf, pos): + self._tab = flatbuffers.table.Table(buf, pos) + +def NonMaxSuppressionV4OptionsStart(builder): builder.StartObject(0) +def NonMaxSuppressionV4OptionsEnd(builder): return builder.EndObject() diff --git a/ethosu/vela/tflite/NonMaxSuppressionV5Options.py b/ethosu/vela/tflite/NonMaxSuppressionV5Options.py new file mode 100644 index 00000000..99cbdbbf --- /dev/null +++ b/ethosu/vela/tflite/NonMaxSuppressionV5Options.py @@ -0,0 +1,22 @@ +# automatically generated by the FlatBuffers compiler, do not modify + +# namespace: tflite + +import flatbuffers + +class NonMaxSuppressionV5Options(object): + __slots__ = ['_tab'] + + @classmethod + def GetRootAsNonMaxSuppressionV5Options(cls, buf, offset): + n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset) + x = NonMaxSuppressionV5Options() + x.Init(buf, n + offset) + return x + + # NonMaxSuppressionV5Options + def Init(self, buf, pos): + self._tab = flatbuffers.table.Table(buf, pos) + +def NonMaxSuppressionV5OptionsStart(builder): builder.StartObject(0) +def NonMaxSuppressionV5OptionsEnd(builder): return builder.EndObject() diff --git a/ethosu/vela/tflite/NotEqualOptions.py b/ethosu/vela/tflite/NotEqualOptions.py new file mode 100644 index 00000000..4c511e93 --- /dev/null +++ b/ethosu/vela/tflite/NotEqualOptions.py @@ -0,0 +1,22 @@ +# automatically generated by the FlatBuffers compiler, do not modify + +# namespace: tflite + +import flatbuffers + +class NotEqualOptions(object): + __slots__ = ['_tab'] + + @classmethod + def GetRootAsNotEqualOptions(cls, buf, offset): + n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset) + x = NotEqualOptions() + x.Init(buf, n + offset) + return x + + # NotEqualOptions + def Init(self, buf, pos): + self._tab = flatbuffers.table.Table(buf, pos) + +def NotEqualOptionsStart(builder): builder.StartObject(0) +def NotEqualOptionsEnd(builder): return builder.EndObject() diff --git a/ethosu/vela/tflite/OneHotOptions.py b/ethosu/vela/tflite/OneHotOptions.py new file mode 100644 index 00000000..793a3e75 --- 
/dev/null +++ b/ethosu/vela/tflite/OneHotOptions.py @@ -0,0 +1,30 @@ +# automatically generated by the FlatBuffers compiler, do not modify + +# namespace: tflite + +import flatbuffers + +class OneHotOptions(object): + __slots__ = ['_tab'] + + @classmethod + def GetRootAsOneHotOptions(cls, buf, offset): + n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset) + x = OneHotOptions() + x.Init(buf, n + offset) + return x + + # OneHotOptions + def Init(self, buf, pos): + self._tab = flatbuffers.table.Table(buf, pos) + + # OneHotOptions + def Axis(self): + o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4)) + if o != 0: + return self._tab.Get(flatbuffers.number_types.Int32Flags, o + self._tab.Pos) + return 0 + +def OneHotOptionsStart(builder): builder.StartObject(1) +def OneHotOptionsAddAxis(builder, axis): builder.PrependInt32Slot(0, axis, 0) +def OneHotOptionsEnd(builder): return builder.EndObject() diff --git a/ethosu/vela/tflite/Operator.py b/ethosu/vela/tflite/Operator.py new file mode 100644 index 00000000..cbae3dab --- /dev/null +++ b/ethosu/vela/tflite/Operator.py @@ -0,0 +1,177 @@ +# automatically generated by the FlatBuffers compiler, do not modify + +# namespace: tflite + +import flatbuffers + +class Operator(object): + __slots__ = ['_tab'] + + @classmethod + def GetRootAsOperator(cls, buf, offset): + n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset) + x = Operator() + x.Init(buf, n + offset) + return x + + # Operator + def Init(self, buf, pos): + self._tab = flatbuffers.table.Table(buf, pos) + + # Operator + def OpcodeIndex(self): + o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4)) + if o != 0: + return self._tab.Get(flatbuffers.number_types.Uint32Flags, o + self._tab.Pos) + return 0 + + # Operator + def Inputs(self, j): + o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(6)) + if o != 0: + a = self._tab.Vector(o) + return self._tab.Get(flatbuffers.number_types.Int32Flags, a + flatbuffers.number_types.UOffsetTFlags.py_type(j * 4)) + return 0 + + # Operator + def InputsAsNumpy(self): + o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(6)) + if o != 0: + return self._tab.GetVectorAsNumpy(flatbuffers.number_types.Int32Flags, o) + return 0 + + # Operator + def InputsLength(self): + o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(6)) + if o != 0: + return self._tab.VectorLen(o) + return 0 + + # Operator + def Outputs(self, j): + o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(8)) + if o != 0: + a = self._tab.Vector(o) + return self._tab.Get(flatbuffers.number_types.Int32Flags, a + flatbuffers.number_types.UOffsetTFlags.py_type(j * 4)) + return 0 + + # Operator + def OutputsAsNumpy(self): + o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(8)) + if o != 0: + return self._tab.GetVectorAsNumpy(flatbuffers.number_types.Int32Flags, o) + return 0 + + # Operator + def OutputsLength(self): + o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(8)) + if o != 0: + return self._tab.VectorLen(o) + return 0 + + # Operator + def BuiltinOptionsType(self): + o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(10)) + if o != 0: + return self._tab.Get(flatbuffers.number_types.Uint8Flags, o + self._tab.Pos) + return 0 + + # Operator + def BuiltinOptions(self): + o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(12)) + if o != 0: + from flatbuffers.table import Table + obj = Table(bytearray(), 0) 
+ self._tab.Union(obj, o) + return obj + return None + + # Operator + def CustomOptions(self, j): + o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(14)) + if o != 0: + a = self._tab.Vector(o) + return self._tab.Get(flatbuffers.number_types.Uint8Flags, a + flatbuffers.number_types.UOffsetTFlags.py_type(j * 1)) + return 0 + + # Operator + def CustomOptionsAsNumpy(self): + o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(14)) + if o != 0: + return self._tab.GetVectorAsNumpy(flatbuffers.number_types.Uint8Flags, o) + return 0 + + # Operator + def CustomOptionsLength(self): + o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(14)) + if o != 0: + return self._tab.VectorLen(o) + return 0 + + # Operator + def CustomOptionsFormat(self): + o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(16)) + if o != 0: + return self._tab.Get(flatbuffers.number_types.Int8Flags, o + self._tab.Pos) + return 0 + + # Operator + def MutatingVariableInputs(self, j): + o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(18)) + if o != 0: + a = self._tab.Vector(o) + return self._tab.Get(flatbuffers.number_types.BoolFlags, a + flatbuffers.number_types.UOffsetTFlags.py_type(j * 1)) + return 0 + + # Operator + def MutatingVariableInputsAsNumpy(self): + o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(18)) + if o != 0: + return self._tab.GetVectorAsNumpy(flatbuffers.number_types.BoolFlags, o) + return 0 + + # Operator + def MutatingVariableInputsLength(self): + o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(18)) + if o != 0: + return self._tab.VectorLen(o) + return 0 + + # Operator + def Intermediates(self, j): + o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(20)) + if o != 0: + a = self._tab.Vector(o) + return self._tab.Get(flatbuffers.number_types.Int32Flags, a + flatbuffers.number_types.UOffsetTFlags.py_type(j * 4)) + return 0 + + # Operator + def IntermediatesAsNumpy(self): + o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(20)) + if o != 0: + return self._tab.GetVectorAsNumpy(flatbuffers.number_types.Int32Flags, o) + return 0 + + # Operator + def IntermediatesLength(self): + o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(20)) + if o != 0: + return self._tab.VectorLen(o) + return 0 + +def OperatorStart(builder): builder.StartObject(9) +def OperatorAddOpcodeIndex(builder, opcodeIndex): builder.PrependUint32Slot(0, opcodeIndex, 0) +def OperatorAddInputs(builder, inputs): builder.PrependUOffsetTRelativeSlot(1, flatbuffers.number_types.UOffsetTFlags.py_type(inputs), 0) +def OperatorStartInputsVector(builder, numElems): return builder.StartVector(4, numElems, 4) +def OperatorAddOutputs(builder, outputs): builder.PrependUOffsetTRelativeSlot(2, flatbuffers.number_types.UOffsetTFlags.py_type(outputs), 0) +def OperatorStartOutputsVector(builder, numElems): return builder.StartVector(4, numElems, 4) +def OperatorAddBuiltinOptionsType(builder, builtinOptionsType): builder.PrependUint8Slot(3, builtinOptionsType, 0) +def OperatorAddBuiltinOptions(builder, builtinOptions): builder.PrependUOffsetTRelativeSlot(4, flatbuffers.number_types.UOffsetTFlags.py_type(builtinOptions), 0) +def OperatorAddCustomOptions(builder, customOptions): builder.PrependUOffsetTRelativeSlot(5, flatbuffers.number_types.UOffsetTFlags.py_type(customOptions), 0) +def OperatorStartCustomOptionsVector(builder, numElems): return builder.StartVector(1, numElems, 1) +def 
OperatorAddCustomOptionsFormat(builder, customOptionsFormat): builder.PrependInt8Slot(6, customOptionsFormat, 0) +def OperatorAddMutatingVariableInputs(builder, mutatingVariableInputs): builder.PrependUOffsetTRelativeSlot(7, flatbuffers.number_types.UOffsetTFlags.py_type(mutatingVariableInputs), 0) +def OperatorStartMutatingVariableInputsVector(builder, numElems): return builder.StartVector(1, numElems, 1) +def OperatorAddIntermediates(builder, intermediates): builder.PrependUOffsetTRelativeSlot(8, flatbuffers.number_types.UOffsetTFlags.py_type(intermediates), 0) +def OperatorStartIntermediatesVector(builder, numElems): return builder.StartVector(4, numElems, 4) +def OperatorEnd(builder): return builder.EndObject() diff --git a/ethosu/vela/tflite/OperatorCode.py b/ethosu/vela/tflite/OperatorCode.py new file mode 100644 index 00000000..dd525f53 --- /dev/null +++ b/ethosu/vela/tflite/OperatorCode.py @@ -0,0 +1,46 @@ +# automatically generated by the FlatBuffers compiler, do not modify + +# namespace: tflite + +import flatbuffers + +class OperatorCode(object): + __slots__ = ['_tab'] + + @classmethod + def GetRootAsOperatorCode(cls, buf, offset): + n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset) + x = OperatorCode() + x.Init(buf, n + offset) + return x + + # OperatorCode + def Init(self, buf, pos): + self._tab = flatbuffers.table.Table(buf, pos) + + # OperatorCode + def BuiltinCode(self): + o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4)) + if o != 0: + return self._tab.Get(flatbuffers.number_types.Int8Flags, o + self._tab.Pos) + return 0 + + # OperatorCode + def CustomCode(self): + o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(6)) + if o != 0: + return self._tab.String(o + self._tab.Pos) + return None + + # OperatorCode + def Version(self): + o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(8)) + if o != 0: + return self._tab.Get(flatbuffers.number_types.Int32Flags, o + self._tab.Pos) + return 1 + +def OperatorCodeStart(builder): builder.StartObject(3) +def OperatorCodeAddBuiltinCode(builder, builtinCode): builder.PrependInt8Slot(0, builtinCode, 0) +def OperatorCodeAddCustomCode(builder, customCode): builder.PrependUOffsetTRelativeSlot(1, flatbuffers.number_types.UOffsetTFlags.py_type(customCode), 0) +def OperatorCodeAddVersion(builder, version): builder.PrependInt32Slot(2, version, 1) +def OperatorCodeEnd(builder): return builder.EndObject() diff --git a/ethosu/vela/tflite/PackOptions.py b/ethosu/vela/tflite/PackOptions.py new file mode 100644 index 00000000..6a8ee2bb --- /dev/null +++ b/ethosu/vela/tflite/PackOptions.py @@ -0,0 +1,38 @@ +# automatically generated by the FlatBuffers compiler, do not modify + +# namespace: tflite + +import flatbuffers + +class PackOptions(object): + __slots__ = ['_tab'] + + @classmethod + def GetRootAsPackOptions(cls, buf, offset): + n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset) + x = PackOptions() + x.Init(buf, n + offset) + return x + + # PackOptions + def Init(self, buf, pos): + self._tab = flatbuffers.table.Table(buf, pos) + + # PackOptions + def ValuesCount(self): + o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4)) + if o != 0: + return self._tab.Get(flatbuffers.number_types.Int32Flags, o + self._tab.Pos) + return 0 + + # PackOptions + def Axis(self): + o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(6)) + if o != 0: + return self._tab.Get(flatbuffers.number_types.Int32Flags, o + self._tab.Pos) + return 0 + 
+def PackOptionsStart(builder): builder.StartObject(2) +def PackOptionsAddValuesCount(builder, valuesCount): builder.PrependInt32Slot(0, valuesCount, 0) +def PackOptionsAddAxis(builder, axis): builder.PrependInt32Slot(1, axis, 0) +def PackOptionsEnd(builder): return builder.EndObject() diff --git a/ethosu/vela/tflite/PadOptions.py b/ethosu/vela/tflite/PadOptions.py new file mode 100644 index 00000000..d0833c68 --- /dev/null +++ b/ethosu/vela/tflite/PadOptions.py @@ -0,0 +1,22 @@ +# automatically generated by the FlatBuffers compiler, do not modify + +# namespace: tflite + +import flatbuffers + +class PadOptions(object): + __slots__ = ['_tab'] + + @classmethod + def GetRootAsPadOptions(cls, buf, offset): + n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset) + x = PadOptions() + x.Init(buf, n + offset) + return x + + # PadOptions + def Init(self, buf, pos): + self._tab = flatbuffers.table.Table(buf, pos) + +def PadOptionsStart(builder): builder.StartObject(0) +def PadOptionsEnd(builder): return builder.EndObject() diff --git a/ethosu/vela/tflite/PadV2Options.py b/ethosu/vela/tflite/PadV2Options.py new file mode 100644 index 00000000..5ea0d70c --- /dev/null +++ b/ethosu/vela/tflite/PadV2Options.py @@ -0,0 +1,22 @@ +# automatically generated by the FlatBuffers compiler, do not modify + +# namespace: tflite + +import flatbuffers + +class PadV2Options(object): + __slots__ = ['_tab'] + + @classmethod + def GetRootAsPadV2Options(cls, buf, offset): + n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset) + x = PadV2Options() + x.Init(buf, n + offset) + return x + + # PadV2Options + def Init(self, buf, pos): + self._tab = flatbuffers.table.Table(buf, pos) + +def PadV2OptionsStart(builder): builder.StartObject(0) +def PadV2OptionsEnd(builder): return builder.EndObject() diff --git a/ethosu/vela/tflite/Padding.py b/ethosu/vela/tflite/Padding.py new file mode 100644 index 00000000..168bf74c --- /dev/null +++ b/ethosu/vela/tflite/Padding.py @@ -0,0 +1,7 @@ +# automatically generated by the FlatBuffers compiler, do not modify + +# namespace: tflite + +class Padding(object): + SAME = 0 + VALID = 1 diff --git a/ethosu/vela/tflite/Pool2DOptions.py b/ethosu/vela/tflite/Pool2DOptions.py new file mode 100644 index 00000000..b8b9f178 --- /dev/null +++ b/ethosu/vela/tflite/Pool2DOptions.py @@ -0,0 +1,70 @@ +# automatically generated by the FlatBuffers compiler, do not modify + +# namespace: tflite + +import flatbuffers + +class Pool2DOptions(object): + __slots__ = ['_tab'] + + @classmethod + def GetRootAsPool2DOptions(cls, buf, offset): + n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset) + x = Pool2DOptions() + x.Init(buf, n + offset) + return x + + # Pool2DOptions + def Init(self, buf, pos): + self._tab = flatbuffers.table.Table(buf, pos) + + # Pool2DOptions + def Padding(self): + o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4)) + if o != 0: + return self._tab.Get(flatbuffers.number_types.Int8Flags, o + self._tab.Pos) + return 0 + + # Pool2DOptions + def StrideW(self): + o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(6)) + if o != 0: + return self._tab.Get(flatbuffers.number_types.Int32Flags, o + self._tab.Pos) + return 0 + + # Pool2DOptions + def StrideH(self): + o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(8)) + if o != 0: + return self._tab.Get(flatbuffers.number_types.Int32Flags, o + self._tab.Pos) + return 0 + + # Pool2DOptions + def FilterWidth(self): + o = 
flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(10)) + if o != 0: + return self._tab.Get(flatbuffers.number_types.Int32Flags, o + self._tab.Pos) + return 0 + + # Pool2DOptions + def FilterHeight(self): + o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(12)) + if o != 0: + return self._tab.Get(flatbuffers.number_types.Int32Flags, o + self._tab.Pos) + return 0 + + # Pool2DOptions + def FusedActivationFunction(self): + o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(14)) + if o != 0: + return self._tab.Get(flatbuffers.number_types.Int8Flags, o + self._tab.Pos) + return 0 + +def Pool2DOptionsStart(builder): builder.StartObject(6) +def Pool2DOptionsAddPadding(builder, padding): builder.PrependInt8Slot(0, padding, 0) +def Pool2DOptionsAddStrideW(builder, strideW): builder.PrependInt32Slot(1, strideW, 0) +def Pool2DOptionsAddStrideH(builder, strideH): builder.PrependInt32Slot(2, strideH, 0) +def Pool2DOptionsAddFilterWidth(builder, filterWidth): builder.PrependInt32Slot(3, filterWidth, 0) +def Pool2DOptionsAddFilterHeight(builder, filterHeight): builder.PrependInt32Slot(4, filterHeight, 0) +def Pool2DOptionsAddFusedActivationFunction(builder, fusedActivationFunction): builder.PrependInt8Slot(5, fusedActivationFunction, 0) +def Pool2DOptionsEnd(builder): return builder.EndObject() diff --git a/ethosu/vela/tflite/PowOptions.py b/ethosu/vela/tflite/PowOptions.py new file mode 100644 index 00000000..666ca488 --- /dev/null +++ b/ethosu/vela/tflite/PowOptions.py @@ -0,0 +1,22 @@ +# automatically generated by the FlatBuffers compiler, do not modify + +# namespace: tflite + +import flatbuffers + +class PowOptions(object): + __slots__ = ['_tab'] + + @classmethod + def GetRootAsPowOptions(cls, buf, offset): + n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset) + x = PowOptions() + x.Init(buf, n + offset) + return x + + # PowOptions + def Init(self, buf, pos): + self._tab = flatbuffers.table.Table(buf, pos) + +def PowOptionsStart(builder): builder.StartObject(0) +def PowOptionsEnd(builder): return builder.EndObject() diff --git a/ethosu/vela/tflite/QuantizationDetails.py b/ethosu/vela/tflite/QuantizationDetails.py new file mode 100644 index 00000000..8d53af96 --- /dev/null +++ b/ethosu/vela/tflite/QuantizationDetails.py @@ -0,0 +1,7 @@ +# automatically generated by the FlatBuffers compiler, do not modify + +# namespace: tflite + +class QuantizationDetails(object): + NONE = 0 + CustomQuantization = 1 diff --git a/ethosu/vela/tflite/QuantizationParameters.py b/ethosu/vela/tflite/QuantizationParameters.py new file mode 100644 index 00000000..fcd686cf --- /dev/null +++ b/ethosu/vela/tflite/QuantizationParameters.py @@ -0,0 +1,145 @@ +# automatically generated by the FlatBuffers compiler, do not modify + +# namespace: tflite + +import flatbuffers + +class QuantizationParameters(object): + __slots__ = ['_tab'] + + @classmethod + def GetRootAsQuantizationParameters(cls, buf, offset): + n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset) + x = QuantizationParameters() + x.Init(buf, n + offset) + return x + + # QuantizationParameters + def Init(self, buf, pos): + self._tab = flatbuffers.table.Table(buf, pos) + + # QuantizationParameters + def Min(self, j): + o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4)) + if o != 0: + a = self._tab.Vector(o) + return self._tab.Get(flatbuffers.number_types.Float32Flags, a + flatbuffers.number_types.UOffsetTFlags.py_type(j * 4)) + return 0 + + # QuantizationParameters + def 
MinAsNumpy(self): + o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4)) + if o != 0: + return self._tab.GetVectorAsNumpy(flatbuffers.number_types.Float32Flags, o) + return 0 + + # QuantizationParameters + def MinLength(self): + o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4)) + if o != 0: + return self._tab.VectorLen(o) + return 0 + + # QuantizationParameters + def Max(self, j): + o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(6)) + if o != 0: + a = self._tab.Vector(o) + return self._tab.Get(flatbuffers.number_types.Float32Flags, a + flatbuffers.number_types.UOffsetTFlags.py_type(j * 4)) + return 0 + + # QuantizationParameters + def MaxAsNumpy(self): + o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(6)) + if o != 0: + return self._tab.GetVectorAsNumpy(flatbuffers.number_types.Float32Flags, o) + return 0 + + # QuantizationParameters + def MaxLength(self): + o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(6)) + if o != 0: + return self._tab.VectorLen(o) + return 0 + + # QuantizationParameters + def Scale(self, j): + o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(8)) + if o != 0: + a = self._tab.Vector(o) + return self._tab.Get(flatbuffers.number_types.Float32Flags, a + flatbuffers.number_types.UOffsetTFlags.py_type(j * 4)) + return 0 + + # QuantizationParameters + def ScaleAsNumpy(self): + o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(8)) + if o != 0: + return self._tab.GetVectorAsNumpy(flatbuffers.number_types.Float32Flags, o) + return 0 + + # QuantizationParameters + def ScaleLength(self): + o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(8)) + if o != 0: + return self._tab.VectorLen(o) + return 0 + + # QuantizationParameters + def ZeroPoint(self, j): + o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(10)) + if o != 0: + a = self._tab.Vector(o) + return self._tab.Get(flatbuffers.number_types.Int64Flags, a + flatbuffers.number_types.UOffsetTFlags.py_type(j * 8)) + return 0 + + # QuantizationParameters + def ZeroPointAsNumpy(self): + o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(10)) + if o != 0: + return self._tab.GetVectorAsNumpy(flatbuffers.number_types.Int64Flags, o) + return 0 + + # QuantizationParameters + def ZeroPointLength(self): + o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(10)) + if o != 0: + return self._tab.VectorLen(o) + return 0 + + # QuantizationParameters + def DetailsType(self): + o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(12)) + if o != 0: + return self._tab.Get(flatbuffers.number_types.Uint8Flags, o + self._tab.Pos) + return 0 + + # QuantizationParameters + def Details(self): + o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(14)) + if o != 0: + from flatbuffers.table import Table + obj = Table(bytearray(), 0) + self._tab.Union(obj, o) + return obj + return None + + # QuantizationParameters + def QuantizedDimension(self): + o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(16)) + if o != 0: + return self._tab.Get(flatbuffers.number_types.Int32Flags, o + self._tab.Pos) + return 0 + +def QuantizationParametersStart(builder): builder.StartObject(7) +def QuantizationParametersAddMin(builder, min): builder.PrependUOffsetTRelativeSlot(0, flatbuffers.number_types.UOffsetTFlags.py_type(min), 0) +def QuantizationParametersStartMinVector(builder, numElems): return builder.StartVector(4, 
numElems, 4) +def QuantizationParametersAddMax(builder, max): builder.PrependUOffsetTRelativeSlot(1, flatbuffers.number_types.UOffsetTFlags.py_type(max), 0) +def QuantizationParametersStartMaxVector(builder, numElems): return builder.StartVector(4, numElems, 4) +def QuantizationParametersAddScale(builder, scale): builder.PrependUOffsetTRelativeSlot(2, flatbuffers.number_types.UOffsetTFlags.py_type(scale), 0) +def QuantizationParametersStartScaleVector(builder, numElems): return builder.StartVector(4, numElems, 4) +def QuantizationParametersAddZeroPoint(builder, zeroPoint): builder.PrependUOffsetTRelativeSlot(3, flatbuffers.number_types.UOffsetTFlags.py_type(zeroPoint), 0) +def QuantizationParametersStartZeroPointVector(builder, numElems): return builder.StartVector(8, numElems, 8) +def QuantizationParametersAddDetailsType(builder, detailsType): builder.PrependUint8Slot(4, detailsType, 0) +def QuantizationParametersAddDetails(builder, details): builder.PrependUOffsetTRelativeSlot(5, flatbuffers.number_types.UOffsetTFlags.py_type(details), 0) +def QuantizationParametersAddQuantizedDimension(builder, quantizedDimension): builder.PrependInt32Slot(6, quantizedDimension, 0) +def QuantizationParametersEnd(builder): return builder.EndObject() diff --git a/ethosu/vela/tflite/QuantizeOptions.py b/ethosu/vela/tflite/QuantizeOptions.py new file mode 100644 index 00000000..28af8cc9 --- /dev/null +++ b/ethosu/vela/tflite/QuantizeOptions.py @@ -0,0 +1,22 @@ +# automatically generated by the FlatBuffers compiler, do not modify + +# namespace: tflite + +import flatbuffers + +class QuantizeOptions(object): + __slots__ = ['_tab'] + + @classmethod + def GetRootAsQuantizeOptions(cls, buf, offset): + n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset) + x = QuantizeOptions() + x.Init(buf, n + offset) + return x + + # QuantizeOptions + def Init(self, buf, pos): + self._tab = flatbuffers.table.Table(buf, pos) + +def QuantizeOptionsStart(builder): builder.StartObject(0) +def QuantizeOptionsEnd(builder): return builder.EndObject() diff --git a/ethosu/vela/tflite/RNNOptions.py b/ethosu/vela/tflite/RNNOptions.py new file mode 100644 index 00000000..3cfdb6af --- /dev/null +++ b/ethosu/vela/tflite/RNNOptions.py @@ -0,0 +1,30 @@ +# automatically generated by the FlatBuffers compiler, do not modify + +# namespace: tflite + +import flatbuffers + +class RNNOptions(object): + __slots__ = ['_tab'] + + @classmethod + def GetRootAsRNNOptions(cls, buf, offset): + n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset) + x = RNNOptions() + x.Init(buf, n + offset) + return x + + # RNNOptions + def Init(self, buf, pos): + self._tab = flatbuffers.table.Table(buf, pos) + + # RNNOptions + def FusedActivationFunction(self): + o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4)) + if o != 0: + return self._tab.Get(flatbuffers.number_types.Int8Flags, o + self._tab.Pos) + return 0 + +def RNNOptionsStart(builder): builder.StartObject(1) +def RNNOptionsAddFusedActivationFunction(builder, fusedActivationFunction): builder.PrependInt8Slot(0, fusedActivationFunction, 0) +def RNNOptionsEnd(builder): return builder.EndObject() diff --git a/ethosu/vela/tflite/RangeOptions.py b/ethosu/vela/tflite/RangeOptions.py new file mode 100644 index 00000000..cb705b57 --- /dev/null +++ b/ethosu/vela/tflite/RangeOptions.py @@ -0,0 +1,22 @@ +# automatically generated by the FlatBuffers compiler, do not modify + +# namespace: tflite + +import flatbuffers + +class RangeOptions(object): + __slots__ = ['_tab'] + + 
@classmethod + def GetRootAsRangeOptions(cls, buf, offset): + n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset) + x = RangeOptions() + x.Init(buf, n + offset) + return x + + # RangeOptions + def Init(self, buf, pos): + self._tab = flatbuffers.table.Table(buf, pos) + +def RangeOptionsStart(builder): builder.StartObject(0) +def RangeOptionsEnd(builder): return builder.EndObject() diff --git a/ethosu/vela/tflite/RankOptions.py b/ethosu/vela/tflite/RankOptions.py new file mode 100644 index 00000000..4e4a5ecd --- /dev/null +++ b/ethosu/vela/tflite/RankOptions.py @@ -0,0 +1,22 @@ +# automatically generated by the FlatBuffers compiler, do not modify + +# namespace: tflite + +import flatbuffers + +class RankOptions(object): + __slots__ = ['_tab'] + + @classmethod + def GetRootAsRankOptions(cls, buf, offset): + n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset) + x = RankOptions() + x.Init(buf, n + offset) + return x + + # RankOptions + def Init(self, buf, pos): + self._tab = flatbuffers.table.Table(buf, pos) + +def RankOptionsStart(builder): builder.StartObject(0) +def RankOptionsEnd(builder): return builder.EndObject() diff --git a/ethosu/vela/tflite/ReducerOptions.py b/ethosu/vela/tflite/ReducerOptions.py new file mode 100644 index 00000000..93bbde17 --- /dev/null +++ b/ethosu/vela/tflite/ReducerOptions.py @@ -0,0 +1,30 @@ +# automatically generated by the FlatBuffers compiler, do not modify + +# namespace: tflite + +import flatbuffers + +class ReducerOptions(object): + __slots__ = ['_tab'] + + @classmethod + def GetRootAsReducerOptions(cls, buf, offset): + n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset) + x = ReducerOptions() + x.Init(buf, n + offset) + return x + + # ReducerOptions + def Init(self, buf, pos): + self._tab = flatbuffers.table.Table(buf, pos) + + # ReducerOptions + def KeepDims(self): + o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4)) + if o != 0: + return bool(self._tab.Get(flatbuffers.number_types.BoolFlags, o + self._tab.Pos)) + return False + +def ReducerOptionsStart(builder): builder.StartObject(1) +def ReducerOptionsAddKeepDims(builder, keepDims): builder.PrependBoolSlot(0, keepDims, 0) +def ReducerOptionsEnd(builder): return builder.EndObject() diff --git a/ethosu/vela/tflite/ReshapeOptions.py b/ethosu/vela/tflite/ReshapeOptions.py new file mode 100644 index 00000000..157d45d9 --- /dev/null +++ b/ethosu/vela/tflite/ReshapeOptions.py @@ -0,0 +1,46 @@ +# automatically generated by the FlatBuffers compiler, do not modify + +# namespace: tflite + +import flatbuffers + +class ReshapeOptions(object): + __slots__ = ['_tab'] + + @classmethod + def GetRootAsReshapeOptions(cls, buf, offset): + n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset) + x = ReshapeOptions() + x.Init(buf, n + offset) + return x + + # ReshapeOptions + def Init(self, buf, pos): + self._tab = flatbuffers.table.Table(buf, pos) + + # ReshapeOptions + def NewShape(self, j): + o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4)) + if o != 0: + a = self._tab.Vector(o) + return self._tab.Get(flatbuffers.number_types.Int32Flags, a + flatbuffers.number_types.UOffsetTFlags.py_type(j * 4)) + return 0 + + # ReshapeOptions + def NewShapeAsNumpy(self): + o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4)) + if o != 0: + return self._tab.GetVectorAsNumpy(flatbuffers.number_types.Int32Flags, o) + return 0 + + # ReshapeOptions + def NewShapeLength(self): + o = 
flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4)) + if o != 0: + return self._tab.VectorLen(o) + return 0 + +def ReshapeOptionsStart(builder): builder.StartObject(1) +def ReshapeOptionsAddNewShape(builder, newShape): builder.PrependUOffsetTRelativeSlot(0, flatbuffers.number_types.UOffsetTFlags.py_type(newShape), 0) +def ReshapeOptionsStartNewShapeVector(builder, numElems): return builder.StartVector(4, numElems, 4) +def ReshapeOptionsEnd(builder): return builder.EndObject() diff --git a/ethosu/vela/tflite/ResizeBilinearOptions.py b/ethosu/vela/tflite/ResizeBilinearOptions.py new file mode 100644 index 00000000..fb05ca4b --- /dev/null +++ b/ethosu/vela/tflite/ResizeBilinearOptions.py @@ -0,0 +1,38 @@ +# automatically generated by the FlatBuffers compiler, do not modify + +# namespace: tflite + +import flatbuffers + +class ResizeBilinearOptions(object): + __slots__ = ['_tab'] + + @classmethod + def GetRootAsResizeBilinearOptions(cls, buf, offset): + n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset) + x = ResizeBilinearOptions() + x.Init(buf, n + offset) + return x + + # ResizeBilinearOptions + def Init(self, buf, pos): + self._tab = flatbuffers.table.Table(buf, pos) + + # ResizeBilinearOptions + def AlignCorners(self): + o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(8)) + if o != 0: + return bool(self._tab.Get(flatbuffers.number_types.BoolFlags, o + self._tab.Pos)) + return False + + # ResizeBilinearOptions + def HalfPixelCenters(self): + o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(10)) + if o != 0: + return bool(self._tab.Get(flatbuffers.number_types.BoolFlags, o + self._tab.Pos)) + return False + +def ResizeBilinearOptionsStart(builder): builder.StartObject(4) +def ResizeBilinearOptionsAddAlignCorners(builder, alignCorners): builder.PrependBoolSlot(2, alignCorners, 0) +def ResizeBilinearOptionsAddHalfPixelCenters(builder, halfPixelCenters): builder.PrependBoolSlot(3, halfPixelCenters, 0) +def ResizeBilinearOptionsEnd(builder): return builder.EndObject() diff --git a/ethosu/vela/tflite/ResizeNearestNeighborOptions.py b/ethosu/vela/tflite/ResizeNearestNeighborOptions.py new file mode 100644 index 00000000..4b166e95 --- /dev/null +++ b/ethosu/vela/tflite/ResizeNearestNeighborOptions.py @@ -0,0 +1,30 @@ +# automatically generated by the FlatBuffers compiler, do not modify + +# namespace: tflite + +import flatbuffers + +class ResizeNearestNeighborOptions(object): + __slots__ = ['_tab'] + + @classmethod + def GetRootAsResizeNearestNeighborOptions(cls, buf, offset): + n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset) + x = ResizeNearestNeighborOptions() + x.Init(buf, n + offset) + return x + + # ResizeNearestNeighborOptions + def Init(self, buf, pos): + self._tab = flatbuffers.table.Table(buf, pos) + + # ResizeNearestNeighborOptions + def AlignCorners(self): + o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4)) + if o != 0: + return bool(self._tab.Get(flatbuffers.number_types.BoolFlags, o + self._tab.Pos)) + return False + +def ResizeNearestNeighborOptionsStart(builder): builder.StartObject(1) +def ResizeNearestNeighborOptionsAddAlignCorners(builder, alignCorners): builder.PrependBoolSlot(0, alignCorners, 0) +def ResizeNearestNeighborOptionsEnd(builder): return builder.EndObject() diff --git a/ethosu/vela/tflite/ReverseSequenceOptions.py b/ethosu/vela/tflite/ReverseSequenceOptions.py new file mode 100644 index 00000000..cbaf96db --- /dev/null +++ 
b/ethosu/vela/tflite/ReverseSequenceOptions.py @@ -0,0 +1,38 @@ +# automatically generated by the FlatBuffers compiler, do not modify + +# namespace: tflite + +import flatbuffers + +class ReverseSequenceOptions(object): + __slots__ = ['_tab'] + + @classmethod + def GetRootAsReverseSequenceOptions(cls, buf, offset): + n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset) + x = ReverseSequenceOptions() + x.Init(buf, n + offset) + return x + + # ReverseSequenceOptions + def Init(self, buf, pos): + self._tab = flatbuffers.table.Table(buf, pos) + + # ReverseSequenceOptions + def SeqDim(self): + o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4)) + if o != 0: + return self._tab.Get(flatbuffers.number_types.Int32Flags, o + self._tab.Pos) + return 0 + + # ReverseSequenceOptions + def BatchDim(self): + o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(6)) + if o != 0: + return self._tab.Get(flatbuffers.number_types.Int32Flags, o + self._tab.Pos) + return 0 + +def ReverseSequenceOptionsStart(builder): builder.StartObject(2) +def ReverseSequenceOptionsAddSeqDim(builder, seqDim): builder.PrependInt32Slot(0, seqDim, 0) +def ReverseSequenceOptionsAddBatchDim(builder, batchDim): builder.PrependInt32Slot(1, batchDim, 0) +def ReverseSequenceOptionsEnd(builder): return builder.EndObject() diff --git a/ethosu/vela/tflite/ReverseV2Options.py b/ethosu/vela/tflite/ReverseV2Options.py new file mode 100644 index 00000000..dbac9362 --- /dev/null +++ b/ethosu/vela/tflite/ReverseV2Options.py @@ -0,0 +1,22 @@ +# automatically generated by the FlatBuffers compiler, do not modify + +# namespace: tflite + +import flatbuffers + +class ReverseV2Options(object): + __slots__ = ['_tab'] + + @classmethod + def GetRootAsReverseV2Options(cls, buf, offset): + n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset) + x = ReverseV2Options() + x.Init(buf, n + offset) + return x + + # ReverseV2Options + def Init(self, buf, pos): + self._tab = flatbuffers.table.Table(buf, pos) + +def ReverseV2OptionsStart(builder): builder.StartObject(0) +def ReverseV2OptionsEnd(builder): return builder.EndObject() diff --git a/ethosu/vela/tflite/SVDFOptions.py b/ethosu/vela/tflite/SVDFOptions.py new file mode 100644 index 00000000..6f391db1 --- /dev/null +++ b/ethosu/vela/tflite/SVDFOptions.py @@ -0,0 +1,38 @@ +# automatically generated by the FlatBuffers compiler, do not modify + +# namespace: tflite + +import flatbuffers + +class SVDFOptions(object): + __slots__ = ['_tab'] + + @classmethod + def GetRootAsSVDFOptions(cls, buf, offset): + n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset) + x = SVDFOptions() + x.Init(buf, n + offset) + return x + + # SVDFOptions + def Init(self, buf, pos): + self._tab = flatbuffers.table.Table(buf, pos) + + # SVDFOptions + def Rank(self): + o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4)) + if o != 0: + return self._tab.Get(flatbuffers.number_types.Int32Flags, o + self._tab.Pos) + return 0 + + # SVDFOptions + def FusedActivationFunction(self): + o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(6)) + if o != 0: + return self._tab.Get(flatbuffers.number_types.Int8Flags, o + self._tab.Pos) + return 0 + +def SVDFOptionsStart(builder): builder.StartObject(2) +def SVDFOptionsAddRank(builder, rank): builder.PrependInt32Slot(0, rank, 0) +def SVDFOptionsAddFusedActivationFunction(builder, fusedActivationFunction): builder.PrependInt8Slot(1, fusedActivationFunction, 0) +def SVDFOptionsEnd(builder): return 
builder.EndObject() diff --git a/ethosu/vela/tflite/ScatterNdOptions.py b/ethosu/vela/tflite/ScatterNdOptions.py new file mode 100644 index 00000000..e6bf3a11 --- /dev/null +++ b/ethosu/vela/tflite/ScatterNdOptions.py @@ -0,0 +1,22 @@ +# automatically generated by the FlatBuffers compiler, do not modify + +# namespace: tflite + +import flatbuffers + +class ScatterNdOptions(object): + __slots__ = ['_tab'] + + @classmethod + def GetRootAsScatterNdOptions(cls, buf, offset): + n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset) + x = ScatterNdOptions() + x.Init(buf, n + offset) + return x + + # ScatterNdOptions + def Init(self, buf, pos): + self._tab = flatbuffers.table.Table(buf, pos) + +def ScatterNdOptionsStart(builder): builder.StartObject(0) +def ScatterNdOptionsEnd(builder): return builder.EndObject() diff --git a/ethosu/vela/tflite/SegmentSumOptions.py b/ethosu/vela/tflite/SegmentSumOptions.py new file mode 100644 index 00000000..d1c32133 --- /dev/null +++ b/ethosu/vela/tflite/SegmentSumOptions.py @@ -0,0 +1,22 @@ +# automatically generated by the FlatBuffers compiler, do not modify + +# namespace: tflite + +import flatbuffers + +class SegmentSumOptions(object): + __slots__ = ['_tab'] + + @classmethod + def GetRootAsSegmentSumOptions(cls, buf, offset): + n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset) + x = SegmentSumOptions() + x.Init(buf, n + offset) + return x + + # SegmentSumOptions + def Init(self, buf, pos): + self._tab = flatbuffers.table.Table(buf, pos) + +def SegmentSumOptionsStart(builder): builder.StartObject(0) +def SegmentSumOptionsEnd(builder): return builder.EndObject() diff --git a/ethosu/vela/tflite/SelectOptions.py b/ethosu/vela/tflite/SelectOptions.py new file mode 100644 index 00000000..d67daf36 --- /dev/null +++ b/ethosu/vela/tflite/SelectOptions.py @@ -0,0 +1,22 @@ +# automatically generated by the FlatBuffers compiler, do not modify + +# namespace: tflite + +import flatbuffers + +class SelectOptions(object): + __slots__ = ['_tab'] + + @classmethod + def GetRootAsSelectOptions(cls, buf, offset): + n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset) + x = SelectOptions() + x.Init(buf, n + offset) + return x + + # SelectOptions + def Init(self, buf, pos): + self._tab = flatbuffers.table.Table(buf, pos) + +def SelectOptionsStart(builder): builder.StartObject(0) +def SelectOptionsEnd(builder): return builder.EndObject() diff --git a/ethosu/vela/tflite/SelectV2Options.py b/ethosu/vela/tflite/SelectV2Options.py new file mode 100644 index 00000000..5d03fc2d --- /dev/null +++ b/ethosu/vela/tflite/SelectV2Options.py @@ -0,0 +1,22 @@ +# automatically generated by the FlatBuffers compiler, do not modify + +# namespace: tflite + +import flatbuffers + +class SelectV2Options(object): + __slots__ = ['_tab'] + + @classmethod + def GetRootAsSelectV2Options(cls, buf, offset): + n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset) + x = SelectV2Options() + x.Init(buf, n + offset) + return x + + # SelectV2Options + def Init(self, buf, pos): + self._tab = flatbuffers.table.Table(buf, pos) + +def SelectV2OptionsStart(builder): builder.StartObject(0) +def SelectV2OptionsEnd(builder): return builder.EndObject() diff --git a/ethosu/vela/tflite/SequenceRNNOptions.py b/ethosu/vela/tflite/SequenceRNNOptions.py new file mode 100644 index 00000000..74a4954a --- /dev/null +++ b/ethosu/vela/tflite/SequenceRNNOptions.py @@ -0,0 +1,38 @@ +# automatically generated by the FlatBuffers compiler, do not modify + +# namespace: tflite + 
+import flatbuffers + +class SequenceRNNOptions(object): + __slots__ = ['_tab'] + + @classmethod + def GetRootAsSequenceRNNOptions(cls, buf, offset): + n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset) + x = SequenceRNNOptions() + x.Init(buf, n + offset) + return x + + # SequenceRNNOptions + def Init(self, buf, pos): + self._tab = flatbuffers.table.Table(buf, pos) + + # SequenceRNNOptions + def TimeMajor(self): + o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4)) + if o != 0: + return bool(self._tab.Get(flatbuffers.number_types.BoolFlags, o + self._tab.Pos)) + return False + + # SequenceRNNOptions + def FusedActivationFunction(self): + o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(6)) + if o != 0: + return self._tab.Get(flatbuffers.number_types.Int8Flags, o + self._tab.Pos) + return 0 + +def SequenceRNNOptionsStart(builder): builder.StartObject(2) +def SequenceRNNOptionsAddTimeMajor(builder, timeMajor): builder.PrependBoolSlot(0, timeMajor, 0) +def SequenceRNNOptionsAddFusedActivationFunction(builder, fusedActivationFunction): builder.PrependInt8Slot(1, fusedActivationFunction, 0) +def SequenceRNNOptionsEnd(builder): return builder.EndObject() diff --git a/ethosu/vela/tflite/ShapeOptions.py b/ethosu/vela/tflite/ShapeOptions.py new file mode 100644 index 00000000..2d24c05f --- /dev/null +++ b/ethosu/vela/tflite/ShapeOptions.py @@ -0,0 +1,30 @@ +# automatically generated by the FlatBuffers compiler, do not modify + +# namespace: tflite + +import flatbuffers + +class ShapeOptions(object): + __slots__ = ['_tab'] + + @classmethod + def GetRootAsShapeOptions(cls, buf, offset): + n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset) + x = ShapeOptions() + x.Init(buf, n + offset) + return x + + # ShapeOptions + def Init(self, buf, pos): + self._tab = flatbuffers.table.Table(buf, pos) + + # ShapeOptions + def OutType(self): + o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4)) + if o != 0: + return self._tab.Get(flatbuffers.number_types.Int8Flags, o + self._tab.Pos) + return 0 + +def ShapeOptionsStart(builder): builder.StartObject(1) +def ShapeOptionsAddOutType(builder, outType): builder.PrependInt8Slot(0, outType, 0) +def ShapeOptionsEnd(builder): return builder.EndObject() diff --git a/ethosu/vela/tflite/SkipGramOptions.py b/ethosu/vela/tflite/SkipGramOptions.py new file mode 100644 index 00000000..0e8bdc1d --- /dev/null +++ b/ethosu/vela/tflite/SkipGramOptions.py @@ -0,0 +1,46 @@ +# automatically generated by the FlatBuffers compiler, do not modify + +# namespace: tflite + +import flatbuffers + +class SkipGramOptions(object): + __slots__ = ['_tab'] + + @classmethod + def GetRootAsSkipGramOptions(cls, buf, offset): + n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset) + x = SkipGramOptions() + x.Init(buf, n + offset) + return x + + # SkipGramOptions + def Init(self, buf, pos): + self._tab = flatbuffers.table.Table(buf, pos) + + # SkipGramOptions + def NgramSize(self): + o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4)) + if o != 0: + return self._tab.Get(flatbuffers.number_types.Int32Flags, o + self._tab.Pos) + return 0 + + # SkipGramOptions + def MaxSkipSize(self): + o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(6)) + if o != 0: + return self._tab.Get(flatbuffers.number_types.Int32Flags, o + self._tab.Pos) + return 0 + + # SkipGramOptions + def IncludeAllNgrams(self): + o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(8)) + 
if o != 0: + return bool(self._tab.Get(flatbuffers.number_types.BoolFlags, o + self._tab.Pos)) + return False + +def SkipGramOptionsStart(builder): builder.StartObject(3) +def SkipGramOptionsAddNgramSize(builder, ngramSize): builder.PrependInt32Slot(0, ngramSize, 0) +def SkipGramOptionsAddMaxSkipSize(builder, maxSkipSize): builder.PrependInt32Slot(1, maxSkipSize, 0) +def SkipGramOptionsAddIncludeAllNgrams(builder, includeAllNgrams): builder.PrependBoolSlot(2, includeAllNgrams, 0) +def SkipGramOptionsEnd(builder): return builder.EndObject() diff --git a/ethosu/vela/tflite/SliceOptions.py b/ethosu/vela/tflite/SliceOptions.py new file mode 100644 index 00000000..4b41568d --- /dev/null +++ b/ethosu/vela/tflite/SliceOptions.py @@ -0,0 +1,22 @@ +# automatically generated by the FlatBuffers compiler, do not modify + +# namespace: tflite + +import flatbuffers + +class SliceOptions(object): + __slots__ = ['_tab'] + + @classmethod + def GetRootAsSliceOptions(cls, buf, offset): + n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset) + x = SliceOptions() + x.Init(buf, n + offset) + return x + + # SliceOptions + def Init(self, buf, pos): + self._tab = flatbuffers.table.Table(buf, pos) + +def SliceOptionsStart(builder): builder.StartObject(0) +def SliceOptionsEnd(builder): return builder.EndObject() diff --git a/ethosu/vela/tflite/SoftmaxOptions.py b/ethosu/vela/tflite/SoftmaxOptions.py new file mode 100644 index 00000000..a7168534 --- /dev/null +++ b/ethosu/vela/tflite/SoftmaxOptions.py @@ -0,0 +1,30 @@ +# automatically generated by the FlatBuffers compiler, do not modify + +# namespace: tflite + +import flatbuffers + +class SoftmaxOptions(object): + __slots__ = ['_tab'] + + @classmethod + def GetRootAsSoftmaxOptions(cls, buf, offset): + n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset) + x = SoftmaxOptions() + x.Init(buf, n + offset) + return x + + # SoftmaxOptions + def Init(self, buf, pos): + self._tab = flatbuffers.table.Table(buf, pos) + + # SoftmaxOptions + def Beta(self): + o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4)) + if o != 0: + return self._tab.Get(flatbuffers.number_types.Float32Flags, o + self._tab.Pos) + return 0.0 + +def SoftmaxOptionsStart(builder): builder.StartObject(1) +def SoftmaxOptionsAddBeta(builder, beta): builder.PrependFloat32Slot(0, beta, 0.0) +def SoftmaxOptionsEnd(builder): return builder.EndObject() diff --git a/ethosu/vela/tflite/SpaceToBatchNDOptions.py b/ethosu/vela/tflite/SpaceToBatchNDOptions.py new file mode 100644 index 00000000..b61ef96f --- /dev/null +++ b/ethosu/vela/tflite/SpaceToBatchNDOptions.py @@ -0,0 +1,22 @@ +# automatically generated by the FlatBuffers compiler, do not modify + +# namespace: tflite + +import flatbuffers + +class SpaceToBatchNDOptions(object): + __slots__ = ['_tab'] + + @classmethod + def GetRootAsSpaceToBatchNDOptions(cls, buf, offset): + n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset) + x = SpaceToBatchNDOptions() + x.Init(buf, n + offset) + return x + + # SpaceToBatchNDOptions + def Init(self, buf, pos): + self._tab = flatbuffers.table.Table(buf, pos) + +def SpaceToBatchNDOptionsStart(builder): builder.StartObject(0) +def SpaceToBatchNDOptionsEnd(builder): return builder.EndObject() diff --git a/ethosu/vela/tflite/SpaceToDepthOptions.py b/ethosu/vela/tflite/SpaceToDepthOptions.py new file mode 100644 index 00000000..d571174a --- /dev/null +++ b/ethosu/vela/tflite/SpaceToDepthOptions.py @@ -0,0 +1,30 @@ +# automatically generated by the FlatBuffers compiler, 
do not modify + +# namespace: tflite + +import flatbuffers + +class SpaceToDepthOptions(object): + __slots__ = ['_tab'] + + @classmethod + def GetRootAsSpaceToDepthOptions(cls, buf, offset): + n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset) + x = SpaceToDepthOptions() + x.Init(buf, n + offset) + return x + + # SpaceToDepthOptions + def Init(self, buf, pos): + self._tab = flatbuffers.table.Table(buf, pos) + + # SpaceToDepthOptions + def BlockSize(self): + o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4)) + if o != 0: + return self._tab.Get(flatbuffers.number_types.Int32Flags, o + self._tab.Pos) + return 0 + +def SpaceToDepthOptionsStart(builder): builder.StartObject(1) +def SpaceToDepthOptionsAddBlockSize(builder, blockSize): builder.PrependInt32Slot(0, blockSize, 0) +def SpaceToDepthOptionsEnd(builder): return builder.EndObject() diff --git a/ethosu/vela/tflite/SparseIndexVector.py b/ethosu/vela/tflite/SparseIndexVector.py new file mode 100644 index 00000000..e2c9db78 --- /dev/null +++ b/ethosu/vela/tflite/SparseIndexVector.py @@ -0,0 +1,9 @@ +# automatically generated by the FlatBuffers compiler, do not modify + +# namespace: tflite + +class SparseIndexVector(object): + NONE = 0 + Int32Vector = 1 + Uint16Vector = 2 + Uint8Vector = 3 diff --git a/ethosu/vela/tflite/SparseToDenseOptions.py b/ethosu/vela/tflite/SparseToDenseOptions.py new file mode 100644 index 00000000..826eee08 --- /dev/null +++ b/ethosu/vela/tflite/SparseToDenseOptions.py @@ -0,0 +1,30 @@ +# automatically generated by the FlatBuffers compiler, do not modify + +# namespace: tflite + +import flatbuffers + +class SparseToDenseOptions(object): + __slots__ = ['_tab'] + + @classmethod + def GetRootAsSparseToDenseOptions(cls, buf, offset): + n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset) + x = SparseToDenseOptions() + x.Init(buf, n + offset) + return x + + # SparseToDenseOptions + def Init(self, buf, pos): + self._tab = flatbuffers.table.Table(buf, pos) + + # SparseToDenseOptions + def ValidateIndices(self): + o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4)) + if o != 0: + return bool(self._tab.Get(flatbuffers.number_types.BoolFlags, o + self._tab.Pos)) + return False + +def SparseToDenseOptionsStart(builder): builder.StartObject(1) +def SparseToDenseOptionsAddValidateIndices(builder, validateIndices): builder.PrependBoolSlot(0, validateIndices, 0) +def SparseToDenseOptionsEnd(builder): return builder.EndObject() diff --git a/ethosu/vela/tflite/SparsityParameters.py b/ethosu/vela/tflite/SparsityParameters.py new file mode 100644 index 00000000..de550a67 --- /dev/null +++ b/ethosu/vela/tflite/SparsityParameters.py @@ -0,0 +1,92 @@ +# automatically generated by the FlatBuffers compiler, do not modify + +# namespace: tflite + +import flatbuffers + +class SparsityParameters(object): + __slots__ = ['_tab'] + + @classmethod + def GetRootAsSparsityParameters(cls, buf, offset): + n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset) + x = SparsityParameters() + x.Init(buf, n + offset) + return x + + # SparsityParameters + def Init(self, buf, pos): + self._tab = flatbuffers.table.Table(buf, pos) + + # SparsityParameters + def TraversalOrder(self, j): + o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4)) + if o != 0: + a = self._tab.Vector(o) + return self._tab.Get(flatbuffers.number_types.Int32Flags, a + flatbuffers.number_types.UOffsetTFlags.py_type(j * 4)) + return 0 + + # SparsityParameters + def 
TraversalOrderAsNumpy(self): + o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4)) + if o != 0: + return self._tab.GetVectorAsNumpy(flatbuffers.number_types.Int32Flags, o) + return 0 + + # SparsityParameters + def TraversalOrderLength(self): + o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4)) + if o != 0: + return self._tab.VectorLen(o) + return 0 + + # SparsityParameters + def BlockMap(self, j): + o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(6)) + if o != 0: + a = self._tab.Vector(o) + return self._tab.Get(flatbuffers.number_types.Int32Flags, a + flatbuffers.number_types.UOffsetTFlags.py_type(j * 4)) + return 0 + + # SparsityParameters + def BlockMapAsNumpy(self): + o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(6)) + if o != 0: + return self._tab.GetVectorAsNumpy(flatbuffers.number_types.Int32Flags, o) + return 0 + + # SparsityParameters + def BlockMapLength(self): + o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(6)) + if o != 0: + return self._tab.VectorLen(o) + return 0 + + # SparsityParameters + def DimMetadata(self, j): + o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(8)) + if o != 0: + x = self._tab.Vector(o) + x += flatbuffers.number_types.UOffsetTFlags.py_type(j) * 4 + x = self._tab.Indirect(x) + from .DimensionMetadata import DimensionMetadata + obj = DimensionMetadata() + obj.Init(self._tab.Bytes, x) + return obj + return None + + # SparsityParameters + def DimMetadataLength(self): + o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(8)) + if o != 0: + return self._tab.VectorLen(o) + return 0 + +def SparsityParametersStart(builder): builder.StartObject(3) +def SparsityParametersAddTraversalOrder(builder, traversalOrder): builder.PrependUOffsetTRelativeSlot(0, flatbuffers.number_types.UOffsetTFlags.py_type(traversalOrder), 0) +def SparsityParametersStartTraversalOrderVector(builder, numElems): return builder.StartVector(4, numElems, 4) +def SparsityParametersAddBlockMap(builder, blockMap): builder.PrependUOffsetTRelativeSlot(1, flatbuffers.number_types.UOffsetTFlags.py_type(blockMap), 0) +def SparsityParametersStartBlockMapVector(builder, numElems): return builder.StartVector(4, numElems, 4) +def SparsityParametersAddDimMetadata(builder, dimMetadata): builder.PrependUOffsetTRelativeSlot(2, flatbuffers.number_types.UOffsetTFlags.py_type(dimMetadata), 0) +def SparsityParametersStartDimMetadataVector(builder, numElems): return builder.StartVector(4, numElems, 4) +def SparsityParametersEnd(builder): return builder.EndObject() diff --git a/ethosu/vela/tflite/SplitOptions.py b/ethosu/vela/tflite/SplitOptions.py new file mode 100644 index 00000000..3207525b --- /dev/null +++ b/ethosu/vela/tflite/SplitOptions.py @@ -0,0 +1,30 @@ +# automatically generated by the FlatBuffers compiler, do not modify + +# namespace: tflite + +import flatbuffers + +class SplitOptions(object): + __slots__ = ['_tab'] + + @classmethod + def GetRootAsSplitOptions(cls, buf, offset): + n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset) + x = SplitOptions() + x.Init(buf, n + offset) + return x + + # SplitOptions + def Init(self, buf, pos): + self._tab = flatbuffers.table.Table(buf, pos) + + # SplitOptions + def NumSplits(self): + o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4)) + if o != 0: + return self._tab.Get(flatbuffers.number_types.Int32Flags, o + self._tab.Pos) + return 0 + +def SplitOptionsStart(builder): 
builder.StartObject(1) +def SplitOptionsAddNumSplits(builder, numSplits): builder.PrependInt32Slot(0, numSplits, 0) +def SplitOptionsEnd(builder): return builder.EndObject() diff --git a/ethosu/vela/tflite/SplitVOptions.py b/ethosu/vela/tflite/SplitVOptions.py new file mode 100644 index 00000000..418959de --- /dev/null +++ b/ethosu/vela/tflite/SplitVOptions.py @@ -0,0 +1,30 @@ +# automatically generated by the FlatBuffers compiler, do not modify + +# namespace: tflite + +import flatbuffers + +class SplitVOptions(object): + __slots__ = ['_tab'] + + @classmethod + def GetRootAsSplitVOptions(cls, buf, offset): + n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset) + x = SplitVOptions() + x.Init(buf, n + offset) + return x + + # SplitVOptions + def Init(self, buf, pos): + self._tab = flatbuffers.table.Table(buf, pos) + + # SplitVOptions + def NumSplits(self): + o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4)) + if o != 0: + return self._tab.Get(flatbuffers.number_types.Int32Flags, o + self._tab.Pos) + return 0 + +def SplitVOptionsStart(builder): builder.StartObject(1) +def SplitVOptionsAddNumSplits(builder, numSplits): builder.PrependInt32Slot(0, numSplits, 0) +def SplitVOptionsEnd(builder): return builder.EndObject() diff --git a/ethosu/vela/tflite/SquareOptions.py b/ethosu/vela/tflite/SquareOptions.py new file mode 100644 index 00000000..56633f6a --- /dev/null +++ b/ethosu/vela/tflite/SquareOptions.py @@ -0,0 +1,22 @@ +# automatically generated by the FlatBuffers compiler, do not modify + +# namespace: tflite + +import flatbuffers + +class SquareOptions(object): + __slots__ = ['_tab'] + + @classmethod + def GetRootAsSquareOptions(cls, buf, offset): + n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset) + x = SquareOptions() + x.Init(buf, n + offset) + return x + + # SquareOptions + def Init(self, buf, pos): + self._tab = flatbuffers.table.Table(buf, pos) + +def SquareOptionsStart(builder): builder.StartObject(0) +def SquareOptionsEnd(builder): return builder.EndObject() diff --git a/ethosu/vela/tflite/SquaredDifferenceOptions.py b/ethosu/vela/tflite/SquaredDifferenceOptions.py new file mode 100644 index 00000000..906855d1 --- /dev/null +++ b/ethosu/vela/tflite/SquaredDifferenceOptions.py @@ -0,0 +1,22 @@ +# automatically generated by the FlatBuffers compiler, do not modify + +# namespace: tflite + +import flatbuffers + +class SquaredDifferenceOptions(object): + __slots__ = ['_tab'] + + @classmethod + def GetRootAsSquaredDifferenceOptions(cls, buf, offset): + n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset) + x = SquaredDifferenceOptions() + x.Init(buf, n + offset) + return x + + # SquaredDifferenceOptions + def Init(self, buf, pos): + self._tab = flatbuffers.table.Table(buf, pos) + +def SquaredDifferenceOptionsStart(builder): builder.StartObject(0) +def SquaredDifferenceOptionsEnd(builder): return builder.EndObject() diff --git a/ethosu/vela/tflite/SqueezeOptions.py b/ethosu/vela/tflite/SqueezeOptions.py new file mode 100644 index 00000000..25b294dc --- /dev/null +++ b/ethosu/vela/tflite/SqueezeOptions.py @@ -0,0 +1,46 @@ +# automatically generated by the FlatBuffers compiler, do not modify + +# namespace: tflite + +import flatbuffers + +class SqueezeOptions(object): + __slots__ = ['_tab'] + + @classmethod + def GetRootAsSqueezeOptions(cls, buf, offset): + n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset) + x = SqueezeOptions() + x.Init(buf, n + offset) + return x + + # SqueezeOptions + def Init(self, 
buf, pos): + self._tab = flatbuffers.table.Table(buf, pos) + + # SqueezeOptions + def SqueezeDims(self, j): + o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4)) + if o != 0: + a = self._tab.Vector(o) + return self._tab.Get(flatbuffers.number_types.Int32Flags, a + flatbuffers.number_types.UOffsetTFlags.py_type(j * 4)) + return 0 + + # SqueezeOptions + def SqueezeDimsAsNumpy(self): + o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4)) + if o != 0: + return self._tab.GetVectorAsNumpy(flatbuffers.number_types.Int32Flags, o) + return 0 + + # SqueezeOptions + def SqueezeDimsLength(self): + o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4)) + if o != 0: + return self._tab.VectorLen(o) + return 0 + +def SqueezeOptionsStart(builder): builder.StartObject(1) +def SqueezeOptionsAddSqueezeDims(builder, squeezeDims): builder.PrependUOffsetTRelativeSlot(0, flatbuffers.number_types.UOffsetTFlags.py_type(squeezeDims), 0) +def SqueezeOptionsStartSqueezeDimsVector(builder, numElems): return builder.StartVector(4, numElems, 4) +def SqueezeOptionsEnd(builder): return builder.EndObject() diff --git a/ethosu/vela/tflite/StridedSliceOptions.py b/ethosu/vela/tflite/StridedSliceOptions.py new file mode 100644 index 00000000..3bbb36b8 --- /dev/null +++ b/ethosu/vela/tflite/StridedSliceOptions.py @@ -0,0 +1,62 @@ +# automatically generated by the FlatBuffers compiler, do not modify + +# namespace: tflite + +import flatbuffers + +class StridedSliceOptions(object): + __slots__ = ['_tab'] + + @classmethod + def GetRootAsStridedSliceOptions(cls, buf, offset): + n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset) + x = StridedSliceOptions() + x.Init(buf, n + offset) + return x + + # StridedSliceOptions + def Init(self, buf, pos): + self._tab = flatbuffers.table.Table(buf, pos) + + # StridedSliceOptions + def BeginMask(self): + o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4)) + if o != 0: + return self._tab.Get(flatbuffers.number_types.Int32Flags, o + self._tab.Pos) + return 0 + + # StridedSliceOptions + def EndMask(self): + o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(6)) + if o != 0: + return self._tab.Get(flatbuffers.number_types.Int32Flags, o + self._tab.Pos) + return 0 + + # StridedSliceOptions + def EllipsisMask(self): + o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(8)) + if o != 0: + return self._tab.Get(flatbuffers.number_types.Int32Flags, o + self._tab.Pos) + return 0 + + # StridedSliceOptions + def NewAxisMask(self): + o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(10)) + if o != 0: + return self._tab.Get(flatbuffers.number_types.Int32Flags, o + self._tab.Pos) + return 0 + + # StridedSliceOptions + def ShrinkAxisMask(self): + o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(12)) + if o != 0: + return self._tab.Get(flatbuffers.number_types.Int32Flags, o + self._tab.Pos) + return 0 + +def StridedSliceOptionsStart(builder): builder.StartObject(5) +def StridedSliceOptionsAddBeginMask(builder, beginMask): builder.PrependInt32Slot(0, beginMask, 0) +def StridedSliceOptionsAddEndMask(builder, endMask): builder.PrependInt32Slot(1, endMask, 0) +def StridedSliceOptionsAddEllipsisMask(builder, ellipsisMask): builder.PrependInt32Slot(2, ellipsisMask, 0) +def StridedSliceOptionsAddNewAxisMask(builder, newAxisMask): builder.PrependInt32Slot(3, newAxisMask, 0) +def StridedSliceOptionsAddShrinkAxisMask(builder, shrinkAxisMask): 
builder.PrependInt32Slot(4, shrinkAxisMask, 0) +def StridedSliceOptionsEnd(builder): return builder.EndObject() diff --git a/ethosu/vela/tflite/SubGraph.py b/ethosu/vela/tflite/SubGraph.py new file mode 100644 index 00000000..eaa42fac --- /dev/null +++ b/ethosu/vela/tflite/SubGraph.py @@ -0,0 +1,122 @@ +# automatically generated by the FlatBuffers compiler, do not modify + +# namespace: tflite + +import flatbuffers + +class SubGraph(object): + __slots__ = ['_tab'] + + @classmethod + def GetRootAsSubGraph(cls, buf, offset): + n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset) + x = SubGraph() + x.Init(buf, n + offset) + return x + + # SubGraph + def Init(self, buf, pos): + self._tab = flatbuffers.table.Table(buf, pos) + + # SubGraph + def Tensors(self, j): + o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4)) + if o != 0: + x = self._tab.Vector(o) + x += flatbuffers.number_types.UOffsetTFlags.py_type(j) * 4 + x = self._tab.Indirect(x) + from .Tensor import Tensor + obj = Tensor() + obj.Init(self._tab.Bytes, x) + return obj + return None + + # SubGraph + def TensorsLength(self): + o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4)) + if o != 0: + return self._tab.VectorLen(o) + return 0 + + # SubGraph + def Inputs(self, j): + o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(6)) + if o != 0: + a = self._tab.Vector(o) + return self._tab.Get(flatbuffers.number_types.Int32Flags, a + flatbuffers.number_types.UOffsetTFlags.py_type(j * 4)) + return 0 + + # SubGraph + def InputsAsNumpy(self): + o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(6)) + if o != 0: + return self._tab.GetVectorAsNumpy(flatbuffers.number_types.Int32Flags, o) + return 0 + + # SubGraph + def InputsLength(self): + o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(6)) + if o != 0: + return self._tab.VectorLen(o) + return 0 + + # SubGraph + def Outputs(self, j): + o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(8)) + if o != 0: + a = self._tab.Vector(o) + return self._tab.Get(flatbuffers.number_types.Int32Flags, a + flatbuffers.number_types.UOffsetTFlags.py_type(j * 4)) + return 0 + + # SubGraph + def OutputsAsNumpy(self): + o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(8)) + if o != 0: + return self._tab.GetVectorAsNumpy(flatbuffers.number_types.Int32Flags, o) + return 0 + + # SubGraph + def OutputsLength(self): + o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(8)) + if o != 0: + return self._tab.VectorLen(o) + return 0 + + # SubGraph + def Operators(self, j): + o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(10)) + if o != 0: + x = self._tab.Vector(o) + x += flatbuffers.number_types.UOffsetTFlags.py_type(j) * 4 + x = self._tab.Indirect(x) + from .Operator import Operator + obj = Operator() + obj.Init(self._tab.Bytes, x) + return obj + return None + + # SubGraph + def OperatorsLength(self): + o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(10)) + if o != 0: + return self._tab.VectorLen(o) + return 0 + + # SubGraph + def Name(self): + o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(12)) + if o != 0: + return self._tab.String(o + self._tab.Pos) + return None + +def SubGraphStart(builder): builder.StartObject(5) +def SubGraphAddTensors(builder, tensors): builder.PrependUOffsetTRelativeSlot(0, flatbuffers.number_types.UOffsetTFlags.py_type(tensors), 0) +def SubGraphStartTensorsVector(builder, 
numElems): return builder.StartVector(4, numElems, 4) +def SubGraphAddInputs(builder, inputs): builder.PrependUOffsetTRelativeSlot(1, flatbuffers.number_types.UOffsetTFlags.py_type(inputs), 0) +def SubGraphStartInputsVector(builder, numElems): return builder.StartVector(4, numElems, 4) +def SubGraphAddOutputs(builder, outputs): builder.PrependUOffsetTRelativeSlot(2, flatbuffers.number_types.UOffsetTFlags.py_type(outputs), 0) +def SubGraphStartOutputsVector(builder, numElems): return builder.StartVector(4, numElems, 4) +def SubGraphAddOperators(builder, operators): builder.PrependUOffsetTRelativeSlot(3, flatbuffers.number_types.UOffsetTFlags.py_type(operators), 0) +def SubGraphStartOperatorsVector(builder, numElems): return builder.StartVector(4, numElems, 4) +def SubGraphAddName(builder, name): builder.PrependUOffsetTRelativeSlot(4, flatbuffers.number_types.UOffsetTFlags.py_type(name), 0) +def SubGraphEnd(builder): return builder.EndObject() diff --git a/ethosu/vela/tflite/SubOptions.py b/ethosu/vela/tflite/SubOptions.py new file mode 100644 index 00000000..eccd7aba --- /dev/null +++ b/ethosu/vela/tflite/SubOptions.py @@ -0,0 +1,30 @@ +# automatically generated by the FlatBuffers compiler, do not modify + +# namespace: tflite + +import flatbuffers + +class SubOptions(object): + __slots__ = ['_tab'] + + @classmethod + def GetRootAsSubOptions(cls, buf, offset): + n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset) + x = SubOptions() + x.Init(buf, n + offset) + return x + + # SubOptions + def Init(self, buf, pos): + self._tab = flatbuffers.table.Table(buf, pos) + + # SubOptions + def FusedActivationFunction(self): + o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4)) + if o != 0: + return self._tab.Get(flatbuffers.number_types.Int8Flags, o + self._tab.Pos) + return 0 + +def SubOptionsStart(builder): builder.StartObject(1) +def SubOptionsAddFusedActivationFunction(builder, fusedActivationFunction): builder.PrependInt8Slot(0, fusedActivationFunction, 0) +def SubOptionsEnd(builder): return builder.EndObject() diff --git a/ethosu/vela/tflite/Tensor.py b/ethosu/vela/tflite/Tensor.py new file mode 100644 index 00000000..4c39b7cb --- /dev/null +++ b/ethosu/vela/tflite/Tensor.py @@ -0,0 +1,126 @@ +# automatically generated by the FlatBuffers compiler, do not modify + +# namespace: tflite + +import flatbuffers + +class Tensor(object): + __slots__ = ['_tab'] + + @classmethod + def GetRootAsTensor(cls, buf, offset): + n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset) + x = Tensor() + x.Init(buf, n + offset) + return x + + # Tensor + def Init(self, buf, pos): + self._tab = flatbuffers.table.Table(buf, pos) + + # Tensor + def Shape(self, j): + o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4)) + if o != 0: + a = self._tab.Vector(o) + return self._tab.Get(flatbuffers.number_types.Int32Flags, a + flatbuffers.number_types.UOffsetTFlags.py_type(j * 4)) + return 0 + + # Tensor + def ShapeAsNumpy(self): + o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4)) + if o != 0: + return self._tab.GetVectorAsNumpy(flatbuffers.number_types.Int32Flags, o) + return 0 + + # Tensor + def ShapeLength(self): + o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4)) + if o != 0: + return self._tab.VectorLen(o) + return 0 + + # Tensor + def Type(self): + o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(6)) + if o != 0: + return self._tab.Get(flatbuffers.number_types.Int8Flags, o + self._tab.Pos) + 
return 0 + + # Tensor + def Buffer(self): + o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(8)) + if o != 0: + return self._tab.Get(flatbuffers.number_types.Uint32Flags, o + self._tab.Pos) + return 0 + + # Tensor + def Name(self): + o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(10)) + if o != 0: + return self._tab.String(o + self._tab.Pos) + return None + + # Tensor + def Quantization(self): + o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(12)) + if o != 0: + x = self._tab.Indirect(o + self._tab.Pos) + from .QuantizationParameters import QuantizationParameters + obj = QuantizationParameters() + obj.Init(self._tab.Bytes, x) + return obj + return None + + # Tensor + def IsVariable(self): + o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(14)) + if o != 0: + return bool(self._tab.Get(flatbuffers.number_types.BoolFlags, o + self._tab.Pos)) + return False + + # Tensor + def Sparsity(self): + o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(16)) + if o != 0: + x = self._tab.Indirect(o + self._tab.Pos) + from .SparsityParameters import SparsityParameters + obj = SparsityParameters() + obj.Init(self._tab.Bytes, x) + return obj + return None + + # Tensor + def ShapeSignature(self, j): + o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(18)) + if o != 0: + a = self._tab.Vector(o) + return self._tab.Get(flatbuffers.number_types.Int32Flags, a + flatbuffers.number_types.UOffsetTFlags.py_type(j * 4)) + return 0 + + # Tensor + def ShapeSignatureAsNumpy(self): + o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(18)) + if o != 0: + return self._tab.GetVectorAsNumpy(flatbuffers.number_types.Int32Flags, o) + return 0 + + # Tensor + def ShapeSignatureLength(self): + o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(18)) + if o != 0: + return self._tab.VectorLen(o) + return 0 + +def TensorStart(builder): builder.StartObject(8) +def TensorAddShape(builder, shape): builder.PrependUOffsetTRelativeSlot(0, flatbuffers.number_types.UOffsetTFlags.py_type(shape), 0) +def TensorStartShapeVector(builder, numElems): return builder.StartVector(4, numElems, 4) +def TensorAddType(builder, type): builder.PrependInt8Slot(1, type, 0) +def TensorAddBuffer(builder, buffer): builder.PrependUint32Slot(2, buffer, 0) +def TensorAddName(builder, name): builder.PrependUOffsetTRelativeSlot(3, flatbuffers.number_types.UOffsetTFlags.py_type(name), 0) +def TensorAddQuantization(builder, quantization): builder.PrependUOffsetTRelativeSlot(4, flatbuffers.number_types.UOffsetTFlags.py_type(quantization), 0) +def TensorAddIsVariable(builder, isVariable): builder.PrependBoolSlot(5, isVariable, 0) +def TensorAddSparsity(builder, sparsity): builder.PrependUOffsetTRelativeSlot(6, flatbuffers.number_types.UOffsetTFlags.py_type(sparsity), 0) +def TensorAddShapeSignature(builder, shapeSignature): builder.PrependUOffsetTRelativeSlot(7, flatbuffers.number_types.UOffsetTFlags.py_type(shapeSignature), 0) +def TensorStartShapeSignatureVector(builder, numElems): return builder.StartVector(4, numElems, 4) +def TensorEnd(builder): return builder.EndObject() diff --git a/ethosu/vela/tflite/TensorType.py b/ethosu/vela/tflite/TensorType.py new file mode 100644 index 00000000..53c011bc --- /dev/null +++ b/ethosu/vela/tflite/TensorType.py @@ -0,0 +1,15 @@ +# automatically generated by the FlatBuffers compiler, do not modify + +# namespace: tflite + +class TensorType(object): + FLOAT32 = 0 + FLOAT16 = 1 + INT32 
= 2 + UINT8 = 3 + INT64 = 4 + STRING = 5 + BOOL = 6 + INT16 = 7 + COMPLEX64 = 8 + INT8 = 9 diff --git a/ethosu/vela/tflite/TileOptions.py b/ethosu/vela/tflite/TileOptions.py new file mode 100644 index 00000000..ec8396dc --- /dev/null +++ b/ethosu/vela/tflite/TileOptions.py @@ -0,0 +1,22 @@ +# automatically generated by the FlatBuffers compiler, do not modify + +# namespace: tflite + +import flatbuffers + +class TileOptions(object): + __slots__ = ['_tab'] + + @classmethod + def GetRootAsTileOptions(cls, buf, offset): + n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset) + x = TileOptions() + x.Init(buf, n + offset) + return x + + # TileOptions + def Init(self, buf, pos): + self._tab = flatbuffers.table.Table(buf, pos) + +def TileOptionsStart(builder): builder.StartObject(0) +def TileOptionsEnd(builder): return builder.EndObject() diff --git a/ethosu/vela/tflite/TopKV2Options.py b/ethosu/vela/tflite/TopKV2Options.py new file mode 100644 index 00000000..ccd51033 --- /dev/null +++ b/ethosu/vela/tflite/TopKV2Options.py @@ -0,0 +1,22 @@ +# automatically generated by the FlatBuffers compiler, do not modify + +# namespace: tflite + +import flatbuffers + +class TopKV2Options(object): + __slots__ = ['_tab'] + + @classmethod + def GetRootAsTopKV2Options(cls, buf, offset): + n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset) + x = TopKV2Options() + x.Init(buf, n + offset) + return x + + # TopKV2Options + def Init(self, buf, pos): + self._tab = flatbuffers.table.Table(buf, pos) + +def TopKV2OptionsStart(builder): builder.StartObject(0) +def TopKV2OptionsEnd(builder): return builder.EndObject() diff --git a/ethosu/vela/tflite/TransposeConvOptions.py b/ethosu/vela/tflite/TransposeConvOptions.py new file mode 100644 index 00000000..423571c8 --- /dev/null +++ b/ethosu/vela/tflite/TransposeConvOptions.py @@ -0,0 +1,46 @@ +# automatically generated by the FlatBuffers compiler, do not modify + +# namespace: tflite + +import flatbuffers + +class TransposeConvOptions(object): + __slots__ = ['_tab'] + + @classmethod + def GetRootAsTransposeConvOptions(cls, buf, offset): + n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset) + x = TransposeConvOptions() + x.Init(buf, n + offset) + return x + + # TransposeConvOptions + def Init(self, buf, pos): + self._tab = flatbuffers.table.Table(buf, pos) + + # TransposeConvOptions + def Padding(self): + o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4)) + if o != 0: + return self._tab.Get(flatbuffers.number_types.Int8Flags, o + self._tab.Pos) + return 0 + + # TransposeConvOptions + def StrideW(self): + o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(6)) + if o != 0: + return self._tab.Get(flatbuffers.number_types.Int32Flags, o + self._tab.Pos) + return 0 + + # TransposeConvOptions + def StrideH(self): + o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(8)) + if o != 0: + return self._tab.Get(flatbuffers.number_types.Int32Flags, o + self._tab.Pos) + return 0 + +def TransposeConvOptionsStart(builder): builder.StartObject(3) +def TransposeConvOptionsAddPadding(builder, padding): builder.PrependInt8Slot(0, padding, 0) +def TransposeConvOptionsAddStrideW(builder, strideW): builder.PrependInt32Slot(1, strideW, 0) +def TransposeConvOptionsAddStrideH(builder, strideH): builder.PrependInt32Slot(2, strideH, 0) +def TransposeConvOptionsEnd(builder): return builder.EndObject() diff --git a/ethosu/vela/tflite/TransposeOptions.py b/ethosu/vela/tflite/TransposeOptions.py new file mode 
100644 index 00000000..42c596d9 --- /dev/null +++ b/ethosu/vela/tflite/TransposeOptions.py @@ -0,0 +1,22 @@ +# automatically generated by the FlatBuffers compiler, do not modify + +# namespace: tflite + +import flatbuffers + +class TransposeOptions(object): + __slots__ = ['_tab'] + + @classmethod + def GetRootAsTransposeOptions(cls, buf, offset): + n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset) + x = TransposeOptions() + x.Init(buf, n + offset) + return x + + # TransposeOptions + def Init(self, buf, pos): + self._tab = flatbuffers.table.Table(buf, pos) + +def TransposeOptionsStart(builder): builder.StartObject(0) +def TransposeOptionsEnd(builder): return builder.EndObject() diff --git a/ethosu/vela/tflite/Uint16Vector.py b/ethosu/vela/tflite/Uint16Vector.py new file mode 100644 index 00000000..750e52a4 --- /dev/null +++ b/ethosu/vela/tflite/Uint16Vector.py @@ -0,0 +1,46 @@ +# automatically generated by the FlatBuffers compiler, do not modify + +# namespace: tflite + +import flatbuffers + +class Uint16Vector(object): + __slots__ = ['_tab'] + + @classmethod + def GetRootAsUint16Vector(cls, buf, offset): + n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset) + x = Uint16Vector() + x.Init(buf, n + offset) + return x + + # Uint16Vector + def Init(self, buf, pos): + self._tab = flatbuffers.table.Table(buf, pos) + + # Uint16Vector + def Values(self, j): + o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4)) + if o != 0: + a = self._tab.Vector(o) + return self._tab.Get(flatbuffers.number_types.Uint16Flags, a + flatbuffers.number_types.UOffsetTFlags.py_type(j * 2)) + return 0 + + # Uint16Vector + def ValuesAsNumpy(self): + o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4)) + if o != 0: + return self._tab.GetVectorAsNumpy(flatbuffers.number_types.Uint16Flags, o) + return 0 + + # Uint16Vector + def ValuesLength(self): + o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4)) + if o != 0: + return self._tab.VectorLen(o) + return 0 + +def Uint16VectorStart(builder): builder.StartObject(1) +def Uint16VectorAddValues(builder, values): builder.PrependUOffsetTRelativeSlot(0, flatbuffers.number_types.UOffsetTFlags.py_type(values), 0) +def Uint16VectorStartValuesVector(builder, numElems): return builder.StartVector(2, numElems, 2) +def Uint16VectorEnd(builder): return builder.EndObject() diff --git a/ethosu/vela/tflite/Uint8Vector.py b/ethosu/vela/tflite/Uint8Vector.py new file mode 100644 index 00000000..dc475f9f --- /dev/null +++ b/ethosu/vela/tflite/Uint8Vector.py @@ -0,0 +1,46 @@ +# automatically generated by the FlatBuffers compiler, do not modify + +# namespace: tflite + +import flatbuffers + +class Uint8Vector(object): + __slots__ = ['_tab'] + + @classmethod + def GetRootAsUint8Vector(cls, buf, offset): + n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset) + x = Uint8Vector() + x.Init(buf, n + offset) + return x + + # Uint8Vector + def Init(self, buf, pos): + self._tab = flatbuffers.table.Table(buf, pos) + + # Uint8Vector + def Values(self, j): + o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4)) + if o != 0: + a = self._tab.Vector(o) + return self._tab.Get(flatbuffers.number_types.Uint8Flags, a + flatbuffers.number_types.UOffsetTFlags.py_type(j * 1)) + return 0 + + # Uint8Vector + def ValuesAsNumpy(self): + o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4)) + if o != 0: + return self._tab.GetVectorAsNumpy(flatbuffers.number_types.Uint8Flags, o) + return 
0 + + # Uint8Vector + def ValuesLength(self): + o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4)) + if o != 0: + return self._tab.VectorLen(o) + return 0 + +def Uint8VectorStart(builder): builder.StartObject(1) +def Uint8VectorAddValues(builder, values): builder.PrependUOffsetTRelativeSlot(0, flatbuffers.number_types.UOffsetTFlags.py_type(values), 0) +def Uint8VectorStartValuesVector(builder, numElems): return builder.StartVector(1, numElems, 1) +def Uint8VectorEnd(builder): return builder.EndObject() diff --git a/ethosu/vela/tflite/UnidirectionalSequenceLSTMOptions.py b/ethosu/vela/tflite/UnidirectionalSequenceLSTMOptions.py new file mode 100644 index 00000000..1b0c112c --- /dev/null +++ b/ethosu/vela/tflite/UnidirectionalSequenceLSTMOptions.py @@ -0,0 +1,54 @@ +# automatically generated by the FlatBuffers compiler, do not modify + +# namespace: tflite + +import flatbuffers + +class UnidirectionalSequenceLSTMOptions(object): + __slots__ = ['_tab'] + + @classmethod + def GetRootAsUnidirectionalSequenceLSTMOptions(cls, buf, offset): + n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset) + x = UnidirectionalSequenceLSTMOptions() + x.Init(buf, n + offset) + return x + + # UnidirectionalSequenceLSTMOptions + def Init(self, buf, pos): + self._tab = flatbuffers.table.Table(buf, pos) + + # UnidirectionalSequenceLSTMOptions + def FusedActivationFunction(self): + o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4)) + if o != 0: + return self._tab.Get(flatbuffers.number_types.Int8Flags, o + self._tab.Pos) + return 0 + + # UnidirectionalSequenceLSTMOptions + def CellClip(self): + o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(6)) + if o != 0: + return self._tab.Get(flatbuffers.number_types.Float32Flags, o + self._tab.Pos) + return 0.0 + + # UnidirectionalSequenceLSTMOptions + def ProjClip(self): + o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(8)) + if o != 0: + return self._tab.Get(flatbuffers.number_types.Float32Flags, o + self._tab.Pos) + return 0.0 + + # UnidirectionalSequenceLSTMOptions + def TimeMajor(self): + o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(10)) + if o != 0: + return bool(self._tab.Get(flatbuffers.number_types.BoolFlags, o + self._tab.Pos)) + return False + +def UnidirectionalSequenceLSTMOptionsStart(builder): builder.StartObject(4) +def UnidirectionalSequenceLSTMOptionsAddFusedActivationFunction(builder, fusedActivationFunction): builder.PrependInt8Slot(0, fusedActivationFunction, 0) +def UnidirectionalSequenceLSTMOptionsAddCellClip(builder, cellClip): builder.PrependFloat32Slot(1, cellClip, 0.0) +def UnidirectionalSequenceLSTMOptionsAddProjClip(builder, projClip): builder.PrependFloat32Slot(2, projClip, 0.0) +def UnidirectionalSequenceLSTMOptionsAddTimeMajor(builder, timeMajor): builder.PrependBoolSlot(3, timeMajor, 0) +def UnidirectionalSequenceLSTMOptionsEnd(builder): return builder.EndObject() diff --git a/ethosu/vela/tflite/UniqueOptions.py b/ethosu/vela/tflite/UniqueOptions.py new file mode 100644 index 00000000..841c6977 --- /dev/null +++ b/ethosu/vela/tflite/UniqueOptions.py @@ -0,0 +1,30 @@ +# automatically generated by the FlatBuffers compiler, do not modify + +# namespace: tflite + +import flatbuffers + +class UniqueOptions(object): + __slots__ = ['_tab'] + + @classmethod + def GetRootAsUniqueOptions(cls, buf, offset): + n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset) + x = UniqueOptions() + x.Init(buf, n + offset) + return x + + 
# UniqueOptions + def Init(self, buf, pos): + self._tab = flatbuffers.table.Table(buf, pos) + + # UniqueOptions + def IdxOutType(self): + o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4)) + if o != 0: + return self._tab.Get(flatbuffers.number_types.Int8Flags, o + self._tab.Pos) + return 2 + +def UniqueOptionsStart(builder): builder.StartObject(1) +def UniqueOptionsAddIdxOutType(builder, idxOutType): builder.PrependInt8Slot(0, idxOutType, 2) +def UniqueOptionsEnd(builder): return builder.EndObject() diff --git a/ethosu/vela/tflite/UnpackOptions.py b/ethosu/vela/tflite/UnpackOptions.py new file mode 100644 index 00000000..eed40193 --- /dev/null +++ b/ethosu/vela/tflite/UnpackOptions.py @@ -0,0 +1,38 @@ +# automatically generated by the FlatBuffers compiler, do not modify + +# namespace: tflite + +import flatbuffers + +class UnpackOptions(object): + __slots__ = ['_tab'] + + @classmethod + def GetRootAsUnpackOptions(cls, buf, offset): + n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset) + x = UnpackOptions() + x.Init(buf, n + offset) + return x + + # UnpackOptions + def Init(self, buf, pos): + self._tab = flatbuffers.table.Table(buf, pos) + + # UnpackOptions + def Num(self): + o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4)) + if o != 0: + return self._tab.Get(flatbuffers.number_types.Int32Flags, o + self._tab.Pos) + return 0 + + # UnpackOptions + def Axis(self): + o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(6)) + if o != 0: + return self._tab.Get(flatbuffers.number_types.Int32Flags, o + self._tab.Pos) + return 0 + +def UnpackOptionsStart(builder): builder.StartObject(2) +def UnpackOptionsAddNum(builder, num): builder.PrependInt32Slot(0, num, 0) +def UnpackOptionsAddAxis(builder, axis): builder.PrependInt32Slot(1, axis, 0) +def UnpackOptionsEnd(builder): return builder.EndObject() diff --git a/ethosu/vela/tflite/WhereOptions.py b/ethosu/vela/tflite/WhereOptions.py new file mode 100644 index 00000000..ab69f6aa --- /dev/null +++ b/ethosu/vela/tflite/WhereOptions.py @@ -0,0 +1,22 @@ +# automatically generated by the FlatBuffers compiler, do not modify + +# namespace: tflite + +import flatbuffers + +class WhereOptions(object): + __slots__ = ['_tab'] + + @classmethod + def GetRootAsWhereOptions(cls, buf, offset): + n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset) + x = WhereOptions() + x.Init(buf, n + offset) + return x + + # WhereOptions + def Init(self, buf, pos): + self._tab = flatbuffers.table.Table(buf, pos) + +def WhereOptionsStart(builder): builder.StartObject(0) +def WhereOptionsEnd(builder): return builder.EndObject() diff --git a/ethosu/vela/tflite/WhileOptions.py b/ethosu/vela/tflite/WhileOptions.py new file mode 100644 index 00000000..7d5a6dfa --- /dev/null +++ b/ethosu/vela/tflite/WhileOptions.py @@ -0,0 +1,38 @@ +# automatically generated by the FlatBuffers compiler, do not modify + +# namespace: tflite + +import flatbuffers + +class WhileOptions(object): + __slots__ = ['_tab'] + + @classmethod + def GetRootAsWhileOptions(cls, buf, offset): + n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset) + x = WhileOptions() + x.Init(buf, n + offset) + return x + + # WhileOptions + def Init(self, buf, pos): + self._tab = flatbuffers.table.Table(buf, pos) + + # WhileOptions + def CondSubgraphIndex(self): + o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4)) + if o != 0: + return self._tab.Get(flatbuffers.number_types.Int32Flags, o + self._tab.Pos) + return 0 
+ + # WhileOptions + def BodySubgraphIndex(self): + o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(6)) + if o != 0: + return self._tab.Get(flatbuffers.number_types.Int32Flags, o + self._tab.Pos) + return 0 + +def WhileOptionsStart(builder): builder.StartObject(2) +def WhileOptionsAddCondSubgraphIndex(builder, condSubgraphIndex): builder.PrependInt32Slot(0, condSubgraphIndex, 0) +def WhileOptionsAddBodySubgraphIndex(builder, bodySubgraphIndex): builder.PrependInt32Slot(1, bodySubgraphIndex, 0) +def WhileOptionsEnd(builder): return builder.EndObject() diff --git a/ethosu/vela/tflite/ZerosLikeOptions.py b/ethosu/vela/tflite/ZerosLikeOptions.py new file mode 100644 index 00000000..e6aa9639 --- /dev/null +++ b/ethosu/vela/tflite/ZerosLikeOptions.py @@ -0,0 +1,22 @@ +# automatically generated by the FlatBuffers compiler, do not modify + +# namespace: tflite + +import flatbuffers + +class ZerosLikeOptions(object): + __slots__ = ['_tab'] + + @classmethod + def GetRootAsZerosLikeOptions(cls, buf, offset): + n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset) + x = ZerosLikeOptions() + x.Init(buf, n + offset) + return x + + # ZerosLikeOptions + def Init(self, buf, pos): + self._tab = flatbuffers.table.Table(buf, pos) + +def ZerosLikeOptionsStart(builder): builder.StartObject(0) +def ZerosLikeOptionsEnd(builder): return builder.EndObject() diff --git a/ethosu/vela/tflite/__init__.py b/ethosu/vela/tflite/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/ethosu/vela/tflite_mapping.py b/ethosu/vela/tflite_mapping.py new file mode 100644 index 00000000..8e46ef2e --- /dev/null +++ b/ethosu/vela/tflite_mapping.py @@ -0,0 +1,644 @@ +# Copyright (C) 2020 Arm Limited or its affiliates. All rights reserved. +# +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +# Description: +# TensorFlow Lite mapping functions used by both reader and writer. +# Contains a mapping from the various TensorFlow Lite enums and options structs, generated by the FlatBuffer code +# generator, to Vela's internal format. 
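A minimal sketch of how these maps are typically consumed on the reader side, assuming `tensor_type`, `op_code`, `builtin_data` and `custom_data` are hypothetical values already pulled out of a parsed model (illustration only, not part of the committed file):

    # Sketch only: hypothetical inputs, using the maps defined below in this module.
    vela_dtype = datatype_map[tensor_type]                # e.g. TensorType.UINT8 -> DataType.uint8
    op_name, serializer = builtin_operator_map[op_code]   # e.g. BuiltinOperator.CONV_2D -> ("Conv2DBiasAct", conv2d_opts)
    # Some operators carry no options table (serializer is None), so guard the lookup.
    attrs = serializer.deserialize(builtin_data, custom_data) if serializer else {}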
+ +import numpy as np +import struct + +from .data_type import DataType + +from .tflite.TensorType import TensorType +from .tflite.BuiltinOperator import BuiltinOperator +from .tflite.BuiltinOptions import BuiltinOptions + + +from .tflite.Padding import Padding +from .tflite.ActivationFunctionType import ActivationFunctionType + +from .tflite import Conv2DOptions +from .tflite import DepthwiseConv2DOptions +from .tflite import ConcatEmbeddingsOptions +from .tflite import LSHProjectionOptions +from .tflite import Pool2DOptions +from .tflite import SVDFOptions +from .tflite import RNNOptions +from .tflite import FullyConnectedOptions +from .tflite import SoftmaxOptions +from .tflite import ConcatenationOptions +from .tflite import AddOptions +from .tflite import L2NormOptions +from .tflite import LocalResponseNormalizationOptions +from .tflite import LSTMOptions +from .tflite import ResizeBilinearOptions +from .tflite import CallOptions +from .tflite import ReshapeOptions +from .tflite import SkipGramOptions +from .tflite import SpaceToDepthOptions +from .tflite import EmbeddingLookupSparseOptions +from .tflite import MulOptions +from .tflite import PadOptions +from .tflite import GatherOptions +from .tflite import BatchToSpaceNDOptions +from .tflite import SpaceToBatchNDOptions +from .tflite import TransposeOptions +from .tflite import ReducerOptions +from .tflite import SubOptions +from .tflite import DivOptions +from .tflite import SqueezeOptions +from .tflite import SequenceRNNOptions +from .tflite import StridedSliceOptions +from .tflite import ExpOptions +from .tflite import TopKV2Options +from .tflite import SplitOptions +from .tflite import LogSoftmaxOptions +from .tflite import CastOptions +from .tflite import DequantizeOptions +from .tflite import MaximumMinimumOptions +from .tflite import ArgMaxOptions +from .tflite import LessOptions +from .tflite import NegOptions +from .tflite import PadV2Options +from .tflite import GreaterOptions +from .tflite import GreaterEqualOptions +from .tflite import LessEqualOptions +from .tflite import SelectOptions +from .tflite import SliceOptions +from .tflite import TransposeConvOptions +from .tflite import SparseToDenseOptions +from .tflite import TileOptions +from .tflite import ExpandDimsOptions +from .tflite import EqualOptions +from .tflite import NotEqualOptions +from .tflite import ShapeOptions +from .tflite import PowOptions +from .tflite import ArgMinOptions +from .tflite import FakeQuantOptions +from .tflite import PackOptions +from .tflite import LogicalOrOptions +from .tflite import OneHotOptions +from .tflite import LogicalAndOptions +from .tflite import LogicalNotOptions +from .tflite import UnpackOptions +from .tflite import FloorDivOptions +from .tflite import SquareOptions +from .tflite import ZerosLikeOptions +from .tflite import FillOptions +from .tflite import BidirectionalSequenceLSTMOptions +from .tflite import BidirectionalSequenceRNNOptions +from .tflite import UnidirectionalSequenceLSTMOptions +from .tflite import FloorModOptions +from .tflite import RangeOptions +from .tflite import ResizeNearestNeighborOptions +from .tflite import LeakyReluOptions +from .tflite import SquaredDifferenceOptions +from .tflite import MirrorPadOptions +from .tflite import AbsOptions +from .tflite import SplitVOptions +from .tflite import UniqueOptions +from .tflite import ReverseV2Options +from .tflite import AddNOptions +from .tflite import GatherNdOptions +from .tflite import CosOptions +from .tflite import WhereOptions +from .tflite 
import RankOptions +from .tflite import ReverseSequenceOptions +from .tflite import MatrixDiagOptions +from .tflite import QuantizeOptions +from .tflite import MatrixSetDiagOptions +from .tflite import DensifyOptions +from .tflite import DepthToSpaceOptions +from .tflite import IfOptions +from .tflite import NonMaxSuppressionV4Options +from .tflite import NonMaxSuppressionV5Options +from .tflite import ScatterNdOptions +from .tflite import SegmentSumOptions +from .tflite import SelectV2Options +from .tflite import WhileOptions + + +def inverse_map(map): + return {v: k for k, v in map.items()} + + +datatype_map = { + TensorType.UINT8: DataType.uint8, + TensorType.INT8: DataType.int8, + TensorType.INT16: DataType.int16, + TensorType.INT32: DataType.int32, + TensorType.INT64: DataType.int64, + TensorType.FLOAT16: DataType.float16, + TensorType.FLOAT32: DataType.float32, + TensorType.STRING: DataType.string, + TensorType.BOOL: DataType.bool, + # no TensorType.COMPLEX64 for now +} + +datatype_inv_map = inverse_map(datatype_map) +datatype_inv_map[DataType.quint8] = TensorType.UINT8 + +datatype_inv_map[DataType.qint8] = TensorType.INT8 +datatype_inv_map[DataType.qint16] = TensorType.INT16 +datatype_inv_map[DataType.qint32] = TensorType.INT32 + + +datatype_map_numpy = { + TensorType.UINT8: np.uint8, + TensorType.INT8: np.int8, + TensorType.INT16: np.int16, + TensorType.INT32: np.int32, + TensorType.INT64: np.int64, + TensorType.FLOAT16: np.float16, + TensorType.FLOAT32: np.float32, + TensorType.BOOL: np.bool, +} + + +builtin_options_map = { + BuiltinOptions.Conv2DOptions: Conv2DOptions.Conv2DOptions, + BuiltinOptions.DepthwiseConv2DOptions: DepthwiseConv2DOptions.DepthwiseConv2DOptions, + BuiltinOptions.ConcatEmbeddingsOptions: ConcatEmbeddingsOptions.ConcatEmbeddingsOptions, + BuiltinOptions.LSHProjectionOptions: LSHProjectionOptions.LSHProjectionOptions, + BuiltinOptions.Pool2DOptions: Pool2DOptions.Pool2DOptions, + BuiltinOptions.SVDFOptions: SVDFOptions.SVDFOptions, + BuiltinOptions.RNNOptions: RNNOptions.RNNOptions, + BuiltinOptions.FullyConnectedOptions: FullyConnectedOptions.FullyConnectedOptions, + BuiltinOptions.SoftmaxOptions: SoftmaxOptions.SoftmaxOptions, + BuiltinOptions.ConcatenationOptions: ConcatenationOptions.ConcatenationOptions, + BuiltinOptions.AddOptions: AddOptions.AddOptions, + BuiltinOptions.L2NormOptions: L2NormOptions.L2NormOptions, + BuiltinOptions.LocalResponseNormalizationOptions: LocalResponseNormalizationOptions.LocalResponseNormalizationOptions, # noqa: E501 + BuiltinOptions.LSTMOptions: LSTMOptions.LSTMOptions, + BuiltinOptions.ResizeBilinearOptions: ResizeBilinearOptions.ResizeBilinearOptions, + BuiltinOptions.CallOptions: CallOptions.CallOptions, + BuiltinOptions.ReshapeOptions: ReshapeOptions.ReshapeOptions, + BuiltinOptions.SkipGramOptions: SkipGramOptions.SkipGramOptions, + BuiltinOptions.SpaceToDepthOptions: SpaceToDepthOptions.SpaceToDepthOptions, + BuiltinOptions.EmbeddingLookupSparseOptions: EmbeddingLookupSparseOptions.EmbeddingLookupSparseOptions, + BuiltinOptions.MulOptions: MulOptions.MulOptions, + BuiltinOptions.PadOptions: PadOptions.PadOptions, + BuiltinOptions.GatherOptions: GatherOptions.GatherOptions, + BuiltinOptions.BatchToSpaceNDOptions: BatchToSpaceNDOptions.BatchToSpaceNDOptions, + BuiltinOptions.SpaceToBatchNDOptions: SpaceToBatchNDOptions.SpaceToBatchNDOptions, + BuiltinOptions.TransposeOptions: TransposeOptions.TransposeOptions, + BuiltinOptions.ReducerOptions: ReducerOptions.ReducerOptions, + BuiltinOptions.SubOptions: 
SubOptions.SubOptions, + BuiltinOptions.DivOptions: DivOptions.DivOptions, + BuiltinOptions.SqueezeOptions: SqueezeOptions.SqueezeOptions, + BuiltinOptions.SequenceRNNOptions: SequenceRNNOptions.SequenceRNNOptions, + BuiltinOptions.StridedSliceOptions: StridedSliceOptions.StridedSliceOptions, + BuiltinOptions.ExpOptions: ExpOptions.ExpOptions, + BuiltinOptions.TopKV2Options: TopKV2Options.TopKV2Options, + BuiltinOptions.SplitOptions: SplitOptions.SplitOptions, + BuiltinOptions.LogSoftmaxOptions: LogSoftmaxOptions.LogSoftmaxOptions, + BuiltinOptions.CastOptions: CastOptions.CastOptions, + BuiltinOptions.DequantizeOptions: DequantizeOptions.DequantizeOptions, + BuiltinOptions.MaximumMinimumOptions: MaximumMinimumOptions.MaximumMinimumOptions, + BuiltinOptions.ArgMaxOptions: ArgMaxOptions.ArgMaxOptions, + BuiltinOptions.LessOptions: LessOptions.LessOptions, + BuiltinOptions.NegOptions: NegOptions.NegOptions, + BuiltinOptions.PadV2Options: PadV2Options.PadV2Options, + BuiltinOptions.GreaterOptions: GreaterOptions.GreaterOptions, + BuiltinOptions.GreaterEqualOptions: GreaterEqualOptions.GreaterEqualOptions, + BuiltinOptions.LessEqualOptions: LessEqualOptions.LessEqualOptions, + BuiltinOptions.SelectOptions: SelectOptions.SelectOptions, + BuiltinOptions.SliceOptions: SliceOptions.SliceOptions, + BuiltinOptions.TransposeConvOptions: TransposeConvOptions.TransposeConvOptions, + BuiltinOptions.SparseToDenseOptions: SparseToDenseOptions.SparseToDenseOptions, + BuiltinOptions.TileOptions: TileOptions.TileOptions, + BuiltinOptions.ExpandDimsOptions: ExpandDimsOptions.ExpandDimsOptions, + BuiltinOptions.EqualOptions: EqualOptions.EqualOptions, + BuiltinOptions.NotEqualOptions: NotEqualOptions.NotEqualOptions, + BuiltinOptions.ShapeOptions: ShapeOptions.ShapeOptions, + BuiltinOptions.PowOptions: PowOptions.PowOptions, + BuiltinOptions.ArgMinOptions: ArgMinOptions.ArgMinOptions, + BuiltinOptions.FakeQuantOptions: FakeQuantOptions.FakeQuantOptions, + BuiltinOptions.PackOptions: PackOptions.PackOptions, + BuiltinOptions.LogicalOrOptions: LogicalOrOptions.LogicalOrOptions, + BuiltinOptions.OneHotOptions: OneHotOptions.OneHotOptions, + BuiltinOptions.LogicalAndOptions: LogicalAndOptions.LogicalAndOptions, + BuiltinOptions.LogicalNotOptions: LogicalNotOptions.LogicalNotOptions, + BuiltinOptions.UnpackOptions: UnpackOptions.UnpackOptions, + BuiltinOptions.FloorDivOptions: FloorDivOptions.FloorDivOptions, + BuiltinOptions.SquareOptions: SquareOptions.SquareOptions, + BuiltinOptions.ZerosLikeOptions: ZerosLikeOptions.ZerosLikeOptions, + BuiltinOptions.FillOptions: FillOptions.FillOptions, + BuiltinOptions.BidirectionalSequenceLSTMOptions: BidirectionalSequenceLSTMOptions.BidirectionalSequenceLSTMOptions, + BuiltinOptions.BidirectionalSequenceRNNOptions: BidirectionalSequenceRNNOptions.BidirectionalSequenceRNNOptions, + BuiltinOptions.UnidirectionalSequenceLSTMOptions: UnidirectionalSequenceLSTMOptions.UnidirectionalSequenceLSTMOptions, # noqa: E501 + BuiltinOptions.FloorModOptions: FloorModOptions.FloorModOptions, + BuiltinOptions.RangeOptions: RangeOptions.RangeOptions, + BuiltinOptions.ResizeNearestNeighborOptions: ResizeNearestNeighborOptions.ResizeNearestNeighborOptions, + BuiltinOptions.LeakyReluOptions: LeakyReluOptions.LeakyReluOptions, + BuiltinOptions.SquaredDifferenceOptions: SquaredDifferenceOptions.SquaredDifferenceOptions, + BuiltinOptions.MirrorPadOptions: MirrorPadOptions.MirrorPadOptions, + BuiltinOptions.AbsOptions: AbsOptions.AbsOptions, + BuiltinOptions.SplitVOptions: 
SplitVOptions.SplitVOptions, + BuiltinOptions.UniqueOptions: UniqueOptions.UniqueOptions, + BuiltinOptions.ReverseV2Options: ReverseV2Options.ReverseV2Options, + BuiltinOptions.AddNOptions: AddNOptions.AddNOptions, + BuiltinOptions.GatherNdOptions: GatherNdOptions.GatherNdOptions, + BuiltinOptions.CosOptions: CosOptions.CosOptions, + BuiltinOptions.WhereOptions: WhereOptions.WhereOptions, + BuiltinOptions.RankOptions: RankOptions.RankOptions, + BuiltinOptions.ReverseSequenceOptions: ReverseSequenceOptions.ReverseSequenceOptions, + BuiltinOptions.MatrixDiagOptions: MatrixDiagOptions.MatrixDiagOptions, + BuiltinOptions.QuantizeOptions: QuantizeOptions.QuantizeOptions, + BuiltinOptions.MatrixSetDiagOptions: MatrixSetDiagOptions.MatrixSetDiagOptions, + BuiltinOptions.DensifyOptions: DensifyOptions.DensifyOptions, + BuiltinOptions.DepthToSpaceOptions: DepthToSpaceOptions.DepthToSpaceOptions, + BuiltinOptions.IfOptions: IfOptions.IfOptions, + BuiltinOptions.NonMaxSuppressionV4Options: NonMaxSuppressionV4Options.NonMaxSuppressionV4Options, + BuiltinOptions.NonMaxSuppressionV5Options: NonMaxSuppressionV5Options.NonMaxSuppressionV5Options, + BuiltinOptions.ScatterNdOptions: ScatterNdOptions.ScatterNdOptions, + BuiltinOptions.SegmentSumOptions: SegmentSumOptions.SegmentSumOptions, + BuiltinOptions.SelectV2Options: SelectV2Options.SelectV2Options, + BuiltinOptions.WhileOptions: WhileOptions.WhileOptions, +} + +builtin_options_inv_map = inverse_map(builtin_options_map) + + +def underscore_to_camel_case(s): + return "".join(x.title() for x in s.split("_")) + + +def padding_deserialize(x): + return padding_map[x] + + +def padding_serialize(builder, x): + return padding_inv_map[x] + + +def activation_deserialize(x): + return activation_function_map[x] + + +def activation_serialize(builder, x): + return activation_function_inv_map[x] + + +def datatype_deserialize(x): + return datatype_map[x] + + +def datatype_serialize(builder, x): + return datatype_inv_map[x] + + +def identity(x): + return x + + +def identity_serialize(builder, x): + return x + + +def write_byte_vector(builder, v): + builder.StartVector(1, len(v), 1) + for e in v[::-1]: + builder.PrependByte(e) + return builder.EndVector(len(v)) + + +def write_int_vector(builder, v): + builder.StartVector(4, len(v), 4) + for e in v[::-1]: + builder.PrependInt32(e) + return builder.EndVector(len(v)) + + +class OptionsSerializer: + def __init__(self, name, members=[]): + self.name = name + self.module = globals()[self.name] + self.cls = getattr(self.module, self.name) + self.builtin_opt_type = builtin_options_inv_map[self.cls] + self.custom_opt_format = 0 + self.members = [] + for mem in members: + deserialize = identity + serialize = identity_serialize + is_vector = False + if isinstance(mem, tuple): + if len(mem) == 3: + mem, deserialize, serialize = mem + elif len(mem) == 2: + mem, is_vector = mem + deserialize = tuple + serialize = write_int_vector + else: + assert 0 + underscore_mem = mem + camelcase_mem = underscore_to_camel_case(mem) + self.members.append((underscore_mem, camelcase_mem, deserialize, serialize, is_vector)) + + def deserialize(self, builtin_data, custom_data): + attrs = {} + if builtin_data: + tfattrs = self.cls() + tfattrs.Init(builtin_data.Bytes, builtin_data.Pos) + for underscore_mem, camelcase_mem, deserialize, serialize, is_vector in self.members: + fun = camelcase_mem + if is_vector: + fun += "AsNumpy" + + a = deserialize(getattr(tfattrs, fun)()) + attrs[underscore_mem] = a + return attrs + + def serialize(self, builder, attrs): 
+ ser_attrs = [] + for underscore_mem, camelcase_mem, deserialize, serialize, is_vector in self.members: + a = serialize(builder, attrs[underscore_mem]) + ser_attrs.append((camelcase_mem, a)) + + getattr(self.module, self.name + "Start")(builder) + + for camelcase_mem, a in ser_attrs: + getattr(self.module, self.name + "Add" + camelcase_mem)(builder, a) + + return getattr(self.module, self.name + "End")(builder), None + + +class CustomOptionsSerializer: + def __init__(self): + self.builtin_opt_type = 0 + self.custom_opt_format = 0 + + def deserialize(self, builtin_data, custom_data): + attrs = {} + attrs["custom_options"] = custom_data + return attrs + + def serialize(self, builder, attrs): + + custom_opts = attrs.get("custom_options", []) + custom_data = [] + + # Set NPU op custom options for the TensorFlow Lite custom operator + if custom_opts["type"] == "NpuOp": + custom_data = [0x01, 0x04, 0x01] # NpuOp=1, FlexbufferFormat.UINT8=4, byte length=1 + + custom_data_bytes = struct.pack("<{0}B".format(len(custom_data)), *custom_data) + custom_offset = write_byte_vector(builder, custom_data_bytes) + + return None, custom_offset + + +padding_map = { + Padding.SAME: b"SAME", + Padding.VALID: b"VALID", +} + +padding_inv_map = inverse_map(padding_map) + + +activation_function_map = { + ActivationFunctionType.NONE: None, + ActivationFunctionType.RELU: "Relu", + ActivationFunctionType.RELU_N1_TO_1: "ReluN1To1", + ActivationFunctionType.RELU6: "Relu6", + ActivationFunctionType.TANH: "Tanh", + ActivationFunctionType.SIGN_BIT: "SignBit", +} + +activation_function_inv_map = inverse_map(activation_function_map) + +fused_act = ("fused_activation_function", activation_deserialize, activation_serialize) +padding = ("padding", padding_deserialize, padding_serialize) + +pool2d_opts = OptionsSerializer( + "Pool2DOptions", (padding, "stride_w", "stride_h", "filter_width", "filter_height", fused_act,) +) + +depthwise_opts = OptionsSerializer( + "DepthwiseConv2DOptions", + (padding, "stride_w", "stride_h", "depth_multiplier", fused_act, "dilation_w_factor", "dilation_h_factor",), +) + +conv2d_opts = OptionsSerializer( + "Conv2DOptions", (padding, "stride_w", "stride_h", fused_act, "dilation_w_factor", "dilation_h_factor",) +) + +lstm_opts = OptionsSerializer("LSTMOptions", (fused_act, "cell_clip", "proj_clip", "kernel_type")) + +unidir_seq_lstm_opts = OptionsSerializer( + "UnidirectionalSequenceLSTMOptions", (fused_act, "cell_clip", "proj_clip", "time_major") +) + +bidir_seq_lstm_opts = OptionsSerializer( + "BidirectionalSequenceLSTMOptions", (fused_act, "cell_clip", "proj_clip", "merge_outputs", "time_major") +) + +rnn_opts = OptionsSerializer("RNNOptions", (fused_act,)) + +seq_rnn_opts = OptionsSerializer("SequenceRNNOptions", ("time_major", fused_act,)) + +bidir_seq_rnn_opts = OptionsSerializer("BidirectionalSequenceRNNOptions", ("time_major", fused_act, "merge_outputs",)) + + +reducer_opts = OptionsSerializer("ReducerOptions", ("keep_dims",)) + +is_int_vec = True + +custom_prefix = "Custom_" + +builtin_operator_map = { + BuiltinOperator.ADD: ("AddAct", OptionsSerializer("AddOptions", (fused_act,))), + BuiltinOperator.AVERAGE_POOL_2D: ("AvgPoolAct", pool2d_opts), + BuiltinOperator.CONCATENATION: ("ConcatTFLite", OptionsSerializer("ConcatenationOptions", ("axis", fused_act))), + BuiltinOperator.CONV_2D: ("Conv2DBiasAct", conv2d_opts), + BuiltinOperator.DEPTHWISE_CONV_2D: ("DepthwiseConv2dBiasAct", depthwise_opts), + BuiltinOperator.DEPTH_TO_SPACE: ("DepthToSpace", OptionsSerializer("DepthToSpaceOptions", 
("block_size",))), + BuiltinOperator.DEQUANTIZE: ("Dequantize", OptionsSerializer("DequantizeOptions")), + BuiltinOperator.EMBEDDING_LOOKUP: (None, None), + BuiltinOperator.FLOOR: ("Floor", None), + BuiltinOperator.FULLY_CONNECTED: ( + "FullyConnectedAct", + OptionsSerializer("FullyConnectedOptions", (fused_act, "weights_format")), + ), + BuiltinOperator.HASHTABLE_LOOKUP: (None, None), + # BuiltinOperator.L2_NORMALIZATION : "L2NormAct", + BuiltinOperator.L2_POOL_2D: (None, pool2d_opts), + BuiltinOperator.LOCAL_RESPONSE_NORMALIZATION: ( + "LRN", + OptionsSerializer("LocalResponseNormalizationOptions", ("radius", "bias", "alpha", "beta")), + ), + BuiltinOperator.LOGISTIC: ("Sigmoid", None), + # BuiltinOperator.LSH_PROJECTION : "", + BuiltinOperator.LSTM: ("LstmAct", lstm_opts), + BuiltinOperator.MAX_POOL_2D: ("MaxPool", pool2d_opts), + BuiltinOperator.MUL: ("MulAct", OptionsSerializer("MulOptions", (fused_act,))), + BuiltinOperator.RELU: ("Relu", None), + BuiltinOperator.RELU_N1_TO_1: (None, None), + BuiltinOperator.RELU6: ("Relu6", None), + BuiltinOperator.RESHAPE: ("Reshape", OptionsSerializer("ReshapeOptions", (("new_shape", is_int_vec),))), + BuiltinOperator.RESIZE_BILINEAR: ( + "ResizeBilinear", + OptionsSerializer("ResizeBilinearOptions", ("align_corners", "half_pixel_centers")), + ), + BuiltinOperator.RNN: ("RnnAct", rnn_opts), + BuiltinOperator.SOFTMAX: ("Softmax", OptionsSerializer("SoftmaxOptions", ("beta",))), + BuiltinOperator.SPACE_TO_DEPTH: ("SpaceToDepth", OptionsSerializer("SpaceToDepthOptions", ("block_size",))), + BuiltinOperator.SVDF: ("SvdfAct", OptionsSerializer("SVDFOptions", ("rank", fused_act))), + BuiltinOperator.TANH: ("Tanh", None), + # BuiltinOperator.CONCAT_EMBEDDINGS : "", + # BuiltinOperator.SKIP_GRAM : "", + # BuiltinOperator.CALL : "", + BuiltinOperator.EMBEDDING_LOOKUP_SPARSE: (None, OptionsSerializer("EmbeddingLookupSparseOptions", ("combiner",))), + BuiltinOperator.PAD: ("Pad", OptionsSerializer("PadOptions")), + BuiltinOperator.UNIDIRECTIONAL_SEQUENCE_RNN: ("UnidirectionalSequenceRnnAct", seq_rnn_opts), + BuiltinOperator.GATHER: ("GatherV2", OptionsSerializer("GatherOptions", ("axis",))), + BuiltinOperator.BATCH_TO_SPACE_ND: ("BatchToSpaceND", OptionsSerializer("BatchToSpaceNDOptions")), + BuiltinOperator.SPACE_TO_BATCH_ND: ("SpaceToBatchND", OptionsSerializer("SpaceToBatchNDOptions")), + BuiltinOperator.TRANSPOSE: ("Transpose", OptionsSerializer("TransposeOptions")), + BuiltinOperator.MEAN: ("Mean", None), + BuiltinOperator.SUB: ("SubAct", OptionsSerializer("SubOptions", (fused_act,))), + BuiltinOperator.DIV: ("DivAct", OptionsSerializer("DivOptions", (fused_act,))), + BuiltinOperator.SQUEEZE: ("Squeeze", OptionsSerializer("SqueezeOptions", (("squeeze_dims", is_int_vec),))), + BuiltinOperator.UNIDIRECTIONAL_SEQUENCE_LSTM: ("UnidirectionalSequenceLstmAct", unidir_seq_lstm_opts), + BuiltinOperator.STRIDED_SLICE: ( + "StridedSlice", + OptionsSerializer( + "StridedSliceOptions", ("begin_mask", "end_mask", "ellipsis_mask", "new_axis_mask", "shrink_axis_mask") + ), + ), + BuiltinOperator.BIDIRECTIONAL_SEQUENCE_RNN: ("BidirectionalSequenceRnnAct", bidir_seq_rnn_opts), + BuiltinOperator.EXP: ("Exp", OptionsSerializer("ExpOptions")), + BuiltinOperator.TOPK_V2: ("TopKV2", OptionsSerializer("TopKV2Options")), + BuiltinOperator.SPLIT: ("Split", OptionsSerializer("SplitOptions", ("num_splits",))), + BuiltinOperator.LOG_SOFTMAX: ("LogSoftmax", OptionsSerializer("LogSoftmaxOptions")), + # BuiltinOperator.DELEGATE : "", + BuiltinOperator.BIDIRECTIONAL_SEQUENCE_LSTM: 
("BidirectionalSequenceLstmAct", bidir_seq_lstm_opts), + BuiltinOperator.CAST: ( + "Cast", + OptionsSerializer( + "CastOptions", + ( + ("in_data_type", datatype_deserialize, datatype_serialize), + ("out_data_type", datatype_deserialize, datatype_serialize), + ), + ), + ), + # BuiltinOperator.PRELU : "", + BuiltinOperator.MAXIMUM: ("Maximum", OptionsSerializer("MaximumMinimumOptions")), + BuiltinOperator.ARG_MAX: ( + "ArgMax", + OptionsSerializer("ArgMaxOptions", (("output_type", datatype_deserialize, datatype_serialize),)), + ), + BuiltinOperator.MINIMUM: ("Minimum", OptionsSerializer("MaximumMinimumOptions")), + BuiltinOperator.LESS: ("Less", None), + BuiltinOperator.NEG: ("Neg", None), + BuiltinOperator.PADV2: ("PadV2", None), + BuiltinOperator.GREATER: ("Greater", None), + BuiltinOperator.GREATER_EQUAL: ("GreaterEqual", None), + BuiltinOperator.LESS_EQUAL: ("LessEqual", None), + BuiltinOperator.SELECT: ("Select", None), + BuiltinOperator.SLICE: ("Slice", None), + BuiltinOperator.SIN: ("Sin", None), + BuiltinOperator.TRANSPOSE_CONV: ( + "Conv2DBackpropInput", + OptionsSerializer("TransposeConvOptions", (padding, "stride_w", "stride_h")), + ), + BuiltinOperator.SPARSE_TO_DENSE: ( + "SparseToDense", + OptionsSerializer("SparseToDenseOptions", ("validate_indices",)), + ), + BuiltinOperator.TILE: ("Tile", OptionsSerializer("TileOptions")), + BuiltinOperator.EXPAND_DIMS: ("ExpandDims", None), + BuiltinOperator.EQUAL: ("Equal", None), + BuiltinOperator.NOT_EQUAL: ("NotEqual", None), + BuiltinOperator.LOG: ("Log", None), + BuiltinOperator.SUM: ("Sum", None), + BuiltinOperator.SQRT: ("Sqrt", None), + BuiltinOperator.RSQRT: ("Rsqrt", None), + BuiltinOperator.SHAPE: ( + "Shape", + OptionsSerializer("ShapeOptions", (("out_type", datatype_deserialize, datatype_serialize),)), + ), + BuiltinOperator.POW: "Pow", + BuiltinOperator.ARG_MIN: ( + "ArgMin", + OptionsSerializer("ArgMinOptions", (("output_type", datatype_deserialize, datatype_serialize),)), + ), + BuiltinOperator.FAKE_QUANT: ( + "FakeQuantWithMinMaxArgs", + OptionsSerializer("FakeQuantOptions", ("min", "max", "num_bits", "narrow_range")), + ), + BuiltinOperator.REDUCE_PROD: ("Prod", reducer_opts), + BuiltinOperator.REDUCE_MAX: ("Max", reducer_opts), + BuiltinOperator.PACK: ("Pack", OptionsSerializer("PackOptions", ("values_count", "axis"))), + BuiltinOperator.LOGICAL_OR: ("LogicalOr", None), + BuiltinOperator.ONE_HOT: ("OneHot", OptionsSerializer("OneHotOptions", ("axis",))), + BuiltinOperator.LOGICAL_AND: ("LogicalAnd", None), + BuiltinOperator.LOGICAL_NOT: ("LogicalNot", None), + BuiltinOperator.UNPACK: ("Unpack", OptionsSerializer("UnpackOptions", ("num", "axis"))), + BuiltinOperator.REDUCE_MIN: ("Min", reducer_opts), + BuiltinOperator.FLOOR_DIV: ("FloorDiv", None), + BuiltinOperator.REDUCE_ANY: ("Any", reducer_opts), + BuiltinOperator.SQUARE: ("Square", None), + BuiltinOperator.ZEROS_LIKE: ("ZerosLike", None), + BuiltinOperator.FILL: ("Fill", None), + BuiltinOperator.FLOOR_MOD: ("FloorMod", None), + BuiltinOperator.RANGE: ("Range", None), + BuiltinOperator.RESIZE_NEAREST_NEIGHBOR: ( + "ResizeNearestNeighbor", + OptionsSerializer("ResizeNearestNeighborOptions", ("align_corners",)), + ), + BuiltinOperator.LEAKY_RELU: ("LeakyRelu", OptionsSerializer("LeakyReluOptions", ("alpha",))), + BuiltinOperator.SQUARED_DIFFERENCE: ("SquaredDifference", None), + BuiltinOperator.MIRROR_PAD: ("MirrorPad", OptionsSerializer("MirrorPadOptions", ("mode",))), + BuiltinOperator.ABS: ("Abs", None), + BuiltinOperator.SPLIT_V: ("SplitV", 
OptionsSerializer("SplitVOptions", ("num_splits",))), + BuiltinOperator.UNIQUE: ( + "Unique", + OptionsSerializer("UniqueOptions", (("idx_out_type", datatype_deserialize, datatype_serialize),)), + ), + BuiltinOperator.CEIL: ("Ceil", None), + BuiltinOperator.REVERSE_V2: ("ReverseV2", None), + BuiltinOperator.ADD_N: ("AddN", None), + BuiltinOperator.GATHER_ND: ("GatherNd", None), + BuiltinOperator.COS: ("Cos", None), + BuiltinOperator.WHERE: ("Where", None), + BuiltinOperator.RANK: ("Rank", None), + BuiltinOperator.ELU: ("Elu", None), + BuiltinOperator.REVERSE_SEQUENCE: ( + "ReverseSequence", + OptionsSerializer("ReverseSequenceOptions", ("seq_dim", "batch_dim")), + ), + BuiltinOperator.MATRIX_DIAG: ("MatrixDiag", None), + BuiltinOperator.QUANTIZE: ("Quantize", None), + BuiltinOperator.MATRIX_SET_DIAG: ("MatrixSetDiag", None), + BuiltinOperator.IF: ("If", OptionsSerializer("IfOptions", ("then_subgraph_index", "else_subgraph_index"))), + BuiltinOperator.WHILE: ("While", OptionsSerializer("WhileOptions", ("cond_subgraph_index", "body_subgraph_index"))), + BuiltinOperator.NON_MAX_SUPPRESSION_V4: ("NonMaxSuppressionV4", OptionsSerializer("NonMaxSuppressionV4Options")), + BuiltinOperator.NON_MAX_SUPPRESSION_V5: ("NonMaxSuppressionV5", OptionsSerializer("NonMaxSuppressionV5Options")), + BuiltinOperator.SCATTER_ND: ("ScatterNd", OptionsSerializer("ScatterNdOptions")), + BuiltinOperator.SELECT_V2: ("SelectV2", OptionsSerializer("SelectV2Options")), + BuiltinOperator.DENSIFY: ("Densify", OptionsSerializer("DensifyOptions")), + BuiltinOperator.SEGMENT_SUM: ("SegmentSum", OptionsSerializer("SegmentSumOptions")), + BuiltinOperator.CUSTOM: (custom_prefix, CustomOptionsSerializer()), +} + +builtin_operator_inv_map = {v[0]: (k, v[1]) for k, v in builtin_operator_map.items()} + +builtin_operator_inv_map["NpuOp"] = (BuiltinOperator.CUSTOM, CustomOptionsSerializer()) diff --git a/ethosu/vela/tflite_reader.py b/ethosu/vela/tflite_reader.py new file mode 100644 index 00000000..535847d7 --- /dev/null +++ b/ethosu/vela/tflite_reader.py @@ -0,0 +1,252 @@ +# Copyright (C) 2020 Arm Limited or its affiliates. All rights reserved. +# +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +# Description: +# Functions used to read from a TensorFlow Lite format file. 
+ +from .tflite.Model import Model +from .tflite.BuiltinOperator import BuiltinOperator + +import numpy as np +import os.path +from .nn_graph import Graph, Operation, Subgraph +from .tensor import Tensor, QuantizationParameters + +from .tflite_mapping import builtin_operator_map, datatype_map, datatype_map_numpy, DataType + + +def decode_str(s): + if s is None: + return "" + return s.decode("utf-8") + + +def reshape_tensor_add_const_op(tens, reorder): + if not tens.reshaped: + original_shape = tens.shape + tens.name = tens.name + "_reshape" + tens.shape = [original_shape[idx] for idx in reorder] + tens.bandwidth_shape = tens.shape + tens.storage_shape = tens.shape + + if tens.values is not None: + tens.values = tens.values.transpose(reorder) + + if tens.quant_values is not None: + tens.quant_values = tens.quant_values.transpose(reorder) + + op = Operation("Const", tens.name) + op.outputs = [tens] + tens.ops = [op] + tens.reshaped = True + + +class TFLiteSubgraph: + def __init__(self, graph, subgraph): + self.graph = graph + self.name = decode_str(subgraph.Name()) + + self.tensors = [] + for idx in range(subgraph.TensorsLength()): + self.tensors.append(self.parse_tensor(subgraph.Tensors(idx))) + + for idx in range(subgraph.OperatorsLength()): + self.parse_operator(subgraph.Operators(idx)) + + self.outputs = [self.tensors[idx] for idx in subgraph.OutputsAsNumpy()] + self.inputs = [self.tensors[idx] for idx in subgraph.InputsAsNumpy()] + + # Fix up tensors without operations. Generate either Placeholder or Constant ops + for tens in self.inputs: + assert not tens.ops + op = Operation("Placeholder", tens.name) + op.outputs = [tens] + tens.ops = [op] + + for tens in self.tensors: + if not tens.ops: + op = Operation("Const", tens.name) + op.outputs = [tens] + tens.ops = [op] + + def parse_tensor(self, tens_data): + np_shape = tens_data.ShapeAsNumpy() + shape = list(np_shape) if type(np_shape) is np.ndarray else [] + name = decode_str(tens_data.Name()) + dtype = datatype_map[tens_data.Type()] + + tens = Tensor(shape, dtype, name) + + quant = tens_data.Quantization() + + def len1_array_to_scalar(arr): + # The following flatbuffer quantisation fields all return a scalar value of 0 if they are not definied in + # the input buffer. This is represented in Vela by using None. + # Otherwise, the fields returned are a single or multi-element array. 
In which case, single element arrays + # are converted to scalars + if isinstance(arr, int) and arr == 0: + return None + if len(arr) == 1: + return arr[0] + return arr + + tens.quantization = QuantizationParameters() + tens.quantization.min = len1_array_to_scalar(quant.MinAsNumpy()) + tens.quantization.max = len1_array_to_scalar(quant.MaxAsNumpy()) + tens.quantization.scale_f32 = len1_array_to_scalar(quant.ScaleAsNumpy()) + tens.quantization.zero_point = len1_array_to_scalar(quant.ZeroPointAsNumpy()) + + if dtype == DataType.uint8: + tens.quantization.quant_min = 0 + tens.quantization.quant_max = (1 << dtype.bits) - 1 + elif dtype in set((DataType.int8, DataType.int16, DataType.int32, DataType.int64)): + tens.quantization.quant_min = -(1 << (dtype.bits - 1)) + tens.quantization.quant_max = (1 << (dtype.bits - 1)) - 1 + else: + raise Exception("DataType '" + str(dtype) + "' is not supported for quantization.") + + if tens.quantization.scale_f32 is None and tens.quantization.zero_point is None: + tens.quantization = None + + tens.values = None + buf = self.graph.buffers[tens_data.Buffer()] + if buf is not None: + tens.values = np.array(buf.view(datatype_map_numpy[tens_data.Type()]).reshape(shape)) + if tens.quantization is not None: + tens.quant_values = tens.values + tens.values = tens.quantization.dequantize(tens.quant_values) + return tens + + def parse_operator(self, op_data): + op_type, opt_serializer = self.graph.operator_codes[op_data.OpcodeIndex()] + inputs = [self.tensors[idx] for idx in op_data.InputsAsNumpy()] + outputs = [self.tensors[idx] for idx in op_data.OutputsAsNumpy()] + name = "unknown_op_name" + if len(outputs): + name = outputs[0].name + op = Operation(op_type, name) + op.inputs = inputs + op.outputs = outputs + for out in op.outputs: + out.ops = [op] + + activation_function_to_split_out = None + + if op_type.startswith("DepthwiseConv2d") or op_type.startswith("Conv2D"): + reshape_tensor_add_const_op(inputs[1], (1, 2, 3, 0)) + + if op_type.startswith("FullyConnected"): + reshape_tensor_add_const_op(inputs[1], (1, 0)) + + if opt_serializer is not None: + op.attrs = opt_serializer.deserialize(op_data.BuiltinOptions(), op_data.CustomOptionsAsNumpy()) + + if "stride_w" in op.attrs: + op.attrs["strides"] = (1, op.attrs["stride_h"], op.attrs["stride_w"], 1) + if "filter_width" in op.attrs: + op.attrs["ksize"] = (1, op.attrs["filter_height"], op.attrs["filter_width"], 1) + if "dilation_w_factor" in op.attrs: + op.attrs["dilation"] = (1, op.attrs["dilation_h_factor"], op.attrs["dilation_w_factor"], 1) + if "depth_multiplier" in op.attrs: + op.attrs["channel_multiplier"] = op.attrs["depth_multiplier"] + + if "fused_activation_function" in op.attrs: + if op_type in set(("ConcatTFLite",)): + act = op.attrs["fused_activation_function"] + del op.attrs["fused_activation_function"] + if act is not None: + activation_function_to_split_out = act + + if activation_function_to_split_out is not None: + act_op = Operation(activation_function_to_split_out, name + activation_function_to_split_out) + out_tens = op.outputs[0] + intermediate_tens = out_tens.clone("_act_intermediate") + out_tens.ops = [act_op] + act_op.outputs = [out_tens] + intermediate_tens.ops = [op] + op.outputs[0] = intermediate_tens + act_op.inputs = [intermediate_tens] + + +class TFLiteGraph: + def __init__( + self, + filename, + batch_size=1, + feed_dict={}, + output_node_names=[], + initialisation_nodes=[], + ): + + self.op_times = {} + if batch_size is None: + batch_size = 1 + self.batch_size = batch_size + self.name = 
os.path.splitext(os.path.basename(filename))[0] + self.initialisation_nodes = initialisation_nodes + + with open(filename, "rb") as f: + buf = bytearray(f.read()) + + model = Model.GetRootAsModel(buf, 0) + + self.buffers = [] + for idx in range(model.BuffersLength()): + self.buffers.append(self.parse_buffer(model.Buffers(idx))) + + self.operator_codes = [] + for idx in range(model.OperatorCodesLength()): + self.operator_codes.append(self.parse_operator_code(model.OperatorCodes(idx))) + + self.subgraphs = [] + for idx in range(model.SubgraphsLength()): + self.subgraphs.append(TFLiteSubgraph(self, model.Subgraphs(idx))) + + self.nng = Graph(self.name, self.batch_size) + for tflite_sg in self.subgraphs: + sg = Subgraph(tflite_sg.name) + sg.original_inputs = tflite_sg.inputs # Preserve the original input order + sg.output_tensors = tflite_sg.outputs + self.nng.subgraphs.append(sg) + + def parse_buffer(self, buf_data): + if buf_data.DataLength() == 0: + return None + data = buf_data.DataAsNumpy() + return data + + def parse_operator_code(self, code): + c = code.BuiltinCode() + op_type, ser = builtin_operator_map[c] + if c == BuiltinOperator.CUSTOM: + op_type += decode_str(code.CustomCode()) + return op_type, ser + + +def read_tflite( + filename, + batch_size=1, + feed_dict={}, + output_node_names=[], + initialisation_nodes=[], +): + tflite_graph = TFLiteGraph( + filename, batch_size, feed_dict, output_node_names, initialisation_nodes + ) + nng = tflite_graph.nng + nng.refresh_after_modification() + return nng diff --git a/ethosu/vela/tflite_writer.py b/ethosu/vela/tflite_writer.py new file mode 100644 index 00000000..f55d1ce5 --- /dev/null +++ b/ethosu/vela/tflite_writer.py @@ -0,0 +1,424 @@ +# Copyright (C) 2020 Arm Limited or its affiliates. All rights reserved. +# +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +# Description: +# Functions used to write to a TensorFlow Lite format file. Supports adding in file identifiers. 
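read_tflite above is the entry point used by model_reader to build Vela's internal graph; the writer in this file is its counterpart, called by the compiler once optimisation is done. A minimal sketch of the reader on its own, assuming the ethosu.vela package is importable and "model.tflite" stands in for a real network file:

    from ethosu.vela.tflite_reader import read_tflite

    # Parse the flatbuffer into Vela's internal Graph representation.
    nng = read_tflite("model.tflite", batch_size=1)
    for sg in nng.subgraphs:
        print(sg.name, [tens.name for tens in sg.output_tensors])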
+ +import flatbuffers + +from .tflite import Tensor +from .tflite import QuantizationParameters +from .tflite import Model +from .tflite import SubGraph +from .tflite import OperatorCode +from .tflite import Operator +from .tflite import Buffer +from .tflite import Metadata + +import numpy as np + +from .tflite_mapping import datatype_inv_map, builtin_operator_inv_map, custom_prefix, BuiltinOperator +from .nn_graph import PassPlacement +from .tensor import TensorPurpose, MemArea +from flatbuffers.builder import UOffsetTFlags + +tflite_version = 3 +tflite_file_identifier = "TFL" + str(tflite_version) + + +import flatbuffers.number_types as N +from flatbuffers import encode + + +def FinishWithFileIdentifier(self, rootTable, fid): + if fid is None or len(fid) != 4: + raise Exception("fid must be 4 chars") + + flags = N.Uint8Flags + prepSize = 4 + self.Prep(self.minalign, prepSize + len(fid)) + for i in range(3, -1, -1): + self.head = self.head - flags.bytewidth + encode.Write(flags.packer_type, self.Bytes, self.Head(), ord(fid[i])) + + return self.Finish(rootTable) + + +flatbuffers.Builder.FinishWithFileIdentifier = FinishWithFileIdentifier + + +def make_vector(v): + try: + len(v) + return v + except TypeError: + return [v] + + +class TFLiteSerialiser: + def __init__(self, nng): + self.builder = flatbuffers.Builder(0) + self.nng = nng + + self.scratch_buf_id = 0 # Always assign scratch to buffer 0 + self.buffer_offsets_map = {} + self.buffers_to_write = [] # have an empty array there + + self.input_tensors = [] + self.ops_to_ignore = set(("Const", "Placeholder", "SubgraphInput")) + + self.tensors_to_reshape = {} + + self.subgraphs_to_write = [sg for sg in self.nng.subgraphs if sg.placement == PassPlacement.Cpu] + + all_ops = [] + for sg in self.subgraphs_to_write: + for ps in sg.passes: + for op in ps.ops: + if op.type not in self.ops_to_ignore: + all_ops.append(op) + if op.type.startswith("Conv2D") or op.type.startswith("DepthwiseConv2d"): + self.tensors_to_reshape[op.inputs[1]] = (3, 0, 1, 2) + if op.type.startswith("FullyConnected"): + self.tensors_to_reshape[op.inputs[1]] = (1, 0) + + self.operator_codes = list(sorted(set(op.type for op in all_ops))) + self.operator_code_map = {} + + def write_byte_vector(self, v, alignment=1): + builder = self.builder + builder.StartVector(1, len(v), alignment) + for e in v[::-1]: + builder.PrependByte(e) + return builder.EndVector(len(v)) + + def write_int_vector(self, v): + builder = self.builder + builder.StartVector(4, len(v), 4) + for e in v[::-1]: + builder.PrependInt32(e) + return builder.EndVector(len(v)) + + def write_long_vector(self, v): + builder = self.builder + builder.StartVector(8, len(v), 8) + for e in v[::-1]: + builder.PrependInt64(e) + return builder.EndVector(len(v)) + + def write_float_vector(self, v): + builder = self.builder + builder.StartVector(4, len(v), 4) + for e in v[::-1]: + builder.PrependFloat32(e) + return builder.EndVector(len(v)) + + def write_offset_vector(self, v): + builder = self.builder + builder.StartVector(4, len(v), 4) + for e in v[::-1]: + builder.PrependUOffsetTRelative(e) + return builder.EndVector(len(v)) + + def assign_buffers_to_tensors(self, tensors): + buffer_map = {} + scratch_tensor = [tens for tens in tensors if tens.purpose == TensorPurpose.Scratch][0] + buf_idx = 1 + + for tens in tensors: + if tens.mem_area == scratch_tensor.mem_area: + buffer_map[tens] = self.scratch_buf_id + else: + buffer_map[tens] = buf_idx + buf_idx += 1 + + # Initialize buffers_to_write to a length equal to numer of buffers 
so + # they can be appended at the correct index during tensor serialization + self.buffers_to_write = [None] * (buf_idx) + + return buffer_map + + def serialise_operator_code(self, idx, code): + builder = self.builder + custom_code_offset = None + if code.startswith(custom_prefix): + tf_code, opt_serializer = builtin_operator_inv_map[custom_prefix] + custom_code_offset = builder.CreateString(code[len(custom_prefix) :]) + else: + try: + tf_code, opt_serializer = builtin_operator_inv_map[code] + except KeyError: + print( + "Warning: Writing operation %s, which does not have a direct TensorFlow Lite mapping, as a custom operation" + % (code,) + ) + tf_code, opt_serializer = builtin_operator_inv_map[custom_prefix] + + if tf_code == BuiltinOperator.CUSTOM: + assert code == "NpuOp" # Currently only support serialising NPU operators as a custom op + custom_code_offset = builder.CreateString("ethos-u") + + self.operator_code_map[code] = (idx, tf_code, opt_serializer) + + OperatorCode.OperatorCodeStart(builder) + OperatorCode.OperatorCodeAddBuiltinCode(builder, tf_code) + if custom_code_offset is not None: + OperatorCode.OperatorCodeAddCustomCode(builder, custom_code_offset) + + return OperatorCode.OperatorCodeEnd(builder) + + def serialise_quantization_parameters(self, quant): + builder = self.builder + + min = None + max = None + scale = None + zero_point = None + if quant is not None: + if quant.min is not None: + min = self.write_float_vector(make_vector(quant.min)) + if quant.max is not None: + max = self.write_float_vector(make_vector(quant.max)) + if quant.scale_f32 is not None: + scale = self.write_float_vector(make_vector(quant.scale_f32)) + if quant.zero_point is not None: + zero_point = self.write_long_vector(make_vector(quant.zero_point)) + + QuantizationParameters.QuantizationParametersStart(builder) + if min is not None: + QuantizationParameters.QuantizationParametersAddMin(builder, min) + if max is not None: + QuantizationParameters.QuantizationParametersAddMax(builder, max) + if scale is not None: + QuantizationParameters.QuantizationParametersAddScale(builder, scale) + if zero_point is not None: + QuantizationParameters.QuantizationParametersAddZeroPoint(builder, zero_point) + return QuantizationParameters.QuantizationParametersEnd(builder) + + def serialise_tensor(self, tens): + builder = self.builder + tens_shape = tens.shape + values = tens.quant_values + if values is None: + values = tens.values + + if values is None: + values = np.empty(shape=(0), dtype=np.uint8) + + if tens in self.tensors_to_reshape: + reorder = self.tensors_to_reshape[tens] + tens_shape = [tens_shape[idx] for idx in reorder] + values = values.transpose(reorder) + + if tens.purpose == TensorPurpose.Scratch: + tens_shape = [0] + self.buffers_to_write[self.scratch_buf_id] = values.flatten().view(np.uint8) + + buf_id = self.buffer_map[tens] + if buf_id != self.scratch_buf_id: + self.buffers_to_write[buf_id] = values.flatten().view(np.uint8) + + shape = self.write_int_vector(tens_shape) + + name = builder.CreateString(tens.name) + quant = self.serialise_quantization_parameters(tens.quantization) + + Tensor.TensorStart(builder) + Tensor.TensorAddShape(builder, shape) + Tensor.TensorAddType(builder, datatype_inv_map[tens.dtype]) + # All tensors must have a valid backing buffer, even if it is empty. 
+ # Empty buffers should be kept unique for TensorFlow Lite Micro + Tensor.TensorAddBuffer(builder, buf_id) + Tensor.TensorAddName(builder, name) + Tensor.TensorAddQuantization(builder, quant) + + res = Tensor.TensorEnd(builder) + return res + + def serialise_operator(self, op): + builder = self.builder + + inputs_offset = self.write_int_vector([self.tensor_map[tens] for tens in op.inputs]) + outputs_offset = self.write_int_vector([self.tensor_map[tens] for tens in op.outputs]) + + op_idx, tflop, opt_serializer = self.operator_code_map[op.type] + + builtin_opt_offset = None + custom_opt_offset = None + if opt_serializer is not None: + attrs = dict(op.attrs) + if "strides" in attrs: + attrs["stride_h"] = attrs["strides"][1] + attrs["stride_w"] = attrs["strides"][2] + if "ksize" in attrs: + attrs["filter_height"] = attrs["ksize"][1] + attrs["filter_width"] = attrs["ksize"][2] + if "dilation" in attrs: + attrs["dilation_h_factor"] = attrs["dilation"][1] + attrs["dilation_w_factor"] = attrs["dilation"][2] + if "channel_multiplier" in attrs: + attrs["depth_multiplier"] = attrs["channel_multiplier"] + + builtin_opt_offset, custom_opt_offset = opt_serializer.serialize(builder, attrs) + + mutating_variable_inputs_offset = self.write_byte_vector([]) + Operator.OperatorStart(builder) + Operator.OperatorAddOpcodeIndex(builder, op_idx) + Operator.OperatorAddInputs(builder, inputs_offset) + Operator.OperatorAddOutputs(builder, outputs_offset) + + if builtin_opt_offset is not None: + Operator.OperatorAddBuiltinOptionsType(builder, opt_serializer.builtin_opt_type) + Operator.OperatorAddBuiltinOptions(builder, builtin_opt_offset) + if custom_opt_offset is not None: + Operator.OperatorAddCustomOptions(builder, custom_opt_offset) + Operator.OperatorAddCustomOptionsFormat(builder, opt_serializer.custom_opt_format) + + Operator.OperatorAddMutatingVariableInputs(builder, mutating_variable_inputs_offset) + return Operator.OperatorEnd(builder) + + def serialise_subgraph(self, sg): + builder = self.builder + tensor_set = set() + + all_ops = [] + for ps in sg.passes: + for op in ps.ops: + if op.type not in self.ops_to_ignore: + all_ops.append(op) + + for op in all_ops: + for tens in op.inputs + op.outputs: + tensor_set.add(tens) + + all_tensors = [tens for nm, idx, tens in sorted((tens.name, idx, tens) for idx, tens in enumerate(tensor_set))] + + self.tensor_map = {tens: idx for idx, tens in enumerate(all_tensors)} + self.buffer_map = self.assign_buffers_to_tensors(all_tensors) + + tensors_offset = self.write_offset_vector([self.serialise_tensor(tens) for tens in all_tensors]) + + # Add the Scratch Tensor as input to the NPU subgraph to get it allocated by TensorFlow Lite Micro + scratch_tensor_idx = [v for k, v in self.tensor_map.items() if k.name.endswith("scratch")] + + # Make sure the input_tensors haven't been modified + assert all(inp in sg.original_inputs for inp in sg.input_tensors) + inputs_offset = self.write_int_vector( + [self.tensor_map[tens] for tens in sg.original_inputs] + scratch_tensor_idx + ) + outputs_offset = self.write_int_vector([self.tensor_map[tens] for tens in sg.output_tensors]) + + operators_offset = self.write_offset_vector([self.serialise_operator(op) for op in all_ops]) + + SubGraph.SubGraphStart(builder) + SubGraph.SubGraphAddTensors(builder, tensors_offset) + SubGraph.SubGraphAddInputs(builder, inputs_offset) + SubGraph.SubGraphAddOutputs(builder, outputs_offset) + + SubGraph.SubGraphAddOperators(builder, operators_offset) + + return SubGraph.SubGraphEnd(builder) + + def 
write_aligned_bytes(self, buf): + builder = self.builder + builder.nested = True + data = bytes(buf) + length_bytes = UOffsetTFlags.py_type(len(data)) + builder.Prep(16, length_bytes) # Reserve aligned storage + builder.head = UOffsetTFlags.py_type(builder.Head() - length_bytes) # Update FlatBuffer internal pointer + builder.Bytes[builder.Head() : builder.Head() + length_bytes] = data # Assign bytes to aligned area + return builder.EndVector(length_bytes) + + def serialise_buffer(self, buf): + builder = self.builder + data = None + if buf is not None: + data = self.write_aligned_bytes(buf) + Buffer.BufferStart(builder) + if data is not None: + Buffer.BufferAddData(builder, data) + return Buffer.BufferEnd(builder) + + def serialise_metadata(self, metadata): + builder = self.builder + name = builder.CreateString(metadata[0]) + + Metadata.MetadataStart(builder) + Metadata.MetadataAddName(builder, name) + Metadata.MetadataAddBuffer(builder, metadata[1]) + + return Metadata.MetadataEnd(builder) + + def serialise_model(self): + builder = self.builder + operator_code_offset = self.write_offset_vector( + [self.serialise_operator_code(idx, code) for idx, code in enumerate(self.operator_codes)] + ) + + description = builder.CreateString("Vela Optimised") + + subgraph_offset = self.write_offset_vector([self.serialise_subgraph(sg) for sg in self.subgraphs_to_write]) + + # Fill the metadata buffer + version = np.int32(0) + subgraph_idx = np.int32(len(self.subgraphs_to_write)) # Only 1 supported currently + nbr_tensors = np.int32(len(self.tensor_map)) + + # An offset of -1 indicates that the tensor will be allocated online by Tensorflow Lite Micro + offsets = [np.int32(-1)] * nbr_tensors + + # Ensure that the order of the offsets match the order of the tensors + for tens, idx in self.tensor_map.items(): + if tens.mem_area == MemArea.Sram: + offsets[idx] = np.int32(tens.address) + + metadata_buffer = np.array([version, subgraph_idx, nbr_tensors] + offsets) + self.buffers_to_write.append(metadata_buffer) + + buffers_offset = self.write_offset_vector([self.serialise_buffer(buf) for buf in self.buffers_to_write]) + + metadata_list = [("OfflineMemoryAllocation", len(self.buffers_to_write) - 1)] + metadata_offset = self.write_offset_vector([self.serialise_metadata(metadata) for metadata in metadata_list]) + + Model.ModelStart(builder) + Model.ModelAddVersion(builder, tflite_version) + Model.ModelAddOperatorCodes(builder, operator_code_offset) + Model.ModelAddSubgraphs(builder, subgraph_offset) + Model.ModelAddDescription(builder, description) + Model.ModelAddBuffers(builder, buffers_offset) + Model.ModelAddMetadata(builder, metadata_offset) + return Model.ModelEnd(builder) + + def serialise(self): + + model = self.serialise_model() + + self.builder.FinishWithFileIdentifier(model, tflite_file_identifier) + + return self.builder.Output() + + def write(self, filename): + with open(self.filename, "wb") as f: + f.write(self.serialised_buf) + + +def write_tflite(nng, filename): + writer = TFLiteSerialiser(nng) + buf = writer.serialise() + + with open(filename, "wb") as f: + f.write(buf) diff --git a/ethosu/vela/vela.py b/ethosu/vela/vela.py new file mode 100644 index 00000000..f07aec89 --- /dev/null +++ b/ethosu/vela/vela.py @@ -0,0 +1,334 @@ +# Copyright (C) 2020 Arm Limited or its affiliates. All rights reserved. +# +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +# Description: +# Main entry point for the Vela compiler. +# +# Provides command line interface, options parsing, and network loading. Before calling the compiler driver. + +import sys +import os.path +import os +import time +import subprocess +import configparser +import argparse +import ast + +from . import architecture_features +from . import stats_writer +from . import tflite_writer +from . import model_reader +from . import compiler_driver +from . import scheduler +from ._version import __version__ +from .scheduler import ParetoMetric +from .nn_graph import MemArea, TensorFormat, TensorAllocator, PassPlacement + + +def process(fname, arch, model_reader_options, compiler_options, scheduler_options): + if compiler_options.timing: + start = time.time() + + nng = model_reader.read_model(fname, model_reader_options) + + if not nng: + print("reading of", fname, "failed") + assert False + + if compiler_options.verbose_operators: + nng.print_operators() + + if compiler_options.timing: + stop = time.time() + print("Model reading took %f s" % (stop - start)) + start = time.time() + + compiler_driver.compiler_driver(nng, arch, compiler_options, scheduler_options) + + passes_csv_file = "%s/%s_pass-breakdown_%s.csv" % (compiler_options.output_dir, nng.name, arch.system_config) + stats_writer.write_pass_metrics_csv(nng, passes_csv_file) + + summary_csv_file = "%s/%s_summary_%s.csv" % (compiler_options.output_dir, nng.name, arch.system_config) + stats_writer.write_summary_metrics_csv(nng, summary_csv_file, arch) + + stats_writer.print_performance_metrics(nng, show_cpu_operations=compiler_options.show_cpu_operations, arch=arch) + + if fname.endswith(".tflite"): + tflite_writer.write_tflite(nng, "%s/%s_vela.tflite" % (compiler_options.output_dir, nng.name)) + + if compiler_options.timing: + stop = time.time() + print("Compiler driver took %f s" % (stop - start)) + + return nng + + +def print_subgraph_io_summary(nng): + """Print a summary of all the input and output tensor sizes for all subgraphs. + Also displays the total tensor size and the memory used area for sram. 
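+ Individual tensors are only listed for subgraphs that are placed on the NPU.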
+ """ + + print("Subgraph IO Summary") + print("-------------------") + print("NNG: {0}".format(nng.name)) + max_sg_size = 0 + for sg in reversed(nng.subgraphs): + print(" Subgraph: {0} = {1}".format(sg.name, sg.placement)) + sg_size = 0 + + if sg.placement == PassPlacement.Npu: + for tens in sg.input_tensors + [sg.scratch_tensor] + sg.output_tensors: + if tens in sg.input_tensors: + tens_dir = "In" + elif tens in sg.output_tensors: + tens_dir = "Out" + else: + tens_dir = "In/Out" + + size = tens.elements() * tens.element_size() / 1024.0 + sg_size = sg_size + size + print(" Tensor [{0}]: {1} = {2} KiB".format(tens_dir, tens.name, size)) + + print(" Total Size = {0} KiB".format(sg_size)) + print(" SRAM Memory Used = {0} KiB".format(sg.memory_used.get(MemArea.Sram, 0) / 1024.0)) + max_sg_size = max(sg_size, max_sg_size) + + print(" Maximum Subgraph Size = {0} KiB".format(max_sg_size)) + + +def main(args=None): + if args is None: + args = sys.argv[1:] + + parser = argparse.ArgumentParser(prog="vela", description="Neural network model compiler for Ethos-U55") + + parser.add_argument( + "network", metavar="NETWORK", type=str, default=None, nargs=None, help="Filename of network to process" + ) + + parser.add_argument("--version", action="version", version=__version__) + parser.add_argument( + "--output-dir", type=str, default="output", help="Output directory to write files to (default: %(default)s)" + ) + parser.add_argument("--config", type=str, help="Location of vela configuration file") + parser.add_argument("--batch-size", type=int, default=1, help="Batch size (default: %(default)s)") + + parser.add_argument("--verbose-graph", action="store_true", help="Verbose graph rewriter") + parser.add_argument("--verbose-quantization", action="store_true", help="Verbose quantization") + parser.add_argument("--verbose-packing", action="store_true", help="Verbose pass packing") + parser.add_argument("--verbose-tensor-purpose", action="store_true", help="Verbose tensor purpose") + parser.add_argument("--verbose-tensor-format", action="store_true", help="Verbose tensor format") + parser.add_argument("--verbose-schedule", action="store_true", help="Verbose schedule") + parser.add_argument( + "--verbose-pareto-frontier-schedules", + action="store_true", + help="Show all schedules along the pareto frontier of optimisation criteria", + ) + parser.add_argument("--verbose-allocation", action="store_true", help="Verbose tensor allocation") + parser.add_argument( + "--verbose-high-level-command-stream", action="store_true", help="Verbose high level command stream" + ) + parser.add_argument( + "--verbose-register-command-stream", action="store_true", help="Verbose register command stream" + ) + parser.add_argument("--verbose-operators", action="store_true", help="Verbose operator list") + + parser.add_argument( + "--show-minimum-possible-allocation", action="store_true", help="Show the minimum possible allocation" + ) + parser.add_argument( + "--show-cpu-operations", action="store_true", help="Show the operations that fall back to the CPU" + ) + parser.add_argument( + "--cascading", + type=ast.literal_eval, + default=True, + choices=[True, False], + help="Controls the packing of multiple passes into a cascade (default: %(default)s)", + ) + parser.add_argument( + "--ifm-ofm-overlap", + type=ast.literal_eval, + default=True, + choices=[True, False], + help="Controls the overlapping of IFM and OFM buffers (default: %(default)s)", + ) + parser.add_argument("--force-block-config", type=str, default="", help="Force a 
specific block configuration HxWxC") + parser.add_argument( + "--inter-pass-cycle-delay", + type=int, + default=0, + help="Artificial delay between passes, measured in NPU cycles (default: %(default)s)", + ) + parser.add_argument("--timing", action="store_true", help="Time the compiler doing operations") + parser.add_argument( + "--accelerator-config", + type=str, + default="ethos-u55-256", + choices=list(architecture_features.ArchitectureFeatures.accelerator_configs.keys()), + help="Accelerator configuration to use (default: %(default)s)", + ) + parser.add_argument( + "--system-config", + type=str, + default="internal-default", + help="System configuration to use (default: %(default)s)", + ) + parser.add_argument( + "--dram-bandwidth", + type=float, + default=0.0, + help="DRAM memory bandwidth in GB/s, use zero to select the value from system config (default: %(default)s)", + ) + parser.add_argument( + "--permanent-storage", + default=MemArea.OffChipFlash, + type=lambda s: MemArea[s], + choices=list(MemArea)[3:-1], + help=( + "Memory area for permanent storage. To store the weights and other constant data in SRAM select " + "'OnChipFlash' (default: %(default)s)" + ), + ) + parser.add_argument( + "--tensor-allocator", + default=TensorAllocator.Greedy, + type=lambda s: TensorAllocator[s], + choices=list(TensorAllocator), + help="Tensor Allocator algorithm (default: %(default)s)", + ) + parser.add_argument( + "--show-subgraph-io-summary", + action="store_true", + help="Shows a summary of all the subgraphs and their inputs and outputs", + ) + parser.add_argument( + "--ifm-streaming", + type=ast.literal_eval, + default=True, + choices=[True, False], + help="Controls scheduler IFM streaming search (default: %(default)s)", + ) + parser.add_argument( + "--block-config-limit", + type=int, + default=16, + help="Limit block config search space, use zero for unlimited (default: %(default)s)", + ) + parser.add_argument( + "--global-memory-clock-scale", + type=float, + default=1.0, + help=( + "Performs an additional scaling of the individual memory clock scales specified by the system config " + "(default: %(default)s)" + ), + ) + parser.add_argument( + "--pareto-metric", + default=ParetoMetric.BwCycMem, + type=lambda s: ParetoMetric[s], + choices=list(ParetoMetric), + help="Controls the calculation of the pareto metric (default: %(default)s)", + ) + parser.add_argument( + "--recursion-limit", + type=int, + default=10000, + help="Set the recursion depth limit, may result in RecursionError if too low (default: %(default)s)", + ) + parser.add_argument( + "--max-block-dependency", + type=int, + default=architecture_features.ArchitectureFeatures.MAX_BLOCKDEP, + choices=range(0, architecture_features.ArchitectureFeatures.MAX_BLOCKDEP + 1), + help=( + "Set the maximum value that can be used for the block dependency between npu kernel operations " + "(default: %(default)s)" + ), + ) + + args = parser.parse_args(args=args) + + # Read configuration file + config_file = args.config + config = None + if config_file is not None: + with open(config_file) as f: + config = configparser.ConfigParser() + config.read_file(f) + + if args.network is None: + parser.error("the following argument is required: NETWORK") + + sys.setrecursionlimit(args.recursion_limit) + + if args.force_block_config: + force_block_config = architecture_features.Block.from_string(args.force_block_config) + else: + force_block_config = None + + arch = architecture_features.ArchitectureFeatures( + vela_config=config, + 
system_config=args.system_config, + accelerator_config=args.accelerator_config, + permanent_storage=args.permanent_storage, + inter_pass_cycle_delay=args.inter_pass_cycle_delay, + dram_bandwidth=args.dram_bandwidth, + override_block_config=force_block_config, + block_config_limit=args.block_config_limit, + global_memory_clock_scale=args.global_memory_clock_scale, + max_blockdep=args.max_block_dependency, + ) + + compiler_options = compiler_driver.CompilerOptions( + verbose_graph=args.verbose_graph, + verbose_quantization=args.verbose_quantization, + verbose_packing=args.verbose_packing, + verbose_tensor_purpose=args.verbose_tensor_purpose, + verbose_tensor_format=args.verbose_tensor_format, + verbose_allocation=args.verbose_allocation, + verbose_high_level_command_stream=args.verbose_high_level_command_stream, + verbose_register_command_stream=args.verbose_register_command_stream, + verbose_operators=args.verbose_operators, + show_minimum_possible_allocation=args.show_minimum_possible_allocation, + show_cpu_operations=args.show_cpu_operations, + tensor_allocator=args.tensor_allocator, + timing=args.timing, + output_dir=args.output_dir, + ) + + scheduler_options = scheduler.SchedulerOptions( + use_cascading=args.cascading, + use_ifm_ofm_overlap=args.ifm_ofm_overlap, + verbose_schedule=args.verbose_schedule, + verbose_pareto_frontier_schedules=args.verbose_pareto_frontier_schedules, + use_ifm_streaming=args.ifm_streaming, + pareto_metric=args.pareto_metric, + ) + + model_reader_options = model_reader.ModelReaderOptions(batch_size=args.batch_size) + + os.makedirs(args.output_dir, exist_ok=True) + + nng = process(args.network, arch, model_reader_options, compiler_options, scheduler_options) + + if args.show_subgraph_io_summary: + print_subgraph_io_summary(nng) + + return 0 diff --git a/ethosu/vela/weight_compressor.py b/ethosu/vela/weight_compressor.py new file mode 100644 index 00000000..0b4ac696 --- /dev/null +++ b/ethosu/vela/weight_compressor.py @@ -0,0 +1,387 @@ +# Copyright (C) 2020 Arm Limited or its affiliates. All rights reserved. +# +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +# Description: +# Compresses and pads the weigths. It also calculates the scales and packs with the biases. 
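The command line interface in vela.py above maps directly onto the option objects handed to the compiler driver, so the same flow can be driven from Python. A minimal sketch, assuming the ethosu.vela package is installed and "network.tflite" is a placeholder path:

    from ethosu.vela import vela

    # Equivalent to invoking the command line tool; arguments are passed argv-style.
    vela.main([
        "--accelerator-config", "ethos-u55-256",
        "--output-dir", "output",
        "network.tflite",
    ])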
+ +import os +import sys +import enum +import math +import numpy as np +from collections import namedtuple +from .numeric_util import round_up +from .scaling import quantise_scale +from .tensor import TensorPurpose, TensorSubPurpose, TensorFormat, TensorBlockTraversal +from .operation import NpuBlockType +from .architecture_features import Block +from .nn_graph import SchedulingStrategy +from .data_type import DataType + +from ethosu import mlw_codec + + +def encode(weight_stream): + assert np.amin(weight_stream) >= -255 + assert np.amax(weight_stream) <= 255 + + # Encode flattened signed weight stream + compressed = mlw_codec.encode(weight_stream) + + # pad with 0xFF as needed so the length of the weight stream + # is a multiple of 16 + + while (len(compressed) % 16) != 0: + compressed.append(0xFF) + + return compressed + + +def generate_brick(arch, brick_weights, ofm_block, block_traversal, ifm_bitdepth): + is_depthwise = block_traversal == TensorBlockTraversal.DepthWise + is_partkernel = block_traversal == TensorBlockTraversal.PartKernelFirst + subkernel_max = arch.subkernel_max + ofm_ublock = arch.ofm_ublock + ifm_ublock = arch.ifm_ublock + # Expect weights formatted HWIO + ofm_depth = brick_weights.shape[-1] + ifm_depth = brick_weights.shape[-2] + kernel_width = brick_weights.shape[-3] + kernel_height = brick_weights.shape[-4] + # IFM block depth + if is_partkernel or (ifm_bitdepth == 16): + # IFM block depth is always 16 for part-kernel-first + ifm_block_depth = 16 + elif ifm_bitdepth == 8: + ifm_block_depth = 32 + else: + assert False + + stream = [] + + # Top level striping - OFM blocks in the entire brick's depth + for ofm_block_z in range(0, ofm_depth, ofm_block.depth): + clipped_ofm_block_depth = min(ofm_block.depth, ofm_depth - ofm_block_z) + # IFM blocks required for the brick + for ifm_block_z in range(0, (1 if is_depthwise else ifm_depth), ifm_block_depth): + if is_depthwise: + clipped_ifm_block_depth = ifm_ublock.depth + else: + clipped_ifm_block_depth = ( + min(ifm_block_depth, ifm_depth - ifm_block_z) if is_partkernel else ifm_block_depth + ) + # Weight decomposition + # Subkernel Splitting (H) + for subkernel_y in range(0, kernel_height, subkernel_max.height): + sub_height = min(kernel_height - subkernel_y, subkernel_max.height) + # Subkernel splitting (W) + for subkernel_x in range(0, kernel_width, subkernel_max.width): + sub_width = min(kernel_width - subkernel_x, subkernel_max.width) + subkernel_elements = sub_width * sub_height + # Part kernel first works across the kernel H/W and needs padding + if is_partkernel: + if ifm_bitdepth == 16 and subkernel_elements % 2 != 0: + subkernel_elements = int(math.ceil(subkernel_elements / 2) * 2) + elif ifm_bitdepth == 8 and subkernel_elements % 4 != 0: + subkernel_elements = int(math.ceil(subkernel_elements / 4) * 4) + + # Depthwise Conv requires multiple of 4 kernel elements in its weight block + # this is different from normal conv which is considered "weights depth-first" + elif is_depthwise: + subkernel_elements = int(math.ceil(subkernel_elements / 4.0) * 4) + + ifm_block_depth_outer = clipped_ifm_block_depth if is_partkernel else 1 + ifm_block_depth_inner = 1 if is_partkernel else clipped_ifm_block_depth + # IFM Ublocks in IFM-block over depth for part-kernel-first mode + # For depth-first IFM Ublocks are traversed after subkernel elements so this loop is ignored. 
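+ # In depth-first mode ifm_block_depth_outer is 1, so the loop below runs once per subkernel;
+ # in part-kernel-first mode it steps through the clipped IFM block depth in IFM ublock increments.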
+ for ifm_ublk_outer in range(0, ifm_block_depth_outer, ifm_ublock.depth): + # OFM Ublocks in OFM-block over depth + for ofm_ublk in range(0, clipped_ofm_block_depth, ofm_ublock.depth): + # HW Kernel element traversal - cannot be a H/W loop due to element + # padding requirement on depthwise/part-kernel configurations + for element in range(subkernel_elements): + kx = element % sub_width + ky = element // sub_width + # IFM Ublocks in IFM-block over depth (only 1 ublock if depthwise) + # In case of part-kernel-first IFM Ublock traversal have already been handled + # and this loop is ignored. + for ifm_ublk_inner in range(0, ifm_block_depth_inner, ifm_ublock.depth): + # Feed OFM ublock elements + for ofm_ublock_z in range(ofm_ublock.depth): + # Source IFM ublock elements (only 1 element deep if depthwise) + for ifm_ublock_z in range(1 if is_depthwise else ifm_ublock.depth): + # Source position within the current subkernel + wx = subkernel_x + kx + wy = subkernel_y + ky + # Source IFM/OFM slices + ifm_ublk = ifm_ublk_inner + ifm_ublk_outer + ifm_z = ifm_block_z + ifm_ublk + ifm_ublock_z + ofm_z = ofm_block_z + ofm_ublk + ofm_ublock_z + if (ifm_z >= ifm_depth) or (ofm_z >= ofm_depth) or (ky >= sub_height): + stream.append(0) + else: + stream.append(brick_weights[wy][wx][ifm_z][ofm_z]) + return stream + + +# Compress the weights +def compress_weights(tens, arch, npu_block_type, ofm_block, ofm_depth_step, min_val=None, max_val=None): + assert tens.purpose == TensorPurpose.Weights + assert tens.format == TensorFormat.WeightsCompressed + + WeightCompressionConfig = namedtuple("WeightCompressionConfig", ["npu_block_type", "ofm_block", "ofm_depth_step"]) + + # check if weights have already been compressed + wcc = tens.weight_compression_config + if wcc is not None: + assert wcc.npu_block_type == npu_block_type, "Weights not used by the same operator type" + + if wcc.ofm_block == ofm_block and wcc.ofm_depth_step == ofm_depth_step: + return + + assert tens.quantization is not None + assert tens.quantization.scale_f32 is not None + assert tens.quantization.zero_point is not None + + zero_point = tens.quantization.zero_point + quant_buf = tens.quant_values.astype(np.int64) + + # Early zero-point correction + weights = quant_buf - zero_point + + if len(weights.shape) == 2: + weights = np.expand_dims(np.expand_dims(weights, axis=0), axis=0) + weights_shape = (weights.shape[0], 1, 1, weights.shape[1]) + else: + weights_shape = weights.shape + + compression_scales = [] + compressed_offsets = [] + encoded_streams = [] + offset = 0 + max_single_buffer_len = 0 + + ifm_bitdepth = tens.consumer_list[0].inputs[0].dtype.size_in_bits() + ifm_depth = weights.shape[-2] + if npu_block_type == NpuBlockType.ConvolutionDepthWise: + tens.block_traversal = TensorBlockTraversal.DepthWise + if npu_block_type == NpuBlockType.ConvolutionMxN: + # Determine which block traversal strategy has better DPU utilization + kernel_size = weights_shape[0] * weights_shape[1] + depth_utilization = weights_shape[2] / round_up(weights_shape[2], 32 if ifm_bitdepth == 8 else 16) + part_kernel_utilization = (weights_shape[2] / round_up(weights_shape[2], 8)) * ( + kernel_size / round_up(kernel_size, 4 if ifm_bitdepth == 8 else 2) + ) + if part_kernel_utilization >= depth_utilization or ifm_depth <= 8: + # Part-kernel first is always better for ifm depths <= 8 + tens.block_traversal = TensorBlockTraversal.PartKernelFirst + else: + tens.block_traversal = TensorBlockTraversal.DepthFirst + + # Slice weight stream up depth-ways into bricks and 
compress + full_ofm_depth = quant_buf.shape[-1] + for idx in range(0, full_ofm_depth, ofm_depth_step): + # Get the weights necessary for this brick + count = min(full_ofm_depth - idx, ofm_depth_step) + brick_weights = weights[:, :, :, idx : idx + count] + + # Encode all weights into one chunk + raw_stream = generate_brick(arch, brick_weights, ofm_block, tens.block_traversal, ifm_bitdepth) + encoded = encode(raw_stream) + encoded_streams.append(encoded) + + # Remember maximum encoded length for DoubleBuffering + if max_single_buffer_len < len(encoded): + max_single_buffer_len = len(encoded) + + # Remember where we put it for linear addressing + compressed_offsets.append(offset) + offset += len(encoded) + assert offset % 16 == 0 + + # Compression scale tracking + compression_scales.append(len(encoded) / len(raw_stream)) + + # Also track complete length in the offsets array + compressed_offsets.append(offset) + + if tens.sub_purpose == TensorSubPurpose.DoubleBuffer and len(encoded_streams) > 2: + offset = 2 * max_single_buffer_len + assert offset % 16 == 0 + + tens.storage_shape = [1, 1, 1, offset] + tens.weight_compression_scales = compression_scales + tens.weight_compression_config = WeightCompressionConfig(npu_block_type, ofm_block, ofm_depth_step) + tens.weight_compressed_offsets = compressed_offsets + tens.compression_scale_for_worst_weight_stream = np.amax(compression_scales) + tens.storage_compression_scale = tens.bandwidth_compression_scale = np.average(compression_scales) + tens.compressed_values = encoded_streams + tens.brick_size = (weights_shape[0], weights_shape[1], weights_shape[2], min(tens.shape[-1], ofm_depth_step)) + + +def calc_scales_and_pack_biases(tens, arch, oc_quantum, rescale_for_faf=False): + assert tens.purpose == TensorPurpose.FeatureMap + assert tens.format == TensorFormat.NHWC + # the connected operator should expect a bias input unless it is a FullyConnected + assert "Bias" in tens.consumer_list[0].type or tens.consumer_list[0].type.startswith("FullyConnected") + # the input bias tensor is the same as that connected to the operator + assert tens is tens.consumer_list[0].inputs[2] + # the operator should only have a single output + assert len(tens.consumer_list[0].outputs) == 1 + + def pack_bias_and_scale(bias, scale, shift): + bias = np.int64(bias) + assert -(1 << (40 - 1)) <= bias < (1 << (40 - 1)) # signed 40-bit range + assert 0 <= scale < (1 << 32) # unsigned 32-bit range + assert 0 <= shift < (1 << 6) # unsigned 6-bit range + + # pack the 80 bit value = [0(2-bits),shift(6-bits),scale(32-bits),bias(40-bits)] + data = bytearray(10) + data[0] = (bias >> (0 * 8)) & 0xFF + data[1] = (bias >> (1 * 8)) & 0xFF + data[2] = (bias >> (2 * 8)) & 0xFF + data[3] = (bias >> (3 * 8)) & 0xFF + data[4] = (bias >> (4 * 8)) & 0xFF + data[5] = (scale >> (0 * 8)) & 0xFF + data[6] = (scale >> (1 * 8)) & 0xFF + data[7] = (scale >> (2 * 8)) & 0xFF + data[8] = (scale >> (3 * 8)) & 0xFF + data[9] = shift & 0x3F + return data + + biases = tens.quant_values + + first_consumer_op = tens.consumer_list[0] + ifm_dtype = first_consumer_op.inputs[0].dtype + ifm_scale = first_consumer_op.inputs[0].quantization.scale_f32 + ofm_scale = first_consumer_op.outputs[0].quantization.scale_f32 + weight_scales = first_consumer_op.inputs[1].quantization.scale_f32 + + # biases can have multiple consumers for rnn cells. 
+    for op in tens.consumer_list[1:]:
+        assert ifm_scale == op.inputs[0].quantization.scale_f32
+        assert ofm_scale == op.outputs[0].quantization.scale_f32
+        assert weight_scales == op.inputs[1].quantization.scale_f32
+
+    if not hasattr(weight_scales, "__iter__"):
+        # If weight_scales is not already an iterable make it into a list
+        weight_scales = [weight_scales]
+
+    # Convert scales to np.double (from np.float32) to conform to TensorFlow Lite which
+    # uses double during scaling calculations
+    # TensorFlow Lite casts the scales slightly differently for uint8 and int8
+    if not rescale_for_faf:
+        if ifm_dtype == DataType.uint8:
+            scales = [np.double(ifm_scale * weight_scale) / np.double(ofm_scale) for weight_scale in weight_scales]
+        elif ifm_dtype == DataType.int8:
+            scales = [
+                (np.double(ifm_scale) * np.double(weight_scale)) / np.double(ofm_scale)
+                for weight_scale in weight_scales
+            ]
+        else:
+            assert False, str(ifm_dtype) + " not implemented"
+    else:
+        if ifm_dtype == DataType.uint8:
+            scales = [np.double(ifm_scale * weight_scale * 0x3000) for weight_scale in weight_scales]
+        elif ifm_dtype == DataType.int8:
+            scales = [(np.double(ifm_scale * 0x3000) * np.double(weight_scale)) for weight_scale in weight_scales]
+        else:
+            assert False, str(ifm_dtype) + " not implemented"
+
+    # quantise all of the weight scales into (scale_factor, shift)
+    quantised_scales = [quantise_scale(scale) for scale in scales]
+
+    for _, shift in quantised_scales:
+        assert shift >= 16
+
+    # pack the biases and scales
+    tens.compressed_values = []
+    if len(quantised_scales) == 1:
+        # If only 1 quantised scale is used, repeat that value for the length of the biases
+        quantised_scales = [quantised_scales[0]] * len(biases)
+
+    assert len(quantised_scales) == len(biases)
+    for i, bias in enumerate(biases):
+        tens.compressed_values.append(pack_bias_and_scale(bias, *quantised_scales[i]))
+
+    tens.element_size_bytes = 10
+
+    # Figure out if we need padded storage (extra whole elements)
+    padding = (len(tens.compressed_values) * tens.element_size_bytes) % 16
+    if padding != 0:
+        padding = 16 - padding
+
+    # This adds enough padding to allow over-reads
+    while padding > 0:
+        tens.compressed_values.append(pack_bias_and_scale(0, 0, 0))
+        padding = padding - tens.element_size_bytes
+
+    tens.storage_shape = [len(tens.compressed_values)]
+
+
+def update_pass_weight_and_scale_tensors(nng, arch):
+    def find_npu_usage_of_tensor(tens):
+        # TODO: This function is identical to the one in mark_tensors.py. A common version should be used.
+        for op in tens.consumers():
+            if op.type == "DMA":
+                return find_npu_usage_of_tensor(op.outputs[0])
+            if "npu_block_type" in op.attrs:
+                return op.attrs["npu_block_type"]
+            return NpuBlockType.Default
+
+    for sg in nng.subgraphs:
+        for ps in sg.passes:
+            if ps.weight_tensor != None:
+                npu_usage_of_tensor = find_npu_usage_of_tensor(ps.weight_tensor)
+                if npu_usage_of_tensor == NpuBlockType.ConvolutionDepthWise:
+                    ps.weight_tensor.quant_values = np.transpose(ps.weight_tensor.quant_values, (0, 1, 3, 2))
+                    ps.weight_tensor.shape = ps.weight_tensor.storage_shape = ps.weight_tensor.bandwidth_shape = list(
+                        ps.weight_tensor.quant_values.shape
+                    )
+                    ps.weight_tensor.weight_transpose_depthwise = True
+
+                needs_dma = len(ps.weight_tensor.ops) == 1 and ps.weight_tensor.ops[0].type == "DMA"
+                if ps.cascade.strategy == SchedulingStrategy.WeightStream and needs_dma:
+                    ofm_depth_step = ps.block_config[-1]
+                else:
+                    ofm_depth_step = ps.weight_tensor.shape[-1]
+
+                compress_weights(
+                    ps.weight_tensor,
+                    arch,
+                    npu_usage_of_tensor,
+                    Block(ps.block_config[-3], ps.block_config[-4], ps.block_config[-1]),
+                    ofm_depth_step,
+                )
+                # Update source tensor
+                if len(ps.weight_tensor.ops) == 1 and ps.weight_tensor.ops[0].type == "DMA":
+                    src_tens = ps.weight_tensor.ops[0].inputs[0]
+                    src_tens.shape = ps.weight_tensor.shape
+                    src_tens.weight_transpose_depthwise = ps.weight_tensor.weight_transpose_depthwise
+                    src_tens.quant_values = ps.weight_tensor.quant_values
+                    src_tens.compressed_values = ps.weight_tensor.compressed_values
+                    src_tens.storage_shape = [1, 1, 1, ps.weight_tensor.weight_compressed_offsets[-1]]
+                    src_tens.brick_size = ps.weight_tensor.brick_size
+                    src_tens.weight_compression_scales = ps.weight_tensor.weight_compression_scales
+                    src_tens.weight_compressed_offsets = ps.weight_tensor.weight_compressed_offsets
+
+            if ps.scale_tensor != None:
+                rescale_for_faf = False
+                activation_ops = set(("Sigmoid", "Tanh"))
+                if (ps.ops[-1].type in activation_ops) and (ps.npu_block_type != NpuBlockType.ElementWise):
+                    rescale_for_faf = True
+                calc_scales_and_pack_biases(ps.scale_tensor, arch, ps.block_config[3], rescale_for_faf)
diff --git a/setup.py b/setup.py
new file mode 100644
index 00000000..1a1ae845
--- /dev/null
+++ b/setup.py
@@ -0,0 +1,63 @@
+# Copyright (C) 2020 Arm Limited or its affiliates. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the License); you may
+# not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an AS IS BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Description:
+# Packaging for the Vela compiler
+
+from os import path
+from setuptools import setup, find_namespace_packages, Extension
+
+# Read the contents of README.md file
+this_directory = path.abspath(path.dirname(__file__))
+with open(path.join(this_directory, "README.md"), encoding="utf-8") as f:
+    long_description = f.read()
+
+mlw_module = Extension(
+    "ethosu.mlw_codec",
+    ["ethosu/mlw_codec/mlw_encode.c", "ethosu/mlw_codec/mlw_decode.c", "ethosu/mlw_codec/mlw_codecmodule.c"],
+)
+
+setup(
+    name="ethos-u-vela",
+    use_scm_version=True,
+    description="Optimise TensorFlow Lite models for Ethos-U55 NPU.",
+    long_description=long_description,
+    long_description_content_type="text/markdown",
+    url="https://git.mlplatform.org/ml/ethos-u/ethos-u-vela.git/",
+    author="Arm Ltd.",
+    license="Apache License 2.0",
+    classifiers=[
+        "Development Status :: 4 - Beta",
+        "Intended Audience :: Developers",
+        "License :: OSI Approved :: Apache Software License",
+        "Operating System :: POSIX :: Linux",
+        "Programming Language :: C",
+        "Programming Language :: Python :: 3",
+        "Programming Language :: Python :: 3.6",
+        "Programming Language :: Python :: 3.7",
+        "Programming Language :: Python :: 3.8",
+        "Topic :: Scientific/Engineering :: Artificial Intelligence",
+        "Topic :: Software Development :: Compilers",
+    ],
+    keywords=["ethos-u", "vela compiler", "tflite", "npu"],
+    packages=find_namespace_packages(include=["ethosu.*"]),
+    python_requires="~=3.6",  # We support only 3.6+
+    install_requires=["flatbuffers==1.11.0", "numpy>=1.16.6"],
+    entry_points={"console_scripts": ["vela = ethosu.vela.vela:main"]},
+    ext_modules=[mlw_module],
+    setup_requires=["setuptools_scm"],
+)
-- 
cgit v1.2.1
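
Note on the (scale, shift) pairs consumed by pack_bias_and_scale() above: quantise_scale() is defined
elsewhere in this patch (it is not part of this hunk) and reduces each double-precision rescale factor to a
32-bit scale and a 6-bit right-shift such that scale * 2**-shift approximates the original value. The sketch
below is only an illustration of that kind of conversion under assumed rounding behaviour; the name
quantise_scale_sketch and its details are not taken from the patch.

    import math

    def quantise_scale_sketch(scale):
        # Express scale as significand * 2**exponent, with significand in [0.5, 1)
        significand, exponent = math.frexp(scale)
        # Promote the significand to a 31-bit integer; the leftover power of two becomes the shift
        scale_q31 = int(round(significand * (1 << 31)))
        shift = 31 - exponent
        assert 0 <= scale_q31 < (1 << 32)  # fits the unsigned 32-bit field packed above
        assert 0 <= shift < (1 << 6)  # fits the unsigned 6-bit field packed above
        return scale_q31, shift

    # Example: a combined rescale of 0.0005 gives (1099511628, 41), and 1099511628 * 2**-41 is approximately 0.0005
    print(quantise_scale_sketch(0.0005))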