From ea6111a36e55501bbfb9ea022aaf8305b4d80183 Mon Sep 17 00:00:00 2001 From: Diego Russo Date: Tue, 14 Apr 2020 18:41:58 +0100 Subject: Add pre-commit support for sanity checks Use the pre-commit framework [2] to run black and flake8 before the commit. black and flake8 are managed by the pre-commit framework and can be run manually with the `pre-commit run` command. Fix the code base with the help of black and flake8. Fix import statements according to the PEP8 guidelines [1]. Both tools have the following settings (specified in the pre-commit configuration file): * line length: 120 characters * directories to exclude: ethosu/vela/tflite/ and ethosu/vela/ethos_u55_regs Updated README.md with instructions on how to install pre-commit and how to run the sanity checks. Pipenv files have been updated with the new dependencies for pre-commit. [1]: https://www.python.org/dev/peps/pep-0008/#imports [2]: https://github.com/pre-commit/pre-commit Change-Id: I304d9fffdf019d390ffa396a529c8a7c2437f63d Signed-off-by: Diego Russo --- ethosu/vela/_version.py | 2 +- ethosu/vela/architecture_features.py | 22 ++++++++-------- ethosu/vela/compiler_driver.py | 7 +++-- ethosu/vela/data_type.py | 3 ++- ethosu/vela/driver_actions.py | 18 +++++++------ ethosu/vela/extract_npu_subgraphs.py | 5 ++-- ethosu/vela/graph_optimiser.py | 29 ++++++++++++--------- ethosu/vela/high_level_command_stream.py | 8 +++--- ethosu/vela/high_level_command_stream_generator.py | 3 +-- ethosu/vela/insert_dma.py | 5 ++-- ethosu/vela/live_range.py | 12 ++++----- ethosu/vela/mark_tensors.py | 5 ++-- ethosu/vela/nn_graph.py | 3 --- ethosu/vela/npu_performance.py | 15 +++++------ ethosu/vela/npu_serialisation.py | 12 +++++---- ethosu/vela/numeric_util.py | 1 + ethosu/vela/pass_packing.py | 29 ++++++++++----------- ethosu/vela/range_set.py | 1 - ethosu/vela/register_command_stream_generator.py | 27 ++++++++----------- ethosu/vela/scaling.py | 3 ++- ethosu/vela/scheduler.py | 30 ++++++++-------------- ethosu/vela/shared_buffer_allocation.py | 5 ++-- ethosu/vela/stats_writer.py | 11 +++++--- ethosu/vela/tensor.py | 7 ++--- ethosu/vela/tensor_allocation.py | 9 ++++--- ethosu/vela/tflite_mapping.py | 17 +++++------- ethosu/vela/tflite_reader.py | 28 +++++++------------- ethosu/vela/tflite_writer.py | 18 ++++++------- ethosu/vela/vela.py | 8 +++--- ethosu/vela/weight_compressor.py | 13 +++++----- 30 files changed, 169 insertions(+), 187 deletions(-) (limited to 'ethosu/vela') diff --git a/ethosu/vela/_version.py b/ethosu/vela/_version.py index f3888c31..b670819d 100644 --- a/ethosu/vela/_version.py +++ b/ethosu/vela/_version.py @@ -16,4 +16,4 @@ import pkg_resources -__version__ = pkg_resources.get_distribution("ethos-u-vela").version \ No newline at end of file +__version__ = pkg_resources.get_distribution("ethos-u-vela").version diff --git a/ethosu/vela/architecture_features.py b/ethosu/vela/architecture_features.py index 51c632e0..69f95fa2 100644 --- a/ethosu/vela/architecture_features.py +++ b/ethosu/vela/architecture_features.py @@ -18,13 +18,17 @@ # Description: # Holds a container for Ethos-U55/System architecture parameters. 
-from .nn_graph import MemArea, TensorPurpose, NpuBlockType, TensorFormat -from .numeric_util import round_up, round_up_divide +import enum from collections import namedtuple from configparser import ConfigParser -from .supported_operators import SupportedOperators + import numpy as np -import enum + +from .tensor import MemArea, TensorPurpose, TensorFormat +from .operation import NpuBlockType +from .numeric_util import round_up, round_up_divide +from .supported_operators import SupportedOperators + PointXY = namedtuple("PointXY", "x y") PointXYZ = namedtuple("PointXYZ", "x y z") @@ -151,7 +155,7 @@ Note the difference between ArchitectureFeatures and CompilerOptions accelerator_config = accelerator_config.lower() self.vela_config = vela_config self.accelerator_config = accelerator_config - if not self.accelerator_config in ArchitectureFeatures.accelerator_configs: + if self.accelerator_config not in ArchitectureFeatures.accelerator_configs: raise Exception("Unknown accelerator configuration " + self.accelerator_config) accel_config = ArchitectureFeatures.accelerator_configs[self.accelerator_config] self.config = accel_config @@ -450,7 +454,6 @@ Note the difference between ArchitectureFeatures and CompilerOptions ) # Calculate how many IFM blocks this OFM block requires (i.e how many jobs) - ifm_block = self.get_ifm_block_size(ifm_block_depth, ofm_block, kernel, self.ofm_block_max) ifm_depth_blocks = round_up_divide(ifm.size().depth, ifm_block_depth) ifm_depth_blocks = 1 # Overwrite with 1 to force OFM block dependency, not IFM @@ -476,7 +479,6 @@ Note the difference between ArchitectureFeatures and CompilerOptions # Iterate over the next BLOCKDEP inputs, checking to see if a sliding window # of IFM area overlaps with any previous OFM block generation. elapsed_jobs = 0 - ifm_depth = ifm.size().depth for forward_offset in range(ArchitectureFeatures.MAX_BLOCKDEP): # This is the IFM block we want to sample from in_area = self.get_first_job_input_volume( @@ -533,7 +535,7 @@ Note the difference between ArchitectureFeatures and CompilerOptions n_elements = op.inputs[0].elements() cycles = intercept + n_elements * slope return cycles - except: + except Exception: print("Error: Reading CPU cycle estimate in vela configuration file, section {}".format(section)) raise @@ -554,7 +556,7 @@ Note the difference between ArchitectureFeatures and CompilerOptions print("Warning: Using default values for system configuration") else: section_key = "SysConfig." + self.system_config - if not section_key in self.vela_config: + if section_key not in self.vela_config: raise Exception("Unknown system configuration " + self.system_config) try: @@ -585,7 +587,7 @@ Note the difference between ArchitectureFeatures and CompilerOptions + " (must be 'OnChipFlash' or 'OffChipFlash'). To store the weights and other constant data in SRAM" " select 'OnChipFlash'" ) - except: + except Exception: print("Error: Reading System Configuration in vela configuration file, section {}".format(section_key)) raise diff --git a/ethosu/vela/compiler_driver.py b/ethosu/vela/compiler_driver.py index db669ac7..6fc3b653 100644 --- a/ethosu/vela/compiler_driver.py +++ b/ethosu/vela/compiler_driver.py @@ -18,6 +18,8 @@ # Description: # Contains the main sequencing of the compiler. +import time + from . import graph_optimiser from . import mark_tensors from . import insert_dma @@ -25,9 +27,6 @@ from . import pass_packing from . import scheduler from . import tensor_allocation from . import npu_performance -import time - -from . 
import high_level_command_stream from . import high_level_command_stream_generator from . import register_command_stream_generator from . import extract_npu_subgraphs @@ -36,7 +35,7 @@ from . import weight_compressor from . import live_range from .tensor import MemArea from .nn_graph import TensorAllocator, PassPlacement -from .rewrite_graph import verify_graph_health, verify_subgraph_health +from .rewrite_graph import verify_graph_health class CompilerOptions: diff --git a/ethosu/vela/data_type.py b/ethosu/vela/data_type.py index 1d3e94ed..6dfe2167 100644 --- a/ethosu/vela/data_type.py +++ b/ethosu/vela/data_type.py @@ -18,9 +18,10 @@ # Description: # Defines the basic numeric type classes for tensors. -from .numeric_util import round_up_divide import enum +from .numeric_util import round_up_divide + class BaseType(enum.Flag): Signed = 1 diff --git a/ethosu/vela/driver_actions.py b/ethosu/vela/driver_actions.py index 86c4a369..bd15af20 100644 --- a/ethosu/vela/driver_actions.py +++ b/ethosu/vela/driver_actions.py @@ -18,9 +18,11 @@ # Description: # Creates driver actions that are embedded in the custom operator payload. -import numpy as np from typing import List -from .ethos_u55_regs.ethos_u55_regs import * + +import numpy as np + +from .ethos_u55_regs.ethos_u55_regs import config_r, id_r, ARCH_VER class DACommands: @@ -43,8 +45,8 @@ def make_da_tag(id: int, reserved: int, param: int) -> int: def emit_fourcc(data: List[int], fourcc: str): - assert data != None - assert fourcc != None + assert data is not None + assert fourcc is not None assert len(fourcc) == 4 value: int = 0 value = fourcc[0].encode()[0] @@ -75,14 +77,14 @@ def build_config_word(arch): def emit_config(data: List[int], rel: int, patch: int, arch): - assert data != None + assert data is not None data.append(make_da_tag(DACommands.Config, 0, (patch << DACommands.Config_PatchShift) | rel)) data.append(build_config_word(arch)) data.append(build_id_word()) def emit_cmd_stream_header(data: List[int], length: int): - assert data != None + assert data is not None # Insert NOPs to align start of command stream to 16 bytes num_nops = 4 - ((len(data) + 1) % 4) for _ in range(num_nops): @@ -95,7 +97,7 @@ def emit_cmd_stream_header(data: List[int], length: int): def emit_reg_read(data: List[int], reg_index: int, reg_count: int = 1): - assert data != None + assert data is not None assert reg_index >= 0 assert reg_count >= 1 payload: int = (reg_index & DACommands.ReadAPB_IndexMask) | ((reg_count << DACommands.ReadAPB_CountShift) - 1) @@ -103,5 +105,5 @@ def emit_reg_read(data: List[int], reg_index: int, reg_count: int = 1): def emit_dump_shram(data: List[int]): - assert data != None + assert data is not None data.append(make_da_tag(DACommands.DumpSHRAM, 0, 0)) diff --git a/ethosu/vela/extract_npu_subgraphs.py b/ethosu/vela/extract_npu_subgraphs.py index 5b9ba8b0..ab3db21f 100644 --- a/ethosu/vela/extract_npu_subgraphs.py +++ b/ethosu/vela/extract_npu_subgraphs.py @@ -23,10 +23,11 @@ # by NpuOp operations. Later, Vela generates command streams and compressed weight streams for the NPU subgraphs and # attaches them to the NpuOp. This encapsulates everything the NPU subgraph is supposed to do. 
-from .nn_graph import Pass, PassPlacement, NpuBlockType, Subgraph -from .operation import Operation import numpy as np +from .nn_graph import Pass, PassPlacement, Subgraph +from .operation import Operation, NpuBlockType + def make_npu_call_op_pass(npu_subgraph): op = Operation("NpuOp", "call_" + npu_subgraph.name) diff --git a/ethosu/vela/graph_optimiser.py b/ethosu/vela/graph_optimiser.py index a4ed39ff..b29a3823 100644 --- a/ethosu/vela/graph_optimiser.py +++ b/ethosu/vela/graph_optimiser.py @@ -19,12 +19,15 @@ # Early optimisation of the network graph, using the rewrite_graph module to do the traversal of the graph. These are # split into two parts optimise_graph_a and optimise_graph_b. -from .nn_graph import Operation, NpuBlockType, Tensor -from . import rewrite_graph -from .data_type import BaseType, DataType -import numpy as np import math -from .numeric_util import round_up_divide + +import numpy as np + +from . import rewrite_graph +from .operation import Operation, NpuBlockType +from .tensor import Tensor +from .data_type import DataType + passthrough_nodes = set(("Identity",)) @@ -83,7 +86,7 @@ def rewrite_split(tens, arch): # For Split the offset cannot be extracted from the tensor so it has to # be calculated from the index of the output tensor - if axis != None: + if axis is not None: # Get the start and end of the split offset_start = [0] * len(tens.shape) offset_end = [0] * len(tens.shape) @@ -316,6 +319,7 @@ elementwise_op = set(("AddAct", "MulAct", "SubAct", "Maximum", "Minimum", "Leaky activation_ops = set(("Relu", "Relu6", "ReluN1To1", "Sigmoid", "Tanh")) memory_only_ops = set(("Reshape",)) + # Check if the op can be reordered def get_prepend_op(op): inp = op.inputs[0] @@ -326,7 +330,7 @@ def get_prepend_op(op): prep_op = prev_op inp = prev_op.inputs[0] prev_op = inp.ops[-1] - if prev_op != None and len(prev_op.outputs) == 1 and len(prev_op.outputs[0].consumers()) == 1: + if prev_op is not None and len(prev_op.outputs) == 1 and len(prev_op.outputs[0].consumers()) == 1: return prep_op return None @@ -384,7 +388,7 @@ def convert_depthwise_to_conv(op, arch): def fixup_act_reorder(op, arch): if op.type in activation_ops: prep_op = get_prepend_op(op) - if prep_op != None: + if prep_op is not None: act_op = op.clone("_reordered") act_op.inputs = [prep_op.inputs[0]] act_op_out = act_op.inputs[0].clone("_acted") @@ -400,7 +404,7 @@ def fixup_act_reorder(op, arch): def convert_mul_max_to_abs_or_lrelu(op, arch): - """Whenever there is a subgraph with this topology: + r"""Whenever there is a subgraph with this topology: Input X For X = -1 or X > 0 | \ / This subgraph can be replaced with either @@ -487,24 +491,25 @@ def optimise_graph_a(nng, arch, verbose_graph=False): for idx, sg in enumerate(nng.subgraphs): # rewrite graph pass nng.subgraphs[idx] = rewrite_graph.rewrite_graph_pre_order( - sg, arch, [fixup_unpack_output,], op_rewrite_list, rewrite_unsupported=False + sg, arch, [fixup_unpack_output], op_rewrite_list, rewrite_unsupported=False ) for idx, sg in enumerate(nng.subgraphs): # remove passthrough tensors - nng.subgraphs[idx] = rewrite_graph.rewrite_graph_pre_order(sg, arch, [remove_passthrough_tensor,], []) + nng.subgraphs[idx] = rewrite_graph.rewrite_graph_pre_order(sg, arch, [remove_passthrough_tensor], []) if verbose_graph: nng.print_graph() return nng + def optimise_graph_b(nng, arch, verbose_graph=False): if verbose_graph: nng.print_graph() for idx, sg in enumerate(nng.subgraphs): # combined rewrite graph pass - nng.subgraphs[idx] = 
rewrite_graph.rewrite_graph_pre_order(sg, arch, [rewrite_concat, rewrite_split,], []) + nng.subgraphs[idx] = rewrite_graph.rewrite_graph_pre_order(sg, arch, [rewrite_concat, rewrite_split], []) if verbose_graph: nng.print_graph() diff --git a/ethosu/vela/high_level_command_stream.py b/ethosu/vela/high_level_command_stream.py index 952e2033..bdb04904 100644 --- a/ethosu/vela/high_level_command_stream.py +++ b/ethosu/vela/high_level_command_stream.py @@ -18,8 +18,10 @@ # Description: # Contains classes that hold commands for the high-level command stream (one command per DMA or NPU stripe). -from enum import Enum, IntEnum +from enum import IntEnum + import numpy as np + from .operation import NpuBlockType from .numeric_util import round_up_divide from .range_set import MemoryAccessSet, AccessDirection @@ -42,12 +44,12 @@ class Box: new_start_coord[concat_axis] -= concat_offset new_end_coord[concat_axis] -= concat_offset - if split_offset != None: + if split_offset is not None: for idx in range(len(split_offset)): new_start_coord[idx] += split_offset[idx] new_end_coord[idx] += split_offset[idx] - if split_offset == None and npu_block_type in set((NpuBlockType.ConvolutionMxN, NpuBlockType.VectorProduct)): + if split_offset is None and npu_block_type in set((NpuBlockType.ConvolutionMxN, NpuBlockType.VectorProduct)): # these types of operations do a "dot product" over the entire IFM new_start_coord[-1] = 0 new_end_coord[-1] = ifm_shape[-1] diff --git a/ethosu/vela/high_level_command_stream_generator.py b/ethosu/vela/high_level_command_stream_generator.py index 364df6f8..47392c0b 100644 --- a/ethosu/vela/high_level_command_stream_generator.py +++ b/ethosu/vela/high_level_command_stream_generator.py @@ -22,9 +22,8 @@ # calc_allowed_ofm_ifm_overlap_for_cascaded_pass(). from .nn_graph import SchedulingStrategy, PassPlacement -import numpy as np from .operation import NpuBlockType -from .high_level_command_stream import Box, CommandType, Command, NpuStripe, DMA +from .high_level_command_stream import Box, NpuStripe, DMA def need_dma(tens): diff --git a/ethosu/vela/insert_dma.py b/ethosu/vela/insert_dma.py index b63c1ea1..33f1a02c 100644 --- a/ethosu/vela/insert_dma.py +++ b/ethosu/vela/insert_dma.py @@ -18,13 +18,14 @@ # Description: # Insert DMA operations into the graph for transfering weights. -from .nn_graph import Operation, MemArea, TensorPurpose, NpuBlockType from . import rewrite_graph +from .tensor import MemArea, TensorPurpose +from .operation import Operation, NpuBlockType def insert_dma_cmd(op, arch): if op.type == "DMA": - return op # Already rewritten + return op # Already rewritten for idx, tens in enumerate(op.inputs): if tens.mem_area in (MemArea.Dram, MemArea.OffChipFlash) and tens.mem_area != arch.fast_storage_mem_area: diff --git a/ethosu/vela/live_range.py b/ethosu/vela/live_range.py index 24f1f64c..54c15ba9 100644 --- a/ethosu/vela/live_range.py +++ b/ethosu/vela/live_range.py @@ -20,7 +20,7 @@ # Can work with either a pass packed subgraph or a scheduled subgraph. 
from .tensor import Tensor, MemArea -from .nn_graph import TensorPurpose, PassPlacement +from .nn_graph import PassPlacement from .high_level_command_stream_generator import calc_allowed_ofm_ifm_overlap_for_cascaded_pass @@ -90,9 +90,9 @@ class LiveRange: if tens.address == 0: tens.address = address # Also need to set the address to the tensor's cpu/npu clones - if tens.cpu_tensor != None: + if tens.cpu_tensor is not None: tens.cpu_tensor.address = address - if tens.npu_tensor != None: + if tens.npu_tensor is not None: tens.npu_tensor.address = address def get_alignment(self): @@ -115,8 +115,8 @@ def merge_memory_op_ranges(sg, lr_graph, tensor_should_be_ignored, target_mem_ar output_tensor = ps.outputs[0] # If the input or output tensor is tied to a Cpu tensor, i.e. a subgraph input # or output, fuse the live-range with the Cpu tensors' live-range instead. - input_tensor = input_tensor.cpu_tensor if input_tensor.cpu_tensor != None else input_tensor - output_tensor = output_tensor.cpu_tensor if output_tensor.cpu_tensor != None else output_tensor + input_tensor = input_tensor.cpu_tensor if input_tensor.cpu_tensor is not None else input_tensor + output_tensor = output_tensor.cpu_tensor if output_tensor.cpu_tensor is not None else output_tensor if not tensor_should_be_ignored(input_tensor, target_mem_area) and not tensor_should_be_ignored( output_tensor, target_mem_area ): @@ -221,7 +221,7 @@ def extract_live_ranges_from_cascaded_passes( ignore_subgraph_input_output_tensors=False, lr_graph=None, ): - if lr_graph == None: + if lr_graph is None: lr_graph = LiveRangeGraph() if sg in lr_graph.processed_subgraphs: diff --git a/ethosu/vela/mark_tensors.py b/ethosu/vela/mark_tensors.py index 9b1824b5..c42a28df 100644 --- a/ethosu/vela/mark_tensors.py +++ b/ethosu/vela/mark_tensors.py @@ -21,7 +21,7 @@ from . import rewrite_graph from . import weight_compressor from .architecture_features import Block -from .nn_graph import TensorPurpose, TensorFormat, PassPlacement +from .tensor import TensorPurpose, TensorFormat from .operation import NpuBlockType @@ -55,6 +55,7 @@ def inputs_from_output(op, idx): print("Warning: Propagating unknown tensor purpose", op) return res + tensor_purposes = [ # ops, input_purpose ( set( @@ -327,7 +328,7 @@ def mark_tensor_format(nng, arch, verbose_tensor_format=False): return NpuBlockType.Default def visit_tens(tens, ps): - if not tens in formats_for_tensor: + if tens not in formats_for_tensor: fmt = init_tens(tens) else: fmt = formats_for_tensor[tens] diff --git a/ethosu/vela/nn_graph.py b/ethosu/vela/nn_graph.py index 8d335bd8..e7820fe6 100644 --- a/ethosu/vela/nn_graph.py +++ b/ethosu/vela/nn_graph.py @@ -24,9 +24,6 @@ # Graph - A full neural network graph with one or more Subgraphs. import enum -from .data_type import BaseType, DataType -from .tensor import MemArea, TensorPurpose, TensorSubPurpose, TensorFormat, Tensor -from .operation import Operation, NpuBlockType class PassPlacement(enum.Enum): diff --git a/ethosu/vela/npu_performance.py b/ethosu/vela/npu_performance.py index 84cc4931..11f1e92b 100644 --- a/ethosu/vela/npu_performance.py +++ b/ethosu/vela/npu_performance.py @@ -23,12 +23,13 @@ # estimate. import enum -from . import numeric_util + import numpy as np -from .tensor import TensorPurpose, MemArea, TensorFormat, shape_num_elements, Tensor, TensorBlockTraversal -from .operation import Operation -from .data_type import DataType, BaseType -from .nn_graph import PassPlacement, NpuBlockType, SchedulerRewrite, Pass + +from . 
import numeric_util +from .tensor import TensorPurpose, MemArea, shape_num_elements, TensorBlockTraversal +from .nn_graph import PassPlacement, SchedulerRewrite +from .operation import NpuBlockType from .architecture_features import Block, Kernel @@ -357,9 +358,7 @@ def performance_metrics_for_pass(arch, ps, block_config=None, rewrite_list=[], f n_kernel_xy, 4 ) # need at least 4, as this is the minimum duty cycle for secondary accumulator writes if weight_tensor is not None: - n_kernel_xy = numeric_util.round_up( - n_kernel_xy, 4 - ) # weights need to be read in blocks of 4 + n_kernel_xy = numeric_util.round_up(n_kernel_xy, 4) # weights need to be read in blocks of 4 num_mac_ops = 0 for n_blocks_for_size, block_size in block_setup: diff --git a/ethosu/vela/npu_serialisation.py b/ethosu/vela/npu_serialisation.py index 4542c25b..29ede842 100644 --- a/ethosu/vela/npu_serialisation.py +++ b/ethosu/vela/npu_serialisation.py @@ -18,13 +18,15 @@ # Description: # Serialises and packs an NPU subgraph into tensors. +import struct + +import numpy as np + +from . import driver_actions from .nn_graph import PassPlacement from .tensor import MemArea, Tensor, TensorPurpose, TensorFormat from .operation import Operation from .data_type import DataType -import numpy as np -from . import driver_actions -import struct def make_memory_tensor(name, mem_area, sz, want_values, arch): @@ -75,7 +77,7 @@ def serialise_npu_subgraph_into_tensors(nng, sg, arch, scratch_tens, flash_tens) nng.total_size[scratch_area] = nng.total_size.get(scratch_area, 0) - scratch_size nng.total_elements[scratch_area] = nng.total_elements.get(scratch_area, 0) - scratch_size - if flash_tens == scratch_tens == None: + if flash_tens == scratch_tens is None: # First Npu subgraph, create scratch and flash tensors sg.scratch_tensor = make_memory_tensor(sg.name + "_scratch", scratch_area, scratch_size, False, arch) sg.scratch_tensor.purpose = TensorPurpose.Scratch @@ -88,7 +90,7 @@ def serialise_npu_subgraph_into_tensors(nng, sg, arch, scratch_tens, flash_tens) for cps in sg.cascaded_passes: for ps in cps.passes: - if ps.placement == PassPlacement.Npu and ps.weight_tensor != None: + if ps.placement == PassPlacement.Npu and ps.weight_tensor is not None: # For DMA ops, ps.weight_tensor is referring to the SRAM weight tensor and therefore the address # is pointing at the destination address of where the weights should be placed in SRAM. # This ensures that the Flash weight tensor is used instead and thus gets the correct address. diff --git a/ethosu/vela/numeric_util.py b/ethosu/vela/numeric_util.py index e5bc88b8..4e61b4c5 100644 --- a/ethosu/vela/numeric_util.py +++ b/ethosu/vela/numeric_util.py @@ -19,6 +19,7 @@ # Numerical utilities for various types of rounding etc. import math + import numpy as np diff --git a/ethosu/vela/pass_packing.py b/ethosu/vela/pass_packing.py index 663520fc..bae81517 100644 --- a/ethosu/vela/pass_packing.py +++ b/ethosu/vela/pass_packing.py @@ -18,10 +18,12 @@ # Description: # Packs a subgraph with Neural Network Operations into Passes. Each Pass has one or more Operations. 
-from .nn_graph import Operation, Pass, PassPlacement, TensorPurpose, NpuBlockType, Tensor -import collections import enum -from .data_type import BaseType, DataType +import collections + +from .nn_graph import Pass, PassPlacement +from .tensor import TensorPurpose +from .operation import Operation, NpuBlockType class PassFlags(enum.Flag): @@ -104,10 +106,7 @@ elem_wise_ops = elem_wise_main_ops | activation_ops | set(("Sigmoid", "Tanh")) quantization_ops = set(("Dequantize", "QuantizeV2", "Max", "Min")) -cpu_ops = ( - set(("Softmax", "QuantizedSoftmax", "LRN", "Shape", "QuantizedPad", "Pad", "AddN")) - | quantization_ops -) +cpu_ops = set(("Softmax", "QuantizedSoftmax", "LRN", "Shape", "QuantizedPad", "Pad", "AddN")) | quantization_ops npu_dma_ops = set(("DMA",)) startup_init_ops = set(("Const", "VariableV2", "Placeholder", "SubgraphInput")) @@ -183,7 +182,7 @@ test_sequence = [ # flags_to_set PassFlags.Npu | PassFlags.Dma, # flags_to_clear - PassFlags.Empty + PassFlags.Empty, ), ( # ops_set @@ -203,7 +202,7 @@ test_sequence = [ # flags_to_set PassFlags.MemoryOnly | PassFlags.Main, # flags_to_clear - PassFlags.Empty + PassFlags.Empty, ), ( # ops_set @@ -213,9 +212,9 @@ test_sequence = [ # flags_to_set PassFlags.Cpu | PassFlags.Main, # flags_to_clear - PassFlags.Empty + PassFlags.Empty, ), - ( # This last one is a fallback for unrecognised operations + ( # This last one is a fallback for unrecognised operations # ops_set None, # incompatible_pack_flags @@ -223,7 +222,7 @@ test_sequence = [ # flags_to_set PassFlags.Cpu | PassFlags.Main, # flags_to_clear - PassFlags.Empty + PassFlags.Empty, ), ] @@ -346,7 +345,7 @@ def pack_into_passes(nng, arch, verbose_packing=False): is_element_wise = True for op in reverse_ops_list: - if not op.type in elem_wise_ops and not op.type in npu_dma_ops: + if op.type not in elem_wise_ops and op.type not in npu_dma_ops: is_element_wise = False break @@ -368,9 +367,9 @@ def pack_into_passes(nng, arch, verbose_packing=False): ops_list = list(reversed(reverse_ops_list)) intermediates = list(reversed(reverse_intermediates)) - if primary_op == None: + if primary_op is None: primary_op = create_primary_op(ops_list) - if primary_op != None: + if primary_op is not None: visit_tensor_refcount[primary_op.inputs[0]] += 1 npu_block_type = primary_op.attrs["npu_block_type"] for input_tens in primary_op.inputs: diff --git a/ethosu/vela/range_set.py b/ethosu/vela/range_set.py index 64de9709..d7623c5a 100644 --- a/ethosu/vela/range_set.py +++ b/ethosu/vela/range_set.py @@ -19,7 +19,6 @@ # Helper classes to track memory accesses for calculating dependencies between Commands. from enum import IntEnum -from collections import defaultdict from functools import lru_cache diff --git a/ethosu/vela/register_command_stream_generator.py b/ethosu/vela/register_command_stream_generator.py index 120cf8b1..460cf016 100644 --- a/ethosu/vela/register_command_stream_generator.py +++ b/ethosu/vela/register_command_stream_generator.py @@ -22,25 +22,19 @@ from collections import defaultdict from enum import Enum, IntEnum + +import numpy as np + +from . 
import scaling from .high_level_command_stream import CommandType -from .ethos_u55_regs.ethos_u55_regs import * -from .tensor import MemArea, TensorBlockTraversal +from .ethos_u55_regs.ethos_u55_regs import cmd0, cmd1, acc_format, elementwise_mode, rounding, activation, ifm_precision +from .tensor import MemArea, TensorBlockTraversal, TensorFormat from .operation import NpuBlockType from .numeric_util import quantise_float32, round_up, round_away_zero, round_up_to_int, clamp_sigmoid, clamp_tanh from .data_type import BaseType, DataType -import numpy as np from .shared_buffer_allocation import SharedBufferAllocation from .architecture_features import SharedBufferArea, SHRAMElements, ArchitectureFeatures -from .nn_graph import TensorFormat, SchedulingStrategy -from .range_set import ( - MemoryAccessSet, - AccessDirection, -) -from .mark_tensors import ( - reshape_operations, -) from .architecture_features import Block, Kernel, Rect -from . import scaling class RegisterMachine: @@ -372,7 +366,6 @@ def generate_register_command_stream(nng, sg, arch, verbose=False): param = relative_dep[CommandType.DMA][0] param = min(param, 0xF) # Clamp to allowable wait amount emit.cmd_wait(cmd0.NPU_OP_DMA_WAIT, param, absolute_dep[CommandType.DMA][0]) - prev_cmd = None # Clear any dependency for cmd in cmd_stream: if cmd.cmdtype == CommandType.DMA: @@ -684,7 +677,7 @@ def generate_register_command_stream(nng, sg, arch, verbose=False): ifm_max = cmd.ifm_tensor.quantization.max # Emit commands for any fused activation function - if faf == None: + if faf is None: emit.cmd0_with_param(cmd0.NPU_SET_ACTIVATION, activation.NONE) # Even if no activation function, values need to be set to override previous values faf_min = ofm_quant_qmin @@ -765,13 +758,13 @@ def generate_register_command_stream(nng, sg, arch, verbose=False): ), ): - if tens == None: + if tens is None: continue - need_zero_point = (faf != None) or (fmf == "ConcatSliceWrite") + need_zero_point = (faf is not None) or (fmf == "ConcatSliceWrite") if ( primary_op.type in set(("AvgPool", "AvgPoolAct")) and not need_zero_point - ) or tens.quantization == None: + ) or tens.quantization is None: # Actual integer operation, just set scale to 1 and zero point to 0 emit.cmd0_with_param(zero_point_op, 0) else: diff --git a/ethosu/vela/scaling.py b/ethosu/vela/scaling.py index ce0259a5..3b749ddd 100644 --- a/ethosu/vela/scaling.py +++ b/ethosu/vela/scaling.py @@ -19,9 +19,10 @@ # Contains various scaling calculations for weights, elementwise operations, pooling etc. import math -from .numeric_util import round_away_zero from enum import IntEnum +from .numeric_util import round_away_zero + class OperandToScale(IntEnum): OPa = 1 diff --git a/ethosu/vela/scheduler.py b/ethosu/vela/scheduler.py index d51b5ac6..fe31a463 100644 --- a/ethosu/vela/scheduler.py +++ b/ethosu/vela/scheduler.py @@ -19,24 +19,17 @@ # The scheduler costs various strategies for scheduling the network in order to select the block configuration. import enum -from .nn_graph import ( - TensorPurpose, - TensorSubPurpose, - TensorFormat, - MemArea, - SchedulingStrategy, - CascadedPass, - PassPlacement, - SchedulerRewrite, - Operation, - NpuBlockType, -) -from . import live_range +import copy + import numpy as np + +from . import live_range from . import npu_performance from . 
import stats_writer +from .tensor import TensorPurpose, TensorSubPurpose, TensorFormat, MemArea +from .operation import NpuBlockType +from .nn_graph import SchedulingStrategy, CascadedPass, PassPlacement, SchedulerRewrite from .npu_performance import make_bandwidth_array, make_macs_array, make_cycles_array, make_metrics_arrays, PassCycles -import time, copy from .high_level_command_stream_generator import calc_allowed_ofm_ifm_overlap_for_pass_list from .shared_buffer_allocation import ( find_block_configs_suitable_for_pass_and_shared_buffer, @@ -279,7 +272,6 @@ class DynamicProgrammingScheduler: if len(candidates) <= 1: return candidates assert remove_equally_good_candidates - start = time.time() pareto_vals = np.zeros((len(candidates), DynamicProgrammingScheduler.num_pareto_metrics)) ids = np.arange(len(candidates), dtype=np.int32) for idx, cand in enumerate(candidates): @@ -713,7 +705,7 @@ class DynamicProgrammingScheduler: def get_block_configs(self, ps): if ps.placement != PassPlacement.Npu: - return [(1, 1, 1, 1)] # default + return [(1, 1, 1, 1)] # default block_configs = find_block_configs_suitable_for_pass_and_shared_buffer(self.arch, ps) @@ -764,9 +756,7 @@ class DynamicProgrammingScheduler: for tens in ps.intermediates: if tens.mem_area == self.mem_area: if tens.purpose == TensorPurpose.Weights: - sram_used += tens.storage_size_for_sub_purpose( - TensorSubPurpose.DoubleBuffer, block_config[3] - ) + sram_used += tens.storage_size_for_sub_purpose(TensorSubPurpose.DoubleBuffer, block_config[3]) rewrite_list.append( ( SchedulerRewrite.ChangeTensorSubPurpose, @@ -884,7 +874,7 @@ class DynamicProgrammingScheduler: % (len(self.sg.passes), len(pass_to_cascaded_pass)) ) for ps in self.sg.passes: - if not ps in pass_to_cascaded_pass: + if ps not in pass_to_cascaded_pass: print("%3d pass missing cascaded pass %s" % (ps.time, ps)) assert len(pass_to_cascaded_pass) == len(self.sg.passes) diff --git a/ethosu/vela/shared_buffer_allocation.py b/ethosu/vela/shared_buffer_allocation.py index b5408d19..29be6d8d 100644 --- a/ethosu/vela/shared_buffer_allocation.py +++ b/ethosu/vela/shared_buffer_allocation.py @@ -19,10 +19,9 @@ # Shared buffer allocation works out how to allocate the Ethos-U55 shared buffer for a given pass. import numpy as np -from .nn_graph import NpuBlockType -from .numeric_util import round_up_divide, round_up + +from .operation import NpuBlockType from .architecture_features import Block, Kernel, SHRAMElements, SharedBufferArea, ArchitectureFeatures -from . import pass_packing class SharedBufferAllocation: diff --git a/ethosu/vela/stats_writer.py b/ethosu/vela/stats_writer.py index c4b4cd9e..3fd29d12 100644 --- a/ethosu/vela/stats_writer.py +++ b/ethosu/vela/stats_writer.py @@ -18,12 +18,15 @@ # Description: # Writes out per-pass and summary performance statistics to CSV files. 
+import csv +import sys + import numpy as np -from .nn_graph import MemArea, TensorPurpose, PassPlacement + +from .tensor import MemArea, TensorPurpose +from .nn_graph import PassPlacement from .npu_performance import PassCycles, MacCount, BandwidthDirection -import csv from .numeric_util import round_up_to_int -import sys def write_summary_metrics_csv(nng, summary_filename, arch): @@ -246,7 +249,7 @@ def print_performance_metrics_for_strat( print(file=f) for mem_area, label in mem_area_labels: - if not mem_area in memory_used: + if mem_area not in memory_used: continue aug_label = label + " used" diff --git a/ethosu/vela/tensor.py b/ethosu/vela/tensor.py index 5d0206cc..5cebf4d0 100644 --- a/ethosu/vela/tensor.py +++ b/ethosu/vela/tensor.py @@ -19,10 +19,11 @@ # Internal representation of a Neural Network Tensor. import enum -from . import numeric_util -import numpy as np -from . import data_type import uuid + +import numpy as np + +from . import numeric_util from .range_set import MemoryRangeSet from .numeric_util import round_up_divide diff --git a/ethosu/vela/tensor_allocation.py b/ethosu/vela/tensor_allocation.py index 94aa6088..255156e6 100644 --- a/ethosu/vela/tensor_allocation.py +++ b/ethosu/vela/tensor_allocation.py @@ -19,13 +19,14 @@ # Wrapping function to do tensor address allocation. That is, assigning addresses to tensors based on what has been # worked out from the allowable overlaps that are calculated by the live range analysis. -from . import live_range -from .tensor import MemArea import math -from . import numeric_util + import numpy as np -from .nn_graph import TensorAllocator, PassPlacement +from . import live_range +from . import numeric_util +from .tensor import MemArea +from .nn_graph import TensorAllocator from .greedy_allocation import allocate_live_ranges as greedy_allocate_live_ranges diff --git a/ethosu/vela/tflite_mapping.py b/ethosu/vela/tflite_mapping.py index e2b90765..e8b40bdb 100644 --- a/ethosu/vela/tflite_mapping.py +++ b/ethosu/vela/tflite_mapping.py @@ -20,19 +20,11 @@ # Contains a mapping from the various TensorFlow Lite enums and options structs, generated by the FlatBuffer code # generator, to Vela's internal format. -import numpy as np import struct -from .data_type import DataType - -from .tflite.TensorType import TensorType -from .tflite.BuiltinOperator import BuiltinOperator -from .tflite.BuiltinOptions import BuiltinOptions - - -from .tflite.Padding import Padding -from .tflite.ActivationFunctionType import ActivationFunctionType +import numpy as np +from .data_type import DataType from .tflite import Conv2DOptions from .tflite import DepthwiseConv2DOptions from .tflite import ConcatEmbeddingsOptions @@ -132,6 +124,11 @@ from .tflite import ScatterNdOptions from .tflite import SegmentSumOptions from .tflite import SelectV2Options from .tflite import WhileOptions +from .tflite.TensorType import TensorType +from .tflite.BuiltinOperator import BuiltinOperator +from .tflite.BuiltinOptions import BuiltinOptions +from .tflite.Padding import Padding +from .tflite.ActivationFunctionType import ActivationFunctionType def inverse_map(map): diff --git a/ethosu/vela/tflite_reader.py b/ethosu/vela/tflite_reader.py index 535847d7..4456d5a0 100644 --- a/ethosu/vela/tflite_reader.py +++ b/ethosu/vela/tflite_reader.py @@ -18,14 +18,15 @@ # Description: # Functions used to read from a TensorFlow Lite format file. 
-from .tflite.Model import Model -from .tflite.BuiltinOperator import BuiltinOperator +import os.path import numpy as np -import os.path -from .nn_graph import Graph, Operation, Subgraph -from .tensor import Tensor, QuantizationParameters +from .tflite.Model import Model +from .tflite.BuiltinOperator import BuiltinOperator +from .nn_graph import Graph, Subgraph +from .operation import Operation +from .tensor import Tensor, QuantizationParameters from .tflite_mapping import builtin_operator_map, datatype_map, datatype_map_numpy, DataType @@ -184,12 +185,7 @@ class TFLiteSubgraph: class TFLiteGraph: def __init__( - self, - filename, - batch_size=1, - feed_dict={}, - output_node_names=[], - initialisation_nodes=[], + self, filename, batch_size=1, feed_dict={}, output_node_names=[], initialisation_nodes=[], ): self.op_times = {} @@ -238,15 +234,9 @@ class TFLiteGraph: def read_tflite( - filename, - batch_size=1, - feed_dict={}, - output_node_names=[], - initialisation_nodes=[], + filename, batch_size=1, feed_dict={}, output_node_names=[], initialisation_nodes=[], ): - tflite_graph = TFLiteGraph( - filename, batch_size, feed_dict, output_node_names, initialisation_nodes - ) + tflite_graph = TFLiteGraph(filename, batch_size, feed_dict, output_node_names, initialisation_nodes) nng = tflite_graph.nng nng.refresh_after_modification() return nng diff --git a/ethosu/vela/tflite_writer.py b/ethosu/vela/tflite_writer.py index f55d1ce5..1f072424 100644 --- a/ethosu/vela/tflite_writer.py +++ b/ethosu/vela/tflite_writer.py @@ -18,7 +18,13 @@ # Description: # Functions used to write to a TensorFlow Lite format file. Supports adding in file identifiers. +import numpy as np import flatbuffers +from flatbuffers.builder import UOffsetTFlags + +# ugh, the python flatbuffer interface is missing a method to add in file identifier. patching it in here: +import flatbuffers.number_types as N +from flatbuffers import encode from .tflite import Tensor from .tflite import QuantizationParameters @@ -28,22 +34,14 @@ from .tflite import OperatorCode from .tflite import Operator from .tflite import Buffer from .tflite import Metadata - -import numpy as np - from .tflite_mapping import datatype_inv_map, builtin_operator_inv_map, custom_prefix, BuiltinOperator from .nn_graph import PassPlacement from .tensor import TensorPurpose, MemArea -from flatbuffers.builder import UOffsetTFlags tflite_version = 3 tflite_file_identifier = "TFL" + str(tflite_version) -import flatbuffers.number_types as N -from flatbuffers import encode - - def FinishWithFileIdentifier(self, rootTable, fid): if fid is None or len(fid) != 4: raise Exception("fid must be 4 chars") @@ -163,8 +161,8 @@ class TFLiteSerialiser: tf_code, opt_serializer = builtin_operator_inv_map[code] except KeyError: print( - "Warning: Writing operation %s, which does not have a direct TensorFlow Lite mapping, as a custom operation" - % (code,) + "Warning: Writing operation %s, which does not have a direct TensorFlow Lite mapping," + "as a custom operation" % (code,) ) tf_code, opt_serializer = builtin_operator_inv_map[custom_prefix] diff --git a/ethosu/vela/vela.py b/ethosu/vela/vela.py index f07aec89..07772e66 100644 --- a/ethosu/vela/vela.py +++ b/ethosu/vela/vela.py @@ -20,11 +20,10 @@ # # Provides command line interface, options parsing, and network loading. Before calling the compiler driver. -import sys -import os.path import os +import os.path +import sys import time -import subprocess import configparser import argparse import ast @@ -37,7 +36,8 @@ from . 
import compiler_driver from . import scheduler from ._version import __version__ from .scheduler import ParetoMetric -from .nn_graph import MemArea, TensorFormat, TensorAllocator, PassPlacement +from .nn_graph import TensorAllocator, PassPlacement +from .tensor import MemArea def process(fname, arch, model_reader_options, compiler_options, scheduler_options): diff --git a/ethosu/vela/weight_compressor.py b/ethosu/vela/weight_compressor.py index 92197248..ee554b5c 100644 --- a/ethosu/vela/weight_compressor.py +++ b/ethosu/vela/weight_compressor.py @@ -18,12 +18,11 @@ # Description: # Compresses and pads the weigths. It also calculates the scales and packs with the biases. -import os -import sys -import enum import math -import numpy as np from collections import namedtuple + +import numpy as np + from .numeric_util import round_up from .scaling import quantise_scale, reduced_quantise_scale from .tensor import TensorPurpose, TensorSubPurpose, TensorFormat, TensorBlockTraversal @@ -44,7 +43,7 @@ def encode(weight_stream): # pad with 0xFF as needed so the length of the weight stream # is a multiple of 16 - + while (len(compressed) % 16) != 0: compressed.append(0xFF) @@ -348,7 +347,7 @@ def update_pass_weight_and_scale_tensors(nng, arch): for sg in nng.subgraphs: for ps in sg.passes: - if ps.weight_tensor != None: + if ps.weight_tensor is not None: npu_usage_of_tensor = find_npu_usage_of_tensor(ps.weight_tensor) if npu_usage_of_tensor == NpuBlockType.ConvolutionDepthWise: ps.weight_tensor.quant_values = np.transpose(ps.weight_tensor.quant_values, (0, 1, 3, 2)) @@ -382,7 +381,7 @@ def update_pass_weight_and_scale_tensors(nng, arch): src_tens.weight_compression_scales = ps.weight_tensor.weight_compression_scales src_tens.weight_compressed_offsets = ps.weight_tensor.weight_compressed_offsets - if ps.scale_tensor != None: + if ps.scale_tensor is not None: rescale_for_faf = False activation_ops = set(("Sigmoid", "Tanh")) if (ps.ops[-1].type in activation_ops) and (ps.npu_block_type != NpuBlockType.ElementWise): -- cgit v1.2.1
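
The import fixes shown throughout this patch apply the PEP8 grouping referenced in the commit message: standard-library imports first, then third-party packages, then local (package-relative) imports, with a blank line between the groups. As an illustration, the import header of ethosu/vela/architecture_features.py after this patch looks as follows; the group comments are added here for clarity and are not part of the file:

    # Standard library imports
    import enum
    from collections import namedtuple
    from configparser import ConfigParser

    # Third-party imports
    import numpy as np

    # Local, package-relative imports
    from .tensor import MemArea, TensorPurpose, TensorFormat
    from .operation import NpuBlockType
    from .numeric_util import round_up, round_up_divide
    from .supported_operators import SupportedOperators

With pre-commit installed, the hooks are typically activated once with `pre-commit install` and can then be run on demand with `pre-commit run` (or `pre-commit run --all-files` to check the whole tree), in line with the README update mentioned in the commit message.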