From ea6111a36e55501bbfb9ea022aaf8305b4d80183 Mon Sep 17 00:00:00 2001 From: Diego Russo Date: Tue, 14 Apr 2020 18:41:58 +0100 Subject: Add pre-commit support for sanity checks Use the pre-commit framework [2] to run black and flake8 before the commit. black and flake8 are managed by the pre-commit framework and can be run manually with the `pre-commit run` command. Fix the code base with the help of black and flake8. Fix import statements according to the PEP8 guidelines [1]. Both tools have the following settings (specified in the pre-commit configuration file): * line length: 120 characters * directories to exclude: ethosu/vela/tflite/ and ethosu/vela/ethos_u55_regs Updated README.md with instructions on how to install pre-commit and how to run the sanity checks. Pipenv files have been updated with the new dependencies for pre-commit. [1]: https://www.python.org/dev/peps/pep-0008/#imports [2]: https://github.com/pre-commit/pre-commit Change-Id: I304d9fffdf019d390ffa396a529c8a7c2437f63d Signed-off-by: Diego Russo --- ethosu/vela/_version.py | 2 +- ethosu/vela/architecture_features.py | 22 ++++++++-------- ethosu/vela/compiler_driver.py | 7 +++-- ethosu/vela/data_type.py | 3 ++- ethosu/vela/driver_actions.py | 18 +++++++------ ethosu/vela/extract_npu_subgraphs.py | 5 ++-- ethosu/vela/graph_optimiser.py | 29 ++++++++++++--------- ethosu/vela/high_level_command_stream.py | 8 +++--- ethosu/vela/high_level_command_stream_generator.py | 3 +-- ethosu/vela/insert_dma.py | 5 ++-- ethosu/vela/live_range.py | 12 ++++----- ethosu/vela/mark_tensors.py | 5 ++-- ethosu/vela/nn_graph.py | 3 --- ethosu/vela/npu_performance.py | 15 +++++------ ethosu/vela/npu_serialisation.py | 12 +++++---- ethosu/vela/numeric_util.py | 1 + ethosu/vela/pass_packing.py | 29 ++++++++++----------- ethosu/vela/range_set.py | 1 - ethosu/vela/register_command_stream_generator.py | 27 ++++++++----------- ethosu/vela/scaling.py | 3 ++- ethosu/vela/scheduler.py | 30 ++++++++-------------- ethosu/vela/shared_buffer_allocation.py | 5 ++-- ethosu/vela/stats_writer.py | 11 +++++--- ethosu/vela/tensor.py | 7 ++--- ethosu/vela/tensor_allocation.py | 9 ++++--- ethosu/vela/tflite_mapping.py | 17 +++++------- ethosu/vela/tflite_reader.py | 28 +++++++------------- ethosu/vela/tflite_writer.py | 18 ++++++------- ethosu/vela/vela.py | 8 +++--- ethosu/vela/weight_compressor.py | 13 +++++----- 30 files changed, 169 insertions(+), 187 deletions(-) (limited to 'ethosu/vela') diff --git a/ethosu/vela/_version.py b/ethosu/vela/_version.py index f3888c31..b670819d 100644 --- a/ethosu/vela/_version.py +++ b/ethosu/vela/_version.py @@ -16,4 +16,4 @@ import pkg_resources -__version__ = pkg_resources.get_distribution("ethos-u-vela").version \ No newline at end of file +__version__ = pkg_resources.get_distribution("ethos-u-vela").version diff --git a/ethosu/vela/architecture_features.py b/ethosu/vela/architecture_features.py index 51c632e0..69f95fa2 100644 --- a/ethosu/vela/architecture_features.py +++ b/ethosu/vela/architecture_features.py @@ -18,13 +18,17 @@ # Description: # Holds a container for Ethos-U55/System architecture parameters. 
-from .nn_graph import MemArea, TensorPurpose, NpuBlockType, TensorFormat -from .numeric_util import round_up, round_up_divide +import enum from collections import namedtuple from configparser import ConfigParser -from .supported_operators import SupportedOperators + import numpy as np -import enum + +from .tensor import MemArea, TensorPurpose, TensorFormat +from .operation import NpuBlockType +from .numeric_util import round_up, round_up_divide +from .supported_operators import SupportedOperators + PointXY = namedtuple("PointXY", "x y") PointXYZ = namedtuple("PointXYZ", "x y z") @@ -151,7 +155,7 @@ Note the difference between ArchitectureFeatures and CompilerOptions accelerator_config = accelerator_config.lower() self.vela_config = vela_config self.accelerator_config = accelerator_config - if not self.accelerator_config in ArchitectureFeatures.accelerator_configs: + if self.accelerator_config not in ArchitectureFeatures.accelerator_configs: raise Exception("Unknown accelerator configuration " + self.accelerator_config) accel_config = ArchitectureFeatures.accelerator_configs[self.accelerator_config] self.config = accel_config @@ -450,7 +454,6 @@ Note the difference between ArchitectureFeatures and CompilerOptions ) # Calculate how many IFM blocks this OFM block requires (i.e how many jobs) - ifm_block = self.get_ifm_block_size(ifm_block_depth, ofm_block, kernel, self.ofm_block_max) ifm_depth_blocks = round_up_divide(ifm.size().depth, ifm_block_depth) ifm_depth_blocks = 1 # Overwrite with 1 to force OFM block dependency, not IFM @@ -476,7 +479,6 @@ Note the difference between ArchitectureFeatures and CompilerOptions # Iterate over the next BLOCKDEP inputs, checking to see if a sliding window # of IFM area overlaps with any previous OFM block generation. elapsed_jobs = 0 - ifm_depth = ifm.size().depth for forward_offset in range(ArchitectureFeatures.MAX_BLOCKDEP): # This is the IFM block we want to sample from in_area = self.get_first_job_input_volume( @@ -533,7 +535,7 @@ Note the difference between ArchitectureFeatures and CompilerOptions n_elements = op.inputs[0].elements() cycles = intercept + n_elements * slope return cycles - except: + except Exception: print("Error: Reading CPU cycle estimate in vela configuration file, section {}".format(section)) raise @@ -554,7 +556,7 @@ Note the difference between ArchitectureFeatures and CompilerOptions print("Warning: Using default values for system configuration") else: section_key = "SysConfig." + self.system_config - if not section_key in self.vela_config: + if section_key not in self.vela_config: raise Exception("Unknown system configuration " + self.system_config) try: @@ -585,7 +587,7 @@ Note the difference between ArchitectureFeatures and CompilerOptions + " (must be 'OnChipFlash' or 'OffChipFlash'). To store the weights and other constant data in SRAM" " select 'OnChipFlash'" ) - except: + except Exception: print("Error: Reading System Configuration in vela configuration file, section {}".format(section_key)) raise diff --git a/ethosu/vela/compiler_driver.py b/ethosu/vela/compiler_driver.py index db669ac7..6fc3b653 100644 --- a/ethosu/vela/compiler_driver.py +++ b/ethosu/vela/compiler_driver.py @@ -18,6 +18,8 @@ # Description: # Contains the main sequencing of the compiler. +import time + from . import graph_optimiser from . import mark_tensors from . import insert_dma @@ -25,9 +27,6 @@ from . import pass_packing from . import scheduler from . import tensor_allocation from . import npu_performance -import time - -from . 
import high_level_command_stream from . import high_level_command_stream_generator from . import register_command_stream_generator from . import extract_npu_subgraphs @@ -36,7 +35,7 @@ from . import weight_compressor from . import live_range from .tensor import MemArea from .nn_graph import TensorAllocator, PassPlacement -from .rewrite_graph import verify_graph_health, verify_subgraph_health +from .rewrite_graph import verify_graph_health class CompilerOptions: diff --git a/ethosu/vela/data_type.py b/ethosu/vela/data_type.py index 1d3e94ed..6dfe2167 100644 --- a/ethosu/vela/data_type.py +++ b/ethosu/vela/data_type.py @@ -18,9 +18,10 @@ # Description: # Defines the basic numeric type classes for tensors. -from .numeric_util import round_up_divide import enum +from .numeric_util import round_up_divide + class BaseType(enum.Flag): Signed = 1 diff --git a/ethosu/vela/driver_actions.py b/ethosu/vela/driver_actions.py index 86c4a369..bd15af20 100644 --- a/ethosu/vela/driver_actions.py +++ b/ethosu/vela/driver_actions.py @@ -18,9 +18,11 @@ # Description: # Creates driver actions that are embedded in the custom operator payload. -import numpy as np from typing import List -from .ethos_u55_regs.ethos_u55_regs import * + +import numpy as np + +from .ethos_u55_regs.ethos_u55_regs import config_r, id_r, ARCH_VER class DACommands: @@ -43,8 +45,8 @@ def make_da_tag(id: int, reserved: int, param: int) -> int: def emit_fourcc(data: List[int], fourcc: str): - assert data != None - assert fourcc != None + assert data is not None + assert fourcc is not None assert len(fourcc) == 4 value: int = 0 value = fourcc[0].encode()[0] @@ -75,14 +77,14 @@ def build_config_word(arch): def emit_config(data: List[int], rel: int, patch: int, arch): - assert data != None + assert data is not None data.append(make_da_tag(DACommands.Config, 0, (patch << DACommands.Config_PatchShift) | rel)) data.append(build_config_word(arch)) data.append(build_id_word()) def emit_cmd_stream_header(data: List[int], length: int): - assert data != None + assert data is not None # Insert NOPs to align start of command stream to 16 bytes num_nops = 4 - ((len(data) + 1) % 4) for _ in range(num_nops): @@ -95,7 +97,7 @@ def emit_cmd_stream_header(data: List[int], length: int): def emit_reg_read(data: List[int], reg_index: int, reg_count: int = 1): - assert data != None + assert data is not None assert reg_index >= 0 assert reg_count >= 1 payload: int = (reg_index & DACommands.ReadAPB_IndexMask) | ((reg_count << DACommands.ReadAPB_CountShift) - 1) @@ -103,5 +105,5 @@ def emit_reg_read(data: List[int], reg_index: int, reg_count: int = 1): def emit_dump_shram(data: List[int]): - assert data != None + assert data is not None data.append(make_da_tag(DACommands.DumpSHRAM, 0, 0)) diff --git a/ethosu/vela/extract_npu_subgraphs.py b/ethosu/vela/extract_npu_subgraphs.py index 5b9ba8b0..ab3db21f 100644 --- a/ethosu/vela/extract_npu_subgraphs.py +++ b/ethosu/vela/extract_npu_subgraphs.py @@ -23,10 +23,11 @@ # by NpuOp operations. Later, Vela generates command streams and compressed weight streams for the NPU subgraphs and # attaches them to the NpuOp. This encapsulates everything the NPU subgraph is supposed to do. 
-from .nn_graph import Pass, PassPlacement, NpuBlockType, Subgraph -from .operation import Operation import numpy as np +from .nn_graph import Pass, PassPlacement, Subgraph +from .operation import Operation, NpuBlockType + def make_npu_call_op_pass(npu_subgraph): op = Operation("NpuOp", "call_" + npu_subgraph.name) diff --git a/ethosu/vela/graph_optimiser.py b/ethosu/vela/graph_optimiser.py index a4ed39ff..b29a3823 100644 --- a/ethosu/vela/graph_optimiser.py +++ b/ethosu/vela/graph_optimiser.py @@ -19,12 +19,15 @@ # Early optimisation of the network graph, using the rewrite_graph module to do the traversal of the graph. These are # split into two parts optimise_graph_a and optimise_graph_b. -from .nn_graph import Operation, NpuBlockType, Tensor -from . import rewrite_graph -from .data_type import BaseType, DataType -import numpy as np import math -from .numeric_util import round_up_divide + +import numpy as np + +from . import rewrite_graph +from .operation import Operation, NpuBlockType +from .tensor import Tensor +from .data_type import DataType + passthrough_nodes = set(("Identity",)) @@ -83,7 +86,7 @@ def rewrite_split(tens, arch): # For Split the offset cannot be extracted from the tensor so it has to # be calculated from the index of the output tensor - if axis != None: + if axis is not None: # Get the start and end of the split offset_start = [0] * len(tens.shape) offset_end = [0] * len(tens.shape) @@ -316,6 +319,7 @@ elementwise_op = set(("AddAct", "MulAct", "SubAct", "Maximum", "Minimum", "Leaky activation_ops = set(("Relu", "Relu6", "ReluN1To1", "Sigmoid", "Tanh")) memory_only_ops = set(("Reshape",)) + # Check if the op can be reordered def get_prepend_op(op): inp = op.inputs[0] @@ -326,7 +330,7 @@ def get_prepend_op(op): prep_op = prev_op inp = prev_op.inputs[0] prev_op = inp.ops[-1] - if prev_op != None and len(prev_op.outputs) == 1 and len(prev_op.outputs[0].consumers()) == 1: + if prev_op is not None and len(prev_op.outputs) == 1 and len(prev_op.outputs[0].consumers()) == 1: return prep_op return None @@ -384,7 +388,7 @@ def convert_depthwise_to_conv(op, arch): def fixup_act_reorder(op, arch): if op.type in activation_ops: prep_op = get_prepend_op(op) - if prep_op != None: + if prep_op is not None: act_op = op.clone("_reordered") act_op.inputs = [prep_op.inputs[0]] act_op_out = act_op.inputs[0].clone("_acted") @@ -400,7 +404,7 @@ def fixup_act_reorder(op, arch): def convert_mul_max_to_abs_or_lrelu(op, arch): - """Whenever there is a subgraph with this topology: + r"""Whenever there is a subgraph with this topology: Input X For X = -1 or X > 0 | \ / This subgraph can be replaced with either @@ -487,24 +491,25 @@ def optimise_graph_a(nng, arch, verbose_graph=False): for idx, sg in enumerate(nng.subgraphs): # rewrite graph pass nng.subgraphs[idx] = rewrite_graph.rewrite_graph_pre_order( - sg, arch, [fixup_unpack_output,], op_rewrite_list, rewrite_unsupported=False + sg, arch, [fixup_unpack_output], op_rewrite_list, rewrite_unsupported=False ) for idx, sg in enumerate(nng.subgraphs): # remove passthrough tensors - nng.subgraphs[idx] = rewrite_graph.rewrite_graph_pre_order(sg, arch, [remove_passthrough_tensor,], []) + nng.subgraphs[idx] = rewrite_graph.rewrite_graph_pre_order(sg, arch, [remove_passthrough_tensor], []) if verbose_graph: nng.print_graph() return nng + def optimise_graph_b(nng, arch, verbose_graph=False): if verbose_graph: nng.print_graph() for idx, sg in enumerate(nng.subgraphs): # combined rewrite graph pass - nng.subgraphs[idx] = 
rewrite_graph.rewrite_graph_pre_order(sg, arch, [rewrite_concat, rewrite_split,], []) + nng.subgraphs[idx] = rewrite_graph.rewrite_graph_pre_order(sg, arch, [rewrite_concat, rewrite_split], []) if verbose_graph: nng.print_graph() diff --git a/ethosu/vela/high_level_command_stream.py b/ethosu/vela/high_level_command_stream.py index 952e2033..bdb04904 100644 --- a/ethosu/vela/high_level_command_stream.py +++ b/ethosu/vela/high_level_command_stream.py @@ -18,8 +18,10 @@ # Description: # Contains classes that hold commands for the high-level command stream (one command per DMA or NPU stripe). -from enum import Enum, IntEnum +from enum import IntEnum + import numpy as np + from .operation import NpuBlockType from .numeric_util import round_up_divide from .range_set import MemoryAccessSet, AccessDirection @@ -42,12 +44,12 @@ class Box: new_start_coord[concat_axis] -= concat_offset new_end_coord[concat_axis] -= concat_offset - if split_offset != None: + if split_offset is not None: for idx in range(len(split_offset)): new_start_coord[idx] += split_offset[idx] new_end_coord[idx] += split_offset[idx] - if split_offset == None and npu_block_type in set((NpuBlockType.ConvolutionMxN, NpuBlockType.VectorProduct)): + if split_offset is None and npu_block_type in set((NpuBlockType.ConvolutionMxN, NpuBlockType.VectorProduct)): # these types of operations do a "dot product" over the entire IFM new_start_coord[-1] = 0 new_end_coord[-1] = ifm_shape[-1] diff --git a/ethosu/vela/high_level_command_stream_generator.py b/ethosu/vela/high_level_command_stream_generator.py index 364df6f8..47392c0b 100644 --- a/ethosu/vela/high_level_command_stream_generator.py +++ b/ethosu/vela/high_level_command_stream_generator.py @@ -22,9 +22,8 @@ # calc_allowed_ofm_ifm_overlap_for_cascaded_pass(). from .nn_graph import SchedulingStrategy, PassPlacement -import numpy as np from .operation import NpuBlockType -from .high_level_command_stream import Box, CommandType, Command, NpuStripe, DMA +from .high_level_command_stream import Box, NpuStripe, DMA def need_dma(tens): diff --git a/ethosu/vela/insert_dma.py b/ethosu/vela/insert_dma.py index b63c1ea1..33f1a02c 100644 --- a/ethosu/vela/insert_dma.py +++ b/ethosu/vela/insert_dma.py @@ -18,13 +18,14 @@ # Description: # Insert DMA operations into the graph for transfering weights. -from .nn_graph import Operation, MemArea, TensorPurpose, NpuBlockType from . import rewrite_graph +from .tensor import MemArea, TensorPurpose +from .operation import Operation, NpuBlockType def insert_dma_cmd(op, arch): if op.type == "DMA": - return op # Already rewritten + return op # Already rewritten for idx, tens in enumerate(op.inputs): if tens.mem_area in (MemArea.Dram, MemArea.OffChipFlash) and tens.mem_area != arch.fast_storage_mem_area: diff --git a/ethosu/vela/live_range.py b/ethosu/vela/live_range.py index 24f1f64c..54c15ba9 100644 --- a/ethosu/vela/live_range.py +++ b/ethosu/vela/live_range.py @@ -20,7 +20,7 @@ # Can work with either a pass packed subgraph or a scheduled subgraph. 
from .tensor import Tensor, MemArea -from .nn_graph import TensorPurpose, PassPlacement +from .nn_graph import PassPlacement from .high_level_command_stream_generator import calc_allowed_ofm_ifm_overlap_for_cascaded_pass @@ -90,9 +90,9 @@ class LiveRange: if tens.address == 0: tens.address = address # Also need to set the address to the tensor's cpu/npu clones - if tens.cpu_tensor != None: + if tens.cpu_tensor is not None: tens.cpu_tensor.address = address - if tens.npu_tensor != None: + if tens.npu_tensor is not None: tens.npu_tensor.address = address def get_alignment(self): @@ -115,8 +115,8 @@ def merge_memory_op_ranges(sg, lr_graph, tensor_should_be_ignored, target_mem_ar output_tensor = ps.outputs[0] # If the input or output tensor is tied to a Cpu tensor, i.e. a subgraph input # or output, fuse the live-range with the Cpu tensors' live-range instead. - input_tensor = input_tensor.cpu_tensor if input_tensor.cpu_tensor != None else input_tensor - output_tensor = output_tensor.cpu_tensor if output_tensor.cpu_tensor != None else output_tensor + input_tensor = input_tensor.cpu_tensor if input_tensor.cpu_tensor is not None else input_tensor + output_tensor = output_tensor.cpu_tensor if output_tensor.cpu_tensor is not None else output_tensor if not tensor_should_be_ignored(input_tensor, target_mem_area) and not tensor_should_be_ignored( output_tensor, target_mem_area ): @@ -221,7 +221,7 @@ def extract_live_ranges_from_cascaded_passes( ignore_subgraph_input_output_tensors=False, lr_graph=None, ): - if lr_graph == None: + if lr_graph is None: lr_graph = LiveRangeGraph() if sg in lr_graph.processed_subgraphs: diff --git a/ethosu/vela/mark_tensors.py b/ethosu/vela/mark_tensors.py index 9b1824b5..c42a28df 100644 --- a/ethosu/vela/mark_tensors.py +++ b/ethosu/vela/mark_tensors.py @@ -21,7 +21,7 @@ from . import rewrite_graph from . import weight_compressor from .architecture_features import Block -from .nn_graph import TensorPurpose, TensorFormat, PassPlacement +from .tensor import TensorPurpose, TensorFormat from .operation import NpuBlockType @@ -55,6 +55,7 @@ def inputs_from_output(op, idx): print("Warning: Propagating unknown tensor purpose", op) return res + tensor_purposes = [ # ops, input_purpose ( set( @@ -327,7 +328,7 @@ def mark_tensor_format(nng, arch, verbose_tensor_format=False): return NpuBlockType.Default def visit_tens(tens, ps): - if not tens in formats_for_tensor: + if tens not in formats_for_tensor: fmt = init_tens(tens) else: fmt = formats_for_tensor[tens] diff --git a/ethosu/vela/nn_graph.py b/ethosu/vela/nn_graph.py index 8d335bd8..e7820fe6 100644 --- a/ethosu/vela/nn_graph.py +++ b/ethosu/vela/nn_graph.py @@ -24,9 +24,6 @@ # Graph - A full neural network graph with one or more Subgraphs. import enum -from .data_type import BaseType, DataType -from .tensor import MemArea, TensorPurpose, TensorSubPurpose, TensorFormat, Tensor -from .operation import Operation, NpuBlockType class PassPlacement(enum.Enum): diff --git a/ethosu/vela/npu_performance.py b/ethosu/vela/npu_performance.py index 84cc4931..11f1e92b 100644 --- a/ethosu/vela/npu_performance.py +++ b/ethosu/vela/npu_performance.py @@ -23,12 +23,13 @@ # estimate. import enum -from . import numeric_util + import numpy as np -from .tensor import TensorPurpose, MemArea, TensorFormat, shape_num_elements, Tensor, TensorBlockTraversal -from .operation import Operation -from .data_type import DataType, BaseType -from .nn_graph import PassPlacement, NpuBlockType, SchedulerRewrite, Pass + +from . 
import numeric_util +from .tensor import TensorPurpose, MemArea, shape_num_elements, TensorBlockTraversal +from .nn_graph import PassPlacement, SchedulerRewrite +from .operation import NpuBlockType from .architecture_features import Block, Kernel @@ -357,9 +358,7 @@ def performance_metrics_for_pass(arch, ps, block_config=None, rewrite_list=[], f n_kernel_xy, 4 ) # need at least 4, as this is the minimum duty cycle for secondary accumulator writes if weight_tensor is not None: - n_kernel_xy = numeric_util.round_up( - n_kernel_xy, 4 - ) # weights need to be read in blocks of 4 + n_kernel_xy = numeric_util.round_up(n_kernel_xy, 4) # weights need to be read in blocks of 4 num_mac_ops = 0 for n_blocks_for_size, block_size in block_setup: diff --git a/ethosu/vela/npu_serialisation.py b/ethosu/vela/npu_serialisation.py index 4542c25b..29ede842 100644 --- a/ethosu/vela/npu_serialisation.py +++ b/ethosu/vela/npu_serialisation.py @@ -18,13 +18,15 @@ # Description: # Serialises and packs an NPU subgraph into tensors. +import struct + +import numpy as np + +from . import driver_actions from .nn_graph import PassPlacement from .tensor import MemArea, Tensor, TensorPurpose, TensorFormat from .operation import Operation from .data_type import DataType -import numpy as np -from . import driver_actions -import struct def make_memory_tensor(name, mem_area, sz, want_values, arch): @@ -75,7 +77,7 @@ def serialise_npu_subgraph_into_tensors(nng, sg, arch, scratch_tens, flash_tens) nng.total_size[scratch_area] = nng.total_size.get(scratch_area, 0) - scratch_size nng.total_elements[scratch_area] = nng.total_elements.get(scratch_area, 0) - scratch_size - if flash_tens == scratch_tens == None: + if flash_tens == scratch_tens is None: # First Npu subgraph, create scratch and flash tensors sg.scratch_tensor = make_memory_tensor(sg.name + "_scratch", scratch_area, scratch_size, False, arch) sg.scratch_tensor.purpose = TensorPurpose.Scratch @@ -88,7 +90,7 @@ def serialise_npu_subgraph_into_tensors(nng, sg, arch, scratch_tens, flash_tens) for cps in sg.cascaded_passes: for ps in cps.passes: - if ps.placement == PassPlacement.Npu and ps.weight_tensor != None: + if ps.placement == PassPlacement.Npu and ps.weight_tensor is not None: # For DMA ops, ps.weight_tensor is referring to the SRAM weight tensor and therefore the address # is pointing at the destination address of where the weights should be placed in SRAM. # This ensures that the Flash weight tensor is used instead and thus gets the correct address. diff --git a/ethosu/vela/numeric_util.py b/ethosu/vela/numeric_util.py index e5bc88b8..4e61b4c5 100644 --- a/ethosu/vela/numeric_util.py +++ b/ethosu/vela/numeric_util.py @@ -19,6 +19,7 @@ # Numerical utilities for various types of rounding etc. import math + import numpy as np diff --git a/ethosu/vela/pass_packing.py b/ethosu/vela/pass_packing.py index 663520fc..bae81517 100644 --- a/ethosu/vela/pass_packing.py +++ b/ethosu/vela/pass_packing.py @@ -18,10 +18,12 @@ # Description: # Packs a subgraph with Neural Network Operations into Passes. Each Pass has one or more Operations. 
-from .nn_graph import Operation, Pass, PassPlacement, TensorPurpose, NpuBlockType, Tensor -import collections import enum -from .data_type import BaseType, DataType +import collections + +from .nn_graph import Pass, PassPlacement +from .tensor import TensorPurpose +from .operation import Operation, NpuBlockType class PassFlags(enum.Flag): @@ -104,10 +106,7 @@ elem_wise_ops = elem_wise_main_ops | activation_ops | set(("Sigmoid", "Tanh")) quantization_ops = set(("Dequantize", "QuantizeV2", "Max", "Min")) -cpu_ops = ( - set(("Softmax", "QuantizedSoftmax", "LRN", "Shape", "QuantizedPad", "Pad", "AddN")) - | quantization_ops -) +cpu_ops = set(("Softmax", "QuantizedSoftmax", "LRN", "Shape", "QuantizedPad", "Pad", "AddN")) | quantization_ops npu_dma_ops = set(("DMA",)) startup_init_ops = set(("Const", "VariableV2", "Placeholder", "SubgraphInput")) @@ -183,7 +182,7 @@ test_sequence = [ # flags_to_set PassFlags.Npu | PassFlags.Dma, # flags_to_clear - PassFlags.Empty + PassFlags.Empty, ), ( # ops_set @@ -203,7 +202,7 @@ test_sequence = [ # flags_to_set PassFlags.MemoryOnly | PassFlags.Main, # flags_to_clear - PassFlags.Empty + PassFlags.Empty, ), ( # ops_set @@ -213,9 +212,9 @@ test_sequence = [ # flags_to_set PassFlags.Cpu | PassFlags.Main, # flags_to_clear - PassFlags.Empty + PassFlags.Empty, ), - ( # This last one is a fallback for unrecognised operations + ( # This last one is a fallback for unrecognised operations # ops_set None, # incompatible_pack_flags @@ -223,7 +222,7 @@ test_sequence = [ # flags_to_set PassFlags.Cpu | PassFlags.Main, # flags_to_clear - PassFlags.Empty + PassFlags.Empty, ), ] @@ -346,7 +345,7 @@ def pack_into_passes(nng, arch, verbose_packing=False): is_element_wise = True for op in reverse_ops_list: - if not op.type in elem_wise_ops and not op.type in npu_dma_ops: + if op.type not in elem_wise_ops and op.type not in npu_dma_ops: is_element_wise = False break @@ -368,9 +367,9 @@ def pack_into_passes(nng, arch, verbose_packing=False): ops_list = list(reversed(reverse_ops_list)) intermediates = list(reversed(reverse_intermediates)) - if primary_op == None: + if primary_op is None: primary_op = create_primary_op(ops_list) - if primary_op != None: + if primary_op is not None: visit_tensor_refcount[primary_op.inputs[0]] += 1 npu_block_type = primary_op.attrs["npu_block_type"] for input_tens in primary_op.inputs: diff --git a/ethosu/vela/range_set.py b/ethosu/vela/range_set.py index 64de9709..d7623c5a 100644 --- a/ethosu/vela/range_set.py +++ b/ethosu/vela/range_set.py @@ -19,7 +19,6 @@ # Helper classes to track memory accesses for calculating dependencies between Commands. from enum import IntEnum -from collections import defaultdict from functools import lru_cache diff --git a/ethosu/vela/register_command_stream_generator.py b/ethosu/vela/register_command_stream_generator.py index 120cf8b1..460cf016 100644 --- a/ethosu/vela/register_command_stream_generator.py +++ b/ethosu/vela/register_command_stream_generator.py @@ -22,25 +22,19 @@ from collections import defaultdict from enum import Enum, IntEnum + +import numpy as np + +from . 
import scaling from .high_level_command_stream import CommandType -from .ethos_u55_regs.ethos_u55_regs import * -from .tensor import MemArea, TensorBlockTraversal +from .ethos_u55_regs.ethos_u55_regs import cmd0, cmd1, acc_format, elementwise_mode, rounding, activation, ifm_precision +from .tensor import MemArea, TensorBlockTraversal, TensorFormat from .operation import NpuBlockType from .numeric_util import quantise_float32, round_up, round_away_zero, round_up_to_int, clamp_sigmoid, clamp_tanh from .data_type import BaseType, DataType -import numpy as np from .shared_buffer_allocation import SharedBufferAllocation from .architecture_features import SharedBufferArea, SHRAMElements, ArchitectureFeatures -from .nn_graph import TensorFormat, SchedulingStrategy -from .range_set import ( - MemoryAccessSet, - AccessDirection, -) -from .mark_tensors import ( - reshape_operations, -) from .architecture_features import Block, Kernel, Rect -from . import scaling class RegisterMachine: @@ -372,7 +366,6 @@ def generate_register_command_stream(nng, sg, arch, verbose=False): param = relative_dep[CommandType.DMA][0] param = min(param, 0xF) # Clamp to allowable wait amount emit.cmd_wait(cmd0.NPU_OP_DMA_WAIT, param, absolute_dep[CommandType.DMA][0]) - prev_cmd = None # Clear any dependency for cmd in cmd_stream: if cmd.cmdtype == CommandType.DMA: @@ -684,7 +677,7 @@ def generate_register_command_stream(nng, sg, arch, verbose=False): ifm_max = cmd.ifm_tensor.quantization.max # Emit commands for any fused activation function - if faf == None: + if faf is None: emit.cmd0_with_param(cmd0.NPU_SET_ACTIVATION, activation.NONE) # Even if no activation function, values need to be set to override previous values faf_min = ofm_quant_qmin @@ -765,13 +758,13 @@ def generate_register_command_stream(nng, sg, arch, verbose=False): ), ): - if tens == None: + if tens is None: continue - need_zero_point = (faf != None) or (fmf == "ConcatSliceWrite") + need_zero_point = (faf is not None) or (fmf == "ConcatSliceWrite") if ( primary_op.type in set(("AvgPool", "AvgPoolAct")) and not need_zero_point - ) or tens.quantization == None: + ) or tens.quantization is None: # Actual integer operation, just set scale to 1 and zero point to 0 emit.cmd0_with_param(zero_point_op, 0) else: diff --git a/ethosu/vela/scaling.py b/ethosu/vela/scaling.py index ce0259a5..3b749ddd 100644 --- a/ethosu/vela/scaling.py +++ b/ethosu/vela/scaling.py @@ -19,9 +19,10 @@ # Contains various scaling calculations for weights, elementwise operations, pooling etc. import math -from .numeric_util import round_away_zero from enum import IntEnum +from .numeric_util import round_away_zero + class OperandToScale(IntEnum): OPa = 1 diff --git a/ethosu/vela/scheduler.py b/ethosu/vela/scheduler.py index d51b5ac6..fe31a463 100644 --- a/ethosu/vela/scheduler.py +++ b/ethosu/vela/scheduler.py @@ -19,24 +19,17 @@ # The scheduler costs various strategies for scheduling the network in order to select the block configuration. import enum -from .nn_graph import ( - TensorPurpose, - TensorSubPurpose, - TensorFormat, - MemArea, - SchedulingStrategy, - CascadedPass, - PassPlacement, - SchedulerRewrite, - Operation, - NpuBlockType, -) -from . import live_range +import copy + import numpy as np + +from . import live_range from . import npu_performance from . 
import stats_writer +from .tensor import TensorPurpose, TensorSubPurpose, TensorFormat, MemArea +from .operation import NpuBlockType +from .nn_graph import SchedulingStrategy, CascadedPass, PassPlacement, SchedulerRewrite from .npu_performance import make_bandwidth_array, make_macs_array, make_cycles_array, make_metrics_arrays, PassCycles -import time, copy from .high_level_command_stream_generator import calc_allowed_ofm_ifm_overlap_for_pass_list from .shared_buffer_allocation import ( find_block_configs_suitable_for_pass_and_shared_buffer, @@ -279,7 +272,6 @@ class DynamicProgrammingScheduler: if len(candidates) <= 1: return candidates assert remove_equally_good_candidates - start = time.time() pareto_vals = np.zeros((len(candidates), DynamicProgrammingScheduler.num_pareto_metrics)) ids = np.arange(len(candidates), dtype=np.int32) for idx, cand in enumerate(candidates): @@ -713,7 +705,7 @@ class DynamicProgrammingScheduler: def get_block_configs(self, ps): if ps.placement != PassPlacement.Npu: - return [(1, 1, 1, 1)] # default + return [(1, 1, 1, 1)] # default block_configs = find_block_configs_suitable_for_pass_and_shared_buffer(self.arch, ps) @@ -764,9 +756,7 @@ class DynamicProgrammingScheduler: for tens in ps.intermediates: if tens.mem_area == self.mem_area: if tens.purpose == TensorPurpose.Weights: - sram_used += tens.storage_size_for_sub_purpose( - TensorSubPurpose.DoubleBuffer, block_config[3] - ) + sram_used += tens.storage_size_for_sub_purpose(TensorSubPurpose.DoubleBuffer, block_config[3]) rewrite_list.append( ( SchedulerRewrite.ChangeTensorSubPurpose, @@ -884,7 +874,7 @@ class DynamicProgrammingScheduler: % (len(self.sg.passes), len(pass_to_cascaded_pass)) ) for ps in self.sg.passes: - if not ps in pass_to_cascaded_pass: + if ps not in pass_to_cascaded_pass: print("%3d pass missing cascaded pass %s" % (ps.time, ps)) assert len(pass_to_cascaded_pass) == len(self.sg.passes) diff --git a/ethosu/vela/shared_buffer_allocation.py b/ethosu/vela/shared_buffer_allocation.py index b5408d19..29be6d8d 100644 --- a/ethosu/vela/shared_buffer_allocation.py +++ b/ethosu/vela/shared_buffer_allocation.py @@ -19,10 +19,9 @@ # Shared buffer allocation works out how to allocate the Ethos-U55 shared buffer for a given pass. import numpy as np -from .nn_graph import NpuBlockType -from .numeric_util import round_up_divide, round_up + +from .operation import NpuBlockType from .architecture_features import Block, Kernel, SHRAMElements, SharedBufferArea, ArchitectureFeatures -from . import pass_packing class SharedBufferAllocation: diff --git a/ethosu/vela/stats_writer.py b/ethosu/vela/stats_writer.py index c4b4cd9e..3fd29d12 100644 --- a/ethosu/vela/stats_writer.py +++ b/ethosu/vela/stats_writer.py @@ -18,12 +18,15 @@ # Description: # Writes out per-pass and summary performance statistics to CSV files. 
+import csv +import sys + import numpy as np -from .nn_graph import MemArea, TensorPurpose, PassPlacement + +from .tensor import MemArea, TensorPurpose +from .nn_graph import PassPlacement from .npu_performance import PassCycles, MacCount, BandwidthDirection -import csv from .numeric_util import round_up_to_int -import sys def write_summary_metrics_csv(nng, summary_filename, arch): @@ -246,7 +249,7 @@ def print_performance_metrics_for_strat( print(file=f) for mem_area, label in mem_area_labels: - if not mem_area in memory_used: + if mem_area not in memory_used: continue aug_label = label + " used" diff --git a/ethosu/vela/tensor.py b/ethosu/vela/tensor.py index 5d0206cc..5cebf4d0 100644 --- a/ethosu/vela/tensor.py +++ b/ethosu/vela/tensor.py @@ -19,10 +19,11 @@ # Internal representation of a Neural Network Tensor. import enum -from . import numeric_util -import numpy as np -from . import data_type import uuid + +import numpy as np + +from . import numeric_util from .range_set import MemoryRangeSet from .numeric_util import round_up_divide diff --git a/ethosu/vela/tensor_allocation.py b/ethosu/vela/tensor_allocation.py index 94aa6088..255156e6 100644 --- a/ethosu/vela/tensor_allocation.py +++ b/ethosu/vela/tensor_allocation.py @@ -19,13 +19,14 @@ # Wrapping function to do tensor address allocation. That is, assigning addresses to tensors based on what has been # worked out from the allowable overlaps that are calculated by the live range analysis. -from . import live_range -from .tensor import MemArea import math -from . import numeric_util + import numpy as np -from .nn_graph import TensorAllocator, PassPlacement +from . import live_range +from . import numeric_util +from .tensor import MemArea +from .nn_graph import TensorAllocator from .greedy_allocation import allocate_live_ranges as greedy_allocate_live_ranges diff --git a/ethosu/vela/tflite_mapping.py b/ethosu/vela/tflite_mapping.py index e2b90765..e8b40bdb 100644 --- a/ethosu/vela/tflite_mapping.py +++ b/ethosu/vela/tflite_mapping.py @@ -20,19 +20,11 @@ # Contains a mapping from the various TensorFlow Lite enums and options structs, generated by the FlatBuffer code # generator, to Vela's internal format. -import numpy as np import struct -from .data_type import DataType - -from .tflite.TensorType import TensorType -from .tflite.BuiltinOperator import BuiltinOperator -from .tflite.BuiltinOptions import BuiltinOptions - - -from .tflite.Padding import Padding -from .tflite.ActivationFunctionType import ActivationFunctionType +import numpy as np +from .data_type import DataType from .tflite import Conv2DOptions from .tflite import DepthwiseConv2DOptions from .tflite import ConcatEmbeddingsOptions @@ -132,6 +124,11 @@ from .tflite import ScatterNdOptions from .tflite import SegmentSumOptions from .tflite import SelectV2Options from .tflite import WhileOptions +from .tflite.TensorType import TensorType +from .tflite.BuiltinOperator import BuiltinOperator +from .tflite.BuiltinOptions import BuiltinOptions +from .tflite.Padding import Padding +from .tflite.ActivationFunctionType import ActivationFunctionType def inverse_map(map): diff --git a/ethosu/vela/tflite_reader.py b/ethosu/vela/tflite_reader.py index 535847d7..4456d5a0 100644 --- a/ethosu/vela/tflite_reader.py +++ b/ethosu/vela/tflite_reader.py @@ -18,14 +18,15 @@ # Description: # Functions used to read from a TensorFlow Lite format file. 
-from .tflite.Model import Model -from .tflite.BuiltinOperator import BuiltinOperator +import os.path import numpy as np -import os.path -from .nn_graph import Graph, Operation, Subgraph -from .tensor import Tensor, QuantizationParameters +from .tflite.Model import Model +from .tflite.BuiltinOperator import BuiltinOperator +from .nn_graph import Graph, Subgraph +from .operation import Operation +from .tensor import Tensor, QuantizationParameters from .tflite_mapping import builtin_operator_map, datatype_map, datatype_map_numpy, DataType @@ -184,12 +185,7 @@ class TFLiteSubgraph: class TFLiteGraph: def __init__( - self, - filename, - batch_size=1, - feed_dict={}, - output_node_names=[], - initialisation_nodes=[], + self, filename, batch_size=1, feed_dict={}, output_node_names=[], initialisation_nodes=[], ): self.op_times = {} @@ -238,15 +234,9 @@ class TFLiteGraph: def read_tflite( - filename, - batch_size=1, - feed_dict={}, - output_node_names=[], - initialisation_nodes=[], + filename, batch_size=1, feed_dict={}, output_node_names=[], initialisation_nodes=[], ): - tflite_graph = TFLiteGraph( - filename, batch_size, feed_dict, output_node_names, initialisation_nodes - ) + tflite_graph = TFLiteGraph(filename, batch_size, feed_dict, output_node_names, initialisation_nodes) nng = tflite_graph.nng nng.refresh_after_modification() return nng diff --git a/ethosu/vela/tflite_writer.py b/ethosu/vela/tflite_writer.py index f55d1ce5..1f072424 100644 --- a/ethosu/vela/tflite_writer.py +++ b/ethosu/vela/tflite_writer.py @@ -18,7 +18,13 @@ # Description: # Functions used to write to a TensorFlow Lite format file. Supports adding in file identifiers. +import numpy as np import flatbuffers +from flatbuffers.builder import UOffsetTFlags + +# ugh, the python flatbuffer interface is missing a method to add in file identifier. patching it in here: +import flatbuffers.number_types as N +from flatbuffers import encode from .tflite import Tensor from .tflite import QuantizationParameters @@ -28,22 +34,14 @@ from .tflite import OperatorCode from .tflite import Operator from .tflite import Buffer from .tflite import Metadata - -import numpy as np - from .tflite_mapping import datatype_inv_map, builtin_operator_inv_map, custom_prefix, BuiltinOperator from .nn_graph import PassPlacement from .tensor import TensorPurpose, MemArea -from flatbuffers.builder import UOffsetTFlags tflite_version = 3 tflite_file_identifier = "TFL" + str(tflite_version) -import flatbuffers.number_types as N -from flatbuffers import encode - - def FinishWithFileIdentifier(self, rootTable, fid): if fid is None or len(fid) != 4: raise Exception("fid must be 4 chars") @@ -163,8 +161,8 @@ class TFLiteSerialiser: tf_code, opt_serializer = builtin_operator_inv_map[code] except KeyError: print( - "Warning: Writing operation %s, which does not have a direct TensorFlow Lite mapping, as a custom operation" - % (code,) + "Warning: Writing operation %s, which does not have a direct TensorFlow Lite mapping," + "as a custom operation" % (code,) ) tf_code, opt_serializer = builtin_operator_inv_map[custom_prefix] diff --git a/ethosu/vela/vela.py b/ethosu/vela/vela.py index f07aec89..07772e66 100644 --- a/ethosu/vela/vela.py +++ b/ethosu/vela/vela.py @@ -20,11 +20,10 @@ # # Provides command line interface, options parsing, and network loading. Before calling the compiler driver. -import sys -import os.path import os +import os.path +import sys import time -import subprocess import configparser import argparse import ast @@ -37,7 +36,8 @@ from . 
import compiler_driver from . import scheduler from ._version import __version__ from .scheduler import ParetoMetric -from .nn_graph import MemArea, TensorFormat, TensorAllocator, PassPlacement +from .nn_graph import TensorAllocator, PassPlacement +from .tensor import MemArea def process(fname, arch, model_reader_options, compiler_options, scheduler_options): diff --git a/ethosu/vela/weight_compressor.py b/ethosu/vela/weight_compressor.py index 92197248..ee554b5c 100644 --- a/ethosu/vela/weight_compressor.py +++ b/ethosu/vela/weight_compressor.py @@ -18,12 +18,11 @@ # Description: # Compresses and pads the weigths. It also calculates the scales and packs with the biases. -import os -import sys -import enum import math -import numpy as np from collections import namedtuple + +import numpy as np + from .numeric_util import round_up from .scaling import quantise_scale, reduced_quantise_scale from .tensor import TensorPurpose, TensorSubPurpose, TensorFormat, TensorBlockTraversal @@ -44,7 +43,7 @@ def encode(weight_stream): # pad with 0xFF as needed so the length of the weight stream # is a multiple of 16 - + while (len(compressed) % 16) != 0: compressed.append(0xFF) @@ -348,7 +347,7 @@ def update_pass_weight_and_scale_tensors(nng, arch): for sg in nng.subgraphs: for ps in sg.passes: - if ps.weight_tensor != None: + if ps.weight_tensor is not None: npu_usage_of_tensor = find_npu_usage_of_tensor(ps.weight_tensor) if npu_usage_of_tensor == NpuBlockType.ConvolutionDepthWise: ps.weight_tensor.quant_values = np.transpose(ps.weight_tensor.quant_values, (0, 1, 3, 2)) @@ -382,7 +381,7 @@ def update_pass_weight_and_scale_tensors(nng, arch): src_tens.weight_compression_scales = ps.weight_tensor.weight_compression_scales src_tens.weight_compressed_offsets = ps.weight_tensor.weight_compressed_offsets - if ps.scale_tensor != None: + if ps.scale_tensor is not None: rescale_for_faf = False activation_ops = set(("Sigmoid", "Tanh")) if (ps.ops[-1].type in activation_ops) and (ps.npu_block_type != NpuBlockType.ElementWise): -- cgit v1.2.1
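
The import fixes shown throughout this patch apply the PEP8 grouping referenced in the commit message: standard-library imports first, then third-party packages, then local (package-relative) imports, with a blank line between the groups. As an illustration, the import header of ethosu/vela/architecture_features.py after this patch looks as follows; the group comments are added here for clarity and are not part of the file:

    # Standard library imports
    import enum
    from collections import namedtuple
    from configparser import ConfigParser

    # Third-party imports
    import numpy as np

    # Local, package-relative imports
    from .tensor import MemArea, TensorPurpose, TensorFormat
    from .operation import NpuBlockType
    from .numeric_util import round_up, round_up_divide
    from .supported_operators import SupportedOperators

With pre-commit installed, the hooks are typically activated once with `pre-commit install` and can then be run on demand with `pre-commit run` (or `pre-commit run --all-files` to check the whole tree), in line with the README update mentioned in the commit message.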