Update version of Black to 22.3.0

Update the version of Black to 22.3.0 because of updated dependencies.
Reformat the code to fix the issues reported by the new version.

Signed-off-by: Jonas Ohlsson <jonas.ohlsson@arm.com>
Change-Id: I60056aae452093ce8dcea1f499ecced22b25eef1
author: Jonas Ohlsson <jonas.ohlsson@arm.com> 2022-03-30 10:30:25 +0200
committer: Jonas Ohlsson <jonas.ohlsson@arm.com> 2022-03-30 15:54:14 +0200
commit: d85750702229af97c0b0bbda6e397a23254b6144 (patch)
tree: 389962105a35d5cef595cfeb5d640bd59a0d0ff8
parent: cc5f4de1c35ba44fca7ff6295c6ae846f8242344 (diff)
download: ethos-u-vela-d85750702229af97c0b0bbda6e397a23254b6144.tar.gz
28 files changed, 598 insertions, 123 deletions
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index ae2bae5..9c70706 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -15,7 +15,7 @@ repos:
     -   id: reorder-python-imports
 
 -   repo: https://github.com/psf/black
-    rev: 19.10b0
+    rev: 22.3.0
     hooks:
     -   id: black
         language_version: python3
diff --git a/ethosu/mlw_codec/test/test_mlw_codec.py b/ethosu/mlw_codec/test/test_mlw_codec.py
index 3ff26e5..d77c82a 100644
--- a/ethosu/mlw_codec/test/test_mlw_codec.py
+++ b/ethosu/mlw_codec/test/test_mlw_codec.py
@@ -24,7 +24,7 @@ from ethosu import mlw_codec
 
 
 class TestMLWCodec:
-    """ This class is responsible to test the mlw_codec library
+    """This class is responsible to test the mlw_codec library
     It mainly tests the two methods encode() and decode() with different inputs"""
 
     weights = [0, 2, 3, 0, -1, -2, -3, 0, 0, 0, 1, -250, 240] * 3
diff --git a/ethosu/vela/api.py b/ethosu/vela/api.py
index f49df25..3382ea9 100644
--- a/ethosu/vela/api.py
+++ b/ethosu/vela/api.py
@@ -139,11 +139,11 @@ class NpuDataType(Enum):
         return self.value[1]
 
     def size_in_bits(self) -> int:
-        """ Size of the data type in bits"""
+        """Size of the data type in bits"""
         return self.value[0]
 
     def size_in_bytes(self) -> int:
-        """ Size of the data type in bytes"""
+        """Size of the data type in bytes"""
         return self.value[0] // 8
 
     def min_value(self) -> int:
diff --git a/ethosu/vela/architecture_features.py b/ethosu/vela/architecture_features.py
index e79ed72..08ff260 100644
--- a/ethosu/vela/architecture_features.py
+++ b/ethosu/vela/architecture_features.py
@@ -647,7 +647,9 @@ class ArchitectureFeatures:
 
         else:
             raise CliOptionError(
-                "--system-config", self.system_config, f"Section {sys_cfg_section} not found in Vela config file",
+                "--system-config",
+                self.system_config,
+                f"Section {sys_cfg_section} not found in Vela config file",
             )
 
         # read the memory mode
@@ -678,7 +680,9 @@ class ArchitectureFeatures:
 
         else:
             raise CliOptionError(
-                "--memory-mode", self.memory_mode, f"Section {mem_mode_section} not found in Vela config file",
+                "--memory-mode",
+                self.memory_mode,
+                f"Section {mem_mode_section} not found in Vela config file",
             )
 
         # override sram to onchipflash
@@ -777,7 +781,8 @@ class ArchitectureFeatures:
             # check for recursion loop
             if inheritance_section == section:
                 raise ConfigOptionError(
-                    "inherit", f"{inheritance_section}. This references its own section and recursion is not allowed",
+                    "inherit",
+                    f"{inheritance_section}. This references its own section and recursion is not allowed",
                 )
             result = self._read_config(inheritance_section, key, result, found)
 
diff --git a/ethosu/vela/compiler_driver.py b/ethosu/vela/compiler_driver.py
index cf26eb3..2715c8f 100644
--- a/ethosu/vela/compiler_driver.py
+++ b/ethosu/vela/compiler_driver.py
@@ -44,10 +44,9 @@ from .tensor import Tensor
 class CompilerOptions:
     """Set of options to change compiler behaviour - verbosity, targets, turning off passes.
 
-Note the difference between ArchitectureFeatures and CompilerOptions
-- ArchitectureFeatures is for changing the Ethos-U and system architecture
-- CompilerOptions is for changing the behaviour of the compiler
-"""
+    Note the difference between ArchitectureFeatures and CompilerOptions
+    - ArchitectureFeatures is for changing the Ethos-U and system architecture
+    - CompilerOptions is for changing the behaviour of the compiler"""
 
     def __init__(
         self,
@@ -194,7 +193,10 @@ def compiler_driver(nng, arch, options, scheduler_options, network_type):
     # Calculate live ranges for all constant Npu tensors, in permanent storage
     for sg in npu_subgraphs:
         lr_graph_flash = live_range.create_linear_live_range_graph(
-            sg, permanent_storage, MemType.Permanent_NPU, lr_graph=lr_graph_flash,
+            sg,
+            permanent_storage,
+            MemType.Permanent_NPU,
+            lr_graph=lr_graph_flash,
         )
 
     if npu_subgraphs:
diff --git a/ethosu/vela/driver_actions.py b/ethosu/vela/driver_actions.py
index 90af02c..4ad2a33 100644
--- a/ethosu/vela/driver_actions.py
+++ b/ethosu/vela/driver_actions.py
@@ -119,8 +119,7 @@ def emit_dump_shram(data: List[int]):
 
 
 def create_driver_payload(register_command_stream: List[int], arch: ArchitectureFeatures) -> bytes:
-    """Creates driver header and includes the given command
-    """
+    """Creates driver header and includes the given command"""
     # Prepare driver actions for this command tensor
     da_list: List[int] = []
     emit_fourcc(da_list, "COP1")
diff --git a/ethosu/vela/high_level_command_stream_generator.py b/ethosu/vela/high_level_command_stream_generator.py
index eef4e6d..81c0d5b 100644
--- a/ethosu/vela/high_level_command_stream_generator.py
+++ b/ethosu/vela/high_level_command_stream_generator.py
@@ -67,7 +67,13 @@ def generate_high_level_commands_for_sched_op(sched_op, schedule):
     ofm_tensor = ps.ofm_tensor
 
     # Get Tensors and Full Shapes
-    (ifm_tensor, ifm2_tensor, uncomp_weight_tensor, _, _,) = parent_op.get_ifm_ifm2_weights_biases_ofm()
+    (
+        ifm_tensor,
+        ifm2_tensor,
+        uncomp_weight_tensor,
+        _,
+        _,
+    ) = parent_op.get_ifm_ifm2_weights_biases_ofm()
     ifm = sched_op.ifm
     ifm2 = sched_op.ifm2
     ofm_shape = sched_op.ofm.shape
diff --git a/ethosu/vela/live_range.py b/ethosu/vela/live_range.py
index 45baf44..ccf4929 100644
--- a/ethosu/vela/live_range.py
+++ b/ethosu/vela/live_range.py
@@ -200,7 +200,11 @@ def merge_elementwise_op_ranges(sg, sched_op, lr_graph, target_mem_area, target_
 
 
 def extract_live_ranges_from_cascaded_passes(
-    sg, target_mem_area, target_mem_type_set, lr_graph=None, cpu_tensor_alignment=Tensor.AllocationQuantum,
+    sg,
+    target_mem_area,
+    target_mem_type_set,
+    lr_graph=None,
+    cpu_tensor_alignment=Tensor.AllocationQuantum,
 ):
     if lr_graph is None:
         lr_graph = LiveRangeGraph()
diff --git a/ethosu/vela/npu_performance.py b/ethosu/vela/npu_performance.py
index 4ffca49..0c8a907 100644
--- a/ethosu/vela/npu_performance.py
+++ b/ethosu/vela/npu_performance.py
@@ -59,14 +59,26 @@ class PassCycles(IntEnum):
     Size = auto()
 
     def display_name(self):
-        return ("NPU", "SRAM Access", "DRAM Access", "On-chip Flash Access", "Off-chip Flash Access", "Total", "Size",)[
-            self.value
-        ]
+        return (
+            "NPU",
+            "SRAM Access",
+            "DRAM Access",
+            "On-chip Flash Access",
+            "Off-chip Flash Access",
+            "Total",
+            "Size",
+        )[self.value]
 
     def identifier_name(self):
-        return ("npu", "sram_access", "dram_access", "on_chip_flash_access", "off_chip_flash_access", "total", "size",)[
-            self.value
-        ]
+        return (
+            "npu",
+            "sram_access",
+            "dram_access",
+            "on_chip_flash_access",
+            "off_chip_flash_access",
+            "total",
+            "size",
+        )[self.value]
 
     @staticmethod
     def all():
diff --git a/ethosu/vela/pass_packing.py b/ethosu/vela/pass_packing.py
index 1fefdf4..8535fa0 100644
--- a/ethosu/vela/pass_packing.py
+++ b/ethosu/vela/pass_packing.py
@@ -87,7 +87,14 @@ quantization_ops = set((Op.Dequantize, Op.Max, Op.Min))
 cpu_ops = set((Op.Softmax, Op.LRN, Op.Shape, Op.Pad, Op.AddN)) | quantization_ops
 
 startup_init_ops = set((Op.Const, Op.Placeholder, Op.SubgraphInput))
-memory_only_ops = set((Op.Squeeze, Op.Reshape, Op.QuantizedReshape, Op.ExpandDims,))
+memory_only_ops = set(
+    (
+        Op.Squeeze,
+        Op.Reshape,
+        Op.QuantizedReshape,
+        Op.ExpandDims,
+    )
+)
 
 
 test_sequence = [
diff --git a/ethosu/vela/range_set.py b/ethosu/vela/range_set.py
index f03174e..6b28282 100644
--- a/ethosu/vela/range_set.py
+++ b/ethosu/vela/range_set.py
@@ -21,7 +21,7 @@ from functools import lru_cache
 
 class RangeSet:
     """A Range set class to track ranges and whether they intersect.
-Intended for e.g. tracking sets of memory ranges and whether two commands use the same memory areas."""
+    Intended for e.g. tracking sets of memory ranges and whether two commands use the same memory areas."""
 
     def __init__(self, start=None, end=None, ranges=None):
         if ranges is None:
diff --git a/ethosu/vela/register_command_stream_generator.py b/ethosu/vela/register_command_stream_generator.py
index 3be2898..be01a75 100644
--- a/ethosu/vela/register_command_stream_generator.py
+++ b/ethosu/vela/register_command_stream_generator.py
@@ -521,7 +521,8 @@ def generate_biases(emit: CommandStreamEmitter, biases: List[NpuAddressRange], a
 
 
 def generate_block_config(
-    emit: CommandStreamEmitter, block_config: NpuShape3D,
+    emit: CommandStreamEmitter,
+    block_config: NpuShape3D,
 ):
     """Generates OFM_BLK_HEIGHT/WIDTH/DEPTH registers"""
     emit.cmd0_with_param(cmd0.NPU_SET_OFM_BLK_HEIGHT_M1, block_config.height - 1)
@@ -530,7 +531,9 @@ def generate_block_config(
 
 
 def generate_shram_registers(
-    emit: CommandStreamEmitter, npu_op: NpuBlockOperation, arch_block_config: ArchitectureBlockConfig,
+    emit: CommandStreamEmitter,
+    npu_op: NpuBlockOperation,
+    arch_block_config: ArchitectureBlockConfig,
 ):
     """Generates IB_END/IB_START/AB_START/ACC_FORMAT registers"""
     emit.cmd0_with_param(cmd0.NPU_SET_IFM_IB_END, arch_block_config.layout.ib_end)
@@ -775,9 +778,13 @@ def generate_scaling_for_elementwise(emit: CommandStreamEmitter, npu_op: NpuElem
             if use_advanced_scaling:
                 # Use advanced implementation only when input/output scales differ,
                 # or when we can't guarantee the absence of rounding errors
-                (opa_scale, opa_shift, ofm_scale, shift, op_to_scale,) = scaling.advanced_elementwise_add_sub_scale(
-                    input_scale, input2_scale, output_scale, bitdepth
-                )
+                (
+                    opa_scale,
+                    opa_shift,
+                    ofm_scale,
+                    shift,
+                    op_to_scale,
+                ) = scaling.advanced_elementwise_add_sub_scale(input_scale, input2_scale, output_scale, bitdepth)
                 opb_scale = 0  # Unused for this case
                 if npu_op.reversed_operands:
                     # If the operand order is reversed we also have to swap which operand is scaled
diff --git a/ethosu/vela/register_command_stream_util.py b/ethosu/vela/register_command_stream_util.py
index 83126ea..b2c84d7 100644
--- a/ethosu/vela/register_command_stream_util.py
+++ b/ethosu/vela/register_command_stream_util.py
@@ -204,7 +204,16 @@ def get_address_ranges(fm: NpuFeatureMap) -> List[Optional[NpuAddressRange]]:
     strides = get_strides(fm)
     height, width, depth = fm.shape.height, fm.shape.width, fm.shape.depth
     height_0, height_1, width_0 = fm.tiles.height_0, fm.tiles.height_1, fm.tiles.width_0
-    t0 = get_address_range(fm, strides, 0, 0, 0, min(height, height_0) - 1, min(width, width_0) - 1, depth - 1,)
+    t0 = get_address_range(
+        fm,
+        strides,
+        0,
+        0,
+        0,
+        min(height, height_0) - 1,
+        min(width, width_0) - 1,
+        depth - 1,
+    )
     if width > width_0:
         t1 = get_address_range(fm, strides, 0, width_0, 0, min(height, height_1) - 1, width - 1, depth - 1)
     else:
@@ -443,7 +452,9 @@ def get_first_job_input_volume(
     # IFM block that will be sampled for the FIRST+block_offset job in the next operator's OFM
     start_coord = PointXYZ(x=ifm_coord_x, y=ifm_coord_y, z=ifm_coord_z)
     end_coord = PointXYZ(
-        x=start_coord[0] + ifm_block.width, y=start_coord[1] + ifm_block.height, z=start_coord[2] + ifm_block.depth,
+        x=start_coord[0] + ifm_block.width,
+        y=start_coord[1] + ifm_block.height,
+        z=start_coord[2] + ifm_block.depth,
     )
     return (start_coord, end_coord, 1)  # start, end, total jobs
 
@@ -456,12 +467,18 @@ def get_prev_job_output_volume(ofm: Rect, ofm_block: Block, block_offset: int):
     if start_coord is None:
         return None
     end_coord = PointXYZ(
-        x=start_coord.x + ofm_block.width, y=start_coord.y + ofm_block.height, z=start_coord.z + ofm_block.depth,
+        x=start_coord.x + ofm_block.width,
+        y=start_coord.y + ofm_block.height,
+        z=start_coord.z + ofm_block.depth,
     )
     return (start_coord, end_coord, 1)  # start, end, total jobs for this OFM block
 
 
-def calc_blockdep(arch: ArchitectureFeatures, prev_op: Optional[NpuBlockOperation], npu_op: NpuBlockOperation,) -> int:
+def calc_blockdep(
+    arch: ArchitectureFeatures,
+    prev_op: Optional[NpuBlockOperation],
+    npu_op: NpuBlockOperation,
+) -> int:
     """Calculates the value of the BLOCKDEP register"""
     if prev_op is None:
         return 0
diff --git a/ethosu/vela/scheduler.py b/ethosu/vela/scheduler.py
index fe2d711..a19d053 100644
--- a/ethosu/vela/scheduler.py
+++ b/ethosu/vela/scheduler.py
@@ -113,7 +113,13 @@ class SchedulerOpInfo:
         self.full_weight_transfer_cycles = 0
 
     def copy(self):
-        res = SchedulerOpInfo(self.block_config, self.weights_size, self.stripe_input, self.stripe_input2, self.stripe,)
+        res = SchedulerOpInfo(
+            self.block_config,
+            self.weights_size,
+            self.stripe_input,
+            self.stripe_input2,
+            self.stripe,
+        )
         res.cascade = self.cascade
         return res
 
@@ -135,7 +141,10 @@ class SchedulerOptions:
     """Contains options for the Scheduler"""
 
     def __init__(
-        self, optimization_strategy, sram_target, verbose_schedule,
+        self,
+        optimization_strategy,
+        sram_target,
+        verbose_schedule,
     ):
         self.optimization_strategy = optimization_strategy
         self.optimization_sram_limit = sram_target
@@ -175,15 +184,28 @@ class SchedulerOperation:
         )
         self.ifm_ublock = arch.ifm_ublock
 
-        self.ifm = SchedulerTensor(ps.ifm_shapes[0], ps.ifm_tensor.dtype, ps.ifm_tensor.mem_area, ps.ifm_tensor.format,)
+        self.ifm = SchedulerTensor(
+            ps.ifm_shapes[0],
+            ps.ifm_tensor.dtype,
+            ps.ifm_tensor.mem_area,
+            ps.ifm_tensor.format,
+        )
 
         self.ifm2 = None
         if ps.ifm2_tensor:
             self.ifm2 = SchedulerTensor(
-                ps.ifm_shapes[1], ps.ifm2_tensor.dtype, ps.ifm2_tensor.mem_area, ps.ifm2_tensor.format,
+                ps.ifm_shapes[1],
+                ps.ifm2_tensor.dtype,
+                ps.ifm2_tensor.mem_area,
+                ps.ifm2_tensor.format,
             )
 
-        self.ofm = SchedulerTensor(ps.ofm_shapes[0], ps.ofm_tensor.dtype, ps.ofm_tensor.mem_area, ps.ofm_tensor.format,)
+        self.ofm = SchedulerTensor(
+            ps.ofm_shapes[0],
+            ps.ofm_tensor.dtype,
+            ps.ofm_tensor.mem_area,
+            ps.ofm_tensor.format,
+        )
 
         # Input volume width and height required to produce the smallest possible stripe
         self.min_stripe_input_w, self.min_stripe_input_h = self._calculate_min_stripe_input()
@@ -481,7 +503,11 @@ class Scheduler:
         lr_graph = live_range.LiveRangeGraph()
         for mem_area, mem_type_set in memories_list:
             live_range.extract_live_ranges_from_cascaded_passes(
-                self.nng.get_root_subgraph(), mem_area, mem_type_set, lr_graph, Tensor.AllocationQuantum,
+                self.nng.get_root_subgraph(),
+                mem_area,
+                mem_type_set,
+                lr_graph,
+                Tensor.AllocationQuantum,
             )
 
         # Populate time-array with memory used by live ranges
@@ -923,7 +949,11 @@ class Scheduler:
         return best_schedule
 
     def optimize_schedule(
-        self, schedule: Schedule, max_sched: Schedule, max_template: Schedule, options: SchedulerOptions,
+        self,
+        schedule: Schedule,
+        max_sched: Schedule,
+        max_template: Schedule,
+        options: SchedulerOptions,
     ) -> Schedule:
         """Extracts sub-schedules based on the cascades and optimizes them and applies them to the final schedule"""
         sram_limit = options.optimization_sram_limit
@@ -994,7 +1024,11 @@ class Scheduler:
         lr_graph = live_range.LiveRangeGraph()
         for mem_area, mem_type_set in memories_list:
             live_range.extract_live_ranges_from_cascaded_passes(
-                self.nng.get_root_subgraph(), mem_area, mem_type_set, lr_graph, Tensor.AllocationQuantum,
+                self.nng.get_root_subgraph(),
+                mem_area,
+                mem_type_set,
+                lr_graph,
+                Tensor.AllocationQuantum,
             )
         max_mem_usage = lr_graph.get_temporal_memory_usage(fast_storage_mem_area)
 
@@ -1252,7 +1286,14 @@ def schedule_passes(nng: Graph, arch: ArchitectureFeatures, options, scheduler_o
             cascaded_passes = []
             for idx, ps in enumerate(sg.passes):
                 cps = CascadedPass(
-                    ps.name, SchedulingStrategy.WeightStream, ps.inputs, [], ps.outputs, [ps], ps.placement, False,
+                    ps.name,
+                    SchedulingStrategy.WeightStream,
+                    ps.inputs,
+                    [],
+                    ps.outputs,
+                    [ps],
+                    ps.placement,
+                    False,
                 )
 
                 cps.time = idx
diff --git a/ethosu/vela/softmax.py b/ethosu/vela/softmax.py
index 711c1e0..9565bc5 100644
--- a/ethosu/vela/softmax.py
+++ b/ethosu/vela/softmax.py
@@ -300,11 +300,20 @@ class SoftMax:
 
         # PASS 5 - Sub
         headroom_offset = create_const_tensor(
-            "headroom_offset_const", [1, 1, 1, 1], DataType.int32, [12 + 31 - 8], np.int32, quantization=no_scale_quant,
+            "headroom_offset_const",
+            [1, 1, 1, 1],
+            DataType.int32,
+            [12 + 31 - 8],
+            np.int32,
+            quantization=no_scale_quant,
         )
         right_shift = add_op_get_ofm(
             create_sub(
-                f"{self.op.name}_sub{pass_number}", headroom_offset, headroom_plus_one, no_scale_quant, activation,
+                f"{self.op.name}_sub{pass_number}",
+                headroom_offset,
+                headroom_plus_one,
+                no_scale_quant,
+                activation,
             )
         )
 
@@ -329,7 +338,13 @@ class SoftMax:
 
         # PASS 9 - SHL
         shifted_sum_minus_one = add_op_get_ofm(
-            create_shl(f"{self.op.name}_shl{pass_number}", shifted_sum_minus_one, one, no_scale_quant, activation,)
+            create_shl(
+                f"{self.op.name}_shl{pass_number}",
+                shifted_sum_minus_one,
+                one,
+                no_scale_quant,
+                activation,
+            )
         )
 
         # PASS 10 - Add
@@ -353,7 +368,11 @@ class SoftMax:
         )
         rescaled = add_op_get_ofm(
             create_mul(
-                f"{self.op.name}_mul{pass_number}", half_denominator, neg_32_over_17, two_scale_quant, activation2,
+                f"{self.op.name}_mul{pass_number}",
+                half_denominator,
+                neg_32_over_17,
+                two_scale_quant,
+                activation2,
             )
         )
 
@@ -362,7 +381,13 @@ class SoftMax:
             "48_over_17_const", [1, 1, 1, 1], DataType.int32, [1515870810], np.int32, quantization=no_scale_quant
         )
         rescale_w_offset = add_op_get_ofm(
-            create_add(f"{self.op.name}_add{pass_number}", rescaled, const_48_over_17, one_scale_quant, activation,)
+            create_add(
+                f"{self.op.name}_add{pass_number}",
+                rescaled,
+                const_48_over_17,
+                one_scale_quant,
+                activation,
+            )
         )
 
         # PASS 13 - 27
@@ -376,12 +401,22 @@ class SoftMax:
         for _ in range(3):
             # PASS 13, 18, 23 - MUL
             half_denominator_times_x = add_op_get_ofm(
-                create_mul(f"{self.op.name}_mul{pass_number}", nr_x, half_denominator, two_scale_quant, activation2,)
+                create_mul(
+                    f"{self.op.name}_mul{pass_number}",
+                    nr_x,
+                    half_denominator,
+                    two_scale_quant,
+                    activation2,
+                )
             )
             # PASS 14, 19, 24 - SUB
             one_minus_half_denominator_times_x = add_op_get_ofm(
                 create_sub(
-                    f"{self.op.name}_sub{pass_number}", F2_one, half_denominator_times_x, one_scale_quant, activation,
+                    f"{self.op.name}_sub{pass_number}",
+                    F2_one,
+                    half_denominator_times_x,
+                    one_scale_quant,
+                    activation,
                 )
             )
             # PASS 15, 20, 25 - MUL
diff --git a/ethosu/vela/stats_writer.py b/ethosu/vela/stats_writer.py
index d8a274b..22605a6 100644
--- a/ethosu/vela/stats_writer.py
+++ b/ethosu/vela/stats_writer.py
@@ -256,7 +256,8 @@ def print_performance_metrics_for_strat(
         label += " bandwidth"
         bandwidth = arch.memory_bandwidths_per_second[mem_area] / 1000.0 / 1000 / 1000
         print(
-            f"Design peak {label:25}    {bandwidth:12.2f} GB/s", file=f,
+            f"Design peak {label:25}    {bandwidth:12.2f} GB/s",
+            file=f,
         )
     print(file=f)
     for mem_area, label in mem_area_labels:
@@ -302,7 +303,8 @@ def print_performance_metrics_for_strat(
         fm_bws = bws[TensorPurpose.FeatureMap]
         aug_label = label + " bandwidth"
         print(
-            f"Average {aug_label:25}        {total_bw * midpoint_fps / 1000.0 / 1000.0 / 1000.0:12.2f} GB/s", file=f,
+            f"Average {aug_label:25}        {total_bw * midpoint_fps / 1000.0 / 1000.0 / 1000.0:12.2f} GB/s",
+            file=f,
         )
         print(
             f"Input   {aug_label:25}        {np.sum(fm_bws[BandwidthDirection.Read]) / 1000.0 / 1000.0:12.2f} MB/batch",
@@ -328,10 +330,12 @@ def print_performance_metrics_for_strat(
         print(file=f)
 
     print(
-        f"Neural network macs                      {int(macs):12d} MACs/batch", file=f,
+        f"Neural network macs                      {int(macs):12d} MACs/batch",
+        file=f,
     )
     print(
-        f"Network Tops/s                           {macs * 2 * midpoint_fps / 1e12:12.2f} Tops/s", file=f,
+        f"Network Tops/s                           {macs * 2 * midpoint_fps / 1e12:12.2f} Tops/s",
+        file=f,
     )
     print(file=f)
 
diff --git a/ethosu/vela/tensor_allocation.py b/ethosu/vela/tensor_allocation.py
index c8b5129..ab65740 100644
--- a/ethosu/vela/tensor_allocation.py
+++ b/ethosu/vela/tensor_allocation.py
@@ -128,7 +128,12 @@ def print_allocation(lrs, mem_area, mem_type_set, tensor_allocator, sg, actual_m
     print("\n" + "#" * 80)
     sg_placement = (
         sg.placement.name
-        if mem_type_set.intersection((MemType.Permanent_NPU, MemType.Permanent_CPU,))
+        if mem_type_set.intersection(
+            (
+                MemType.Permanent_NPU,
+                MemType.Permanent_CPU,
+            )
+        )
         else "Cpu and Npu"
     )
     print(
@@ -141,7 +146,15 @@ def print_allocation(lrs, mem_area, mem_type_set, tensor_allocator, sg, actual_m
     min_mem_usage_for_alloc = max(memory_hist)
     print("Start Time -   End Time: Start Addr -   End Addr: Tensor Size: Memory Usage:  Tensor Purpose: Tensor Name")
     for start_time, end_time, size, start_addr, end_addr, purpose, name in sorted(
-        (lr.start_time, lr.end_time, lr.size, tens.address, tens.address + lr.size, tens.purpose, tens.name,)
+        (
+            lr.start_time,
+            lr.end_time,
+            lr.size,
+            tens.address,
+            tens.address + lr.size,
+            tens.purpose,
+            tens.name,
+        )
         for tens, lr in lrs.ranges.items()
     ):
         print(
@@ -184,7 +197,11 @@ def allocate(
 ):
     # Allocates addresses to tensors, returns False if tensors could not be fit within max_size
     lrs = live_range.extract_live_ranges_from_cascaded_passes(
-        sg, mem_area, mem_type_set, lr_graph=lr_graph, cpu_tensor_alignment=cpu_tensor_alignment,
+        sg,
+        mem_area,
+        mem_type_set,
+        lr_graph=lr_graph,
+        cpu_tensor_alignment=cpu_tensor_alignment,
     )
     total_sz = 0
     if lrs.ranges:
diff --git a/ethosu/vela/test/extapi/test_extapi_encode_weights.py b/ethosu/vela/test/extapi/test_extapi_encode_weights.py
index 6367cb3..87c504f 100644
--- a/ethosu/vela/test/extapi/test_extapi_encode_weights.py
+++ b/ethosu/vela/test/extapi/test_extapi_encode_weights.py
@@ -24,7 +24,8 @@ from ethosu.vela.api import NpuBlockTraversal
 
 
 @pytest.mark.parametrize(
-    "arch", list(NpuAccelerator),
+    "arch",
+    list(NpuAccelerator),
 )
 @pytest.mark.parametrize("dilation_x", [1, 2])
 @pytest.mark.parametrize("dilation_y", [1, 2])
@@ -32,7 +33,12 @@ from ethosu.vela.api import NpuBlockTraversal
 @pytest.mark.parametrize("depth_control", [1, 2, 3])
 @pytest.mark.parametrize("weights_shape_and_block_depth", [((16, 16, 16, 16), 8), ((3, 3, 25, 16), 8)])
 def test_encode_weights(
-    arch, weights_shape_and_block_depth, dilation_x, dilation_y, ifm_bitdepth, depth_control,
+    arch,
+    weights_shape_and_block_depth,
+    dilation_x,
+    dilation_y,
+    ifm_bitdepth,
+    depth_control,
 ):
     """
     This unit test checks the interface of the API function but not the functionality.
diff --git a/ethosu/vela/test/test_register_command_stream_util.py b/ethosu/vela/test/test_register_command_stream_util.py
index 985523f..86a48ff 100644
--- a/ethosu/vela/test/test_register_command_stream_util.py
+++ b/ethosu/vela/test/test_register_command_stream_util.py
@@ -131,14 +131,34 @@ def test_calc_blockdep0():
     op2 takes 1 block to complete, which results in blockdep 0
     """
     op1 = NpuElementWiseOperation(NpuElementWiseOp.CLZ)
-    op1.ifm = create_feature_map(NpuShape3D(height=1, width=1, depth=1), 1, 0x60, layout=NpuLayout.NHCWB16,)
-    intermediate_fm = create_feature_map(NpuShape3D(height=1, width=1, depth=1), 1, 0xA0, layout=NpuLayout.NHCWB16,)
+    op1.ifm = create_feature_map(
+        NpuShape3D(height=1, width=1, depth=1),
+        1,
+        0x60,
+        layout=NpuLayout.NHCWB16,
+    )
+    intermediate_fm = create_feature_map(
+        NpuShape3D(height=1, width=1, depth=1),
+        1,
+        0xA0,
+        layout=NpuLayout.NHCWB16,
+    )
     op1.ofm = intermediate_fm
     op1.block_config = NpuShape3D(height=1, width=1, depth=4)
     op2 = NpuElementWiseOperation(NpuElementWiseOp.SUB)
-    op2.ifm = create_feature_map(NpuShape3D(height=1, width=1, depth=1), 1, 0x39AC0, layout=NpuLayout.NHCWB16,)
+    op2.ifm = create_feature_map(
+        NpuShape3D(height=1, width=1, depth=1),
+        1,
+        0x39AC0,
+        layout=NpuLayout.NHCWB16,
+    )
     op2.ifm2 = intermediate_fm
-    op2.ofm = create_feature_map(NpuShape3D(height=1, width=1, depth=1), 1, 0xE0, layout=NpuLayout.NHCWB16,)
+    op2.ofm = create_feature_map(
+        NpuShape3D(height=1, width=1, depth=1),
+        1,
+        0xE0,
+        layout=NpuLayout.NHCWB16,
+    )
     op2.block_config = NpuShape3D(height=1, width=1, depth=4)
     arch = create_default_arch(Accelerator.Ethos_U55_128)
     block_dep = calc_blockdep(arch, op1, op2)
@@ -153,8 +173,18 @@ def test_calc_blockdep2():
     which results in blockdep 2
     """
     op1 = NpuConv2DOperation()
-    op1.ifm = create_feature_map(NpuShape3D(height=4, width=48, depth=8), 1, 0x4C80, layout=NpuLayout.NHCWB16,)
-    op1.ofm = create_feature_map(NpuShape3D(height=4, width=48, depth=16), 1, 0x6480, layout=NpuLayout.NHCWB16,)
+    op1.ifm = create_feature_map(
+        NpuShape3D(height=4, width=48, depth=8),
+        1,
+        0x4C80,
+        layout=NpuLayout.NHCWB16,
+    )
+    op1.ofm = create_feature_map(
+        NpuShape3D(height=4, width=48, depth=16),
+        1,
+        0x6480,
+        layout=NpuLayout.NHCWB16,
+    )
     op1.kernel = NpuKernel(1, 1)
     op1.weights = [NpuAddressRange(region=1, address=0x4AE0, length=208)]
     op1.biases = [NpuAddressRange(region=1, address=0x49A0, length=160)]
@@ -162,10 +192,20 @@ def test_calc_blockdep2():
     op1.block_traversal = NpuBlockTraversal.PART_KERNEL_FIRST
     op1.block_config = NpuShape3D(height=4, width=6, depth=16)
     op2 = NpuConvDepthWiseOperation()
-    op2.ifm = create_feature_map(NpuShape3D(height=3, width=48, depth=16), 1, 0, layout=NpuLayout.NHCWB16,)
+    op2.ifm = create_feature_map(
+        NpuShape3D(height=3, width=48, depth=16),
+        1,
+        0,
+        layout=NpuLayout.NHCWB16,
+    )
     # op2 has two tiles, the lower tile is produced by op1
     op2.ifm.tiles = NpuTileBox(height_0=2, height_1=2, width_0=48, addresses=[0x7680, 0, 0x6480, 0])
-    op2.ofm = create_feature_map(NpuShape3D(height=1, width=24, depth=16), 1, 0x6480, layout=NpuLayout.NHCWB16,)
+    op2.ofm = create_feature_map(
+        NpuShape3D(height=1, width=24, depth=16),
+        1,
+        0x6480,
+        layout=NpuLayout.NHCWB16,
+    )
     op2.kernel = NpuKernel(3, 3, stride_x=2, stride_y=2)
     op2.weights = [NpuAddressRange(region=1, address=0x4BB0, length=208)]
     op2.biases = [NpuAddressRange(region=1, address=0x4A40, length=160)]
@@ -183,8 +223,18 @@ def test_calc_blockdep3():
     which results in blockdep 3
     """
     op1 = NpuConv2DOperation()
-    op1.ifm = create_feature_map(NpuShape3D(height=13, width=96, depth=1), 1, 0, layout=NpuLayout.NHWC,)
-    op1.ofm = create_feature_map(NpuShape3D(height=6, width=48, depth=8), 1, 0x7C80, layout=NpuLayout.NHCWB16,)
+    op1.ifm = create_feature_map(
+        NpuShape3D(height=13, width=96, depth=1),
+        1,
+        0,
+        layout=NpuLayout.NHWC,
+    )
+    op1.ofm = create_feature_map(
+        NpuShape3D(height=6, width=48, depth=8),
+        1,
+        0x7C80,
+        layout=NpuLayout.NHCWB16,
+    )
     op1.kernel = NpuKernel(3, 3, stride_x=2, stride_y=2)
     op1.weights = [NpuAddressRange(region=1, address=0x4AE0, length=144)]
     op1.biases = [NpuAddressRange(region=1, address=0x49A0, length=80)]
@@ -192,8 +242,18 @@ def test_calc_blockdep3():
     op1.block_traversal = NpuBlockTraversal.PART_KERNEL_FIRST
     op1.block_config = NpuShape3D(height=6, width=3, depth=8)
     op2 = NpuConvDepthWiseOperation()
-    op2.ifm = create_feature_map(NpuShape3D(height=5, width=48, depth=8), 1, 0x7C80, layout=NpuLayout.NHCWB16,)
-    op2.ofm = create_feature_map(NpuShape3D(height=4, width=48, depth=8), 1, 0x4C80, layout=NpuLayout.NHCWB16,)
+    op2.ifm = create_feature_map(
+        NpuShape3D(height=5, width=48, depth=8),
+        1,
+        0x7C80,
+        layout=NpuLayout.NHCWB16,
+    )
+    op2.ofm = create_feature_map(
+        NpuShape3D(height=4, width=48, depth=8),
+        1,
+        0x4C80,
+        layout=NpuLayout.NHCWB16,
+    )
     op2.kernel = NpuKernel(3, 3)
     op2.weights = [NpuAddressRange(region=1, address=0x4BB0, length=112)]
     op2.biases = [NpuAddressRange(region=1, address=0x4A40, length=80)]
diff --git a/ethosu/vela/test/test_tflite_model_semantic.py b/ethosu/vela/test/test_tflite_model_semantic.py
index 84f9916..1e5dbd4 100644
--- a/ethosu/vela/test/test_tflite_model_semantic.py
+++ b/ethosu/vela/test/test_tflite_model_semantic.py
@@ -128,7 +128,14 @@ def test_constraint_quant_scale_inf():
 def test_constraint_ofm_scale_too_small():
     # Tests handling of OFM scale < 1e-38
     shp = [1, 10, 20, 16]
-    op = testutil.create_elemwise_op(Op.Mul, "mul", shp, shp, shp, ofm_quant=testutil.default_quant_params(),)
+    op = testutil.create_elemwise_op(
+        Op.Mul,
+        "mul",
+        shp,
+        shp,
+        shp,
+        ofm_quant=testutil.default_quant_params(),
+    )
     assert semantic_checker.is_operator_semantic_valid(op)
     op.ofm.quantization.scale_f32 = 1e-43
     assert not semantic_checker.is_operator_semantic_valid(op)
@@ -245,7 +252,12 @@ def create_strided_slice_op(in_shape, out_shape, start_offsets, end_offsets):
 
 
 def create_pad_op(
-    in_shape, out_shape, padding, in_dtype=DataType.int8, out_dtype=DataType.int8, pad_dtype=DataType.int32,
+    in_shape,
+    out_shape,
+    padding,
+    in_dtype=DataType.int8,
+    out_dtype=DataType.int8,
+    pad_dtype=DataType.int32,
 ):
     qp = testutil.default_quant_params()
     in0 = Tensor(in_shape, in_dtype, "in")
@@ -259,7 +271,11 @@ def create_pad_op(
 
 def test_constraint_pad_input_count():
     # Incorrect number of input tensors (2)
-    op = create_pad_op(in_shape=[1, 1, 1, 1], out_shape=[1, 3, 3, 1], padding=[[0, 0], [1, 1], [1, 1], [0, 0]],)
+    op = create_pad_op(
+        in_shape=[1, 1, 1, 1],
+        out_shape=[1, 3, 3, 1],
+        padding=[[0, 0], [1, 1], [1, 1], [0, 0]],
+    )
     assert semantic_checker.is_operator_semantic_valid(op)
     op.add_input_tensor(op.inputs[0].clone())
     assert not semantic_checker.is_operator_semantic_valid(op)
diff --git a/ethosu/vela/test/test_tflite_supported_operators.py b/ethosu/vela/test/test_tflite_supported_operators.py
index e3db791..04d3cba 100644
--- a/ethosu/vela/test/test_tflite_supported_operators.py
+++ b/ethosu/vela/test/test_tflite_supported_operators.py
@@ -345,7 +345,12 @@ def test_constraint_concat_pass():
 
 
 def create_pad_op(
-    in_shape, out_shape, padding, in_dtype=DataType.int8, out_dtype=DataType.int8, pad_dtype=DataType.int32,
+    in_shape,
+    out_shape,
+    padding,
+    in_dtype=DataType.int8,
+    out_dtype=DataType.int8,
+    pad_dtype=DataType.int32,
 ):
     qp = testutil.default_quant_params()
     in0 = Tensor(in_shape, in_dtype, "in")
@@ -359,11 +364,23 @@ def create_pad_op(
 
 def test_constraint_padded_dimensions():
     # Incorrect padding dimensions, can only pad width and height
-    op = create_pad_op(in_shape=[1, 1, 1, 1], out_shape=[1, 3, 3, 1], padding=[[1, 1], [1, 1], [1, 1], [0, 0]],)
+    op = create_pad_op(
+        in_shape=[1, 1, 1, 1],
+        out_shape=[1, 3, 3, 1],
+        padding=[[1, 1], [1, 1], [1, 1], [0, 0]],
+    )
     assert not support.is_operator_supported(op)
-    op = create_pad_op(in_shape=[1, 1, 1, 1], out_shape=[1, 3, 3, 1], padding=[[1, 1], [1, 1], [0, 0]],)
+    op = create_pad_op(
+        in_shape=[1, 1, 1, 1],
+        out_shape=[1, 3, 3, 1],
+        padding=[[1, 1], [1, 1], [0, 0]],
+    )
     assert support.is_operator_supported(op)
-    op = create_pad_op(in_shape=[1, 1, 1, 1], out_shape=[1, 3, 3, 1], padding=[[1, 1], [1, 1], [0, 1]],)
+    op = create_pad_op(
+        in_shape=[1, 1, 1, 1],
+        out_shape=[1, 3, 3, 1],
+        padding=[[1, 1], [1, 1], [0, 1]],
+    )
     assert not support.is_operator_supported(op)
 
 
@@ -371,12 +388,20 @@ def test_constraint_pad_shape():
     # PAD operator must be of shape (3,2) or (4,2)
     op = create_pad_op(in_shape=[1, 1, 1, 1], out_shape=[1, 3, 3, 1], padding=[[1, 1], [1, 1], [0, 0]])
     assert support.is_operator_supported(op)
-    op = create_pad_op(in_shape=[1, 1, 1, 1], out_shape=[1, 3, 3, 1], padding=[[0, 0], [1, 1], [1, 1], [0, 0], [0, 0]],)
+    op = create_pad_op(
+        in_shape=[1, 1, 1, 1],
+        out_shape=[1, 3, 3, 1],
+        padding=[[0, 0], [1, 1], [1, 1], [0, 0], [0, 0]],
+    )
     assert not support.is_operator_supported(op)
 
 
 def test_constraint_pad_none():
-    op = create_pad_op(in_shape=[1, 1, 1, 1], out_shape=[1, 3, 3, 1], padding=[],)
+    op = create_pad_op(
+        in_shape=[1, 1, 1, 1],
+        out_shape=[1, 3, 3, 1],
+        padding=[],
+    )
     assert not support.is_operator_supported(op)
 
 
diff --git a/ethosu/vela/tflite_graph_optimiser.py b/ethosu/vela/tflite_graph_optimiser.py
index fb8a08c..88d58a3 100644
--- a/ethosu/vela/tflite_graph_optimiser.py
+++ b/ethosu/vela/tflite_graph_optimiser.py
@@ -512,7 +512,10 @@ def add_padding_fields(op, arch, nng):
                 )
             else:
                 padding, skirt = calc_padding_and_skirt(
-                    op.attrs["padding"], op.kernel, input_shape, op.attrs.get("explicit_padding"),
+                    op.attrs["padding"],
+                    op.kernel,
+                    input_shape,
+                    op.attrs.get("explicit_padding"),
                 )
 
             op.attrs["explicit_padding"] = padding
@@ -642,11 +645,11 @@ def convert_softmax(op, arch, nng):
 def convert_mul_max_to_abs_or_lrelu(op, arch, nng):
     r"""Whenever there is a subgraph with this topology:
 
-       Input    X   For X = -1 or X > 0
-       |   \   /    This subgraph can be replaced with either
-       |    Mul     an Abs (if X = -1) or a LeakyReLU (if X > 0)
-       |   /
-       Max
+    Input    X   For X = -1 or X > 0
+    |   \   /    This subgraph can be replaced with either
+    |    Mul     an Abs (if X = -1) or a LeakyReLU (if X > 0)
+    |   /
+    Max
     """
 
     if op.type == Op.Maximum:
@@ -1246,7 +1249,12 @@ def convert_mean_to_depthwise_conv_or_avgpool(op, arch, nng):
                     quant = QuantizationParameters()
                     quant.zero_point = 0
                     bias_term_tens = create_const_tensor(
-                        op.name + "_bias", [1, 1, 1, 1], DataType.int16, [bias_term], np.int16, quantization=quant,
+                        op.name + "_bias",
+                        [1, 1, 1, 1],
+                        DataType.int16,
+                        [bias_term],
+                        np.int16,
+                        quantization=quant,
                     )
                     add_op.add_input_tensor(bias_term_tens)
                     add_op.set_output_tensor(op.ofm)
@@ -1370,7 +1378,12 @@ def convert_mean_to_depthwise_conv_or_avgpool(op, arch, nng):
             bias_shape = [shape[-1]]
             op.set_input_tensor(
                 create_const_tensor(
-                    "bias", bias_shape, inp.dtype, np.ones(bias_shape) * bias, value_dtype=np.int32, quantization=None,
+                    "bias",
+                    bias_shape,
+                    inp.dtype,
+                    np.ones(bias_shape) * bias,
+                    value_dtype=np.int32,
+                    quantization=None,
                 ),
                 2,
             )
@@ -1392,7 +1405,12 @@ def tflite_optimise_graph(nng, arch):
 
     for idx, sg in enumerate(nng.subgraphs):
         nng.subgraphs[idx] = rewrite_graph.rewrite_graph_pre_order(
-            nng, sg, arch, [], pre_process_list, rewrite_unsupported=False,
+            nng,
+            sg,
+            arch,
+            [],
+            pre_process_list,
+            rewrite_unsupported=False,
         )
 
     # Handle Concat Ops
@@ -1413,13 +1431,23 @@ def tflite_optimise_graph(nng, arch):
 
     for idx, sg in enumerate(nng.subgraphs):
         nng.subgraphs[idx] = rewrite_graph.rewrite_graph_pre_order(
-            nng, sg, arch, [rewrite_split_ops], [], rewrite_unsupported=False,
+            nng,
+            sg,
+            arch,
+            [rewrite_split_ops],
+            [],
+            rewrite_unsupported=False,
         )
 
     # Handle sg input output
     for idx, sg in enumerate(nng.subgraphs):
         nng.subgraphs[idx] = rewrite_graph.rewrite_graph_pre_order(
-            nng, sg, arch, [], [fix_sg_input_output], rewrite_unsupported=False,
+            nng,
+            sg,
+            arch,
+            [],
+            [fix_sg_input_output],
+            rewrite_unsupported=False,
         )
 
     # Removal of memory only operators
@@ -1452,7 +1480,12 @@ def tflite_optimise_graph(nng, arch):
 
     for idx, sg in enumerate(nng.subgraphs):
         nng.subgraphs[idx] = rewrite_graph.rewrite_graph_pre_order(
-            nng, sg, arch, [], op_rewrite_list, rewrite_unsupported=False,
+            nng,
+            sg,
+            arch,
+            [],
+            op_rewrite_list,
+            rewrite_unsupported=False,
         )
 
     for idx, sg in enumerate(nng.subgraphs):
diff --git a/ethosu/vela/tflite_mapping.py b/ethosu/vela/tflite_mapping.py
index 2a70b06..a5f7fa2 100644
--- a/ethosu/vela/tflite_mapping.py
+++ b/ethosu/vela/tflite_mapping.py
@@ -669,7 +669,10 @@ builtin_operator_map = {
     BuiltinOperator.DIV: (Op.Div, OptionsSerializer("DivOptions", (fused_act,)), TFLITE_NO_INDICES),
     BuiltinOperator.SQUEEZE: (
         Op.Squeeze,
-        OptionsSerializer("SqueezeOptions", (("squeeze_dims", is_int_vec),),),
+        OptionsSerializer(
+            "SqueezeOptions",
+            (("squeeze_dims", is_int_vec),),
+        ),
         TFLITE_IFM_INDICES,
     ),
     BuiltinOperator.UNIDIRECTIONAL_SEQUENCE_LSTM: (
@@ -915,7 +918,13 @@ builtin_operator_map = {
     ),
     BuiltinOperator.VAR_HANDLE: (
         Op.VarHandle,
-        OptionsSerializer("VarHandleOptions", ("container", "shared_name",),),
+        OptionsSerializer(
+            "VarHandleOptions",
+            (
+                "container",
+                "shared_name",
+            ),
+        ),
         TFLITE_NO_INDICES,
     ),
     BuiltinOperator.READ_VARIABLE: (Op.ReadVariable, OptionsSerializer("ReadVariableOptions"), TFLITE_NO_INDICES),
@@ -923,7 +932,13 @@ builtin_operator_map = {
     BuiltinOperator.BROADCAST_ARGS: (Op.BroadcastArgs, None, TFLITE_NO_INDICES),
     BuiltinOperator.RANDOM_STANDARD_NORMAL: (
         Op.RandomStandardNormal,
-        OptionsSerializer("RandomOptions", ("seed", "seed2",),),
+        OptionsSerializer(
+            "RandomOptions",
+            (
+                "seed",
+                "seed2",
+            ),
+        ),
         TFLITE_NO_INDICES,
     ),
     BuiltinOperator.CUSTOM: (Op.Custom, CustomOptionsSerializer(), TFLITE_NO_INDICES),
diff --git a/ethosu/vela/tflite_model_semantic.py b/ethosu/vela/tflite_model_semantic.py
index 3b7f248..b264479 100644
--- a/ethosu/vela/tflite_model_semantic.py
+++ b/ethosu/vela/tflite_model_semantic.py
@@ -41,7 +41,13 @@ def _optype_formatter(op_list):
 
 class TFLiteSemantic:
     # Categorised lists of operators
-    convolution_ops = set((Op.Conv2DBias, Op.Conv2D, Op.QuantizedConv2D,))
+    convolution_ops = set(
+        (
+            Op.Conv2DBias,
+            Op.Conv2D,
+            Op.QuantizedConv2D,
+        )
+    )
     depthwise_convolution_ops = set((Op.DepthwiseConv2DBias,))
     transpose_convolution_ops = set((Op.Conv2DBackpropInput,))
     convolution_like_ops = convolution_ops | depthwise_convolution_ops | transpose_convolution_ops
@@ -49,13 +55,36 @@ class TFLiteSemantic:
     avg_pooling_ops = Op.op_set(Op.is_avgpool_op)
     pooling_ops = set((Op.ReduceSum,)) | max_pooling_ops | avg_pooling_ops
     unary_elem_wise_main_ops = Op.op_set(Op.is_unary_elementwise_op)
-    binary_elem_wise_min_max_ops = set((Op.Minimum, Op.Maximum,))
-    binary_elem_wise_shift_ops = set((Op.SHL, Op.SHR,))
-    binary_elem_wise_add_mul_sub = set((Op.Add, Op.Mul, Op.Sub,))
+    binary_elem_wise_min_max_ops = set(
+        (
+            Op.Minimum,
+            Op.Maximum,
+        )
+    )
+    binary_elem_wise_shift_ops = set(
+        (
+            Op.SHL,
+            Op.SHR,
+        )
+    )
+    binary_elem_wise_add_mul_sub = set(
+        (
+            Op.Add,
+            Op.Mul,
+            Op.Sub,
+        )
+    )
     binary_elem_wise_main_ops = binary_elem_wise_min_max_ops | binary_elem_wise_add_mul_sub | binary_elem_wise_shift_ops
     elem_wise_main_ops = binary_elem_wise_main_ops | unary_elem_wise_main_ops
     shapeless_input_ops = binary_elem_wise_main_ops | set((Op.Split, Op.SplitV, Op.Mean, Op.ExpandDims))
-    reshape_ops = set((Op.Reshape, Op.QuantizedReshape, Op.Squeeze, Op.ExpandDims,))
+    reshape_ops = set(
+        (
+            Op.Reshape,
+            Op.QuantizedReshape,
+            Op.Squeeze,
+            Op.ExpandDims,
+        )
+    )
 
     def __init__(self):
         # Setup the generic constraints. Note: the order matters
diff --git a/ethosu/vela/tflite_supported_operators.py b/ethosu/vela/tflite_supported_operators.py
index 4d82677..6328a4e 100644
--- a/ethosu/vela/tflite_supported_operators.py
+++ b/ethosu/vela/tflite_supported_operators.py
@@ -40,7 +40,13 @@ def _optype_formatter(op_list):
 class TFLiteSupportedOperators:
     # Categorised lists of supported operators
     npu_pre_ops = set((Op.SplitSliceRead,))
-    convolution_ops = set((Op.Conv2DBias, Op.Conv2D, Op.QuantizedConv2D,))
+    convolution_ops = set(
+        (
+            Op.Conv2DBias,
+            Op.Conv2D,
+            Op.QuantizedConv2D,
+        )
+    )
     depthwise_convolution_ops = set((Op.DepthwiseConv2DBias,))
     transpose_convolution_ops = set((Op.Conv2DBackpropInput,))
     convolution_like_ops = convolution_ops | depthwise_convolution_ops | transpose_convolution_ops
@@ -48,7 +54,13 @@ class TFLiteSupportedOperators:
     avg_pooling_ops = Op.op_set(Op.is_avgpool_op)
     pooling_ops = set((Op.ReduceSum,)) | max_pooling_ops | avg_pooling_ops
     resizing_ops = set((Op.ResizeBilinear,))
-    fc_vector_products = set((Op.QuantizedMatMul, Op.MatMul, Op.FullyConnected,))
+    fc_vector_products = set(
+        (
+            Op.QuantizedMatMul,
+            Op.MatMul,
+            Op.FullyConnected,
+        )
+    )
     mac_main_ops = (
         # RNN/LSTM/GRU
         set((Op.BlockLSTM,))
@@ -64,17 +76,47 @@ class TFLiteSupportedOperators:
         | set((Op.Mean,))
     )
     unary_elem_wise_main_ops = Op.op_set(Op.is_unary_elementwise_op)
-    binary_elem_wise_min_max_ops = set((Op.Minimum, Op.Maximum,))
-    binary_elem_wise_shift_ops = set((Op.SHL, Op.SHR,))
-    binary_elem_wise_add_mul_sub = set((Op.Add, Op.Mul, Op.Sub,))
+    binary_elem_wise_min_max_ops = set(
+        (
+            Op.Minimum,
+            Op.Maximum,
+        )
+    )
+    binary_elem_wise_shift_ops = set(
+        (
+            Op.SHL,
+            Op.SHR,
+        )
+    )
+    binary_elem_wise_add_mul_sub = set(
+        (
+            Op.Add,
+            Op.Mul,
+            Op.Sub,
+        )
+    )
     binary_elem_wise_main_ops = binary_elem_wise_min_max_ops | binary_elem_wise_add_mul_sub | binary_elem_wise_shift_ops
     elem_wise_main_ops = binary_elem_wise_main_ops | unary_elem_wise_main_ops
     pad_ops = set((Op.Pad,))
     supported_int32_tensor_ops = (
-        set((Op.ReduceSum, Op.CLZ,)) | binary_elem_wise_add_mul_sub | binary_elem_wise_shift_ops
+        set(
+            (
+                Op.ReduceSum,
+                Op.CLZ,
+            )
+        )
+        | binary_elem_wise_add_mul_sub
+        | binary_elem_wise_shift_ops
     )
 
-    relu_ops = set((Op.Relu, Op.Relu6, Op.ReluN1To1, Op.Clip,))
+    relu_ops = set(
+        (
+            Op.Relu,
+            Op.Relu6,
+            Op.ReluN1To1,
+            Op.Clip,
+        )
+    )
     activation_ops = relu_ops | set((Op.Tanh, Op.Sigmoid, Op.Softmax, Op.HardSwish))
     npu_post_ops = (
         # activation functions
@@ -84,11 +126,44 @@ class TFLiteSupportedOperators:
         # Quantization
         | set((Op.Quantize,))
     )
-    split_ops = set((Op.Split, Op.SplitV, Op.StridedSlice, Op.Slice, Op.UnpackReshaped, Op.Unpack,))
-    concat_ops = set((Op.Concat, Op.ConcatTFLite, Op.PackReshaped, Op.Pack,))
-    memory_only_ops = set((Op.Reshape, Op.QuantizedReshape, Op.Squeeze, Op.ExpandDims,)) | concat_ops | split_ops
+    split_ops = set(
+        (
+            Op.Split,
+            Op.SplitV,
+            Op.StridedSlice,
+            Op.Slice,
+            Op.UnpackReshaped,
+            Op.Unpack,
+        )
+    )
+    concat_ops = set(
+        (
+            Op.Concat,
+            Op.ConcatTFLite,
+            Op.PackReshaped,
+            Op.Pack,
+        )
+    )
+    memory_only_ops = (
+        set(
+            (
+                Op.Reshape,
+                Op.QuantizedReshape,
+                Op.Squeeze,
+                Op.ExpandDims,
+            )
+        )
+        | concat_ops
+        | split_ops
+    )
     per_axis_quant_ops = convolution_like_ops  # per-axis/channel quantization only currently supported for conv ops
-    supported_fused_activations = relu_ops | set((Op.Tanh, Op.Sigmoid, Op.LUT,))
+    supported_fused_activations = relu_ops | set(
+        (
+            Op.Tanh,
+            Op.Sigmoid,
+            Op.LUT,
+        )
+    )
     supported_operators = npu_pre_ops | mac_main_ops | elem_wise_main_ops | pad_ops | npu_post_ops | memory_only_ops
     # Supported data types
     supported_op_dtypes = set((DataType.uint8, DataType.int8, DataType.int16, DataType.int32))
@@ -441,7 +516,7 @@ class TFLiteSupportedOperators:
     @staticmethod
     def constraint_tconv_valid(op):
         """VALID padding: OFM dimensions must equal IFM dimensions multiplied by stride,
-                  minus difference between kernel size and stride"""
+        minus difference between kernel size and stride"""
         if op.attrs["padding"] == Padding.VALID:
             s_w = op.kernel.stride.x
             s_h = op.kernel.stride.y
diff --git a/ethosu/vela/tosa_graph_optimiser.py b/ethosu/vela/tosa_graph_optimiser.py
index 9e72a6c..778aa2a 100644
--- a/ethosu/vela/tosa_graph_optimiser.py
+++ b/ethosu/vela/tosa_graph_optimiser.py
@@ -876,7 +876,12 @@ def tosa_optimise_graph(nng, arch):
     # TODO the supported operator checking need to be split in semantic and HW checks
     for idx, sg in enumerate(nng.subgraphs):
         nng.subgraphs[idx] = rewrite_graph.rewrite_graph_pre_order(
-            nng, sg, arch, [], [supported_operator_check], rewrite_unsupported=False,
+            nng,
+            sg,
+            arch,
+            [],
+            [supported_operator_check],
+            rewrite_unsupported=False,
         )
 
     # Decomposing and rewrite of concat
@@ -893,7 +898,12 @@ def tosa_optimise_graph(nng, arch):
     # Handle sg input output
     for idx, sg in enumerate(nng.subgraphs):
         nng.subgraphs[idx] = rewrite_graph.rewrite_graph_pre_order(
-            nng, sg, arch, [], [fix_sg_input_output_tosa], rewrite_unsupported=True,
+            nng,
+            sg,
+            arch,
+            [],
+            [fix_sg_input_output_tosa],
+            rewrite_unsupported=True,
         )
 
     # Removal of reshapes
@@ -909,19 +919,34 @@ def tosa_optimise_graph(nng, arch):
 
     for idx, sg in enumerate(nng.subgraphs):
         nng.subgraphs[idx] = rewrite_graph.rewrite_graph_pre_order(
-            nng, sg, arch, [], [set_ifm_ofm_op_shapes], rewrite_unsupported=False,
+            nng,
+            sg,
+            arch,
+            [],
+            [set_ifm_ofm_op_shapes],
+            rewrite_unsupported=False,
         )
 
     # Removal of Transpose
     for idx, sg in enumerate(nng.subgraphs):
         nng.subgraphs[idx] = rewrite_graph.rewrite_graph_pre_order(
-            nng, sg, arch, [], [remove_const_transpose], rewrite_unsupported=False,
+            nng,
+            sg,
+            arch,
+            [],
+            [remove_const_transpose],
+            rewrite_unsupported=False,
         )
 
     # TODO, when and where to best handle calc_scaling_avgpool
     for idx, sg in enumerate(nng.subgraphs):
         nng.subgraphs[idx] = rewrite_graph.rewrite_graph_pre_order(
-            nng, sg, arch, [], [calc_scaling_avgpool], rewrite_unsupported=False,
+            nng,
+            sg,
+            arch,
+            [],
+            [calc_scaling_avgpool],
+            rewrite_unsupported=False,
         )
 
     # Rewite Operators step
@@ -929,13 +954,22 @@ def tosa_optimise_graph(nng, arch):
 
     for idx, sg in enumerate(nng.subgraphs):
         nng.subgraphs[idx] = rewrite_graph.rewrite_graph_pre_order(
-            nng, sg, arch, [], op_rewrite_list, rewrite_unsupported=False,
+            nng,
+            sg,
+            arch,
+            [],
+            op_rewrite_list,
+            rewrite_unsupported=False,
         )
 
     # Post-processing step 1
     for idx, sg in enumerate(nng.subgraphs):
         nng.subgraphs[idx] = rewrite_graph.rewrite_graph_pre_order(
-            nng, sg, arch, [], [rewrite_activation, add_padding_fields],
+            nng,
+            sg,
+            arch,
+            [],
+            [rewrite_activation, add_padding_fields],
         )
 
     # Removal of Slice, need to be done after optimisation has been performed,
@@ -946,6 +980,12 @@ def tosa_optimise_graph(nng, arch):
 
     # Post-processing step 2
     for idx, sg in enumerate(nng.subgraphs):
-        nng.subgraphs[idx] = rewrite_graph.rewrite_graph_pre_order(nng, sg, arch, [], [fixup_quantization],)
+        nng.subgraphs[idx] = rewrite_graph.rewrite_graph_pre_order(
+            nng,
+            sg,
+            arch,
+            [],
+            [fixup_quantization],
+        )
 
     return nng
diff --git a/ethosu/vela/tosa_supported_operators.py b/ethosu/vela/tosa_supported_operators.py
index e378511..15e1569 100644
--- a/ethosu/vela/tosa_supported_operators.py
+++ b/ethosu/vela/tosa_supported_operators.py
@@ -38,11 +38,30 @@ class TosaSupportedOperators:
     fc_vector_products = set((Op.FullyConnected,))
 
     mac_main_ops = convolution_like_ops | pooling_ops | fc_vector_products
-    memory_only_ops = set((Op.Reshape, Op.Transpose, Op.Concat, Op.SplitSliceRead,))
-    binary_elem_wise_add_mul_sub = set((Op.Add, Op.Mul, Op.RescaleMul, Op.Sub,))
+    memory_only_ops = set(
+        (
+            Op.Reshape,
+            Op.Transpose,
+            Op.Concat,
+            Op.SplitSliceRead,
+        )
+    )
+    binary_elem_wise_add_mul_sub = set(
+        (
+            Op.Add,
+            Op.Mul,
+            Op.RescaleMul,
+            Op.Sub,
+        )
+    )
     elem_wise_ops = binary_elem_wise_add_mul_sub
     type_conversion_ops = set((Op.Rescale,))
-    relu_ops = set((Op.Clamp, Op.ReluN,))
+    relu_ops = set(
+        (
+            Op.Clamp,
+            Op.ReluN,
+        )
+    )
     activation_ops = relu_ops | set((Op.Table,))
     pad_ops = set((Op.Pad,))
 
diff --git a/ethosu/vela/weight_compressor.py b/ethosu/vela/weight_compressor.py
index cdac641..78c4351 100644
--- a/ethosu/vela/weight_compressor.py
+++ b/ethosu/vela/weight_compressor.py
@@ -42,7 +42,8 @@ from ethosu import mlw_codec
 # Contains meta info for a weight compression. If two tensors have identical weight compression config,
 # then they also will have identical compressed weights.
 WeightCompressionConfig = namedtuple(
-    "WeightCompressionConfig", ["npu_block_type", "ofm_block_depth", "ofm_depth_step", "dilation", "weight_value_id"],
+    "WeightCompressionConfig",
+    ["npu_block_type", "ofm_block_depth", "ofm_depth_step", "dilation", "weight_value_id"],
 )
 
 ScaleCompressionConfig = namedtuple("ScaleCompressionConfig", ["scale_value_id", "ifm_scale", "ofm_scale"])
author	Jonas Ohlsson <jonas.ohlsson@arm.com>	2022-03-30 10:30:25 +0200
committer	Jonas Ohlsson <jonas.ohlsson@arm.com>	2022-03-30 15:54:14 +0200
commit	d85750702229af97c0b0bbda6e397a23254b6144 (patch)
tree	389962105a35d5cef595cfeb5d640bd59a0d0ff8
parent	cc5f4de1c35ba44fca7ff6295c6ae846f8242344 (diff)
download	ethos-u-vela-d85750702229af97c0b0bbda6e397a23254b6144.tar.gz