From 845e23200d471e44f274940846e400d170b5ff37 Mon Sep 17 00:00:00 2001
From: Jonas Ohlsson <jonas.ohlsson@arm.com>
Date: Tue, 1 Mar 2022 12:39:55 +0100
Subject: MLBEDSW-3367 Add mypy to pre-commit

Add mypy to pre-commit and clean up all reported errors.

Signed-off-by: Jonas Ohlsson <jonas.ohlsson@arm.com>
Change-Id: If7dc869f5fecdb0e2db40f14e7d9db21aa33df71
---
 .pre-commit-config.yaml                          |  9 ++++++
 ethosu/mlw_codec/test/test_mlw_codec.py          |  5 +++-
 ethosu/vela/architecture_allocator.py            | 36 ++++++++++++++----------
 ethosu/vela/high_level_command_stream.py         |  5 ++--
 ethosu/vela/high_level_command_to_npu_op.py      | 11 +++++---
 ethosu/vela/hillclimb_allocation.py              |  6 ++--
 ethosu/vela/nn_graph.py                          |  4 +++
 ethosu/vela/npu_performance.py                   |  3 +-
 ethosu/vela/operation.py                         | 19 +++++++------
 ethosu/vela/operation_util.py                    |  2 +-
 ethosu/vela/register_command_stream_generator.py |  9 ++++--
 ethosu/vela/register_command_stream_util.py      |  6 ++--
 ethosu/vela/scheduler.py                         | 35 ++++++++++++++---------
 ethosu/vela/tensor.py                            |  2 +-
 ethosu/vela/tensor_allocation.py                 |  2 +-
 ethosu/vela/weight_compressor.py                 |  6 ++--
 setup.py                                         |  2 +-
 17 files changed, 102 insertions(+), 60 deletions(-)

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 8e976b65..ae2bae58 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -1,5 +1,14 @@
 exclude: '^ethosu/vela/(tflite|ethos_u55_regs|tosa)/'
 repos:
+-   repo: https://github.com/pre-commit/mirrors-mypy
+    rev: 'v0.931'
+    hooks:
+    -   id: mypy
+        args: ["--no-strict-optional", "--show-error-codes", "--ignore-missing-imports"]
+        require_serial: true
+        additional_dependencies: [types-setuptools]
+        minimum_pre_commit_version: '2.9.2'
+
 -   repo: https://github.com/asottile/reorder_python_imports
     rev: v2.2.0
     hooks:
diff --git a/ethosu/mlw_codec/test/test_mlw_codec.py b/ethosu/mlw_codec/test/test_mlw_codec.py
index 18c828a3..3ff26e53 100644
--- a/ethosu/mlw_codec/test/test_mlw_codec.py
+++ b/ethosu/mlw_codec/test/test_mlw_codec.py
@@ -15,6 +15,9 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # Simple example of the usage of mlw_codec.
+from typing import Any
+from typing import List
+
 import pytest
 
 from ethosu import mlw_codec
@@ -68,7 +71,7 @@ class TestMLWCodec:
         with pytest.raises(Exception):
             mlw_codec.encode(input)
 
-    invalid_decode_test_data = [None, 3, []]
+    invalid_decode_test_data: List[Any] = [None, 3, []]
 
     @pytest.mark.parametrize("input", invalid_decode_test_data)
     def test_decode_invalid_input(self, input):
diff --git a/ethosu/vela/architecture_allocator.py b/ethosu/vela/architecture_allocator.py
index 84d8354b..d27f1264 100644
--- a/ethosu/vela/architecture_allocator.py
+++ b/ethosu/vela/architecture_allocator.py
@@ -17,8 +17,10 @@
 # Description: Architecture SHRAM allocator
 import enum
 import math
+from typing import Dict
 from typing import Optional
 from typing import Tuple
+from typing import Union
 
 from .architecture_features import ArchitectureFeatures
 from .architecture_features import Block
@@ -77,8 +79,8 @@ class ElementwiseUsage(enum.IntEnum):
 def _try_block_config(
     shram: SHRAMConfig,
     ew_usage: ElementwiseUsage,
-    ofm_block: Block,
-    ifm_block: Block,
+    ofm_block: Union[Shape4D, Block],
+    ifm_block: Union[Shape4D, Block],
     ifm_bits: int,
     ifm_granule: int,
     acc_bits: int,
@@ -86,7 +88,7 @@ def _try_block_config(
     lut_banks: int,
     ifm_depth_buf_scaling: int,
     cores: int,
-) -> SHRAMLayout:
+) -> Union[SHRAMLayout, None]:
     assert (acc_bits > 0) and (acc_granule > 0)
     assert (ifm_bits >= 8) and ((ifm_bits % 8) == 0) and (ifm_granule > 0)
 
@@ -173,7 +175,7 @@ def to_upscale(ifm_resampling: resampling_mode) -> int:
     return 1 if ifm_resampling == resampling_mode.NONE else 2
 
 
-def _ifm_blockdepth(arch, ifm_shape: Shape4D, ifm_bits: int, is_partkernel: bool):
+def _ifm_blockdepth(arch, ifm_shape: Union[Shape4D, Block], ifm_bits: int, is_partkernel: bool):
     if ifm_bits == 16:
         ifm_blockdepth = round_up(min(ifm_shape.depth, 16), 4)
     else:
@@ -185,7 +187,9 @@ def _required_size(value: int, stride: int, border: int, upscale: int, nearest:
     return int(math.ceil(((value - 1) * stride + border + nearest) / upscale))
 
 
-def get_ifm_area_required(ofm_shape: Shape4D, kernel: Kernel, resampling_mode: resampling_mode) -> Tuple[int, int]:
+def get_ifm_area_required(
+    ofm_shape: Union[Shape4D, Block], kernel: Kernel, resampling_mode: resampling_mode
+) -> Tuple[int, int]:
     upscale = to_upscale(resampling_mode)
     nearest = is_nearest(resampling_mode)
     h1 = _required_size(ofm_shape.height, kernel.stride.y, kernel.area_height(), upscale, nearest)
@@ -194,7 +198,7 @@ def get_ifm_area_required(ofm_shape: Shape4D, kernel: Kernel, resampling_mode: r
 
 
 def _get_ifm_blocksize(
-    ofm_block: Shape4D, kernel: Kernel, ublock: Block, subkernel_limit: Block, upscale: int, nearest: bool
+    ofm_block: Union[Shape4D, Block], kernel: Kernel, ublock: Block, subkernel_limit: Block, upscale: int, nearest: bool
 ) -> Shape4D:
     # IFM block height
     h1 = _required_size(
@@ -213,7 +217,9 @@ def _get_ifm_blocksize(
     return Shape4D(1, height, width, ofm_block.depth)
 
 
-def fit_block_for_ofm(arch: ArchitectureFeatures, ofm_shape: Shape4D, kernel: Kernel, block: Shape4D):
+def fit_block_for_ofm(
+    arch: ArchitectureFeatures, ofm_shape: Union[Shape4D, Block], kernel: Kernel, block: Union[Shape4D, Block]
+):
     # 256/512 Conv1D optimisation (ratio of IFM:Accumulators changes) This is a specific
     # interpretation of a more general constraint that can't be applied because the
     # find_block_config function must return block configs that can be applied to any OFM shape.
@@ -227,14 +233,14 @@ def find_block_config(
     npu_op_type: NpuBlockType,
     ofm_shape: Shape4D,
     ifm_shape: Shape4D,
-    ifm2_shape: Shape4D,
+    ifm2_shape: Optional[Shape4D],
     uses_scalar: bool,
     ifm_bits: int,
     kernel: Kernel,
     lut_banks: int,
     scaled: bool,
     ifm_resampling: resampling_mode,
-) -> ArchitectureBlockConfig:
+) -> Optional[ArchitectureBlockConfig]:
     SplitDepth = ArchitectureFeatures.OFMSplitDepth
     # Elementwise larger-volume correction
     if ifm2_shape is not None and ifm2_shape.elements() > ifm_shape.elements():
@@ -296,7 +302,7 @@ def find_block_config(
         depth = round_up(depth, SplitDepth)
 
     while depth <= search_space.depth:
-        wont_fit = {}
+        wont_fit: Dict[Tuple[int, int], bool] = {}
         for height in range(arch.ofm_ublock.height, search_space.height + 1, arch.ofm_ublock.height):
             for width in range(arch.ofm_ublock.width, search_space.width + 1, arch.ofm_ublock.width):
                 # Avoid checking W/H transposed blocks that already didn't fit. i.e. if 8x4x16 didn't
@@ -315,8 +321,8 @@ def find_block_config(
                 layout = _try_block_config(
                     arch.shram,
                     ew_usage,
-                    ofm_block,
-                    ifm_block,
+                    Block(ofm_block.width, ofm_block.height, ofm_block.depth),
+                    Block(ifm_block.width, ifm_block.height, ifm_block.depth),
                     ifm_bits,
                     ifm_granule,
                     acc_bits,
@@ -385,9 +391,9 @@ def try_block_config(
     block_config: Block,
     arch: ArchitectureFeatures,
     npu_op_type: NpuBlockType,
-    ofm_shape: Block,
-    ifm_shape: Block,
-    ifm2_shape: Optional[Block],
+    ofm_shape: Union[Shape4D, Block],
+    ifm_shape: Union[Shape4D, Block],
+    ifm2_shape: Optional[Union[Shape4D, Block]],
     uses_scalar: bool,
     ifm_bits: int,
     is_partkernel: bool,
diff --git a/ethosu/vela/high_level_command_stream.py b/ethosu/vela/high_level_command_stream.py
index 7e60221d..0009f6cf 100644
--- a/ethosu/vela/high_level_command_stream.py
+++ b/ethosu/vela/high_level_command_stream.py
@@ -16,6 +16,7 @@
 # Description:
 # Contains classes that hold commands for the high-level command stream (one command per DMA or NPU stripe).
 from typing import List
+from typing import Optional
 
 import numpy as np
 
@@ -41,8 +42,8 @@ class Box:
         npu_block_type: NpuBlockType,
         concat_offsets: List[int],
         k_dilated_height: int,
-        split_offset: Shape4D = None,
-        split_shape: Shape4D = None,
+        split_offset: Optional[Shape4D] = None,
+        split_shape: Optional[Shape4D] = None,
         upscaling_factor: int = 1,
     ):
         new_start_coord = list(self.start_coord)
diff --git a/ethosu/vela/high_level_command_to_npu_op.py b/ethosu/vela/high_level_command_to_npu_op.py
index 6c403c86..f7c91aa2 100644
--- a/ethosu/vela/high_level_command_to_npu_op.py
+++ b/ethosu/vela/high_level_command_to_npu_op.py
@@ -17,9 +17,11 @@
 # Description:
 # Conversion from high level command to NpuOperation
 from enum import IntEnum
+from typing import cast
 from typing import Dict
 from typing import List
 from typing import Optional
+from typing import Tuple
 
 from .api import NpuActivation
 from .api import NpuActivationOp
@@ -66,6 +68,7 @@ from .tensor import Tensor
 from .tensor import TensorFormat
 from .tensor import TensorPurpose
 from .tensor import TensorSubPurpose
+from .weight_compressor import NpuWeightTensor
 from .weight_compressor import WeightKey
 
 
@@ -294,17 +297,17 @@ def create_feature_map(tens: Tensor, box: Box, arch: ArchitectureFeatures, op_sh
 
 
 def create_weights(
-    weight_tensor: Tensor, weight_box: Box, scale_tensor: Tensor, arch: ArchitectureFeatures
-) -> List[NpuAddressRange]:
+    weight_tensor: NpuWeightTensor, weight_box: Box, scale_tensor: NpuWeightTensor, arch: ArchitectureFeatures
+) -> Tuple[List[NpuAddressRange], List[NpuAddressRange]]:
     """Returns address ranges for weights and scales"""
     weights = []
     biases = []
     shared_region = get_region(weight_tensor.mem_type, arch)
-    scale_region = scale_tensor and get_region(scale_tensor.mem_type, arch)
+    scale_region = get_region(scale_tensor.mem_type, arch) if scale_tensor else 0
 
     w_tensor_src = weight_tensor
     if weight_tensor.src_tensor:
-        w_tensor_src = weight_tensor.src_tensor
+        w_tensor_src = cast(NpuWeightTensor, weight_tensor.src_tensor)
 
     core_offset = 0
     for core in range(0, arch.ncores):
diff --git a/ethosu/vela/hillclimb_allocation.py b/ethosu/vela/hillclimb_allocation.py
index 5e02dac0..2271fe9c 100644
--- a/ethosu/vela/hillclimb_allocation.py
+++ b/ethosu/vela/hillclimb_allocation.py
@@ -101,7 +101,7 @@ class HillClimbAllocator:
             LiveRangeInfo(id, lr.start_time, lr.end_time, lr.size, lr.get_alignment())
             for id, lr in enumerate(live_ranges)
         ]
-        self.lrs_at_time = []
+        self.lrs_at_time: List[List[LiveRangeInfo]] = []
         # The available size (input to algorithm).
         self.available_size: int = 0
         # The algorithm stops once the target size has been achieved
@@ -227,8 +227,8 @@ class HillClimbAllocator:
         # - direct neighbours of the bottleneck live range
         # - direct and indirect predecessors of these neighbours + bottleneck
         # The turns at which these live ranges were allocated are put in the turns set.
-        turn_set = set()
-        turn_list = list()
+        turn_set: Set[int] = set()
+        turn_list: List[int] = list()
         self.add_predecessor_turns(turn_set, turn_list, max_lr)
         for lr2 in max_lr.neighbours:
             self.add_predecessor_turns(turn_set, turn_list, lr2)
diff --git a/ethosu/vela/nn_graph.py b/ethosu/vela/nn_graph.py
index 8a2517de..671843f3 100644
--- a/ethosu/vela/nn_graph.py
+++ b/ethosu/vela/nn_graph.py
@@ -130,6 +130,7 @@ class CascadedPass:
         self.predecessors = []
         self.successors = []
         self.sram_used = 0
+        self.time = 0
 
     def __str__(self):
         return "<nng.CascadedPass strategy=%s x %s '%s',  passes=%s, block_configs=%s>" % (
@@ -537,6 +538,9 @@ class Graph:
         self.total_npu_weights = 0
         self.total_npu_encoded_weights = 0
         self.weight_cache = None  # See CompressedWeightCache
+        self.bandwidths = 0
+        self.macs = 0
+        self.cycles = 0
 
     def get_root_subgraph(self):
         return self.subgraphs[0]
diff --git a/ethosu/vela/npu_performance.py b/ethosu/vela/npu_performance.py
index 34530ae8..8c4aee63 100644
--- a/ethosu/vela/npu_performance.py
+++ b/ethosu/vela/npu_performance.py
@@ -22,6 +22,7 @@
 import copy
 from enum import auto
 from enum import IntEnum
+from typing import Optional
 from typing import Set
 from uuid import UUID
 
@@ -580,7 +581,7 @@ def update_summary_cycles(arch, bws, cycles):
 
 
 def estimate_full_op_performance(
-    arch, schedule: Schedule, op: SchedulerOperation, prev_op: SchedulerOperation, block_config
+    arch, schedule: Schedule, op: SchedulerOperation, prev_op: Optional[SchedulerOperation], block_config
 ):
     cycles_a = make_cycles_array()
     bws = make_bandwidth_array()
diff --git a/ethosu/vela/operation.py b/ethosu/vela/operation.py
index 277f2de5..5a6423d8 100644
--- a/ethosu/vela/operation.py
+++ b/ethosu/vela/operation.py
@@ -15,6 +15,9 @@
 # limitations under the License.
 # Description:
 # Internal representation of a Neural Network Operation.
+# Postpone evaluation of annotations so class names can be forward-referenced in type hints (see PEP 563).
+from __future__ import annotations
+
 import copy
 from collections import namedtuple
 from enum import Enum
@@ -24,13 +27,14 @@ from typing import List
 from typing import Optional
 from typing import Tuple
 from typing import TYPE_CHECKING
+from typing import Union
 
 from .api import NpuRoundingMode
 from .errors import VelaError
 from .numeric_util import full_shape
 from .shape4d import Shape4D
 
-
+# Import needed only for type annotations; guarded by TYPE_CHECKING to avoid a cyclic import at run time.
 if TYPE_CHECKING:
     from .tensor import Tensor
 
@@ -80,9 +84,6 @@ class Kernel:
     def area_height(self) -> int:
         return (self.height - 1) * self.dilation.y + 1
 
-    def dilation(self) -> PointXY:
-        return self.dilation
-
     def dilated_wh(self) -> Tuple[int, int]:
         """Returns the dilated kernel width/height"""
         return self.dilation.x * (self.width - 1) + 1, self.dilation.y * (self.height - 1) + 1
@@ -443,7 +444,7 @@ def create_activation_function(op_type: Op, min=None, max=None) -> ActivationFun
     return act
 
 
-def get_slice_offsets(input_shape: List[int], offset_tens: int, offset_mask: int, is_begin: bool = True):
+def get_slice_offsets(input_shape: List[int], offset_tens: Tensor, offset_mask: int, is_begin: bool = True):
     # For strided slice operator: get start or end offsets
     offsets = len(input_shape) * [0] if is_begin else input_shape[:]
     for idx in range(len(input_shape)):
@@ -493,7 +494,7 @@ class Operation:
         self.type = op_type
         self.name = name
         self.attrs: Dict[str, Any] = {}
-        self.inputs: List[Tensor] = []
+        self.inputs: List[Optional[Tensor]] = []
         self.outputs: List[Tensor] = []
         self.intermediates: List[Tensor] = []
         self.flops = 0
@@ -514,9 +515,9 @@ class Operation:
         self.ofm_shapes: List[Shape4D] = []
         # If not none: contains rescale to be used as output scaling
         # (which overrides the ofm tensor's scale)
-        self.rescale = None
-        self.read_offsets: List[Shape4D] = [None, None]  # offset for [ifm, ifm2]
-        self.read_shapes: List[Shape4D] = [None, None]  # read shape for [ifm, ifm2]
+        self.rescale: Optional[Union[Tuple[int, int], ExplicitScaling]] = None
+        self.read_offsets: List[Optional[Shape4D]] = [None, None]  # offset for [ifm, ifm2]
+        self.read_shapes: List[Optional[Shape4D]] = [None, None]  # read shape for [ifm, ifm2]
         self.rounding_mode: Optional[NpuRoundingMode] = None
         # Rescale op in TOSA supplies explicit multiplier and shift values
         self.explicit_scaling: Optional[ExplicitScaling] = None
diff --git a/ethosu/vela/operation_util.py b/ethosu/vela/operation_util.py
index 29caf6d0..36a8e592 100644
--- a/ethosu/vela/operation_util.py
+++ b/ethosu/vela/operation_util.py
@@ -234,7 +234,7 @@ def create_binary_elementwise(
     op_type: Op,
     name: str,
     ifm: Tensor,
-    ifm2: Tensor,
+    ifm2: Optional[Tensor],
     quantization: QuantizationParameters,
     activation: Optional[ActivationFunction] = None,
     dtype: Optional[DataType] = None,
diff --git a/ethosu/vela/register_command_stream_generator.py b/ethosu/vela/register_command_stream_generator.py
index fd32b655..3be2898c 100644
--- a/ethosu/vela/register_command_stream_generator.py
+++ b/ethosu/vela/register_command_stream_generator.py
@@ -21,6 +21,7 @@ import math
 from collections import defaultdict
 from enum import Enum
 from enum import IntEnum
+from typing import cast
 from typing import Dict
 from typing import List
 from typing import Optional
@@ -319,7 +320,7 @@ def generate_activation(emit: CommandStreamEmitter, activation: Optional[NpuActi
             quantized_min = max(-128, quantized_min)
             quantized_max = min(127, quantized_max)
     else:
-        activation_value = activation_op_map[act.op_type]
+        activation_value = cast(int, activation_op_map[act.op_type])
     emit.cmd0_with_param(cmd0.NPU_SET_ACTIVATION, activation_value)
     emit.cmd0_with_param(cmd0.NPU_SET_ACTIVATION_MIN, quantized_min)
     emit.cmd0_with_param(cmd0.NPU_SET_ACTIVATION_MAX, quantized_max)
@@ -584,7 +585,7 @@ def get_arch_block_config(
         block_config,
         arch,
         block_type,
-        npu_op.ofm.shape,
+        shape3d_to_block(npu_op.ofm.shape),
         ifm_shape,
         ifm2_shape,
         uses_scalar,
@@ -741,6 +742,8 @@ def generate_scaling_for_elementwise(emit: CommandStreamEmitter, npu_op: NpuElem
                 ofm_scale, shift = scaling.elementwise_mul_scale(input_scale, input2_scale, output_scale)
             emit.cmd1_with_offset(cmd1.NPU_SET_OFM_SCALE, ofm_scale, shift)
         else:  # Add/Sub
+            opa_scale: float
+            opb_scale: float
             bitdepth = npu_op.ifm.data_type.size_in_bits()
             use_advanced_scaling = False
             if None in (input_scale, input2_scale, output_scale):
@@ -799,7 +802,7 @@ def generate_scaling_for_elementwise(emit: CommandStreamEmitter, npu_op: NpuElem
 # -------------------------------------------------------------------
 
 
-def print_feature_map(fm: NpuFeatureMap, name: str):
+def print_feature_map(fm: Optional[NpuFeatureMap], name: str):
     if fm is not None:
         q = (
             "no quantization"
diff --git a/ethosu/vela/register_command_stream_util.py b/ethosu/vela/register_command_stream_util.py
index 3751d88e..83126ead 100644
--- a/ethosu/vela/register_command_stream_util.py
+++ b/ethosu/vela/register_command_stream_util.py
@@ -163,7 +163,7 @@ def get_h_ranges(
     return [get_address_range(fm, strides, y, x0, c0, y, x1, c1) for y in range(y0, y1 + 1)]
 
 
-def get_address_ranges_for_area(fm: NpuFeatureMap, start: PointXYZ, end: PointXYZ) -> List[NpuAddressRange]:
+def get_address_ranges_for_area(fm: NpuFeatureMap, start: PointXYZ, end: PointXYZ) -> List[Optional[NpuAddressRange]]:
     """
     Returns a list of adddress ranges that covers the area start - end (inclusive).
     Divides the area in horizontal "stripes" of height 1, and returns the address ranges for these "stripes".
@@ -183,7 +183,7 @@ def get_address_ranges_for_area(fm: NpuFeatureMap, start: PointXYZ, end: PointXY
     h, w, c = fm.shape
     y0, x0, c0 = start.y, start.x, start.z
     y1, x1, c1 = min(end.y, h - 1), min(end.x, w - 1), min(end.z, c - 1)
-    ranges = []
+    ranges: List[Optional[NpuAddressRange]] = []
     if x0 < width_0 and y0 < height_0:
         # Horizontal ranges for tile 0
         ranges.extend(get_h_ranges(fm, strides, y0, x0, c0, min(y1, height_0 - 1), min(x1, width_0 - 1), c1))
@@ -373,7 +373,7 @@ def intersects(
     else:
         # The OFM produces a part of the IFM (e.g. a stripe), or the IFM consumes part of the OFM.
         # In this case, address comparison between the two areas is needed
-        ifm_ranges = get_address_ranges_for_area(ifm, ifm_start_coord, ifm_end_coord)
+        ifm_ranges: List[Optional[NpuAddressRange]] = get_address_ranges_for_area(ifm, ifm_start_coord, ifm_end_coord)
         prev_ofm_ranges = get_address_ranges_for_area(prev_ofm, ofm_start_coord, ofm_end_coord)
         res = range_lists_overlap(ifm_ranges, prev_ofm_ranges)
     return res
diff --git a/ethosu/vela/scheduler.py b/ethosu/vela/scheduler.py
index 284848f5..73133bcd 100644
--- a/ethosu/vela/scheduler.py
+++ b/ethosu/vela/scheduler.py
@@ -17,6 +17,9 @@
 # Description:
 # The scheduler creates and searches for an optimal plan for the network, selecting block configurations and
 # subdivisions for the Operators
+# Postpone evaluation of annotations so class names can be forward-referenced in type hints (see PEP 563).
+from __future__ import annotations
+
 import copy
 from collections import namedtuple
 from enum import auto
@@ -25,6 +28,11 @@ from typing import Dict
 from typing import List
 from typing import Optional
 from typing import Tuple
+from typing import TYPE_CHECKING
+
+# Import needed only for type annotations; guarded by TYPE_CHECKING to avoid a cyclic import at run time.
+if TYPE_CHECKING:
+    from .npu_performance import CycleCost
 
 import numpy as np
 
@@ -57,6 +65,7 @@ from .tensor import Tensor
 from .tensor import TensorFormat
 from .tensor import TensorPurpose
 from .tensor import TensorSubPurpose
+from .weight_compressor import NpuWeightTensor
 
 
 def shape_for_format(shape: Shape4D, tensor_format: TensorFormat) -> Shape4D:
@@ -95,10 +104,10 @@ class SchedulerOpInfo:
         self.cascade = 0  # Assigned by CascadeBuilder. 0 means not part of a cascade
         self.time_index = None  # Set by update_op_memory_snapshot
         self.ofm_depth_slices: List[int] = [0, stripe.depth]
-        self.npu_weights_tensor = None
-        self.npu_scales_tensor = None
-        self.buffered_weight_tensor = None
-        self.cycles = None
+        self.npu_weights_tensor: Optional[NpuWeightTensor] = None
+        self.npu_scales_tensor: Optional[NpuWeightTensor] = None
+        self.buffered_weight_tensor: Optional[Tensor] = None
+        self.cycles: Optional[CycleCost] = None
         self.slack_buffering_cycles = 0
         self.slack_buffering_memory = 0
         self.full_weight_transfer_cycles = 0
@@ -230,7 +239,7 @@ class SchedulerOperation:
     def create_scheduler_info(self, nng: Graph, stripe: Shape4D) -> SchedulerOpInfo:
         """Returns schedule info about this SchedulerOperation based on how many ofm elements it should produce"""
         ifm_shape = self.ifm.shape
-        ifm2_shape = self.ifm2 and self.ifm2.shape
+        ifm2_shape = self.ifm2.shape if self.ifm2 is not None else None
         ofm_shape = stripe
 
         if ofm_shape != self.ofm.shape:
@@ -273,14 +282,14 @@ class SchedulerOperation:
 
         return get_ifm_area_required(ofm_shape_to_produce, self.kernel, self.resampling_mode)
 
-    def _calculate_min_stripe_input(self) -> Shape4D:
+    def _calculate_min_stripe_input(self) -> Tuple[int, int]:
         # Calculate the input volume required height and width for the smallest possible stripe (h,w = 1,1)
         min_stripe = self.ofm.shape.with_hw(1, 1)
         return self._get_stripe_input_requirement(min_stripe)
 
     def _get_block_config(
         self, ifm_shape: Shape4D, ifm2_shape: Optional[Shape4D], uses_scalar: bool, ofm_shape: Shape4D
-    ) -> ArchitectureBlockConfig:
+    ) -> Optional[ArchitectureBlockConfig]:
         # Returns a block config and SHRAM layout
         lut_banks = 2 if self.parent_op.activation_lut else 0
         return find_block_config(
@@ -325,7 +334,7 @@ class Schedule:
         self.cost_map: Dict[SchedulerOperation, SchedulerOpInfo] = {}
         self.cascades: Dict[int, CascadeInfo] = {}
         self.fast_storage_peak_usage = 0
-        self.memory_snapshot = None
+        self.memory_snapshot: Optional[List[int]] = None
 
     @property
     def name(self):
@@ -340,7 +349,7 @@ class Scheduler:
         self.sg = sg
         self.arch = arch
         self.sched_ops: List[SchedulerOperation] = []
-        self.max_schedule = None
+        self.max_schedule: Optional[Schedule] = None
         self.scheduler_options = options
 
     def avoid_nhcwb16_for_ofm(self, tens, ps, arch):
@@ -524,7 +533,7 @@ class Scheduler:
     def propose_operator_buffering(
         self,
         sched_op: SchedulerOperation,
-        prev_op: SchedulerOperation,
+        prev_op: Optional[SchedulerOperation],
         buffered_schedule: Schedule,
         ref_schedule: Schedule,
         staging_limit_bytes,
@@ -605,7 +614,7 @@ class Scheduler:
             cost.npu_scales_tensor = full_scales
             return
 
-        encoded_weights = full_weights
+        encoded_weights: Optional[NpuWeightTensor] = full_weights
         encoded_scales = full_scales
 
         # How many NPU cycles are available under the previously executing
@@ -681,7 +690,7 @@ class Scheduler:
                         cost.block_config,
                         cost.ofm_depth_slices,
                     )
-
+                    assert encoded_weights is not None
                     # Chosen buffering might not fit at all, iterate until it does
                     # or until the minimum usable slice size is reached
                     if (
@@ -747,7 +756,7 @@ class Scheduler:
         cost_map = min_schedule.cost_map
 
         # Keep track of the previous Op - which consumes the current Op's OFM
-        prev_op = None
+        prev_op: Optional[SchedulerOperation] = None
         for sched_op in reversed(self.sched_ops):
             min_stripe_height = prev_op.kernel.stride.y if prev_op else 1
             min_stripe = sched_op.ofm.shape.with_height(min_stripe_height)
diff --git a/ethosu/vela/tensor.py b/ethosu/vela/tensor.py
index 82de8973..19016a0f 100644
--- a/ethosu/vela/tensor.py
+++ b/ethosu/vela/tensor.py
@@ -420,7 +420,7 @@ class Tensor:
         self.ifm_write_protected = False
 
         # Reference to parent-tensor if this tensor is a clone
-        self.src_tensor = None
+        self.src_tensor: Optional[Tensor] = None
 
     @property
     def address(self) -> int:
diff --git a/ethosu/vela/tensor_allocation.py b/ethosu/vela/tensor_allocation.py
index c82140c5..c8b5129d 100644
--- a/ethosu/vela/tensor_allocation.py
+++ b/ethosu/vela/tensor_allocation.py
@@ -91,7 +91,7 @@ def verify_allocation(live_ranges: LiveRangeGraph, alignment: int):
     verify_alignment(live_ranges, alignment)
     nr_time_slots = 1 + max(lr.end_time for lr in live_ranges.lrs)
     # Contains active live ranges at each timestamp
-    lrs_at_time = [[] for i in range(nr_time_slots)]
+    lrs_at_time: List[List[LiveRange]] = [[] for i in range(nr_time_slots)]
     for lr in live_ranges.lrs:
         for t in range(lr.start_time, lr.end_time + 1):
             lrs_at_time[t].append(lr)
diff --git a/ethosu/vela/weight_compressor.py b/ethosu/vela/weight_compressor.py
index 68817035..22fe512e 100644
--- a/ethosu/vela/weight_compressor.py
+++ b/ethosu/vela/weight_compressor.py
@@ -17,6 +17,8 @@
 # Compresses and pads the weigths. It also calculates the scales and packs with the biases.
 from collections import namedtuple
 from collections import OrderedDict
+from typing import Dict
+from typing import Optional
 from typing import Tuple
 
 import numpy as np
@@ -75,7 +77,7 @@ class NpuWeightTensor(Tensor):
 class CompressedWeightCache:
     """Global tensor weight compression cache"""
 
-    cache = {}
+    cache: Dict[WeightCompressionConfig, Tensor] = {}
 
     @staticmethod
     def get_tensor_with_same_compression(wcc):
@@ -279,7 +281,7 @@ def _prepare_scale_and_bias(arch, tens, rescale_for_faf, explicit_scaling):
 
 def encode_weight_and_scale_tensor(
     arch, op, weight_tens, scale_tens, kernel, block_config, depth_offsets, rescale_for_faf=False
-) -> (NpuWeightTensor, NpuWeightTensor):
+) -> Tuple[Optional[NpuWeightTensor], Optional[NpuWeightTensor]]:
     npu_block_type = op.type.npu_block_type
 
     ifm_scale = scale_tens and scale_tens.consumer_list[0].get_input_quantization().scale_f32
diff --git a/setup.py b/setup.py
index 488d2966..031401e7 100644
--- a/setup.py
+++ b/setup.py
@@ -98,6 +98,6 @@ setup(
     ],
     entry_points={"console_scripts": ["vela = ethosu.vela.vela:main"]},
     ext_modules=[mlw_module],
-    cmdclass={"build_ext": BuildExtension},
+    cmdclass={"build_ext": BuildExtension},  # type: ignore[dict-item]
     setup_requires=["numpy>=1.16.6,<=1.19.5", "setuptools_scm"],
 )
-- 
cgit v1.2.1