about | summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Jonas Ohlsson <jonas.ohlsson@arm.com> 2022-03-30 10:30:25 +0200
committer: Jonas Ohlsson <jonas.ohlsson@arm.com> 2022-03-30 15:54:14 +0200
commit: d85750702229af97c0b0bbda6e397a23254b6144 (patch)
tree: 389962105a35d5cef595cfeb5d640bd59a0d0ff8
parent: cc5f4de1c35ba44fca7ff6295c6ae846f8242344 (diff)
download: ethos-u-vela-d85750702229af97c0b0bbda6e397a23254b6144.tar.gz
Update version of Black to 22.3.0
Update version of Black to 22.3.0 due to updated dependencies.
Updates to fix reported issues due to new version.

Signed-off-by: Jonas Ohlsson <jonas.ohlsson@arm.com>
Change-Id: I60056aae452093ce8dcea1f499ecced22b25eef1
-rw-r--r-- .pre-commit-config.yaml 2
-rw-r--r-- ethosu/mlw_codec/test/test_mlw_codec.py 2
-rw-r--r-- ethosu/vela/api.py 4
-rw-r--r-- ethosu/vela/architecture_features.py 11
-rw-r--r-- ethosu/vela/compiler_driver.py 12
-rw-r--r-- ethosu/vela/driver_actions.py 3
-rw-r--r-- ethosu/vela/high_level_command_stream_generator.py 8
-rw-r--r-- ethosu/vela/live_range.py 6
-rw-r--r-- ethosu/vela/npu_performance.py 24
-rw-r--r-- ethosu/vela/pass_packing.py 9
-rw-r--r-- ethosu/vela/range_set.py 2
-rw-r--r-- ethosu/vela/register_command_stream_generator.py 17
-rw-r--r-- ethosu/vela/register_command_stream_util.py 25
-rw-r--r-- ethosu/vela/scheduler.py 59
-rw-r--r-- ethosu/vela/softmax.py 49
-rw-r--r-- ethosu/vela/stats_writer.py 12
-rw-r--r-- ethosu/vela/tensor_allocation.py 23
-rw-r--r-- ethosu/vela/test/extapi/test_extapi_encode_weights.py 10
-rw-r--r-- ethosu/vela/test/test_register_command_stream_util.py 84
-rw-r--r-- ethosu/vela/test/test_tflite_model_semantic.py 22
-rw-r--r-- ethosu/vela/test/test_tflite_supported_operators.py 37
-rw-r--r-- ethosu/vela/tflite_graph_optimiser.py 57
-rw-r--r-- ethosu/vela/tflite_mapping.py 21
-rw-r--r-- ethosu/vela/tflite_model_semantic.py 39
-rw-r--r-- ethosu/vela/tflite_supported_operators.py 99
-rw-r--r-- ethosu/vela/tosa_graph_optimiser.py 56
-rw-r--r-- ethosu/vela/tosa_supported_operators.py 25
-rw-r--r-- ethosu/vela/weight_compressor.py 3
28 files changed, 598 insertions, 123 deletions
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index ae2bae5..9c70706 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -15,7 +15,7 @@ repos:
- id: reorder-python-imports
- repo: https://github.com/psf/black
- rev: 19.10b0
+ rev: 22.3.0
hooks:
- id: black
language_version: python3
diff --git a/ethosu/mlw_codec/test/test_mlw_codec.py b/ethosu/mlw_codec/test/test_mlw_codec.py
index 3ff26e5..d77c82a 100644
--- a/ethosu/mlw_codec/test/test_mlw_codec.py
+++ b/ethosu/mlw_codec/test/test_mlw_codec.py
@@ -24,7 +24,7 @@ from ethosu import mlw_codec
class TestMLWCodec:
- """ This class is responsible to test the mlw_codec library
+ """This class is responsible to test the mlw_codec library
It mainly tests the two methods encode() and decode() with different inputs"""
weights = [0, 2, 3, 0, -1, -2, -3, 0, 0, 0, 1, -250, 240] * 3
diff --git a/ethosu/vela/api.py b/ethosu/vela/api.py
index f49df25..3382ea9 100644
--- a/ethosu/vela/api.py
+++ b/ethosu/vela/api.py
@@ -139,11 +139,11 @@ class NpuDataType(Enum):
return self.value[1]
def size_in_bits(self) -> int:
- """ Size of the data type in bits"""
+ """Size of the data type in bits"""
return self.value[0]
def size_in_bytes(self) -> int:
- """ Size of the data type in bytes"""
+ """Size of the data type in bytes"""
return self.value[0] // 8
def min_value(self) -> int:
diff --git a/ethosu/vela/architecture_features.py b/ethosu/vela/architecture_features.py
index e79ed72..08ff260 100644
--- a/ethosu/vela/architecture_features.py
+++ b/ethosu/vela/architecture_features.py
@@ -647,7 +647,9 @@ class ArchitectureFeatures:
else:
raise CliOptionError(
- "--system-config", self.system_config, f"Section {sys_cfg_section} not found in Vela config file",
+ "--system-config",
+ self.system_config,
+ f"Section {sys_cfg_section} not found in Vela config file",
)
# read the memory mode
@@ -678,7 +680,9 @@ class ArchitectureFeatures:
else:
raise CliOptionError(
- "--memory-mode", self.memory_mode, f"Section {mem_mode_section} not found in Vela config file",
+ "--memory-mode",
+ self.memory_mode,
+ f"Section {mem_mode_section} not found in Vela config file",
)
# override sram to onchipflash
@@ -777,7 +781,8 @@ class ArchitectureFeatures:
# check for recursion loop
if inheritance_section == section:
raise ConfigOptionError(
- "inherit", f"{inheritance_section}. This references its own section and recursion is not allowed",
+ "inherit",
+ f"{inheritance_section}. This references its own section and recursion is not allowed",
)
result = self._read_config(inheritance_section, key, result, found)
diff --git a/ethosu/vela/compiler_driver.py b/ethosu/vela/compiler_driver.py
index cf26eb3..2715c8f 100644
--- a/ethosu/vela/compiler_driver.py
+++ b/ethosu/vela/compiler_driver.py
@@ -44,10 +44,9 @@ from .tensor import Tensor
class CompilerOptions:
"""Set of options to change compiler behaviour - verbosity, targets, turning off passes.
-Note the difference between ArchitectureFeatures and CompilerOptions
-- ArchitectureFeatures is for changing the Ethos-U and system architecture
-- CompilerOptions is for changing the behaviour of the compiler
-"""
+ Note the difference between ArchitectureFeatures and CompilerOptions
+ - ArchitectureFeatures is for changing the Ethos-U and system architecture
+ - CompilerOptions is for changing the behaviour of the compiler"""
def __init__(
self,
@@ -194,7 +193,10 @@ def compiler_driver(nng, arch, options, scheduler_options, network_type):
# Calculate live ranges for all constant Npu tensors, in permanent storage
for sg in npu_subgraphs:
lr_graph_flash = live_range.create_linear_live_range_graph(
- sg, permanent_storage, MemType.Permanent_NPU, lr_graph=lr_graph_flash,
+ sg,
+ permanent_storage,
+ MemType.Permanent_NPU,
+ lr_graph=lr_graph_flash,
)
if npu_subgraphs:
diff --git a/ethosu/vela/driver_actions.py b/ethosu/vela/driver_actions.py
index 90af02c..4ad2a33 100644
--- a/ethosu/vela/driver_actions.py
+++ b/ethosu/vela/driver_actions.py
@@ -119,8 +119,7 @@ def emit_dump_shram(data: List[int]):
def create_driver_payload(register_command_stream: List[int], arch: ArchitectureFeatures) -> bytes:
- """Creates driver header and includes the given command
- """
+ """Creates driver header and includes the given command"""
# Prepare driver actions for this command tensor
da_list: List[int] = []
emit_fourcc(da_list, "COP1")
diff --git a/ethosu/vela/high_level_command_stream_generator.py b/ethosu/vela/high_level_command_stream_generator.py
index eef4e6d..81c0d5b 100644
--- a/ethosu/vela/high_level_command_stream_generator.py
+++ b/ethosu/vela/high_level_command_stream_generator.py
@@ -67,7 +67,13 @@ def generate_high_level_commands_for_sched_op(sched_op, schedule):
ofm_tensor = ps.ofm_tensor
# Get Tensors and Full Shapes
- (ifm_tensor, ifm2_tensor, uncomp_weight_tensor, _, _,) = parent_op.get_ifm_ifm2_weights_biases_ofm()
+ (
+ ifm_tensor,
+ ifm2_tensor,
+ uncomp_weight_tensor,
+ _,
+ _,
+ ) = parent_op.get_ifm_ifm2_weights_biases_ofm()
ifm = sched_op.ifm
ifm2 = sched_op.ifm2
ofm_shape = sched_op.ofm.shape
diff --git a/ethosu/vela/live_range.py b/ethosu/vela/live_range.py
index 45baf44..ccf4929 100644
--- a/ethosu/vela/live_range.py
+++ b/ethosu/vela/live_range.py
@@ -200,7 +200,11 @@ def merge_elementwise_op_ranges(sg, sched_op, lr_graph, target_mem_area, target_
def extract_live_ranges_from_cascaded_passes(
- sg, target_mem_area, target_mem_type_set, lr_graph=None, cpu_tensor_alignment=Tensor.AllocationQuantum,
+ sg,
+ target_mem_area,
+ target_mem_type_set,
+ lr_graph=None,
+ cpu_tensor_alignment=Tensor.AllocationQuantum,
):
if lr_graph is None:
lr_graph = LiveRangeGraph()
diff --git a/ethosu/vela/npu_performance.py b/ethosu/vela/npu_performance.py
index 4ffca49..0c8a907 100644
--- a/ethosu/vela/npu_performance.py
+++ b/ethosu/vela/npu_performance.py
@@ -59,14 +59,26 @@ class PassCycles(IntEnum):
Size = auto()
def display_name(self):
- return ("NPU", "SRAM Access", "DRAM Access", "On-chip Flash Access", "Off-chip Flash Access", "Total", "Size",)[
- self.value
- ]
+ return (
+ "NPU",
+ "SRAM Access",
+ "DRAM Access",
+ "On-chip Flash Access",
+ "Off-chip Flash Access",
+ "Total",
+ "Size",
+ )[self.value]
def identifier_name(self):
- return ("npu", "sram_access", "dram_access", "on_chip_flash_access", "off_chip_flash_access", "total", "size",)[
- self.value
- ]
+ return (
+ "npu",
+ "sram_access",
+ "dram_access",
+ "on_chip_flash_access",
+ "off_chip_flash_access",
+ "total",
+ "size",
+ )[self.value]
@staticmethod
def all():
diff --git a/ethosu/vela/pass_packing.py b/ethosu/vela/pass_packing.py
index 1fefdf4..8535fa0 100644
--- a/ethosu/vela/pass_packing.py
+++ b/ethosu/vela/pass_packing.py
@@ -87,7 +87,14 @@ quantization_ops = set((Op.Dequantize, Op.Max, Op.Min))
cpu_ops = set((Op.Softmax, Op.LRN, Op.Shape, Op.Pad, Op.AddN)) | quantization_ops
startup_init_ops = set((Op.Const, Op.Placeholder, Op.SubgraphInput))
-memory_only_ops = set((Op.Squeeze, Op.Reshape, Op.QuantizedReshape, Op.ExpandDims,))
+memory_only_ops = set(
+ (
+ Op.Squeeze,
+ Op.Reshape,
+ Op.QuantizedReshape,
+ Op.ExpandDims,
+ )
+)
test_sequence = [
diff --git a/ethosu/vela/range_set.py b/ethosu/vela/range_set.py
index f03174e..6b28282 100644
--- a/ethosu/vela/range_set.py
+++ b/ethosu/vela/range_set.py
@@ -21,7 +21,7 @@ from functools import lru_cache
class RangeSet:
"""A Range set class to track ranges and whether they intersect.
-Intended for e.g. tracking sets of memory ranges and whether two commands use the same memory areas."""
+ Intended for e.g. tracking sets of memory ranges and whether two commands use the same memory areas."""
def __init__(self, start=None, end=None, ranges=None):
if ranges is None:
diff --git a/ethosu/vela/register_command_stream_generator.py b/ethosu/vela/register_command_stream_generator.py
index 3be2898..be01a75 100644
--- a/ethosu/vela/register_command_stream_generator.py
+++ b/ethosu/vela/register_command_stream_generator.py
@@ -521,7 +521,8 @@ def generate_biases(emit: CommandStreamEmitter, biases: List[NpuAddressRange], a
def generate_block_config(
- emit: CommandStreamEmitter, block_config: NpuShape3D,
+ emit: CommandStreamEmitter,
+ block_config: NpuShape3D,
):
"""Generates OFM_BLK_HEIGHT/WIDTH/DEPTH registers"""
emit.cmd0_with_param(cmd0.NPU_SET_OFM_BLK_HEIGHT_M1, block_config.height - 1)
@@ -530,7 +531,9 @@ def generate_block_config(
def generate_shram_registers(
- emit: CommandStreamEmitter, npu_op: NpuBlockOperation, arch_block_config: ArchitectureBlockConfig,
+ emit: CommandStreamEmitter,
+ npu_op: NpuBlockOperation,
+ arch_block_config: ArchitectureBlockConfig,
):
"""Generates IB_END/IB_START/AB_START/ACC_FORMAT registers"""
emit.cmd0_with_param(cmd0.NPU_SET_IFM_IB_END, arch_block_config.layout.ib_end)
@@ -775,9 +778,13 @@ def generate_scaling_for_elementwise(emit: CommandStreamEmitter, npu_op: NpuElem
if use_advanced_scaling:
# Use advanced implementation only when input/output scales differ,
# or when we can't guarantee the absence of rounding errors
- (opa_scale, opa_shift, ofm_scale, shift, op_to_scale,) = scaling.advanced_elementwise_add_sub_scale(
- input_scale, input2_scale, output_scale, bitdepth
- )
+ (
+ opa_scale,
+ opa_shift,
+ ofm_scale,
+ shift,
+ op_to_scale,
+ ) = scaling.advanced_elementwise_add_sub_scale(input_scale, input2_scale, output_scale, bitdepth)
opb_scale = 0 # Unused for this case
if npu_op.reversed_operands:
# If the operand order is reversed we also have to swap which operand is scaled
diff --git a/ethosu/vela/register_command_stream_util.py b/ethosu/vela/register_command_stream_util.py
index 83126ea..b2c84d7 100644
--- a/ethosu/vela/register_command_stream_util.py
+++ b/ethosu/vela/register_command_stream_util.py
@@ -204,7 +204,16 @@ def get_address_ranges(fm: NpuFeatureMap) -> List[Optional[NpuAddressRange]]:
strides = get_strides(fm)
height, width, depth = fm.shape.height, fm.shape.width, fm.shape.depth
height_0, height_1, width_0 = fm.tiles.height_0, fm.tiles.height_1, fm.tiles.width_0
- t0 = get_address_range(fm, strides, 0, 0, 0, min(height, height_0) - 1, min(width, width_0) - 1, depth - 1,)
+ t0 = get_address_range(
+ fm,
+ strides,
+ 0,
+ 0,
+ 0,
+ min(height, height_0) - 1,
+ min(width, width_0) - 1,
+ depth - 1,
+ )
if width > width_0:
t1 = get_address_range(fm, strides, 0, width_0, 0, min(height, height_1) - 1, width - 1, depth - 1)
else:
@@ -443,7 +452,9 @@ def get_first_job_input_volume(
# IFM block that will be sampled for the FIRST+block_offset job in the next operator's OFM
start_coord = PointXYZ(x=ifm_coord_x, y=ifm_coord_y, z=ifm_coord_z)
end_coord = PointXYZ(
- x=start_coord[0] + ifm_block.width, y=start_coord[1] + ifm_block.height, z=start_coord[2] + ifm_block.depth,
+ x=start_coord[0] + ifm_block.width,
+ y=start_coord[1] + ifm_block.height,
+ z=start_coord[2] + ifm_block.depth,
)
return (start_coord, end_coord, 1) # start, end, total jobs
@@ -456,12 +467,18 @@ def get_prev_job_output_volume(ofm: Rect, ofm_block: Block, block_offset: int):
if start_coord is None:
return None
end_coord = PointXYZ(
- x=start_coord.x + ofm_block.width, y=start_coord.y + ofm_block.height, z=start_coord.z + ofm_block.depth,
+ x=start_coord.x + ofm_block.width,
+ y=start_coord.y + ofm_block.height,
+ z=start_coord.z + ofm_block.depth,
)
return (start_coord, end_coord, 1) # start, end, total jobs for this OFM block
-def calc_blockdep(arch: ArchitectureFeatures, prev_op: Optional[NpuBlockOperation], npu_op: NpuBlockOperation,) -> int:
+def calc_blockdep(
+ arch: ArchitectureFeatures,
+ prev_op: Optional[NpuBlockOperation],
+ npu_op: NpuBlockOperation,
+) -> int:
"""Calculates the value of the BLOCKDEP register"""
if prev_op is None:
return 0
diff --git a/ethosu/vela/scheduler.py b/ethosu/vela/scheduler.py
index fe2d711..a19d053 100644
--- a/ethosu/vela/scheduler.py
+++ b/ethosu/vela/scheduler.py
@@ -113,7 +113,13 @@ class SchedulerOpInfo:
self.full_weight_transfer_cycles = 0
def copy(self):
- res = SchedulerOpInfo(self.block_config, self.weights_size, self.stripe_input, self.stripe_input2, self.stripe,)
+ res = SchedulerOpInfo(
+ self.block_config,
+ self.weights_size,
+ self.stripe_input,
+ self.stripe_input2,
+ self.stripe,
+ )
res.cascade = self.cascade
return res
@@ -135,7 +141,10 @@ class SchedulerOptions:
"""Contains options for the Scheduler"""
def __init__(
- self, optimization_strategy, sram_target, verbose_schedule,
+ self,
+ optimization_strategy,
+ sram_target,
+ verbose_schedule,
):
self.optimization_strategy = optimization_strategy
self.optimization_sram_limit = sram_target
@@ -175,15 +184,28 @@ class SchedulerOperation:
)
self.ifm_ublock = arch.ifm_ublock
- self.ifm = SchedulerTensor(ps.ifm_shapes[0], ps.ifm_tensor.dtype, ps.ifm_tensor.mem_area, ps.ifm_tensor.format,)
+ self.ifm = SchedulerTensor(
+ ps.ifm_shapes[0],
+ ps.ifm_tensor.dtype,
+ ps.ifm_tensor.mem_area,
+ ps.ifm_tensor.format,
+ )
self.ifm2 = None
if ps.ifm2_tensor:
self.ifm2 = SchedulerTensor(
- ps.ifm_shapes[1], ps.ifm2_tensor.dtype, ps.ifm2_tensor.mem_area, ps.ifm2_tensor.format,
+ ps.ifm_shapes[1],
+ ps.ifm2_tensor.dtype,
+ ps.ifm2_tensor.mem_area,
+ ps.ifm2_tensor.format,
)
- self.ofm = SchedulerTensor(ps.ofm_shapes[0], ps.ofm_tensor.dtype, ps.ofm_tensor.mem_area, ps.ofm_tensor.format,)
+ self.ofm = SchedulerTensor(
+ ps.ofm_shapes[0],
+ ps.ofm_tensor.dtype,
+ ps.ofm_tensor.mem_area,
+ ps.ofm_tensor.format,
+ )
# Input volume width and height required to produce the smallest possible stripe
self.min_stripe_input_w, self.min_stripe_input_h = self._calculate_min_stripe_input()
@@ -481,7 +503,11 @@ class Scheduler:
lr_graph = live_range.LiveRangeGraph()
for mem_area, mem_type_set in memories_list:
live_range.extract_live_ranges_from_cascaded_passes(
- self.nng.get_root_subgraph(), mem_area, mem_type_set, lr_graph, Tensor.AllocationQuantum,
+ self.nng.get_root_subgraph(),
+ mem_area,
+ mem_type_set,
+ lr_graph,
+ Tensor.AllocationQuantum,
)
# Populate time-array with memory used by live ranges
@@ -923,7 +949,11 @@ class Scheduler:
return best_schedule
def optimize_schedule(
- self, schedule: Schedule, max_sched: Schedule, max_template: Schedule, options: SchedulerOptions,
+ self,
+ schedule: Schedule,
+ max_sched: Schedule,
+ max_template: Schedule,
+ options: SchedulerOptions,
) -> Schedule:
"""Extracts sub-schedules based on the cascades and optimizes them and applies them to the final schedule"""
sram_limit = options.optimization_sram_limit
@@ -994,7 +1024,11 @@ class Scheduler:
lr_graph = live_range.LiveRangeGraph()
for mem_area, mem_type_set in memories_list:
live_range.extract_live_ranges_from_cascaded_passes(
- self.nng.get_root_subgraph(), mem_area, mem_type_set, lr_graph, Tensor.AllocationQuantum,
+ self.nng.get_root_subgraph(),
+ mem_area,
+ mem_type_set,
+ lr_graph,
+ Tensor.AllocationQuantum,
)
max_mem_usage = lr_graph.get_temporal_memory_usage(fast_storage_mem_area)
@@ -1252,7 +1286,14 @@ def schedule_passes(nng: Graph, arch: ArchitectureFeatures, options, scheduler_o
cascaded_passes = []
for idx, ps in enumerate(sg.passes):
cps = CascadedPass(
- ps.name, SchedulingStrategy.WeightStream, ps.inputs, [], ps.outputs, [ps], ps.placement, False,
+ ps.name,
+ SchedulingStrategy.WeightStream,
+ ps.inputs,
+ [],
+ ps.outputs,
+ [ps],
+ ps.placement,
+ False,
)
cps.time = idx
diff --git a/ethosu/vela/softmax.py b/ethosu/vela/softmax.py
index 711c1e0..9565bc5 100644
--- a/ethosu/vela/softmax.py
+++ b/ethosu/vela/softmax.py
@@ -300,11 +300,20 @@ class SoftMax:
# PASS 5 - Sub
headroom_offset = create_const_tensor(
- "headroom_offset_const", [1, 1, 1, 1], DataType.int32, [12 + 31 - 8], np.int32, quantization=no_scale_quant,
+ "headroom_offset_const",
+ [1, 1, 1, 1],
+ DataType.int32,
+ [12 + 31 - 8],
+ np.int32,
+ quantization=no_scale_quant,
)
right_shift = add_op_get_ofm(
create_sub(
- f"{self.op.name}_sub{pass_number}", headroom_offset, headroom_plus_one, no_scale_quant, activation,
+ f"{self.op.name}_sub{pass_number}",
+ headroom_offset,
+ headroom_plus_one,
+ no_scale_quant,
+ activation,
)
)
@@ -329,7 +338,13 @@ class SoftMax:
# PASS 9 - SHL
shifted_sum_minus_one = add_op_get_ofm(
- create_shl(f"{self.op.name}_shl{pass_number}", shifted_sum_minus_one, one, no_scale_quant, activation,)
+ create_shl(
+ f"{self.op.name}_shl{pass_number}",
+ shifted_sum_minus_one,
+ one,
+ no_scale_quant,
+ activation,
+ )
)
# PASS 10 - Add
@@ -353,7 +368,11 @@ class SoftMax:
)
rescaled = add_op_get_ofm(
create_mul(
- f"{self.op.name}_mul{pass_number}", half_denominator, neg_32_over_17, two_scale_quant, activation2,
+ f"{self.op.name}_mul{pass_number}",
+ half_denominator,
+ neg_32_over_17,
+ two_scale_quant,
+ activation2,
)
)
@@ -362,7 +381,13 @@ class SoftMax:
"48_over_17_const", [1, 1, 1, 1], DataType.int32, [1515870810], np.int32, quantization=no_scale_quant
)
rescale_w_offset = add_op_get_ofm(
- create_add(f"{self.op.name}_add{pass_number}", rescaled, const_48_over_17, one_scale_quant, activation,)
+ create_add(
+ f"{self.op.name}_add{pass_number}",
+ rescaled,
+ const_48_over_17,
+ one_scale_quant,
+ activation,
+ )
)
# PASS 13 - 27
@@ -376,12 +401,22 @@ class SoftMax:
for _ in range(3):
# PASS 13, 18, 23 - MUL
half_denominator_times_x = add_op_get_ofm(
- create_mul(f"{self.op.name}_mul{pass_number}", nr_x, half_denominator, two_scale_quant, activation2,)
+ create_mul(
+ f"{self.op.name}_mul{pass_number}",
+ nr_x,
+ half_denominator,
+ two_scale_quant,
+ activation2,
+ )
)
# PASS 14, 19, 24 - SUB
one_minus_half_denominator_times_x = add_op_get_ofm(
create_sub(
- f"{self.op.name}_sub{pass_number}", F2_one, half_denominator_times_x, one_scale_quant, activation,
+ f"{self.op.name}_sub{pass_number}",
+ F2_one,
+ half_denominator_times_x,
+ one_scale_quant,
+ activation,
)
)
# PASS 15, 20, 25 - MUL
diff --git a/ethosu/vela/stats_writer.py b/ethosu/vela/stats_writer.py
index d8a274b..22605a6 100644
--- a/ethosu/vela/stats_writer.py
+++ b/ethosu/vela/stats_writer.py
@@ -256,7 +256,8 @@ def print_performance_metrics_for_strat(
label += " bandwidth"
bandwidth = arch.memory_bandwidths_per_second[mem_area] / 1000.0 / 1000 / 1000
print(
- f"Design peak {label:25} {bandwidth:12.2f} GB/s", file=f,
+ f"Design peak {label:25} {bandwidth:12.2f} GB/s",
+ file=f,
)
print(file=f)
for mem_area, label in mem_area_labels:
@@ -302,7 +303,8 @@ def print_performance_metrics_for_strat(
fm_bws = bws[TensorPurpose.FeatureMap]
aug_label = label + " bandwidth"
print(
- f"Average {aug_label:25} {total_bw * midpoint_fps / 1000.0 / 1000.0 / 1000.0:12.2f} GB/s", file=f,
+ f"Average {aug_label:25} {total_bw * midpoint_fps / 1000.0 / 1000.0 / 1000.0:12.2f} GB/s",
+ file=f,
)
print(
f"Input {aug_label:25} {np.sum(fm_bws[BandwidthDirection.Read]) / 1000.0 / 1000.0:12.2f} MB/batch",
@@ -328,10 +330,12 @@ def print_performance_metrics_for_strat(
print(file=f)
print(
- f"Neural network macs {int(macs):12d} MACs/batch", file=f,
+ f"Neural network macs {int(macs):12d} MACs/batch",
+ file=f,
)
print(
- f"Network Tops/s {macs * 2 * midpoint_fps / 1e12:12.2f} Tops/s", file=f,
+ f"Network Tops/s {macs * 2 * midpoint_fps / 1e12:12.2f} Tops/s",
+ file=f,
)
print(file=f)
diff --git a/ethosu/vela/tensor_allocation.py b/ethosu/vela/tensor_allocation.py
index c8b5129..ab65740 100644
--- a/ethosu/vela/tensor_allocation.py
+++ b/ethosu/vela/tensor_allocation.py
@@ -128,7 +128,12 @@ def print_allocation(lrs, mem_area, mem_type_set, tensor_allocator, sg, actual_m
print("\n" + "#" * 80)
sg_placement = (
sg.placement.name
- if mem_type_set.intersection((MemType.Permanent_NPU, MemType.Permanent_CPU,))
+ if mem_type_set.intersection(
+ (
+ MemType.Permanent_NPU,
+ MemType.Permanent_CPU,
+ )
+ )
else "Cpu and Npu"
)
print(
@@ -141,7 +146,15 @@ def print_allocation(lrs, mem_area, mem_type_set, tensor_allocator, sg, actual_m
min_mem_usage_for_alloc = max(memory_hist)
print("Start Time - End Time: Start Addr - End Addr: Tensor Size: Memory Usage: Tensor Purpose: Tensor Name")
for start_time, end_time, size, start_addr, end_addr, purpose, name in sorted(
- (lr.start_time, lr.end_time, lr.size, tens.address, tens.address + lr.size, tens.purpose, tens.name,)
+ (
+ lr.start_time,
+ lr.end_time,
+ lr.size,
+ tens.address,
+ tens.address + lr.size,
+ tens.purpose,
+ tens.name,
+ )
for tens, lr in lrs.ranges.items()
):
print(
@@ -184,7 +197,11 @@ def allocate(
):
# Allocates addresses to tensors, returns False if tensors could not be fit within max_size
lrs = live_range.extract_live_ranges_from_cascaded_passes(
- sg, mem_area, mem_type_set, lr_graph=lr_graph, cpu_tensor_alignment=cpu_tensor_alignment,
+ sg,
+ mem_area,
+ mem_type_set,
+ lr_graph=lr_graph,
+ cpu_tensor_alignment=cpu_tensor_alignment,
)
total_sz = 0
if lrs.ranges:
diff --git a/ethosu/vela/test/extapi/test_extapi_encode_weights.py b/ethosu/vela/test/extapi/test_extapi_encode_weights.py
index 6367cb3..87c504f 100644
--- a/ethosu/vela/test/extapi/test_extapi_encode_weights.py
+++ b/ethosu/vela/test/extapi/test_extapi_encode_weights.py
@@ -24,7 +24,8 @@ from ethosu.vela.api import NpuBlockTraversal
@pytest.mark.parametrize(
- "arch", list(NpuAccelerator),
+ "arch",
+ list(NpuAccelerator),
)
@pytest.mark.parametrize("dilation_x", [1, 2])
@pytest.mark.parametrize("dilation_y", [1, 2])
@@ -32,7 +33,12 @@ from ethosu.vela.api import NpuBlockTraversal
@pytest.mark.parametrize("depth_control", [1, 2, 3])
@pytest.mark.parametrize("weights_shape_and_block_depth", [((16, 16, 16, 16), 8), ((3, 3, 25, 16), 8)])
def test_encode_weights(
- arch, weights_shape_and_block_depth, dilation_x, dilation_y, ifm_bitdepth, depth_control,
+ arch,
+ weights_shape_and_block_depth,
+ dilation_x,
+ dilation_y,
+ ifm_bitdepth,
+ depth_control,
):
"""
This unit test checks the interface of the API function but not the functionality.
diff --git a/ethosu/vela/test/test_register_command_stream_util.py b/ethosu/vela/test/test_register_command_stream_util.py
index 985523f..86a48ff 100644
--- a/ethosu/vela/test/test_register_command_stream_util.py
+++ b/ethosu/vela/test/test_register_command_stream_util.py
@@ -131,14 +131,34 @@ def test_calc_blockdep0():
op2 takes 1 block to complete, which results in blockdep 0
"""
op1 = NpuElementWiseOperation(NpuElementWiseOp.CLZ)
- op1.ifm = create_feature_map(NpuShape3D(height=1, width=1, depth=1), 1, 0x60, layout=NpuLayout.NHCWB16,)
- intermediate_fm = create_feature_map(NpuShape3D(height=1, width=1, depth=1), 1, 0xA0, layout=NpuLayout.NHCWB16,)
+ op1.ifm = create_feature_map(
+ NpuShape3D(height=1, width=1, depth=1),
+ 1,
+ 0x60,
+ layout=NpuLayout.NHCWB16,
+ )
+ intermediate_fm = create_feature_map(
+ NpuShape3D(height=1, width=1, depth=1),
+ 1,
+ 0xA0,
+ layout=NpuLayout.NHCWB16,
+ )
op1.ofm = intermediate_fm
op1.block_config = NpuShape3D(height=1, width=1, depth=4)
op2 = NpuElementWiseOperation(NpuElementWiseOp.SUB)
- op2.ifm = create_feature_map(NpuShape3D(height=1, width=1, depth=1), 1, 0x39AC0, layout=NpuLayout.NHCWB16,)
+ op2.ifm = create_feature_map(
+ NpuShape3D(height=1, width=1, depth=1),
+ 1,
+ 0x39AC0,
+ layout=NpuLayout.NHCWB16,
+ )
op2.ifm2 = intermediate_fm
- op2.ofm = create_feature_map(NpuShape3D(height=1, width=1, depth=1), 1, 0xE0, layout=NpuLayout.NHCWB16,)
+ op2.ofm = create_feature_map(
+ NpuShape3D(height=1, width=1, depth=1),
+ 1,
+ 0xE0,
+ layout=NpuLayout.NHCWB16,
+ )
op2.block_config = NpuShape3D(height=1, width=1, depth=4)
arch = create_default_arch(Accelerator.Ethos_U55_128)
block_dep = calc_blockdep(arch, op1, op2)
@@ -153,8 +173,18 @@ def test_calc_blockdep2():
which results in blockdep 2
"""
op1 = NpuConv2DOperation()
- op1.ifm = create_feature_map(NpuShape3D(height=4, width=48, depth=8), 1, 0x4C80, layout=NpuLayout.NHCWB16,)
- op1.ofm = create_feature_map(NpuShape3D(height=4, width=48, depth=16), 1, 0x6480, layout=NpuLayout.NHCWB16,)
+ op1.ifm = create_feature_map(
+ NpuShape3D(height=4, width=48, depth=8),
+ 1,
+ 0x4C80,
+ layout=NpuLayout.NHCWB16,
+ )
+ op1.ofm = create_feature_map(
+ NpuShape3D(height=4, width=48, depth=16),
+ 1,
+ 0x6480,
+ layout=NpuLayout.NHCWB16,
+ )
op1.kernel = NpuKernel(1, 1)
op1.weights = [NpuAddressRange(region=1, address=0x4AE0, length=208)]
op1.biases = [NpuAddressRange(region=1, address=0x49A0, length=160)]
@@ -162,10 +192,20 @@ def test_calc_blockdep2():
op1.block_traversal = NpuBlockTraversal.PART_KERNEL_FIRST
op1.block_config = NpuShape3D(height=4, width=6, depth=16)
op2 = NpuConvDepthWiseOperation()
- op2.ifm = create_feature_map(NpuShape3D(height=3, width=48, depth=16), 1, 0, layout=NpuLayout.NHCWB16,)
+ op2.ifm = create_feature_map(
+ NpuShape3D(height=3, width=48, depth=16),
+ 1,
+ 0,
+ layout=NpuLayout.NHCWB16,
+ )
# op2 has two tiles, the lower tile is produced by op1
op2.ifm.tiles = NpuTileBox(height_0=2, height_1=2, width_0=48, addresses=[0x7680, 0, 0x6480, 0])
- op2.ofm = create_feature_map(NpuShape3D(height=1, width=24, depth=16), 1, 0x6480, layout=NpuLayout.NHCWB16,)
+ op2.ofm = create_feature_map(
+ NpuShape3D(height=1, width=24, depth=16),
+ 1,
+ 0x6480,
+ layout=NpuLayout.NHCWB16,
+ )
op2.kernel = NpuKernel(3, 3, stride_x=2, stride_y=2)
op2.weights = [NpuAddressRange(region=1, address=0x4BB0, length=208)]
op2.biases = [NpuAddressRange(region=1, address=0x4A40, length=160)]
@@ -183,8 +223,18 @@ def test_calc_blockdep3():
which results in blockdep 3
"""
op1 = NpuConv2DOperation()
- op1.ifm = create_feature_map(NpuShape3D(height=13, width=96, depth=1), 1, 0, layout=NpuLayout.NHWC,)
- op1.ofm = create_feature_map(NpuShape3D(height=6, width=48, depth=8), 1, 0x7C80, layout=NpuLayout.NHCWB16,)
+ op1.ifm = create_feature_map(
+ NpuShape3D(height=13, width=96, depth=1),
+ 1,
+ 0,
+ layout=NpuLayout.NHWC,
+ )
+ op1.ofm = create_feature_map(
+ NpuShape3D(height=6, width=48, depth=8),
+ 1,
+ 0x7C80,
+ layout=NpuLayout.NHCWB16,
+ )
op1.kernel = NpuKernel(3, 3, stride_x=2, stride_y=2)
op1.weights = [NpuAddressRange(region=1, address=0x4AE0, length=144)]
op1.biases = [NpuAddressRange(region=1, address=0x49A0, length=80)]
@@ -192,8 +242,18 @@ def test_calc_blockdep3():
op1.block_traversal = NpuBlockTraversal.PART_KERNEL_FIRST
op1.block_config = NpuShape3D(height=6, width=3, depth=8)
op2 = NpuConvDepthWiseOperation()
- op2.ifm = create_feature_map(NpuShape3D(height=5, width=48, depth=8), 1, 0x7C80, layout=NpuLayout.NHCWB16,)
- op2.ofm = create_feature_map(NpuShape3D(height=4, width=48, depth=8), 1, 0x4C80, layout=NpuLayout.NHCWB16,)
+ op2.ifm = create_feature_map(
+ NpuShape3D(height=5, width=48, depth=8),
+ 1,
+ 0x7C80,
+ layout=NpuLayout.NHCWB16,
+ )
+ op2.ofm = create_feature_map(
+ NpuShape3D(height=4, width=48, depth=8),
+ 1,
+ 0x4C80,
+ layout=NpuLayout.NHCWB16,
+ )
op2.kernel = NpuKernel(3, 3)
op2.weights = [NpuAddressRange(region=1, address=0x4BB0, length=112)]
op2.biases = [NpuAddressRange(region=1, address=0x4A40, length=80)]
diff --git a/ethosu/vela/test/test_tflite_model_semantic.py b/ethosu/vela/test/test_tflite_model_semantic.py
index 84f9916..1e5dbd4 100644
--- a/ethosu/vela/test/test_tflite_model_semantic.py
+++ b/ethosu/vela/test/test_tflite_model_semantic.py
@@ -128,7 +128,14 @@ def test_constraint_quant_scale_inf():
def test_constraint_ofm_scale_too_small():
# Tests handling of OFM scale < 1e-38
shp = [1, 10, 20, 16]
- op = testutil.create_elemwise_op(Op.Mul, "mul", shp, shp, shp, ofm_quant=testutil.default_quant_params(),)
+ op = testutil.create_elemwise_op(
+ Op.Mul,
+ "mul",
+ shp,
+ shp,
+ shp,
+ ofm_quant=testutil.default_quant_params(),
+ )
assert semantic_checker.is_operator_semantic_valid(op)
op.ofm.quantization.scale_f32 = 1e-43
assert not semantic_checker.is_operator_semantic_valid(op)
@@ -245,7 +252,12 @@ def create_strided_slice_op(in_shape, out_shape, start_offsets, end_offsets):
def create_pad_op(
- in_shape, out_shape, padding, in_dtype=DataType.int8, out_dtype=DataType.int8, pad_dtype=DataType.int32,
+ in_shape,
+ out_shape,
+ padding,
+ in_dtype=DataType.int8,
+ out_dtype=DataType.int8,
+ pad_dtype=DataType.int32,
):
qp = testutil.default_quant_params()
in0 = Tensor(in_shape, in_dtype, "in")
@@ -259,7 +271,11 @@ def create_pad_op(
def test_constraint_pad_input_count():
# Incorrect number of input tensors (2)
- op = create_pad_op(in_shape=[1, 1, 1, 1], out_shape=[1, 3, 3, 1], padding=[[0, 0], [1, 1], [1, 1], [0, 0]],)
+ op = create_pad_op(
+ in_shape=[1, 1, 1, 1],
+ out_shape=[1, 3, 3, 1],
+ padding=[[0, 0], [1, 1], [1, 1], [0, 0]],
+ )
assert semantic_checker.is_operator_semantic_valid(op)
op.add_input_tensor(op.inputs[0].clone())
assert not semantic_checker.is_operator_semantic_valid(op)
diff --git a/ethosu/vela/test/test_tflite_supported_operators.py b/ethosu/vela/test/test_tflite_supported_operators.py
index e3db791..04d3cba 100644
--- a/ethosu/vela/test/test_tflite_supported_operators.py
+++ b/ethosu/vela/test/test_tflite_supported_operators.py
@@ -345,7 +345,12 @@ def test_constraint_concat_pass():
def create_pad_op(
- in_shape, out_shape, padding, in_dtype=DataType.int8, out_dtype=DataType.int8, pad_dtype=DataType.int32,
+ in_shape,
+ out_shape,
+ padding,
+ in_dtype=DataType.int8,
+ out_dtype=DataType.int8,
+ pad_dtype=DataType.int32,
):
qp = testutil.default_quant_params()
in0 = Tensor(in_shape, in_dtype, "in")
@@ -359,11 +364,23 @@ def create_pad_op(
def test_constraint_padded_dimensions():
# Incorrect padding dimensions, can only pad width and height
- op = create_pad_op(in_shape=[1, 1, 1, 1], out_shape=[1, 3, 3, 1], padding=[[1, 1], [1, 1], [1, 1], [0, 0]],)
+ op = create_pad_op(
+ in_shape=[1, 1, 1, 1],
+ out_shape=[1, 3, 3, 1],
+ padding=[[1, 1], [1, 1], [1, 1], [0, 0]],
+ )
assert not support.is_operator_supported(op)
- op = create_pad_op(in_shape=[1, 1, 1, 1], out_shape=[1, 3, 3, 1], padding=[[1, 1], [1, 1], [0, 0]],)
+ op = create_pad_op(
+ in_shape=[1, 1, 1, 1],
+ out_shape=[1, 3, 3, 1],
+ padding=[[1, 1], [1, 1], [0, 0]],
+ )
assert support.is_operator_supported(op)
- op = create_pad_op(in_shape=[1, 1, 1, 1], out_shape=[1, 3, 3, 1], padding=[[1, 1], [1, 1], [0, 1]],)
+ op = create_pad_op(
+ in_shape=[1, 1, 1, 1],
+ out_shape=[1, 3, 3, 1],
+ padding=[[1, 1], [1, 1], [0, 1]],
+ )
assert not support.is_operator_supported(op)
@@ -371,12 +388,20 @@ def test_constraint_pad_shape():
# PAD operator must be of shape (3,2) or (4,2)
op = create_pad_op(in_shape=[1, 1, 1, 1], out_shape=[1, 3, 3, 1], padding=[[1, 1], [1, 1], [0, 0]])
assert support.is_operator_supported(op)
- op = create_pad_op(in_shape=[1, 1, 1, 1], out_shape=[1, 3, 3, 1], padding=[[0, 0], [1, 1], [1, 1], [0, 0], [0, 0]],)
+ op = create_pad_op(
+ in_shape=[1, 1, 1, 1],
+ out_shape=[1, 3, 3, 1],
+ padding=[[0, 0], [1, 1], [1, 1], [0, 0], [0, 0]],
+ )
assert not support.is_operator_supported(op)
def test_constraint_pad_none():
- op = create_pad_op(in_shape=[1, 1, 1, 1], out_shape=[1, 3, 3, 1], padding=[],)
+ op = create_pad_op(
+ in_shape=[1, 1, 1, 1],
+ out_shape=[1, 3, 3, 1],
+ padding=[],
+ )
assert not support.is_operator_supported(op)
diff --git a/ethosu/vela/tflite_graph_optimiser.py b/ethosu/vela/tflite_graph_optimiser.py
index fb8a08c..88d58a3 100644
--- a/ethosu/vela/tflite_graph_optimiser.py
+++ b/ethosu/vela/tflite_graph_optimiser.py
@@ -512,7 +512,10 @@ def add_padding_fields(op, arch, nng):
)
else:
padding, skirt = calc_padding_and_skirt(
- op.attrs["padding"], op.kernel, input_shape, op.attrs.get("explicit_padding"),
+ op.attrs["padding"],
+ op.kernel,
+ input_shape,
+ op.attrs.get("explicit_padding"),
)
op.attrs["explicit_padding"] = padding
@@ -642,11 +645,11 @@ def convert_softmax(op, arch, nng):
def convert_mul_max_to_abs_or_lrelu(op, arch, nng):
r"""Whenever there is a subgraph with this topology:
- Input X For X = -1 or X > 0
- | \ / This subgraph can be replaced with either
- | Mul an Abs (if X = -1) or a LeakyReLU (if X > 0)
- | /
- Max
+ Input X For X = -1 or X > 0
+ | \ / This subgraph can be replaced with either
+ | Mul an Abs (if X = -1) or a LeakyReLU (if X > 0)
+ | /
+ Max
"""
if op.type == Op.Maximum:
@@ -1246,7 +1249,12 @@ def convert_mean_to_depthwise_conv_or_avgpool(op, arch, nng):
quant = QuantizationParameters()
quant.zero_point = 0
bias_term_tens = create_const_tensor(
- op.name + "_bias", [1, 1, 1, 1], DataType.int16, [bias_term], np.int16, quantization=quant,
+ op.name + "_bias",
+ [1, 1, 1, 1],
+ DataType.int16,
+ [bias_term],
+ np.int16,
+ quantization=quant,
)
add_op.add_input_tensor(bias_term_tens)
add_op.set_output_tensor(op.ofm)
@@ -1370,7 +1378,12 @@ def convert_mean_to_depthwise_conv_or_avgpool(op, arch, nng):
bias_shape = [shape[-1]]
op.set_input_tensor(
create_const_tensor(
- "bias", bias_shape, inp.dtype, np.ones(bias_shape) * bias, value_dtype=np.int32, quantization=None,
+ "bias",
+ bias_shape,
+ inp.dtype,
+ np.ones(bias_shape) * bias,
+ value_dtype=np.int32,
+ quantization=None,
),
2,
)
@@ -1392,7 +1405,12 @@ def tflite_optimise_graph(nng, arch):
for idx, sg in enumerate(nng.subgraphs):
nng.subgraphs[idx] = rewrite_graph.rewrite_graph_pre_order(
- nng, sg, arch, [], pre_process_list, rewrite_unsupported=False,
+ nng,
+ sg,
+ arch,
+ [],
+ pre_process_list,
+ rewrite_unsupported=False,
)
# Handle Concat Ops
@@ -1413,13 +1431,23 @@ def tflite_optimise_graph(nng, arch):
for idx, sg in enumerate(nng.subgraphs):
nng.subgraphs[idx] = rewrite_graph.rewrite_graph_pre_order(
- nng, sg, arch, [rewrite_split_ops], [], rewrite_unsupported=False,
+ nng,
+ sg,
+ arch,
+ [rewrite_split_ops],
+ [],
+ rewrite_unsupported=False,
)
# Handle sg input output
for idx, sg in enumerate(nng.subgraphs):
nng.subgraphs[idx] = rewrite_graph.rewrite_graph_pre_order(
- nng, sg, arch, [], [fix_sg_input_output], rewrite_unsupported=False,
+ nng,
+ sg,
+ arch,
+ [],
+ [fix_sg_input_output],
+ rewrite_unsupported=False,
)
# Removal of memory only operators
@@ -1452,7 +1480,12 @@ def tflite_optimise_graph(nng, arch):
for idx, sg in enumerate(nng.subgraphs):
nng.subgraphs[idx] = rewrite_graph.rewrite_graph_pre_order(
- nng, sg, arch, [], op_rewrite_list, rewrite_unsupported=False,
+ nng,
+ sg,
+ arch,
+ [],
+ op_rewrite_list,
+ rewrite_unsupported=False,
)
for idx, sg in enumerate(nng.subgraphs):
diff --git a/ethosu/vela/tflite_mapping.py b/ethosu/vela/tflite_mapping.py
index 2a70b06..a5f7fa2 100644
--- a/ethosu/vela/tflite_mapping.py
+++ b/ethosu/vela/tflite_mapping.py
@@ -669,7 +669,10 @@ builtin_operator_map = {
BuiltinOperator.DIV: (Op.Div, OptionsSerializer("DivOptions", (fused_act,)), TFLITE_NO_INDICES),
BuiltinOperator.SQUEEZE: (
Op.Squeeze,
- OptionsSerializer("SqueezeOptions", (("squeeze_dims", is_int_vec),),),
+ OptionsSerializer(
+ "SqueezeOptions",
+ (("squeeze_dims", is_int_vec),),
+ ),
TFLITE_IFM_INDICES,
),
BuiltinOperator.UNIDIRECTIONAL_SEQUENCE_LSTM: (
@@ -915,7 +918,13 @@ builtin_operator_map = {
),
BuiltinOperator.VAR_HANDLE: (
Op.VarHandle,
- OptionsSerializer("VarHandleOptions", ("container", "shared_name",),),
+ OptionsSerializer(
+ "VarHandleOptions",
+ (
+ "container",
+ "shared_name",
+ ),
+ ),
TFLITE_NO_INDICES,
),
BuiltinOperator.READ_VARIABLE: (Op.ReadVariable, OptionsSerializer("ReadVariableOptions"), TFLITE_NO_INDICES),
@@ -923,7 +932,13 @@ builtin_operator_map = {
BuiltinOperator.BROADCAST_ARGS: (Op.BroadcastArgs, None, TFLITE_NO_INDICES),
BuiltinOperator.RANDOM_STANDARD_NORMAL: (
Op.RandomStandardNormal,
- OptionsSerializer("RandomOptions", ("seed", "seed2",),),
+ OptionsSerializer(
+ "RandomOptions",
+ (
+ "seed",
+ "seed2",
+ ),
+ ),
TFLITE_NO_INDICES,
),
BuiltinOperator.CUSTOM: (Op.Custom, CustomOptionsSerializer(), TFLITE_NO_INDICES),
diff --git a/ethosu/vela/tflite_model_semantic.py b/ethosu/vela/tflite_model_semantic.py
index 3b7f248..b264479 100644
--- a/ethosu/vela/tflite_model_semantic.py
+++ b/ethosu/vela/tflite_model_semantic.py
@@ -41,7 +41,13 @@ def _optype_formatter(op_list):
class TFLiteSemantic:
# Categorised lists of operators
- convolution_ops = set((Op.Conv2DBias, Op.Conv2D, Op.QuantizedConv2D,))
+ convolution_ops = set(
+ (
+ Op.Conv2DBias,
+ Op.Conv2D,
+ Op.QuantizedConv2D,
+ )
+ )
depthwise_convolution_ops = set((Op.DepthwiseConv2DBias,))
transpose_convolution_ops = set((Op.Conv2DBackpropInput,))
convolution_like_ops = convolution_ops | depthwise_convolution_ops | transpose_convolution_ops
@@ -49,13 +55,36 @@ class TFLiteSemantic:
avg_pooling_ops = Op.op_set(Op.is_avgpool_op)
pooling_ops = set((Op.ReduceSum,)) | max_pooling_ops | avg_pooling_ops
unary_elem_wise_main_ops = Op.op_set(Op.is_unary_elementwise_op)
- binary_elem_wise_min_max_ops = set((Op.Minimum, Op.Maximum,))
- binary_elem_wise_shift_ops = set((Op.SHL, Op.SHR,))
- binary_elem_wise_add_mul_sub = set((Op.Add, Op.Mul, Op.Sub,))
+ binary_elem_wise_min_max_ops = set(
+ (
+ Op.Minimum,
+ Op.Maximum,
+ )
+ )
+ binary_elem_wise_shift_ops = set(
+ (
+ Op.SHL,
+ Op.SHR,
+ )
+ )
+ binary_elem_wise_add_mul_sub = set(
+ (
+ Op.Add,
+ Op.Mul,
+ Op.Sub,
+ )
+ )
binary_elem_wise_main_ops = binary_elem_wise_min_max_ops | binary_elem_wise_add_mul_sub | binary_elem_wise_shift_ops
elem_wise_main_ops = binary_elem_wise_main_ops | unary_elem_wise_main_ops
shapeless_input_ops = binary_elem_wise_main_ops | set((Op.Split, Op.SplitV, Op.Mean, Op.ExpandDims))
- reshape_ops = set((Op.Reshape, Op.QuantizedReshape, Op.Squeeze, Op.ExpandDims,))
+ reshape_ops = set(
+ (
+ Op.Reshape,
+ Op.QuantizedReshape,
+ Op.Squeeze,
+ Op.ExpandDims,
+ )
+ )
def __init__(self):
# Setup the generic constraints. Note: the order matters
diff --git a/ethosu/vela/tflite_supported_operators.py b/ethosu/vela/tflite_supported_operators.py
index 4d82677..6328a4e 100644
--- a/ethosu/vela/tflite_supported_operators.py
+++ b/ethosu/vela/tflite_supported_operators.py
@@ -40,7 +40,13 @@ def _optype_formatter(op_list):
class TFLiteSupportedOperators:
# Categorised lists of supported operators
npu_pre_ops = set((Op.SplitSliceRead,))
- convolution_ops = set((Op.Conv2DBias, Op.Conv2D, Op.QuantizedConv2D,))
+ convolution_ops = set(
+ (
+ Op.Conv2DBias,
+ Op.Conv2D,
+ Op.QuantizedConv2D,
+ )
+ )
depthwise_convolution_ops = set((Op.DepthwiseConv2DBias,))
transpose_convolution_ops = set((Op.Conv2DBackpropInput,))
convolution_like_ops = convolution_ops | depthwise_convolution_ops | transpose_convolution_ops
@@ -48,7 +54,13 @@ class TFLiteSupportedOperators:
avg_pooling_ops = Op.op_set(Op.is_avgpool_op)
pooling_ops = set((Op.ReduceSum,)) | max_pooling_ops | avg_pooling_ops
resizing_ops = set((Op.ResizeBilinear,))
- fc_vector_products = set((Op.QuantizedMatMul, Op.MatMul, Op.FullyConnected,))
+ fc_vector_products = set(
+ (
+ Op.QuantizedMatMul,
+ Op.MatMul,
+ Op.FullyConnected,
+ )
+ )
mac_main_ops = (
# RNN/LSTM/GRU
set((Op.BlockLSTM,))
@@ -64,17 +76,47 @@ class TFLiteSupportedOperators:
| set((Op.Mean,))
)
unary_elem_wise_main_ops = Op.op_set(Op.is_unary_elementwise_op)
- binary_elem_wise_min_max_ops = set((Op.Minimum, Op.Maximum,))
- binary_elem_wise_shift_ops = set((Op.SHL, Op.SHR,))
- binary_elem_wise_add_mul_sub = set((Op.Add, Op.Mul, Op.Sub,))
+ binary_elem_wise_min_max_ops = set(
+ (
+ Op.Minimum,
+ Op.Maximum,
+ )
+ )
+ binary_elem_wise_shift_ops = set(
+ (
+ Op.SHL,
+ Op.SHR,
+ )
+ )
+ binary_elem_wise_add_mul_sub = set(
+ (
+ Op.Add,
+ Op.Mul,
+ Op.Sub,
+ )
+ )
binary_elem_wise_main_ops = binary_elem_wise_min_max_ops | binary_elem_wise_add_mul_sub | binary_elem_wise_shift_ops
elem_wise_main_ops = binary_elem_wise_main_ops | unary_elem_wise_main_ops
pad_ops = set((Op.Pad,))
supported_int32_tensor_ops = (
- set((Op.ReduceSum, Op.CLZ,)) | binary_elem_wise_add_mul_sub | binary_elem_wise_shift_ops
+ set(
+ (
+ Op.ReduceSum,
+ Op.CLZ,
+ )
+ )
+ | binary_elem_wise_add_mul_sub
+ | binary_elem_wise_shift_ops
)
- relu_ops = set((Op.Relu, Op.Relu6, Op.ReluN1To1, Op.Clip,))
+ relu_ops = set(
+ (
+ Op.Relu,
+ Op.Relu6,
+ Op.ReluN1To1,
+ Op.Clip,
+ )
+ )
activation_ops = relu_ops | set((Op.Tanh, Op.Sigmoid, Op.Softmax, Op.HardSwish))
npu_post_ops = (
# activation functions
@@ -84,11 +126,44 @@ class TFLiteSupportedOperators:
# Quantization
| set((Op.Quantize,))
)
- split_ops = set((Op.Split, Op.SplitV, Op.StridedSlice, Op.Slice, Op.UnpackReshaped, Op.Unpack,))
- concat_ops = set((Op.Concat, Op.ConcatTFLite, Op.PackReshaped, Op.Pack,))
- memory_only_ops = set((Op.Reshape, Op.QuantizedReshape, Op.Squeeze, Op.ExpandDims,)) | concat_ops | split_ops
+ split_ops = set(
+ (
+ Op.Split,
+ Op.SplitV,
+ Op.StridedSlice,
+ Op.Slice,
+ Op.UnpackReshaped,
+ Op.Unpack,
+ )
+ )
+ concat_ops = set(
+ (
+ Op.Concat,
+ Op.ConcatTFLite,
+ Op.PackReshaped,
+ Op.Pack,
+ )
+ )
+ memory_only_ops = (
+ set(
+ (
+ Op.Reshape,
+ Op.QuantizedReshape,
+ Op.Squeeze,
+ Op.ExpandDims,
+ )
+ )
+ | concat_ops
+ | split_ops
+ )
per_axis_quant_ops = convolution_like_ops # per-axis/channel quantization only currently supported for conv ops
- supported_fused_activations = relu_ops | set((Op.Tanh, Op.Sigmoid, Op.LUT,))
+ supported_fused_activations = relu_ops | set(
+ (
+ Op.Tanh,
+ Op.Sigmoid,
+ Op.LUT,
+ )
+ )
supported_operators = npu_pre_ops | mac_main_ops | elem_wise_main_ops | pad_ops | npu_post_ops | memory_only_ops
# Supported data types
supported_op_dtypes = set((DataType.uint8, DataType.int8, DataType.int16, DataType.int32))
@@ -441,7 +516,7 @@ class TFLiteSupportedOperators:
@staticmethod
def constraint_tconv_valid(op):
"""VALID padding: OFM dimensions must equal IFM dimensions multiplied by stride,
- minus difference between kernel size and stride"""
+ minus difference between kernel size and stride"""
if op.attrs["padding"] == Padding.VALID:
s_w = op.kernel.stride.x
s_h = op.kernel.stride.y
diff --git a/ethosu/vela/tosa_graph_optimiser.py b/ethosu/vela/tosa_graph_optimiser.py
index 9e72a6c..778aa2a 100644
--- a/ethosu/vela/tosa_graph_optimiser.py
+++ b/ethosu/vela/tosa_graph_optimiser.py
@@ -876,7 +876,12 @@ def tosa_optimise_graph(nng, arch):
# TODO the supported operator checking need to be split in semantic and HW checks
for idx, sg in enumerate(nng.subgraphs):
nng.subgraphs[idx] = rewrite_graph.rewrite_graph_pre_order(
- nng, sg, arch, [], [supported_operator_check], rewrite_unsupported=False,
+ nng,
+ sg,
+ arch,
+ [],
+ [supported_operator_check],
+ rewrite_unsupported=False,
)
# Decomposing and rewrite of concat
@@ -893,7 +898,12 @@ def tosa_optimise_graph(nng, arch):
# Handle sg input output
for idx, sg in enumerate(nng.subgraphs):
nng.subgraphs[idx] = rewrite_graph.rewrite_graph_pre_order(
- nng, sg, arch, [], [fix_sg_input_output_tosa], rewrite_unsupported=True,
+ nng,
+ sg,
+ arch,
+ [],
+ [fix_sg_input_output_tosa],
+ rewrite_unsupported=True,
)
# Removal of reshapes
@@ -909,19 +919,34 @@ def tosa_optimise_graph(nng, arch):
for idx, sg in enumerate(nng.subgraphs):
nng.subgraphs[idx] = rewrite_graph.rewrite_graph_pre_order(
- nng, sg, arch, [], [set_ifm_ofm_op_shapes], rewrite_unsupported=False,
+ nng,
+ sg,
+ arch,
+ [],
+ [set_ifm_ofm_op_shapes],
+ rewrite_unsupported=False,
)
# Removal of Transpose
for idx, sg in enumerate(nng.subgraphs):
nng.subgraphs[idx] = rewrite_graph.rewrite_graph_pre_order(
- nng, sg, arch, [], [remove_const_transpose], rewrite_unsupported=False,
+ nng,
+ sg,
+ arch,
+ [],
+ [remove_const_transpose],
+ rewrite_unsupported=False,
)
# TODO, when and where to best handle calc_scaling_avgpool
for idx, sg in enumerate(nng.subgraphs):
nng.subgraphs[idx] = rewrite_graph.rewrite_graph_pre_order(
- nng, sg, arch, [], [calc_scaling_avgpool], rewrite_unsupported=False,
+ nng,
+ sg,
+ arch,
+ [],
+ [calc_scaling_avgpool],
+ rewrite_unsupported=False,
)
# Rewrite Operators step
@@ -929,13 +954,22 @@ def tosa_optimise_graph(nng, arch):
for idx, sg in enumerate(nng.subgraphs):
nng.subgraphs[idx] = rewrite_graph.rewrite_graph_pre_order(
- nng, sg, arch, [], op_rewrite_list, rewrite_unsupported=False,
+ nng,
+ sg,
+ arch,
+ [],
+ op_rewrite_list,
+ rewrite_unsupported=False,
)
# Post-processing step 1
for idx, sg in enumerate(nng.subgraphs):
nng.subgraphs[idx] = rewrite_graph.rewrite_graph_pre_order(
- nng, sg, arch, [], [rewrite_activation, add_padding_fields],
+ nng,
+ sg,
+ arch,
+ [],
+ [rewrite_activation, add_padding_fields],
)
# Removal of Slice, need to be done after optimisation has been performed,
@@ -946,6 +980,12 @@ def tosa_optimise_graph(nng, arch):
# Post-processing step 2
for idx, sg in enumerate(nng.subgraphs):
- nng.subgraphs[idx] = rewrite_graph.rewrite_graph_pre_order(nng, sg, arch, [], [fixup_quantization],)
+ nng.subgraphs[idx] = rewrite_graph.rewrite_graph_pre_order(
+ nng,
+ sg,
+ arch,
+ [],
+ [fixup_quantization],
+ )
return nng
diff --git a/ethosu/vela/tosa_supported_operators.py b/ethosu/vela/tosa_supported_operators.py
index e378511..15e1569 100644
--- a/ethosu/vela/tosa_supported_operators.py
+++ b/ethosu/vela/tosa_supported_operators.py
@@ -38,11 +38,30 @@ class TosaSupportedOperators:
fc_vector_products = set((Op.FullyConnected,))
mac_main_ops = convolution_like_ops | pooling_ops | fc_vector_products
- memory_only_ops = set((Op.Reshape, Op.Transpose, Op.Concat, Op.SplitSliceRead,))
- binary_elem_wise_add_mul_sub = set((Op.Add, Op.Mul, Op.RescaleMul, Op.Sub,))
+ memory_only_ops = set(
+ (
+ Op.Reshape,
+ Op.Transpose,
+ Op.Concat,
+ Op.SplitSliceRead,
+ )
+ )
+ binary_elem_wise_add_mul_sub = set(
+ (
+ Op.Add,
+ Op.Mul,
+ Op.RescaleMul,
+ Op.Sub,
+ )
+ )
elem_wise_ops = binary_elem_wise_add_mul_sub
type_conversion_ops = set((Op.Rescale,))
- relu_ops = set((Op.Clamp, Op.ReluN,))
+ relu_ops = set(
+ (
+ Op.Clamp,
+ Op.ReluN,
+ )
+ )
activation_ops = relu_ops | set((Op.Table,))
pad_ops = set((Op.Pad,))
diff --git a/ethosu/vela/weight_compressor.py b/ethosu/vela/weight_compressor.py
index cdac641..78c4351 100644
--- a/ethosu/vela/weight_compressor.py
+++ b/ethosu/vela/weight_compressor.py
@@ -42,7 +42,8 @@ from ethosu import mlw_codec
# Contains meta info for a weight compression. If two tensors have identical weight compression config,
# then they also will have identical compressed weights.
WeightCompressionConfig = namedtuple(
- "WeightCompressionConfig", ["npu_block_type", "ofm_block_depth", "ofm_depth_step", "dilation", "weight_value_id"],
+ "WeightCompressionConfig",
+ ["npu_block_type", "ofm_block_depth", "ofm_depth_step", "dilation", "weight_value_id"],
)
ScaleCompressionConfig = namedtuple("ScaleCompressionConfig", ["scale_value_id", "ifm_scale", "ofm_scale"])