From d85750702229af97c0b0bbda6e397a23254b6144 Mon Sep 17 00:00:00 2001 From: Jonas Ohlsson Date: Wed, 30 Mar 2022 10:30:25 +0200 Subject: Update version of Black to 22.3.0 Update version of Black to 22.3.0 due to updated dependencies. Updates to fix reported issues due to new version. Signed-off-by: Jonas Ohlsson Change-Id: I60056aae452093ce8dcea1f499ecced22b25eef1 --- .pre-commit-config.yaml | 2 +- ethosu/mlw_codec/test/test_mlw_codec.py | 2 +- ethosu/vela/api.py | 4 +- ethosu/vela/architecture_features.py | 11 ++- ethosu/vela/compiler_driver.py | 12 +-- ethosu/vela/driver_actions.py | 3 +- ethosu/vela/high_level_command_stream_generator.py | 8 +- ethosu/vela/live_range.py | 6 +- ethosu/vela/npu_performance.py | 24 ++++-- ethosu/vela/pass_packing.py | 9 +- ethosu/vela/range_set.py | 2 +- ethosu/vela/register_command_stream_generator.py | 17 ++-- ethosu/vela/register_command_stream_util.py | 25 +++++- ethosu/vela/scheduler.py | 59 +++++++++++-- ethosu/vela/softmax.py | 49 +++++++++-- ethosu/vela/stats_writer.py | 12 ++- ethosu/vela/tensor_allocation.py | 23 ++++- .../vela/test/extapi/test_extapi_encode_weights.py | 10 ++- .../vela/test/test_register_command_stream_util.py | 84 +++++++++++++++--- ethosu/vela/test/test_tflite_model_semantic.py | 22 ++++- .../vela/test/test_tflite_supported_operators.py | 37 ++++++-- ethosu/vela/tflite_graph_optimiser.py | 57 ++++++++++--- ethosu/vela/tflite_mapping.py | 21 ++++- ethosu/vela/tflite_model_semantic.py | 39 +++++++-- ethosu/vela/tflite_supported_operators.py | 99 +++++++++++++++++++--- ethosu/vela/tosa_graph_optimiser.py | 56 ++++++++++-- ethosu/vela/tosa_supported_operators.py | 25 +++++- ethosu/vela/weight_compressor.py | 3 +- 28 files changed, 598 insertions(+), 123 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index ae2bae58..9c707068 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -15,7 +15,7 @@ repos: - id: reorder-python-imports - repo: https://github.com/psf/black - rev: 19.10b0 + rev: 22.3.0 hooks: - id: black language_version: python3 diff --git a/ethosu/mlw_codec/test/test_mlw_codec.py b/ethosu/mlw_codec/test/test_mlw_codec.py index 3ff26e53..d77c82a1 100644 --- a/ethosu/mlw_codec/test/test_mlw_codec.py +++ b/ethosu/mlw_codec/test/test_mlw_codec.py @@ -24,7 +24,7 @@ from ethosu import mlw_codec class TestMLWCodec: - """ This class is responsible to test the mlw_codec library + """This class is responsible to test the mlw_codec library It mainly tests the two methods encode() and decode() with different inputs""" weights = [0, 2, 3, 0, -1, -2, -3, 0, 0, 0, 1, -250, 240] * 3 diff --git a/ethosu/vela/api.py b/ethosu/vela/api.py index f49df259..3382ea99 100644 --- a/ethosu/vela/api.py +++ b/ethosu/vela/api.py @@ -139,11 +139,11 @@ class NpuDataType(Enum): return self.value[1] def size_in_bits(self) -> int: - """ Size of the data type in bits""" + """Size of the data type in bits""" return self.value[0] def size_in_bytes(self) -> int: - """ Size of the data type in bytes""" + """Size of the data type in bytes""" return self.value[0] // 8 def min_value(self) -> int: diff --git a/ethosu/vela/architecture_features.py b/ethosu/vela/architecture_features.py index e79ed720..08ff260c 100644 --- a/ethosu/vela/architecture_features.py +++ b/ethosu/vela/architecture_features.py @@ -647,7 +647,9 @@ class ArchitectureFeatures: else: raise CliOptionError( - "--system-config", self.system_config, f"Section {sys_cfg_section} not found in Vela config file", + "--system-config", + self.system_config, + f"Section {sys_cfg_section} not found in Vela config file", ) # read the memory mode @@ -678,7 +680,9 @@ class ArchitectureFeatures: else: raise CliOptionError( - "--memory-mode", self.memory_mode, f"Section {mem_mode_section} not found in Vela config file", + "--memory-mode", + self.memory_mode, + f"Section {mem_mode_section} not found in Vela config file", ) # override sram to onchipflash @@ -777,7 +781,8 @@ class ArchitectureFeatures: # check for recursion loop if inheritance_section == section: raise ConfigOptionError( - "inherit", f"{inheritance_section}. This references its own section and recursion is not allowed", + "inherit", + f"{inheritance_section}. This references its own section and recursion is not allowed", ) result = self._read_config(inheritance_section, key, result, found) diff --git a/ethosu/vela/compiler_driver.py b/ethosu/vela/compiler_driver.py index cf26eb3b..2715c8fe 100644 --- a/ethosu/vela/compiler_driver.py +++ b/ethosu/vela/compiler_driver.py @@ -44,10 +44,9 @@ from .tensor import Tensor class CompilerOptions: """Set of options to change compiler behaviour - verbosity, targets, turning off passes. -Note the difference between ArchitectureFeatures and CompilerOptions -- ArchitectureFeatures is for changing the Ethos-U and system architecture -- CompilerOptions is for changing the behaviour of the compiler -""" + Note the difference between ArchitectureFeatures and CompilerOptions + - ArchitectureFeatures is for changing the Ethos-U and system architecture + - CompilerOptions is for changing the behaviour of the compiler""" def __init__( self, @@ -194,7 +193,10 @@ def compiler_driver(nng, arch, options, scheduler_options, network_type): # Calculate live ranges for all constant Npu tensors, in permanent storage for sg in npu_subgraphs: lr_graph_flash = live_range.create_linear_live_range_graph( - sg, permanent_storage, MemType.Permanent_NPU, lr_graph=lr_graph_flash, + sg, + permanent_storage, + MemType.Permanent_NPU, + lr_graph=lr_graph_flash, ) if npu_subgraphs: diff --git a/ethosu/vela/driver_actions.py b/ethosu/vela/driver_actions.py index 90af02c2..4ad2a334 100644 --- a/ethosu/vela/driver_actions.py +++ b/ethosu/vela/driver_actions.py @@ -119,8 +119,7 @@ def emit_dump_shram(data: List[int]): def create_driver_payload(register_command_stream: List[int], arch: ArchitectureFeatures) -> bytes: - """Creates driver header and includes the given command - """ + """Creates driver header and includes the given command""" # Prepare driver actions for this command tensor da_list: List[int] = [] emit_fourcc(da_list, "COP1") diff --git a/ethosu/vela/high_level_command_stream_generator.py b/ethosu/vela/high_level_command_stream_generator.py index eef4e6d6..81c0d5b4 100644 --- a/ethosu/vela/high_level_command_stream_generator.py +++ b/ethosu/vela/high_level_command_stream_generator.py @@ -67,7 +67,13 @@ def generate_high_level_commands_for_sched_op(sched_op, schedule): ofm_tensor = ps.ofm_tensor # Get Tensors and Full Shapes - (ifm_tensor, ifm2_tensor, uncomp_weight_tensor, _, _,) = parent_op.get_ifm_ifm2_weights_biases_ofm() + ( + ifm_tensor, + ifm2_tensor, + uncomp_weight_tensor, + _, + _, + ) = parent_op.get_ifm_ifm2_weights_biases_ofm() ifm = sched_op.ifm ifm2 = sched_op.ifm2 ofm_shape = sched_op.ofm.shape diff --git a/ethosu/vela/live_range.py b/ethosu/vela/live_range.py index 45baf440..ccf49297 100644 --- a/ethosu/vela/live_range.py +++ b/ethosu/vela/live_range.py @@ -200,7 +200,11 @@ def merge_elementwise_op_ranges(sg, sched_op, lr_graph, target_mem_area, target_ def extract_live_ranges_from_cascaded_passes( - sg, target_mem_area, target_mem_type_set, lr_graph=None, cpu_tensor_alignment=Tensor.AllocationQuantum, + sg, + target_mem_area, + target_mem_type_set, + lr_graph=None, + cpu_tensor_alignment=Tensor.AllocationQuantum, ): if lr_graph is None: lr_graph = LiveRangeGraph() diff --git a/ethosu/vela/npu_performance.py b/ethosu/vela/npu_performance.py index 4ffca496..0c8a9073 100644 --- a/ethosu/vela/npu_performance.py +++ b/ethosu/vela/npu_performance.py @@ -59,14 +59,26 @@ class PassCycles(IntEnum): Size = auto() def display_name(self): - return ("NPU", "SRAM Access", "DRAM Access", "On-chip Flash Access", "Off-chip Flash Access", "Total", "Size",)[ - self.value - ] + return ( + "NPU", + "SRAM Access", + "DRAM Access", + "On-chip Flash Access", + "Off-chip Flash Access", + "Total", + "Size", + )[self.value] def identifier_name(self): - return ("npu", "sram_access", "dram_access", "on_chip_flash_access", "off_chip_flash_access", "total", "size",)[ - self.value - ] + return ( + "npu", + "sram_access", + "dram_access", + "on_chip_flash_access", + "off_chip_flash_access", + "total", + "size", + )[self.value] @staticmethod def all(): diff --git a/ethosu/vela/pass_packing.py b/ethosu/vela/pass_packing.py index 1fefdf42..8535fa06 100644 --- a/ethosu/vela/pass_packing.py +++ b/ethosu/vela/pass_packing.py @@ -87,7 +87,14 @@ quantization_ops = set((Op.Dequantize, Op.Max, Op.Min)) cpu_ops = set((Op.Softmax, Op.LRN, Op.Shape, Op.Pad, Op.AddN)) | quantization_ops startup_init_ops = set((Op.Const, Op.Placeholder, Op.SubgraphInput)) -memory_only_ops = set((Op.Squeeze, Op.Reshape, Op.QuantizedReshape, Op.ExpandDims,)) +memory_only_ops = set( + ( + Op.Squeeze, + Op.Reshape, + Op.QuantizedReshape, + Op.ExpandDims, + ) +) test_sequence = [ diff --git a/ethosu/vela/range_set.py b/ethosu/vela/range_set.py index f03174ed..6b282824 100644 --- a/ethosu/vela/range_set.py +++ b/ethosu/vela/range_set.py @@ -21,7 +21,7 @@ from functools import lru_cache class RangeSet: """A Range set class to track ranges and whether they intersect. -Intended for e.g. tracking sets of memory ranges and whether two commands use the same memory areas.""" + Intended for e.g. tracking sets of memory ranges and whether two commands use the same memory areas.""" def __init__(self, start=None, end=None, ranges=None): if ranges is None: diff --git a/ethosu/vela/register_command_stream_generator.py b/ethosu/vela/register_command_stream_generator.py index 3be2898c..be01a75b 100644 --- a/ethosu/vela/register_command_stream_generator.py +++ b/ethosu/vela/register_command_stream_generator.py @@ -521,7 +521,8 @@ def generate_biases(emit: CommandStreamEmitter, biases: List[NpuAddressRange], a def generate_block_config( - emit: CommandStreamEmitter, block_config: NpuShape3D, + emit: CommandStreamEmitter, + block_config: NpuShape3D, ): """Generates OFM_BLK_HEIGHT/WIDTH/DEPTH registers""" emit.cmd0_with_param(cmd0.NPU_SET_OFM_BLK_HEIGHT_M1, block_config.height - 1) @@ -530,7 +531,9 @@ def generate_block_config( def generate_shram_registers( - emit: CommandStreamEmitter, npu_op: NpuBlockOperation, arch_block_config: ArchitectureBlockConfig, + emit: CommandStreamEmitter, + npu_op: NpuBlockOperation, + arch_block_config: ArchitectureBlockConfig, ): """Generates IB_END/IB_START/AB_START/ACC_FORMAT registers""" emit.cmd0_with_param(cmd0.NPU_SET_IFM_IB_END, arch_block_config.layout.ib_end) @@ -775,9 +778,13 @@ def generate_scaling_for_elementwise(emit: CommandStreamEmitter, npu_op: NpuElem if use_advanced_scaling: # Use advanced implementation only when input/output scales differ, # or when we can't guarantee the absence of rounding errors - (opa_scale, opa_shift, ofm_scale, shift, op_to_scale,) = scaling.advanced_elementwise_add_sub_scale( - input_scale, input2_scale, output_scale, bitdepth - ) + ( + opa_scale, + opa_shift, + ofm_scale, + shift, + op_to_scale, + ) = scaling.advanced_elementwise_add_sub_scale(input_scale, input2_scale, output_scale, bitdepth) opb_scale = 0 # Unused for this case if npu_op.reversed_operands: # If the operand order is reversed we also have to swap which operand is scaled diff --git a/ethosu/vela/register_command_stream_util.py b/ethosu/vela/register_command_stream_util.py index 83126ead..b2c84d7c 100644 --- a/ethosu/vela/register_command_stream_util.py +++ b/ethosu/vela/register_command_stream_util.py @@ -204,7 +204,16 @@ def get_address_ranges(fm: NpuFeatureMap) -> List[Optional[NpuAddressRange]]: strides = get_strides(fm) height, width, depth = fm.shape.height, fm.shape.width, fm.shape.depth height_0, height_1, width_0 = fm.tiles.height_0, fm.tiles.height_1, fm.tiles.width_0 - t0 = get_address_range(fm, strides, 0, 0, 0, min(height, height_0) - 1, min(width, width_0) - 1, depth - 1,) + t0 = get_address_range( + fm, + strides, + 0, + 0, + 0, + min(height, height_0) - 1, + min(width, width_0) - 1, + depth - 1, + ) if width > width_0: t1 = get_address_range(fm, strides, 0, width_0, 0, min(height, height_1) - 1, width - 1, depth - 1) else: @@ -443,7 +452,9 @@ def get_first_job_input_volume( # IFM block that will be sampled for the FIRST+block_offset job in the next operator's OFM start_coord = PointXYZ(x=ifm_coord_x, y=ifm_coord_y, z=ifm_coord_z) end_coord = PointXYZ( - x=start_coord[0] + ifm_block.width, y=start_coord[1] + ifm_block.height, z=start_coord[2] + ifm_block.depth, + x=start_coord[0] + ifm_block.width, + y=start_coord[1] + ifm_block.height, + z=start_coord[2] + ifm_block.depth, ) return (start_coord, end_coord, 1) # start, end, total jobs @@ -456,12 +467,18 @@ def get_prev_job_output_volume(ofm: Rect, ofm_block: Block, block_offset: int): if start_coord is None: return None end_coord = PointXYZ( - x=start_coord.x + ofm_block.width, y=start_coord.y + ofm_block.height, z=start_coord.z + ofm_block.depth, + x=start_coord.x + ofm_block.width, + y=start_coord.y + ofm_block.height, + z=start_coord.z + ofm_block.depth, ) return (start_coord, end_coord, 1) # start, end, total jobs for this OFM block -def calc_blockdep(arch: ArchitectureFeatures, prev_op: Optional[NpuBlockOperation], npu_op: NpuBlockOperation,) -> int: +def calc_blockdep( + arch: ArchitectureFeatures, + prev_op: Optional[NpuBlockOperation], + npu_op: NpuBlockOperation, +) -> int: """Calculates the value of the BLOCKDEP register""" if prev_op is None: return 0 diff --git a/ethosu/vela/scheduler.py b/ethosu/vela/scheduler.py index fe2d711e..a19d0531 100644 --- a/ethosu/vela/scheduler.py +++ b/ethosu/vela/scheduler.py @@ -113,7 +113,13 @@ class SchedulerOpInfo: self.full_weight_transfer_cycles = 0 def copy(self): - res = SchedulerOpInfo(self.block_config, self.weights_size, self.stripe_input, self.stripe_input2, self.stripe,) + res = SchedulerOpInfo( + self.block_config, + self.weights_size, + self.stripe_input, + self.stripe_input2, + self.stripe, + ) res.cascade = self.cascade return res @@ -135,7 +141,10 @@ class SchedulerOptions: """Contains options for the Scheduler""" def __init__( - self, optimization_strategy, sram_target, verbose_schedule, + self, + optimization_strategy, + sram_target, + verbose_schedule, ): self.optimization_strategy = optimization_strategy self.optimization_sram_limit = sram_target @@ -175,15 +184,28 @@ class SchedulerOperation: ) self.ifm_ublock = arch.ifm_ublock - self.ifm = SchedulerTensor(ps.ifm_shapes[0], ps.ifm_tensor.dtype, ps.ifm_tensor.mem_area, ps.ifm_tensor.format,) + self.ifm = SchedulerTensor( + ps.ifm_shapes[0], + ps.ifm_tensor.dtype, + ps.ifm_tensor.mem_area, + ps.ifm_tensor.format, + ) self.ifm2 = None if ps.ifm2_tensor: self.ifm2 = SchedulerTensor( - ps.ifm_shapes[1], ps.ifm2_tensor.dtype, ps.ifm2_tensor.mem_area, ps.ifm2_tensor.format, + ps.ifm_shapes[1], + ps.ifm2_tensor.dtype, + ps.ifm2_tensor.mem_area, + ps.ifm2_tensor.format, ) - self.ofm = SchedulerTensor(ps.ofm_shapes[0], ps.ofm_tensor.dtype, ps.ofm_tensor.mem_area, ps.ofm_tensor.format,) + self.ofm = SchedulerTensor( + ps.ofm_shapes[0], + ps.ofm_tensor.dtype, + ps.ofm_tensor.mem_area, + ps.ofm_tensor.format, + ) # Input volume width and height required to produce the smallest possible stripe self.min_stripe_input_w, self.min_stripe_input_h = self._calculate_min_stripe_input() @@ -481,7 +503,11 @@ class Scheduler: lr_graph = live_range.LiveRangeGraph() for mem_area, mem_type_set in memories_list: live_range.extract_live_ranges_from_cascaded_passes( - self.nng.get_root_subgraph(), mem_area, mem_type_set, lr_graph, Tensor.AllocationQuantum, + self.nng.get_root_subgraph(), + mem_area, + mem_type_set, + lr_graph, + Tensor.AllocationQuantum, ) # Populate time-array with memory used by live ranges @@ -923,7 +949,11 @@ class Scheduler: return best_schedule def optimize_schedule( - self, schedule: Schedule, max_sched: Schedule, max_template: Schedule, options: SchedulerOptions, + self, + schedule: Schedule, + max_sched: Schedule, + max_template: Schedule, + options: SchedulerOptions, ) -> Schedule: """Extracts sub-schedules based on the cascades and optimizes them and applies them to the final schedule""" sram_limit = options.optimization_sram_limit @@ -994,7 +1024,11 @@ class Scheduler: lr_graph = live_range.LiveRangeGraph() for mem_area, mem_type_set in memories_list: live_range.extract_live_ranges_from_cascaded_passes( - self.nng.get_root_subgraph(), mem_area, mem_type_set, lr_graph, Tensor.AllocationQuantum, + self.nng.get_root_subgraph(), + mem_area, + mem_type_set, + lr_graph, + Tensor.AllocationQuantum, ) max_mem_usage = lr_graph.get_temporal_memory_usage(fast_storage_mem_area) @@ -1252,7 +1286,14 @@ def schedule_passes(nng: Graph, arch: ArchitectureFeatures, options, scheduler_o cascaded_passes = [] for idx, ps in enumerate(sg.passes): cps = CascadedPass( - ps.name, SchedulingStrategy.WeightStream, ps.inputs, [], ps.outputs, [ps], ps.placement, False, + ps.name, + SchedulingStrategy.WeightStream, + ps.inputs, + [], + ps.outputs, + [ps], + ps.placement, + False, ) cps.time = idx diff --git a/ethosu/vela/softmax.py b/ethosu/vela/softmax.py index 711c1e04..9565bc5c 100644 --- a/ethosu/vela/softmax.py +++ b/ethosu/vela/softmax.py @@ -300,11 +300,20 @@ class SoftMax: # PASS 5 - Sub headroom_offset = create_const_tensor( - "headroom_offset_const", [1, 1, 1, 1], DataType.int32, [12 + 31 - 8], np.int32, quantization=no_scale_quant, + "headroom_offset_const", + [1, 1, 1, 1], + DataType.int32, + [12 + 31 - 8], + np.int32, + quantization=no_scale_quant, ) right_shift = add_op_get_ofm( create_sub( - f"{self.op.name}_sub{pass_number}", headroom_offset, headroom_plus_one, no_scale_quant, activation, + f"{self.op.name}_sub{pass_number}", + headroom_offset, + headroom_plus_one, + no_scale_quant, + activation, ) ) @@ -329,7 +338,13 @@ class SoftMax: # PASS 9 - SHL shifted_sum_minus_one = add_op_get_ofm( - create_shl(f"{self.op.name}_shl{pass_number}", shifted_sum_minus_one, one, no_scale_quant, activation,) + create_shl( + f"{self.op.name}_shl{pass_number}", + shifted_sum_minus_one, + one, + no_scale_quant, + activation, + ) ) # PASS 10 - Add @@ -353,7 +368,11 @@ class SoftMax: ) rescaled = add_op_get_ofm( create_mul( - f"{self.op.name}_mul{pass_number}", half_denominator, neg_32_over_17, two_scale_quant, activation2, + f"{self.op.name}_mul{pass_number}", + half_denominator, + neg_32_over_17, + two_scale_quant, + activation2, ) ) @@ -362,7 +381,13 @@ class SoftMax: "48_over_17_const", [1, 1, 1, 1], DataType.int32, [1515870810], np.int32, quantization=no_scale_quant ) rescale_w_offset = add_op_get_ofm( - create_add(f"{self.op.name}_add{pass_number}", rescaled, const_48_over_17, one_scale_quant, activation,) + create_add( + f"{self.op.name}_add{pass_number}", + rescaled, + const_48_over_17, + one_scale_quant, + activation, + ) ) # PASS 13 - 27 @@ -376,12 +401,22 @@ class SoftMax: for _ in range(3): # PASS 13, 18, 23 - MUL half_denominator_times_x = add_op_get_ofm( - create_mul(f"{self.op.name}_mul{pass_number}", nr_x, half_denominator, two_scale_quant, activation2,) + create_mul( + f"{self.op.name}_mul{pass_number}", + nr_x, + half_denominator, + two_scale_quant, + activation2, + ) ) # PASS 14, 19, 24 - SUB one_minus_half_denominator_times_x = add_op_get_ofm( create_sub( - f"{self.op.name}_sub{pass_number}", F2_one, half_denominator_times_x, one_scale_quant, activation, + f"{self.op.name}_sub{pass_number}", + F2_one, + half_denominator_times_x, + one_scale_quant, + activation, ) ) # PASS 15, 20, 25 - MUL diff --git a/ethosu/vela/stats_writer.py b/ethosu/vela/stats_writer.py index d8a274b0..22605a6b 100644 --- a/ethosu/vela/stats_writer.py +++ b/ethosu/vela/stats_writer.py @@ -256,7 +256,8 @@ def print_performance_metrics_for_strat( label += " bandwidth" bandwidth = arch.memory_bandwidths_per_second[mem_area] / 1000.0 / 1000 / 1000 print( - f"Design peak {label:25} {bandwidth:12.2f} GB/s", file=f, + f"Design peak {label:25} {bandwidth:12.2f} GB/s", + file=f, ) print(file=f) for mem_area, label in mem_area_labels: @@ -302,7 +303,8 @@ def print_performance_metrics_for_strat( fm_bws = bws[TensorPurpose.FeatureMap] aug_label = label + " bandwidth" print( - f"Average {aug_label:25} {total_bw * midpoint_fps / 1000.0 / 1000.0 / 1000.0:12.2f} GB/s", file=f, + f"Average {aug_label:25} {total_bw * midpoint_fps / 1000.0 / 1000.0 / 1000.0:12.2f} GB/s", + file=f, ) print( f"Input {aug_label:25} {np.sum(fm_bws[BandwidthDirection.Read]) / 1000.0 / 1000.0:12.2f} MB/batch", @@ -328,10 +330,12 @@ def print_performance_metrics_for_strat( print(file=f) print( - f"Neural network macs {int(macs):12d} MACs/batch", file=f, + f"Neural network macs {int(macs):12d} MACs/batch", + file=f, ) print( - f"Network Tops/s {macs * 2 * midpoint_fps / 1e12:12.2f} Tops/s", file=f, + f"Network Tops/s {macs * 2 * midpoint_fps / 1e12:12.2f} Tops/s", + file=f, ) print(file=f) diff --git a/ethosu/vela/tensor_allocation.py b/ethosu/vela/tensor_allocation.py index c8b5129d..ab65740e 100644 --- a/ethosu/vela/tensor_allocation.py +++ b/ethosu/vela/tensor_allocation.py @@ -128,7 +128,12 @@ def print_allocation(lrs, mem_area, mem_type_set, tensor_allocator, sg, actual_m print("\n" + "#" * 80) sg_placement = ( sg.placement.name - if mem_type_set.intersection((MemType.Permanent_NPU, MemType.Permanent_CPU,)) + if mem_type_set.intersection( + ( + MemType.Permanent_NPU, + MemType.Permanent_CPU, + ) + ) else "Cpu and Npu" ) print( @@ -141,7 +146,15 @@ def print_allocation(lrs, mem_area, mem_type_set, tensor_allocator, sg, actual_m min_mem_usage_for_alloc = max(memory_hist) print("Start Time - End Time: Start Addr - End Addr: Tensor Size: Memory Usage: Tensor Purpose: Tensor Name") for start_time, end_time, size, start_addr, end_addr, purpose, name in sorted( - (lr.start_time, lr.end_time, lr.size, tens.address, tens.address + lr.size, tens.purpose, tens.name,) + ( + lr.start_time, + lr.end_time, + lr.size, + tens.address, + tens.address + lr.size, + tens.purpose, + tens.name, + ) for tens, lr in lrs.ranges.items() ): print( @@ -184,7 +197,11 @@ def allocate( ): # Allocates addresses to tensors, returns False if tensors could not be fit within max_size lrs = live_range.extract_live_ranges_from_cascaded_passes( - sg, mem_area, mem_type_set, lr_graph=lr_graph, cpu_tensor_alignment=cpu_tensor_alignment, + sg, + mem_area, + mem_type_set, + lr_graph=lr_graph, + cpu_tensor_alignment=cpu_tensor_alignment, ) total_sz = 0 if lrs.ranges: diff --git a/ethosu/vela/test/extapi/test_extapi_encode_weights.py b/ethosu/vela/test/extapi/test_extapi_encode_weights.py index 6367cb30..87c504f4 100644 --- a/ethosu/vela/test/extapi/test_extapi_encode_weights.py +++ b/ethosu/vela/test/extapi/test_extapi_encode_weights.py @@ -24,7 +24,8 @@ from ethosu.vela.api import NpuBlockTraversal @pytest.mark.parametrize( - "arch", list(NpuAccelerator), + "arch", + list(NpuAccelerator), ) @pytest.mark.parametrize("dilation_x", [1, 2]) @pytest.mark.parametrize("dilation_y", [1, 2]) @@ -32,7 +33,12 @@ from ethosu.vela.api import NpuBlockTraversal @pytest.mark.parametrize("depth_control", [1, 2, 3]) @pytest.mark.parametrize("weights_shape_and_block_depth", [((16, 16, 16, 16), 8), ((3, 3, 25, 16), 8)]) def test_encode_weights( - arch, weights_shape_and_block_depth, dilation_x, dilation_y, ifm_bitdepth, depth_control, + arch, + weights_shape_and_block_depth, + dilation_x, + dilation_y, + ifm_bitdepth, + depth_control, ): """ This unit test checks the interface of the API function but not the functionality. diff --git a/ethosu/vela/test/test_register_command_stream_util.py b/ethosu/vela/test/test_register_command_stream_util.py index 985523fa..86a48ff6 100644 --- a/ethosu/vela/test/test_register_command_stream_util.py +++ b/ethosu/vela/test/test_register_command_stream_util.py @@ -131,14 +131,34 @@ def test_calc_blockdep0(): op2 takes 1 block to complete, which results in blockdep 0 """ op1 = NpuElementWiseOperation(NpuElementWiseOp.CLZ) - op1.ifm = create_feature_map(NpuShape3D(height=1, width=1, depth=1), 1, 0x60, layout=NpuLayout.NHCWB16,) - intermediate_fm = create_feature_map(NpuShape3D(height=1, width=1, depth=1), 1, 0xA0, layout=NpuLayout.NHCWB16,) + op1.ifm = create_feature_map( + NpuShape3D(height=1, width=1, depth=1), + 1, + 0x60, + layout=NpuLayout.NHCWB16, + ) + intermediate_fm = create_feature_map( + NpuShape3D(height=1, width=1, depth=1), + 1, + 0xA0, + layout=NpuLayout.NHCWB16, + ) op1.ofm = intermediate_fm op1.block_config = NpuShape3D(height=1, width=1, depth=4) op2 = NpuElementWiseOperation(NpuElementWiseOp.SUB) - op2.ifm = create_feature_map(NpuShape3D(height=1, width=1, depth=1), 1, 0x39AC0, layout=NpuLayout.NHCWB16,) + op2.ifm = create_feature_map( + NpuShape3D(height=1, width=1, depth=1), + 1, + 0x39AC0, + layout=NpuLayout.NHCWB16, + ) op2.ifm2 = intermediate_fm - op2.ofm = create_feature_map(NpuShape3D(height=1, width=1, depth=1), 1, 0xE0, layout=NpuLayout.NHCWB16,) + op2.ofm = create_feature_map( + NpuShape3D(height=1, width=1, depth=1), + 1, + 0xE0, + layout=NpuLayout.NHCWB16, + ) op2.block_config = NpuShape3D(height=1, width=1, depth=4) arch = create_default_arch(Accelerator.Ethos_U55_128) block_dep = calc_blockdep(arch, op1, op2) @@ -153,8 +173,18 @@ def test_calc_blockdep2(): which results in blockdep 2 """ op1 = NpuConv2DOperation() - op1.ifm = create_feature_map(NpuShape3D(height=4, width=48, depth=8), 1, 0x4C80, layout=NpuLayout.NHCWB16,) - op1.ofm = create_feature_map(NpuShape3D(height=4, width=48, depth=16), 1, 0x6480, layout=NpuLayout.NHCWB16,) + op1.ifm = create_feature_map( + NpuShape3D(height=4, width=48, depth=8), + 1, + 0x4C80, + layout=NpuLayout.NHCWB16, + ) + op1.ofm = create_feature_map( + NpuShape3D(height=4, width=48, depth=16), + 1, + 0x6480, + layout=NpuLayout.NHCWB16, + ) op1.kernel = NpuKernel(1, 1) op1.weights = [NpuAddressRange(region=1, address=0x4AE0, length=208)] op1.biases = [NpuAddressRange(region=1, address=0x49A0, length=160)] @@ -162,10 +192,20 @@ def test_calc_blockdep2(): op1.block_traversal = NpuBlockTraversal.PART_KERNEL_FIRST op1.block_config = NpuShape3D(height=4, width=6, depth=16) op2 = NpuConvDepthWiseOperation() - op2.ifm = create_feature_map(NpuShape3D(height=3, width=48, depth=16), 1, 0, layout=NpuLayout.NHCWB16,) + op2.ifm = create_feature_map( + NpuShape3D(height=3, width=48, depth=16), + 1, + 0, + layout=NpuLayout.NHCWB16, + ) # op2 has two tiles, the lower tile is produced by op1 op2.ifm.tiles = NpuTileBox(height_0=2, height_1=2, width_0=48, addresses=[0x7680, 0, 0x6480, 0]) - op2.ofm = create_feature_map(NpuShape3D(height=1, width=24, depth=16), 1, 0x6480, layout=NpuLayout.NHCWB16,) + op2.ofm = create_feature_map( + NpuShape3D(height=1, width=24, depth=16), + 1, + 0x6480, + layout=NpuLayout.NHCWB16, + ) op2.kernel = NpuKernel(3, 3, stride_x=2, stride_y=2) op2.weights = [NpuAddressRange(region=1, address=0x4BB0, length=208)] op2.biases = [NpuAddressRange(region=1, address=0x4A40, length=160)] @@ -183,8 +223,18 @@ def test_calc_blockdep3(): which results in blockdep 3 """ op1 = NpuConv2DOperation() - op1.ifm = create_feature_map(NpuShape3D(height=13, width=96, depth=1), 1, 0, layout=NpuLayout.NHWC,) - op1.ofm = create_feature_map(NpuShape3D(height=6, width=48, depth=8), 1, 0x7C80, layout=NpuLayout.NHCWB16,) + op1.ifm = create_feature_map( + NpuShape3D(height=13, width=96, depth=1), + 1, + 0, + layout=NpuLayout.NHWC, + ) + op1.ofm = create_feature_map( + NpuShape3D(height=6, width=48, depth=8), + 1, + 0x7C80, + layout=NpuLayout.NHCWB16, + ) op1.kernel = NpuKernel(3, 3, stride_x=2, stride_y=2) op1.weights = [NpuAddressRange(region=1, address=0x4AE0, length=144)] op1.biases = [NpuAddressRange(region=1, address=0x49A0, length=80)] @@ -192,8 +242,18 @@ def test_calc_blockdep3(): op1.block_traversal = NpuBlockTraversal.PART_KERNEL_FIRST op1.block_config = NpuShape3D(height=6, width=3, depth=8) op2 = NpuConvDepthWiseOperation() - op2.ifm = create_feature_map(NpuShape3D(height=5, width=48, depth=8), 1, 0x7C80, layout=NpuLayout.NHCWB16,) - op2.ofm = create_feature_map(NpuShape3D(height=4, width=48, depth=8), 1, 0x4C80, layout=NpuLayout.NHCWB16,) + op2.ifm = create_feature_map( + NpuShape3D(height=5, width=48, depth=8), + 1, + 0x7C80, + layout=NpuLayout.NHCWB16, + ) + op2.ofm = create_feature_map( + NpuShape3D(height=4, width=48, depth=8), + 1, + 0x4C80, + layout=NpuLayout.NHCWB16, + ) op2.kernel = NpuKernel(3, 3) op2.weights = [NpuAddressRange(region=1, address=0x4BB0, length=112)] op2.biases = [NpuAddressRange(region=1, address=0x4A40, length=80)] diff --git a/ethosu/vela/test/test_tflite_model_semantic.py b/ethosu/vela/test/test_tflite_model_semantic.py index 84f99160..1e5dbd4d 100644 --- a/ethosu/vela/test/test_tflite_model_semantic.py +++ b/ethosu/vela/test/test_tflite_model_semantic.py @@ -128,7 +128,14 @@ def test_constraint_quant_scale_inf(): def test_constraint_ofm_scale_too_small(): # Tests handling of OFM scale < 1e-38 shp = [1, 10, 20, 16] - op = testutil.create_elemwise_op(Op.Mul, "mul", shp, shp, shp, ofm_quant=testutil.default_quant_params(),) + op = testutil.create_elemwise_op( + Op.Mul, + "mul", + shp, + shp, + shp, + ofm_quant=testutil.default_quant_params(), + ) assert semantic_checker.is_operator_semantic_valid(op) op.ofm.quantization.scale_f32 = 1e-43 assert not semantic_checker.is_operator_semantic_valid(op) @@ -245,7 +252,12 @@ def create_strided_slice_op(in_shape, out_shape, start_offsets, end_offsets): def create_pad_op( - in_shape, out_shape, padding, in_dtype=DataType.int8, out_dtype=DataType.int8, pad_dtype=DataType.int32, + in_shape, + out_shape, + padding, + in_dtype=DataType.int8, + out_dtype=DataType.int8, + pad_dtype=DataType.int32, ): qp = testutil.default_quant_params() in0 = Tensor(in_shape, in_dtype, "in") @@ -259,7 +271,11 @@ def create_pad_op( def test_constraint_pad_input_count(): # Incorrect number of input tensors (2) - op = create_pad_op(in_shape=[1, 1, 1, 1], out_shape=[1, 3, 3, 1], padding=[[0, 0], [1, 1], [1, 1], [0, 0]],) + op = create_pad_op( + in_shape=[1, 1, 1, 1], + out_shape=[1, 3, 3, 1], + padding=[[0, 0], [1, 1], [1, 1], [0, 0]], + ) assert semantic_checker.is_operator_semantic_valid(op) op.add_input_tensor(op.inputs[0].clone()) assert not semantic_checker.is_operator_semantic_valid(op) diff --git a/ethosu/vela/test/test_tflite_supported_operators.py b/ethosu/vela/test/test_tflite_supported_operators.py index e3db7913..04d3cba1 100644 --- a/ethosu/vela/test/test_tflite_supported_operators.py +++ b/ethosu/vela/test/test_tflite_supported_operators.py @@ -345,7 +345,12 @@ def test_constraint_concat_pass(): def create_pad_op( - in_shape, out_shape, padding, in_dtype=DataType.int8, out_dtype=DataType.int8, pad_dtype=DataType.int32, + in_shape, + out_shape, + padding, + in_dtype=DataType.int8, + out_dtype=DataType.int8, + pad_dtype=DataType.int32, ): qp = testutil.default_quant_params() in0 = Tensor(in_shape, in_dtype, "in") @@ -359,11 +364,23 @@ def create_pad_op( def test_constraint_padded_dimensions(): # Incorrect padding dimensions, can only pad width and height - op = create_pad_op(in_shape=[1, 1, 1, 1], out_shape=[1, 3, 3, 1], padding=[[1, 1], [1, 1], [1, 1], [0, 0]],) + op = create_pad_op( + in_shape=[1, 1, 1, 1], + out_shape=[1, 3, 3, 1], + padding=[[1, 1], [1, 1], [1, 1], [0, 0]], + ) assert not support.is_operator_supported(op) - op = create_pad_op(in_shape=[1, 1, 1, 1], out_shape=[1, 3, 3, 1], padding=[[1, 1], [1, 1], [0, 0]],) + op = create_pad_op( + in_shape=[1, 1, 1, 1], + out_shape=[1, 3, 3, 1], + padding=[[1, 1], [1, 1], [0, 0]], + ) assert support.is_operator_supported(op) - op = create_pad_op(in_shape=[1, 1, 1, 1], out_shape=[1, 3, 3, 1], padding=[[1, 1], [1, 1], [0, 1]],) + op = create_pad_op( + in_shape=[1, 1, 1, 1], + out_shape=[1, 3, 3, 1], + padding=[[1, 1], [1, 1], [0, 1]], + ) assert not support.is_operator_supported(op) @@ -371,12 +388,20 @@ def test_constraint_pad_shape(): # PAD operator must be of shape (3,2) or (4,2) op = create_pad_op(in_shape=[1, 1, 1, 1], out_shape=[1, 3, 3, 1], padding=[[1, 1], [1, 1], [0, 0]]) assert support.is_operator_supported(op) - op = create_pad_op(in_shape=[1, 1, 1, 1], out_shape=[1, 3, 3, 1], padding=[[0, 0], [1, 1], [1, 1], [0, 0], [0, 0]],) + op = create_pad_op( + in_shape=[1, 1, 1, 1], + out_shape=[1, 3, 3, 1], + padding=[[0, 0], [1, 1], [1, 1], [0, 0], [0, 0]], + ) assert not support.is_operator_supported(op) def test_constraint_pad_none(): - op = create_pad_op(in_shape=[1, 1, 1, 1], out_shape=[1, 3, 3, 1], padding=[],) + op = create_pad_op( + in_shape=[1, 1, 1, 1], + out_shape=[1, 3, 3, 1], + padding=[], + ) assert not support.is_operator_supported(op) diff --git a/ethosu/vela/tflite_graph_optimiser.py b/ethosu/vela/tflite_graph_optimiser.py index fb8a08c0..88d58a32 100644 --- a/ethosu/vela/tflite_graph_optimiser.py +++ b/ethosu/vela/tflite_graph_optimiser.py @@ -512,7 +512,10 @@ def add_padding_fields(op, arch, nng): ) else: padding, skirt = calc_padding_and_skirt( - op.attrs["padding"], op.kernel, input_shape, op.attrs.get("explicit_padding"), + op.attrs["padding"], + op.kernel, + input_shape, + op.attrs.get("explicit_padding"), ) op.attrs["explicit_padding"] = padding @@ -642,11 +645,11 @@ def convert_softmax(op, arch, nng): def convert_mul_max_to_abs_or_lrelu(op, arch, nng): r"""Whenever there is a subgraph with this topology: - Input X For X = -1 or X > 0 - | \ / This subgraph can be replaced with either - | Mul an Abs (if X = -1) or a LeakyReLU (if X > 0) - | / - Max + Input X For X = -1 or X > 0 + | \ / This subgraph can be replaced with either + | Mul an Abs (if X = -1) or a LeakyReLU (if X > 0) + | / + Max """ if op.type == Op.Maximum: @@ -1246,7 +1249,12 @@ def convert_mean_to_depthwise_conv_or_avgpool(op, arch, nng): quant = QuantizationParameters() quant.zero_point = 0 bias_term_tens = create_const_tensor( - op.name + "_bias", [1, 1, 1, 1], DataType.int16, [bias_term], np.int16, quantization=quant, + op.name + "_bias", + [1, 1, 1, 1], + DataType.int16, + [bias_term], + np.int16, + quantization=quant, ) add_op.add_input_tensor(bias_term_tens) add_op.set_output_tensor(op.ofm) @@ -1370,7 +1378,12 @@ def convert_mean_to_depthwise_conv_or_avgpool(op, arch, nng): bias_shape = [shape[-1]] op.set_input_tensor( create_const_tensor( - "bias", bias_shape, inp.dtype, np.ones(bias_shape) * bias, value_dtype=np.int32, quantization=None, + "bias", + bias_shape, + inp.dtype, + np.ones(bias_shape) * bias, + value_dtype=np.int32, + quantization=None, ), 2, ) @@ -1392,7 +1405,12 @@ def tflite_optimise_graph(nng, arch): for idx, sg in enumerate(nng.subgraphs): nng.subgraphs[idx] = rewrite_graph.rewrite_graph_pre_order( - nng, sg, arch, [], pre_process_list, rewrite_unsupported=False, + nng, + sg, + arch, + [], + pre_process_list, + rewrite_unsupported=False, ) # Handle Concat Ops @@ -1413,13 +1431,23 @@ def tflite_optimise_graph(nng, arch): for idx, sg in enumerate(nng.subgraphs): nng.subgraphs[idx] = rewrite_graph.rewrite_graph_pre_order( - nng, sg, arch, [rewrite_split_ops], [], rewrite_unsupported=False, + nng, + sg, + arch, + [rewrite_split_ops], + [], + rewrite_unsupported=False, ) # Handle sg input output for idx, sg in enumerate(nng.subgraphs): nng.subgraphs[idx] = rewrite_graph.rewrite_graph_pre_order( - nng, sg, arch, [], [fix_sg_input_output], rewrite_unsupported=False, + nng, + sg, + arch, + [], + [fix_sg_input_output], + rewrite_unsupported=False, ) # Removal of memory only operators @@ -1452,7 +1480,12 @@ def tflite_optimise_graph(nng, arch): for idx, sg in enumerate(nng.subgraphs): nng.subgraphs[idx] = rewrite_graph.rewrite_graph_pre_order( - nng, sg, arch, [], op_rewrite_list, rewrite_unsupported=False, + nng, + sg, + arch, + [], + op_rewrite_list, + rewrite_unsupported=False, ) for idx, sg in enumerate(nng.subgraphs): diff --git a/ethosu/vela/tflite_mapping.py b/ethosu/vela/tflite_mapping.py index 2a70b06c..a5f7fa2b 100644 --- a/ethosu/vela/tflite_mapping.py +++ b/ethosu/vela/tflite_mapping.py @@ -669,7 +669,10 @@ builtin_operator_map = { BuiltinOperator.DIV: (Op.Div, OptionsSerializer("DivOptions", (fused_act,)), TFLITE_NO_INDICES), BuiltinOperator.SQUEEZE: ( Op.Squeeze, - OptionsSerializer("SqueezeOptions", (("squeeze_dims", is_int_vec),),), + OptionsSerializer( + "SqueezeOptions", + (("squeeze_dims", is_int_vec),), + ), TFLITE_IFM_INDICES, ), BuiltinOperator.UNIDIRECTIONAL_SEQUENCE_LSTM: ( @@ -915,7 +918,13 @@ builtin_operator_map = { ), BuiltinOperator.VAR_HANDLE: ( Op.VarHandle, - OptionsSerializer("VarHandleOptions", ("container", "shared_name",),), + OptionsSerializer( + "VarHandleOptions", + ( + "container", + "shared_name", + ), + ), TFLITE_NO_INDICES, ), BuiltinOperator.READ_VARIABLE: (Op.ReadVariable, OptionsSerializer("ReadVariableOptions"), TFLITE_NO_INDICES), @@ -923,7 +932,13 @@ builtin_operator_map = { BuiltinOperator.BROADCAST_ARGS: (Op.BroadcastArgs, None, TFLITE_NO_INDICES), BuiltinOperator.RANDOM_STANDARD_NORMAL: ( Op.RandomStandardNormal, - OptionsSerializer("RandomOptions", ("seed", "seed2",),), + OptionsSerializer( + "RandomOptions", + ( + "seed", + "seed2", + ), + ), TFLITE_NO_INDICES, ), BuiltinOperator.CUSTOM: (Op.Custom, CustomOptionsSerializer(), TFLITE_NO_INDICES), diff --git a/ethosu/vela/tflite_model_semantic.py b/ethosu/vela/tflite_model_semantic.py index 3b7f248a..b2644791 100644 --- a/ethosu/vela/tflite_model_semantic.py +++ b/ethosu/vela/tflite_model_semantic.py @@ -41,7 +41,13 @@ def _optype_formatter(op_list): class TFLiteSemantic: # Categorised lists of operators - convolution_ops = set((Op.Conv2DBias, Op.Conv2D, Op.QuantizedConv2D,)) + convolution_ops = set( + ( + Op.Conv2DBias, + Op.Conv2D, + Op.QuantizedConv2D, + ) + ) depthwise_convolution_ops = set((Op.DepthwiseConv2DBias,)) transpose_convolution_ops = set((Op.Conv2DBackpropInput,)) convolution_like_ops = convolution_ops | depthwise_convolution_ops | transpose_convolution_ops @@ -49,13 +55,36 @@ class TFLiteSemantic: avg_pooling_ops = Op.op_set(Op.is_avgpool_op) pooling_ops = set((Op.ReduceSum,)) | max_pooling_ops | avg_pooling_ops unary_elem_wise_main_ops = Op.op_set(Op.is_unary_elementwise_op) - binary_elem_wise_min_max_ops = set((Op.Minimum, Op.Maximum,)) - binary_elem_wise_shift_ops = set((Op.SHL, Op.SHR,)) - binary_elem_wise_add_mul_sub = set((Op.Add, Op.Mul, Op.Sub,)) + binary_elem_wise_min_max_ops = set( + ( + Op.Minimum, + Op.Maximum, + ) + ) + binary_elem_wise_shift_ops = set( + ( + Op.SHL, + Op.SHR, + ) + ) + binary_elem_wise_add_mul_sub = set( + ( + Op.Add, + Op.Mul, + Op.Sub, + ) + ) binary_elem_wise_main_ops = binary_elem_wise_min_max_ops | binary_elem_wise_add_mul_sub | binary_elem_wise_shift_ops elem_wise_main_ops = binary_elem_wise_main_ops | unary_elem_wise_main_ops shapeless_input_ops = binary_elem_wise_main_ops | set((Op.Split, Op.SplitV, Op.Mean, Op.ExpandDims)) - reshape_ops = set((Op.Reshape, Op.QuantizedReshape, Op.Squeeze, Op.ExpandDims,)) + reshape_ops = set( + ( + Op.Reshape, + Op.QuantizedReshape, + Op.Squeeze, + Op.ExpandDims, + ) + ) def __init__(self): # Setup the generic constraints. Note: the order matters diff --git a/ethosu/vela/tflite_supported_operators.py b/ethosu/vela/tflite_supported_operators.py index 4d826770..6328a4e5 100644 --- a/ethosu/vela/tflite_supported_operators.py +++ b/ethosu/vela/tflite_supported_operators.py @@ -40,7 +40,13 @@ def _optype_formatter(op_list): class TFLiteSupportedOperators: # Categorised lists of supported operators npu_pre_ops = set((Op.SplitSliceRead,)) - convolution_ops = set((Op.Conv2DBias, Op.Conv2D, Op.QuantizedConv2D,)) + convolution_ops = set( + ( + Op.Conv2DBias, + Op.Conv2D, + Op.QuantizedConv2D, + ) + ) depthwise_convolution_ops = set((Op.DepthwiseConv2DBias,)) transpose_convolution_ops = set((Op.Conv2DBackpropInput,)) convolution_like_ops = convolution_ops | depthwise_convolution_ops | transpose_convolution_ops @@ -48,7 +54,13 @@ class TFLiteSupportedOperators: avg_pooling_ops = Op.op_set(Op.is_avgpool_op) pooling_ops = set((Op.ReduceSum,)) | max_pooling_ops | avg_pooling_ops resizing_ops = set((Op.ResizeBilinear,)) - fc_vector_products = set((Op.QuantizedMatMul, Op.MatMul, Op.FullyConnected,)) + fc_vector_products = set( + ( + Op.QuantizedMatMul, + Op.MatMul, + Op.FullyConnected, + ) + ) mac_main_ops = ( # RNN/LSTM/GRU set((Op.BlockLSTM,)) @@ -64,17 +76,47 @@ class TFLiteSupportedOperators: | set((Op.Mean,)) ) unary_elem_wise_main_ops = Op.op_set(Op.is_unary_elementwise_op) - binary_elem_wise_min_max_ops = set((Op.Minimum, Op.Maximum,)) - binary_elem_wise_shift_ops = set((Op.SHL, Op.SHR,)) - binary_elem_wise_add_mul_sub = set((Op.Add, Op.Mul, Op.Sub,)) + binary_elem_wise_min_max_ops = set( + ( + Op.Minimum, + Op.Maximum, + ) + ) + binary_elem_wise_shift_ops = set( + ( + Op.SHL, + Op.SHR, + ) + ) + binary_elem_wise_add_mul_sub = set( + ( + Op.Add, + Op.Mul, + Op.Sub, + ) + ) binary_elem_wise_main_ops = binary_elem_wise_min_max_ops | binary_elem_wise_add_mul_sub | binary_elem_wise_shift_ops elem_wise_main_ops = binary_elem_wise_main_ops | unary_elem_wise_main_ops pad_ops = set((Op.Pad,)) supported_int32_tensor_ops = ( - set((Op.ReduceSum, Op.CLZ,)) | binary_elem_wise_add_mul_sub | binary_elem_wise_shift_ops + set( + ( + Op.ReduceSum, + Op.CLZ, + ) + ) + | binary_elem_wise_add_mul_sub + | binary_elem_wise_shift_ops ) - relu_ops = set((Op.Relu, Op.Relu6, Op.ReluN1To1, Op.Clip,)) + relu_ops = set( + ( + Op.Relu, + Op.Relu6, + Op.ReluN1To1, + Op.Clip, + ) + ) activation_ops = relu_ops | set((Op.Tanh, Op.Sigmoid, Op.Softmax, Op.HardSwish)) npu_post_ops = ( # activation functions @@ -84,11 +126,44 @@ class TFLiteSupportedOperators: # Quantization | set((Op.Quantize,)) ) - split_ops = set((Op.Split, Op.SplitV, Op.StridedSlice, Op.Slice, Op.UnpackReshaped, Op.Unpack,)) - concat_ops = set((Op.Concat, Op.ConcatTFLite, Op.PackReshaped, Op.Pack,)) - memory_only_ops = set((Op.Reshape, Op.QuantizedReshape, Op.Squeeze, Op.ExpandDims,)) | concat_ops | split_ops + split_ops = set( + ( + Op.Split, + Op.SplitV, + Op.StridedSlice, + Op.Slice, + Op.UnpackReshaped, + Op.Unpack, + ) + ) + concat_ops = set( + ( + Op.Concat, + Op.ConcatTFLite, + Op.PackReshaped, + Op.Pack, + ) + ) + memory_only_ops = ( + set( + ( + Op.Reshape, + Op.QuantizedReshape, + Op.Squeeze, + Op.ExpandDims, + ) + ) + | concat_ops + | split_ops + ) per_axis_quant_ops = convolution_like_ops # per-axis/channel quantization only currently supported for conv ops - supported_fused_activations = relu_ops | set((Op.Tanh, Op.Sigmoid, Op.LUT,)) + supported_fused_activations = relu_ops | set( + ( + Op.Tanh, + Op.Sigmoid, + Op.LUT, + ) + ) supported_operators = npu_pre_ops | mac_main_ops | elem_wise_main_ops | pad_ops | npu_post_ops | memory_only_ops # Supported data types supported_op_dtypes = set((DataType.uint8, DataType.int8, DataType.int16, DataType.int32)) @@ -441,7 +516,7 @@ class TFLiteSupportedOperators: @staticmethod def constraint_tconv_valid(op): """VALID padding: OFM dimensions must equal IFM dimensions multiplied by stride, - minus difference between kernel size and stride""" + minus difference between kernel size and stride""" if op.attrs["padding"] == Padding.VALID: s_w = op.kernel.stride.x s_h = op.kernel.stride.y diff --git a/ethosu/vela/tosa_graph_optimiser.py b/ethosu/vela/tosa_graph_optimiser.py index 9e72a6c1..778aa2ac 100644 --- a/ethosu/vela/tosa_graph_optimiser.py +++ b/ethosu/vela/tosa_graph_optimiser.py @@ -876,7 +876,12 @@ def tosa_optimise_graph(nng, arch): # TODO the supported operator checking need to be split in semantic and HW checks for idx, sg in enumerate(nng.subgraphs): nng.subgraphs[idx] = rewrite_graph.rewrite_graph_pre_order( - nng, sg, arch, [], [supported_operator_check], rewrite_unsupported=False, + nng, + sg, + arch, + [], + [supported_operator_check], + rewrite_unsupported=False, ) # Decomposing and rewrite of concat @@ -893,7 +898,12 @@ def tosa_optimise_graph(nng, arch): # Handle sg input output for idx, sg in enumerate(nng.subgraphs): nng.subgraphs[idx] = rewrite_graph.rewrite_graph_pre_order( - nng, sg, arch, [], [fix_sg_input_output_tosa], rewrite_unsupported=True, + nng, + sg, + arch, + [], + [fix_sg_input_output_tosa], + rewrite_unsupported=True, ) # Removal of reshapes @@ -909,19 +919,34 @@ def tosa_optimise_graph(nng, arch): for idx, sg in enumerate(nng.subgraphs): nng.subgraphs[idx] = rewrite_graph.rewrite_graph_pre_order( - nng, sg, arch, [], [set_ifm_ofm_op_shapes], rewrite_unsupported=False, + nng, + sg, + arch, + [], + [set_ifm_ofm_op_shapes], + rewrite_unsupported=False, ) # Removal of Transpose for idx, sg in enumerate(nng.subgraphs): nng.subgraphs[idx] = rewrite_graph.rewrite_graph_pre_order( - nng, sg, arch, [], [remove_const_transpose], rewrite_unsupported=False, + nng, + sg, + arch, + [], + [remove_const_transpose], + rewrite_unsupported=False, ) # TODO, when and where to best handle calc_scaling_avgpool for idx, sg in enumerate(nng.subgraphs): nng.subgraphs[idx] = rewrite_graph.rewrite_graph_pre_order( - nng, sg, arch, [], [calc_scaling_avgpool], rewrite_unsupported=False, + nng, + sg, + arch, + [], + [calc_scaling_avgpool], + rewrite_unsupported=False, ) # Rewite Operators step @@ -929,13 +954,22 @@ def tosa_optimise_graph(nng, arch): for idx, sg in enumerate(nng.subgraphs): nng.subgraphs[idx] = rewrite_graph.rewrite_graph_pre_order( - nng, sg, arch, [], op_rewrite_list, rewrite_unsupported=False, + nng, + sg, + arch, + [], + op_rewrite_list, + rewrite_unsupported=False, ) # Post-processing step 1 for idx, sg in enumerate(nng.subgraphs): nng.subgraphs[idx] = rewrite_graph.rewrite_graph_pre_order( - nng, sg, arch, [], [rewrite_activation, add_padding_fields], + nng, + sg, + arch, + [], + [rewrite_activation, add_padding_fields], ) # Removal of Slice, need to be done after optimisation has been performed, @@ -946,6 +980,12 @@ def tosa_optimise_graph(nng, arch): # Post-processing step 2 for idx, sg in enumerate(nng.subgraphs): - nng.subgraphs[idx] = rewrite_graph.rewrite_graph_pre_order(nng, sg, arch, [], [fixup_quantization],) + nng.subgraphs[idx] = rewrite_graph.rewrite_graph_pre_order( + nng, + sg, + arch, + [], + [fixup_quantization], + ) return nng diff --git a/ethosu/vela/tosa_supported_operators.py b/ethosu/vela/tosa_supported_operators.py index e3785113..15e15695 100644 --- a/ethosu/vela/tosa_supported_operators.py +++ b/ethosu/vela/tosa_supported_operators.py @@ -38,11 +38,30 @@ class TosaSupportedOperators: fc_vector_products = set((Op.FullyConnected,)) mac_main_ops = convolution_like_ops | pooling_ops | fc_vector_products - memory_only_ops = set((Op.Reshape, Op.Transpose, Op.Concat, Op.SplitSliceRead,)) - binary_elem_wise_add_mul_sub = set((Op.Add, Op.Mul, Op.RescaleMul, Op.Sub,)) + memory_only_ops = set( + ( + Op.Reshape, + Op.Transpose, + Op.Concat, + Op.SplitSliceRead, + ) + ) + binary_elem_wise_add_mul_sub = set( + ( + Op.Add, + Op.Mul, + Op.RescaleMul, + Op.Sub, + ) + ) elem_wise_ops = binary_elem_wise_add_mul_sub type_conversion_ops = set((Op.Rescale,)) - relu_ops = set((Op.Clamp, Op.ReluN,)) + relu_ops = set( + ( + Op.Clamp, + Op.ReluN, + ) + ) activation_ops = relu_ops | set((Op.Table,)) pad_ops = set((Op.Pad,)) diff --git a/ethosu/vela/weight_compressor.py b/ethosu/vela/weight_compressor.py index cdac6416..78c43511 100644 --- a/ethosu/vela/weight_compressor.py +++ b/ethosu/vela/weight_compressor.py @@ -42,7 +42,8 @@ from ethosu import mlw_codec # Contains meta info for a weight compression. If two tensors have identical weight compression config, # then they also will have identical compressed weights. WeightCompressionConfig = namedtuple( - "WeightCompressionConfig", ["npu_block_type", "ofm_block_depth", "ofm_depth_step", "dilation", "weight_value_id"], + "WeightCompressionConfig", + ["npu_block_type", "ofm_block_depth", "ofm_depth_step", "dilation", "weight_value_id"], ) ScaleCompressionConfig = namedtuple("ScaleCompressionConfig", ["scale_value_id", "ifm_scale", "ofm_scale"]) -- cgit v1.2.1