diff options
-rw-r--r-- | SUPPORTED_OPS.md | 12 | ||||
-rw-r--r-- | ethosu/vela/graph_optimiser_util.py | 39 | ||||
-rw-r--r-- | ethosu/vela/lut.py | 114 | ||||
-rw-r--r-- | ethosu/vela/operation.py | 2 | ||||
-rw-r--r-- | ethosu/vela/tflite_graph_optimiser.py | 18 | ||||
-rw-r--r-- | ethosu/vela/tflite_mapping.py | 2 | ||||
-rw-r--r-- | ethosu/vela/tflite_model_semantic.py | 3 | ||||
-rw-r--r-- | ethosu/vela/tosa_graph_optimiser.py | 2 |
8 files changed, 148 insertions, 44 deletions
diff --git a/SUPPORTED_OPS.md b/SUPPORTED_OPS.md index ab9b0096..80647f84 100644 --- a/SUPPORTED_OPS.md +++ b/SUPPORTED_OPS.md @@ -1,7 +1,7 @@ # Supported Ops This file was automatically generated by Vela using the `--supported-ops-report` parameter. -Vela version: `3.7.1.dev17+g7b3008a.d20230420` +Vela version: `3.7.1.dev23+g3734897.d20230427` This file complies with [**Gitiles Markdown syntax**](https://github.com/google/gitiles/blob/master/Documentation/markdown.md) @@ -25,6 +25,7 @@ Please check the supported operator list for your chosen runtime for further inf | CONCATENATION | [Generic](#tflite-generic-constraints), [Specific](#tflite-concatenation-constraints) | | CONV_2D | [Generic](#tflite-generic-constraints), [Specific](#tflite-conv_2d-constraints) | | DEPTHWISE_CONV_2D | [Generic](#tflite-generic-constraints), [Specific](#tflite-depthwise_conv_2d-constraints) | +| EXP | [Generic](#tflite-generic-constraints), [Specific](#tflite-exp-constraints) | | EXPAND_DIMS | [Generic](#tflite-generic-constraints), [Specific](#tflite-expand_dims-constraints) | | FULLY_CONNECTED | [Generic](#tflite-generic-constraints), [Specific](#tflite-fully_connected-constraints) | | HARD_SWISH | [Generic](#tflite-generic-constraints), [Specific](#tflite-hard_swish-constraints) | @@ -63,6 +64,7 @@ Please check the supported operator list for your chosen runtime for further inf This is a list of constraints most NPU operators must satisfy in order to be scheduled on the NPU. (Operators excluded from certain constraints are shown in brackets [ ] ) +- All required operator attributes must be specified - Input(s) and Output tensors must not be dynamic - [QUANTIZE] - Input(s) and Output tensors must have a defined shape - Output tensors cannot be scalar - [QUANTIZE] @@ -161,6 +163,14 @@ This is a list of constraints that the DEPTHWISE_CONV_2D operator must satisfy i - Stride values for both width and height must be between 1 and 3 - For depth multipliers > 1, IFM channels must be 1 and OFM channels must be equal to the depth multiplier +### TFLite EXP Constraints + +This is a list of constraints that the EXP operator must satisfy in order to be scheduled on the NPU. + +- At least one Input's shape must match the OFM's shape +- IFM and OFM data types must match +- IFM must be int8 or int16 + ### TFLite EXPAND_DIMS Constraints This is a list of constraints that the EXPAND_DIMS operator must satisfy in order to be scheduled on the NPU. diff --git a/ethosu/vela/graph_optimiser_util.py b/ethosu/vela/graph_optimiser_util.py index 82790364..da3fe138 100644 --- a/ethosu/vela/graph_optimiser_util.py +++ b/ethosu/vela/graph_optimiser_util.py @@ -20,7 +20,6 @@ from typing import Tuple import numpy as np -from . import lut from .architecture_features import Accelerator from .data_type import DataType from .debug_database import DebugDatabase @@ -29,8 +28,6 @@ from .errors import VelaError from .operation import Op from .operation_util import create_avgpool_nop from .shape4d import Shape4D -from .tensor import create_const_tensor -from .tensor import QuantizationParameters from .tensor import Tensor memory_only_ops = ( @@ -329,42 +326,6 @@ def convert_depthwise_to_conv(op, arch, nng): return op -def convert_to_lut(op, lut_values, lut_name): - # Rewrite the operation by Add with scalar 0 + LUT activation - ifm = op.ifm - ofm = op.ofm - if ifm is None: - return op - assert ifm.dtype.size_in_bytes() == 1 - op.type = Op.Add - op.name = op.name + "_lut_" + lut_name - # Mark as no-op to enable potential fusing optimizations - op.attrs["is_nop"] = True - # Create an input tensor containing scalar zero - quantization = QuantizationParameters(0.0, 255.0) - quantization.scale_f32 = ifm.quantization.scale_f32 - quantization.zero_point = 0 - tens = create_const_tensor(ifm.name + "_scalar0", [], ifm.dtype, [0], quantization=quantization) - op.add_input_tensor(tens) - op.ifm_shapes.append(Shape4D(tens.shape)) # TODO no shape? - - # The LUT must be applied without any preceding rescaling (the LUT itself performs the rescale), - # so even if the OFM has a different scale than the IFM, the generated OFM scale instructions - # should be the same as the IFM - op.forced_output_quantization = ifm.quantization - - # the lut tensor datatype needs to match both; the ofm datatype, because these are the values output; and the - # datatype used to generate the lut values (which is probably the ifm datatype), because we want to avoid any - # potential overflow errors in create_lut_tensor() caused by converting Python int (which could represent a uint) - # to NumPy int. this can be guaranteed by checking that the ifm and ofm datatypes are the same - assert ifm.dtype == ofm.dtype - lut_tensor = lut.create_lut_tensor(op.name + "_values", lut_values, ofm.dtype) - op.set_activation_lut(lut_tensor) - op.set_ifm_ofm_shapes() - DebugDatabase.add_optimised(op, op) - return op - - def create_avg_pool_for_concat(concat_op, name, ifm, ifm_shape: Shape4D, write_offset: Shape4D): """Creates an average pool for the given concat op/input feature map""" ofm = concat_op.ofm diff --git a/ethosu/vela/lut.py b/ethosu/vela/lut.py index d0ac9706..c8fb7bc0 100644 --- a/ethosu/vela/lut.py +++ b/ethosu/vela/lut.py @@ -21,10 +21,15 @@ import uuid import numpy as np from . import numeric_util +from .data_type import DataType +from .debug_database import DebugDatabase from .high_level_command_stream import DMA from .high_level_command_stream import NpuStripe +from .numeric_util import round_away_zero +from .operation import Op from .tensor import create_const_tensor from .tensor import create_equivalence_id +from .tensor import QuantizationParameters from .tensor import TensorPurpose @@ -88,6 +93,8 @@ def create_lut_tensor(name, values, dtype): # address in constant memory, and unnecessary DMA operations can be avoided. sz = len(values) assert sz in (256, 512) + # int16 lut uses uint32 lut with base + slope + dtype = DataType.uint32 if dtype == DataType.int16 else dtype tens = create_const_tensor(name, [1, 1, 1, sz], dtype, values, TensorPurpose.LUT) tens.equivalence_id = create_equivalence_id(tuple(values)) return tens @@ -128,3 +135,110 @@ def optimize_high_level_cmd_stream(sg, arch): lut_state = lut_state.put(lut_tens) cmd_stream.append(cmd) sg.high_level_command_stream = cmd_stream + + +def convert_to_lut(op, lut_values, lut_name): + # Rewrite the operation by Add with scalar 0 + LUT activation + ifm = op.ifm + ofm = op.ofm + if ifm is None: + return op + assert ifm.dtype in (DataType.int8, DataType.uint8, DataType.int16) + op.type = Op.Add + op.name = f"{op.name}_lut_{lut_name}" + # Mark as no-op to enable potential fusing optimizations + op.attrs["is_nop"] = True + # Create an input tensor containing scalar zero + _max = 65536.0 if ifm.dtype == DataType.int16 else 255.0 + quantization = QuantizationParameters(0.0, _max) + quantization.scale_f32 = ifm.quantization.scale_f32 + quantization.zero_point = 0 + tens = create_const_tensor(ifm.name + "_scalar0", [], ifm.dtype, [0], quantization=quantization) + op.add_input_tensor(tens) + + # The LUT must be applied without any preceding rescaling (the LUT itself performs the rescale), + # so even if the OFM has a different scale than the IFM, the generated OFM scale instructions + # should be the same as the IFM + op.forced_output_quantization = ifm.quantization + + # the lut tensor datatype needs to match both; the ofm datatype, because these are the values output; and the + # datatype used to generate the lut values (which is probably the ifm datatype), because we want to avoid any + # potential overflow errors in create_lut_tensor() caused by converting Python int (which could represent a uint) + # to NumPy int. this can be guaranteed by checking that the ifm and ofm datatypes are the same + assert ifm.dtype == ofm.dtype + lut_tensor = create_lut_tensor(op.name + "_values", lut_values, ofm.dtype) + op.set_activation_lut(lut_tensor) + op.set_ifm_ofm_shapes() + DebugDatabase.add_optimised(op, op) + return op + + +def create_lut_8bit_op(op, lut_fn, fn_name): + ifm_scale = op.ifm.quantization.scale_f32 + ofm_scale = op.ofm.quantization.scale_f32 + zp_in = op.ifm.quantization.zero_point + zp_out = op.ofm.quantization.zero_point + + values = [] + ix = range(256) if op.ifm.dtype == DataType.uint8 else range(-128, 128) + quantized_min = min(ix) + quantized_max = max(ix) + for x in ix: + x_real = ifm_scale * (x - zp_in) + y_real = lut_fn(x_real) + lut_result = round_away_zero(y_real / ofm_scale) + zp_out + lut_result = min(quantized_max, max(quantized_min, lut_result)) + values.append(lut_result) + + return convert_to_lut(op, values, fn_name) + + +def create_lut_int16_op(op, lut_fn, fn_name): + ifm_scale = op.ifm.quantization.scale_f32 + ofm_scale = op.ofm.quantization.scale_f32 + zp_in = op.ifm.quantization.zero_point + zp_out = op.ofm.quantization.zero_point + + input_min = ifm_scale * (np.iinfo(np.int16).min - zp_in) + input_max = ifm_scale * (np.iinfo(np.int16).max - zp_in) + output_min = ofm_scale * (np.iinfo(np.int16).min - zp_out) + output_max = ofm_scale * (np.iinfo(np.int16).max - zp_out) + + # Create 16bit lut following the reference + nbr_steps = 512 + step = (input_max - input_min) / nbr_steps + half_step = step / 2 + output_scaling_inv = (np.iinfo(np.int16).max - np.iinfo(np.int16).min + 1) / (output_max - output_min) + + table_min = np.iinfo(np.int16).min + table_max = np.iinfo(np.int16).max + + values = [] + for i in range(nbr_steps): + val = lut_fn(input_min + i * step) + val_midpoint = lut_fn(input_min + i * step + half_step) + val_next = lut_fn(input_min + (i + 1) * step) + + sample_val = round_away_zero(val * output_scaling_inv) + midpoint_interp_val = round_away_zero( + (val_next * output_scaling_inv + round_away_zero(val * output_scaling_inv)) / 2 + ) + midpoint_val = round_away_zero(val_midpoint * output_scaling_inv) + midpoint_err = midpoint_interp_val - midpoint_val + bias = round_away_zero(midpoint_err / 2) + + lut_result = min(max(sample_val - bias, table_min), table_max) + values.append(lut_result) + + val = round_away_zero(lut_fn(input_max) * output_scaling_inv) + lut_result = min(max(val, table_min), table_max) + values.append(lut_result) + + # Convert to hardware 16bit lut with base and slope + lut = [0] * nbr_steps + for i in range(nbr_steps): + slope = (int(values[i + 1]) - int(values[i])) << 16 + base = int(values[i]) + lut[i] = slope + base + + return convert_to_lut(op, lut, fn_name) diff --git a/ethosu/vela/operation.py b/ethosu/vela/operation.py index eafe3bd1..69596522 100644 --- a/ethosu/vela/operation.py +++ b/ethosu/vela/operation.py @@ -179,7 +179,7 @@ class Op(Enum): EmbeddingLookup = OperatorInfo() EmbeddingLookupSparse = OperatorInfo() Equal = OperatorInfo() - Exp = OperatorInfo() + Exp = OperatorInfo(block_type=NpuBlockType.ElementWise, indices=NNG_IFM_INDICES, is_unary=True) ExpandDims = OperatorInfo(indices=NNG_IFM_INDICES) FakeQuantWithMinMaxArgs = OperatorInfo() Fill = OperatorInfo() diff --git a/ethosu/vela/tflite_graph_optimiser.py b/ethosu/vela/tflite_graph_optimiser.py index c79f154a..1b70165e 100644 --- a/ethosu/vela/tflite_graph_optimiser.py +++ b/ethosu/vela/tflite_graph_optimiser.py @@ -34,7 +34,6 @@ from .ethos_u55_regs.ethos_u55_regs import resampling_mode from .graph_optimiser_util import bypass_memory_only_ops from .graph_optimiser_util import calc_explicit_padding from .graph_optimiser_util import convert_depthwise_to_conv -from .graph_optimiser_util import convert_to_lut from .graph_optimiser_util import create_avg_pool_for_concat from .graph_optimiser_util import memory_only_ops from .graph_optimiser_util import move_splitsliceread_to_consumer @@ -42,6 +41,9 @@ from .graph_optimiser_util import needed_total_padding from .graph_optimiser_util import set_ifm_ofm_op_shapes from .graph_optimiser_util import set_tensor_equivalence from .lstm import Lstm +from .lut import convert_to_lut +from .lut import create_lut_8bit_op +from .lut import create_lut_int16_op from .numeric_util import clamp_sigmoid from .numeric_util import full_shape from .numeric_util import round_away_zero @@ -1935,6 +1937,19 @@ def convert_mean_to_depthwise_conv(op, arch, nng): return op +def convert_ops_to_lut(op, arch, nng): + if op.type == Op.Exp: + if op.ifm.dtype == DataType.int8: + return create_lut_8bit_op(op, math.exp, "exp") + elif op.ifm.dtype == DataType.int16: + return create_lut_int16_op(op, math.exp, "exp") + else: + # Should already be catched in tflite supported ops + assert False, f"Unsupported data type {op.ifm.dtype} for {op.type}" + + return op + + def optimise_quantize(op: Operation, arch, nng): if op.type == Op.Quantize and op.run_on_npu: @@ -2214,6 +2229,7 @@ def tflite_optimise_graph(nng, arch, force_symmetric_int_weights): # Rewrite of operators op_rewrite_list = [ set_tensor_equivalence, + convert_ops_to_lut, convert_mean_to_depthwise_conv, convert_depthwise_to_conv, convert_conv_to_fc, diff --git a/ethosu/vela/tflite_mapping.py b/ethosu/vela/tflite_mapping.py index bb45a7f2..dda418c9 100644 --- a/ethosu/vela/tflite_mapping.py +++ b/ethosu/vela/tflite_mapping.py @@ -734,7 +734,7 @@ builtin_operator_map = { ), TFLITE_IFM_WEIGHTS_INDICES, ), - BuiltinOperator.EXP: (Op.Exp, OptionsSerializer("ExpOptions"), TFLITE_NO_INDICES), + BuiltinOperator.EXP: (Op.Exp, OptionsSerializer("ExpOptions"), TFLITE_IFM_INDICES), BuiltinOperator.TOPK_V2: (Op.TopKV2, OptionsSerializer("TopKV2Options"), TFLITE_NO_INDICES), BuiltinOperator.SPLIT: (Op.Split, OptionsSerializer("SplitOptions", ("num_splits",)), TFLITE_SPLIT_IFM_INDICES), BuiltinOperator.LOG_SOFTMAX: (Op.LogSoftmax, OptionsSerializer("LogSoftmaxOptions"), TFLITE_NO_INDICES), diff --git a/ethosu/vela/tflite_model_semantic.py b/ethosu/vela/tflite_model_semantic.py index 7537d7da..24c0794a 100644 --- a/ethosu/vela/tflite_model_semantic.py +++ b/ethosu/vela/tflite_model_semantic.py @@ -201,6 +201,9 @@ class TFLiteSemantic: self.specific_constraints[Op.UnidirectionalSequenceLstm].append(TFLiteSemantic.constraint_lstm_intermediates) self.specific_constraints[Op.UnidirectionalSequenceLstm].append(TFLiteSemantic.constraint_lstm_variables) + # Exp specific checks + self.specific_constraints[Op.Exp].append(TFLiteSemantic.constraint_input_signed) + def is_operator_semantic_valid(self, op): ext_type = optype_to_builtintype(op.type) diff --git a/ethosu/vela/tosa_graph_optimiser.py b/ethosu/vela/tosa_graph_optimiser.py index 2a599aaa..b3474147 100644 --- a/ethosu/vela/tosa_graph_optimiser.py +++ b/ethosu/vela/tosa_graph_optimiser.py @@ -25,11 +25,11 @@ from .debug_database import DebugDatabase from .graph_optimiser_util import bypass_memory_only_ops from .graph_optimiser_util import calc_explicit_padding from .graph_optimiser_util import convert_depthwise_to_conv -from .graph_optimiser_util import convert_to_lut from .graph_optimiser_util import move_splitsliceread_to_consumer from .graph_optimiser_util import needed_total_padding from .graph_optimiser_util import set_ifm_ofm_op_shapes from .graph_optimiser_util import set_tensor_equivalence +from .lut import convert_to_lut from .operation import ExplicitScaling from .operation import Op from .operation_util import create_add_nop |