aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJohan Alfven <johan.alfven@arm.com>2023-04-24 13:35:40 +0200
committerRickard Bolin <rickard.bolin@arm.com>2023-05-02 11:03:37 +0000
commitce5027328d2330d33bdfc5d5b016d171e4f8a2fc (patch)
tree16528c451c430d14d3221bfe968e94a54c6af33a
parent463f74b2ed65753811ddf7460e760aaf983ef5bb (diff)
downloadethos-u-vela-ce5027328d2330d33bdfc5d5b016d171e4f8a2fc.tar.gz
MLBEDSW-2082: Add Exp support
- Added int8 and int16 Exp support, implemented as LUT. - Added generic 8bit and 16bit LUT table functions following the implementation in the latest reference. If new ops are added by the reference, they can easily be implemented in Vela using the generic functions. - Moved convert_to_lut to lut.py to have all LUT related code in one file. - Updated SUPPORTED_OPS.md Change-Id: I388e76ea4b39162313599a5341cfb9bad71a782c Signed-off-by: Johan Alfven <johan.alfven@arm.com>
-rw-r--r--SUPPORTED_OPS.md12
-rw-r--r--ethosu/vela/graph_optimiser_util.py39
-rw-r--r--ethosu/vela/lut.py114
-rw-r--r--ethosu/vela/operation.py2
-rw-r--r--ethosu/vela/tflite_graph_optimiser.py18
-rw-r--r--ethosu/vela/tflite_mapping.py2
-rw-r--r--ethosu/vela/tflite_model_semantic.py3
-rw-r--r--ethosu/vela/tosa_graph_optimiser.py2
8 files changed, 148 insertions, 44 deletions
diff --git a/SUPPORTED_OPS.md b/SUPPORTED_OPS.md
index ab9b0096..80647f84 100644
--- a/SUPPORTED_OPS.md
+++ b/SUPPORTED_OPS.md
@@ -1,7 +1,7 @@
# Supported Ops
This file was automatically generated by Vela using the `--supported-ops-report` parameter.
-Vela version: `3.7.1.dev17+g7b3008a.d20230420`
+Vela version: `3.7.1.dev23+g3734897.d20230427`
This file complies with
[**Gitiles Markdown syntax**](https://github.com/google/gitiles/blob/master/Documentation/markdown.md)
@@ -25,6 +25,7 @@ Please check the supported operator list for your chosen runtime for further inf
| CONCATENATION | [Generic](#tflite-generic-constraints), [Specific](#tflite-concatenation-constraints) |
| CONV_2D | [Generic](#tflite-generic-constraints), [Specific](#tflite-conv_2d-constraints) |
| DEPTHWISE_CONV_2D | [Generic](#tflite-generic-constraints), [Specific](#tflite-depthwise_conv_2d-constraints) |
+| EXP | [Generic](#tflite-generic-constraints), [Specific](#tflite-exp-constraints) |
| EXPAND_DIMS | [Generic](#tflite-generic-constraints), [Specific](#tflite-expand_dims-constraints) |
| FULLY_CONNECTED | [Generic](#tflite-generic-constraints), [Specific](#tflite-fully_connected-constraints) |
| HARD_SWISH | [Generic](#tflite-generic-constraints), [Specific](#tflite-hard_swish-constraints) |
@@ -63,6 +64,7 @@ Please check the supported operator list for your chosen runtime for further inf
This is a list of constraints most NPU operators must satisfy in order to be scheduled on the NPU.
(Operators excluded from certain constraints are shown in brackets [ ] )
+- All required operator attributes must be specified
- Input(s) and Output tensors must not be dynamic - [QUANTIZE]
- Input(s) and Output tensors must have a defined shape
- Output tensors cannot be scalar - [QUANTIZE]
@@ -161,6 +163,14 @@ This is a list of constraints that the DEPTHWISE_CONV_2D operator must satisfy i
- Stride values for both width and height must be between 1 and 3
- For depth multipliers > 1, IFM channels must be 1 and OFM channels must be equal to the depth multiplier
+### TFLite EXP Constraints
+
+This is a list of constraints that the EXP operator must satisfy in order to be scheduled on the NPU.
+
+- At least one Input's shape must match the OFM's shape
+- IFM and OFM data types must match
+- IFM must be int8 or int16
+
### TFLite EXPAND_DIMS Constraints
This is a list of constraints that the EXPAND_DIMS operator must satisfy in order to be scheduled on the NPU.
diff --git a/ethosu/vela/graph_optimiser_util.py b/ethosu/vela/graph_optimiser_util.py
index 82790364..da3fe138 100644
--- a/ethosu/vela/graph_optimiser_util.py
+++ b/ethosu/vela/graph_optimiser_util.py
@@ -20,7 +20,6 @@ from typing import Tuple
import numpy as np
-from . import lut
from .architecture_features import Accelerator
from .data_type import DataType
from .debug_database import DebugDatabase
@@ -29,8 +28,6 @@ from .errors import VelaError
from .operation import Op
from .operation_util import create_avgpool_nop
from .shape4d import Shape4D
-from .tensor import create_const_tensor
-from .tensor import QuantizationParameters
from .tensor import Tensor
memory_only_ops = (
@@ -329,42 +326,6 @@ def convert_depthwise_to_conv(op, arch, nng):
return op
-def convert_to_lut(op, lut_values, lut_name):
- # Rewrite the operation by Add with scalar 0 + LUT activation
- ifm = op.ifm
- ofm = op.ofm
- if ifm is None:
- return op
- assert ifm.dtype.size_in_bytes() == 1
- op.type = Op.Add
- op.name = op.name + "_lut_" + lut_name
- # Mark as no-op to enable potential fusing optimizations
- op.attrs["is_nop"] = True
- # Create an input tensor containing scalar zero
- quantization = QuantizationParameters(0.0, 255.0)
- quantization.scale_f32 = ifm.quantization.scale_f32
- quantization.zero_point = 0
- tens = create_const_tensor(ifm.name + "_scalar0", [], ifm.dtype, [0], quantization=quantization)
- op.add_input_tensor(tens)
- op.ifm_shapes.append(Shape4D(tens.shape)) # TODO no shape?
-
- # The LUT must be applied without any preceding rescaling (the LUT itself performs the rescale),
- # so even if the OFM has a different scale than the IFM, the generated OFM scale instructions
- # should be the same as the IFM
- op.forced_output_quantization = ifm.quantization
-
- # the lut tensor datatype needs to match both; the ofm datatype, because these are the values output; and the
- # datatype used to generate the lut values (which is probably the ifm datatype), because we want to avoid any
- # potential overflow errors in create_lut_tensor() caused by converting Python int (which could represent a uint)
- # to NumPy int. this can be guaranteed by checking that the ifm and ofm datatypes are the same
- assert ifm.dtype == ofm.dtype
- lut_tensor = lut.create_lut_tensor(op.name + "_values", lut_values, ofm.dtype)
- op.set_activation_lut(lut_tensor)
- op.set_ifm_ofm_shapes()
- DebugDatabase.add_optimised(op, op)
- return op
-
-
def create_avg_pool_for_concat(concat_op, name, ifm, ifm_shape: Shape4D, write_offset: Shape4D):
"""Creates an average pool for the given concat op/input feature map"""
ofm = concat_op.ofm
diff --git a/ethosu/vela/lut.py b/ethosu/vela/lut.py
index d0ac9706..c8fb7bc0 100644
--- a/ethosu/vela/lut.py
+++ b/ethosu/vela/lut.py
@@ -21,10 +21,15 @@ import uuid
import numpy as np
from . import numeric_util
+from .data_type import DataType
+from .debug_database import DebugDatabase
from .high_level_command_stream import DMA
from .high_level_command_stream import NpuStripe
+from .numeric_util import round_away_zero
+from .operation import Op
from .tensor import create_const_tensor
from .tensor import create_equivalence_id
+from .tensor import QuantizationParameters
from .tensor import TensorPurpose
@@ -88,6 +93,8 @@ def create_lut_tensor(name, values, dtype):
# address in constant memory, and unnecessary DMA operations can be avoided.
sz = len(values)
assert sz in (256, 512)
+ # int16 lut uses uint32 lut with base + slope
+ dtype = DataType.uint32 if dtype == DataType.int16 else dtype
tens = create_const_tensor(name, [1, 1, 1, sz], dtype, values, TensorPurpose.LUT)
tens.equivalence_id = create_equivalence_id(tuple(values))
return tens
@@ -128,3 +135,110 @@ def optimize_high_level_cmd_stream(sg, arch):
lut_state = lut_state.put(lut_tens)
cmd_stream.append(cmd)
sg.high_level_command_stream = cmd_stream
+
+
+def convert_to_lut(op, lut_values, lut_name):
+ # Rewrite the operation by Add with scalar 0 + LUT activation
+ ifm = op.ifm
+ ofm = op.ofm
+ if ifm is None:
+ return op
+ assert ifm.dtype in (DataType.int8, DataType.uint8, DataType.int16)
+ op.type = Op.Add
+ op.name = f"{op.name}_lut_{lut_name}"
+ # Mark as no-op to enable potential fusing optimizations
+ op.attrs["is_nop"] = True
+ # Create an input tensor containing scalar zero
+ _max = 65536.0 if ifm.dtype == DataType.int16 else 255.0
+ quantization = QuantizationParameters(0.0, _max)
+ quantization.scale_f32 = ifm.quantization.scale_f32
+ quantization.zero_point = 0
+ tens = create_const_tensor(ifm.name + "_scalar0", [], ifm.dtype, [0], quantization=quantization)
+ op.add_input_tensor(tens)
+
+ # The LUT must be applied without any preceding rescaling (the LUT itself performs the rescale),
+ # so even if the OFM has a different scale than the IFM, the generated OFM scale instructions
+ # should be the same as the IFM
+ op.forced_output_quantization = ifm.quantization
+
+ # the lut tensor datatype needs to match both; the ofm datatype, because these are the values output; and the
+ # datatype used to generate the lut values (which is probably the ifm datatype), because we want to avoid any
+ # potential overflow errors in create_lut_tensor() caused by converting Python int (which could represent a uint)
+ # to NumPy int. this can be guaranteed by checking that the ifm and ofm datatypes are the same
+ assert ifm.dtype == ofm.dtype
+ lut_tensor = create_lut_tensor(op.name + "_values", lut_values, ofm.dtype)
+ op.set_activation_lut(lut_tensor)
+ op.set_ifm_ofm_shapes()
+ DebugDatabase.add_optimised(op, op)
+ return op
+
+
+def create_lut_8bit_op(op, lut_fn, fn_name):
+ ifm_scale = op.ifm.quantization.scale_f32
+ ofm_scale = op.ofm.quantization.scale_f32
+ zp_in = op.ifm.quantization.zero_point
+ zp_out = op.ofm.quantization.zero_point
+
+ values = []
+ ix = range(256) if op.ifm.dtype == DataType.uint8 else range(-128, 128)
+ quantized_min = min(ix)
+ quantized_max = max(ix)
+ for x in ix:
+ x_real = ifm_scale * (x - zp_in)
+ y_real = lut_fn(x_real)
+ lut_result = round_away_zero(y_real / ofm_scale) + zp_out
+ lut_result = min(quantized_max, max(quantized_min, lut_result))
+ values.append(lut_result)
+
+ return convert_to_lut(op, values, fn_name)
+
+
+def create_lut_int16_op(op, lut_fn, fn_name):
+ ifm_scale = op.ifm.quantization.scale_f32
+ ofm_scale = op.ofm.quantization.scale_f32
+ zp_in = op.ifm.quantization.zero_point
+ zp_out = op.ofm.quantization.zero_point
+
+ input_min = ifm_scale * (np.iinfo(np.int16).min - zp_in)
+ input_max = ifm_scale * (np.iinfo(np.int16).max - zp_in)
+ output_min = ofm_scale * (np.iinfo(np.int16).min - zp_out)
+ output_max = ofm_scale * (np.iinfo(np.int16).max - zp_out)
+
+ # Create 16bit lut following the reference
+ nbr_steps = 512
+ step = (input_max - input_min) / nbr_steps
+ half_step = step / 2
+ output_scaling_inv = (np.iinfo(np.int16).max - np.iinfo(np.int16).min + 1) / (output_max - output_min)
+
+ table_min = np.iinfo(np.int16).min
+ table_max = np.iinfo(np.int16).max
+
+ values = []
+ for i in range(nbr_steps):
+ val = lut_fn(input_min + i * step)
+ val_midpoint = lut_fn(input_min + i * step + half_step)
+ val_next = lut_fn(input_min + (i + 1) * step)
+
+ sample_val = round_away_zero(val * output_scaling_inv)
+ midpoint_interp_val = round_away_zero(
+ (val_next * output_scaling_inv + round_away_zero(val * output_scaling_inv)) / 2
+ )
+ midpoint_val = round_away_zero(val_midpoint * output_scaling_inv)
+ midpoint_err = midpoint_interp_val - midpoint_val
+ bias = round_away_zero(midpoint_err / 2)
+
+ lut_result = min(max(sample_val - bias, table_min), table_max)
+ values.append(lut_result)
+
+ val = round_away_zero(lut_fn(input_max) * output_scaling_inv)
+ lut_result = min(max(val, table_min), table_max)
+ values.append(lut_result)
+
+ # Convert to hardware 16bit lut with base and slope
+ lut = [0] * nbr_steps
+ for i in range(nbr_steps):
+ slope = (int(values[i + 1]) - int(values[i])) << 16
+ base = int(values[i])
+ lut[i] = slope + base
+
+ return convert_to_lut(op, lut, fn_name)
diff --git a/ethosu/vela/operation.py b/ethosu/vela/operation.py
index eafe3bd1..69596522 100644
--- a/ethosu/vela/operation.py
+++ b/ethosu/vela/operation.py
@@ -179,7 +179,7 @@ class Op(Enum):
EmbeddingLookup = OperatorInfo()
EmbeddingLookupSparse = OperatorInfo()
Equal = OperatorInfo()
- Exp = OperatorInfo()
+ Exp = OperatorInfo(block_type=NpuBlockType.ElementWise, indices=NNG_IFM_INDICES, is_unary=True)
ExpandDims = OperatorInfo(indices=NNG_IFM_INDICES)
FakeQuantWithMinMaxArgs = OperatorInfo()
Fill = OperatorInfo()
diff --git a/ethosu/vela/tflite_graph_optimiser.py b/ethosu/vela/tflite_graph_optimiser.py
index c79f154a..1b70165e 100644
--- a/ethosu/vela/tflite_graph_optimiser.py
+++ b/ethosu/vela/tflite_graph_optimiser.py
@@ -34,7 +34,6 @@ from .ethos_u55_regs.ethos_u55_regs import resampling_mode
from .graph_optimiser_util import bypass_memory_only_ops
from .graph_optimiser_util import calc_explicit_padding
from .graph_optimiser_util import convert_depthwise_to_conv
-from .graph_optimiser_util import convert_to_lut
from .graph_optimiser_util import create_avg_pool_for_concat
from .graph_optimiser_util import memory_only_ops
from .graph_optimiser_util import move_splitsliceread_to_consumer
@@ -42,6 +41,9 @@ from .graph_optimiser_util import needed_total_padding
from .graph_optimiser_util import set_ifm_ofm_op_shapes
from .graph_optimiser_util import set_tensor_equivalence
from .lstm import Lstm
+from .lut import convert_to_lut
+from .lut import create_lut_8bit_op
+from .lut import create_lut_int16_op
from .numeric_util import clamp_sigmoid
from .numeric_util import full_shape
from .numeric_util import round_away_zero
@@ -1935,6 +1937,19 @@ def convert_mean_to_depthwise_conv(op, arch, nng):
return op
+def convert_ops_to_lut(op, arch, nng):
+ if op.type == Op.Exp:
+ if op.ifm.dtype == DataType.int8:
+ return create_lut_8bit_op(op, math.exp, "exp")
+ elif op.ifm.dtype == DataType.int16:
+ return create_lut_int16_op(op, math.exp, "exp")
+ else:
+ # Should already be catched in tflite supported ops
+ assert False, f"Unsupported data type {op.ifm.dtype} for {op.type}"
+
+ return op
+
+
def optimise_quantize(op: Operation, arch, nng):
if op.type == Op.Quantize and op.run_on_npu:
@@ -2214,6 +2229,7 @@ def tflite_optimise_graph(nng, arch, force_symmetric_int_weights):
# Rewrite of operators
op_rewrite_list = [
set_tensor_equivalence,
+ convert_ops_to_lut,
convert_mean_to_depthwise_conv,
convert_depthwise_to_conv,
convert_conv_to_fc,
diff --git a/ethosu/vela/tflite_mapping.py b/ethosu/vela/tflite_mapping.py
index bb45a7f2..dda418c9 100644
--- a/ethosu/vela/tflite_mapping.py
+++ b/ethosu/vela/tflite_mapping.py
@@ -734,7 +734,7 @@ builtin_operator_map = {
),
TFLITE_IFM_WEIGHTS_INDICES,
),
- BuiltinOperator.EXP: (Op.Exp, OptionsSerializer("ExpOptions"), TFLITE_NO_INDICES),
+ BuiltinOperator.EXP: (Op.Exp, OptionsSerializer("ExpOptions"), TFLITE_IFM_INDICES),
BuiltinOperator.TOPK_V2: (Op.TopKV2, OptionsSerializer("TopKV2Options"), TFLITE_NO_INDICES),
BuiltinOperator.SPLIT: (Op.Split, OptionsSerializer("SplitOptions", ("num_splits",)), TFLITE_SPLIT_IFM_INDICES),
BuiltinOperator.LOG_SOFTMAX: (Op.LogSoftmax, OptionsSerializer("LogSoftmaxOptions"), TFLITE_NO_INDICES),
diff --git a/ethosu/vela/tflite_model_semantic.py b/ethosu/vela/tflite_model_semantic.py
index 7537d7da..24c0794a 100644
--- a/ethosu/vela/tflite_model_semantic.py
+++ b/ethosu/vela/tflite_model_semantic.py
@@ -201,6 +201,9 @@ class TFLiteSemantic:
self.specific_constraints[Op.UnidirectionalSequenceLstm].append(TFLiteSemantic.constraint_lstm_intermediates)
self.specific_constraints[Op.UnidirectionalSequenceLstm].append(TFLiteSemantic.constraint_lstm_variables)
+ # Exp specific checks
+ self.specific_constraints[Op.Exp].append(TFLiteSemantic.constraint_input_signed)
+
def is_operator_semantic_valid(self, op):
ext_type = optype_to_builtintype(op.type)
diff --git a/ethosu/vela/tosa_graph_optimiser.py b/ethosu/vela/tosa_graph_optimiser.py
index 2a599aaa..b3474147 100644
--- a/ethosu/vela/tosa_graph_optimiser.py
+++ b/ethosu/vela/tosa_graph_optimiser.py
@@ -25,11 +25,11 @@ from .debug_database import DebugDatabase
from .graph_optimiser_util import bypass_memory_only_ops
from .graph_optimiser_util import calc_explicit_padding
from .graph_optimiser_util import convert_depthwise_to_conv
-from .graph_optimiser_util import convert_to_lut
from .graph_optimiser_util import move_splitsliceread_to_consumer
from .graph_optimiser_util import needed_total_padding
from .graph_optimiser_util import set_ifm_ofm_op_shapes
from .graph_optimiser_util import set_tensor_equivalence
+from .lut import convert_to_lut
from .operation import ExplicitScaling
from .operation import Op
from .operation_util import create_add_nop