From f436ada9caea87ec2dd686a92e41a15c1dcdeb1d Mon Sep 17 00:00:00 2001
From: Patrik Gustavsson
Date: Tue, 14 Sep 2021 14:56:48 +0200
Subject: TOSA: Support for TABLE operator (int8)

Added support to map the TABLE operator to LUT.

Limitations:
 - Only supported for int8
 - TABLE input must be constant

This also adds support for TFLite legalisation of Tanh/Sigmoid
(int8/uint8).

Signed-off-by: Patrik Gustavsson
Change-Id: I1a95f61fb02fdd42c4a690494418cc0765c8b275
---
 ethosu/vela/graph_optimiser_util.py     | 31 +++++++++++++++++++++++++++++++
 ethosu/vela/operation.py                |  1 +
 ethosu/vela/tflite_graph_optimiser.py   | 30 +-----------------------------
 ethosu/vela/tosa_graph_optimiser.py     | 22 ++++++++++++++++++----
 ethosu/vela/tosa_mapping.py             |  4 ++--
 ethosu/vela/tosa_supported_operators.py | 24 +++++++++++++++++++++++-
 6 files changed, 76 insertions(+), 36 deletions(-)

diff --git a/ethosu/vela/graph_optimiser_util.py b/ethosu/vela/graph_optimiser_util.py
index dafd2849..d2d3d833 100644
--- a/ethosu/vela/graph_optimiser_util.py
+++ b/ethosu/vela/graph_optimiser_util.py
@@ -19,6 +19,7 @@ from typing import Tuple
 
 import numpy as np
 
+from . import lut
 from .data_type import DataType
 from .debug_database import DebugDatabase
 from .errors import UnsupportedFeatureError
@@ -26,6 +27,8 @@ from .errors import VelaError
 from .operation import Op
 from .operation_util import create_avgpool_nop
 from .shape4d import Shape4D
+from .tensor import create_const_tensor
+from .tensor import QuantizationParameters
 
 memory_only_ops = (
     Op.Reshape,
@@ -320,3 +323,31 @@ def convert_depthwise_to_conv(op, arch, nng):
         )
         DebugDatabase.add_optimised(op, op)
     return op
+
+
+def convert_to_lut(op, lut_values, lut_name):
+    # Rewrite the operation as an Add with scalar 0 + LUT activation
+    ifm = op.inputs[0]
+    if ifm is None:
+        return op
+    assert ifm.dtype.size_in_bytes() == 1
+    op.type = Op.Add
+    op.name = op.name + "_lut_" + lut_name
+    # Mark as no-op to enable potential fusing optimizations
+    op.attrs["is_nop"] = True
+    # Create an input tensor containing scalar zero
+    quantization = QuantizationParameters(0.0, 255.0)
+    quantization.scale_f32 = ifm.quantization.scale_f32
+    quantization.zero_point = 0
+    tens = create_const_tensor(op.inputs[0].name + "_scalar0", [], ifm.dtype, [0], np.uint8, quantization=quantization)
+    op.add_input_tensor(tens)
+    op.ifm_shapes.append(Shape4D(tens.shape))  # TODO no shape?
+
+    # The LUT must be applied without any preceding rescaling (the LUT itself performs the rescale),
+    # so even if the OFM has a different scale than the IFM, the generated OFM scale instructions
+    # should be the same as the IFM
+    op.forced_output_quantization = ifm.quantization
+    lut_tensor = lut.create_lut_tensor(op.name + "_values", lut_values, DataType.int8)
+    op.set_activation_lut(lut_tensor)
+    op.set_ifm_ofm_shapes()
+    return op
diff --git a/ethosu/vela/operation.py b/ethosu/vela/operation.py
index e9d364ea..1558b943 100644
--- a/ethosu/vela/operation.py
+++ b/ethosu/vela/operation.py
@@ -281,6 +281,7 @@ class Op(Enum):
     SubgraphInput = OperatorInfo()  # Only used in CPU subgraphs
     Sum = OperatorInfo()
     Svdf = OperatorInfo()
+    Table = OperatorInfo(indices=NNG_IFM_INDICES)
     Tanh = OperatorInfo(indices=NNG_IFM_INDICES)
     Tile = OperatorInfo()
     TopKV2 = OperatorInfo()
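
Note on the rewrite above: the Ethos-U applies a LUT as a fused activation, so a
plain table lookup needs a carrier operation. convert_to_lut therefore turns the
op into an elementwise Add of a scalar zero (marked "is_nop") and attaches the
table as the activation. A minimal numpy sketch of the resulting semantics
(illustrative only, not vela code):

    import numpy as np

    def add_scalar0_then_lut(ifm: np.ndarray, table: np.ndarray) -> np.ndarray:
        assert ifm.dtype == np.int8 and table.shape == (256,)
        identity = ifm + np.int8(0)                    # the "is_nop" Add: scalar 0, zero point 0
        return table[identity.astype(np.int32) + 128]  # one table entry per int8 input code

Any IFM-to-OFM rescale is baked into the table values themselves, which is why
the helper forces the OFM quantization to match the IFM.
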
diff --git a/ethosu/vela/tflite_graph_optimiser.py b/ethosu/vela/tflite_graph_optimiser.py
index b48cc7af..cf211de4 100644
--- a/ethosu/vela/tflite_graph_optimiser.py
+++ b/ethosu/vela/tflite_graph_optimiser.py
@@ -22,7 +22,6 @@ import uuid
 import numpy as np
 
 from . import fp_math
-from . import lut
 from . import rewrite_graph
 from . import scaling
 from .api import NpuRoundingMode
@@ -33,6 +32,7 @@ from .ethos_u55_regs.ethos_u55_regs import resampling_mode
 from .graph_optimiser_util import bypass_memory_only_ops
 from .graph_optimiser_util import calc_explicit_padding
 from .graph_optimiser_util import convert_depthwise_to_conv
+from .graph_optimiser_util import convert_to_lut
 from .graph_optimiser_util import fix_sg_input_output
 from .graph_optimiser_util import memory_only_ops
 from .graph_optimiser_util import move_splitsliceread_to_consumer
@@ -858,34 +858,6 @@ def convert_lrelu_to_mul_max(op, arch):
     return op
 
 
-def convert_to_lut(op, lut_values, lut_name):
-    # Rewrite the operation by Add with scalar 0 + LUT activation
-    ifm = op.inputs[0]
-    if ifm is None:
-        return op
-    assert ifm.dtype.size_in_bytes() == 1
-    op.type = Op.Add
-    op.name = op.name + "_lut_" + lut_name
-    # Mark as no-op to enable potential fusing optimizations
-    op.attrs["is_nop"] = True
-    # Create an input tensor containing scalar zero
-    quantization = QuantizationParameters(0.0, 255.0)
-    quantization.scale_f32 = ifm.quantization.scale_f32
-    quantization.zero_point = 0
-    tens = create_const_tensor(op.inputs[0].name + "_scalar0", [], ifm.dtype, [0], np.uint8, quantization=quantization)
-    op.add_input_tensor(tens)
-    op.ifm_shapes.append(Shape4D(tens.shape))
-
-    # The LUT must be applied without any preceding rescaling (the LUT itself performs the rescale),
-    # so even if the OFM has a different scale than the IFM, the generated OFM scale instructions
-    # should be the same as the IFM
-    op.forced_output_quantization = ifm.quantization
-    lut_tensor = lut.create_lut_tensor(op.name + "_values", lut_values, DataType.int8)
-    op.set_activation_lut(lut_tensor)
-    op.set_ifm_ofm_shapes()
-    return op
-
-
 def convert_to_lut8(op, fn, fn_name):
     # Converts op to a no-op + int8/uint8 LUT which is generated with the given function.
     # fn is a function(real) -> real
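
The convert_to_lut8 helper retained above builds those 256 table values by
sweeping fn over the quantized input domain: dequantize each input code, apply
fn, then requantize into the output quantization. A rough sketch of that recipe
(a sketch only; parameter values are illustrative, and the real helper also
handles uint8 and vela's exact rounding):

    import numpy as np

    def lut_values_from_fn(fn, in_scale, in_zp, out_scale, out_zp):
        values = []
        for code in range(-128, 128):          # every possible int8 input code
            real = in_scale * (code - in_zp)   # dequantize
            q = int(np.rint(fn(real) / out_scale)) + out_zp   # apply fn, requantize
            values.append(int(np.clip(q, -128, 127)))
        return np.array(values, dtype=np.int8)

    # e.g. an int8 Tanh table, using TFLite's customary 1/128 output scale:
    tanh_values = lut_values_from_fn(np.tanh, in_scale=1 / 16, in_zp=0, out_scale=1 / 128, out_zp=0)
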
diff --git a/ethosu/vela/tosa_graph_optimiser.py b/ethosu/vela/tosa_graph_optimiser.py
index a298ddbb..1ef04449 100644
--- a/ethosu/vela/tosa_graph_optimiser.py
+++ b/ethosu/vela/tosa_graph_optimiser.py
@@ -24,6 +24,7 @@ from .debug_database import DebugDatabase
 from .graph_optimiser_util import bypass_memory_only_ops
 from .graph_optimiser_util import calc_explicit_padding
 from .graph_optimiser_util import convert_depthwise_to_conv
+from .graph_optimiser_util import convert_to_lut
 from .graph_optimiser_util import move_splitsliceread_to_consumer
 from .graph_optimiser_util import needed_total_padding
 from .graph_optimiser_util import set_ifm_ofm_op_shapes
@@ -490,13 +491,26 @@ def convert_pad(op, arch, nng):
     return add_op
 
 
+def convert_table_to_lut(op, arch, nng):
+    # Converts a Table op to a no-op + LUT
+    if op.type is not Op.Table:
+        return op
+
+    table = op.inputs[1]
+    op.inputs.remove(table)
+    op.set_ifm_ofm_shapes()
+
+    return convert_to_lut(op, table.values, "table")
+
+
 def fixup_quantization(op, arch, nng):
     if op.ifm and op.ifm.quantization.zero_point is None:
         op.ifm.quantization.zero_point = 0
     if op.ifm2 and op.ifm2.quantization.zero_point is None:
-        op.ifm.quantization.zero_point = 0
-    if op.ofm and op.ofm.quantization.zero_point is None:
-        op.ofm.quantization.zero_point = 0
+        op.ifm2.quantization.zero_point = 0
+    if not op.forced_output_quantization:
+        if op.ofm and op.ofm.quantization and op.ofm.quantization.zero_point is None:
+            op.ofm.quantization.zero_point = 0
     return op
@@ -547,7 +561,7 @@
 
     # Rewite Operators step
-    op_rewrite_list = [set_tensor_equivalence, rewrite_rescale, convert_depthwise_to_conv]
+    op_rewrite_list = [set_tensor_equivalence, rewrite_rescale, convert_depthwise_to_conv, convert_table_to_lut]
 
     for idx, sg in enumerate(nng.subgraphs):
         nng.subgraphs[idx] = rewrite_graph.rewrite_graph_pre_order(
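
For int8 this mapping is exact: TOSA's TABLE takes a 256-entry constant table
and maps each input code straight through it, so convert_table_to_lut can hand
table.values directly to convert_to_lut. A reference sketch of the int8 TABLE
semantics (paraphrasing the TOSA specification, not vela code):

    import numpy as np

    def tosa_table_int8(ifm: np.ndarray, table: np.ndarray) -> np.ndarray:
        assert ifm.dtype == np.int8 and table.shape == (256,)
        return table[ifm.astype(np.int32) + 128]   # one entry per code, no interpolation

The int16 variant is a different matter: per the TOSA specification it uses a
513-entry table with 7-bit interpolation and an int32 result, which does not fit
the 256-entry hardware LUT; that is presumably why support is limited to int8.
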
diff --git a/ethosu/vela/tosa_mapping.py b/ethosu/vela/tosa_mapping.py
index ebbaa0a1..f80a9156 100644
--- a/ethosu/vela/tosa_mapping.py
+++ b/ethosu/vela/tosa_mapping.py
@@ -196,7 +196,6 @@ unsupported_tosa_operators = {
     TosaOp.MAXIMUM,
     TosaOp.MINIMUM,
     TosaOp.POW,
-    TosaOp.TABLE,
     TosaOp.ABS,
     TosaOp.BITWISE_NOT,
     TosaOp.CEIL,
@@ -274,7 +273,8 @@ tosa_operator_map = {
     TosaOp.MUL: (Op.Mul, mul_attrs, None, TOSA_IFM_IFM2_INDICES),
     # TODO TosaOp.POW
     TosaOp.SUB: (Op.Sub, None, None, TOSA_IFM_IFM2_INDICES),
-    # TODO TosaOp.TABLE
+    # TODO is table content in input[1] always constant?
+    TosaOp.TABLE: (Op.Table, None, None, TOSA_IFM_INDICES),
     # TODO TosaOp.ABS
     # TODO TosaOp.BITWISE_NOT
     # TODO TosaOp.CEIL
diff --git a/ethosu/vela/tosa_supported_operators.py b/ethosu/vela/tosa_supported_operators.py
index a4f822eb..98df27e3 100644
--- a/ethosu/vela/tosa_supported_operators.py
+++ b/ethosu/vela/tosa_supported_operators.py
@@ -42,7 +42,7 @@ class TosaSupportedOperators:
     binary_elem_wise_add_mul_sub = set((Op.Add, Op.Mul, Op.RescaleMul, Op.Sub,))
     type_conversion_ops = set((Op.Rescale,))
     relu_ops = set((Op.Clamp, Op.ReluN,))
-    activation_ops = relu_ops
+    activation_ops = relu_ops | set((Op.Table,))
     pad_ops = set((Op.Pad,))
 
     npu_post_ops = activation_ops
@@ -68,6 +68,8 @@
         self.specific_constraints[Op.Transpose].append(TosaSupportedOperators.constraint_ifm_producer)
         self.specific_constraints[Op.Pad].append(TosaSupportedOperators.constraint_padding_producer)
+        self.specific_constraints[Op.Table].append(TosaSupportedOperators.constraint_table_dtype)
+        self.specific_constraints[Op.Table].append(TosaSupportedOperators.constraint_table_producer)
 
         # Depthwise Conv specific checks:
         for op_type in TosaSupportedOperators.depthwise_convolution_ops:
@@ -200,3 +202,23 @@
             )
             return valid, extra
         return True, "Op has depth_multiplier=1"
+
+    # TODO Table operator support is limited to int8 for now.
+    # For TFLite the table input is assumed to be constant.
+    @staticmethod
+    def constraint_table_dtype(op):
+        "Only int8 is supported"
+        valid = True
+        tensors = [op.ifm, op.ofm, op.inputs[1]]
+        for tens in tensors:
+            if tens.dtype != DataType.int8:
+                valid = False
+        return valid, "Table operator with non-int8 tensor"
+
+    # TODO limit the table to constant data for now.
+    # Can it be non-constant?
+    @staticmethod
+    def constraint_table_producer(op):
+        "Input must be constant data"
+        valid = op.inputs[1].ops and op.inputs[1].ops[0].type == Op.Const
+        return valid, "Table Op with non-constant table input"
--
cgit v1.2.1
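
As a closing spot check, the endpoints of a Tanh table built with the recipe
sketched earlier come out saturated as expected (same illustrative quantization:
input scale 1/16, output scale 1/128, zero points 0):

    import numpy as np

    def quantize(real, scale, zp):
        return int(np.clip(np.rint(real / scale) + zp, -128, 127))

    assert quantize(np.tanh(-128 / 16), 1 / 128, 0) == -128  # code -128: tanh(-8.0) ~ -1.0
    assert quantize(np.tanh(0 / 16), 1 / 128, 0) == 0        # code 0: tanh(0) = 0
    assert quantize(np.tanh(127 / 16), 1 / 128, 0) == 127    # code 127: tanh(7.94) ~ 1.0, clamped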