From 906c9e84d60de86f5b2584ae426bbc8e11932a03 Mon Sep 17 00:00:00 2001 From: Johan Alfven Date: Thu, 25 May 2023 11:18:50 +0200 Subject: MLBEDSW-8042: MLCE: Add SQUARED_DIFFERENCE support - Added SQUARED_DIFFERENCE support - Updated SUPPORTED_OPS.md Change-Id: Id83d9d92129e645390c7979759dfdeff7a14c2ee Signed-off-by: Johan Alfven --- SUPPORTED_OPS.md | 9 ++- ethosu/vela/operation.py | 2 +- ethosu/vela/operation_util.py | 3 +- ethosu/vela/tflite_graph_optimiser.py | 110 ++++++++++++++++++++++++++++++ ethosu/vela/tflite_mapping.py | 2 +- ethosu/vela/tflite_model_semantic.py | 2 +- ethosu/vela/tflite_supported_operators.py | 2 +- 7 files changed, 124 insertions(+), 6 deletions(-) diff --git a/SUPPORTED_OPS.md b/SUPPORTED_OPS.md index 0fef738..0d60c67 100644 --- a/SUPPORTED_OPS.md +++ b/SUPPORTED_OPS.md @@ -19,7 +19,7 @@ limitations under the License. # Supported Ops This file was automatically generated by Vela using the `--supported-ops-report` parameter. -Vela version: `3.9.1.dev2+gc02eaa3.d20230904` +Vela version: `3.9.1.dev7+g3a3f35e.d20230912` This file complies with [**Gitiles Markdown syntax**](https://github.com/google/gitiles/blob/master/Documentation/markdown.md) @@ -70,6 +70,7 @@ Please check the supported operator list for your chosen runtime for further inf | SOFTMAX | [Generic](#tflite-generic-constraints), [Specific](#tflite-softmax-constraints) | | SPLIT | [Generic](#tflite-generic-constraints), [Specific](#tflite-split-constraints) | | SPLIT_V | [Generic](#tflite-generic-constraints), [Specific](#tflite-split_v-constraints) | +| SQUARED_DIFFERENCE | [Generic](#tflite-generic-constraints), [Specific](#tflite-squared_difference-constraints) | | SQUEEZE | [Generic](#tflite-generic-constraints), [Specific](#tflite-squeeze-constraints) | | STRIDED_SLICE | [Generic](#tflite-generic-constraints), [Specific](#tflite-strided_slice-constraints) | | SUB | [Generic](#tflite-generic-constraints), [Specific](#tflite-sub-constraints) | @@ -367,6 +368,12 @@ This is a list of constraints that the SPLIT_V operator must satisfy in order to - Only one size is allowed to be inferred +### TFLite SQUARED_DIFFERENCE Constraints + +This is a list of constraints that the SQUARED_DIFFERENCE operator must satisfy in order to be scheduled on the NPU. + +- At least one Input's shape must match the OFM's shape + ### TFLite SQUEEZE Constraints This is a list of constraints that the SQUEEZE operator must satisfy in order to be scheduled on the NPU. 
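The new SQUARED_DIFFERENCE constraint is, in effect, the standard shape rule for binary element-wise ops: broadcasting is permitted on at most one of the two inputs. A minimal stand-alone sketch of that rule, with invented names and shapes rather than Vela's actual checker code:

```python
# Hypothetical stand-in for the constraint above, not Vela's real code.
# Shapes are plain lists of ints; at most one input may be broadcast.
def squared_difference_shapes_ok(ifm_shape, ifm2_shape, ofm_shape):
    # At least one input's shape must match the OFM's shape
    return ifm_shape == ofm_shape or ifm2_shape == ofm_shape

assert squared_difference_shapes_ok([1, 8, 8, 16], [1, 1, 1, 16], [1, 8, 8, 16])
assert not squared_difference_shapes_ok([1, 8, 1, 16], [1, 1, 8, 16], [1, 8, 8, 16])
```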
diff --git a/ethosu/vela/operation.py b/ethosu/vela/operation.py
index 94d256c..c9a30b2 100644
--- a/ethosu/vela/operation.py
+++ b/ethosu/vela/operation.py
@@ -286,7 +286,7 @@ class Op(Enum):
     SplitV = OperatorInfo(indices=NNG_IFM_INDICES)
     Sqrt = OperatorInfo()
     Square = OperatorInfo()
-    SquaredDifference = OperatorInfo()
+    SquaredDifference = OperatorInfo(block_type=NpuBlockType.ElementWise, indices=NNG_IFM_IFM2_INDICES)
     Squeeze = OperatorInfo(indices=NNG_IFM_INDICES)
     StridedSlice = OperatorInfo(indices=NNG_IFM_INDICES)
     Sub = OperatorInfo(block_type=NpuBlockType.ElementWise, indices=NNG_IFM_IFM2_INDICES)
diff --git a/ethosu/vela/operation_util.py b/ethosu/vela/operation_util.py
index ef4949f..44a80b2 100644
--- a/ethosu/vela/operation_util.py
+++ b/ethosu/vela/operation_util.py
@@ -98,7 +98,8 @@ def create_cast_op(
 
     c = ifm.shape[-1]
 
-    shape = [1, 1, 1, c]
+    # Weight shape is in format [h, w, c, b]
+    shape = [1, 1, c, 1]
     kernel = np.dstack([1] * c)
     identity_quant = QuantizationParameters(scale_f32=1.0, zero_point=0)
     op.add_input_tensor(
diff --git a/ethosu/vela/tflite_graph_optimiser.py b/ethosu/vela/tflite_graph_optimiser.py
index 2fb75e6..794a6ec 100644
--- a/ethosu/vela/tflite_graph_optimiser.py
+++ b/ethosu/vela/tflite_graph_optimiser.py
@@ -1986,6 +1986,115 @@ def fixup_or_check_asymmetric_weights(force_symmetric_int_weights):
     return check_asymmetric_weights
 
 
+def convert_squared_difference(op, arch, nng):
+    if op.type == Op.SquaredDifference and op.run_on_npu:
+        ifm, ifm2, ofm = op.get_ifm_ifm2_ofm()
+
+        identity_quant = QuantizationParameters(scale_f32=1.0, zero_point=0)
+
+        # All calculations/parameters are the same as in the reference kernel
+        twice_max_input_scale = np.double(2.0 * max(ifm.quantization.scale_f32, ifm2.quantization.scale_f32))
+        real_input1_multiplier = np.double(ifm.quantization.scale_f32) / twice_max_input_scale
+        real_input2_multiplier = np.double(ifm2.quantization.scale_f32) / twice_max_input_scale
+
+        left_shift = 0 if op.ifm.dtype == DataType.int16 else 7
+
+        real_output_multiplier = (twice_max_input_scale * twice_max_input_scale) / (
+            np.double((1 << (left_shift * 2)) * ofm.quantization.scale_f32)
+        )
+
+        input1_multiplier, input1_shift = quantise_scale(real_input1_multiplier)
+        input2_multiplier, input2_shift = quantise_scale(real_input2_multiplier)
+        output_multiplier, output_shift = quantise_scale(real_output_multiplier)
+
+        input1_multiplier_const = create_const_tensor(
+            op.name + "_input1_multiplier", [1], DataType.int32, [input1_multiplier], quantization=identity_quant
+        )
+        input2_multiplier_const = create_const_tensor(
+            op.name + "_input2_multiplier", [1], DataType.int32, [input2_multiplier], quantization=identity_quant
+        )
+        output_multiplier_const = create_const_tensor(
+            op.name + "_output_multiplier", [1], DataType.int32, [output_multiplier], quantization=identity_quant
+        )
+
+        # Convert ifm to 32 bit
+        ifm_32bit_shifted = ifm.clone(suffix="_ifm_32bit_shifted", set_unique=True)
+        ifm_32bit_shifted.dtype = DataType.int32
+        ifm_32bit_shifted.quantization = identity_quant
+        cast_op = create_cast_op(op.name + "_ifm_32bit_shifted", ifm, ifm_32bit_shifted)
+        # Use explicit scaling (multiplier) for the left shift
+        cast_op.explicit_scaling = ExplicitScaling(False, [0], [1 << left_shift])
+        DebugDatabase.add_optimised(op, cast_op)
+
+        # The 32-bit Mul op does not scale the value, so the input has to be multiplied by the "multiplier" calculated above
+        ifm_scaled = ifm.clone(suffix="_scaled", set_unique=True)
+        ifm_scaled.dtype = DataType.int32
+        ifm_scaled.quantization = identity_quant
+        mul_op = Operation(Op.Mul, op.name + "_scaled_input1")
+        mul_op.add_input_tensor(ifm_32bit_shifted)
+        mul_op.add_input_tensor(input1_multiplier_const)
+        mul_op.set_output_tensor(ifm_scaled)
+        # Use explicit scaling for the shift (the multiplier is not actually used for int32, but the value cannot be empty)
+        mul_op.explicit_scaling = ExplicitScaling(False, [input1_shift], [input1_multiplier])
+        mul_op.set_ifm_ofm_shapes()
+        DebugDatabase.add_optimised(op, mul_op)
+
+        # Convert ifm2 to 32 bit
+        ifm2_32bit_shifted = ifm2.clone(suffix="_ifm2_32bit_shifted", set_unique=True)
+        ifm2_32bit_shifted.dtype = DataType.int32
+        ifm2_32bit_shifted.quantization = identity_quant
+        cast_op = create_cast_op(op.name + "_ifm2_32bit_shifted", ifm2, ifm2_32bit_shifted)
+        # Use explicit scaling (multiplier) for the left shift
+        cast_op.explicit_scaling = ExplicitScaling(False, [0], [1 << left_shift])
+        DebugDatabase.add_optimised(op, cast_op)
+
+        # The 32-bit Mul op does not scale the value, so the input has to be multiplied by the "multiplier" calculated above
+        ifm2_scaled = ifm2.clone(suffix="_scaled", set_unique=True)
+        ifm2_scaled.dtype = DataType.int32
+        ifm2_scaled.quantization = identity_quant
+        mul_op = Operation(Op.Mul, op.name + "_scaled_input2")
+        mul_op.add_input_tensor(ifm2_32bit_shifted)
+        mul_op.add_input_tensor(input2_multiplier_const)
+        mul_op.set_output_tensor(ifm2_scaled)
+        # Use explicit scaling for the shift (the multiplier is not actually used for int32, but the value cannot be empty)
+        mul_op.explicit_scaling = ExplicitScaling(False, [input2_shift], [input2_multiplier])
+        mul_op.set_ifm_ofm_shapes()
+        DebugDatabase.add_optimised(op, mul_op)
+
+        # Calculate the raw diff
+        raw_diff = ifm.clone(suffix="_raw_diff", set_unique=True)
+        raw_diff.dtype = DataType.int32
+        raw_diff.quantization = None
+        sub_op = Operation(Op.Sub, op.name + "_raw_diff")
+        sub_op.add_input_tensor(ifm_scaled)
+        sub_op.add_input_tensor(ifm2_scaled)
+        sub_op.set_output_tensor(raw_diff)
+        sub_op.set_ifm_ofm_shapes()
+        DebugDatabase.add_optimised(op, sub_op)
+
+        # Calculate the squared diff
+        squared_raw = ifm.clone(suffix="_squared_raw", set_unique=True)
+        squared_raw.dtype = DataType.int32
+        squared_raw.quantization = None
+        mul_op = Operation(Op.Mul, op.name + "_squared_raw")
+        mul_op.add_input_tensor(raw_diff)
+        mul_op.add_input_tensor(raw_diff)
+        mul_op.set_output_tensor(squared_raw)
+        mul_op.set_ifm_ofm_shapes()
+        DebugDatabase.add_optimised(op, mul_op)
+
+        # The 32-bit Mul op does not scale the value, so the output has to be multiplied by the "multiplier" calculated above
+        op.set_input_tensor(squared_raw, 0)
+        op.set_input_tensor(output_multiplier_const, 1)
+        op.type = Op.Mul
+        # Use explicit scaling for the shift (the multiplier is not actually used for int32, but the value cannot be empty)
+        op.explicit_scaling = ExplicitScaling(False, [output_shift], [output_multiplier])
+        op.set_ifm_ofm_shapes()
+        DebugDatabase.add_optimised(op, op)
+
+    return op
+
+
 def convert_mean_to_depthwise_conv(op, arch, nng):
     """
     When h x w <= 4096          When h x w > 4096 there is a need to split into several ops.
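The decomposition above mirrors the TFLite reference kernel's fixed-point scheme: each input is left-shifted for headroom, rescaled onto a common scale (twice the larger input scale), the difference is squared, and the result is rescaled to the output scale. The sketch below reproduces that algebra outside Vela, using made-up scale values and a simplified stand-in for Vela's quantise_scale helper:

```python
import math

def quantise_scale(real_multiplier):
    # Simplified stand-in for Vela's helper: decompose a positive real
    # multiplier as mantissa_q31 * 2**-shift, with a 31-bit mantissa.
    mantissa, exponent = math.frexp(real_multiplier)  # mantissa in [0.5, 1)
    mantissa_q31 = int(round(mantissa * (1 << 31)))
    if mantissa_q31 == 1 << 31:  # rounding overflowed the mantissa
        mantissa_q31 //= 2
        exponent += 1
    return mantissa_q31, 31 - exponent

# Made-up int8 per-tensor scales; zero points taken as 0 for brevity
ifm_scale, ifm2_scale, ofm_scale = 0.05, 0.02, 0.1
left_shift = 7  # 0 for int16 inputs, as in the pass above

twice_max_input_scale = 2.0 * max(ifm_scale, ifm2_scale)
real_input1_multiplier = ifm_scale / twice_max_input_scale
real_input2_multiplier = ifm2_scale / twice_max_input_scale
real_output_multiplier = (twice_max_input_scale * twice_max_input_scale) / (
    (1 << (left_shift * 2)) * ofm_scale
)

# Each decomposed multiplier closely approximates the real one
for m in (real_input1_multiplier, real_input2_multiplier, real_output_multiplier):
    mantissa_q31, shift = quantise_scale(m)
    assert abs(mantissa_q31 / 2.0**shift - m) <= 1e-6 * m

# Float check of the graph's algebra: shift for headroom, rescale each
# input, subtract, square, rescale; the result is (a - b)**2 expressed
# in the output quantization, i.e. divided by ofm_scale
a, b = 1.25, 0.40  # real-valued inputs
v1 = (a / ifm_scale) * (1 << left_shift) * real_input1_multiplier
v2 = (b / ifm2_scale) * (1 << left_shift) * real_input2_multiplier
ofm_q = (v1 - v2) ** 2 * real_output_multiplier
assert abs(ofm_q - (a - b) ** 2 / ofm_scale) < 1e-9
```

The left shift of 7 matches the 8-bit reference kernel; for int16 inputs the pass uses 0, presumably because 16-bit operands leave less headroom in the 32-bit intermediates.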
@@ -2669,6 +2778,7 @@ def tflite_optimise_graph(nng, arch, force_symmetric_int_weights): op_rewrite_list = [ set_tensor_equivalence, convert_ops_to_lut, + convert_squared_difference, convert_mean_to_depthwise_conv, convert_depthwise_to_conv, convert_conv_to_fc, diff --git a/ethosu/vela/tflite_mapping.py b/ethosu/vela/tflite_mapping.py index 647430e..b1e0eae 100644 --- a/ethosu/vela/tflite_mapping.py +++ b/ethosu/vela/tflite_mapping.py @@ -848,7 +848,7 @@ builtin_operator_map = { BuiltinOperator.SQUARED_DIFFERENCE: ( Op.SquaredDifference, OptionsSerializer("SquaredDifferenceOptions"), - TFLITE_NO_INDICES, + TFLITE_IFM_IFM2_INDICES, ), BuiltinOperator.MIRROR_PAD: (Op.MirrorPad, OptionsSerializer("MirrorPadOptions", ("mode",)), TFLITE_NO_INDICES), BuiltinOperator.ABS: (Op.Abs, OptionsSerializer("AbsOptions"), TFLITE_IFM_INDICES), diff --git a/ethosu/vela/tflite_model_semantic.py b/ethosu/vela/tflite_model_semantic.py index d2e0ba5..258af93 100644 --- a/ethosu/vela/tflite_model_semantic.py +++ b/ethosu/vela/tflite_model_semantic.py @@ -76,7 +76,7 @@ class TFLiteSemantic: ) ) binary_elem_wise_main_ops = binary_elem_wise_min_max_ops | binary_elem_wise_add_mul_sub | binary_elem_wise_shift_ops - elem_wise_main_ops = binary_elem_wise_main_ops | unary_elem_wise_main_ops + elem_wise_main_ops = binary_elem_wise_main_ops | unary_elem_wise_main_ops | set((Op.SquaredDifference,)) shapeless_input_ops = binary_elem_wise_main_ops | set( (Op.Split, Op.SplitV, Op.Mean, Op.ExpandDims, Op.Quantize, Op.ArgMax) ) diff --git a/ethosu/vela/tflite_supported_operators.py b/ethosu/vela/tflite_supported_operators.py index 52b0485..3dbde84 100644 --- a/ethosu/vela/tflite_supported_operators.py +++ b/ethosu/vela/tflite_supported_operators.py @@ -106,7 +106,7 @@ class TFLiteSupportedOperators: ) ) binary_elem_wise_main_ops = binary_elem_wise_min_max_ops | binary_elem_wise_add_mul_sub | binary_elem_wise_shift_ops - elem_wise_main_ops = binary_elem_wise_main_ops | unary_elem_wise_main_ops + elem_wise_main_ops = binary_elem_wise_main_ops | unary_elem_wise_main_ops | set((Op.SquaredDifference,)) pad_ops = set((Op.Pad,)) supported_int32_tensor_ops = ( set((Op.ReduceSum, Op.CLZ, Op.Shape, Op.ArgMax)) | binary_elem_wise_add_mul_sub | binary_elem_wise_shift_ops -- cgit v1.2.1
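A closing note on the registration above: entries in op_rewrite_list are invoked per op during graph optimisation, and each pass matches the ops it cares about itself, which is why convert_squared_difference first checks op.type and op.run_on_npu and otherwise returns the op untouched. A rough sketch of that dispatch pattern (illustrative only, not Vela's actual rewrite driver):

```python
# Illustrative dispatch loop, not Vela's actual rewrite driver.
def run_rewrites(ops, arch, nng, op_rewrite_list):
    for op in list(ops):
        for rewrite in op_rewrite_list:
            # Each pass returns the (possibly rewritten) op; passes that
            # do not match an op are expected to return it unchanged.
            op = rewrite(op, arch, nng)
    return ops
```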