aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJohan Alfven <johan.alfven@arm.com>2023-05-25 11:18:50 +0200
committerJohan Alfven <johan.alfven@arm.com>2023-09-18 11:17:57 +0200
commit906c9e84d60de86f5b2584ae426bbc8e11932a03 (patch)
tree36ff5d1010105d1fb997252941f08835d51e2eac
parentb4e804bb53aba48985abf3bf8466bc02310f60fc (diff)
downloadethos-u-vela-906c9e84d60de86f5b2584ae426bbc8e11932a03.tar.gz
MLBEDSW-8042: MLCE: Add SQUARED_DIFFERENCE support
- Added SQUARED_DIFFERENCE support
- Updated SUPPORTED_OPS.md

Change-Id: Id83d9d92129e645390c7979759dfdeff7a14c2ee
Signed-off-by: Johan Alfven <johan.alfven@arm.com>
-rw-r--r--SUPPORTED_OPS.md9
-rw-r--r--ethosu/vela/operation.py2
-rw-r--r--ethosu/vela/operation_util.py3
-rw-r--r--ethosu/vela/tflite_graph_optimiser.py110
-rw-r--r--ethosu/vela/tflite_mapping.py2
-rw-r--r--ethosu/vela/tflite_model_semantic.py2
-rw-r--r--ethosu/vela/tflite_supported_operators.py2
7 files changed, 124 insertions, 6 deletions
diff --git a/SUPPORTED_OPS.md b/SUPPORTED_OPS.md
index 0fef738..0d60c67 100644
--- a/SUPPORTED_OPS.md
+++ b/SUPPORTED_OPS.md
@@ -19,7 +19,7 @@ limitations under the License.
# Supported Ops
This file was automatically generated by Vela using the `--supported-ops-report` parameter.
-Vela version: `3.9.1.dev2+gc02eaa3.d20230904`
+Vela version: `3.9.1.dev7+g3a3f35e.d20230912`
This file complies with
[**Gitiles Markdown syntax**](https://github.com/google/gitiles/blob/master/Documentation/markdown.md)
@@ -70,6 +70,7 @@ Please check the supported operator list for your chosen runtime for further inf
| SOFTMAX | [Generic](#tflite-generic-constraints), [Specific](#tflite-softmax-constraints) |
| SPLIT | [Generic](#tflite-generic-constraints), [Specific](#tflite-split-constraints) |
| SPLIT_V | [Generic](#tflite-generic-constraints), [Specific](#tflite-split_v-constraints) |
+| SQUARED_DIFFERENCE | [Generic](#tflite-generic-constraints), [Specific](#tflite-squared_difference-constraints) |
| SQUEEZE | [Generic](#tflite-generic-constraints), [Specific](#tflite-squeeze-constraints) |
| STRIDED_SLICE | [Generic](#tflite-generic-constraints), [Specific](#tflite-strided_slice-constraints) |
| SUB | [Generic](#tflite-generic-constraints), [Specific](#tflite-sub-constraints) |
@@ -367,6 +368,12 @@ This is a list of constraints that the SPLIT_V operator must satisfy in order to
- Only one size is allowed to be inferred
+### TFLite SQUARED_DIFFERENCE Constraints
+
+This is a list of constraints that the SQUARED_DIFFERENCE operator must satisfy in order to be scheduled on the NPU.
+
+- At least one Input's shape must match the OFM's shape
+
### TFLite SQUEEZE Constraints
This is a list of constraints that the SQUEEZE operator must satisfy in order to be scheduled on the NPU.
diff --git a/ethosu/vela/operation.py b/ethosu/vela/operation.py
index 94d256c..c9a30b2 100644
--- a/ethosu/vela/operation.py
+++ b/ethosu/vela/operation.py
@@ -286,7 +286,7 @@ class Op(Enum):
SplitV = OperatorInfo(indices=NNG_IFM_INDICES)
Sqrt = OperatorInfo()
Square = OperatorInfo()
- SquaredDifference = OperatorInfo()
+ SquaredDifference = OperatorInfo(block_type=NpuBlockType.ElementWise, indices=NNG_IFM_IFM2_INDICES)
Squeeze = OperatorInfo(indices=NNG_IFM_INDICES)
StridedSlice = OperatorInfo(indices=NNG_IFM_INDICES)
Sub = OperatorInfo(block_type=NpuBlockType.ElementWise, indices=NNG_IFM_IFM2_INDICES)
diff --git a/ethosu/vela/operation_util.py b/ethosu/vela/operation_util.py
index ef4949f..44a80b2 100644
--- a/ethosu/vela/operation_util.py
+++ b/ethosu/vela/operation_util.py
@@ -98,7 +98,8 @@ def create_cast_op(
c = ifm.shape[-1]
- shape = [1, 1, 1, c]
+ # Weight shape is in format [h, w, c, b]
+ shape = [1, 1, c, 1]
kernel = np.dstack([1] * c)
identity_quant = QuantizationParameters(scale_f32=1.0, zero_point=0)
op.add_input_tensor(
diff --git a/ethosu/vela/tflite_graph_optimiser.py b/ethosu/vela/tflite_graph_optimiser.py
index 2fb75e6..794a6ec 100644
--- a/ethosu/vela/tflite_graph_optimiser.py
+++ b/ethosu/vela/tflite_graph_optimiser.py
@@ -1986,6 +1986,115 @@ def fixup_or_check_asymmetric_weights(force_symmetric_int_weights):
return check_asymmetric_weights
+def convert_squared_difference(op, arch, nng):
+ if op.type == Op.SquaredDifference and op.run_on_npu:
+ ifm, ifm2, ofm = op.get_ifm_ifm2_ofm()
+
+ identity_quant = QuantizationParameters(scale_f32=1.0, zero_point=0)
+
+ # All the calculations/parameters same as reference kernel
+ twice_max_input_scale = np.double(2.0 * max(ifm.quantization.scale_f32, ifm2.quantization.scale_f32))
+ real_input1_multiplier = np.double(ifm.quantization.scale_f32) / twice_max_input_scale
+ real_input2_multiplier = np.double(ifm2.quantization.scale_f32) / twice_max_input_scale
+
+ left_shift = 0 if op.ifm.dtype == DataType.int16 else 7
+
+ real_output_multiplier = (twice_max_input_scale * twice_max_input_scale) / (
+ np.double((1 << (left_shift * 2)) * ofm.quantization.scale_f32)
+ )
+
+ input1_multiplier, input1_shift = quantise_scale(real_input1_multiplier)
+ input2_multiplier, input2_shift = quantise_scale(real_input2_multiplier)
+ output_multiplier, output_shift = quantise_scale(real_output_multiplier)
+
+ input1_multiplier_const = create_const_tensor(
+ op.name + "_input1_multiplier", [1], DataType.int32, [input1_multiplier], quantization=identity_quant
+ )
+ input2_multiplier_const = create_const_tensor(
+ op.name + "_input2_multiplier", [1], DataType.int32, [input2_multiplier], quantization=identity_quant
+ )
+ output_multiplier_const = create_const_tensor(
+ op.name + "_output_multiplier", [1], DataType.int32, [output_multiplier], quantization=identity_quant
+ )
+
+ # Convert ifm to 32 bit
+ ifm_32bit_shifted = ifm.clone(suffix="_ifm_32bit_shifted", set_unique=True)
+ ifm_32bit_shifted.dtype = DataType.int32
+ ifm_32bit_shifted.quantization = identity_quant
+ cast_op = create_cast_op(op.name + "_ifm_32bit_shifted", ifm, ifm_32bit_shifted)
+ # Use explicit scaling (multiplier) for the left shift
+ cast_op.explicit_scaling = ExplicitScaling(False, [0], [1 << left_shift])
+ DebugDatabase.add_optimised(op, cast_op)
+
+ # The 32 bit Mul op does not scale the value, so the input has to be multiplied by the "multiplier" calculated above
+ ifm_scaled = ifm.clone(suffix="_scaled", set_unique=True)
+ ifm_scaled.dtype = DataType.int32
+ ifm_scaled.quantization = identity_quant
+ mul_op = Operation(Op.Mul, op.name + "_scaled_input1")
+ mul_op.add_input_tensor(ifm_32bit_shifted)
+ mul_op.add_input_tensor(input1_multiplier_const)
+ mul_op.set_output_tensor(ifm_scaled)
+ # Use explicit scaling for the shift (multiplier not actually used for int32, but value can not be empty)
+ mul_op.explicit_scaling = ExplicitScaling(False, [input1_shift], [input1_multiplier])
+ mul_op.set_ifm_ofm_shapes()
+ DebugDatabase.add_optimised(op, mul_op)
+
+ # Convert ifm2 to 32 bit
+ ifm2_32bit_shifted = ifm2.clone(suffix="_ifm2_32bit_shifted", set_unique=True)
+ ifm2_32bit_shifted.dtype = DataType.int32
+ ifm2_32bit_shifted.quantization = identity_quant
+ cast_op = create_cast_op(op.name + "_ifm2_32bit_shifted", ifm2, ifm2_32bit_shifted)
+ # Use explicit scaling (multiplier) for the left shift
+ cast_op.explicit_scaling = ExplicitScaling(False, [0], [1 << left_shift])
+ DebugDatabase.add_optimised(op, cast_op)
+
+ # The 32 bit Mul op does not scale the value, so the input has to be multiplied by the "multiplier" calculated above
+ ifm2_scaled = ifm2.clone(suffix="_scaled", set_unique=True)
+ ifm2_scaled.dtype = DataType.int32
+ ifm2_scaled.quantization = identity_quant
+ mul_op = Operation(Op.Mul, op.name + "_scaled_input2")
+ mul_op.add_input_tensor(ifm2_32bit_shifted)
+ mul_op.add_input_tensor(input2_multiplier_const)
+ mul_op.set_output_tensor(ifm2_scaled)
+ # Use explicit scaling for the shift (multiplier not actually used for int32, but value can not be empty)
+ mul_op.explicit_scaling = ExplicitScaling(False, [input2_shift], [input2_multiplier])
+ mul_op.set_ifm_ofm_shapes()
+ DebugDatabase.add_optimised(op, mul_op)
+
+ # Calculate the raw diff
+ raw_diff = ifm.clone(suffix="_raw_diff", set_unique=True)
+ raw_diff.dtype = DataType.int32
+ raw_diff.quantization = None
+ sub_op = Operation(Op.Sub, op.name + "_raw_diff")
+ sub_op.add_input_tensor(ifm_scaled)
+ sub_op.add_input_tensor(ifm2_scaled)
+ sub_op.set_output_tensor(raw_diff)
+ sub_op.set_ifm_ofm_shapes()
+ DebugDatabase.add_optimised(op, sub_op)
+
+ # Calculate the squared diff
+ squared_raw = ifm.clone(suffix="_squared_raw", set_unique=True)
+ squared_raw.dtype = DataType.int32
+ squared_raw.quantization = None
+ mul_op = Operation(Op.Mul, op.name + "_squared_raw")
+ mul_op.add_input_tensor(raw_diff)
+ mul_op.add_input_tensor(raw_diff)
+ mul_op.set_output_tensor(squared_raw)
+ mul_op.set_ifm_ofm_shapes()
+ DebugDatabase.add_optimised(op, mul_op)
+
+ # The 32 bit Mul op does not scale the value, so the output has to be multiplied by the "multiplier" calculated above
+ op.set_input_tensor(squared_raw, 0)
+ op.set_input_tensor(output_multiplier_const, 1)
+ op.type = Op.Mul
+ # Use explicit scaling for the shift (multiplier not actually used for int32, but value can not be empty)
+ op.explicit_scaling = ExplicitScaling(False, [output_shift], [output_multiplier])
+ op.set_ifm_ofm_shapes()
+ DebugDatabase.add_optimised(op, op)
+
+ return op
+
+
def convert_mean_to_depthwise_conv(op, arch, nng):
"""
When h x w <= 4096 When h x w > 4096 there is a need to split into several ops.
@@ -2669,6 +2778,7 @@ def tflite_optimise_graph(nng, arch, force_symmetric_int_weights):
op_rewrite_list = [
set_tensor_equivalence,
convert_ops_to_lut,
+ convert_squared_difference,
convert_mean_to_depthwise_conv,
convert_depthwise_to_conv,
convert_conv_to_fc,
diff --git a/ethosu/vela/tflite_mapping.py b/ethosu/vela/tflite_mapping.py
index 647430e..b1e0eae 100644
--- a/ethosu/vela/tflite_mapping.py
+++ b/ethosu/vela/tflite_mapping.py
@@ -848,7 +848,7 @@ builtin_operator_map = {
BuiltinOperator.SQUARED_DIFFERENCE: (
Op.SquaredDifference,
OptionsSerializer("SquaredDifferenceOptions"),
- TFLITE_NO_INDICES,
+ TFLITE_IFM_IFM2_INDICES,
),
BuiltinOperator.MIRROR_PAD: (Op.MirrorPad, OptionsSerializer("MirrorPadOptions", ("mode",)), TFLITE_NO_INDICES),
BuiltinOperator.ABS: (Op.Abs, OptionsSerializer("AbsOptions"), TFLITE_IFM_INDICES),
diff --git a/ethosu/vela/tflite_model_semantic.py b/ethosu/vela/tflite_model_semantic.py
index d2e0ba5..258af93 100644
--- a/ethosu/vela/tflite_model_semantic.py
+++ b/ethosu/vela/tflite_model_semantic.py
@@ -76,7 +76,7 @@ class TFLiteSemantic:
)
)
binary_elem_wise_main_ops = binary_elem_wise_min_max_ops | binary_elem_wise_add_mul_sub | binary_elem_wise_shift_ops
- elem_wise_main_ops = binary_elem_wise_main_ops | unary_elem_wise_main_ops
+ elem_wise_main_ops = binary_elem_wise_main_ops | unary_elem_wise_main_ops | set((Op.SquaredDifference,))
shapeless_input_ops = binary_elem_wise_main_ops | set(
(Op.Split, Op.SplitV, Op.Mean, Op.ExpandDims, Op.Quantize, Op.ArgMax)
)
diff --git a/ethosu/vela/tflite_supported_operators.py b/ethosu/vela/tflite_supported_operators.py
index 52b0485..3dbde84 100644
--- a/ethosu/vela/tflite_supported_operators.py
+++ b/ethosu/vela/tflite_supported_operators.py
@@ -106,7 +106,7 @@ class TFLiteSupportedOperators:
)
)
binary_elem_wise_main_ops = binary_elem_wise_min_max_ops | binary_elem_wise_add_mul_sub | binary_elem_wise_shift_ops
- elem_wise_main_ops = binary_elem_wise_main_ops | unary_elem_wise_main_ops
+ elem_wise_main_ops = binary_elem_wise_main_ops | unary_elem_wise_main_ops | set((Op.SquaredDifference,))
pad_ops = set((Op.Pad,))
supported_int32_tensor_ops = (
set((Op.ReduceSum, Op.CLZ, Op.Shape, Op.ArgMax)) | binary_elem_wise_add_mul_sub | binary_elem_wise_shift_ops