author     Louis Verhaard <louis.verhaard@arm.com>   2020-09-30 09:01:52 +0200
committer  Louis Verhaard <louis.verhaard@arm.com>   2020-10-08 16:29:29 +0200
commit     aee5d7537ff81ffda5ba222721b72f914ce50fb8 (patch)
tree       495b9dfff2a188c6916f8ca2e390ee88f7da8ccc
parent     36ad73a0fb46d3f844845c97c56d92de2a7a9b3d (diff)
download   ethos-u-vela-aee5d7537ff81ffda5ba222721b72f914ce50fb8.tar.gz
MLBEDSW-3148: Refactor Operation
- op.type is now an enum instead of a string
- Removed unused operator codes
- Refactored some attributes like npu_block_type, fused_activation_function
- Refactored operator index calculation
- Refactored a number of operator sets

Change-Id: I641f65ee375794b7aec42abc0664251ae37d78e8
Signed-off-by: Louis Verhaard <louis.verhaard@arm.com>
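Illustrative sketch (not part of the commit): the central change is that Operation.type now holds a member of the new Op enum rather than a string, so call sites compare against enum members and read static properties from them. The snippet below assumes the ethos-u-vela package from this tree is importable; the names are taken from the diff that follows.

    # Sketch only: before/after usage implied by this commit.
    from ethosu.vela.operation import Op, Operation

    op = Operation(Op.Add, "example_add")   # previously: Operation("AddAct", "example_add")

    if op.type == Op.Add:                   # previously: if op.type == "AddAct":
        # The NPU block type is now a static property of the operator code,
        # instead of being stored per-op in op.attrs["npu_block_type"].
        print(op.type.npu_block_type)       # -> NpuBlockType.ElementWise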
-rw-r--r--  ethosu/vela/architecture_features.py | 2
-rw-r--r--  ethosu/vela/extract_npu_subgraphs.py | 13
-rw-r--r--  ethosu/vela/graph_optimiser.py | 222
-rw-r--r--  ethosu/vela/high_level_command_stream_generator.py | 26
-rw-r--r--  ethosu/vela/insert_dma.py | 12
-rw-r--r--  ethosu/vela/live_range.py | 7
-rw-r--r--  ethosu/vela/mark_tensors.py | 210
-rw-r--r--  ethosu/vela/nn_graph.py | 17
-rw-r--r--  ethosu/vela/npu_performance.py | 9
-rw-r--r--  ethosu/vela/npu_serialisation.py | 8
-rw-r--r--  ethosu/vela/operation.py | 514
-rw-r--r--  ethosu/vela/pass_packing.py | 99
-rw-r--r--  ethosu/vela/register_command_stream_generator.py | 85
-rw-r--r--  ethosu/vela/scheduler.py | 15
-rw-r--r--  ethosu/vela/shared_buffer_allocation.py | 9
-rw-r--r--  ethosu/vela/softmax.py | 71
-rw-r--r--  ethosu/vela/stats_writer.py | 5
-rw-r--r--  ethosu/vela/supported_operators.py | 91
-rw-r--r--  ethosu/vela/tensor.py | 9
-rw-r--r--  ethosu/vela/test/test_lut.py | 33
-rw-r--r--  ethosu/vela/test/test_supported_operators.py | 28
-rw-r--r--  ethosu/vela/test/test_tflite_reader.py | 17
-rw-r--r--  ethosu/vela/test/testutil.py | 2
-rw-r--r--  ethosu/vela/tflite_mapping.py | 279
-rw-r--r--  ethosu/vela/tflite_reader.py | 39
-rw-r--r--  ethosu/vela/tflite_writer.py | 35
-rw-r--r--  ethosu/vela/weight_compressor.py | 12
27 files changed, 935 insertions, 934 deletions
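Before the per-file hunks, a brief sketch (not part of the patch) of the recurring call-site pattern in this refactor: per-op dictionary attributes are replaced by enum properties and dedicated Operation fields, which are introduced in operation.py below.

    # Sketch of the new accessors used throughout the changed files (illustrative only).
    def describe_op(op):
        block = op.type.npu_block_type             # was: op.attrs["npu_block_type"]
        activation = op.activation                 # was: op.attrs.get("fused_activation_function")
        mem_fn = op.memory_function                # was: op.attrs.get("fused_memory_function")
        out_quant = op.forced_output_quantization  # was: op.attrs.get("forced_output_quantization")
        return block, activation, mem_fn, out_quant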
diff --git a/ethosu/vela/architecture_features.py b/ethosu/vela/architecture_features.py
index 7391d714..3ef4d1bf 100644
--- a/ethosu/vela/architecture_features.py
+++ b/ethosu/vela/architecture_features.py
@@ -574,7 +574,7 @@ Note the difference between ArchitectureFeatures and CompilerOptions
Cortex-Mx.intercept=<some float value>
Cortex-Mx.slope=<some float value>
"""
- section = "CpuPerformance." + op.type
+ section = "CpuPerformance." + op.type.name
if self.vela_config is not None and section in self.vela_config:
op_config = self.vela_config[section]
try:
diff --git a/ethosu/vela/extract_npu_subgraphs.py b/ethosu/vela/extract_npu_subgraphs.py
index c0430b5d..e08392dc 100644
--- a/ethosu/vela/extract_npu_subgraphs.py
+++ b/ethosu/vela/extract_npu_subgraphs.py
@@ -25,17 +25,19 @@ import numpy as np
from .nn_graph import Pass
from .nn_graph import PassPlacement
from .nn_graph import Subgraph
+from .operation import CustomType
from .operation import NpuBlockType
+from .operation import Op
from .operation import Operation
def make_npu_call_op_pass(npu_subgraph):
- op = Operation("NpuOp", "call_" + npu_subgraph.name)
+ op = Operation(Op.CustomNpuOp, "call_" + npu_subgraph.name)
op.attrs["subgraph"] = npu_subgraph
+ op.attrs["custom_type"] = CustomType.NpuOp
ps = Pass(op.name, PassPlacement.MemoryOnly, False, NpuBlockType.Default)
ps.ops = [op]
ps.primary_op = op
- op.attrs["npu_block_type"] = ps.npu_block_type
op.scheduled_pass = ps
# Inputs and outputs filled in later as we cut the graphs
@@ -69,14 +71,13 @@ def switch_tensor_for_op(op, orig_tens, new_tens):
def rewrite_tensor_cpu_producer_npu_consumers(
orig_tens, call_ps, startup_init_ps, npu_subgraph, cpu_subgraph, subgraph_for_pass
):
- is_const = orig_tens.ops[0].type == "Const"
+ is_const = orig_tens.ops[0].type == Op.Const
new_tens = orig_tens.clone("_npu")
- op_type = "SubgraphInput"
+ op_type = Op.SubgraphInput
if is_const:
- op_type = "Const"
+ op_type = Op.Const
op = Operation(op_type, orig_tens.name + "_input")
- op.attrs["npu_block_type"] = NpuBlockType.Default
op.scheduled_pass = startup_init_ps
op.set_output_tensor(new_tens)
startup_init_ps.ops.append(op)
diff --git a/ethosu/vela/graph_optimiser.py b/ethosu/vela/graph_optimiser.py
index 4f435dcb..1966a82d 100644
--- a/ethosu/vela/graph_optimiser.py
+++ b/ethosu/vela/graph_optimiser.py
@@ -32,6 +32,7 @@ from .numeric_util import full_shape
from .numeric_util import round_away_zero
from .operation import create_avgpool_nop
from .operation import NpuBlockType
+from .operation import Op
from .operation import Operation
from .softmax import SoftMax
from .tensor import create_const_tensor
@@ -39,33 +40,9 @@ from .tensor import create_reshape_tensor
from .tensor import QuantizationParameters
from .tensor import Tensor
-passthrough_nodes = set(("Identity",))
-
-conv_op = set(("Conv2D", "QuantizedConv2D", "Conv2DBackpropInputSwitchedBias", "Conv2DBiasAct"))
-fc_op = set(
- (
- "MatMul",
- "QuantizedMatMul",
- "BlockLSTM",
- "RnnAct",
- "UnidirectionalSequenceRnnAct",
- "BidirectionalSequenceRnnAct",
- "LstmAct",
- "UnidirectionalSequenceLstmAct",
- "BidirectionalSequenceLstmAct",
- "FullyConnectedAct",
- )
-)
-depthwise_op = set(("DepthwiseConv2dNative", "DepthwiseConv2dBiasAct",))
-pool_op = set(
- ("AvgPool", "MaxPool", "QuantizedAvgPool", "QuantizedMaxPool", "AvgPoolAct", "MaxPoolAct", "ResizeBilinear")
-)
-reduce_sum_ops = set(("ReduceSum",))
-binary_elementwise_op = set(("AddAct", "MulAct", "SubAct", "Maximum", "Minimum"))
-elementwise_op = set(("LeakyRelu", "Abs", "CLZ", "SHL", "SHR")) | binary_elementwise_op
-relu_ops = set(("Relu", "Relu6", "ReluN1To1"))
-activation_ops = set(("Sigmoid", "Tanh")) | relu_ops
-memory_only_ops = set(("Reshape",))
+passthrough_nodes = set((Op.Identity,))
+
+memory_only_ops = set((Op.Reshape,))
def remove_passthrough_tensor(tens, arch, nng):
@@ -76,7 +53,7 @@ def remove_passthrough_tensor(tens, arch, nng):
def rewrite_concat(tens, arch, nng):
- if len(tens.ops) == 1 and tens.ops[0].is_concat_op():
+ if len(tens.ops) == 1 and tens.ops[0].type.is_concat_op():
concat_op = tens.ops[0]
if tens != concat_op.outputs[0]:
return tens # don't attempt to rewrite the min/max outputs of QuantizedConcat
@@ -90,7 +67,7 @@ def rewrite_concat(tens, arch, nng):
tens.ops = []
offset = 0
for idx, inp in enumerate(inputs):
- new_op = Operation("ConcatSliceWrite", concat_op.name + str(idx))
+ new_op = Operation(Op.ConcatSliceWrite, concat_op.name + str(idx))
new_op.inputs = [inp]
new_op.outputs = [tens]
new_op.attrs["concat_axis"] = axis
@@ -116,7 +93,7 @@ def rewrite_concat(tens, arch, nng):
def rewrite_split(tens, arch, nng):
- if len(tens.ops) == 1 and tens.ops[0].is_split_op():
+ if len(tens.ops) == 1 and tens.ops[0].type.is_split_op():
split_op = tens.ops[0]
# Not supported so leave it and run on CPU
@@ -126,7 +103,7 @@ def rewrite_split(tens, arch, nng):
inp, outputs, axis, offset_start, offset_end = split_op.get_split_inputs_axis()
tens.ops = []
- new_op = Operation("SplitSliceRead", split_op.name)
+ new_op = Operation(Op.SplitSliceRead, split_op.name)
new_op.inputs = [inp]
# For Split the offset cannot be extracted from the tensor so it has to
@@ -206,10 +183,10 @@ def calc_upscaled_padding_and_skirt(padding_type, kernel_size, stride, input_dim
def fixup_conv2d_backprop(op, arch, nng):
- if op.type == "Conv2DBackpropInput":
+ if op.type == Op.Conv2DBackpropInput:
# flip the inputs
op.inputs[0], op.inputs[2] = op.inputs[2], op.inputs[0]
- op.type = "Conv2DBackpropInputSwitchedBias"
+ op.type = Op.Conv2DBackpropInputSwitchedBias
# Update strides
op.attrs.update({"stride_w": 1, "stride_h": 1, "strides": (1, 1, 1, 1)})
@@ -219,9 +196,8 @@ def fixup_conv2d_backprop(op, arch, nng):
# Convert the op to an elementwise add
def convert_resizebilinear_1x1_to_add(op):
- op.type = "AddAct"
+ op.type = Op.Add
op.name = op.name + "_add"
- op.attrs.update({"npu_block_type": NpuBlockType.ElementWise})
op.attrs["resizebilinear"] = True
# Create an input tensor filled with zeros
shape = op.outputs[0].shape
@@ -296,11 +272,11 @@ def convert_resizebilinear_to_2x2_pool(op):
def fixup_resizebilinear(op, arch, nng):
- if op.type == "ResizeBilinear" and op.run_on_npu:
+ if op.type == Op.ResizeBilinear and op.run_on_npu:
if op.inputs[0].shape == op.outputs[0].shape:
# Bypass nop resizebilinear
op.inputs = op.inputs[:1]
- op.type = "Identity"
+ op.type = Op.Identity
elif op.inputs[0].shape[1] == 1 and op.inputs[0].shape[2] == 1:
convert_resizebilinear_1x1_to_add(op)
else:
@@ -310,16 +286,16 @@ def fixup_resizebilinear(op, arch, nng):
def convert_nop_split_to_identity(op, arch, nng):
- if op.type == "Split" and op.attrs.get("num_splits") == 1:
+ if op.type == Op.Split and op.attrs.get("num_splits") == 1:
# the list comprehension should return a list with a single tensor
# if it shouldn't, remove_passthrough_tensor will fail appropriately
op.inputs = [i for i in op.inputs if i.shape == op.outputs[0].shape]
- op.type = "Identity"
+ op.type = Op.Identity
return op
def fixup_fully_connected_input(op, arch, nng):
- if op.type == "FullyConnectedAct":
+ if op.type == Op.FullyConnected:
inp = op.inputs[0]
weights = op.inputs[1]
@@ -337,7 +313,7 @@ def fixup_fully_connected_input(op, arch, nng):
def convert_batched_fc_to_conv(op, arch, nng):
- if op.type == "FullyConnectedAct":
+ if op.type == Op.FullyConnected:
ifm = op.inputs[0]
ofm = op.outputs[0]
# Check if the FC is 2D and first dimension indicates batching
@@ -348,14 +324,11 @@ def convert_batched_fc_to_conv(op, arch, nng):
# Convert to convolution
op.name += "_conv"
- op.type = "Conv2DBiasAct"
- faf = op.attrs.get("fused_activation_function", None)
+ op.type = Op.Conv2DBias
op.attrs = {
"dilation": (1, 1, 1, 1),
"dilation_h_factor": 1,
"dilation_w_factor": 1,
- "fused_activation_function": faf,
- "npu_block_type": NpuBlockType.ConvolutionMxN,
"padding": b"SAME",
"stride_h": 1,
"stride_w": 1,
@@ -364,7 +337,7 @@ def convert_batched_fc_to_conv(op, arch, nng):
prev_op = ifm.ops[0]
desired_shape = [1, h, w, ifm.shape[-1]]
- if len(ifm.consumer_list) == 1 and prev_op is not None and prev_op.type == "Reshape":
+ if len(ifm.consumer_list) == 1 and prev_op is not None and prev_op.type == Op.Reshape:
# There is a preceding Reshape
# Compare input of prev_op and input of op, to see if prev_op can be removed
ifm_prev_op = prev_op.inputs[0]
@@ -391,7 +364,7 @@ def convert_batched_fc_to_conv(op, arch, nng):
if (
len(ofm.consumer_list) == 1
and ofm.consumer_list[0] is not None
- and ofm.consumer_list[0].type == "Reshape"
+ and ofm.consumer_list[0].type == Op.Reshape
):
# There is a subsequent Reshape
# Compare desired shape and output of consumer op, to see if consumer op can be removed
@@ -408,7 +381,7 @@ def convert_batched_fc_to_conv(op, arch, nng):
def fixup_pack_input(op, arch, nng):
- if op.type == "Pack":
+ if op.type == Op.Pack:
# Pack is also referred to as Stack
# Requires the rewrite_concat function to be called on the op afterwards
axis = int(op.attrs["axis"])
@@ -421,24 +394,22 @@ def fixup_pack_input(op, arch, nng):
reshape_out = inp.clone("_reshaped")
reshape_out.set_all_shapes(desired_shape)
- reshape_op = Operation("Reshape", "{}{}_reshape".format(op.name, idx))
+ reshape_op = Operation(Op.Reshape, "{}{}_reshape".format(op.name, idx))
reshape_op.attrs["new_shape"] = desired_shape
reshape_op.inputs = [inp, new_shape_tens]
reshape_op.set_output_tensor(reshape_out)
op.inputs[idx] = reshape_out
- op.type = "PackReshaped"
+ op.type = Op.PackReshaped
return op
def unfuse_activation_function(op, arch, nng):
- unfuse_ops = ("ConcatTFLite",)
- if op.type in unfuse_ops and op.run_on_npu and op.attrs.get("fused_activation_function", None) is not None:
- act = op.attrs["fused_activation_function"]
- del op.attrs["fused_activation_function"]
- act_op = Operation(act, op.name + act)
+ if op.type == Op.ConcatTFLite and op.run_on_npu and op.activation is not None:
+ act_op = Operation(op.activation, op.name + op.activation.name)
+ op.activation = None
out_tens = op.outputs[0]
intermediate_tens = out_tens.clone("_act_intermediate")
act_op.set_output_tensor(out_tens)
@@ -450,12 +421,12 @@ def unfuse_activation_function(op, arch, nng):
def fixup_unpack_output(tens, arch, nng):
op = tens.ops[0]
- if op.type in set(("Unpack", "StridedSlice")):
+ if op.type in set((Op.Unpack, Op.StridedSlice)):
# Unpack is also referred to as Unstack
# Requires the rewrite_split function to be called on the op afterwards
reshape_input_shape = tens.shape
- if op.type == "StridedSlice":
+ if op.type == Op.StridedSlice:
new_axis_mask = op.attrs["new_axis_mask"]
shrink_axis_mask = op.attrs["shrink_axis_mask"]
ellipsis_mask = op.attrs["ellipsis_mask"]
@@ -494,7 +465,7 @@ def fixup_unpack_output(tens, arch, nng):
op.attrs["new_axis_mask"] = 0
else:
axis = int(op.attrs["axis"])
- op.type = "UnpackReshaped"
+ op.type = Op.UnpackReshaped
reshape_input_shape = tens.shape[:axis] + [1] + tens.shape[axis:]
# Construct 1 shape tensor to be used by all inserted reshape ops
@@ -505,7 +476,7 @@ def fixup_unpack_output(tens, arch, nng):
reshape_in.set_all_shapes(reshape_input_shape)
reshape_in.ops = [op]
- reshape_op = Operation("Reshape", "{}{}_reshape".format(op.name, idx))
+ reshape_op = Operation(Op.Reshape, "{}{}_reshape".format(op.name, idx))
reshape_op.attrs["new_shape"] = reshape_input_shape
reshape_op.inputs = [reshape_in, new_shape_tens]
reshape_op.set_output_tensor(out_tens)
@@ -518,19 +489,16 @@ def fixup_unpack_output(tens, arch, nng):
def add_padding_fields(op, arch, nng):
if op.run_on_npu:
if "padding" in op.attrs:
- if op.type in conv_op | depthwise_op:
+ if op.type.is_conv2d_op() or op.type.is_depthwise_conv2d_op():
kernel_size = op.inputs[1].shape[:2]
input_shape = op.inputs[0].shape
- elif op.type in pool_op | reduce_sum_ops:
+ elif op.type.is_pool_op() or op.type.npu_block_type == NpuBlockType.ReduceSum:
kernel_size = op.attrs["ksize"][1:3]
input_shape = op.inputs[0].shape
- elif op.type == "ExtractImagePatches":
- kernel_size = op.attrs["ksizes"][1:3]
- input_shape = op.inputs[0].shape
else:
raise UnsupportedFeatureError("Unknown operation that uses padding: {}".format(op.type))
- if op.type == "Conv2DBackpropInputSwitchedBias":
+ if op.type == Op.Conv2DBackpropInputSwitchedBias:
upscaling_factor = op.outputs[0].shape[1] // input_shape[1]
padding, skirt = calc_upscaled_padding_and_skirt(
op.attrs["padding"], kernel_size, op.attrs["strides"], input_shape, upscaling_factor
@@ -564,38 +532,19 @@ def get_prepend_op(op):
return None
-def mark_npu_block_type(op, arch, nng):
- npu_block_type = NpuBlockType.Default
- if op.type in conv_op:
- npu_block_type = NpuBlockType.ConvolutionMxN
- elif op.type in fc_op:
- npu_block_type = NpuBlockType.VectorProduct
- elif op.type in depthwise_op:
- npu_block_type = NpuBlockType.ConvolutionDepthWise
- elif op.type in pool_op:
- npu_block_type = NpuBlockType.Pooling
- elif op.type in elementwise_op:
- npu_block_type = NpuBlockType.ElementWise
- elif op.type in reduce_sum_ops:
- npu_block_type = NpuBlockType.ReduceSum
-
- op.attrs["npu_block_type"] = npu_block_type
- return op
-
-
def convert_depthwise_to_conv(op, arch, nng):
# Depthwise is equivalent to a single conv2d if the ifm depth is 1 and
# the ofm depth equals the depth multipler.
# If those conditions are true, then we can perform a simple
# switch of the operator type (and weight order)
- if (op.type in depthwise_op) and (op.attrs["depth_multiplier"] != 1):
+ if op.type == Op.DepthwiseConv2DBias and (op.attrs["depth_multiplier"] != 1):
ifm_tensor = op.inputs[0]
weight_tensor = op.inputs[1]
ofm_tensor = op.outputs[0]
if (ifm_tensor.shape[3] == 1) and (ofm_tensor.shape[3] == op.attrs["depth_multiplier"]):
# Change op type to Conv2d
- op.type = op.type.replace("DepthwiseConv2d", "Conv2D")
+ op.type = Op.Conv2DBias
del op.attrs["channel_multiplier"]
del op.attrs["depth_multiplier"]
@@ -611,7 +560,7 @@ def convert_depthwise_to_conv(op, arch, nng):
def reorder_depthwise_weights(op, arch, nng):
- if op.type in depthwise_op:
+ if op.type.is_depthwise_conv2d_op():
weight_tensor = op.inputs[1]
weight_tensor.quant_values = np.transpose(weight_tensor.quant_values, (0, 1, 3, 2))
weight_tensor.set_all_shapes(list(weight_tensor.quant_values.shape))
@@ -625,18 +574,15 @@ def convert_conv_to_fc(op, arch, nng):
# By representing certain convs as fully connected layers, Vela can better determine wether or not to use
# caching/double buffering for the weights.
# (Weights dont need to be reloaded for convs when IFM H and W are 1)
- if op.type == "Conv2DBiasAct":
+ if op.type == Op.Conv2DBias:
_, h, w, _ = op.inputs[0].shape
kh, kw, _, _ = op.inputs[1].shape
if h == 1 and w == 1 and kh == 1 and kw == 1:
# Overwrite this op as a Fully Connected Op
op.name += "_fc"
- op.type = "FullyConnectedAct"
- faf = op.attrs.get("fused_activation_function", None)
+ op.type = Op.FullyConnected
op.attrs = {
- "fused_activation_function": faf,
"weights_format": 0,
- "npu_block_type": NpuBlockType.VectorProduct,
}
# Reshape Weights to be 2D. HWIO becomes just IO (as H and W are 1, they can just be dropped)
weight_tensor = op.inputs[1]
@@ -652,7 +598,7 @@ def convert_conv_to_fc(op, arch, nng):
# Add a reshape after the new OFM to convert it back to the original 4D shape
reshape_name = op.name + "_reshape"
new_shape_tens = create_const_tensor(reshape_name + "_shape", [1], DataType.int32, orig_ofm_tensor.shape)
- reshape_op = Operation("Reshape", reshape_name)
+ reshape_op = Operation(Op.Reshape, reshape_name)
reshape_op.attrs["new_shape"] = orig_ofm_tensor.shape
reshape_op.inputs = [fc_ofm_tensor, new_shape_tens]
reshape_op.set_output_tensor(orig_ofm_tensor)
@@ -662,7 +608,7 @@ def convert_conv_to_fc(op, arch, nng):
def fixup_relus_with_differing_ifm_ofm_scaling(op, arch, nng):
- if op.run_on_npu and op.type in relu_ops:
+ if op.run_on_npu and op.type.is_relu_op():
ifm = op.inputs[0]
ofm = op.outputs[0]
# Relu with differing IFM and OFM scaling cannot be fused with another primary op
@@ -671,7 +617,7 @@ def fixup_relus_with_differing_ifm_ofm_scaling(op, arch, nng):
# Override this op with its own primary op (avgpool)
relu_fused_op = create_avgpool_nop(op.name + "_avgpool")
# And fuse the original activation function to it
- relu_fused_op.attrs["fused_activation_function"] = op.type
+ relu_fused_op.activation = op.type
# Tidy up and assign the ifm and ofm to the new op
ifm.consumer_list.remove(op)
@@ -691,7 +637,7 @@ def fixup_relus_with_differing_ifm_ofm_scaling(op, arch, nng):
# Reorder activation op if it's after the memory only operations
def fixup_act_reorder(op, arch, nng):
- if op.type in activation_ops:
+    if op.type.is_relu_op() or op.type in set((Op.Sigmoid, Op.Tanh)):
prep_op = get_prepend_op(op)
if prep_op is not None:
act_op = op.clone("_reordered")
@@ -711,12 +657,12 @@ def fixup_act_reorder(op, arch, nng):
prep_op.outputs[0].quantization = act_op_out.quantization.clone()
# Mark the op so that it will be removed as passthrough later on
- op.type = "Identity"
+ op.type = Op.Identity
return op
def fixup_elementwise_with_scalars(op, arch, nng):
- if op.type in binary_elementwise_op:
+ if op.type.is_binary_elementwise_op():
ifm_tensor, ifm2_tensor, _, _ = op.get_ifm_ifm2_weights_ofm()
if ifm2_tensor.shape != [] and ifm_tensor.shape != []:
diff = len(ifm_tensor.shape) - len(ifm2_tensor.shape)
@@ -745,7 +691,7 @@ def set_tensor_equivalence(op, arch, nng):
def convert_softmax(op, arch, nng):
- if op.type == "Softmax" and op.run_on_npu:
+ if op.type == Op.Softmax and op.run_on_npu:
softmax = SoftMax(op)
op = softmax.get_graph()
return op
@@ -761,9 +707,9 @@ def convert_mul_max_to_abs_or_lrelu(op, arch, nng):
Max
"""
- if op.type == "Maximum":
+ if op.type == Op.Maximum:
# finds the Mul input(s) to the Max
- muls = [i for i in op.inputs if i.ops[0].type == "MulAct"]
+ muls = [i for i in op.inputs if i.ops[0].type == Op.Mul]
if len(muls) == 1:
mul = muls[0].ops[0]
elif len(muls) == 2:
@@ -777,10 +723,10 @@ def convert_mul_max_to_abs_or_lrelu(op, arch, nng):
mul_ofm = mul.outputs[0]
if len(mul_ofm.consumers()) != 1:
return op
- # make sure the Mul doesn't have a faf
- if mul.attrs["fused_activation_function"]:
+ # make sure the Mul doesn't have a fused activation function
+ if mul.activation:
return op
- ifm, _, _, ofm = op.get_ifm_weights_biases_ofm()
+ ifm, ofm = op.get_ifm_ofm()
if ifm.dtype not in (DataType.uint8, DataType.int8) or ifm.dtype != ofm.dtype:
return op
if not ifm.is_scaling_equal(ofm) or not ifm.is_scaling_equal(mul_ofm):
@@ -798,7 +744,7 @@ def convert_mul_max_to_abs_or_lrelu(op, arch, nng):
return op
const = const_tens.ops[0]
# check that it is a constant
- if const.type != "Const":
+ if const.type != Op.Const:
return op
# Remove the Mul from the shared input's consumers
shared_in.consumer_list.remove(mul)
@@ -807,7 +753,7 @@ def convert_mul_max_to_abs_or_lrelu(op, arch, nng):
val = const.outputs[0].values
if val >= 0:
- new_op = "LeakyRelu"
+ new_op = Op.LeakyRelu
op.attrs["alpha"] = val
# to produce bit exact results, the alpha is not enough;
# save additional scaling info in attr "alpha_scale", to be used as input
@@ -819,13 +765,13 @@ def convert_mul_max_to_abs_or_lrelu(op, arch, nng):
alpha_scale, alpha_shift = scaling.elementwise_mul_scale(mul_ifm_scale, mul_ifm2_scale, mul_ofm_scale)
op.attrs["alpha_scaling"] = (alpha_scalar, alpha_scale, alpha_shift)
elif val == -1:
- new_op = "Abs"
+ new_op = Op.Abs
else:
return op
- op.type = op.type.replace("Maximum", new_op)
- op.name = op.name.replace("Maximum", new_op)
- op.outputs[0].name = op.outputs[0].name.replace("Maximum", new_op)
+ op.type = new_op
+ op.name = op.name.replace("Maximum", new_op.name)
+ op.outputs[0].name = op.outputs[0].name.replace("Maximum", new_op.name)
op.inputs = [shared_in]
return op
@@ -833,10 +779,10 @@ def convert_mul_max_to_abs_or_lrelu(op, arch, nng):
def convert_lrelu_to_mul_max(op, arch):
# Converts LeakyRelu to Max(alpha * IFM, identity * IFM)
# (the opposite of convert_mul_max_to_abs_or_lrelu)
- ifm, _, _, ofm = op.get_ifm_weights_biases_ofm()
+ ifm, ofm = op.get_ifm_ofm()
# Add multiplication with alpha
- mul_alpha = Operation("MulAct", op.name + "_mul_alpha")
+ mul_alpha = Operation(Op.Mul, op.name + "_mul_alpha")
mul_alpha.add_input_tensor(ifm)
# Create const tensor containing alpha as scalar
alpha = op.attrs["alpha"]
@@ -855,7 +801,7 @@ def convert_lrelu_to_mul_max(op, arch):
fm_id = ifm
else:
# Add multiplication with identity
- mul_identity = Operation("MulAct", op.name + "_mul_identity")
+ mul_identity = Operation(Op.Mul, op.name + "_mul_identity")
mul_identity.add_input_tensor(ifm)
# Create const tensor containing identity as scalar
quantization = ifm.quantization.clone()
@@ -871,7 +817,7 @@ def convert_lrelu_to_mul_max(op, arch):
mul_identity.set_output_tensor(fm_id)
# Convert LeakyRelu to Max, add the results of the multiplication(s) as inputs
- op.type = "Maximum"
+ op.type = Op.Maximum
op.name = op.name.replace("LeakyRelu", "Maximum")
op.inputs = []
ifm.consumer_list.remove(op)
@@ -884,9 +830,8 @@ def convert_to_lut(op, lut_values):
# Rewrite the operation by Add with scalar 0 + LUT activation
ifm = op.inputs[0]
assert ifm.dtype.size_in_bytes() == 1
- op.type = "AddAct"
+ op.type = Op.Add
op.name = op.name + "_add"
- op.attrs.update({"npu_block_type": NpuBlockType.ElementWise})
# Mark as no-op to enable potential fusing optimizations
op.attrs["is_nop"] = True
# Create an input tensor containing scalar zero
@@ -898,7 +843,7 @@ def convert_to_lut(op, lut_values):
# The LUT must be applied without any preceding rescaling (the LUT itself performs the rescale),
# so even if the OFM has a different scale than the IFM, the generated OFM scale instructions
# should be the same as the IFM
- op.attrs["forced_output_quantization"] = ifm.quantization
+ op.forced_output_quantization = ifm.quantization
lut_tensor = lut.create_lut_tensor(op.name + "_lut", lut_values, DataType.int8)
op.set_activation_lut(lut_tensor)
return op
@@ -907,7 +852,7 @@ def convert_to_lut(op, lut_values):
def convert_to_lut8(op, fn):
# Converts op to a no-op + int8/uint8 LUT which is generated with the given function.
# fn is a function(real) -> real
- ifm, _, _, ofm = op.get_ifm_weights_biases_ofm()
+ ifm, ofm = op.get_ifm_ofm()
if ifm.dtype not in (DataType.uint8, DataType.int8) or ifm.dtype != ofm.dtype:
return op
# Generate the LUT
@@ -929,7 +874,7 @@ def convert_to_lut8(op, fn):
def convert_lrelu_to_lut(op, arch):
- ifm, _, _, ofm = op.get_ifm_weights_biases_ofm()
+ ifm, ofm = op.get_ifm_ofm()
# Generate the LUT
alpha = op.attrs["alpha"]
ifm_scale = np.double(ifm.quantization.scale_f32)
@@ -960,9 +905,9 @@ def convert_lrelu_to_lut(op, arch):
def convert_lrelu(op, arch, nng):
# Converts LeakyRelu to a LUT based solution if possible, otherwise a mul + max
- if op.type != "LeakyRelu":
+ if op.type != Op.LeakyRelu:
return op
- ifm, _, _, ofm = op.get_ifm_weights_biases_ofm()
+ ifm, ofm = op.get_ifm_ofm()
if ifm.dtype in (DataType.uint8, DataType.int8) and ifm.dtype == ofm.dtype:
# use LUT for int8/uint8
return convert_lrelu_to_lut(op, arch)
@@ -974,20 +919,20 @@ def convert_lrelu(op, arch, nng):
def convert_tanh_sigmoid_to_lut(op, arch, nng):
# Converts int8/uint8 Sigmoid and Tanh to a LUT based solution
- if op.type == "Sigmoid":
+ if op.type == Op.Sigmoid:
return convert_to_lut8(op, clamp_sigmoid)
- elif op.type == "Tanh":
+ elif op.type == Op.Tanh:
return convert_to_lut8(op, math.tanh)
return op
def remove_unwanted_reshapes(op, arch, nng):
# Try to remove reshapes enclosing ElementWise operator with only one non-constant input
- if not op.run_on_npu or op.attrs["npu_block_type"] != NpuBlockType.ElementWise:
+ if not op.run_on_npu or not op.type.is_elementwise_op():
return op
# Check if the ElementWise operator only have one non-constant input
- non_const_tens = [x for x in op.inputs if x.ops[0].type != "Const"]
+ non_const_tens = [x for x in op.inputs if x.ops[0].type != Op.Const]
if len(non_const_tens) != 1:
return op
ifm = non_const_tens[0]
@@ -997,12 +942,12 @@ def remove_unwanted_reshapes(op, arch, nng):
prev_op = ifm.ops[0]
if (
len(ifm.consumer_list) == 1
- and prev_op.type == "Reshape"
+ and prev_op.type == Op.Reshape
and len(ofm.consumer_list) == 1
- and ofm.consumer_list[0].type == "Reshape"
+ and ofm.consumer_list[0].type == Op.Reshape
):
# Operation is enclosed by reshapes, check if they can be removed
- prev_op_ifm, _, _, prev_op_ofm = prev_op.get_ifm_weights_biases_ofm()
+ prev_op_ifm, prev_op_ofm = prev_op.get_ifm_ofm()
cons_op = ofm.consumer_list[0]
cons_op_ifm = ofm
cons_op_ofm = cons_op.outputs[0]
@@ -1018,19 +963,18 @@ def remove_unwanted_reshapes(op, arch, nng):
def fuse_activation_function_with_prev(op, arch, nng):
# if op is a no-op: attempts to move the activation function to the preceding op
- if not op.attrs.get("is_nop", False) or op.attrs.get("fused_activation_function", None) is None:
+ if not op.attrs.get("is_nop", False) or op.activation is None:
return op
- ifm, _, _, ofm = op.get_ifm_weights_biases_ofm()
+ ifm, ofm = op.get_ifm_ofm()
# finds the input(s) to the operation
prev_op = ifm.ops[0]
# Note: the below checks on prev_op require that a first optimize pass on the full graph has been performed
fuse = (
prev_op.run_on_npu
- and "npu_block_type" in prev_op.attrs
- and prev_op.attrs["npu_block_type"] != NpuBlockType.Default
+ and prev_op.type.npu_block_type != NpuBlockType.Default
and len(ifm.ops) == 1
and len(prev_op.outputs[0].consumers()) == 1
- and prev_op.attrs.get("fused_activation_function", None) is None
+ and prev_op.activation is None
)
if op.activation_lut is not None and arch.shram_reserved_unused_banks == 0:
# TODO: if SHRAM LUT space is shared with SHRAM ACC (32, 64 MAC),
@@ -1039,9 +983,8 @@ def fuse_activation_function_with_prev(op, arch, nng):
if not fuse:
return op
# Move the fused activation function + corresponding info to prev_op
- for attr in ("fused_activation_function", "forced_output_quantization"):
- if attr in op.attrs:
- prev_op.attrs[attr] = op.attrs[attr]
+ prev_op.activation = op.activation
+ prev_op.forced_output_quantization = op.forced_output_quantization
if op.activation_lut is not None:
prev_op.set_activation_lut(op.activation_lut)
# Bypass op
@@ -1050,7 +993,7 @@ def fuse_activation_function_with_prev(op, arch, nng):
def add_attrs_to_resizebilinear(op, arch, nng):
- if op.type == "ResizeBilinear" and op.run_on_npu:
+ if op.type == Op.ResizeBilinear and op.run_on_npu:
input_tensor = op.inputs[0]
upscaled_shape = [input_tensor.shape[1] * 2, input_tensor.shape[2] * 2]
out_shape = op.outputs[0].shape[1:3]
@@ -1070,7 +1013,7 @@ def add_attrs_to_resizebilinear(op, arch, nng):
def fixup_bias_tensors(op, arch, nng):
- if op.needs_bias() and not op.inputs[-1]:
+ if op.type.needs_bias() and op.bias is None:
# Op has no bias, add bias tensor filled with zeros
nr_biases = op.inputs[1].shape[-1]
bias_values = [0] * nr_biases
@@ -1091,8 +1034,6 @@ def optimise_graph_a(nng, arch, verbose_graph=False):
nng.print_graph()
op_rewrite_list = [
- # mark block type and check if the operations are supported
- mark_npu_block_type,
set_tensor_equivalence,
supported_operator_check,
# then do any rewrites of supported operators
@@ -1106,7 +1047,6 @@ def optimise_graph_a(nng, arch, verbose_graph=False):
fixup_conv2d_backprop,
fixup_relus_with_differing_ifm_ofm_scaling,
fixup_act_reorder,
- mark_npu_block_type,
fixup_elementwise_with_scalars,
reorder_depthwise_weights,
fixup_resizebilinear,
diff --git a/ethosu/vela/high_level_command_stream_generator.py b/ethosu/vela/high_level_command_stream_generator.py
index 8486dadc..dc52ae52 100644
--- a/ethosu/vela/high_level_command_stream_generator.py
+++ b/ethosu/vela/high_level_command_stream_generator.py
@@ -25,6 +25,7 @@ from .nn_graph import PassPlacement
from .nn_graph import SchedulingStrategy
from .numeric_util import round_up_divide
from .operation import NpuBlockType
+from .operation import Op
from .tensor import TensorPurpose
@@ -39,7 +40,7 @@ def match_tensor(source, derived):
if source == derived:
return True
ops = derived.ops
- return ops != [] and len(ops) == 1 and ops[0].type == "SplitSliceRead" and source == ops[0].inputs[0]
+ return ops != [] and len(ops) == 1 and ops[0].type == Op.SplitSliceRead and source == ops[0].inputs[0]
def generate_high_level_command_stream_for_pass(strat, passes, block_configs, idx):
@@ -56,8 +57,8 @@ def generate_high_level_command_stream_for_pass(strat, passes, block_configs, id
ps.ifm_tensor, ps.ifm2_tensor = ps.ifm2_tensor, ps.ifm_tensor
for op in ps.ops:
- if op.type == "SplitSliceRead":
- ps.primary_op.attrs["fused_memory_function"] = op.type
+ if op.type == Op.SplitSliceRead:
+ ps.primary_op.memory_function = op.type
assert len(op.inputs) == 1
if match_tensor(ps.ifm_tensor, op.inputs[0]):
split_offsets[0] = op.attrs["split_start"]
@@ -68,10 +69,10 @@ def generate_high_level_command_stream_for_pass(strat, passes, block_configs, id
else:
ifm_idx = 0
for op in ps.ops:
- if op.type == "SplitSliceRead":
+ if op.type == Op.SplitSliceRead:
assert ifm_idx < 2
split_offsets[ifm_idx] = op.attrs["split_start"]
- ps.primary_op.attrs["fused_memory_function"] = op.type
+ ps.primary_op.memory_function = op.type
ifm_idx += 1
ifm_tensor = ps.ifm_tensor
@@ -89,19 +90,16 @@ def generate_high_level_command_stream_for_pass(strat, passes, block_configs, id
if ps.primary_op is not None:
strides = ps.primary_op.attrs.get("strides", None)
skirt = ps.primary_op.attrs.get("skirt", None)
- if ps.primary_op.type == "Conv2DBackpropInputSwitchedBias":
+ if ps.primary_op.type == Op.Conv2DBackpropInputSwitchedBias:
upscaling = ofm_tensor.shape[-3] // ifm_tensor.shape[-3]
- elif ps.primary_op.type == "ResizeBilinear":
+ elif ps.primary_op.type == Op.ResizeBilinear:
upscaling = round_up_divide(ofm_tensor.shape[-3], ifm_tensor.shape[-3])
concat_axis = 0
concat_offset = 0
- # Fusable activation functions
- activation_ops = set(("Sigmoid", "Tanh", "Relu", "Relu6", "ReluN1To1"))
-
for op in ps.ops:
- if op.type == "ConcatSliceWrite":
+ if op.type == Op.ConcatSliceWrite:
concat_axis = op.attrs["concat_axis"]
concat_start = op.attrs["concat_start"]
concat_end = op.attrs["concat_end"]
@@ -109,9 +107,9 @@ def generate_high_level_command_stream_for_pass(strat, passes, block_configs, id
ofm_start[concat_axis] = concat_start
ofm_end[concat_axis] = concat_end
concat_offset = concat_start
- ps.primary_op.attrs["fused_memory_function"] = op.type
- elif op.type in activation_ops:
- ps.primary_op.attrs["fused_activation_function"] = op.type
+ ps.primary_op.memory_function = op.type
+ elif op.type.is_relu_op() or op.type in (Op.Tanh, Op.Sigmoid):
+ ps.primary_op.activation = op.type
if strat == SchedulingStrategy.WeightStream:
ofm_step = block_config[-1]
diff --git a/ethosu/vela/insert_dma.py b/ethosu/vela/insert_dma.py
index 99b46c07..56d68d13 100644
--- a/ethosu/vela/insert_dma.py
+++ b/ethosu/vela/insert_dma.py
@@ -17,6 +17,7 @@
# Insert DMA operations into the graph for transfering weights.
from . import rewrite_graph
from .operation import NpuBlockType
+from .operation import Op
from .operation import Operation
from .tensor import MemArea
from .tensor import MemType
@@ -24,9 +25,6 @@ from .tensor import TensorPurpose
from .weight_compressor import compress_weights
-binary_elementwise_op = set(("AddAct", "MulAct", "SubAct", "Maximum", "Minimum"))
-
-
def weights_fit_sram(arch, op, tens, nng):
if tens.purpose != TensorPurpose.Weights:
return True
@@ -57,7 +55,7 @@ def weights_fit_sram(arch, op, tens, nng):
def insert_dma_cmd(op, arch, nng):
- if op.type == "DMA" or not op.run_on_npu:
+ if op.type == Op.DMA or not op.run_on_npu:
return op
is_lut_used = any(inp.purpose == TensorPurpose.LUT for inp in op.inputs)
@@ -76,14 +74,14 @@ def insert_dma_cmd(op, arch, nng):
) or tens.purpose == TensorPurpose.LUT:
if tens.purpose in (TensorPurpose.Weights, TensorPurpose.LUT) or (
tens.purpose == TensorPurpose.FeatureMap
- and op.type in binary_elementwise_op
+ and op.type.is_binary_elementwise_op()
and tens.shape != []
and tens.shape != op.outputs[0].shape
and tens.storage_size() > max_ifm_shram_avail
):
only_vector_product_consumers = True
for oper in tens.consumers():
- if oper is None or oper.attrs.get("npu_block_type") != NpuBlockType.VectorProduct:
+ if oper is None or oper.type.npu_block_type != NpuBlockType.VectorProduct:
only_vector_product_consumers = False
break
@@ -95,7 +93,7 @@ def insert_dma_cmd(op, arch, nng):
) or tens.purpose == TensorPurpose.LUT:
# Insert a DMA command here, as well as a new tensor situated in SRAM of the same size.
new_tens = tens.clone_into_fast_storage(arch)
- dma_cmd = Operation("DMA", tens.ops[0].name + "_dma")
+ dma_cmd = Operation(Op.DMA, tens.ops[0].name + "_dma")
dma_cmd.inputs = [tens]
dma_cmd.set_output_tensor(new_tens)
dma_cmd.attrs["source"] = tens.mem_area
diff --git a/ethosu/vela/live_range.py b/ethosu/vela/live_range.py
index 9a8ee580..23026c79 100644
--- a/ethosu/vela/live_range.py
+++ b/ethosu/vela/live_range.py
@@ -18,6 +18,7 @@
# Can work with either a pass packed subgraph or a scheduled subgraph.
from .high_level_command_stream_generator import calc_allowed_ofm_ifm_overlap_for_cascaded_pass
from .nn_graph import PassPlacement
+from .operation import Op
from .tensor import MemType
from .tensor import Tensor
@@ -262,7 +263,11 @@ def extract_live_ranges_from_cascaded_passes(
cps_primary_op = cps.passes[0].primary_op
- if cps_primary_op and cps_primary_op.type == "NpuOp" and MemType.Permanent_CPU not in target_mem_type_set:
+ if (
+ cps_primary_op
+ and cps_primary_op.type == Op.CustomNpuOp
+ and MemType.Permanent_CPU not in target_mem_type_set
+ ):
# If the primary-op is an NpuOp that means this is where an Npu subgraph
# is called. Go into said subgraph and extract live ranges before continuing.
# Use default allocation alignment of 16 for Npu tensors
diff --git a/ethosu/vela/mark_tensors.py b/ethosu/vela/mark_tensors.py
index c4496cdc..206d8365 100644
--- a/ethosu/vela/mark_tensors.py
+++ b/ethosu/vela/mark_tensors.py
@@ -18,10 +18,11 @@
from . import rewrite_graph
from . import weight_compressor
from .errors import OperatorError
+from .operation import CustomType
+from .operation import Op
from .tensor import MemType
from .tensor import TensorFormat
from .tensor import TensorPurpose
-from .tflite_mapping import custom_prefix
def purpose_from_list(lst):
@@ -59,72 +60,53 @@ tensor_purposes = [ # ops, input_purpose
(
set(
(
- "Relu",
- "Relu6",
- "Mul",
- "Add",
- "Sub",
- "Rsqrt",
- "Abs",
- "Cast",
- "Exp",
- "Floor",
- "FloorDiv",
- "FloorMod",
- "SquaredDifference",
- "AddN",
- "BiasAdd",
- "RealDiv",
- "Maximum",
- "Minimum",
- "Sigmoid",
- "Tanh",
- "FusedBatchNorm",
- "AvgPool",
- "MaxPool",
- "Squeeze",
- "Softmax",
- "LRN",
- "Assign",
- "BatchMatMul",
- "ZerosLike",
- "ExtractImagePatches",
- "MulAct",
- "AddAct",
- "SubAct",
- "DivAct",
- "AvgPoolAct",
- "MaxPoolAct",
- "LeakyRelu",
- "CLZ",
- "SHL",
- "SHR",
- "ReduceSum",
+ Op.Relu,
+ Op.Relu6,
+ Op.Rsqrt,
+ Op.Abs,
+ Op.Cast,
+ Op.Exp,
+ Op.Floor,
+ Op.FloorDiv,
+ Op.FloorMod,
+ Op.SquaredDifference,
+ Op.AddN,
+ Op.Maximum,
+ Op.Minimum,
+ Op.Sigmoid,
+ Op.Tanh,
+ Op.AvgPool,
+ Op.MaxPool,
+ Op.Squeeze,
+ Op.Softmax,
+ Op.LRN,
+ Op.BatchMatMul,
+ Op.ZerosLike,
+ Op.Mul,
+ Op.Add,
+ Op.Sub,
+ Op.Div,
+ Op.LeakyRelu,
+ Op.CLZ,
+ Op.SHL,
+ Op.SHR,
+ Op.ReduceSum,
)
),
all_fm,
),
(
- set(
- (
- "Conv2D",
- "DepthwiseConv2dNative",
- "MatMul",
- "Conv2DBiasAct",
- "DepthwiseConv2dBiasAct",
- "FullyConnectedAct",
- )
- ),
+ set((Op.Conv2D, Op.MatMul, Op.Conv2DBias, Op.DepthwiseConv2DBias, Op.FullyConnected,)),
purpose_from_list([TensorPurpose.FeatureMap, TensorPurpose.Weights, TensorPurpose.FeatureMap]),
),
(
- set(("Conv2DBackpropInputSwitchedBias",)),
+ set((Op.Conv2DBackpropInputSwitchedBias,)),
purpose_from_list(
[TensorPurpose.FeatureMap, TensorPurpose.Weights, TensorPurpose.FeatureMap, TensorPurpose.FeatureMap]
),
),
(
- set(("QuantizedConv2D", "QuantizedMatMul")),
+ set((Op.QuantizedConv2D, Op.QuantizedMatMul)),
purpose_from_list(
[
TensorPurpose.FeatureMap,
@@ -139,66 +121,39 @@ tensor_purposes = [ # ops, input_purpose
(
set(
(
- "Reshape",
- "Min",
- "Max",
- "Mean",
- "Pad",
- "MirrorPad",
- "ArgMax",
- "ArgMin",
- "ExpandDims",
- "ResizeNearestNeighbor",
- "ResizeBilinear",
- "Tile",
- "Transpose",
- "Mfcc",
+ Op.Reshape,
+ Op.Min,
+ Op.Max,
+ Op.Mean,
+ Op.Pad,
+ Op.MirrorPad,
+ Op.ArgMax,
+ Op.ArgMin,
+ Op.ExpandDims,
+ Op.ResizeNearestNeighbor,
+ Op.ResizeBilinear,
+ Op.Tile,
+ Op.Transpose,
)
),
purpose_from_list([TensorPurpose.FeatureMap, TensorPurpose.FeatureMap]),
),
(
- set(("QuantizedReshape", "QuantizedResizeBilinear")),
+ set((Op.QuantizedReshape,)),
purpose_from_list(
[TensorPurpose.FeatureMap, TensorPurpose.FeatureMap, TensorPurpose.FeatureMap, TensorPurpose.FeatureMap]
),
),
(
- set(("QuantizedBiasAdd", "QuantizedAdd", "QuantizedMul")),
- purpose_from_list(
- [
- TensorPurpose.FeatureMap,
- TensorPurpose.FeatureMap,
- TensorPurpose.FeatureMap,
- TensorPurpose.FeatureMap,
- TensorPurpose.FeatureMap,
- TensorPurpose.FeatureMap,
- ]
- ),
- ),
- (
- set(
- (
- "Dequantize",
- "Quantize",
- "QuantizeV2",
- "QuantizedRelu",
- "QuantizedRelu1",
- "QuantizedRelu6",
- "QuantizedAvgPool",
- "QuantizedMaxPool",
- "Slice",
- "SplitV",
- )
- ),
+ set((Op.Dequantize, Op.Quantize, Op.QuantizedAvgPool, Op.QuantizedMaxPool, Op.Slice, Op.SplitV,)),
purpose_from_list([TensorPurpose.FeatureMap, TensorPurpose.FeatureMap, TensorPurpose.FeatureMap]),
),
(
- set(("BatchToSpaceND", "SpaceToBatchND", "DepthToSpaceND", "SpaceToDepthND")),
+ set((Op.BatchToSpaceND, Op.SpaceToBatchND, Op.DepthToSpace, Op.SpaceToDepth)),
purpose_from_list([TensorPurpose.FeatureMap, TensorPurpose.FeatureMap, TensorPurpose.FeatureMap]),
),
(
- set(("BlockLSTM",)),
+ set((Op.BlockLSTM,)),
purpose_from_list(
[
TensorPurpose.FeatureMap,
@@ -213,33 +168,18 @@ tensor_purposes = [ # ops, input_purpose
]
),
),
- (set(("SplitSliceRead",)), purpose_from_list([TensorPurpose.FeatureMap, TensorPurpose.FeatureMap])),
- (set(("Shape", "ConcatSliceWrite", "AudioSpectrogram")), purpose_from_list([TensorPurpose.FeatureMap])),
+ (set((Op.SplitSliceRead,)), purpose_from_list([TensorPurpose.FeatureMap, TensorPurpose.FeatureMap])),
+ (set((Op.Shape, Op.ConcatSliceWrite)), purpose_from_list([TensorPurpose.FeatureMap])),
(
- set(("StridedSlice",)),
+ set((Op.StridedSlice,)),
purpose_from_list(
[TensorPurpose.FeatureMap, TensorPurpose.FeatureMap, TensorPurpose.FeatureMap, TensorPurpose.FeatureMap]
),
),
- (set(("Fill", "Pack", "Range")), all_parameter),
- (
- set(("Requantize",)),
- purpose_from_list(
- [
- TensorPurpose.FeatureMap,
- TensorPurpose.FeatureMap,
- TensorPurpose.FeatureMap,
- TensorPurpose.FeatureMap,
- TensorPurpose.FeatureMap,
- ]
- ),
- ),
- (set(("Placeholder", "SubgraphInput", "Const", "VariableV2")), purpose_from_list([])),
- (set(("FakeQuantWithMinMaxArgs", "FakeQuantWithMinMaxVars")), input0_from_output_rest_parameter),
- (
- set(("Square", "Sqrt", "Log", "Less", "Enter", "Exit", "Identity", "StopGradient", "Merge", "Switch")),
- inputs_from_output,
- ),
+ (set((Op.Fill, Op.Pack, Op.Range)), all_parameter),
+ (set((Op.Placeholder, Op.SubgraphInput, Op.Const,)), purpose_from_list([])),
+ (set((Op.FakeQuantWithMinMaxArgs,)), input0_from_output_rest_parameter),
+ (set((Op.Square, Op.Sqrt, Op.Log, Op.Less, Op.Identity,)), inputs_from_output,),
(None, all_fm),
]
@@ -247,8 +187,6 @@ tensor_purposes = [ # ops, input_purpose
for ops, input_purpose in tensor_purposes:
if ops is None:
continue
- for op in ops:
- assert len(op) > 1, "string literal has been decomposed"
def mark_tensor_purpose(nng, arch, verbose_tensor_purpose=False):
@@ -260,7 +198,7 @@ def mark_tensor_purpose(nng, arch, verbose_tensor_purpose=False):
tens.mem_area = arch.tensor_storage_mem_area[tens.purpose]
tens.mem_type = arch.tensor_storage_mem_type[tens.purpose]
- if len(tens.ops) == 1 and tens.ops[0].type == "Const":
+ if len(tens.ops) == 1 and tens.ops[0].type == Op.Const:
tens.mem_area = (
arch.permanent_storage_mem_area
) # special case constants, as they must be in permanent storage
@@ -288,11 +226,11 @@ def mark_tensor_purpose(nng, arch, verbose_tensor_purpose=False):
purpose = input_purpose(op, idx) if tens.purpose == TensorPurpose.Unknown else tens.purpose
mark_tensor_helper(tens, purpose)
- if op.type == "Reshape":
+ if op.type == Op.Reshape:
# Reshape's input and output point to same data
op.outputs[0].mem_area = op.inputs[0].mem_area
- if op.type.startswith(custom_prefix) and op.attrs.get("custom_type", "") == "ExistingNpuOp":
+ if op.type == Op.Custom and op.attrs.get("custom_type") == CustomType.ExistingNpuOp:
scratch_tensor = None
if len(op.inputs) >= 3:
@@ -301,7 +239,7 @@ def mark_tensor_purpose(nng, arch, verbose_tensor_purpose=False):
scratch_tensor.purpose = TensorPurpose.Scratch
if scratch_tensor is None:
- raise OperatorError(op, "Scratch tensor not found.")
+ OperatorError(op, "Scratch tensor not found.")
break
@@ -318,21 +256,6 @@ def mark_tensor_purpose(nng, arch, verbose_tensor_purpose=False):
return nng
-reshape_operations = set(
- (
- "Reshape",
- "QuantizedReshape",
- "ExpandDims",
- "Squeeze",
- "BatchToSpaceND",
- "SpaceToBatchND",
- "DepthToSpaceND",
- "SpaceToDepthND",
- "Placeholder",
- )
-)
-
-
def mark_tensor_format(nng, arch, verbose_tensor_format=False):
formats_for_tensor = {}
@@ -375,8 +298,9 @@ def mark_tensor_format(nng, arch, verbose_tensor_format=False):
if src_tens is not None:
op = tens.find_npu_op()
if op is not None:
- npu_block_type = op.attrs["npu_block_type"]
- weight_compressor.compress_weights(arch, nng, tens, npu_block_type, 16, 16, op.get_dilation_h_w())
+ weight_compressor.compress_weights(
+ arch, nng, tens, op.type.npu_block_type, 16, 16, op.get_dilation_h_w()
+ )
# Alias compressed weights back into source tensor
src_tens.copy_compressed_weight_info(tens)
diff --git a/ethosu/vela/nn_graph.py b/ethosu/vela/nn_graph.py
index 58aab611..12edf5ef 100644
--- a/ethosu/vela/nn_graph.py
+++ b/ethosu/vela/nn_graph.py
@@ -22,6 +22,8 @@
# Graph - A full neural network graph with one or more Subgraphs.
import enum
+from .operation import Op
+
class PassPlacement(enum.Enum):
Unknown = 0
@@ -176,7 +178,7 @@ class Subgraph:
visit_tensor(inp)
inp.consumer_list.append(op)
- if op.type in set(("Placeholder", "SubgraphInput")):
+ if op.type in set((Op.Placeholder, Op.SubgraphInput)):
assert len(op.outputs) == 1
self.input_tensors.append(op.outputs[0])
@@ -319,19 +321,14 @@ class Subgraph:
all_ops = self.get_all_ops()
unique_ops = []
for op in all_ops:
- if op.type in set(("Const", "Identity", "Placeholder")):
+ if op.type in set((Op.Const, Op.Identity, Op.Placeholder)):
continue
- attrs = op.attrs
- if (
- op.type == "Conv2D"
- or op.type == "DepthwiseConv2dNative"
- or op.type == "Conv2DBiasAct"
- or op.type == "DepthwiseConv2dBiasAct"
- ):
+ attrs = op.attrs.copy()
+ if op.type in (Op.Conv2D, Op.Conv2DBias, Op.DepthwiseConv2DBias):
kshape = op.inputs[1].shape
attrs["kshape"] = [kshape[0], kshape[1]]
- attrs["type"] = op.type
+ attrs["type"] = op.type.name
attrs.pop("use_cudnn_on_gpu", None)
if attrs not in unique_ops:
unique_ops.append(attrs)
diff --git a/ethosu/vela/npu_performance.py b/ethosu/vela/npu_performance.py
index e09fc9e1..fc148f38 100644
--- a/ethosu/vela/npu_performance.py
+++ b/ethosu/vela/npu_performance.py
@@ -41,10 +41,7 @@ def rolling_buffer_dims_from_passes(arch, ps1, block_config_ps1, ps2, block_conf
if ps2.npu_block_type in set((NpuBlockType.ConvolutionMxN, NpuBlockType.VectorProduct)):
op = ps2.primary_op
- ifm_idx, _, _, _, _ = op.get_ifm_ifm2_weight_bias_ofm_indices()
- ifm_block_depth = arch.calc_ifm_block_depth(
- op.inputs[ifm_idx].shape[-1], op.inputs[ifm_idx].dtype.size_in_bits()
- )
+ ifm_block_depth = arch.calc_ifm_block_depth(op.ifm.shape[-1], op.ifm.dtype.size_in_bits())
else:
ifm_block_depth = block_config_ps2[-1]
@@ -237,8 +234,8 @@ def performance_metrics_for_pass(arch, ps, block_config=None, rewrite_list=[], f
elif primary_op:
skirt = primary_op.attrs.get("skirt", skirt)
explicit_padding = primary_op.attrs.get("explicit_padding", explicit_padding)
- assert primary_op.attrs["npu_block_type"] == ps.npu_block_type
- npu_block_type = primary_op.attrs["npu_block_type"]
+ assert primary_op.type.npu_block_type == ps.npu_block_type
+ npu_block_type = primary_op.type.npu_block_type
ifm_tensor, _, weight_tensor, ofm_tensor = ps.get_primary_op_ifm_ifm2_weights_ofm()
diff --git a/ethosu/vela/npu_serialisation.py b/ethosu/vela/npu_serialisation.py
index 430db585..7989fa90 100644
--- a/ethosu/vela/npu_serialisation.py
+++ b/ethosu/vela/npu_serialisation.py
@@ -22,6 +22,7 @@ import numpy as np
from . import driver_actions
from .data_type import DataType
from .nn_graph import PassPlacement
+from .operation import Op
from .operation import Operation
from .tensor import MemArea
from .tensor import MemType
@@ -125,7 +126,7 @@ def serialise_npu_subgraph_into_tensors(nng, sg, arch, scratch_tens, scratch_fas
# For DMA ops, ps.weight_tensor is referring to the SRAM weight tensor and therefore the address
# is pointing at the destination address of where the weights should be placed in SRAM.
# This ensures that the Flash weight tensor is used instead and thus gets the correct address.
- if ps.weight_tensor.ops[0].type == "DMA":
+ if ps.weight_tensor.ops[0].type == Op.DMA:
copy_compressed_values_to_memory_tensor(sg.flash_tensor, ps.weight_tensor.ops[0].inputs[0])
else:
copy_compressed_values_to_memory_tensor(sg.flash_tensor, ps.weight_tensor)
@@ -150,7 +151,7 @@ def serialise_npu_subgraph_into_tensors(nng, sg, arch, scratch_tens, scratch_fas
def add_const_tens_to_startup_cascaded_pass(startup_cps, tens):
- op = Operation("Const", tens.name + "_const")
+ op = Operation(Op.Const, tens.name + "_const")
op.set_output_tensor(tens)
startup_cps.passes[0].ops.insert(0, op)
startup_cps.passes[0].outputs.insert(0, tens)
@@ -166,9 +167,8 @@ def rewrite_npu_call_ops(nng, sg, arch):
for idx, cps in enumerate(sg.cascaded_passes):
for ps in cps.passes:
for op in ps.ops:
- if op.type == "NpuOp":
+ if op.type == Op.CustomNpuOp:
callee = op.attrs["subgraph"]
- op.attrs["custom_type"] = op.type
sz = 0
for tens in [
diff --git a/ethosu/vela/operation.py b/ethosu/vela/operation.py
index 14818870..a2b67dfb 100644
--- a/ethosu/vela/operation.py
+++ b/ethosu/vela/operation.py
@@ -15,10 +15,11 @@
# limitations under the License.
# Description:
# Internal representation of a Neural Network Operation.
-import enum
+from collections import namedtuple
+from enum import Enum
-class NpuBlockType(enum.Enum):
+class NpuBlockType(Enum):
Default = 0
ConvolutionMxN = 1
VectorProduct = 2
@@ -28,10 +29,266 @@ class NpuBlockType(enum.Enum):
ReduceSum = 6
+# Classifies operators of type Custom
+class CustomType(Enum):
+ ThirdPartyOp = 0 # Third party custom op
+ NpuOp = 1 # NPU op
+ ExistingNpuOp = 2 # NPU op that was part of the input network
+
+
+TensorIndices = namedtuple("TensorIndices", ["ifms", "weights", "biases"])
+
+NO_INDICES = TensorIndices([], [], [])
+IFM_INDICES = TensorIndices([0], [], [])
+IFM_WEIGHTS_INDICES = TensorIndices([0], [1], [])
+IFM_WEIGHTS_BIAS_INDICES = TensorIndices([0], [1], [2])
+IFM_IFM2_INDICES = TensorIndices([0, 1], [], [])
+CONV2D_BACKPROP_INDICES = TensorIndices([2], [1], [3])
+TRANSPOSE_CONV_INDICES = TensorIndices([0], [1], [3])
+CONCAT_INDICES = TensorIndices([1, 2], [], [])
+SPLIT_IFM_INDICES = TensorIndices([1], [], [])
+BLOCK_LSTM_INDICES = TensorIndices([3], [4], [])
+
+
+# Static information related to operation codes
+class OperatorInfo:
+ __slots__ = ("id", "block_type", "indices", "is_unary")
+ _id = 0
+
+ def __init__(self, block_type=NpuBlockType.Default, indices=NO_INDICES, is_unary=False):
+ OperatorInfo._id += 1
+ self.id = OperatorInfo._id
+ self.block_type = block_type
+ self.indices = indices # Indices of the different tensor purposes
+ self.is_unary = is_unary # Classifies elementwise operators
+
+
+# Internally used operation codes
+class Op(Enum):
+ Abs = OperatorInfo(block_type=NpuBlockType.ElementWise, indices=IFM_INDICES, is_unary=True)
+ Add = OperatorInfo(block_type=NpuBlockType.ElementWise, indices=IFM_IFM2_INDICES)
+ AddN = OperatorInfo()
+ Any = OperatorInfo()
+ ArgMax = OperatorInfo()
+ ArgMin = OperatorInfo()
+ AvgPool = OperatorInfo(block_type=NpuBlockType.Pooling, indices=IFM_INDICES)
+ BatchMatMul = OperatorInfo()
+ BatchToSpaceND = OperatorInfo()
+ BidirectionalSequenceLstm = OperatorInfo(block_type=NpuBlockType.VectorProduct, indices=IFM_WEIGHTS_INDICES)
+ BidirectionalSequenceRnn = OperatorInfo(block_type=NpuBlockType.VectorProduct, indices=IFM_WEIGHTS_INDICES)
+ BlockLSTM = OperatorInfo(block_type=NpuBlockType.VectorProduct, indices=BLOCK_LSTM_INDICES)
+
+ CLZ = OperatorInfo(
+ block_type=NpuBlockType.ElementWise, indices=IFM_INDICES, is_unary=True
+ ) # NPU specific operation
+ Call = OperatorInfo()
+ Cast = OperatorInfo()
+ Ceil = OperatorInfo()
+ Concat = OperatorInfo(indices=CONCAT_INDICES)
+ ConcatEmbeddings = OperatorInfo()
+ ConcatSliceWrite = OperatorInfo(indices=IFM_INDICES)
+ ConcatTFLite = OperatorInfo()
+ Const = OperatorInfo() # Constant tensor, only used in CPU subgraphs
+ Conv2D = OperatorInfo(block_type=NpuBlockType.ConvolutionMxN, indices=IFM_WEIGHTS_INDICES)
+ Conv2DBackpropInput = OperatorInfo(block_type=NpuBlockType.ConvolutionMxN, indices=CONV2D_BACKPROP_INDICES)
+ Conv2DBackpropInputSwitchedBias = OperatorInfo(
+ block_type=NpuBlockType.ConvolutionMxN, indices=TRANSPOSE_CONV_INDICES
+ )
+ Conv2DBias = OperatorInfo(block_type=NpuBlockType.ConvolutionMxN, indices=IFM_WEIGHTS_BIAS_INDICES)
+ Cos = OperatorInfo()
+ Custom = OperatorInfo() # Custom 3rd party operator, only used in CPU subgraphs
+ CustomNpuOp = OperatorInfo() # NPU custom operator, only used in CPU subgraphs
+ DMA = OperatorInfo()
+ Delegate = OperatorInfo()
+ Densify = OperatorInfo()
+ DepthToSpace = OperatorInfo()
+ DepthwiseConv2DBias = OperatorInfo(block_type=NpuBlockType.ConvolutionDepthWise, indices=IFM_WEIGHTS_BIAS_INDICES)
+ Dequantize = OperatorInfo()
+ Div = OperatorInfo()
+ Elu = OperatorInfo()
+ EmbeddingLookup = OperatorInfo()
+ EmbeddingLookupSparse = OperatorInfo()
+ Equal = OperatorInfo()
+ Exp = OperatorInfo()
+ ExpandDims = OperatorInfo(indices=IFM_INDICES)
+ FakeQuantWithMinMaxArgs = OperatorInfo()
+ Fill = OperatorInfo()
+ Floor = OperatorInfo()
+ FloorDiv = OperatorInfo()
+ FloorMod = OperatorInfo()
+ FullyConnected = OperatorInfo(block_type=NpuBlockType.VectorProduct, indices=IFM_WEIGHTS_BIAS_INDICES)
+ GatherNd = OperatorInfo()
+ GatherV2 = OperatorInfo()
+ Greater = OperatorInfo()
+ GreaterEqual = OperatorInfo()
+ HardSwish = OperatorInfo()
+ HashtableLookup = OperatorInfo()
+ Identity = OperatorInfo()
+ If = OperatorInfo()
+ L2Norm = OperatorInfo()
+ L2Pool2D = OperatorInfo()
+ LRN = OperatorInfo()
+ LSHProjection = OperatorInfo()
+ LeakyRelu = OperatorInfo(block_type=NpuBlockType.ElementWise, indices=IFM_INDICES, is_unary=True)
+ Less = OperatorInfo()
+ LessEqual = OperatorInfo()
+ Log = OperatorInfo()
+ LogSoftmax = OperatorInfo()
+ LogicalAnd = OperatorInfo()
+ LogicalNot = OperatorInfo()
+ LogicalOr = OperatorInfo()
+ Lstm = OperatorInfo(block_type=NpuBlockType.VectorProduct, indices=IFM_WEIGHTS_INDICES)
+ LUT = OperatorInfo() # NPU specific, operator has LUT, only used in fused activation functions
+ MatMul = OperatorInfo(block_type=NpuBlockType.VectorProduct, indices=IFM_WEIGHTS_INDICES)
+ MatrixDiag = OperatorInfo()
+ MatrixSetDiag = OperatorInfo()
+ Max = OperatorInfo()
+ MaxPool = OperatorInfo(block_type=NpuBlockType.Pooling, indices=IFM_INDICES)
+ Maximum = OperatorInfo(block_type=NpuBlockType.ElementWise, indices=IFM_IFM2_INDICES)
+ Mean = OperatorInfo()
+ Min = OperatorInfo()
+ Minimum = OperatorInfo(block_type=NpuBlockType.ElementWise, indices=IFM_IFM2_INDICES)
+ MirrorPad = OperatorInfo()
+ Mul = OperatorInfo(block_type=NpuBlockType.ElementWise, indices=IFM_IFM2_INDICES)
+ Neg = OperatorInfo()
+ NonMaxSuppressionV4 = OperatorInfo()
+ NonMaxSuppressionV5 = OperatorInfo()
+ NotEqual = OperatorInfo()
+ OneHot = OperatorInfo()
+ Pack = OperatorInfo()
+ PackReshaped = OperatorInfo(indices=IFM_INDICES)
+ Pad = OperatorInfo()
+ PadV2 = OperatorInfo()
+ Placeholder = OperatorInfo() # Only used in CPU subgraphs
+ Pow = OperatorInfo()
+ Prelu = OperatorInfo()
+ Prod = OperatorInfo()
+ Quantize = OperatorInfo()
+ QuantizedAvgPool = OperatorInfo(block_type=NpuBlockType.Pooling, indices=IFM_INDICES)
+ QuantizedConv2D = OperatorInfo(block_type=NpuBlockType.ConvolutionMxN, indices=IFM_WEIGHTS_INDICES)
+ QuantizedMatMul = OperatorInfo(block_type=NpuBlockType.VectorProduct, indices=IFM_WEIGHTS_INDICES)
+ QuantizedMaxPool = OperatorInfo(block_type=NpuBlockType.Pooling, indices=IFM_INDICES)
+ QuantizedReshape = OperatorInfo(indices=IFM_INDICES)
+ Range = OperatorInfo()
+ Rank = OperatorInfo()
+ ReduceSum = OperatorInfo(block_type=NpuBlockType.ReduceSum, indices=IFM_INDICES)
+ Relu = OperatorInfo(indices=IFM_INDICES)
+ Relu6 = OperatorInfo(indices=IFM_INDICES)
+ ReluN1To1 = OperatorInfo(indices=IFM_INDICES)
+ Reshape = OperatorInfo(indices=IFM_INDICES)
+ ResizeBilinear = OperatorInfo(block_type=NpuBlockType.Pooling, indices=IFM_INDICES)
+ ResizeNearestNeighbor = OperatorInfo()
+ ReverseSequence = OperatorInfo()
+ ReverseV2 = OperatorInfo()
+ Rnn = OperatorInfo(block_type=NpuBlockType.VectorProduct, indices=IFM_WEIGHTS_INDICES)
+ Round = OperatorInfo()
+ Rsqrt = OperatorInfo()
+ SHL = OperatorInfo(block_type=NpuBlockType.ElementWise, indices=IFM_IFM2_INDICES) # NPU specific operation
+ SHR = OperatorInfo(block_type=NpuBlockType.ElementWise, indices=IFM_IFM2_INDICES) # NPU specific operation
+ ScatterNd = OperatorInfo()
+ SegmentSum = OperatorInfo()
+ Select = OperatorInfo()
+ SelectV2 = OperatorInfo()
+ Shape = OperatorInfo()
+ Sigmoid = OperatorInfo(indices=IFM_INDICES)
+ SignBit = OperatorInfo()
+ Sin = OperatorInfo()
+ SkipGram = OperatorInfo()
+ Slice = OperatorInfo(indices=IFM_INDICES)
+ Softmax = OperatorInfo()
+ SpaceToBatchND = OperatorInfo()
+ SpaceToDepth = OperatorInfo()
+ SparseToDense = OperatorInfo()
+ Split = OperatorInfo(indices=SPLIT_IFM_INDICES)
+ SplitSliceRead = OperatorInfo(indices=IFM_INDICES)
+ SplitV = OperatorInfo(indices=IFM_INDICES)
+ Sqrt = OperatorInfo()
+ Square = OperatorInfo()
+ SquaredDifference = OperatorInfo()
+ Squeeze = OperatorInfo(indices=IFM_INDICES)
+ StridedSlice = OperatorInfo(indices=IFM_INDICES)
+ StridedSliceOptions = OperatorInfo()
+ Sub = OperatorInfo(block_type=NpuBlockType.ElementWise, indices=IFM_IFM2_INDICES)
+ SubgraphInput = OperatorInfo() # Only used in CPU subgraphs
+ Sum = OperatorInfo()
+ Svdf = OperatorInfo()
+ Tanh = OperatorInfo(indices=IFM_INDICES)
+ Tile = OperatorInfo()
+ TopKV2 = OperatorInfo()
+ Transpose = OperatorInfo()
+ UnidirectionalSequenceLstm = OperatorInfo(block_type=NpuBlockType.VectorProduct, indices=IFM_WEIGHTS_INDICES)
+ UnidirectionalSequenceRnn = OperatorInfo(block_type=NpuBlockType.VectorProduct, indices=IFM_WEIGHTS_INDICES)
+ Unique = OperatorInfo()
+ Unpack = OperatorInfo()
+ UnpackReshaped = OperatorInfo(indices=IFM_INDICES)
+ Where = OperatorInfo()
+ While = OperatorInfo()
+ ZerosLike = OperatorInfo()
+
+ @property
+ def info(self):
+ return self.value
+
+ @property
+ def npu_block_type(self):
+ return self.info.block_type
+
+ def is_conv2d_op(self):
+ return self.info.block_type == NpuBlockType.ConvolutionMxN
+
+ def is_depthwise_conv2d_op(self):
+ return self.info.block_type == NpuBlockType.ConvolutionDepthWise
+
+ def is_pool_op(self):
+ return self.info.block_type == NpuBlockType.Pooling
+
+ def is_maxpool_op(self):
+ return self in (Op.MaxPool, Op.QuantizedMaxPool)
+
+ def is_avgpool_op(self):
+ return self in (Op.QuantizedAvgPool, Op.AvgPool)
+
+ def is_elementwise_op(self):
+ return self.info.block_type == NpuBlockType.ElementWise
+
+ def is_unary_elementwise_op(self):
+ return self.info.block_type == NpuBlockType.ElementWise and self.info.is_unary
+
+ def is_binary_elementwise_op(self):
+ return self.info.block_type == NpuBlockType.ElementWise and not self.info.is_unary
+
+ def is_relu_op(self):
+ return self in (Op.Relu, Op.Relu6, Op.ReluN1To1)
+
+ def is_activation_op(self):
+ return self.is_relu_op() or self in (Op.Tanh, Op.Sigmoid, Op.Softmax, Op.LUT)
+
+ def is_split_op(self):
+ return self in (Op.Split, Op.SplitV, Op.StridedSlice, Op.Slice, Op.UnpackReshaped)
+
+ def is_concat_op(self):
+ return self in (Op.Concat, Op.ConcatTFLite, Op.PackReshaped)
+
+ def needs_bias(self):
+ return bool(self.info.indices.biases)
+
+ @classmethod
+ def op_set(cls, predicate):
+ # Returns the set of all operator codes that fulfill the given predicate
+ return {op_type for op_type in Op if predicate(op_type)}
+
+ def __str__(self):
+ return self.name
+
+ __repr__ = __str__
+
+ def __lt__(self, other):
+ return self.value.id < other.value.id
+
+
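The helper methods above turn operator classification into data lookups on each member's OperatorInfo. As a rough usage sketch (not part of this patch), assuming Op and NpuBlockType are imported from ethosu.vela.operation:

    from ethosu.vela.operation import NpuBlockType
    from ethosu.vela.operation import Op

    # Each member carries an OperatorInfo with its NPU block type and tensor indices
    assert Op.MaxPool.npu_block_type == NpuBlockType.Pooling
    assert Op.MaxPool.is_pool_op() and Op.MaxPool.is_maxpool_op()

    # op_set() derives operator sets from predicates instead of hand-written string tuples
    pooling_ops = Op.op_set(Op.is_pool_op)
    assert Op.QuantizedAvgPool in pooling_ops
    assert Op.Reshape not in pooling_ops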
def create_avgpool_nop(name):
- op = Operation("AvgPool", name)
+ op = Operation(Op.AvgPool, name)
op.attrs["padding"] = b"VALID"
- op.attrs["npu_block_type"] = NpuBlockType.Pooling
op.attrs["stride_w"] = 1
op.attrs["stride_h"] = 1
op.attrs["filter_width"] = 1
@@ -70,6 +327,9 @@ input and output tensors, as well as an attribute dictionary."""
"flops",
"scheduled_pass",
"run_on_npu",
+ "activation",
+ "memory_function",
+ "forced_output_quantization",
"activation_lut",
)
@@ -81,6 +341,13 @@ input and output tensors, as well as an attribute dictionary."""
self.outputs = []
self.flops = 0
self.run_on_npu = True
+ # Fused activation function. If not None: operator code
+ self.activation = None
+ # Fused memory function. If not None: operator code
+ self.memory_function = None
+ # If not None: contains QuantizationParameters to be used as output quantization
+ # (which overrides the ofm tensor's quantization), used in LUT
+ self.forced_output_quantization = None
self.scheduled_pass = None
self.op_index = None # input network operator index
self.activation_lut = None
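The three new attributes replace the old attrs entries fused_activation_function, fused_memory_function and forced_output_quantization. A minimal sketch of how a fused activation is now recorded (illustrative, not from this patch; create_avgpool_nop is the helper defined above):

    from ethosu.vela.operation import create_avgpool_nop, Op

    op = create_avgpool_nop("example_nop")
    # Previously: op.attrs["fused_activation_function"] = "Relu"
    op.activation = Op.Relu
    assert op.activation.is_relu_op()
    assert op.memory_function is None and op.forced_output_quantization is None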
@@ -92,173 +359,95 @@ input and output tensors, as well as an attribute dictionary."""
res.inputs = list(self.inputs)
res.outputs = list(self.outputs)
res.flops = self.flops
+ res.run_on_npu = self.run_on_npu
+ res.activation = self.activation
+ res.memory_function = self.memory_function
+ res.forced_output_quantization = self.forced_output_quantization
res.scheduled_pass = self.scheduled_pass
res.op_index = None # not relevant as not part of input network
return res
def __str__(self):
- return "<nng.Operation '%s' type=%s>" % (self.name, self.type)
+ return "<nng.Operation '{}' type={}>".format(self.name, self.type)
__repr__ = __str__
- def get_ifm_ifm2_weight_bias_ofm_indices(self):
- ifm_idx = -1
- ifm2_idx = -1
- weight_idx = -1
- bias_idx = -1
- ofm_idx = -1
- npu_block_type = self.attrs.get("npu_block_type", NpuBlockType.Default)
- if npu_block_type in (NpuBlockType.ConvolutionMxN, NpuBlockType.ConvolutionDepthWise):
- ifm_idx = 0
- weight_idx = 1
- ofm_idx = 0
-
- if self.type in ("Conv2DBiasAct", "DepthwiseConv2dBiasAct", "TransposeConvAct"):
- if len(self.inputs) >= 3:
- bias_idx = 2
-
- elif self.type == "Conv2DBackpropInputSwitchedBias":
- bias_idx = 3
-
- elif npu_block_type in (NpuBlockType.Pooling, NpuBlockType.ReduceSum):
- ifm_idx = 0
- ofm_idx = 0
- elif npu_block_type == NpuBlockType.VectorProduct:
- ifm_idx = 0
- weight_idx = 1
- ofm_idx = 0
-
- if self.type == "FullyConnectedAct":
- if len(self.inputs) >= 3:
- bias_idx = 2
-
- if self.type == "BlockLSTM":
- ifm_idx = 3
- weight_idx = 4
- ofm_idx = 6
-
- elif npu_block_type == NpuBlockType.ElementWise:
- ifm_idx = 0
- ifm2_idx = 1
- ofm_idx = 0
-
- # LeakyRelu, Abs and CLZ have a single IFM
- if self.type in ("LeakyRelu", "Abs", "CLZ"):
- ifm2_idx = -1
-
- elif self.type == "Conv2DBackpropInput":
- ifm_idx = 2
- weight_idx = 1
- ofm_idx = 0
-
- elif self.type in ("Squeeze", "Reshape", "QuantizedReshape", "ExpandDims", "Sigmoid", "Tanh"):
- ifm_idx = 0
- ofm_idx = 0
-
- elif self.is_split_op():
- ifm_idx = 0
- ofm_idx = 0
- if self.type == "Split":
- ifm_idx = 1
-
- elif self.is_concat_op():
- ifms, _ = self.get_concat_inputs_axis()
- ifm_idx = self.inputs.index(ifms[0])
- if len(ifms) > 1:
- ifm2_idx = self.inputs.index(ifms[1])
- ofm_idx = 0
-
- return ifm_idx, ifm2_idx, weight_idx, bias_idx, ofm_idx
-
def get_ifm_ifm2_weights_ofm(self):
- ifm_tensor = None
- ifm2_tensor = None
- weight_tensor = None
- ofm_tensor = None
-
- ifm_idx, ifm2_idx, weight_idx, _, ofm_idx = self.get_ifm_ifm2_weight_bias_ofm_indices()
- if ifm_idx != -1:
- ifm_tensor = self.inputs[ifm_idx]
- if ifm2_idx != -1:
- ifm2_tensor = self.inputs[ifm2_idx]
- if weight_idx != -1:
- weight_tensor = self.inputs[weight_idx]
- if ofm_idx != -1:
- ofm_tensor = self.outputs[ofm_idx]
-
- return ifm_tensor, ifm2_tensor, weight_tensor, ofm_tensor
+ return self.ifm, self.ifm2, self.weights, self.ofm
def get_ifm_weights_biases_ofm(self):
- ifm_tensor = None
- weight_tensor = None
- bias_tensor = None
- ofm_tensor = None
-
- ifm_idx, _, weight_idx, bias_idx, ofm_idx = self.get_ifm_ifm2_weight_bias_ofm_indices()
- if ifm_idx != -1:
- ifm_tensor = self.inputs[ifm_idx]
- if weight_idx != -1:
- weight_tensor = self.inputs[weight_idx]
- if bias_idx != -1:
- bias_tensor = self.inputs[bias_idx]
- if ofm_idx != -1:
- ofm_tensor = self.outputs[ofm_idx]
-
- return ifm_tensor, weight_tensor, bias_tensor, ofm_tensor
+ return self.ifm, self.weights, self.bias, self.ofm
def get_ifm_ifm2_weights_biases_ofm(self):
- ifm_tensor = None
- ifm2_tensor = None
- weight_tensor = None
- bias_tensor = None
- ofm_tensor = None
-
- ifm_idx, ifm2_idx, weight_idx, bias_idx, ofm_idx = self.get_ifm_ifm2_weight_bias_ofm_indices()
- if ifm_idx != -1:
- ifm_tensor = self.inputs[ifm_idx]
- if ifm2_idx != -1:
- ifm2_tensor = self.inputs[ifm2_idx]
- if weight_idx != -1:
- weight_tensor = self.inputs[weight_idx]
- if bias_idx != -1:
- bias_tensor = self.inputs[bias_idx]
- if ofm_idx != -1:
- ofm_tensor = self.outputs[ofm_idx]
-
- return ifm_tensor, ifm2_tensor, weight_tensor, bias_tensor, ofm_tensor
-
- def get_ofm(self):
- _, _, _, ofm = self.get_ifm_ifm2_weights_ofm()
- return ofm
-
- def is_concat_op(self):
- return self.type in ("Concat", "ConcatV2", "QuantizedConcat", "ConcatTFLite", "PackReshaped")
+ return self.ifm, self.ifm2, self.weights, self.bias, self.ofm
+
+ def get_ifm_ofm(self):
+ return self.ifm, self.ofm
+
+ @property
+ def ifm(self):
+ # Gets the IFM tensor, or None if not applicable
+ return self.get_input(self.type.info.indices.ifms, 0)
+
+ @property
+ def ifm2(self):
+ # Gets the IFM2 tensor, or None if not applicable
+ return self.get_input(self.type.info.indices.ifms, 1)
+
+ @property
+ def bias(self):
+ # Gets the bias tensor, or None if not applicable
+ return self.get_input(self.type.info.indices.biases, 0)
+
+ @property
+ def weights(self):
+ # Gets the weight tensor, or None if not applicable
+ return self.get_input(self.type.info.indices.weights, 0)
+
+ def get_ifm_tensors(self):
+ # Gets the IFM tensors, or empty list if not applicable
+ return self._index_list_to_tensors(self.type.info.indices.ifms)
+
+ def get_weight_tensors(self):
+ # Gets the weight tensors, or empty list if not applicable
+ return self._index_list_to_tensors(self.type.info.indices.weights)
+
+ def get_bias_tensors(self):
+ # Gets the bias tensors, or empty list if not applicable
+ return self._index_list_to_tensors(self.type.info.indices.biases)
+
+ def _index_list_to_tensors(self, index_list):
+ return [self.inputs[ix] for ix in index_list if ix < len(self.inputs)]
+
+ def get_input(self, index_list, ix):
+ if ix >= len(index_list):
+ return None
+ if index_list[ix] >= len(self.inputs):
+ return None
+ return self.inputs[index_list[ix]]
+
+ @property
+ def ofm(self):
+ # Gets the OFM tensor, or None if not applicable
+ return self.outputs[0] if self.outputs else None
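These accessors resolve tensors through the operator's TensorIndices instead of the old per-block-type index calculation. A small sketch of the behaviour (illustrative only; the tensors are placeholders created just for the example):

    from ethosu.vela.data_type import DataType
    from ethosu.vela.operation import Op, Operation
    from ethosu.vela.tensor import Tensor

    mul = Operation(Op.Mul, "example_mul")
    mul.add_input_tensor(Tensor([1, 1, 1, 8], DataType.uint8, "ifm"))
    mul.add_input_tensor(Tensor([1, 1, 1, 8], DataType.uint8, "ifm2"))
    mul.set_output_tensor(Tensor([1, 1, 1, 8], DataType.uint8, "ofm"))

    # Op.Mul uses IFM_IFM2_INDICES, so ifm/ifm2/ofm resolve while weights/bias do not
    assert mul.ifm.name == "ifm" and mul.ifm2.name == "ifm2" and mul.ofm.name == "ofm"
    assert mul.weights is None and mul.bias is None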
def get_concat_inputs_axis(self):
- assert self.is_concat_op()
+ assert self.type.is_concat_op()
- if self.type == "ConcatV2":
- axis_tensor = self.inputs[-1]
- inputs = self.inputs[:-1]
- elif self.type == "Concat":
- axis_tensor = self.inputs[0]
- inputs = self.inputs[1:]
- elif self.type == "QuantizedConcat":
+ if self.type == Op.Concat:
axis_tensor = self.inputs[0]
inputs = self.inputs[1:]
- inputs = inputs[: len(inputs) // 3] # Skip min/max
-
- if self.type == "ConcatTFLite":
+ elif self.type == Op.ConcatTFLite:
inputs = self.inputs
axis = self.attrs["axis"]
- elif self.type == "PackReshaped":
+ elif self.type == Op.PackReshaped:
# Requires fixup_pack_input to be called before this point
inputs = self.inputs
axis = self.attrs["axis"]
assert len(self.inputs) == self.attrs["values_count"]
else:
- assert len(axis_tensor.ops) == 1 and axis_tensor.ops[0].type == "Const"
+ assert len(axis_tensor.ops) == 1 and axis_tensor.ops[0].type == Op.Const
axis = int(axis_tensor.values)
return inputs, axis
@@ -267,33 +456,30 @@ input and output tensors, as well as an attribute dictionary."""
_, dilation_h, dilation_w, _ = self.attrs.get("dilation", (1, 1, 1, 1))
return dilation_h, dilation_w
- def is_split_op(self):
- return self.type in ("Split", "SplitV", "StridedSlice", "Slice", "UnpackReshaped")
-
def get_split_inputs_axis(self):
- assert self.is_split_op()
+ assert self.type.is_split_op()
offset_start = None
offset_end = None
axis = None
- if self.type == "Split":
+ if self.type == Op.Split:
num_splits = self.attrs.get("num_splits")
axis_tens = self.inputs[0]
- assert len(axis_tens.ops) == 1 and axis_tens.ops[0].type == "Const"
+ assert len(axis_tens.ops) == 1 and axis_tens.ops[0].type == Op.Const
axis = int(axis_tens.values)
input_tens = self.inputs[1]
outputs = self.outputs
assert num_splits == len(outputs)
- elif self.type == "SplitV":
+ elif self.type == Op.SplitV:
num_splits = self.attrs.get("num_splits")
input_tens = self.inputs[0]
size_tens = self.inputs[1]
- assert len(size_tens.ops) == 1 and size_tens.ops[0].type == "Const"
+ assert len(size_tens.ops) == 1 and size_tens.ops[0].type == Op.Const
sizes = size_tens.values
axis_tens = self.inputs[2]
- assert len(axis_tens.ops) == 1 and axis_tens.ops[0].type == "Const"
+ assert len(axis_tens.ops) == 1 and axis_tens.ops[0].type == Op.Const
axis = int(axis_tens.values)
for idx, size in enumerate(sizes):
@@ -306,7 +492,7 @@ input and output tensors, as well as an attribute dictionary."""
assert num_splits == len(outputs)
assert sum(sizes) == input_tens.shape[axis]
- elif self.type == "Slice":
+ elif self.type == Op.Slice:
input_tens, begin_tens, size_tens = self.inputs
outputs = self.outputs
offset_start = [0] * len(input_tens.shape)
@@ -318,7 +504,7 @@ input and output tensors, as well as an attribute dictionary."""
offset_start[idx] = begin_tens.values[idx]
offset_end[idx] = size_tens.values[idx] + offset_start[idx]
- elif self.type == "StridedSlice":
+ elif self.type == Op.StridedSlice:
input_tens, begin_tens, end_tens, strides_tens = self.inputs
outputs = self.outputs
out_tens = outputs[0]
@@ -336,7 +522,7 @@ input and output tensors, as well as an attribute dictionary."""
assert len(input_tens.shape) == len(out_tens.shape)
offset_start = get_slice_offsets(input_tens.shape, begin_tens, begin_mask, is_begin=True)
offset_end = get_slice_offsets(input_tens.shape, end_tens, end_mask, is_begin=False)
- elif self.type == "UnpackReshaped":
+ elif self.type == Op.UnpackReshaped:
# Requires fixup_unpack_output to be called before this point
input_tens = self.inputs[0]
outputs = self.outputs
@@ -350,7 +536,7 @@ input and output tensors, as well as an attribute dictionary."""
return input_tens, outputs, axis, offset_start, offset_end
def set_activation_lut(self, lut_tensor):
- self.attrs["fused_activation_function"] = "LUT"
+ self.activation = Op.LUT
self.activation_lut = lut_tensor
self.add_input_tensor(lut_tensor)
@@ -372,13 +558,7 @@ input and output tensors, as well as an attribute dictionary."""
tens.ops = [self]
self.outputs = [tens]
- def needs_bias(self):
- return self.type in (
- "Conv2DBiasAct",
- "DepthwiseConv2dBiasAct",
- "Conv2DBackpropInputSwitchedBias",
- "FullyConnectedAct",
- )
-
def get_output_quantization(self):
- return self.attrs.get("forced_output_quantization", self.get_ofm().quantization)
+ if self.forced_output_quantization is not None:
+ return self.forced_output_quantization
+ return self.ofm.quantization
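get_output_quantization therefore prefers an explicitly forced quantization (as set up for fused LUT operations) over the OFM tensor's own quantization. A hypothetical sketch, assuming QuantizationParameters is importable from ethosu.vela.tensor:

    from ethosu.vela.operation import create_avgpool_nop
    from ethosu.vela.tensor import QuantizationParameters  # assumed import path

    op = create_avgpool_nop("example_lut_op")
    lut_quant = QuantizationParameters()   # placeholder quantization for the example
    op.forced_output_quantization = lut_quant
    assert op.get_output_quantization() is lut_quant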
diff --git a/ethosu/vela/pass_packing.py b/ethosu/vela/pass_packing.py
index f49f9813..35e1b143 100644
--- a/ethosu/vela/pass_packing.py
+++ b/ethosu/vela/pass_packing.py
@@ -22,6 +22,7 @@ from .nn_graph import Pass
from .nn_graph import PassPlacement
from .operation import create_avgpool_nop
from .operation import NpuBlockType
+from .operation import Op
from .tensor import TensorPurpose
@@ -40,81 +41,57 @@ class PassFlags(enum.Flag):
PostFusingLimited = 8192
-npu_pre_ops = set(("QuantizedResizeBilinear", "SplitSliceRead",))
+npu_pre_ops = set((Op.SplitSliceRead,))
mac_main_ops = set(
(
# convolutions
- "Conv2DBiasAct",
- "Conv2D",
- "QuantizedConv2D",
- "Conv2DBackpropInputSwitchedBias",
+ Op.Conv2DBias,
+ Op.Conv2D,
+ Op.QuantizedConv2D,
+ Op.Conv2DBackpropInputSwitchedBias,
# depth-wise convolutions
- "DepthwiseConv2dBiasAct",
- "DepthwiseConv2dNative",
- "QuantizedDepthwiseConv2D",
+ Op.DepthwiseConv2DBias,
# FC layers
- "QuantizedMatMul",
- "MatMul",
- "FullyConnectedAct",
+ Op.QuantizedMatMul,
+ Op.MatMul,
+ Op.FullyConnected,
# RNN/LSTM/GRU
- "BlockLSTM",
+ Op.BlockLSTM,
# pooling
- "QuantizedMaxPool",
- "QuantizedAvgPool",
- "AvgPool",
- "MaxPool",
- "AvgPoolAct",
- "MaxPoolAct",
- "ReduceSum",
+ Op.QuantizedMaxPool,
+ Op.QuantizedAvgPool,
+ Op.AvgPool,
+ Op.MaxPool,
+ Op.ReduceSum,
# deconvolution
- "ResizeBilinear",
+ Op.ResizeBilinear,
)
)
-binary_elem_wise_main_ops = set(
- (
- # binary element-wise
- "AddAct",
- "MulAct",
- "SubAct",
- "QuantizedAdd",
- "QuantizedSub",
- "QuantizedMul",
- "Mul",
- "Add",
- "Sub",
- "Minimum",
- "Maximum",
- "SHL",
- "SHR",
- )
-)
+binary_elem_wise_main_ops = Op.op_set(Op.is_binary_elementwise_op)
-unary_elem_wise_main_ops = set(("LeakyRelu", "Abs", "CLZ",)) # Unary element-wise operations
+unary_elem_wise_main_ops = Op.op_set(Op.is_unary_elementwise_op) # Unary element-wise operations
elem_wise_main_ops = binary_elem_wise_main_ops | unary_elem_wise_main_ops
-activation_ops = set(("QuantizedRelu", "QuantizedRelu1", "QuantizedRelu6", "Relu", "Relu6", "ReluN1To1"))
-npu_post_ops = activation_ops | set(
- # Bias-add operations: Get rid of these once we have rewrites from Conv2D + BiasAdd + Activation to Conv2DBiasAct.
- ("Mul", "Add", "QuantizedBiasAdd", "Requantize", "QuantizedBatchNorm", "BiasAdd", "FusedBatchNorm")
-)
+activation_ops = Op.op_set(Op.is_relu_op)
+npu_post_ops = activation_ops
npu_post_fuse_limited_ops = set(
# Set of post operators that should not be fused with main/elementwise ops
- ("ConcatSliceWrite", "Sigmoid", "Tanh", "Quantize")
+ (Op.ConcatSliceWrite, Op.Sigmoid, Op.Tanh, Op.Quantize)
)
-elem_wise_ops = elem_wise_main_ops | activation_ops | set(("Sigmoid", "Tanh"))
+elem_wise_ops = elem_wise_main_ops | activation_ops | set((Op.Sigmoid, Op.Tanh))
-quantization_ops = set(("Dequantize", "QuantizeV2", "Max", "Min"))
-cpu_ops = set(("Softmax", "QuantizedSoftmax", "LRN", "Shape", "QuantizedPad", "Pad", "AddN")) | quantization_ops
+quantization_ops = set((Op.Dequantize, Op.Max, Op.Min))
+cpu_ops = set((Op.Softmax, Op.LRN, Op.Shape, Op.Pad, Op.AddN)) | quantization_ops
-npu_dma_ops = set(("DMA",))
-startup_init_ops = set(("Const", "VariableV2", "Placeholder", "SubgraphInput"))
-memory_only_ops = set(("Squeeze", "Reshape", "QuantizedReshape", "ExpandDims",))
+npu_dma_ops = set((Op.DMA,))
+startup_init_ops = set((Op.Const, Op.Placeholder, Op.SubgraphInput))
+memory_only_ops = set((Op.Squeeze, Op.Reshape, Op.QuantizedReshape, Op.ExpandDims,))
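The element-wise categories above are now derived from Op predicates rather than maintained as string tuples, which keeps them in sync with operation.py automatically. A quick sanity sketch (not part of the patch):

    from ethosu.vela.operation import Op

    binary_ops = Op.op_set(Op.is_binary_elementwise_op)
    unary_ops = Op.op_set(Op.is_unary_elementwise_op)

    # The derived sets are disjoint and membership is checked on enum members, not strings
    assert not (binary_ops & unary_ops)
    assert Op.Mul in binary_ops
    assert Op.Abs in unary_ops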
test_sequence = [
@@ -234,10 +211,6 @@ test_sequence = [
for (operation_set, incompatible_pack_flags, flags_to_set, flags_to_clear) in test_sequence:
assert not flags_to_clear & flags_to_set
- if operation_set is not None:
- for op in operation_set:
- assert len(op) > 1 # This is to avoid string literals being decomposed
-
def pack_into_passes(nng, arch, verbose_packing=False):
def visit_op(op, ignored):
@@ -254,7 +227,7 @@ def pack_into_passes(nng, arch, verbose_packing=False):
if op.type in startup_init_ops:
startup_list.append(op)
else:
- _, _, _, ofm_tensor = op.get_ifm_ifm2_weights_ofm()
+ ofm_tensor = op.ofm
if ofm_tensor is None:
ofm_tensor = op.outputs[0]
build_pass((op,), ofm_tensor)
@@ -287,7 +260,7 @@ def pack_into_passes(nng, arch, verbose_packing=False):
continue
reverse_ops_list.append(curr_op)
- new_block_type = curr_op.attrs.get("npu_block_type", NpuBlockType.Default)
+ new_block_type = curr_op.type.npu_block_type
if new_block_type != NpuBlockType.Default:
assert npu_block_type == NpuBlockType.Default
npu_block_type = new_block_type # Only one major block type per pass
@@ -302,10 +275,8 @@ def pack_into_passes(nng, arch, verbose_packing=False):
PassFlags.Mac | PassFlags.ElementWise | PassFlags.Post | PassFlags.PostFusingLimited
):
assert len(curr_op.inputs) >= 1
- if curr_op.type == "BlockLSTM":
- ifm_tensor = curr_op.inputs[3]
- else:
- ifm_tensor = curr_op.inputs[0]
+ ifm_tensor = curr_op.ifm
+ assert ifm_tensor is not None
assert ifm_tensor.purpose == TensorPurpose.FeatureMap
if flags_to_set & PassFlags.Dma:
@@ -377,7 +348,7 @@ def pack_into_passes(nng, arch, verbose_packing=False):
primary_op = create_primary_op(ops_list)
if primary_op is not None:
visit_tensor_refcount[primary_op.inputs[0]] += 1
- npu_block_type = primary_op.attrs["npu_block_type"]
+ npu_block_type = primary_op.type.npu_block_type
for input_tens in primary_op.inputs:
if input_tens not in input_set:
input_set.add(input_tens)
@@ -394,7 +365,7 @@ def pack_into_passes(nng, arch, verbose_packing=False):
for inp in primary_op.inputs:
if inp is None:
continue
- if len(inp.ops) == 1 and inp.ops[0].type == "DMA" and inp.purpose == TensorPurpose.FeatureMap:
+ if len(inp.ops) == 1 and inp.ops[0].type == Op.DMA and inp.purpose == TensorPurpose.FeatureMap:
src_op = inp.ops[0]
if src_op in input_ops_list:
inp = src_op.inputs[0]
@@ -408,7 +379,7 @@ def pack_into_passes(nng, arch, verbose_packing=False):
add_input_list(inp, input_set, input_refcounts, lut_list, ordered_input_list)
name = ops_list[0].name
- non_dma_ops = [op for op in ops_list if op.type != "DMA"]
+ non_dma_ops = [op for op in ops_list if op.type != Op.DMA]
if non_dma_ops:
name = non_dma_ops[0].name
ps = Pass(name, placement, is_element_wise, npu_block_type)
diff --git a/ethosu/vela/register_command_stream_generator.py b/ethosu/vela/register_command_stream_generator.py
index da9be668..073b50fb 100644
--- a/ethosu/vela/register_command_stream_generator.py
+++ b/ethosu/vela/register_command_stream_generator.py
@@ -50,6 +50,7 @@ from .numeric_util import quantise_float32
from .numeric_util import round_away_zero
from .numeric_util import round_up_to_int
from .operation import NpuBlockType
+from .operation import Op
from .tensor import MemType
from .tensor import TensorBlockTraversal
from .tensor import TensorFormat
@@ -357,16 +358,16 @@ def generate_register_command_stream(nng, sg, arch, verbose=False):
# Maps an elementwise op type to an elementwise_mode enum value used by NPU_OP_ELEMENTWISE
elementwise_mode_map = {
- "MulAct": elementwise_mode.MUL.value,
- "AddAct": elementwise_mode.ADD.value,
- "SubAct": elementwise_mode.SUB.value,
- "Minimum": elementwise_mode.MIN.value,
- "Maximum": elementwise_mode.MAX.value,
- "LeakyRelu": elementwise_mode.LRELU.value,
- "Abs": elementwise_mode.ABS.value,
- "CLZ": elementwise_mode.CLZ.value,
- "SHR": elementwise_mode.SHR.value,
- "SHL": elementwise_mode.SHL.value,
+ Op.Mul: elementwise_mode.MUL.value,
+ Op.Add: elementwise_mode.ADD.value,
+ Op.Sub: elementwise_mode.SUB.value,
+ Op.Minimum: elementwise_mode.MIN.value,
+ Op.Maximum: elementwise_mode.MAX.value,
+ Op.LeakyRelu: elementwise_mode.LRELU.value,
+ Op.Abs: elementwise_mode.ABS.value,
+ Op.CLZ: elementwise_mode.CLZ.value,
+ Op.SHR: elementwise_mode.SHR.value,
+ Op.SHL: elementwise_mode.SHL.value,
}
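Every key in the map above is an element-wise Op member, so the same predicates used for pass packing apply here. A quick illustrative check (not part of the patch):

    from ethosu.vela.operation import Op

    for op_type in (Op.Mul, Op.Add, Op.Sub, Op.Minimum, Op.Maximum,
                    Op.LeakyRelu, Op.Abs, Op.CLZ, Op.SHR, Op.SHL):
        # All elementwise-mode keys satisfy the elementwise predicate
        assert op_type.is_elementwise_op()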
cmd_stream = []
@@ -439,15 +440,15 @@ def generate_register_command_stream(nng, sg, arch, verbose=False):
rounding_mode = (
rounding.NATURAL if primary_op.attrs.get("rounding_mode", "") == b"NATURAL" else rounding.TFL
)
- if primary_op.type == "ResizeBilinear":
+ if primary_op.type == Op.ResizeBilinear:
rounding_mode = rounding.TRUNCATE
- fmf = primary_op.attrs.get("fused_memory_function", None)
- faf = primary_op.attrs.get("fused_activation_function", None)
- fused_quantize = any(op.type == "Quantize" for op in ps.ops)
+ fmf = primary_op.memory_function
+ faf = primary_op.activation
+ fused_quantize = any(op.type == Op.Quantize for op in ps.ops)
# Force output scale, used in operations with fused LUT
# Note: with current LUT support, forced_ofm_quantization is always equal to cmd.ofm_tensor.quantization
# except when primary_op is AddAct + 0 (no-op) + LUT
- forced_ofm_quantization = primary_op.attrs.get("forced_output_quantization", None)
+ forced_ofm_quantization = primary_op.forced_output_quantization
ofm_quant = cmd.ofm_tensor.quantization
if forced_ofm_quantization is not None:
ofm_quant = forced_ofm_quantization
@@ -482,16 +483,16 @@ def generate_register_command_stream(nng, sg, arch, verbose=False):
ifm2_broadcast |= IFM2Broadcast.ReverseOperandOrder
# Calculate scales needed for arithmetic elementwise operators
- if primary_op.type in set(("AddAct", "MulAct", "SubAct",)):
+ if primary_op.type in set((Op.Add, Op.Mul, Op.Sub,)):
input_scale = cmd.ifm_tensor.quantization.scale_f32 if cmd.ifm_tensor.quantization else None
input2_scale = cmd.ifm2_tensor.quantization.scale_f32 if cmd.ifm2_tensor.quantization else None
output_scale = ofm_quant.scale_f32 if ofm_quant else None
use_global_scale = True
- if output_scale is not None and faf in ("Sigmoid", "Tanh"):
+ if output_scale is not None and faf in (Op.Sigmoid, Op.Tanh):
output_scale = 1 / 0x3000
- if primary_op.type == "MulAct":
+ if primary_op.type == Op.Mul:
if None in (input_scale, input2_scale, output_scale):
ofm_scale = 1
shift = 0
@@ -537,11 +538,11 @@ def generate_register_command_stream(nng, sg, arch, verbose=False):
emit.cmd1_with_offset(cmd1.NPU_SET_OPB_SCALE, opb_scale)
emit.cmd1_with_offset(cmd1.NPU_SET_OFM_SCALE, ofm_scale, shift)
- elif primary_op.type in set(("LeakyRelu", "Abs",)):
+ elif primary_op.type in set((Op.LeakyRelu, Op.Abs,)):
output_scale = ofm_quant.scale_f32
use_global_scale = True
- if primary_op.type == "LeakyRelu":
+ if primary_op.type == Op.LeakyRelu:
output_scale = primary_op.attrs["alpha"]
ofm_scale, shift = scaling.quantise_scale(output_scale)
@@ -599,10 +600,10 @@ def generate_register_command_stream(nng, sg, arch, verbose=False):
emit.cmd0_with_param(cmd0.NPU_SET_ACC_FORMAT, acc_format_map[shared_buffer.use_accumulator_element])
- if primary_op.type == "ResizeBilinear":
+ if primary_op.type == Op.ResizeBilinear:
# perform nearest neighbor upscale
emit.cmd0_with_param(cmd0.NPU_SET_IFM_UPSCALE, resampling_mode.NEAREST)
- elif primary_op.type == "Conv2DBackpropInputSwitchedBias":
+ elif primary_op.type == Op.Conv2DBackpropInputSwitchedBias:
# perform insert zero upscale
emit.cmd0_with_param(cmd0.NPU_SET_IFM_UPSCALE, resampling_mode.TRANSPOSE)
else:
@@ -651,12 +652,9 @@ def generate_register_command_stream(nng, sg, arch, verbose=False):
valid_padding = sum(explicit_padding) == 0
- if (
- primary_op.type in set(("AvgPool", "AvgPoolAct", "ResizeBilinear", "ReduceSum"))
- and valid_padding
- ):
+ if primary_op.type in set((Op.AvgPool, Op.ResizeBilinear, Op.ReduceSum)) and valid_padding:
# For valid padding vela has to output scaling values
- if faf == "Sigmoid" or faf == "Tanh":
+ if faf == Op.Sigmoid or faf == Op.Tanh:
rescale = 0x3000 * cmd.ifm_tensor.quantization.scale_f32
if cmd.ifm_tensor.dtype == DataType.int16:
# Calculate scale and shift for the output scale of 1/(3*4096)
@@ -675,7 +673,7 @@ def generate_register_command_stream(nng, sg, arch, verbose=False):
ifm_scale_f64 = np.double(cmd.ifm_tensor.quantization.scale_f32)
ofm_scale_f64 = np.double(ofm_quant.scale_f32)
scale, shift = scaling.quantise_scale(ifm_scale_f64 / ofm_scale_f64)
- elif primary_op.type == "ResizeBilinear" and "rescale" in primary_op.attrs:
+ elif primary_op.type == Op.ResizeBilinear and "rescale" in primary_op.attrs:
rescale = primary_op.attrs["rescale"]
rescale_bits = len(bin(round_up_to_int(rescale))) - 2 + 1
scale, shift = scaling.quantise_pooling_scale(k_height * k_width, rescale_bits)
@@ -689,7 +687,7 @@ def generate_register_command_stream(nng, sg, arch, verbose=False):
rescale = cmd.ifm_tensor.quantization.scale_f32 / ofm_quant.scale_f32
rescale_bits = 0
if k_height == k_width == 1:
- if fmf == "ConcatSliceWrite":
+ if fmf == Op.ConcatSliceWrite:
rounding_mode = rounding.NATURAL
if rescale > 1:
rescale_bits = len(bin(round_up_to_int(rescale))) - 2 + 1
@@ -814,35 +812,35 @@ def generate_register_command_stream(nng, sg, arch, verbose=False):
# Even if no activation function, values need to be set to override previous values
faf_min = ofm_quant_qmin
faf_max = ofm_quant_qmax
- elif faf == "Relu":
+ elif faf == Op.Relu:
emit.cmd0_with_param(cmd0.NPU_SET_ACTIVATION, activation.NONE)
faf_min = quantise_float32(0.0, ofm_quant.scale_f32, ofm_quant.zero_point)
faf_max = ofm_quant_qmax
- elif faf == "Relu6":
+ elif faf == Op.Relu6:
emit.cmd0_with_param(cmd0.NPU_SET_ACTIVATION, activation.NONE)
faf_min = quantise_float32(0.0, ofm_quant.scale_f32, ofm_quant.zero_point)
faf_max = quantise_float32(6.0, ofm_quant.scale_f32, ofm_quant.zero_point)
- elif faf == "ReluN1To1":
+ elif faf == Op.ReluN1To1:
emit.cmd0_with_param(cmd0.NPU_SET_ACTIVATION, activation.NONE)
faf_min = quantise_float32(-1.0, ofm_quant.scale_f32, ofm_quant.zero_point)
faf_max = quantise_float32(1.0, ofm_quant.scale_f32, ofm_quant.zero_point)
- elif faf == "Tanh":
+ elif faf == Op.Tanh:
emit.cmd0_with_param(cmd0.NPU_SET_ACTIVATION, activation.TANH)
- if primary_op.type in set(("AvgPool", "AvgPoolAct", "ResizeBilinear")):
+ if primary_op.type in set((Op.AvgPool, Op.ResizeBilinear)):
faf_min = quantise_float32(-1.0, ofm_quant.scale_f32, ofm_quant.zero_point)
faf_max = quantise_float32(1.0, ofm_quant.scale_f32, ofm_quant.zero_point)
else:
faf_min = quantise_float32(clamp_tanh(ifm_min), ofm_quant.scale_f32, ofm_quant.zero_point)
faf_max = quantise_float32(clamp_tanh(ifm_max), ofm_quant.scale_f32, ofm_quant.zero_point)
- elif faf == "Sigmoid":
+ elif faf == Op.Sigmoid:
emit.cmd0_with_param(cmd0.NPU_SET_ACTIVATION, activation.SIGMOID)
- if primary_op.type in set(("AvgPool", "AvgPoolAct", "ResizeBilinear")):
+ if primary_op.type in set((Op.AvgPool, Op.ResizeBilinear)):
faf_min = quantise_float32(0, ofm_quant.scale_f32, ofm_quant.zero_point)
faf_max = quantise_float32(1.0, ofm_quant.scale_f32, ofm_quant.zero_point)
else:
faf_min = quantise_float32(clamp_sigmoid(ifm_min), ofm_quant.scale_f32, ofm_quant.zero_point)
faf_max = quantise_float32(clamp_sigmoid(ifm_max), ofm_quant.scale_f32, ofm_quant.zero_point)
- elif faf == "LUT":
+ elif faf == Op.LUT:
lut_index = int(activation.LUT_START.value) + primary_op.attrs.get("lut_index", -1)
assert activation.LUT_START.value <= lut_index <= activation.LUT_END.value, "LUT index out of range."
if cmd.ofm_tensor.dtype == DataType.int32:
@@ -851,7 +849,7 @@ def generate_register_command_stream(nng, sg, arch, verbose=False):
faf_min = ofm_quant_qmin
faf_max = ofm_quant_qmax
else:
- raise Exception("Unsupported fused_activation_function = " + faf)
+ raise Exception("Unsupported fused_activation_function = " + faf.name)
# Activation range needs to be set based upon the quantisation range and the fused activation range
emit.cmd0_with_param(cmd0.NPU_SET_ACTIVATION_MIN, max(ofm_quant_qmin, faf_min))
@@ -911,14 +909,11 @@ def generate_register_command_stream(nng, sg, arch, verbose=False):
need_zero_point = (
(faf is not None and forced_ofm_quantization is None)
- or (fmf == "ConcatSliceWrite")
+ or (fmf == Op.ConcatSliceWrite)
or fused_quantize
)
if (
- (
- primary_op.type in set(("AvgPool", "AvgPoolAct", "ResizeBilinear", "CLZ", "SHL"))
- and not need_zero_point
- )
+ (primary_op.type in set((Op.AvgPool, Op.ResizeBilinear, Op.CLZ, Op.SHL)) and not need_zero_point)
or (
tens.dtype == DataType.int32
and zero_point_op in (cmd0.NPU_SET_IFM_ZERO_POINT, cmd0.NPU_SET_IFM2_ZERO_POINT)
@@ -933,7 +928,7 @@ def generate_register_command_stream(nng, sg, arch, verbose=False):
zero_point = forced_ofm_quantization.zero_point
elif (
"resizebilinear" in primary_op.attrs
- and primary_op.type == "AddAct"
+ and primary_op.type == Op.Add
and cmd0.NPU_SET_OFM_ZERO_POINT == zero_point_op
):
# Force output zero point same as the input zero point
@@ -1108,7 +1103,7 @@ def generate_register_command_stream(nng, sg, arch, verbose=False):
# Vector product is implemented using a 1x1 convolution
emit.cmd_do_operation(cmd0.NPU_OP_CONV)
elif npu_block_type == NpuBlockType.Pooling:
- param = pooling_mode.MAX.value if "Max" in primary_op.type else pooling_mode.AVERAGE.value
+ param = pooling_mode.MAX.value if primary_op.type.is_maxpool_op() else pooling_mode.AVERAGE.value
emit.cmd_do_operation(cmd0.NPU_OP_POOL, param=param)
elif npu_block_type == NpuBlockType.ReduceSum:
emit.cmd_do_operation(cmd0.NPU_OP_POOL, param=pooling_mode.REDUCE_SUM.value)
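Selecting MAX versus AVERAGE pooling no longer relies on the substring "Max" appearing in a type string; the predicate makes the intent explicit. A sketch of the equivalent decision (illustrative only):

    from ethosu.vela.operation import Op

    for op_type in (Op.MaxPool, Op.QuantizedMaxPool, Op.AvgPool, Op.QuantizedAvgPool):
        # MaxPool variants map to MAX pooling; the remaining Pooling-block ops use AVERAGE
        mode = "MAX" if op_type.is_maxpool_op() else "AVERAGE"
        print(op_type.name, "->", mode)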
diff --git a/ethosu/vela/scheduler.py b/ethosu/vela/scheduler.py
index 5c2ddabb..41e15294 100644
--- a/ethosu/vela/scheduler.py
+++ b/ethosu/vela/scheduler.py
@@ -37,6 +37,7 @@ from .npu_performance import make_metrics_arrays
from .npu_performance import PassCycles
from .numeric_util import full_shape
from .operation import NpuBlockType
+from .operation import Op
from .shared_buffer_allocation import find_block_configs_suitable_for_pass_and_shared_buffer
from .shared_buffer_allocation import shared_buffer_allocation_for_pass_and_block_config
from .tensor import MemArea
@@ -254,11 +255,7 @@ class DynamicProgrammingScheduler:
self.pareto_max_candidates = 16
self.ifm_stream_npu_blocks = set(
- (
- NpuBlockType.ConvolutionMxN,
- NpuBlockType.ConvolutionDepthWise,
- NpuBlockType.Pooling,
- )
+ (NpuBlockType.ConvolutionMxN, NpuBlockType.ConvolutionDepthWise, NpuBlockType.Pooling,)
)
num_pareto_metrics = 4
@@ -652,7 +649,7 @@ class DynamicProgrammingScheduler:
def avoid_for_cascading(self, pred_candidate):
for op in pred_candidate.ops:
if (
- op.type == "ConcatSliceWrite"
+ op.type == Op.ConcatSliceWrite
and self.arch.feature_map_storage_mem_area != self.arch.fast_storage_mem_area
):
# For SRAM spilling, concat op is avoided as predecessor
@@ -981,9 +978,9 @@ class DynamicProgrammingScheduler:
use_NHCWB16 = False
use_fast_storage = False
continue
- if op.type == "ReduceSum" and output.dtype == DataType.int32:
+ if op.type == Op.ReduceSum and output.dtype == DataType.int32:
use_NHCWB16 = False
- elif op.type == "Reshape":
+ elif op.type == Op.Reshape:
# Detect no-op reshapes by comparing their full input and output tensor shapes.
inshape = full_shape(4, op.inputs[0].shape, 1)
outshape = full_shape(4, op.outputs[0].shape, 1)
@@ -995,7 +992,7 @@ class DynamicProgrammingScheduler:
incompatible_consumers = [
(
not consumer.run_on_npu
- or consumer.type == "Reshape"
+ or consumer.type == Op.Reshape
or (consumer is last_op_in_subgraph)
)
for consumer in op.outputs[0].consumer_list
diff --git a/ethosu/vela/shared_buffer_allocation.py b/ethosu/vela/shared_buffer_allocation.py
index 58856a3e..aa5f4c86 100644
--- a/ethosu/vela/shared_buffer_allocation.py
+++ b/ethosu/vela/shared_buffer_allocation.py
@@ -25,6 +25,7 @@ from .architecture_features import SHRAMElements
from .errors import VelaError
from .ethos_u55_regs.ethos_u55_regs import resampling_mode
from .operation import NpuBlockType
+from .operation import Op
from .range_set import MemoryRangeSet
from .tensor import MemArea
@@ -39,7 +40,7 @@ class SharedBufferAllocation:
ifm_tensor, ifm2_tensor, weight_tensor, ofm_tensor = ps.get_primary_op_ifm_ifm2_weights_ofm()
tensors = [t for t in (ifm_tensor, ifm2_tensor, ofm_tensor) if t is not None]
scales = [t.quantization.scale_f32 for t in tensors if t.quantization is not None]
- has_scale = len(tensors) == len(scales) and not None in scales
+ has_scale = len(tensors) == len(scales) and None not in scales
strides = (1, 1, 1, 1)
dilation = (1, 1, 1, 1)
@@ -53,7 +54,7 @@ class SharedBufferAllocation:
k_h = 1
k_w = 1
if weight_tensor:
- if ps.primary_op.type != "FullyConnectedAct":
+ if ps.primary_op.type != Op.FullyConnected:
k_h = weight_tensor.shape[0]
k_w = weight_tensor.shape[1]
else:
@@ -94,7 +95,9 @@ class SharedBufferAllocation:
self.use_ifm_element == SHRAMElements.IFM16_Elementwise
)
elif self.ifm_bits == 32:
- assert self.is_elementwise or ps.npu_block_type == NpuBlockType.ReduceSum, "Unsupported 32-bit IFM operation"
+ assert (
+ self.is_elementwise or ps.npu_block_type == NpuBlockType.ReduceSum
+ ), "Unsupported 32-bit IFM operation"
self.use_ifm_element = SHRAMElements.IFM32
else:
assert self.ifm_bits == 8, "Unexpected IFM bitdepth"
diff --git a/ethosu/vela/softmax.py b/ethosu/vela/softmax.py
index 5a5396f4..12c20164 100644
--- a/ethosu/vela/softmax.py
+++ b/ethosu/vela/softmax.py
@@ -25,6 +25,7 @@ import numpy as np
from . import fp_math
from . import scaling
from .data_type import DataType
+from .operation import Op
from .operation import Operation
from .tensor import create_const_tensor
from .tensor import create_reshape_tensor
@@ -229,7 +230,7 @@ class SoftMax:
# PASS 0 - Depthwise Maxpool
maxpool_op = self.op.clone("_maxpool0")
- maxpool_op.type = "MaxPool"
+ maxpool_op.type = Op.MaxPool
maxpool_h = ifm.shape[1] * ifm.shape[2]
maxpool_w = ifm.shape[3]
maxpool_ifm_shape = [1, maxpool_h, maxpool_w, 1]
@@ -246,7 +247,7 @@ class SoftMax:
maxpool_op.set_output_tensor(ifm_max)
# PASS 1 - Sub+LUT(exp)
- sub_op = Operation("SubAct", self.op.name + "_sub1")
+ sub_op = Operation(Op.Sub, self.op.name + "_sub1")
sub_op.add_input_tensor(ifm)
sub_op.add_input_tensor(create_reshape_tensor(ifm_max, [1, ifm.shape[1], ifm.shape[2], 1]))
sub_op.set_activation_lut(
@@ -262,7 +263,7 @@ class SoftMax:
sub_op.set_output_tensor(ifm_exp)
# PASS 2 - SHR
- shr2_op = Operation("SHR", self.op.name + "_shr2")
+ shr2_op = Operation(Op.SHR, self.op.name + "_shr2")
shr2_op.attrs["rounding_mode"] = b"NATURAL"
shr2_op.add_input_tensor(ifm_exp)
shr2_op.add_input_tensor(
@@ -275,7 +276,7 @@ class SoftMax:
shr2_op.set_output_tensor(rescaled_exp)
# PASS 3 - Reduce sum
- reduce_sum_op = Operation("ReduceSum", self.op.name + "_reduce_sum3")
+ reduce_sum_op = Operation(Op.ReduceSum, self.op.name + "_reduce_sum3")
reduce_sum_op.attrs["padding"] = b"VALID"
reduce_sum_op.attrs["stride_w"] = 1
reduce_sum_op.attrs["stride_h"] = 1
@@ -291,14 +292,14 @@ class SoftMax:
reduce_sum_op.set_output_tensor(sum_of_exp)
# PASS 4 - CLZ
- clz_op = Operation("CLZ", self.op.name + "_clz4")
+ clz_op = Operation(Op.CLZ, self.op.name + "_clz4")
clz_op.add_input_tensor(sum_of_exp)
headroom_plus_one = Tensor(reduce_sum_shape, DataType.int32, clz_op.name + "_0")
headroom_plus_one.quantization = no_scale_quant
clz_op.set_output_tensor(headroom_plus_one)
# PASS 5 - Sub
- sub5_op = Operation("SubAct", self.op.name + "_sub5")
+ sub5_op = Operation(Op.Sub, self.op.name + "_sub5")
sub5_op.add_input_tensor(
create_const_tensor(
"headroom_offset_const",
@@ -316,7 +317,7 @@ class SoftMax:
# PASS 6 - Sub
one = create_const_tensor("one_const", [1, 1, 1, 1], DataType.int32, [1], np.int32, quantization=no_scale_quant)
- sub6_op = Operation("SubAct", self.op.name + "_sub6")
+ sub6_op = Operation(Op.Sub, self.op.name + "_sub6")
sub6_op.add_input_tensor(headroom_plus_one)
sub6_op.add_input_tensor(one)
headroom = Tensor(reduce_sum_shape, DataType.int32, sub6_op.name + "_0")
@@ -324,7 +325,7 @@ class SoftMax:
sub6_op.set_output_tensor(headroom)
# PASS 7 - SHL
- shl7_op = Operation("SHL", self.op.name + "_shl7")
+ shl7_op = Operation(Op.SHL, self.op.name + "_shl7")
shl7_op.add_input_tensor(sum_of_exp)
shl7_op.add_input_tensor(headroom)
shifted_sum = Tensor(reduce_sum_shape, DataType.int32, shl7_op.name + "_0")
@@ -332,7 +333,7 @@ class SoftMax:
shl7_op.set_output_tensor(shifted_sum)
# PASS 8 - Sub
- sub8_op = Operation("SubAct", self.op.name + "_sub8")
+ sub8_op = Operation(Op.Sub, self.op.name + "_sub8")
sub8_op.add_input_tensor(shifted_sum)
sub8_op.add_input_tensor(
create_const_tensor(
@@ -344,7 +345,7 @@ class SoftMax:
sub8_op.set_output_tensor(shifted_sum_minus_one)
# PASS 9 - SHL
- shl9_op = Operation("SHL", self.op.name + "_shl9")
+ shl9_op = Operation(Op.SHL, self.op.name + "_shl9")
shl9_op.add_input_tensor(shifted_sum_minus_one)
shl9_op.add_input_tensor(one)
shifted_sum_minus_one = Tensor(reduce_sum_shape, DataType.int32, shl9_op.name + "_0")
@@ -352,7 +353,7 @@ class SoftMax:
shl9_op.set_output_tensor(shifted_sum_minus_one)
# PASS 10 - Add
- add10_op = Operation("AddAct", self.op.name + "_add10")
+ add10_op = Operation(Op.Add, self.op.name + "_add10")
add10_op.add_input_tensor(
create_const_tensor(
"F0_one_const", [1, 1, 1, 1], DataType.int32, [(1 << 31) - 1], np.int32, quantization=no_scale_quant
@@ -365,7 +366,7 @@ class SoftMax:
add10_op.set_output_tensor(half_denominator)
# PASS 11 - Multiply
- mul11_op = Operation("MulAct", self.op.name + "_mul11")
+ mul11_op = Operation(Op.Mul, self.op.name + "_mul11")
mul11_op.add_input_tensor(half_denominator)
mul11_op.add_input_tensor(
create_const_tensor(
@@ -383,7 +384,7 @@ class SoftMax:
mul11_op.set_output_tensor(rescaled)
# PASS 12 - Add
- add12_op = Operation("AddAct", self.op.name + "_add12")
+ add12_op = Operation(Op.Add, self.op.name + "_add12")
add12_op.add_input_tensor(rescaled)
add12_op.add_input_tensor(
create_const_tensor(
@@ -403,7 +404,7 @@ class SoftMax:
)
for i in range(3):
# PASS 13, 18, 23 - MUL
- mul_op = Operation("MulAct", self.op.name + "_mul%d" % (13 + i * 5))
+ mul_op = Operation(Op.Mul, self.op.name + "_mul%d" % (13 + i * 5))
mul_op.add_input_tensor(nr_x)
mul_op.add_input_tensor(half_denominator)
half_denominator_times_x = Tensor(reduce_sum_shape, DataType.int32, mul_op.name + "_0")
@@ -411,14 +412,14 @@ class SoftMax:
half_denominator_times_x.quantization.scale_f32 = 2.0
mul_op.set_output_tensor(half_denominator_times_x)
# PASS 14, 19, 24 - SUB
- sub_op = Operation("SubAct", self.op.name + "_sub%d" % (14 + i * 5))
+ sub_op = Operation(Op.Sub, self.op.name + "_sub%d" % (14 + i * 5))
sub_op.add_input_tensor(F2_one)
sub_op.add_input_tensor(half_denominator_times_x)
one_minus_half_denominator_times_x = Tensor(reduce_sum_shape, DataType.int32, sub_op.name + "_0")
one_minus_half_denominator_times_x.quantization = one_scale_quant
sub_op.set_output_tensor(one_minus_half_denominator_times_x)
# PASS 15, 20, 25 - MUL
- mul_op = Operation("MulAct", self.op.name + "_mul%d" % (15 + i * 5))
+ mul_op = Operation(Op.Mul, self.op.name + "_mul%d" % (15 + i * 5))
mul_op.add_input_tensor(nr_x)
mul_op.add_input_tensor(one_minus_half_denominator_times_x)
to_rescale = Tensor(reduce_sum_shape, DataType.int32, mul_op.name + "_0")
@@ -426,14 +427,14 @@ class SoftMax:
to_rescale.quantization.scale_f32 = 2.0
mul_op.set_output_tensor(to_rescale)
# PASS 16, 21, 26 - MUL
- shl_op = Operation("MulAct", self.op.name + "_mul%d" % (16 + i * 5))
+ shl_op = Operation(Op.Mul, self.op.name + "_mul%d" % (16 + i * 5))
shl_op.add_input_tensor(to_rescale)
shl_op.add_input_tensor(four)
to_add = Tensor(reduce_sum_shape, DataType.int32, shl_op.name + "_0")
to_add.quantization = no_scale_quant
shl_op.set_output_tensor(to_add)
# PASS 17, 22, 27 - ADD
- add_op = Operation("AddAct", self.op.name + "_add%d" % (17 + i * 5))
+ add_op = Operation(Op.Add, self.op.name + "_add%d" % (17 + i * 5))
add_op.add_input_tensor(nr_x)
add_op.add_input_tensor(to_add)
nr_x = Tensor(reduce_sum_shape, DataType.int32, add_op.name + "_0")
@@ -441,7 +442,7 @@ class SoftMax:
add_op.set_output_tensor(nr_x)
# PASS 28 - Multiply
- mul28_op = Operation("MulAct", self.op.name + "_mul28")
+ mul28_op = Operation(Op.Mul, self.op.name + "_mul28")
mul28_op.add_input_tensor(nr_x)
mul28_op.add_input_tensor(
create_const_tensor("two_const", [1, 1, 1, 1], DataType.int32, [2], np.int32, quantization=no_scale_quant)
@@ -451,7 +452,7 @@ class SoftMax:
mul28_op.set_output_tensor(scale_factor)
# PASS 29 - Multiply
- mul_op = Operation("MulAct", self.op.name + "_mul29")
+ mul_op = Operation(Op.Mul, self.op.name + "_mul29")
mul_op.add_input_tensor(ifm_exp)
mul_op.add_input_tensor(scale_factor)
scaled_exp = Tensor(ifm_exp.shape, DataType.int32, mul_op.name + "_0")
@@ -460,7 +461,7 @@ class SoftMax:
mul_op.set_output_tensor(scaled_exp)
# PASS 30 - SHR
- shr30_op = Operation("SHR", self.op.name + "_shr30")
+ shr30_op = Operation(Op.SHR, self.op.name + "_shr30")
shr30_op.attrs["rounding_mode"] = b"NATURAL"
shr30_op.add_input_tensor(scaled_exp)
shr30_op.add_input_tensor(right_shift)
@@ -474,7 +475,7 @@ class SoftMax:
# PASS 0 - Depthwise Maxpool
maxpool_op = self.op.clone("_maxpool0")
- maxpool_op.type = "MaxPool"
+ maxpool_op.type = Op.MaxPool
maxpool_h = ifm.shape[1] * ifm.shape[2]
maxpool_w = ifm.shape[3]
maxpool_ifm_shape = [1, maxpool_h, maxpool_w, 1]
@@ -491,7 +492,7 @@ class SoftMax:
maxpool_op.set_output_tensor(maxpool_ofm)
# PASS 1 - Sub
- sub1_op = Operation("SubAct", self.op.name + "_sub1")
+ sub1_op = Operation(Op.Sub, self.op.name + "_sub1")
sub1_op.add_input_tensor(ifm)
sub1_op.add_input_tensor(create_reshape_tensor(maxpool_ofm, [1, ifm.shape[1], ifm.shape[2], 1]))
sub1_ofm = Tensor(ifm.shape, DataType.int32, sub1_op.name + "_0")
@@ -504,7 +505,7 @@ class SoftMax:
mul2_scale, _ = scaling.elementwise_mul_scale(sub1_ofm.quantization.scale_f32, beta, mul2_out_range)
mul2_quant = ifm.quantization.clone()
mul2_quant.scale_f32 = beta
- mul2_op = Operation("MulAct", self.op.name + "_mul2")
+ mul2_op = Operation(Op.Mul, self.op.name + "_mul2")
mul2_op.add_input_tensor(sub1_ofm)
mul2_op.add_input_tensor(
create_const_tensor(
@@ -517,7 +518,7 @@ class SoftMax:
mul2_op.set_output_tensor(mul2_ofm)
# PASS 3 - Add+LUT(exp)
- add_op = Operation("AddAct", self.op.name + "_add3")
+ add_op = Operation(Op.Add, self.op.name + "_add3")
add_op.add_input_tensor(mul2_ofm)
add_op.add_input_tensor(
create_const_tensor(
@@ -534,7 +535,7 @@ class SoftMax:
add_op.set_output_tensor(exp_ofm)
# PASS 4 - Reduce sum
- reduce_sum_op = Operation("ReduceSum", self.op.name + "_reduce_sum4")
+ reduce_sum_op = Operation(Op.ReduceSum, self.op.name + "_reduce_sum4")
reduce_sum_op.attrs["padding"] = b"VALID"
reduce_sum_op.attrs["stride_w"] = 1
reduce_sum_op.attrs["stride_h"] = 1
@@ -550,14 +551,14 @@ class SoftMax:
reduce_sum_op.set_output_tensor(sum_of_exp)
# PASS 5 - CLZ
- clz_op = Operation("CLZ", self.op.name + "_clz5")
+ clz_op = Operation(Op.CLZ, self.op.name + "_clz5")
clz_op.add_input_tensor(sum_of_exp)
headroom_plus_one = Tensor(reduce_sum_shape, DataType.int32, clz_op.name + "_0")
headroom_plus_one.quantization = no_scale_quant
clz_op.set_output_tensor(headroom_plus_one)
# PASS 6 - Sub
- sub6_op = Operation("SubAct", self.op.name + "_sub6")
+ sub6_op = Operation(Op.Sub, self.op.name + "_sub6")
sub6_op.add_input_tensor(
create_const_tensor(
sub6_op.name + "_const", [1, 1, 1, 1], DataType.int32, [31], np.int32, quantization=no_scale_quant
@@ -569,7 +570,7 @@ class SoftMax:
sub6_op.set_output_tensor(reciprocal_right_shift)
# PASS 7 - SHL
- shl7_op = Operation("SHL", self.op.name + "_shl7")
+ shl7_op = Operation(Op.SHL, self.op.name + "_shl7")
shl7_op.add_input_tensor(
create_const_tensor(
shl7_op.name + "_const", [1, 1, 1, 1], DataType.int32, [1], np.int32, quantization=no_scale_quant
@@ -581,7 +582,7 @@ class SoftMax:
shl7_op.set_output_tensor(constant_one)
# PASS 8 - Sub
- sub8_op = Operation("SubAct", self.op.name + "_sub8")
+ sub8_op = Operation(Op.Sub, self.op.name + "_sub8")
sub8_op.add_input_tensor(sum_of_exp)
sub8_op.add_input_tensor(constant_one)
sum_of_exps_minus_one = Tensor(reduce_sum_shape, DataType.int32, sub8_op.name + "_0")
@@ -589,7 +590,7 @@ class SoftMax:
sub8_op.set_output_tensor(sum_of_exps_minus_one)
# PASS 9 - SHL
- shl9_op = Operation("SHL", self.op.name + "_shl9")
+ shl9_op = Operation(Op.SHL, self.op.name + "_shl9")
shl9_op.add_input_tensor(sum_of_exps_minus_one)
shl9_op.add_input_tensor(headroom_plus_one)
shifted_sum_minus_one = Tensor(reduce_sum_shape, DataType.int32, shl9_op.name + "_0")
@@ -597,7 +598,7 @@ class SoftMax:
shl9_op.set_output_tensor(shifted_sum_minus_one)
# PASS 10 - SHR
- shr10_op = Operation("SHR", self.op.name + "_shr10")
+ shr10_op = Operation(Op.SHR, self.op.name + "_shr10")
shr10_op.add_input_tensor(shifted_sum_minus_one)
shr10_op.add_input_tensor(
create_const_tensor(
@@ -609,7 +610,7 @@ class SoftMax:
shr10_op.set_output_tensor(shifted_sum_minus_one_16)
# PASS 11 - Sub+LUT(one over one plus x)
- sub11_op = Operation("SubAct", self.op.name + "_sub11")
+ sub11_op = Operation(Op.Sub, self.op.name + "_sub11")
sub11_op.add_input_tensor(shifted_sum_minus_one_16)
sub11_op.add_input_tensor(
create_const_tensor(
@@ -631,7 +632,7 @@ class SoftMax:
sub11_op.set_output_tensor(reciprocal_scale)
# PASS 12 - Multiply
- mul_op = Operation("MulAct", self.op.name + "_mul12")
+ mul_op = Operation(Op.Mul, self.op.name + "_mul12")
mul_op.add_input_tensor(exp_ofm)
mul_op.add_input_tensor(reciprocal_scale)
mul_ofm = Tensor(exp_ofm.shape, DataType.int32, mul_op.name + "_0")
@@ -639,7 +640,7 @@ class SoftMax:
mul_op.set_output_tensor(mul_ofm)
# PASS 13 - SHR
- shr13_op = Operation("SHR", self.op.name + "_shr13")
+ shr13_op = Operation(Op.SHR, self.op.name + "_shr13")
shr13_op.add_input_tensor(mul_ofm)
shr13_op.add_input_tensor(reciprocal_right_shift)
shr13_op.set_output_tensor(ofm)
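All of the decomposed softmax passes are now constructed with Op members. A minimal sketch of one such pass, mirroring the SHR steps above (tensor names are illustrative, not taken from the patch):

    from ethosu.vela.data_type import DataType
    from ethosu.vela.operation import Op, Operation
    from ethosu.vela.tensor import Tensor

    shr_op = Operation(Op.SHR, "softmax_example_shr")
    shr_op.attrs["rounding_mode"] = b"NATURAL"
    shr_op.add_input_tensor(Tensor([1, 1, 1, 8], DataType.int32, "scaled_exp"))
    shr_op.add_input_tensor(Tensor([1, 1, 1, 1], DataType.int32, "right_shift"))
    shr_op.set_output_tensor(Tensor([1, 1, 1, 8], DataType.int32, "shr_out"))
    assert shr_op.type.is_binary_elementwise_op()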
diff --git a/ethosu/vela/stats_writer.py b/ethosu/vela/stats_writer.py
index 2ea14f2b..6fd68f85 100644
--- a/ethosu/vela/stats_writer.py
+++ b/ethosu/vela/stats_writer.py
@@ -25,6 +25,7 @@ from .npu_performance import BandwidthDirection
from .npu_performance import MacCount
from .npu_performance import PassCycles
from .numeric_util import round_up_to_int
+from .operation import Op
from .tensor import MemArea
from .tensor import TensorPurpose
@@ -192,11 +193,11 @@ def write_pass_metrics_csv(nng, pass_filename):
continue # skip the dummy init pass
for ps in cps.passes:
- if len(ps.ops) == 1 and ps.ops[0].type == "NpuOp":
+ if len(ps.ops) == 1 and ps.ops[0].type == Op.CustomNpuOp:
# just treat this as a call, unroll it
write_subgraph(ps.ops[0].attrs["subgraph"])
continue
- stats = [ps.name, " ".join(op.type for op in ps.ops)]
+ stats = [ps.name, " ".join(op.type.name for op in ps.ops)]
stats += [ps.placement.name]
stats += [cps.strategy.name]
stats += list(ps.block_config)
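Because Op.__str__ returns the member name, reporting code can use either op.type.name (as above) or str(op.type) to obtain the bare operator name. A quick check (assumed, not from the patch):

    from ethosu.vela.operation import Op

    assert Op.Softmax.name == "Softmax"
    assert str(Op.Softmax) == "Softmax"
    assert " ".join(op_type.name for op_type in (Op.Conv2D, Op.Relu)) == "Conv2D Relu"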
diff --git a/ethosu/vela/supported_operators.py b/ethosu/vela/supported_operators.py
index 6ae072f0..3d4a09f3 100644
--- a/ethosu/vela/supported_operators.py
+++ b/ethosu/vela/supported_operators.py
@@ -20,6 +20,7 @@ import numpy as np
from .data_type import BaseType
from .data_type import DataType
from .operation import get_slice_offsets
+from .operation import Op
# Custom decorator function to allow formatting docstrings containing "{}"
@@ -37,18 +38,18 @@ def warn_cpu(op, msg):
class SupportedOperators:
# Categorised lists of supported operators
- npu_pre_ops = set(("QuantizedResizeBilinear", "SplitSliceRead",))
- convolution_ops = set(("Conv2DBiasAct", "Conv2D", "QuantizedConv2D",))
- depthwise_convolution_ops = set(("DepthwiseConv2dBiasAct", "DepthwiseConv2dNative", "QuantizedDepthwiseConv2D",))
- transpose_convolution_ops = set(("Conv2DBackpropInput",))
- max_pooling_ops = set(("QuantizedMaxPool", "MaxPool", "MaxPoolAct",))
- avg_pooling_ops = set(("QuantizedAvgPool", "AvgPool", "AvgPoolAct",))
- pooling_ops = set(("ReduceSum",)) | max_pooling_ops | avg_pooling_ops
- resizing_ops = set(("ResizeBilinear",))
- fc_vector_products = set(("QuantizedMatMul", "MatMul", "FullyConnectedAct",))
+ npu_pre_ops = set((Op.SplitSliceRead,))
+ convolution_ops = set((Op.Conv2DBias, Op.Conv2D, Op.QuantizedConv2D,))
+ depthwise_convolution_ops = set((Op.DepthwiseConv2DBias,))
+ transpose_convolution_ops = set((Op.Conv2DBackpropInput,))
+ max_pooling_ops = Op.op_set(Op.is_maxpool_op)
+ avg_pooling_ops = Op.op_set(Op.is_avgpool_op)
+ pooling_ops = set((Op.ReduceSum,)) | max_pooling_ops | avg_pooling_ops
+ resizing_ops = set((Op.ResizeBilinear,))
+ fc_vector_products = set((Op.QuantizedMatMul, Op.MatMul, Op.FullyConnected,))
mac_main_ops = (
# RNN/LSTM/GRU
- set(("BlockLSTM",))
+ set((Op.BlockLSTM,))
# convolutions
| convolution_ops
# depth-wise convolutions
@@ -62,45 +63,29 @@ class SupportedOperators:
# FC layers
| fc_vector_products
)
- unary_elem_wise_main_ops = set(("LeakyRelu", "Abs", "CLZ",))
- binary_elem_wise_min_max_ops = set(("Minimum", "Maximum",))
- binary_elem_wise_shift_ops = set(("SHL", "SHR",))
- binary_elem_wise_add_mul_sub = set(
- ("AddAct", "MulAct", "SubAct", "QuantizedAdd", "QuantizedSub", "QuantizedMul", "Mul", "Add", "Sub",)
- )
+ unary_elem_wise_main_ops = Op.op_set(Op.is_unary_elementwise_op)
+ binary_elem_wise_min_max_ops = set((Op.Minimum, Op.Maximum,))
+ binary_elem_wise_shift_ops = set((Op.SHL, Op.SHR,))
+ binary_elem_wise_add_mul_sub = set((Op.Add, Op.Mul, Op.Sub,))
binary_elem_wise_main_ops = binary_elem_wise_min_max_ops | binary_elem_wise_add_mul_sub | binary_elem_wise_shift_ops
elem_wise_main_ops = binary_elem_wise_main_ops | unary_elem_wise_main_ops
supported_int32_tensor_ops = (
- set(("Requantize", "ReduceSum", "CLZ",)) | binary_elem_wise_add_mul_sub | binary_elem_wise_shift_ops
- )
- activation_ops = set(
- (
- "QuantizedRelu",
- "QuantizedRelu1",
- "QuantizedRelu6",
- "Relu",
- "Relu6",
- "ReluN1To1",
- "Sigmoid",
- "Tanh",
- "Softmax",
- )
+ set((Op.ReduceSum, Op.CLZ,)) | binary_elem_wise_add_mul_sub | binary_elem_wise_shift_ops
)
+ activation_ops = set((Op.Relu, Op.Relu6, Op.ReluN1To1, Op.Sigmoid, Op.Tanh, Op.Softmax,))
npu_post_ops = (
+ # activation functions
+ activation_ops
# concatenation write direction
- set(("ConcatSliceWrite",))
- # bias add and batch norm
- | set(("QuantizedBiasAdd", "Requantize", "QuantizedBatchNorm", "BiasAdd", "FusedBatchNorm",))
+ | set((Op.ConcatSliceWrite,))
# Quantization
- | set(("Quantize",))
- # activation functions
- | activation_ops
+ | set((Op.Quantize,))
)
- split_ops = set(("Split", "SplitV", "StridedSlice", "Slice", "UnpackReshaped", "Unpack",))
- concat_ops = set(("Concat", "ConcatV2", "QuantizedConcat", "ConcatTFLite", "PackReshaped", "Pack",))
- memory_only_ops = set(("Squeeze", "Reshape", "QuantizedReshape", "ExpandDims",)) | concat_ops | split_ops
- shapeless_input_ops = set(("Split", "SplitV",)) | binary_elem_wise_main_ops
- supported_fused_activations = set(("Relu", "Relu6", "ReluN1To1", "Tanh", "Sigmoid", "LUT",))
+ split_ops = set((Op.Split, Op.SplitV, Op.StridedSlice, Op.Slice, Op.UnpackReshaped, Op.Unpack,))
+ concat_ops = set((Op.Concat, Op.ConcatTFLite, Op.PackReshaped, Op.Pack,))
+ memory_only_ops = set((Op.Squeeze, Op.Reshape, Op.QuantizedReshape, Op.ExpandDims,)) | concat_ops | split_ops
+ shapeless_input_ops = binary_elem_wise_main_ops | set((Op.Split, Op.SplitV,))
+ supported_fused_activations = set((Op.Relu, Op.Relu6, Op.ReluN1To1, Op.Tanh, Op.Sigmoid, Op.LUT,))
supported_operators = npu_pre_ops | mac_main_ops | elem_wise_main_ops | npu_post_ops | memory_only_ops
supported_dtypes = set((DataType.uint8, DataType.int8, DataType.int16, DataType.int32))
# Defined ranges for allowed values:
@@ -233,7 +218,7 @@ class SupportedOperators:
@docstring_format_args([supported_fused_activations])
def constraint_faf(cls, op):
"The fused activation function (if present) must be one of type: {}"
- faf = op.attrs.get("fused_activation_function")
+ faf = op.activation
valid = (faf is None) or (faf in cls.supported_fused_activations)
extra = "fused_activation_function={}".format(faf)
return valid, extra
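The fused-activation constraint now checks Op members against the class-level set. A small sketch of the membership tests involved (not part of the patch):

    from ethosu.vela.operation import Op
    from ethosu.vela.supported_operators import SupportedOperators

    # Relu-family, Tanh, Sigmoid and LUT are the only fusable activation functions
    assert Op.Relu6 in SupportedOperators.supported_fused_activations
    assert Op.Softmax not in SupportedOperators.supported_fused_activations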
@@ -300,7 +285,7 @@ class SupportedOperators:
@classmethod
def check_depthwise_convolution_restrictions(cls, op):
# check depth
- ifm_tensor, _, _, ofm_tensor = op.get_ifm_ifm2_weights_ofm()
+ ifm_tensor, ofm_tensor = op.get_ifm_ofm()
if op.attrs["depth_multiplier"] > 1 and not (
(ifm_tensor.shape[3] == 1) and (ofm_tensor.shape[3] == op.attrs["depth_multiplier"])
):
@@ -337,9 +322,9 @@ class SupportedOperators:
return False
# check data type
- ifm_tensor, _, _, ofm_tensor = op.get_ifm_ifm2_weights_ofm()
+ ifm_tensor, ofm_tensor = op.get_ifm_ofm()
if ifm_tensor.dtype != ofm_tensor.dtype:
- if op.type != "ReduceSum":
+ if op.type != Op.ReduceSum:
return False
# TODO: else check ReduceSum restrictions.
@@ -365,7 +350,7 @@ class SupportedOperators:
@classmethod
def check_resize_restrictions(cls, op):
# check unsupported upscaling factor
- if op.type == "ResizeBilinear":
+ if op.type == Op.ResizeBilinear:
if op.inputs[0].shape[1] == 1 and op.inputs[0].shape[2] == 1:
return True
if op.inputs[0].shape == op.outputs[0].shape:
@@ -424,10 +409,10 @@ class SupportedOperators:
ifm_tensor.dtype == ofm_tensor.dtype or ofm_tensor.dtype == DataType.int32
):
return False
- elif op.type in cls.binary_elem_wise_shift_ops | set(("CLZ")):
+ elif op.type in cls.binary_elem_wise_shift_ops:
if ifm_tensor.dtype != DataType.int32 or ifm2_tensor.dtype != DataType.int32:
return False
- if op.type in ("CLZ", "SHL") and ofm_tensor.dtype != DataType.int32:
+ if op.type in (Op.CLZ, Op.SHL) and ofm_tensor.dtype != DataType.int32:
return False
# check batch size
@@ -438,7 +423,7 @@ class SupportedOperators:
return False
# negative alpha values are not supported
- if op.type == "LeakyRelu" and op.attrs["alpha"] < 0:
+ if op.type == Op.LeakyRelu and op.attrs["alpha"] < 0:
return False
# check if ifm or ifm2 has ofm shape
@@ -452,7 +437,7 @@ class SupportedOperators:
@classmethod
def check_memory_only_restrictions(cls, op):
- if op.type == "StridedSlice":
+ if op.type == Op.StridedSlice:
if len(op.inputs) != 4:
warn_cpu(op, "has {} input tensors, only 4 inputs are supported".format(len(op.inputs)))
return False
@@ -493,7 +478,7 @@ class SupportedOperators:
),
)
return False
- if op.type == "SplitV":
+ if op.type == Op.SplitV:
# check that maximum one size is set to -1, indicating that size should be inferred
sizes = op.inputs[1].values
num_to_be_inferred = 0
@@ -504,7 +489,7 @@ class SupportedOperators:
if num_to_be_inferred > 1:
print("Warning:", op.type, "has more than one size to be inferred, which is illegal, placing on CPU")
return False
- if op.type.find("Concat") != -1:
+ if op.type in set((Op.Concat, Op.ConcatTFLite,)):
axis = op.attrs.get("axis", None)
if axis is None:
print("Warning:", op.type, "invalid or missing axis, placing on CPU")
@@ -554,7 +539,7 @@ class SupportedOperators:
@classmethod
def check_activation_ops(cls, op):
- if op.type == "Softmax":
+ if op.type == Op.Softmax:
ifm_tensor = op.inputs[0]
ofm_tensor = op.outputs[0]
diff --git a/ethosu/vela/tensor.py b/ethosu/vela/tensor.py
index c0786bfc..98dfa3d3 100644
--- a/ethosu/vela/tensor.py
+++ b/ethosu/vela/tensor.py
@@ -25,6 +25,7 @@ import numpy as np
from . import numeric_util
from .data_type import DataType
from .ethos_u55_regs.ethos_u55_regs import resampling_mode
+from .operation import Op
from .operation import Operation
from .range_set import MemoryRangeSet
@@ -242,7 +243,7 @@ def create_const_tensor(name, shape, dtype, values, value_dtype=None, purpose=Te
const_tensor.values = np.array(values, dtype=value_dtype)
const_tensor.quant_values = np.frombuffer(const_tensor.values.tobytes(), dtype=np.uint8)
# Operator
- const_op = Operation("Const", name)
+ const_op = Operation(Op.Const, name)
const_op.set_output_tensor(const_tensor)
return const_tensor
@@ -258,7 +259,7 @@ def create_reshape_tensor(tens, shape, ifm_reshape=True):
if not ifm_reshape:
reshape_ifm, reshape_ofm = reshape_ofm, reshape_ifm
# Operator
- reshape_op = Operation("Reshape", name)
+ reshape_op = Operation(Op.Reshape, name)
reshape_op.attrs["new_shape"] = shape
reshape_op.add_input_tensor(reshape_ifm)
reshape_op.add_input_tensor(create_const_tensor(name + "_shape", [1], DataType.int32, shape))
@@ -649,7 +650,7 @@ class Tensor:
return strides
def needs_dma(self):
- return len(self.ops) == 1 and self.ops[0].type == "DMA"
+ return len(self.ops) == 1 and self.ops[0].type == Op.DMA
def get_dma_src_tensor(self):
# For weight tensors that need DMA: returns the source tensor in Flash, else None
@@ -659,7 +660,7 @@ class Tensor:
def find_npu_op(self):
# Returns the NPU operator that uses this tensor, excluding DMA operators.
for op in self.consumers():
- if op.type == "DMA":
+ if op.type == Op.DMA:
return op.outputs[0].find_npu_op()
if op.run_on_npu:
return op
diff --git a/ethosu/vela/test/test_lut.py b/ethosu/vela/test/test_lut.py
index ee1a40fe..44ee0afb 100644
--- a/ethosu/vela/test/test_lut.py
+++ b/ethosu/vela/test/test_lut.py
@@ -26,6 +26,7 @@ from ethosu.vela import pass_packing
from ethosu.vela.data_type import DataType
from ethosu.vela.high_level_command_stream import DMA
from ethosu.vela.nn_graph import Graph
+from ethosu.vela.operation import Op
from ethosu.vela.rewrite_graph import verify_graph_health
from ethosu.vela.tensor import create_const_tensor
from ethosu.vela.tensor import TensorPurpose
@@ -94,28 +95,28 @@ def test_optimize_high_level_cmd_stream_2K():
arch = testutil.create_arch()
shape = [1, 1, 1, 1]
# u8 LUT op, should lead to DMA
- op0 = testutil.create_elemwise_op("AddAct", "op0", shape, shape, shape)
+ op0 = testutil.create_elemwise_op(Op.Add, "op0", shape, shape, shape)
set_256_lut(op0, "lut0")
# u8 LUT op, should lead to DMA
- op1 = testutil.create_elemwise_op("AddAct", "op1", shape, shape, shape)
+ op1 = testutil.create_elemwise_op(Op.Add, "op1", shape, shape, shape)
set_256_lut(op1, "lut1")
# u8 LUT op with different LUT, should lead to DMA
- op2 = testutil.create_elemwise_op("AddAct", "op2", shape, shape, shape)
+ op2 = testutil.create_elemwise_op(Op.Add, "op2", shape, shape, shape)
set_256_lut(op2, "lut2")
# u8 LUT op with same LUT as in op1, should not lead to DMA
- op3 = testutil.create_elemwise_op("AddAct", "op3", shape, shape, shape)
+ op3 = testutil.create_elemwise_op(Op.Add, "op3", shape, shape, shape)
set_256_lut(op3, "lut1")
# u8 LUT op with same LUT as in op2, should not lead to DMA
- op4 = testutil.create_elemwise_op("AddAct", "op4", shape, shape, shape)
+ op4 = testutil.create_elemwise_op(Op.Add, "op4", shape, shape, shape)
set_256_lut(op4, "lut2")
# 2K LUT op, should lead to DMA, and will overwrite all previous LUTs in SHRAM
- op5_2K = testutil.create_elemwise_op("AddAct", "op5", shape, shape, shape)
+ op5_2K = testutil.create_elemwise_op(Op.Add, "op5", shape, shape, shape)
set_2K_lut(op5_2K, "lut5")
# Another 2K LUT op, should lead to DMA, and will overwrite the previous LUT in SHRAM
- op6_2K = testutil.create_elemwise_op("AddAct", "op6", shape, shape, shape)
+ op6_2K = testutil.create_elemwise_op(Op.Add, "op6", shape, shape, shape)
set_2K_lut(op6_2K, "lut6")
# u8 LUT op with same LUT as in op1, should lead to DMA
- op7 = testutil.create_elemwise_op("AddAct", "op7", shape, shape, shape)
+ op7 = testutil.create_elemwise_op(Op.Add, "op7", shape, shape, shape)
set_256_lut(op7, "lut1")
op_list = [op0, op1, op2, op3, op4, op5_2K, op6_2K, op7]
@@ -149,28 +150,28 @@ def test_optimize_high_level_cmd_stream_1K():
arch = testutil.create_arch()
shape = [1, 1, 1, 1]
# u8 LUT op, should lead to DMA
- op0 = testutil.create_elemwise_op("AddAct", "op0", shape, shape, shape)
+ op0 = testutil.create_elemwise_op(Op.Add, "op0", shape, shape, shape)
set_256_lut(op0, "lut0")
# u8 LUT op, should lead to DMA
- op1 = testutil.create_elemwise_op("AddAct", "op1", shape, shape, shape)
+ op1 = testutil.create_elemwise_op(Op.Add, "op1", shape, shape, shape)
set_256_lut(op1, "lut1")
# 1K LUT op with different LUT, should lead to DMA
- op2_1K = testutil.create_elemwise_op("AddAct", "op2", shape, shape, shape)
+ op2_1K = testutil.create_elemwise_op(Op.Add, "op2", shape, shape, shape)
set_1K_lut(op2_1K, "lut2")
# u8 LUT op with same LUT as in op1, should not lead to DMA
- op3 = testutil.create_elemwise_op("AddAct", "op3", shape, shape, shape)
+ op3 = testutil.create_elemwise_op(Op.Add, "op3", shape, shape, shape)
set_256_lut(op3, "lut1")
# 1K LUT op with same LUT as in op2, should not lead to DMA
- op4_1K = testutil.create_elemwise_op("AddAct", "op4", shape, shape, shape)
+ op4_1K = testutil.create_elemwise_op(Op.Add, "op4", shape, shape, shape)
set_1K_lut(op4_1K, "lut2")
# 1K LUT op, should lead to DMA, and will overwrite lut2
- op5_2K = testutil.create_elemwise_op("AddAct", "op5", shape, shape, shape)
+ op5_2K = testutil.create_elemwise_op(Op.Add, "op5", shape, shape, shape)
set_1K_lut(op5_2K, "lut5")
# u8 LUT op, lut0 should still be present, should not lead to DMA
- op6 = testutil.create_elemwise_op("AddAct", "op6", shape, shape, shape)
+ op6 = testutil.create_elemwise_op(Op.Add, "op6", shape, shape, shape)
set_256_lut(op6, "lut0")
# 1K LUT op with same LUT as in op2, should lead to DMA
- op7 = testutil.create_elemwise_op("AddAct", "op7", shape, shape, shape)
+ op7 = testutil.create_elemwise_op(Op.Add, "op7", shape, shape, shape)
set_1K_lut(op7, "lut2")
op_list = [op0, op1, op2_1K, op3, op4_1K, op5_2K, op6, op7]
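The expected-DMA comments in these two tests rely on the LUT area being 2KB of SHRAM, i.e. eight 256-byte slots, so a 2K LUT displaces every earlier LUT while re-using an already resident LUT needs no new DMA. A toy sketch of just that bookkeeping, assuming a single whole-area eviction rule; the real pass allocates individual slots while optimising the high-level command stream:

LUT_AREA_BYTES = 2048
SLOT_BYTES = 256

def needs_dma(lut_id, lut_bytes, resident):
    # No DMA if the same LUT content is already resident in SHRAM.
    if lut_id in resident:
        return False
    if lut_bytes // SLOT_BYTES >= LUT_AREA_BYTES // SLOT_BYTES:
        resident.clear()  # a full-size LUT overwrites every earlier LUT
    resident.add(lut_id)
    return True

resident = set()
assert needs_dma("lut1", 256, resident)      # first use: DMA
assert not needs_dma("lut1", 256, resident)  # same LUT again: no DMA
assert needs_dma("lut5", 2048, resident)     # 2K LUT: DMA, evicts everything
assert needs_dma("lut1", 256, resident)      # lut1 was evicted: DMA again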
diff --git a/ethosu/vela/test/test_supported_operators.py b/ethosu/vela/test/test_supported_operators.py
index 53c20927..20d448d7 100644
--- a/ethosu/vela/test/test_supported_operators.py
+++ b/ethosu/vela/test/test_supported_operators.py
@@ -19,6 +19,7 @@
import numpy as np
from ethosu.vela.data_type import DataType
+from ethosu.vela.operation import Op
from ethosu.vela.supported_operators import SupportedOperators
from ethosu.vela.tensor import create_const_tensor
from ethosu.vela.tensor import QuantizationParameters
@@ -35,7 +36,7 @@ def create_strided_slice_op(in_shape, out_shape, start_offsets, end_offsets):
in3 = create_const_tensor("strides", [len(end_offsets)], DataType.uint8, len(end_offsets) * [1])
out = Tensor(out_shape, DataType.uint8, "out")
attrs = {"ellipsis_mask": 0, "new_axis_mask": 0, "shrink_axis_mask": 0, "begin_mask": 0, "end_mask": 0}
- return testutil.create_op("StridedSlice", [in0, in1, in2, in3], out, attrs=attrs)
+ return testutil.create_op(Op.StridedSlice, [in0, in1, in2, in3], out, attrs=attrs)
def create_strided_slice():
@@ -93,21 +94,21 @@ def test_constraint_tens_defined_shape():
# Tensors cannot have None in them
inp = Tensor([1, 8, None, 8], DataType.uint8, "in")
out = Tensor([1, 8, 8, 8], DataType.uint8, "out")
- op = testutil.create_op("Relu", [inp], out)
+ op = testutil.create_op(Op.Relu, [inp], out)
assert not support.is_operator_supported(op)
def test_constraint_tens_shapeless():
# Shapeless input is allowed if it's of a certain type:
- op = testutil.create_elemwise_op("Mul", "scalar_mul", [1, 8, 8, 8], [], [1, 8, 8, 8])
+ op = testutil.create_elemwise_op(Op.Mul, "scalar_mul", [1, 8, 8, 8], [], [1, 8, 8, 8])
assert support.is_operator_supported(op)
# Shapeless output is not allowed at all:
- op = testutil.create_elemwise_op("Mul", "scalar_mul", [1, 8, 8, 8], [1, 8, 8, 8], [])
+ op = testutil.create_elemwise_op(Op.Mul, "scalar_mul", [1, 8, 8, 8], [1, 8, 8, 8], [])
assert not support.is_operator_supported(op)
# Invalid shapeless input due to op type:
inp = Tensor([], DataType.uint8, "in")
out = Tensor([1, 8, 8, 8], DataType.uint8, "out")
- op = testutil.create_op("Relu", [inp], out)
+ op = testutil.create_op(Op.Relu, [inp], out)
assert not support.is_operator_supported(op)
@@ -115,7 +116,7 @@ def test_constraint_tens_shape_size():
# Tensors cannot be > 4D
inp = Tensor([1, 1, 8, 8, 8], DataType.uint8, "in")
out = Tensor([1, 1, 8, 8, 8], DataType.uint8, "out")
- op = testutil.create_op("Relu", [inp], out)
+ op = testutil.create_op(Op.Relu, [inp], out)
assert not support.is_operator_supported(op)
@@ -123,14 +124,14 @@ def test_constraint_tens_dtype():
# Tensors can only be of type uint8, int8, int16 (and int32)
inp = Tensor([1, 8, 8, 8], DataType.float32, "in")
out = Tensor([1, 8, 8, 8], DataType.float32, "out")
- op = testutil.create_op("Relu", [inp], out)
+ op = testutil.create_op(Op.Relu, [inp], out)
assert not support.is_operator_supported(op)
# For int32, only select op types are allowed:
- op = testutil.create_elemwise_op("Mul", "scalar_mul", [1, 8, 8, 8], [], [1, 8, 8, 8], DataType.int32)
+ op = testutil.create_elemwise_op(Op.Mul, "scalar_mul", [1, 8, 8, 8], [], [1, 8, 8, 8], DataType.int32)
assert support.is_operator_supported(op)
inp = Tensor([1, 8, 8, 8], DataType.int32, "in")
out = Tensor([1, 8, 8, 8], DataType.int32, "out")
- op = testutil.create_op("Relu", [inp], out)
+ op = testutil.create_op(Op.Relu, [inp], out)
assert not support.is_operator_supported(op)
@@ -138,11 +139,11 @@ def test_constraint_tens_dimension():
# Tensors can only have values in the inclusive range of 1-65535
inp = Tensor([1, 8, 8, 0], DataType.uint8, "in")
out = Tensor([1, 8, 8, 0], DataType.uint8, "out")
- op = testutil.create_op("Relu", [inp], out)
+ op = testutil.create_op(Op.Relu, [inp], out)
assert not support.is_operator_supported(op)
inp = Tensor([1, 8, 8, 65536], DataType.uint8, "in")
out = Tensor([1, 8, 8, 65536], DataType.uint8, "out")
- op = testutil.create_op("Relu", [inp], out)
+ op = testutil.create_op(Op.Relu, [inp], out)
assert not support.is_operator_supported(op)
@@ -150,13 +151,14 @@ def test_constraint_faf():
# Fused activation functions, if set, must be a valid op type
inp = Tensor([1, 8, 8, 8], DataType.uint8, "in")
out = Tensor([1, 8, 8, 8], DataType.uint8, "out")
- op = testutil.create_op("Relu", [inp], out, attrs={"fused_activation_function": "Conv2D"})
+ op = testutil.create_op(Op.Relu, [inp], out)
+ op.activation = Op.Conv2D
assert not support.is_operator_supported(op)
def test_constraint_tens_quant_scale():
# Quantization scale cannot be infinite
- op = testutil.create_elemwise_op("Mul", "scalar_mul", [1, 8, 8, 8], [], [1, 8, 8, 8])
+ op = testutil.create_elemwise_op(Op.Mul, "scalar_mul", [1, 8, 8, 8], [], [1, 8, 8, 8])
op.inputs[0].quantization = QuantizationParameters()
op.inputs[0].quantization.scale_f32 = np.inf
assert not support.is_operator_supported(op)
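test_constraint_faf now assigns an Op member to op.activation instead of stashing a string in attrs. A small sketch of how a fused-activation check can work with an enum-valued field; the member names and the supported set below are illustrative, not vela's actual constraint list:

from enum import Enum, auto

class Op(Enum):
    Relu = auto()
    Relu6 = auto()
    Tanh = auto()
    Sigmoid = auto()
    Conv2D = auto()

supported_fused_activations = {Op.Relu, Op.Relu6, Op.Tanh, Op.Sigmoid}

def faf_is_valid(activation):
    # None means "no fused activation"; anything else must be a known activation op.
    return activation is None or activation in supported_fused_activations

assert faf_is_valid(None)
assert faf_is_valid(Op.Relu6)
assert not faf_is_valid(Op.Conv2D)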
diff --git a/ethosu/vela/test/test_tflite_reader.py b/ethosu/vela/test/test_tflite_reader.py
index d63c0007..23abb4a0 100644
--- a/ethosu/vela/test/test_tflite_reader.py
+++ b/ethosu/vela/test/test_tflite_reader.py
@@ -20,6 +20,7 @@ from unittest.mock import patch
import pytest
+from ethosu.vela.operation import Op
from ethosu.vela.tflite_reader import TFLiteSubgraph
@@ -41,13 +42,13 @@ class TestTFLiteSubgraph:
parse_op_testdata = [
# op_type, opt_serializer, inputs, output, expected
- ("FullyConnected", None, [0, 1, 2], 3, 3), # FC
- ("FullyConnected", None, [0, 1, -1], 3, 3), # FC disabled Bias
- ("FullyConnected", None, [0, 1], 3, 3), # FC no Bias
- ("Conv2D", None, [2, 1, 3], 0, 3), # Conv2D
- ("Conv2DBackprop", None, [0, 1, 2, 3], 4, 4), # TransposeConv
- ("Conv2DBackprop", None, [0, 1, 2], 4, 4), # TransposeConv no Bias
- pytest.param("Conv2D", None, [0, -1, 1], 3, 3, marks=pytest.mark.xfail), # Conv2D no Weights
+ (Op.FullyConnected, None, [0, 1, 2], 3, 3), # FC
+ (Op.FullyConnected, None, [0, 1, -1], 3, 3), # FC disabled Bias
+ (Op.FullyConnected, None, [0, 1], 3, 3), # FC no Bias
+ (Op.Conv2D, None, [2, 1, 3], 0, 3), # Conv2D
+ (Op.Conv2DBackpropInput, None, [0, 1, 2, 3], 4, 4), # TransposeConv
+ (Op.Conv2DBackpropInput, None, [0, 1, 2], 4, 4), # TransposeConv no Bias
+ pytest.param(Op.Conv2D, None, [0, -1, 1], 3, 3, marks=pytest.mark.xfail), # Conv2D no Weights
]
@pytest.mark.parametrize("op_type, opt_serializer, inputs, output, expected", parse_op_testdata)
@@ -56,7 +57,7 @@ class TestTFLiteSubgraph:
# Mock a TFLiteSubGraph
sg = TFLiteSubgraph(None, None)
sg.graph = MagicMock()
- sg.graph.operator_codes = [(op_type, opt_serializer)]
+ sg.graph.operator_codes = [(op_type, opt_serializer, "")]
# Mock a couple of tensors
sg.tensors = [MagicMock() for _ in range(5)]
diff --git a/ethosu/vela/test/testutil.py b/ethosu/vela/test/testutil.py
index 13b6bf40..adb874a0 100644
--- a/ethosu/vela/test/testutil.py
+++ b/ethosu/vela/test/testutil.py
@@ -20,7 +20,6 @@ import numpy as np
from ethosu.vela import architecture_features
from ethosu.vela.data_type import DataType
from ethosu.vela.nn_graph import Subgraph
-from ethosu.vela.operation import NpuBlockType
from ethosu.vela.operation import Operation
from ethosu.vela.tensor import create_const_tensor
from ethosu.vela.tensor import Tensor
@@ -52,7 +51,6 @@ def create_elemwise_op(type, name, ifm_shape, ifm2_shape, ofm_shape, datatype=Da
op.add_input_tensor(create_const_tensor(name + "_ifm2", ifm2_shape, datatype, np.zeros(ifm2_shape), np_type))
ofm = Tensor(ofm_shape, datatype, name + "_ofm")
op.set_output_tensor(ofm)
- op.attrs["npu_block_type"] = NpuBlockType.ElementWise
return op
diff --git a/ethosu/vela/tflite_mapping.py b/ethosu/vela/tflite_mapping.py
index c25f4156..8a53039a 100644
--- a/ethosu/vela/tflite_mapping.py
+++ b/ethosu/vela/tflite_mapping.py
@@ -22,6 +22,8 @@ import struct
import numpy as np
from .data_type import DataType
+from .operation import CustomType
+from .operation import Op
from .tflite import AbsOptions
from .tflite import AddNOptions
from .tflite import AddOptions
@@ -396,16 +398,16 @@ class CustomOptionsSerializer:
attrs["custom_options_format"] = op_data.CustomOptionsFormat()
if np.array_equal(custom_options, self.CUSTOM_OPTIONS_NPU_OP):
- attrs["custom_type"] = "ExistingNpuOp"
+ attrs["custom_type"] = CustomType.ExistingNpuOp
return attrs
def serialize(self, builder, attrs):
- custom_type = attrs.get("custom_type", "")
+ custom_type = attrs.get("custom_type", CustomType.ThirdPartyOp)
self.custom_opt_format = attrs.get("custom_options_format", self.CUSTOM_OPTIONS_FORMAT_DEFAULT)
# Set NPU op custom options for the TensorFlow Lite custom operator
- if custom_type == "NpuOp":
+ if custom_type == CustomType.NpuOp:
custom_options = self.CUSTOM_OPTIONS_NPU_OP
else:
custom_options = attrs.get("custom_options", [])
@@ -426,11 +428,11 @@ padding_inv_map = inverse_map(padding_map)
activation_function_map = {
ActivationFunctionType.NONE: None,
- ActivationFunctionType.RELU: "Relu",
- ActivationFunctionType.RELU_N1_TO_1: "ReluN1To1",
- ActivationFunctionType.RELU6: "Relu6",
- ActivationFunctionType.TANH: "Tanh",
- ActivationFunctionType.SIGN_BIT: "SignBit",
+ ActivationFunctionType.RELU: Op.Relu,
+ ActivationFunctionType.RELU_N1_TO_1: Op.ReluN1To1,
+ ActivationFunctionType.RELU6: Op.Relu6,
+ ActivationFunctionType.TANH: Op.Tanh,
+ ActivationFunctionType.SIGN_BIT: Op.SignBit,
}
activation_function_inv_map = inverse_map(activation_function_map)
@@ -478,52 +480,50 @@ reducer_opts = OptionsSerializer("ReducerOptions", ("keep_dims",))
is_int_vec = True
-custom_prefix = "Custom_"
-
builtin_operator_map = {
- BuiltinOperator.ADD: ("AddAct", OptionsSerializer("AddOptions", (fused_act, "pot_scale_int16"))),
- BuiltinOperator.AVERAGE_POOL_2D: ("AvgPoolAct", pool2d_opts),
- BuiltinOperator.CONCATENATION: ("ConcatTFLite", OptionsSerializer("ConcatenationOptions", ("axis", fused_act))),
- BuiltinOperator.CONV_2D: ("Conv2DBiasAct", conv2d_opts),
- BuiltinOperator.DEPTHWISE_CONV_2D: ("DepthwiseConv2dBiasAct", depthwise_opts),
- BuiltinOperator.DEPTH_TO_SPACE: ("DepthToSpace", OptionsSerializer("DepthToSpaceOptions", ("block_size",))),
- BuiltinOperator.DEQUANTIZE: ("Dequantize", OptionsSerializer("DequantizeOptions")),
- BuiltinOperator.EMBEDDING_LOOKUP: ("EmbeddingLookup", None),
- BuiltinOperator.FLOOR: ("Floor", None),
+ BuiltinOperator.ADD: (Op.Add, OptionsSerializer("AddOptions", (fused_act, "pot_scale_int16"))),
+ BuiltinOperator.AVERAGE_POOL_2D: (Op.AvgPool, pool2d_opts),
+ BuiltinOperator.CONCATENATION: (Op.ConcatTFLite, OptionsSerializer("ConcatenationOptions", ("axis", fused_act))),
+ BuiltinOperator.CONV_2D: (Op.Conv2DBias, conv2d_opts),
+ BuiltinOperator.DEPTHWISE_CONV_2D: (Op.DepthwiseConv2DBias, depthwise_opts),
+ BuiltinOperator.DEPTH_TO_SPACE: (Op.DepthToSpace, OptionsSerializer("DepthToSpaceOptions", ("block_size",))),
+ BuiltinOperator.DEQUANTIZE: (Op.Dequantize, OptionsSerializer("DequantizeOptions")),
+ BuiltinOperator.EMBEDDING_LOOKUP: (Op.EmbeddingLookup, None),
+ BuiltinOperator.FLOOR: (Op.Floor, None),
BuiltinOperator.FULLY_CONNECTED: (
- "FullyConnectedAct",
+ Op.FullyConnected,
OptionsSerializer("FullyConnectedOptions", (fused_act, "weights_format", "asymmetric_quantize_inputs")),
),
- BuiltinOperator.HASHTABLE_LOOKUP: ("HashtableLookup", None),
- BuiltinOperator.L2_NORMALIZATION: ("L2NormAct", OptionsSerializer("L2NormOptions", (fused_act,))),
- BuiltinOperator.L2_POOL_2D: ("L2Pool2D", pool2d_opts),
+ BuiltinOperator.HASHTABLE_LOOKUP: (Op.HashtableLookup, None),
+ BuiltinOperator.L2_NORMALIZATION: (Op.L2Norm, OptionsSerializer("L2NormOptions", (fused_act,))),
+ BuiltinOperator.L2_POOL_2D: (Op.L2Pool2D, pool2d_opts),
BuiltinOperator.LOCAL_RESPONSE_NORMALIZATION: (
- "LRN",
+ Op.LRN,
OptionsSerializer("LocalResponseNormalizationOptions", ("radius", "bias", "alpha", "beta")),
),
- BuiltinOperator.LOGISTIC: ("Sigmoid", None),
- BuiltinOperator.LSH_PROJECTION: ("LSHProjection", OptionsSerializer("LSHProjectionOptions", ("type",))),
- BuiltinOperator.LSTM: ("LstmAct", lstm_opts),
- BuiltinOperator.MAX_POOL_2D: ("MaxPool", pool2d_opts),
- BuiltinOperator.MUL: ("MulAct", OptionsSerializer("MulOptions", (fused_act,))),
- BuiltinOperator.RELU: ("Relu", None),
- BuiltinOperator.RELU_N1_TO_1: ("ReluN1To1", None),
- BuiltinOperator.RELU6: ("Relu6", None),
- BuiltinOperator.RESHAPE: ("Reshape", OptionsSerializer("ReshapeOptions", (("new_shape", is_int_vec),))),
+ BuiltinOperator.LOGISTIC: (Op.Sigmoid, None),
+ BuiltinOperator.LSH_PROJECTION: (Op.LSHProjection, OptionsSerializer("LSHProjectionOptions", ("type",))),
+ BuiltinOperator.LSTM: (Op.Lstm, lstm_opts),
+ BuiltinOperator.MAX_POOL_2D: (Op.MaxPool, pool2d_opts),
+ BuiltinOperator.MUL: (Op.Mul, OptionsSerializer("MulOptions", (fused_act,))),
+ BuiltinOperator.RELU: (Op.Relu, None),
+ BuiltinOperator.RELU_N1_TO_1: (Op.ReluN1To1, None),
+ BuiltinOperator.RELU6: (Op.Relu6, None),
+ BuiltinOperator.RESHAPE: (Op.Reshape, OptionsSerializer("ReshapeOptions", (("new_shape", is_int_vec),))),
BuiltinOperator.RESIZE_BILINEAR: (
- "ResizeBilinear",
+ Op.ResizeBilinear,
OptionsSerializer("ResizeBilinearOptions", ("align_corners", "half_pixel_centers")),
),
- BuiltinOperator.RNN: ("RnnAct", rnn_opts),
- BuiltinOperator.SOFTMAX: ("Softmax", OptionsSerializer("SoftmaxOptions", ("beta",))),
- BuiltinOperator.SPACE_TO_DEPTH: ("SpaceToDepth", OptionsSerializer("SpaceToDepthOptions", ("block_size",))),
+ BuiltinOperator.RNN: (Op.Rnn, rnn_opts),
+ BuiltinOperator.SOFTMAX: (Op.Softmax, OptionsSerializer("SoftmaxOptions", ("beta",))),
+ BuiltinOperator.SPACE_TO_DEPTH: (Op.SpaceToDepth, OptionsSerializer("SpaceToDepthOptions", ("block_size",))),
BuiltinOperator.SVDF: (
- "SvdfAct",
+ Op.Svdf,
OptionsSerializer("SVDFOptions", ("rank", fused_act, "asymmetric_quantize_inputs")),
),
- BuiltinOperator.TANH: ("Tanh", None),
+ BuiltinOperator.TANH: (Op.Tanh, None),
BuiltinOperator.CONCAT_EMBEDDINGS: (
- "ConcatEmbeddings",
+ Op.ConcatEmbeddings,
OptionsSerializer(
"ConcatEmbeddingsOptions",
(
@@ -538,40 +538,40 @@ builtin_operator_map = {
),
),
BuiltinOperator.SKIP_GRAM: (
- "SkipGram",
+ Op.SkipGram,
OptionsSerializer("SkipGramOptions", ("ngram_size", "max_skip_size", "include_all_ngrams")),
),
- BuiltinOperator.CALL: ("Call", OptionsSerializer("CallOptions", ("subgraph",))),
+ BuiltinOperator.CALL: (Op.Call, OptionsSerializer("CallOptions", ("subgraph",))),
BuiltinOperator.EMBEDDING_LOOKUP_SPARSE: (
- "EmbeddingLookupSparse",
+ Op.EmbeddingLookupSparse,
OptionsSerializer("EmbeddingLookupSparseOptions", ("combiner",)),
),
- BuiltinOperator.PAD: ("Pad", OptionsSerializer("PadOptions")),
- BuiltinOperator.UNIDIRECTIONAL_SEQUENCE_RNN: ("UnidirectionalSequenceRnnAct", seq_rnn_opts),
- BuiltinOperator.GATHER: ("GatherV2", OptionsSerializer("GatherOptions", ("axis",))),
- BuiltinOperator.BATCH_TO_SPACE_ND: ("BatchToSpaceND", OptionsSerializer("BatchToSpaceNDOptions")),
- BuiltinOperator.SPACE_TO_BATCH_ND: ("SpaceToBatchND", OptionsSerializer("SpaceToBatchNDOptions")),
- BuiltinOperator.TRANSPOSE: ("Transpose", OptionsSerializer("TransposeOptions")),
- BuiltinOperator.MEAN: ("Mean", None),
- BuiltinOperator.SUB: ("SubAct", OptionsSerializer("SubOptions", (fused_act, "pot_scale_int16",))),
- BuiltinOperator.DIV: ("DivAct", OptionsSerializer("DivOptions", (fused_act,))),
- BuiltinOperator.SQUEEZE: ("Squeeze", OptionsSerializer("SqueezeOptions", (("squeeze_dims", is_int_vec),))),
- BuiltinOperator.UNIDIRECTIONAL_SEQUENCE_LSTM: ("UnidirectionalSequenceLstmAct", unidir_seq_lstm_opts),
+ BuiltinOperator.PAD: (Op.Pad, OptionsSerializer("PadOptions")),
+ BuiltinOperator.UNIDIRECTIONAL_SEQUENCE_RNN: (Op.UnidirectionalSequenceRnn, seq_rnn_opts),
+ BuiltinOperator.GATHER: (Op.GatherV2, OptionsSerializer("GatherOptions", ("axis",))),
+ BuiltinOperator.BATCH_TO_SPACE_ND: (Op.BatchToSpaceND, OptionsSerializer("BatchToSpaceNDOptions")),
+ BuiltinOperator.SPACE_TO_BATCH_ND: (Op.SpaceToBatchND, OptionsSerializer("SpaceToBatchNDOptions")),
+ BuiltinOperator.TRANSPOSE: (Op.Transpose, OptionsSerializer("TransposeOptions")),
+ BuiltinOperator.MEAN: (Op.Mean, None),
+ BuiltinOperator.SUB: (Op.Sub, OptionsSerializer("SubOptions", (fused_act, "pot_scale_int16",))),
+ BuiltinOperator.DIV: (Op.Div, OptionsSerializer("DivOptions", (fused_act,))),
+ BuiltinOperator.SQUEEZE: (Op.Squeeze, OptionsSerializer("SqueezeOptions", (("squeeze_dims", is_int_vec),))),
+ BuiltinOperator.UNIDIRECTIONAL_SEQUENCE_LSTM: (Op.UnidirectionalSequenceLstm, unidir_seq_lstm_opts),
BuiltinOperator.STRIDED_SLICE: (
- "StridedSlice",
+ Op.StridedSlice,
OptionsSerializer(
"StridedSliceOptions", ("begin_mask", "end_mask", "ellipsis_mask", "new_axis_mask", "shrink_axis_mask")
),
),
- BuiltinOperator.BIDIRECTIONAL_SEQUENCE_RNN: ("BidirectionalSequenceRnnAct", bidir_seq_rnn_opts),
- BuiltinOperator.EXP: ("Exp", OptionsSerializer("ExpOptions")),
- BuiltinOperator.TOPK_V2: ("TopKV2", OptionsSerializer("TopKV2Options")),
- BuiltinOperator.SPLIT: ("Split", OptionsSerializer("SplitOptions", ("num_splits",))),
- BuiltinOperator.LOG_SOFTMAX: ("LogSoftmax", OptionsSerializer("LogSoftmaxOptions")),
- BuiltinOperator.DELEGATE: ("Delegate", None),
- BuiltinOperator.BIDIRECTIONAL_SEQUENCE_LSTM: ("BidirectionalSequenceLstmAct", bidir_seq_lstm_opts),
+ BuiltinOperator.BIDIRECTIONAL_SEQUENCE_RNN: (Op.BidirectionalSequenceRnn, bidir_seq_rnn_opts),
+ BuiltinOperator.EXP: (Op.Exp, OptionsSerializer("ExpOptions")),
+ BuiltinOperator.TOPK_V2: (Op.TopKV2, OptionsSerializer("TopKV2Options")),
+ BuiltinOperator.SPLIT: (Op.Split, OptionsSerializer("SplitOptions", ("num_splits",))),
+ BuiltinOperator.LOG_SOFTMAX: (Op.LogSoftmax, OptionsSerializer("LogSoftmaxOptions")),
+ BuiltinOperator.DELEGATE: (Op.Delegate, None),
+ BuiltinOperator.BIDIRECTIONAL_SEQUENCE_LSTM: (Op.BidirectionalSequenceLstm, bidir_seq_lstm_opts),
BuiltinOperator.CAST: (
- "Cast",
+ Op.Cast,
OptionsSerializer(
"CastOptions",
(
@@ -580,109 +580,112 @@ builtin_operator_map = {
),
),
),
- BuiltinOperator.PRELU: ("Prelu", None),
- BuiltinOperator.MAXIMUM: ("Maximum", OptionsSerializer("MaximumMinimumOptions")),
+ BuiltinOperator.PRELU: (Op.Prelu, None),
+ BuiltinOperator.MAXIMUM: (Op.Maximum, OptionsSerializer("MaximumMinimumOptions")),
BuiltinOperator.ARG_MAX: (
- "ArgMax",
+ Op.ArgMax,
OptionsSerializer("ArgMaxOptions", (("output_type", datatype_deserialize, datatype_serialize),)),
),
- BuiltinOperator.MINIMUM: ("Minimum", OptionsSerializer("MaximumMinimumOptions")),
- BuiltinOperator.LESS: ("Less", OptionsSerializer("LessOptions")),
- BuiltinOperator.NEG: ("Neg", OptionsSerializer("NegOptions")),
- BuiltinOperator.PADV2: ("PadV2", OptionsSerializer("PadV2Options")),
- BuiltinOperator.GREATER: ("Greater", OptionsSerializer("GreaterOptions")),
- BuiltinOperator.GREATER_EQUAL: ("GreaterEqual", OptionsSerializer("GreaterEqualOptions")),
- BuiltinOperator.LESS_EQUAL: ("LessEqual", OptionsSerializer("LessEqualOptions")),
- BuiltinOperator.SELECT: ("Select", OptionsSerializer("SelectOptions")),
- BuiltinOperator.SLICE: ("Slice", OptionsSerializer("SliceOptions")),
- BuiltinOperator.SIN: ("Sin", None),
+ BuiltinOperator.MINIMUM: (Op.Minimum, OptionsSerializer("MaximumMinimumOptions")),
+ BuiltinOperator.LESS: (Op.Less, OptionsSerializer("LessOptions")),
+ BuiltinOperator.NEG: (Op.Neg, OptionsSerializer("NegOptions")),
+ BuiltinOperator.PADV2: (Op.PadV2, OptionsSerializer("PadV2Options")),
+ BuiltinOperator.GREATER: (Op.Greater, OptionsSerializer("GreaterOptions")),
+ BuiltinOperator.GREATER_EQUAL: (Op.GreaterEqual, OptionsSerializer("GreaterEqualOptions")),
+ BuiltinOperator.LESS_EQUAL: (Op.LessEqual, OptionsSerializer("LessEqualOptions")),
+ BuiltinOperator.SELECT: (Op.Select, OptionsSerializer("SelectOptions")),
+ BuiltinOperator.SLICE: (Op.Slice, OptionsSerializer("SliceOptions")),
+ BuiltinOperator.SIN: (Op.Sin, None),
BuiltinOperator.TRANSPOSE_CONV: (
- "Conv2DBackpropInput",
+ Op.Conv2DBackpropInput,
OptionsSerializer("TransposeConvOptions", (padding, "stride_w", "stride_h")),
),
BuiltinOperator.SPARSE_TO_DENSE: (
- "SparseToDense",
+ Op.SparseToDense,
OptionsSerializer("SparseToDenseOptions", ("validate_indices",)),
),
- BuiltinOperator.TILE: ("Tile", OptionsSerializer("TileOptions")),
- BuiltinOperator.EXPAND_DIMS: ("ExpandDims", OptionsSerializer("ExpandDimsOptions")),
- BuiltinOperator.EQUAL: ("Equal", OptionsSerializer("EqualOptions")),
- BuiltinOperator.NOT_EQUAL: ("NotEqual", OptionsSerializer("NotEqualOptions")),
- BuiltinOperator.LOG: ("Log", None),
- BuiltinOperator.SUM: ("Sum", None),
- BuiltinOperator.SQRT: ("Sqrt", None),
- BuiltinOperator.RSQRT: ("Rsqrt", None),
+ BuiltinOperator.TILE: (Op.Tile, OptionsSerializer("TileOptions")),
+ BuiltinOperator.EXPAND_DIMS: (Op.ExpandDims, OptionsSerializer("ExpandDimsOptions")),
+ BuiltinOperator.EQUAL: (Op.Equal, OptionsSerializer("EqualOptions")),
+ BuiltinOperator.NOT_EQUAL: (Op.NotEqual, OptionsSerializer("NotEqualOptions")),
+ BuiltinOperator.LOG: (Op.Log, None),
+ BuiltinOperator.SUM: (Op.Sum, None),
+ BuiltinOperator.SQRT: (Op.Sqrt, None),
+ BuiltinOperator.RSQRT: (Op.Rsqrt, None),
BuiltinOperator.SHAPE: (
- "Shape",
+ Op.Shape,
OptionsSerializer("ShapeOptions", (("out_type", datatype_deserialize, datatype_serialize),)),
),
- BuiltinOperator.POW: ("Pow", OptionsSerializer("PowOptions")),
+ BuiltinOperator.POW: (Op.Pow, OptionsSerializer("PowOptions")),
BuiltinOperator.ARG_MIN: (
- "ArgMin",
+ Op.ArgMin,
OptionsSerializer("ArgMinOptions", (("output_type", datatype_deserialize, datatype_serialize),)),
),
BuiltinOperator.FAKE_QUANT: (
- "FakeQuantWithMinMaxArgs",
+ Op.FakeQuantWithMinMaxArgs,
OptionsSerializer("FakeQuantOptions", ("min", "max", "num_bits", "narrow_range")),
),
- BuiltinOperator.REDUCE_PROD: ("Prod", reducer_opts),
- BuiltinOperator.REDUCE_MAX: ("Max", reducer_opts),
- BuiltinOperator.PACK: ("Pack", OptionsSerializer("PackOptions", ("values_count", "axis"))),
- BuiltinOperator.LOGICAL_OR: ("LogicalOr", OptionsSerializer("LogicalOrOptions")),
- BuiltinOperator.ONE_HOT: ("OneHot", OptionsSerializer("OneHotOptions", ("axis",))),
- BuiltinOperator.LOGICAL_AND: ("LogicalAnd", OptionsSerializer("LogicalAndOptions")),
- BuiltinOperator.LOGICAL_NOT: ("LogicalNot", OptionsSerializer("LogicalNotOptions")),
- BuiltinOperator.UNPACK: ("Unpack", OptionsSerializer("UnpackOptions", ("num", "axis"))),
- BuiltinOperator.REDUCE_MIN: ("Min", reducer_opts),
- BuiltinOperator.FLOOR_DIV: ("FloorDiv", OptionsSerializer("FloorDivOptions")),
- BuiltinOperator.REDUCE_ANY: ("Any", reducer_opts),
- BuiltinOperator.SQUARE: ("Square", OptionsSerializer("SquareOptions")),
- BuiltinOperator.ZEROS_LIKE: ("ZerosLike", OptionsSerializer("ZerosLikeOptions")),
- BuiltinOperator.FILL: ("Fill", OptionsSerializer("FillOptions")),
- BuiltinOperator.FLOOR_MOD: ("FloorMod", OptionsSerializer("FloorModOptions")),
- BuiltinOperator.RANGE: ("Range", OptionsSerializer("RangeOptions")),
+ BuiltinOperator.REDUCE_PROD: (Op.Prod, reducer_opts),
+ BuiltinOperator.REDUCE_MAX: (Op.Max, reducer_opts),
+ BuiltinOperator.PACK: (Op.Pack, OptionsSerializer("PackOptions", ("values_count", "axis"))),
+ BuiltinOperator.LOGICAL_OR: (Op.LogicalOr, OptionsSerializer("LogicalOrOptions")),
+ BuiltinOperator.ONE_HOT: (Op.OneHot, OptionsSerializer("OneHotOptions", ("axis",))),
+ BuiltinOperator.LOGICAL_AND: (Op.LogicalAnd, OptionsSerializer("LogicalAndOptions")),
+ BuiltinOperator.LOGICAL_NOT: (Op.LogicalNot, OptionsSerializer("LogicalNotOptions")),
+ BuiltinOperator.UNPACK: (Op.Unpack, OptionsSerializer("UnpackOptions", ("num", "axis"))),
+ BuiltinOperator.REDUCE_MIN: (Op.Min, reducer_opts),
+ BuiltinOperator.FLOOR_DIV: (Op.FloorDiv, OptionsSerializer("FloorDivOptions")),
+ BuiltinOperator.REDUCE_ANY: (Op.Any, reducer_opts),
+ BuiltinOperator.SQUARE: (Op.Square, OptionsSerializer("SquareOptions")),
+ BuiltinOperator.ZEROS_LIKE: (Op.ZerosLike, OptionsSerializer("ZerosLikeOptions")),
+ BuiltinOperator.FILL: (Op.Fill, OptionsSerializer("FillOptions")),
+ BuiltinOperator.FLOOR_MOD: (Op.FloorMod, OptionsSerializer("FloorModOptions")),
+ BuiltinOperator.RANGE: (Op.Range, OptionsSerializer("RangeOptions")),
BuiltinOperator.RESIZE_NEAREST_NEIGHBOR: (
- "ResizeNearestNeighbor",
+ Op.ResizeNearestNeighbor,
OptionsSerializer("ResizeNearestNeighborOptions", ("align_corners", "half_pixel_centers")),
),
- BuiltinOperator.LEAKY_RELU: ("LeakyRelu", OptionsSerializer("LeakyReluOptions", ("alpha",))),
- BuiltinOperator.SQUARED_DIFFERENCE: ("SquaredDifference", OptionsSerializer("SquaredDifferenceOptions")),
- BuiltinOperator.MIRROR_PAD: ("MirrorPad", OptionsSerializer("MirrorPadOptions", ("mode",))),
- BuiltinOperator.ABS: ("Abs", OptionsSerializer("AbsOptions")),
- BuiltinOperator.SPLIT_V: ("SplitV", OptionsSerializer("SplitVOptions", ("num_splits",))),
+ BuiltinOperator.LEAKY_RELU: (Op.LeakyRelu, OptionsSerializer("LeakyReluOptions", ("alpha",))),
+ BuiltinOperator.SQUARED_DIFFERENCE: (Op.SquaredDifference, OptionsSerializer("SquaredDifferenceOptions")),
+ BuiltinOperator.MIRROR_PAD: (Op.MirrorPad, OptionsSerializer("MirrorPadOptions", ("mode",))),
+ BuiltinOperator.ABS: (Op.Abs, OptionsSerializer("AbsOptions")),
+ BuiltinOperator.SPLIT_V: (Op.SplitV, OptionsSerializer("SplitVOptions", ("num_splits",))),
BuiltinOperator.UNIQUE: (
- "Unique",
+ Op.Unique,
OptionsSerializer("UniqueOptions", (("idx_out_type", datatype_deserialize, datatype_serialize),)),
),
- BuiltinOperator.CEIL: ("Ceil", None),
- BuiltinOperator.REVERSE_V2: ("ReverseV2", OptionsSerializer("ReverseV2Options")),
- BuiltinOperator.ADD_N: ("AddN", OptionsSerializer("AddNOptions")),
- BuiltinOperator.GATHER_ND: ("GatherNd", OptionsSerializer("GatherNdOptions")),
- BuiltinOperator.COS: ("Cos", OptionsSerializer("CosOptions")),
- BuiltinOperator.WHERE: ("Where", OptionsSerializer("WhereOptions")),
- BuiltinOperator.RANK: ("Rank", OptionsSerializer("RankOptions")),
- BuiltinOperator.ELU: ("Elu", None),
+ BuiltinOperator.CEIL: (Op.Ceil, None),
+ BuiltinOperator.REVERSE_V2: (Op.ReverseV2, OptionsSerializer("ReverseV2Options")),
+ BuiltinOperator.ADD_N: (Op.AddN, OptionsSerializer("AddNOptions")),
+ BuiltinOperator.GATHER_ND: (Op.GatherNd, OptionsSerializer("GatherNdOptions")),
+ BuiltinOperator.COS: (Op.Cos, OptionsSerializer("CosOptions")),
+ BuiltinOperator.WHERE: (Op.Where, OptionsSerializer("WhereOptions")),
+ BuiltinOperator.RANK: (Op.Rank, OptionsSerializer("RankOptions")),
+ BuiltinOperator.ELU: (Op.Elu, None),
BuiltinOperator.REVERSE_SEQUENCE: (
- "ReverseSequence",
+ Op.ReverseSequence,
OptionsSerializer("ReverseSequenceOptions", ("seq_dim", "batch_dim")),
),
- BuiltinOperator.MATRIX_DIAG: ("MatrixDiag", OptionsSerializer("MatrixDiagOptions")),
- BuiltinOperator.QUANTIZE: ("Quantize", OptionsSerializer("QuantizeOptions")),
- BuiltinOperator.MATRIX_SET_DIAG: ("MatrixSetDiag", OptionsSerializer("MatrixSetDiagOptions")),
- BuiltinOperator.ROUND: ("Round", None),
- BuiltinOperator.HARD_SWISH: ("HardSwish", OptionsSerializer("HardSwishOptions")),
- BuiltinOperator.IF: ("If", OptionsSerializer("IfOptions", ("then_subgraph_index", "else_subgraph_index"))),
- BuiltinOperator.WHILE: ("While", OptionsSerializer("WhileOptions", ("cond_subgraph_index", "body_subgraph_index"))),
- BuiltinOperator.NON_MAX_SUPPRESSION_V4: ("NonMaxSuppressionV4", OptionsSerializer("NonMaxSuppressionV4Options")),
- BuiltinOperator.NON_MAX_SUPPRESSION_V5: ("NonMaxSuppressionV5", OptionsSerializer("NonMaxSuppressionV5Options")),
- BuiltinOperator.SCATTER_ND: ("ScatterNd", OptionsSerializer("ScatterNdOptions")),
- BuiltinOperator.SELECT_V2: ("SelectV2", OptionsSerializer("SelectV2Options")),
- BuiltinOperator.DENSIFY: ("Densify", OptionsSerializer("DensifyOptions")),
- BuiltinOperator.SEGMENT_SUM: ("SegmentSum", OptionsSerializer("SegmentSumOptions")),
- BuiltinOperator.BATCH_MATMUL: ("BatchMatMul", OptionsSerializer("BatchMatMulOptions", ("adj_x", "adj_y"))),
- BuiltinOperator.CUSTOM: (custom_prefix, CustomOptionsSerializer()),
+ BuiltinOperator.MATRIX_DIAG: (Op.MatrixDiag, OptionsSerializer("MatrixDiagOptions")),
+ BuiltinOperator.QUANTIZE: (Op.Quantize, OptionsSerializer("QuantizeOptions")),
+ BuiltinOperator.MATRIX_SET_DIAG: (Op.MatrixSetDiag, OptionsSerializer("MatrixSetDiagOptions")),
+ BuiltinOperator.ROUND: (Op.Round, None),
+ BuiltinOperator.HARD_SWISH: (Op.HardSwish, OptionsSerializer("HardSwishOptions")),
+ BuiltinOperator.IF: (Op.If, OptionsSerializer("IfOptions", ("then_subgraph_index", "else_subgraph_index"))),
+ BuiltinOperator.WHILE: (
+ Op.While,
+ OptionsSerializer("WhileOptions", ("cond_subgraph_index", "body_subgraph_index")),
+ ),
+ BuiltinOperator.NON_MAX_SUPPRESSION_V4: (Op.NonMaxSuppressionV4, OptionsSerializer("NonMaxSuppressionV4Options")),
+ BuiltinOperator.NON_MAX_SUPPRESSION_V5: (Op.NonMaxSuppressionV5, OptionsSerializer("NonMaxSuppressionV5Options")),
+ BuiltinOperator.SCATTER_ND: (Op.ScatterNd, OptionsSerializer("ScatterNdOptions")),
+ BuiltinOperator.SELECT_V2: (Op.SelectV2, OptionsSerializer("SelectV2Options")),
+ BuiltinOperator.DENSIFY: (Op.Densify, OptionsSerializer("DensifyOptions")),
+ BuiltinOperator.SEGMENT_SUM: (Op.SegmentSum, OptionsSerializer("SegmentSumOptions")),
+ BuiltinOperator.BATCH_MATMUL: (Op.BatchMatMul, OptionsSerializer("BatchMatMulOptions", ("adj_x", "adj_y"))),
+ BuiltinOperator.CUSTOM: (Op.Custom, CustomOptionsSerializer()),
}
builtin_operator_inv_map = {v[0]: (k, v[1]) for k, v in builtin_operator_map.items()}
-builtin_operator_inv_map["NpuOp"] = (BuiltinOperator.CUSTOM, CustomOptionsSerializer())
+builtin_operator_inv_map[Op.CustomNpuOp] = (BuiltinOperator.CUSTOM, CustomOptionsSerializer())
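The inverse map is keyed on Op members, and Op.CustomNpuOp is folded onto the CUSTOM builtin by hand because it has no builtin code of its own. A small sketch of the inversion, with placeholder serializer strings and integer constants standing in for the real OptionsSerializer objects and generated BuiltinOperator values:

from enum import Enum, auto

class Op(Enum):
    Add = auto()
    Custom = auto()
    CustomNpuOp = auto()

ADD, CUSTOM = 0, 32  # stand-ins for the generated BuiltinOperator constants

forward_map = {
    ADD: (Op.Add, "AddOptions"),
    CUSTOM: (Op.Custom, "CustomOptionsSerializer"),
}
# Invert on the Op value, then add the NPU custom op explicitly.
inverse_map = {v[0]: (k, v[1]) for k, v in forward_map.items()}
inverse_map[Op.CustomNpuOp] = (CUSTOM, "CustomOptionsSerializer")

assert inverse_map[Op.Add][0] == ADD
assert inverse_map[Op.CustomNpuOp][0] == CUSTOM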
diff --git a/ethosu/vela/tflite_reader.py b/ethosu/vela/tflite_reader.py
index 77cc7963..a03f9ec2 100644
--- a/ethosu/vela/tflite_reader.py
+++ b/ethosu/vela/tflite_reader.py
@@ -23,6 +23,7 @@ from .errors import InputFileError
from .errors import TensorError
from .nn_graph import Graph
from .nn_graph import Subgraph
+from .operation import Op
from .operation import Operation
from .tensor import QuantizationParameters
from .tensor import Tensor
@@ -53,7 +54,7 @@ def clone_and_reshape_tensor(src_tens, reorder):
if tens.quant_values is not None:
tens.quant_values = tens.quant_values.transpose(reorder)
- op = Operation("Const", tens.name)
+ op = Operation(Op.Const, tens.name)
op.set_output_tensor(tens)
return tens
@@ -78,12 +79,12 @@ class TFLiteSubgraph:
if tens.ops != []:
TensorError(tens, "This subgraph input tensor has unexpected driving operators.")
- op = Operation("Placeholder", tens.name)
+ op = Operation(Op.Placeholder, tens.name)
op.set_output_tensor(tens)
for tens in self.tensors:
if not tens.ops:
- op = Operation("Const", tens.name)
+ op = Operation(Op.Const, tens.name)
op.set_output_tensor(tens)
def get_tensors_from_indices_remove_duplicates(self, indices, warning_str):
@@ -136,7 +137,7 @@ class TFLiteSubgraph:
return tens
def parse_operator(self, op_index, op_data):
- op_type, opt_serializer = self.graph.operator_codes[op_data.OpcodeIndex()]
+ op_type, opt_serializer, custom_code = self.graph.operator_codes[op_data.OpcodeIndex()]
inputs = [self.tensors[idx] if idx != -1 else None for idx in op_data.InputsAsNumpy()]
outputs = [self.tensors[idx] if idx != -1 else None for idx in op_data.OutputsAsNumpy()]
name = "unknown_op_name"
@@ -149,19 +150,13 @@ class TFLiteSubgraph:
for out in op.outputs:
out.ops = [op]
- if op_type.startswith("DepthwiseConv2d") or op_type.startswith("Conv2D"):
+ if op.type.is_depthwise_conv2d_op() or op.type.is_conv2d_op() or op.type == Op.FullyConnected:
if inputs[1].values is not None:
- inputs[1] = clone_and_reshape_tensor(inputs[1], (1, 2, 3, 0))
- if len(inputs) < 3 or (len(inputs) < 4 and "Backprop" in op_type):
- # No Bias tensor
- inputs.append(None)
- if inputs[-1]:
- inputs[-1] = clone_and_reshape_tensor(inputs[-1], (0,))
-
- if op_type.startswith("FullyConnected"):
- if inputs[1].values is not None:
- inputs[1] = clone_and_reshape_tensor(inputs[1], (1, 0))
- if len(inputs) < 3:
+ if op.type == Op.FullyConnected:
+ inputs[1] = clone_and_reshape_tensor(inputs[1], (1, 0))
+ else:
+ inputs[1] = clone_and_reshape_tensor(inputs[1], (1, 2, 3, 0))
+ if op.type.needs_bias() and len(inputs) <= op_type.info.indices.biases[0]:
# No Bias tensor
inputs.append(None)
if inputs[-1]:
@@ -170,11 +165,11 @@ class TFLiteSubgraph:
if opt_serializer is not None:
op.attrs = opt_serializer.deserialize(op_data)
- if op_type == "Reshape" and "new_shape" not in op.attrs:
+ if op_type == Op.Reshape and "new_shape" not in op.attrs:
# Reshape should have an attrib "new_shape" but if it is missing, add it based on the output shape
op.attrs["new_shape"] = outputs[0].shape
- if op_type == "Cast":
+ if op_type == Op.Cast:
# Cast op should have "in/out_data_type" attribs; add them if missing
if "in_data_type" not in op.attrs:
op.attrs["in_data_type"] = inputs[0].dtype
@@ -190,6 +185,9 @@ class TFLiteSubgraph:
if "depth_multiplier" in op.attrs:
op.attrs["channel_multiplier"] = op.attrs["depth_multiplier"]
+ op.activation = op.attrs.pop("fused_activation_function", None)
+ if custom_code is not None:
+ op.attrs["custom_code"] = custom_code
@staticmethod
def len1_array_to_scalar(arr):
@@ -260,9 +258,10 @@ class TFLiteGraph:
msg = "The input file contains operator code {} which is currently not supported".format(c)
raise InputFileError(self.name, msg)
op_type, ser = builtin_operator_map[c]
+ custom_code = None
if c == BuiltinOperator.CUSTOM:
- op_type += decode_str(code.CustomCode())
- return op_type, ser
+ custom_code = decode_str(code.CustomCode())
+ return op_type, ser, custom_code
def read_tflite(
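Operator codes are now three-element tuples, with the custom code carried as a separate string rather than appended to the op-type name. A sketch of that shape, with a tiny stand-in operator map and string op types for brevity:

CUSTOM = 32  # stand-in for BuiltinOperator.CUSTOM

operator_map = {
    0: ("Add", "AddOptions"),
    CUSTOM: ("Custom", "CustomOptionsSerializer"),
}

def parse_operator_code(builtin_code, custom_name=None):
    # The custom code is only meaningful for CUSTOM operators; builtin operators
    # get None so downstream code can tell the two cases apart.
    op_type, serializer = operator_map[builtin_code]
    custom_code = custom_name if builtin_code == CUSTOM else None
    return op_type, serializer, custom_code

assert parse_operator_code(0) == ("Add", "AddOptions", None)
assert parse_operator_code(CUSTOM, "ethos-u") == ("Custom", "CustomOptionsSerializer", "ethos-u")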
diff --git a/ethosu/vela/tflite_writer.py b/ethosu/vela/tflite_writer.py
index 68af4874..f444ee51 100644
--- a/ethosu/vela/tflite_writer.py
+++ b/ethosu/vela/tflite_writer.py
@@ -22,6 +22,7 @@ from flatbuffers import encode
from flatbuffers.builder import UOffsetTFlags
from .nn_graph import PassPlacement
+from .operation import Op
from .tensor import MemType
from .tensor import TensorPurpose
from .tflite import Buffer
@@ -34,7 +35,6 @@ from .tflite import SubGraph
from .tflite import Tensor
from .tflite_mapping import builtin_operator_inv_map
from .tflite_mapping import BuiltinOperator
-from .tflite_mapping import custom_prefix
from .tflite_mapping import datatype_inv_map
# ugh, the python flatbuffer interface is missing a method to add in file identifier. patching it in here:
@@ -77,7 +77,7 @@ class TFLiteSerialiser:
self.scratch_fast_buf_id = 1 # Always assign scratch_fast to buffer 1
self.buffers_to_write = [] # have an empty array there
- self.ops_to_ignore = set(("Const", "Placeholder", "SubgraphInput"))
+ self.ops_to_ignore = set((Op.Const, Op.Placeholder, Op.SubgraphInput))
self.tensors_to_reshape = {}
@@ -89,16 +89,17 @@ class TFLiteSerialiser:
for op in ps.ops:
if op.type not in self.ops_to_ignore:
all_ops.append(op)
- if op.type.startswith("Conv2D") or op.type.startswith("DepthwiseConv2d"):
+ if op.type.is_conv2d_op() or op.type.is_depthwise_conv2d_op():
# If values are None op has non-constant weights
if op.inputs[1].values is not None:
self.tensors_to_reshape[op.inputs[1]] = (3, 0, 1, 2)
- if op.type.startswith("FullyConnected"):
+ if op.type == Op.FullyConnected:
# If values are None op has non-constant weights
if op.inputs[1].values is not None:
self.tensors_to_reshape[op.inputs[1]] = (1, 0)
- self.operator_codes = list(sorted(set(op.type for op in all_ops)))
+ # list of tuple(Op, string); the custom code is only used for 3rd party custom operators
+ self.operator_codes = sorted(set((op.type, op.attrs.get("custom_code", "")) for op in all_ops))
self.operator_code_map = {}
def write_byte_vector(self, v, alignment=1):
@@ -163,25 +164,25 @@ class TFLiteSerialiser:
return buffer_map
- def serialise_operator_code(self, idx, code):
+ def serialise_operator_code(self, idx, op_type, custom_code):
builder = self.builder
custom_code_offset = None
- if code.startswith(custom_prefix):
- tf_code, opt_serializer = builtin_operator_inv_map[custom_prefix]
- custom_code_offset = builder.CreateString(code[len(custom_prefix) :])
+ if op_type == Op.Custom:
+ tf_code, opt_serializer = builtin_operator_inv_map[op_type]
+ custom_code_offset = builder.CreateString(custom_code)
else:
assert (
- code in builtin_operator_inv_map
- ), "Vela does not contain a mapping to serialise {} operator to a TensorFlow Lite operator".format(code)
- tf_code, opt_serializer = builtin_operator_inv_map[code]
+ op_type in builtin_operator_inv_map
+ ), "Vela does not contain a mapping to serialise {} operator to a TensorFlow Lite operator".format(op_type)
+ tf_code, opt_serializer = builtin_operator_inv_map[op_type]
if tf_code == BuiltinOperator.CUSTOM:
assert (
- code == "NpuOp"
+ op_type == Op.CustomNpuOp
), "Vela only supports serialising NpuOp operators as TensorFlow Lite Custom operators"
custom_code_offset = builder.CreateString("ethos-u")
- self.operator_code_map[code] = (idx, tf_code, opt_serializer)
+ self.operator_code_map[op_type] = (idx, tf_code, opt_serializer)
OperatorCode.OperatorCodeStart(builder)
OperatorCode.OperatorCodeAddBuiltinCode(builder, tf_code)
@@ -281,6 +282,8 @@ class TFLiteSerialiser:
attrs["dilation_w_factor"] = attrs["dilation"][2]
if "channel_multiplier" in attrs:
attrs["depth_multiplier"] = attrs["channel_multiplier"]
+ if op.activation is not None:
+ attrs["fused_activation_function"] = op.activation
builtin_opt_offset, custom_opt_offset = opt_serializer.serialize(builder, attrs)
@@ -310,7 +313,7 @@ class TFLiteSerialiser:
for op in ps.ops:
if op.type not in self.ops_to_ignore:
all_ops.append(op)
- elif op.type == "Placeholder":
+ elif op.type == Op.Placeholder:
placeholder_ops.append(op)
# Add the tensors from all valid ops, as well as the tensors from placeholder ops
@@ -404,7 +407,7 @@ class TFLiteSerialiser:
def serialise_model(self):
builder = self.builder
operator_code_offset = self.write_offset_vector(
- [self.serialise_operator_code(idx, code) for idx, code in enumerate(self.operator_codes)]
+ [self.serialise_operator_code(idx, optype, code) for idx, (optype, code) in enumerate(self.operator_codes)]
)
description = builder.CreateString("Vela Optimised")
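serialise_model now emits one operator-code entry per distinct (op type, custom code) pair. A sketch of that deduplication; IntEnum is used here only so the tuples sort without extra machinery, which is an assumption about ordering, not how vela's Op enum is defined:

from enum import IntEnum

class Op(IntEnum):
    Conv2DBias = 1
    Custom = 2

class FakeOp:
    def __init__(self, op_type, custom_code=""):
        self.type = op_type
        self.attrs = {"custom_code": custom_code} if custom_code else {}

all_ops = [FakeOp(Op.Conv2DBias), FakeOp(Op.Custom, "Mirror"), FakeOp(Op.Conv2DBias)]
# One entry per distinct (type, custom code) pair; the custom code only matters
# for third-party custom operators.
operator_codes = sorted(set((op.type, op.attrs.get("custom_code", "")) for op in all_ops))
assert operator_codes == [(Op.Conv2DBias, ""), (Op.Custom, "Mirror")]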
diff --git a/ethosu/vela/weight_compressor.py b/ethosu/vela/weight_compressor.py
index 8426705a..94535216 100644
--- a/ethosu/vela/weight_compressor.py
+++ b/ethosu/vela/weight_compressor.py
@@ -28,6 +28,7 @@ from .nn_graph import SchedulingStrategy
from .numeric_util import round_up
from .numeric_util import round_up_divide
from .operation import NpuBlockType
+from .operation import Op
from .scaling import quantise_scale
from .scaling import reduced_quantise_scale
from .tensor import create_equivalence_id
@@ -336,7 +337,7 @@ def compress_weights(arch, nng, tens, npu_block_type, ofm_block_depth, ofm_depth
is_depthwise = tens.block_traversal == TensorBlockTraversal.DepthWise
is_partkernel = tens.block_traversal == TensorBlockTraversal.PartKernelFirst
- if tens.consumer_list[0].type == "Conv2DBackpropInputSwitchedBias":
+ if tens.consumer_list[0].type == Op.Conv2DBackpropInputSwitchedBias:
# Transpose Convolution, reverse weights in H and W axes
weights = np.flip(weights, axis=(0, 1))
@@ -406,9 +407,9 @@ def calc_scales_and_pack_biases(tens, arch, ofm_depth_step, rescale_for_faf=Fals
assert tens.purpose == TensorPurpose.FeatureMap
assert tens.format == TensorFormat.NHWC
# the connected operator should expect a bias input unless it is a FullyConnected
- assert "Bias" in tens.consumer_list[0].type or tens.consumer_list[0].type.startswith("FullyConnected")
+ assert tens.consumer_list[0].type.needs_bias()
# the input bias tensor is the same as that connected to the operator
- _, _, bias_tens, _ = tens.consumer_list[0].get_ifm_weights_biases_ofm()
+ bias_tens = tens.consumer_list[0].bias
assert tens is bias_tens
# the operator should only have a single output
@@ -508,14 +509,13 @@ def update_pass_weight_and_scale_tensors(nng, arch):
op = tens.find_npu_op()
if op is None:
continue
- npu_usage_of_tensor = op.attrs["npu_block_type"]
needs_dma = tens.needs_dma()
if ps.cascade.strategy == SchedulingStrategy.WeightStream and needs_dma:
ofm_depth_step = ps.block_config[-1]
else:
ofm_depth_step = tens.shape[-1]
compress_weights(
- arch, nng, tens, npu_usage_of_tensor, ps.block_config[-1], ofm_depth_step, op.get_dilation_h_w()
+ arch, nng, tens, op.type.npu_block_type, ps.block_config[-1], ofm_depth_step, op.get_dilation_h_w()
)
# Update source tensor
if needs_dma:
@@ -527,7 +527,7 @@ def update_pass_weight_and_scale_tensors(nng, arch):
if ps.scale_tensor is not None:
rescale_for_faf = False
- activation_ops = set(("Sigmoid", "Tanh"))
+ activation_ops = set((Op.Sigmoid, Op.Tanh))
if (ps.ops[-1].type in activation_ops) and (ps.npu_block_type != NpuBlockType.ElementWise):
rescale_for_faf = True
calc_scales_and_pack_biases(ps.scale_tensor, arch, ofm_depth_step, rescale_for_faf)
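compress_weights is now given op.type.npu_block_type instead of op.attrs["npu_block_type"], i.e. the block type travels with the operator type itself. A minimal sketch of attaching such metadata to enum members; the info object and the member set are illustrative only:

from enum import Enum

class NpuBlockType(Enum):
    Default = 0
    ConvolutionMxN = 1
    ElementWise = 2

class OperatorInfo:
    def __init__(self, block_type=NpuBlockType.Default):
        self.block_type = block_type

class Op(Enum):
    Conv2DBias = OperatorInfo(NpuBlockType.ConvolutionMxN)
    Add = OperatorInfo(NpuBlockType.ElementWise)
    Relu = OperatorInfo()

    @property
    def npu_block_type(self):
        # The block type is a property of the operator type, not a per-op attribute.
        return self.value.block_type

assert Op.Conv2DBias.npu_block_type is NpuBlockType.ConvolutionMxN
assert Op.Relu.npu_block_type is NpuBlockType.Default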