aboutsummaryrefslogtreecommitdiff
path: root/ethosu/vela/tflite_graph_optimiser.py
diff options
context:
space:
mode:
Diffstat (limited to 'ethosu/vela/tflite_graph_optimiser.py')
-rw-r--r--ethosu/vela/tflite_graph_optimiser.py61
1 files changed, 41 insertions, 20 deletions
diff --git a/ethosu/vela/tflite_graph_optimiser.py b/ethosu/vela/tflite_graph_optimiser.py
index 5b0e2fb3..077f4afa 100644
--- a/ethosu/vela/tflite_graph_optimiser.py
+++ b/ethosu/vela/tflite_graph_optimiser.py
@@ -51,7 +51,9 @@ from .operation import Operation
from .operation import Padding
from .operation_util import create_add_nop
from .operation_util import create_avgpool_nop
+from .operation_util import create_cast_op
from .operation_util import create_depthwise_maxpool
+from .operation_util import create_memcpy
from .operation_util import get_pad_values_from_input
from .scaling import quantise_scale
from .shape4d import Shape4D
@@ -520,7 +522,8 @@ def convert_argmax_to_depthwise_conv_and_max_pool(op, arch, nng):
"dilation": (1, 1, 1, 1),
"explicit_padding": None,
}
- op.name = "depthwise_conv_SHL_7"
+ orig_name = op.name
+ op.name = f"{orig_name}_depthwise_conv_SHL_7"
op.type = Op.DepthwiseConv2DBias
op.attrs.update(dw_op_attrs)
n, h, w, c = full_shape(4, ifm.shape, 1)
@@ -592,25 +595,43 @@ def convert_argmax_to_depthwise_conv_and_max_pool(op, arch, nng):
maxpool_op.write_offset = Shape4D([0, sum(op_heights[:op_idx]) * orig_ifm_shape.width, 0, 0])
DebugDatabase.add_optimised(op, maxpool_op)
- # Convert output to OFM dtype and reshape back to original OFM shape with 1x1 DWConv
- dw_conv = Operation(Op.DepthwiseConv2DBias, f"depthwise_conv_convert_to_32bit_{op_idx}")
- dw_conv.attrs.update(dw_op_attrs)
- dw_conv.inputs = [maxpool_op.ofm]
- dw_conv.add_input_tensor(
- create_const_tensor(
- "weights",
- [1, 1, 1, 1],
- DataType.uint8,
- np.array([1]).reshape([1, 1, 1, 1]),
- quantization=identity_quant,
- ),
- )
- dw_conv.add_input_tensor(create_const_tensor(dw_conv.name + "_bias", [1], DataType.int64, [0]))
- ofm.ops.append(dw_conv)
- dw_conv.outputs = [ofm]
- dw_conv.ifm_shapes.append(Shape4D([1, orig_ifm_shape.height, orig_ifm_shape.width, 1]))
- dw_conv.ofm_shapes.append(Shape4D(ofm.shape))
- DebugDatabase.add_optimised(op, dw_conv)
+ # Set final shape
+ maxpool_ofm.set_all_shapes([1, h, w, 1])
+
+ # Convert 16bit to 32bit or 64bit
+ if ofm.dtype == DataType.int64:
+ # If OFM dtype is int64 the result is converted by two cast ops (16bit to 32bit)
+ #
+ # A -> B -> C -> D (OFM)
+ # |0001| |00010000| |0001|0000| |00010000|00000000|
+ # i16 i32 i16 i16 i32 i32
+ # <-------i64------->
+ #
+ # Memcpy is used to copy the content from B to C and from D to OFM
+ # Memcpy will be turned into a nop or an DMA transer if memory regions differs.
+ intermediate_32bit = Tensor([1, h, w, 1], DataType.int32, f"{orig_name}_32bit")
+ else:
+ intermediate_32bit = ofm
+
+ op_cast = create_cast_op(f"{orig_name}_cast_to_32bit_1", maxpool_ofm, intermediate_32bit)
+ DebugDatabase.add_optimised(op, op_cast)
+
+ if ofm.dtype == DataType.int64:
+ # Create int16 tensor with double shape to cover the intermediate_32bit result from the first cast
+ intermediate_16bit_2x_size = Tensor([1, h, w, 2], DataType.int16, f"{orig_name}_16bit_2x_size")
+ memcpy_op = create_memcpy(f"{orig_name}_memcpy_1", intermediate_32bit, intermediate_16bit_2x_size)
+ DebugDatabase.add_optimised(op, memcpy_op)
+
+ # Create int32 tensor with double ofm shape to be able to store a "int64" result
+ intermediate_32bit_2x_size = Tensor([1, h, w, 2], DataType.int32, f"{orig_name}_32bit_2x_size")
+
+ op_cast = create_cast_op(
+ f"{orig_name}_cast_to_32bit_2", intermediate_16bit_2x_size, intermediate_32bit_2x_size
+ )
+ DebugDatabase.add_optimised(op, op_cast)
+
+ memcpy_op = create_memcpy("f{orig_name}_memcpy_2", intermediate_32bit_2x_size, ofm)
+ DebugDatabase.add_optimised(op, memcpy_op)
return op