Diffstat (limited to 'ethosu/vela/tflite_graph_optimiser.py')
-rw-r--r--  ethosu/vela/tflite_graph_optimiser.py | 61
1 file changed, 41 insertions(+), 20 deletions(-)
diff --git a/ethosu/vela/tflite_graph_optimiser.py b/ethosu/vela/tflite_graph_optimiser.py
index 5b0e2fb3..077f4afa 100644
--- a/ethosu/vela/tflite_graph_optimiser.py
+++ b/ethosu/vela/tflite_graph_optimiser.py
@@ -51,7 +51,9 @@ from .operation import Operation
 from .operation import Padding
 from .operation_util import create_add_nop
 from .operation_util import create_avgpool_nop
+from .operation_util import create_cast_op
 from .operation_util import create_depthwise_maxpool
+from .operation_util import create_memcpy
 from .operation_util import get_pad_values_from_input
 from .scaling import quantise_scale
 from .shape4d import Shape4D
@@ -520,7 +522,8 @@ def convert_argmax_to_depthwise_conv_and_max_pool(op, arch, nng):
         "dilation": (1, 1, 1, 1),
         "explicit_padding": None,
     }
-    op.name = "depthwise_conv_SHL_7"
+    orig_name = op.name
+    op.name = f"{orig_name}_depthwise_conv_SHL_7"
     op.type = Op.DepthwiseConv2DBias
     op.attrs.update(dw_op_attrs)
     n, h, w, c = full_shape(4, ifm.shape, 1)
@@ -592,25 +595,43 @@ def convert_argmax_to_depthwise_conv_and_max_pool(op, arch, nng):
         maxpool_op.write_offset = Shape4D([0, sum(op_heights[:op_idx]) * orig_ifm_shape.width, 0, 0])
         DebugDatabase.add_optimised(op, maxpool_op)
 
-        # Convert output to OFM dtype and reshape back to original OFM shape with 1x1 DWConv
-        dw_conv = Operation(Op.DepthwiseConv2DBias, f"depthwise_conv_convert_to_32bit_{op_idx}")
-        dw_conv.attrs.update(dw_op_attrs)
-        dw_conv.inputs = [maxpool_op.ofm]
-        dw_conv.add_input_tensor(
-            create_const_tensor(
-                "weights",
-                [1, 1, 1, 1],
-                DataType.uint8,
-                np.array([1]).reshape([1, 1, 1, 1]),
-                quantization=identity_quant,
-            ),
-        )
-        dw_conv.add_input_tensor(create_const_tensor(dw_conv.name + "_bias", [1], DataType.int64, [0]))
-        ofm.ops.append(dw_conv)
-        dw_conv.outputs = [ofm]
-        dw_conv.ifm_shapes.append(Shape4D([1, orig_ifm_shape.height, orig_ifm_shape.width, 1]))
-        dw_conv.ofm_shapes.append(Shape4D(ofm.shape))
-        DebugDatabase.add_optimised(op, dw_conv)
+    # Set final shape
+    maxpool_ofm.set_all_shapes([1, h, w, 1])
+
+    # Convert 16bit to 32bit or 64bit
+    if ofm.dtype == DataType.int64:
+        # If OFM dtype is int64 the result is converted by two cast ops (16bit to 32bit)
+        #
+        #   A -> B -> C -> D (OFM)
+        #
+        #   |0001|   |00010000|   |0001|0000|   |00010000|00000000|
+        #    i16      i32          i16  i16      i32      i32
+        #                                       <-------i64------->
+        #
+        # Memcpy is used to copy the content from B to C and from D to OFM.
+        # Memcpy will be turned into a nop or a DMA transfer if the memory regions differ.
+        intermediate_32bit = Tensor([1, h, w, 1], DataType.int32, f"{orig_name}_32bit")
+    else:
+        intermediate_32bit = ofm
+
+    op_cast = create_cast_op(f"{orig_name}_cast_to_32bit_1", maxpool_ofm, intermediate_32bit)
+    DebugDatabase.add_optimised(op, op_cast)
+
+    if ofm.dtype == DataType.int64:
+        # Create int16 tensor with double shape to cover the intermediate_32bit result from the first cast
+        intermediate_16bit_2x_size = Tensor([1, h, w, 2], DataType.int16, f"{orig_name}_16bit_2x_size")
+        memcpy_op = create_memcpy(f"{orig_name}_memcpy_1", intermediate_32bit, intermediate_16bit_2x_size)
+        DebugDatabase.add_optimised(op, memcpy_op)
+
+        # Create int32 tensor with double ofm shape to be able to store an "int64" result
+        intermediate_32bit_2x_size = Tensor([1, h, w, 2], DataType.int32, f"{orig_name}_32bit_2x_size")
+
+        op_cast = create_cast_op(
+            f"{orig_name}_cast_to_32bit_2", intermediate_16bit_2x_size, intermediate_32bit_2x_size
+        )
+        DebugDatabase.add_optimised(op, op_cast)
+
+        memcpy_op = create_memcpy(f"{orig_name}_memcpy_2", intermediate_32bit_2x_size, ofm)
+        DebugDatabase.add_optimised(op, memcpy_op)
 
     return op
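
As a sanity check on the widening scheme in the diagram above (A -> B -> C -> D), the same chain can be reproduced with NumPy views standing in for the Cast and Memcpy ops. This is an illustration only, not part of the patch, and it assumes a little-endian layout with bytes shown in address order, as the byte diagram does: the first cast sign-extends i16 to i32, the "memcpy" reinterprets that buffer as two i16 halves, and the second cast widens each half to i32, producing exactly the byte pattern of the sign-extended i64 value.

    import numpy as np

    a = np.array([0, 1, 1000, -3], dtype=np.int16)  # A: the int16 maxpool result
    b = a.astype(np.int32)                          # B: first cast, i16 -> i32 (sign-extend)
    c = b.view(np.int16)                            # C: reinterpret B as 2x i16 per value (the memcpy)
    d = c.astype(np.int32)                          # D: second cast, each i16 half -> i32

    # Reading D's bytes back as i64 gives the directly widened result
    assert np.array_equal(d.view(np.int64), a.astype(np.int64))

This also illustrates why the Memcpy between B and C can often become a nop: both tensors describe the same bytes, only with different element types and shapes, so when the scheduler places them in the same memory region no copy is needed.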