From b4936adf32435a96ca1a89818582f8d2efa366d1 Mon Sep 17 00:00:00 2001
From: Patrik Gustavsson
Date: Tue, 5 Oct 2021 13:53:34 +0200
Subject: TOSA: Added decomposition of PAD

Added support for:
-Rank > 4 and batch > 1
-Tensor dimensions exceeding NPU limit
-Padding in any dimension
(Implementation for functional compliance, not considering performance)

Signed-off-by: Patrik Gustavsson
Change-Id: Ief58fb3233d885f10ba5e68c5374b190efbe9351
---
 ethosu/vela/operation_util.py           |   6 ++
 ethosu/vela/tosa_graph_optimiser.py     | 151 +++++++++++++++++++++-----------
 ethosu/vela/tosa_supported_operators.py |   2 +-
 3 files changed, 106 insertions(+), 53 deletions(-)

diff --git a/ethosu/vela/operation_util.py b/ethosu/vela/operation_util.py
index 0fbed46b..29caf6d0 100644
--- a/ethosu/vela/operation_util.py
+++ b/ethosu/vela/operation_util.py
@@ -50,6 +50,12 @@ def create_add_nop(name: str) -> Operation:
     return op
 
 
+def create_pad_nop(name: str) -> Operation:
+    op = Operation(Op.Pad, name)
+    op.run_on_npu = True
+    return op
+
+
 def create_depthwise_maxpool(
     name: str,
     ifm: Tensor,
diff --git a/ethosu/vela/tosa_graph_optimiser.py b/ethosu/vela/tosa_graph_optimiser.py
index 954ac68f..e27dbed6 100644
--- a/ethosu/vela/tosa_graph_optimiser.py
+++ b/ethosu/vela/tosa_graph_optimiser.py
@@ -33,9 +33,12 @@ from .operation import ExplicitScaling
 from .operation import Op
 from .operation_util import create_add_nop
 from .operation_util import create_avgpool_nop
+from .operation_util import create_pad_nop
 from .shape4d import Shape4D
 from .tensor import create_const_tensor
 from .tensor import create_equivalence_id
+from .tensor import shape_num_elements
+from .tensor import Tensor
 
 
 def replace_rescale_with_avg_pool(rescale_op):
@@ -414,87 +417,44 @@ def rewrite_rescale(op, arch, nng):
     return op
 
 
-# TODO modified copy of TFLite, solution for TOSA PAD will change so reuse has not been considered
-def convert_pad(op, arch, nng):
+def convert_pad_in_width(op):
     """
     Rewrites PAD operator to an add that copies the IFM to the OFM
     + up to 4 add operators that fill the OFM with zeros at the borders.
""" - - if op.type != Op.Pad: - return op - - # TODO assuming rank <= 4 and N = 1 for rank ==4 - # This is checked in tosa_supported_operators + assert op.type == Op.Pad + assert op.ifm_shapes[0] is not None and op.ofm_shapes[0] is not None ifm = op.ifm - assert ifm is not None - ifm_shape = Shape4D(ifm.shape) ofm = op.ofm - assert ofm is not None + ifm_shape = op.ifm_shapes[0] ofm.ops = [] ofm_shape = op.ofm_shapes[0] - rank = len(ifm.shape) padding = op.inputs[1].values - pad_depth = padding[-1] - if not (pad_depth == 0).all(): - print("Warning: For PAD, padding in depth not supported yet") - assert False - - top, bottom = 0, 0 - left, right = 0, 0 - if rank > 1: - left, right = padding[-2][0], padding[-2][1] - if rank > 2: - top, bottom = padding[-3][0], padding[-3][1] - if rank == 4 and not (padding[-4] == 0).all(): - print("Warning: For PAD, padding not supported in first dimension when rank == 4 yet") - assert False + left, right = padding[-2] # Add op that copies IFM to the right place inside the OFM shp0 = Shape4D(0, 0, 0, 0) - shp_top = shp0.with_height(top) - add_op = create_add_for_concat(op, op.name + "_main", ifm, ifm_shape, shp_top.with_width(left)) + add_op = create_add_for_concat(op, op.name + "_main", ifm, ifm_shape, shp0.with_width(left)) add_op.activation = op.activation quant = ofm.quantization pad_value = ifm.quantization.zero_point ifm.quantization.zero_point = 0 - # Add operations that fill the borders of the OFM - if top > 0: - shape = Shape4D(1, top, ofm_shape.width, ofm_shape.depth) - zero_tens = create_const_tensor( - op.name + "_top", shape.as_list(), ofm.dtype, shape.elements() * [pad_value], np.uint8, quantization=quant, - ) - # If top/bottom or left/right are equal, the const tensors can be allocated to the same address - zero_tens.equivalence_id = create_equivalence_id(tuple(zero_tens.values)) - create_add_for_concat(op, op.name + "_top", zero_tens, shape, shp0) - if bottom > 0: - shape = Shape4D(1, bottom, ofm_shape.width, ofm_shape.depth) - zero_tens = create_const_tensor( - op.name + "_bottom", - shape.as_list(), - ofm.dtype, - shape.elements() * [pad_value], - np.uint8, - quantization=quant, - ) - zero_tens.equivalence_id = create_equivalence_id(tuple(zero_tens.values)) - create_add_for_concat(op, op.name + "_bottom", zero_tens, shape, shp0.with_height(ofm_shape.height - bottom)) if left > 0: shape = Shape4D(1, ifm_shape.height, left, ofm_shape.depth) zero_tens = create_const_tensor( op.name + "_left", shape.as_list(), ofm.dtype, shape.elements() * [pad_value], np.uint8, quantization=quant ) zero_tens.equivalence_id = create_equivalence_id(tuple(zero_tens.values)) - create_add_for_concat(op, op.name + "_left", zero_tens, shape, shp_top) + create_add_for_concat(op, op.name + "_left", zero_tens, shape, shp0) if right > 0: shape = Shape4D(1, ifm_shape.height, right, ofm_shape.depth) zero_tens = create_const_tensor( op.name + "_right", shape.as_list(), ofm.dtype, shape.elements() * [pad_value], np.uint8, quantization=quant ) zero_tens.equivalence_id = create_equivalence_id(tuple(zero_tens.values)) - create_add_for_concat(op, op.name + "_right", zero_tens, shape, shp_top.with_width(ofm_shape.width - right)) + create_add_for_concat(op, op.name + "_right", zero_tens, shape, shp0.with_width(ofm_shape.width - right)) op.type = Op.ConcatTFLite return add_op @@ -581,6 +541,8 @@ def create_elem_part_op(op, ifm_cut, ifm2_cut, ofm_cut): part_op.read_shapes[1] = ifm2_shape part_op.ifm2.consumer_list.append(part_op) + return part_op + def get_nhwc_stride(shape): 
     stride_x = shape.depth
@@ -700,6 +662,8 @@ def decomp_dims_elementwise(op):
         ifm.consumer_list.remove(op)
         if binary:
             ifm2.consumer_list.remove(op)
+
+        return op_list
     else:
         op.ofm_shapes.append(Shape4D(new_ofm_shape))
         op.ifm_shapes.append(Shape4D(new_ifm_shape))
@@ -781,6 +745,84 @@ def decomp_rewrite_concat(tens, arch, nng):
     return tens
 
 
+def decomp_rewrite_pad(op, arch):
+    """
+    Decomposition of pad to elementwise operations:
+    For each dimension that needs padding:
+    -Create a new PAD operator for each dimension to be padded
+     Ifm/ofm are reshaped so that the width dimension is the one to be padded
+     (rank for each is 3)
+    -Rewrite the new PAD operator so there is:
+    -1 Add operator for copying the data
+    -1 Add operator for each left/right to be padded
+    """
+    # TODO several things would be possible to optimize
+    # For instance there are cases when it should be possible to pad 2
+    # dimensions at the same time.
+    if op.type == Op.Pad:
+        ofm_elements = shape_num_elements(op.ofm.shape)
+        padding = op.inputs[1].values
+
+        rank = len(op.ifm.shape)
+        next_ifm = op.ifm
+        next_ifm_shape = next_ifm.shape.copy()
+
+        first_pad_rewrite_op = None
+        ifm_quant = op.ifm.quantization.clone()
+
+        for dim in range(padding.shape[0]):
+            # Check if padding is to be applied in this dimension
+            dim_pad = padding[dim]
+            if not (dim_pad == 0).all():
+                # Reshape so that width dimension is to be padded
+                new_ifm_shape = reshape_concat_shape(next_ifm_shape, rank, dim)
+                new_pad_input = np.zeros((4, 2), dtype=np.int32)
+                new_pad_input[2] = dim_pad
+
+                pad_op = create_pad_nop(f"{op.name}_dim_{dim}")
+                pad_op.add_input_tensor(next_ifm)
+                new_pad_tens = op.inputs[1].clone("_dim_{dim}")
+
+                name = op.inputs[1].name + f"_dim_{dim}"
+                new_pad_tens = create_const_tensor(
+                    name, list(new_pad_input.shape), DataType.int32, new_pad_input, np.int32
+                )
+                pad_op.add_input_tensor(new_pad_tens)
+
+                new_ofm_shape = new_ifm_shape.copy()
+                new_ofm_shape[-2] = new_ofm_shape[-2] + dim_pad.sum()
+                next_ifm_shape[dim] = next_ifm_shape[dim] + dim_pad.sum()
+
+                if Shape4D(new_ofm_shape).elements() == ofm_elements:
+                    # Last one, use op.ofm
+                    ofm = op.ofm
+                else:
+                    # add a new ofm Tensor
+                    ofm = Tensor(new_ofm_shape, op.ofm.dtype, f"{pad_op.name}_tens")
+                    ofm.quantization = ifm_quant.clone()
+
+                pad_op.set_output_tensor(ofm)
+                pad_op.ifm_shapes.append(Shape4D(new_ifm_shape))
+                pad_op.ofm_shapes.append(Shape4D(new_ofm_shape))
+                DebugDatabase.add_optimised(op, pad_op)
+                next_ifm = ofm
+
+                # Rewrite the pad op
+                converted_pad_op = convert_pad_in_width(pad_op)
+                first_pad_rewrite_op = converted_pad_op
+            else:
+                # Change to Identity operation (will be removed)
+                op.type = Op.Identity
+
+        if first_pad_rewrite_op:
+            assert op.ofm.shape == next_ifm_shape
+            for inp in op.inputs:
+                inp.consumer_list.remove(op)
+            return first_pad_rewrite_op
+
+    return op
+
+
 def fixup_quantization(op, arch, nng):
     if op.ifm and op.ifm.quantization.zero_point is None:
         op.ifm.quantization.zero_point = 0
@@ -812,6 +854,11 @@ def tosa_optimise_graph(nng, arch):
             nng, sg, arch, [decomp_rewrite_concat], [], rewrite_unsupported=False
         )
 
+    # Decomposing of pad
+    for idx, sg in enumerate(nng.subgraphs):
+        rewrite_graph.visit_graph_post_order(sg.output_tensors, arch, [], [decomp_rewrite_pad])
+        sg.refresh_after_modification()
+
     # Handle sg input output
     for idx, sg in enumerate(nng.subgraphs):
         nng.subgraphs[idx] = rewrite_graph.rewrite_graph_pre_order(
@@ -857,7 +904,7 @@ def tosa_optimise_graph(nng, arch):
     # Post-processing step 1
     for idx, sg in enumerate(nng.subgraphs):
         nng.subgraphs[idx] = rewrite_graph.rewrite_graph_pre_order(
-            nng, sg, arch, [], [rewrite_activation, convert_pad, add_padding_fields],
+            nng, sg, arch, [], [rewrite_activation, add_padding_fields],
         )
 
     # Removal of Slice, need to be done after optimisation has been performed,
diff --git a/ethosu/vela/tosa_supported_operators.py b/ethosu/vela/tosa_supported_operators.py
index 5a85b0eb..e3785113 100644
--- a/ethosu/vela/tosa_supported_operators.py
+++ b/ethosu/vela/tosa_supported_operators.py
@@ -46,7 +46,7 @@ class TosaSupportedOperators:
     activation_ops = relu_ops | set((Op.Table,))
     pad_ops = set((Op.Pad,))
 
-    rank_unlimited_ops = set((Op.Concat, Op.Reshape, Op.Identity))
+    rank_unlimited_ops = set((Op.Concat, Op.Reshape, Op.Identity, Op.Pad))
     rank6_limited_ops = elem_wise_ops
    batch_enabled_ops = rank6_limited_ops | rank_unlimited_ops
     large_tens_dims_enabled_ops = batch_enabled_ops | set((Op.SplitSliceRead,))
-- 
cgit v1.2.1
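
The patch rests on two steps: decomp_rewrite_pad splits an N-dimensional TOSA PAD into one
PAD per padded dimension (reshaped so that the padded axis ends up in the width position),
and convert_pad_in_width lowers each of those to one add that copies the IFM at a width
offset plus zero-filled adds for the left/right borders. The sketch below is a plain-NumPy
illustration of why that decomposition is sound; it is not vela code, and the function
names, helper structure and demo shapes are made up for the example.

# Editorial sketch (not part of the patch): NumPy model of the two steps above.
import numpy as np


def pad_one_dim_at_a_time(ifm, padding, pad_value=0):
    # decomp_rewrite_pad analogue: apply the [rank, 2] padding table one dimension
    # at a time; dimensions with (0, 0) padding are skipped.
    out = ifm
    for dim, (before, after) in enumerate(padding):
        if before == 0 and after == 0:
            continue
        per_dim = [(0, 0)] * out.ndim
        per_dim[dim] = (int(before), int(after))
        out = np.pad(out, per_dim, constant_values=pad_value)
    return out


def pad_width_as_copy_plus_borders(ifm, left, right, pad_value=0):
    # convert_pad_in_width analogue for an NHWC tensor: one write that places the
    # IFM at a width offset of `left` (the "_main" add), plus constant blocks for
    # the "_left" and "_right" borders.
    n, h, w, c = ifm.shape
    ofm = np.empty((n, h, w + left + right, c), dtype=ifm.dtype)
    ofm[:, :, left:left + w, :] = ifm
    ofm[:, :, :left, :] = pad_value
    ofm[:, :, left + w:, :] = pad_value
    return ofm


if __name__ == "__main__":
    rng = np.random.default_rng(0)
    ifm = rng.integers(0, 255, size=(2, 3, 4, 5), dtype=np.uint8)  # batch > 1, rank 4
    padding = np.array([[1, 0], [0, 2], [3, 1], [0, 0]], dtype=np.int32)

    # Padding one dimension at a time gives the same result as a single N-D pad.
    expected = np.pad(ifm, [tuple(int(v) for v in p) for p in padding], constant_values=0)
    assert np.array_equal(expected, pad_one_dim_at_a_time(ifm, padding))

    # A width-only pad is the IFM copied at an offset plus filled borders.
    width_padded = pad_width_as_copy_plus_borders(ifm, left=3, right=1, pad_value=0)
    assert np.array_equal(width_padded, np.pad(ifm, [(0, 0), (0, 0), (3, 1), (0, 0)]))
    print("decomposition checks passed:", expected.shape, width_padded.shape)

As the commit message states, this favours functional coverage (rank > 4, batch > 1,
padding in any dimension) over performance: every padded dimension costs an extra pass
over the tensor.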