-rw-r--r-- ethosu/vela/graph_optimiser_util.py   | 202
-rw-r--r-- ethosu/vela/tflite_graph_optimiser.py |  15
2 files changed, 51 insertions(+), 166 deletions(-)
diff --git a/ethosu/vela/graph_optimiser_util.py b/ethosu/vela/graph_optimiser_util.py
index e8d5ac64..8b24eaf9 100644
--- a/ethosu/vela/graph_optimiser_util.py
+++ b/ethosu/vela/graph_optimiser_util.py
@@ -27,7 +27,6 @@ from .debug_database import DebugDatabase
from .errors import UnsupportedFeatureError
from .errors import VelaError
from .operation import Op
-from .operation_util import create_memcpy
from .shape4d import Shape4D
from .tensor import create_const_tensor
from .tensor import QuantizationParameters
@@ -40,10 +39,6 @@ memory_only_ops = (
    Op.Identity,
)
-# Ops that depend on the original ifm tensor shape not being changed
-# by the bypass memory op function
-original_ifm_shape_ops = (Op.Mean,)
-
def _avoid_nhcwb16_for_concat(tens):
    # If axis corresponds to C-dimension, NHCWB16 can only be used in the output if all the concat_start's are a
@@ -209,50 +204,6 @@ def set_ifm_ofm_op_shapes(op, arch, nng):
    return op
-def bypass_need_to_keep_ofm_shape(op):
-    # Check if the ifm must be replaced by the ofm (rank is changed, or an op that follows needs the original ifm shape)
-    ifm_replaced_by_ofm = any(
-        ofm_cons is not None and ofm_cons.type in original_ifm_shape_ops for ofm_cons in op.ofm.consumer_list
-    ) or len(op.ifm.shape) != len(op.ofm.shape)
-    return ifm_replaced_by_ofm
-
-
-def bypass_memory_only_ops(op):
-    assert op.type in memory_only_ops
-    ofm = op.ofm
-    ifm = op.ifm
-
-    # Check if ifm/ofm are network ifm/ofm
-    ifm_is_sg_ifm = ifm.ops[0].type in (Op.Placeholder, Op.SubgraphInput, Op.Const)
-    ifm_is_sg_ofm = any(ifm_cons is None for ifm_cons in ifm.consumer_list)
-    ofm_is_sg_ofm = any(ofm_cons is None for ofm_cons in ofm.consumer_list)
-    # Check if the ifm is produced, or the ofm consumed, by the CPU
-    ifm_is_cpu_produced = any(ifm_prod is not None and not ifm_prod.run_on_npu for ifm_prod in op.ifm.ops)
-    ofm_is_cpu_consumed = any(ofm_cons is not None and not ofm_cons.run_on_npu for ofm_cons in op.ofm.consumer_list)
-
-    # This case should be handled prior to this function
-    assert not ((ifm_is_sg_ifm or ifm_is_sg_ofm or ifm_is_cpu_produced) and (ofm_is_sg_ofm or ofm_is_cpu_consumed))
-
-    if (ifm.shape != ofm.shape) and (ofm_is_sg_ofm or ofm_is_cpu_consumed or bypass_need_to_keep_ofm_shape(op)):
-        # Bypassed by replacing ifm with ofm
-        ofm.ops = []
-        for prev_op in ifm.ops:
-            prev_op.outputs = [ofm]
-            ofm.ops.append(prev_op)
-
-        # All ifm consumers need to use ofm as input
-        for ifm_cons in ifm.consumer_list:
-            for ifm_idx, cons_ifm in enumerate(ifm_cons.inputs):
-                if cons_ifm == ifm:
-                    ifm_cons.set_input_tensor(ofm, ifm_idx)
-    else:
-        # Bypassed by replacing ofm with ifm
-        for cons in ofm.consumer_list:
-            for ifm_idx, cons_ifm in enumerate(cons.inputs):
-                if cons_ifm == ofm:
-                    cons.set_input_tensor(ifm, ifm_idx)
-
-
def move_splitsliceread_to_consumer(op, cons_op):
    assert op.type == Op.SplitSliceRead
@@ -282,117 +233,62 @@ def record_optimised(op, arch):
    DebugDatabase.add_optimised(op, op)
-def insert_copy_op_before_op(op):
-    # Create a memcpy op with ifm as input
-    tens = op.ifm
-    copy_tens = tens.clone()
-    copy_op = create_memcpy(f"{tens.name}_memcpy")
-    copy_op.add_input_tensor(tens)
-    copy_op.set_output_tensor(copy_tens)
-    copy_op.set_ifm_ofm_shapes()
-
-    op.set_input_tensor(copy_tens, 0)
-
-    DebugDatabase.add_optimised(op, copy_op)
-
-
-def insert_copy_op_after_tens(tens):
-    tens_cons_list_copy = tens.consumer_list.copy()
-
-    # Create a memcpy op with ifm as input
-    copy_tens = tens.clone()
-    copy_op = create_memcpy(tens.name + "_memcpy")
-    copy_op.add_input_tensor(tens)
-    copy_op.set_output_tensor(copy_tens)
-    copy_op.set_ifm_ofm_shapes()
-    copy_op.run_on_npu = True
-
-    # Set copy_ifm consumers
-    for tens_cons in tens_cons_list_copy:
-        if tens_cons is not None:
-            for ifm_idx, cons_inp in enumerate(tens_cons.inputs):
-                if cons_inp == tens:
-                    tens_cons.set_input_tensor(copy_tens, ifm_idx)
-
-    DebugDatabase.add_optimised(tens.ops[0], copy_op)
-
-
-def fix_sg_input_output(op, arch, nng):
+def bypass_memory_only_ops(op, arch, nng):
    if not op.run_on_npu or op.type not in memory_only_ops:
        return op
-    prev_op = op.ifm.ops[0]
-    while prev_op is not None and prev_op.run_on_npu and prev_op.type in memory_only_ops:
-        # Current op is preceded by another memory only op.
-        # Replace current op's ifm with the preceding op's ifm. By doing
-        # this the preceding op is removed from the current path.
-        next_prev_op = prev_op.ifm.ops[0]
-        if next_prev_op is not None and next_prev_op.run_on_npu and next_prev_op.type in memory_only_ops:
-            # Preceding op also has a preceding memory only op
-            prev_op = next_prev_op
-        else:
-            op.set_input_tensor(prev_op.ifm, 0)
-            break
-
-    # For the memory only operators we want to remove, tensors are removed.
-    # But in order to do this, they cannot be outputs of the sg;
-    # this needs to be fixed prior to the removal.
-    # The solution is to add an avgpool NOP to maintain the original tensor.
-    # This is also valid when the reshape ifm/ofm is produced or
-    # consumed by the CPU.
-
-    # Rare case: original_ifm_shape_ops contains ops that depend on the
-    # original ifm tensor shape not being changed by the bypass memory
-    # function. If the memory only op's ifm is a subgraph ifm, the ifm is cpu
-    # produced, or the ifm has many consumers, then an avgpool NOP needs to be
-    # inserted before the original_ifm_shape_ops. Note that the NOP is only
-    # inserted before original_ifm_shape_ops. The same applies when the memory
-    # only op changes the rank between the IFM and OFM.
+    # Memory only operators can be completely removed if there is a one-to-one
+    # connection. The reshape OFM can be connected to the previous op.
    #
-    # Below is an example showing the case when there is a need for an AVG NOP
-    # when RESHAPE is bypassed by replacing IFM with OFM.
+    #                Bypassed to
+    #                   --->
+    #   1x6x6x10                1x6x6x10
+    #     ADD                     ADD
+    #      |         ------->      |
+    #   1x6x6x10     |          1x20x3x6
+    #    RESHAPE     |            MEAN
+    #      |     ----|
+    #   1x20x3x6
+    #     MEAN
    #
-    # Converts to                          And in bypass_memory
-    #            --->                               --->
-    # -----ADD-----       -----ADD-----                 -----ADD-----
-    # |           |       |           |                 |           |
-    # 1x6x6x10 1x6x6x10   1x6x6x10 1x6x6x10             1x6x6x10 1x6x6x10
-    # RESHAPE     MEAN    AVG POOL     MEAN             AVG POOL     MEAN
-    # |                       |                             |
-    # 1x20x3x6             1x6x6x10                      1x20x3x6
-    # MEAN                  RESHAPE                         MEAN
-    #                          |
-    #                      1x20x3x6
-    #                         MEAN
-    ifm_has_multiple_cons = len(op.ifm.consumer_list) > 1
+    # In the above, the ADD OFM (= the RESHAPE IFM) is removed and replaced by
+    # the RESHAPE OFM.
+    #
+    # There are two cases where bypassing is not possible. One is when the
+    # IFM is produced by the CPU: that tensor must be preserved and cannot
+    # be removed from the graph. The other is when the IFM has multiple
+    # consumers; then it is not possible to simply bypass the op, and a
+    # DMA (NOP) is needed.
+    #
+    #              Converts to
+    #                 --->
+    #   1x6x6x10                  1x6x6x10
+    # -----ADD-----             -----ADD-----
+    # |           |             |           |
+    # 1x6x6x10 1x6x6x10         1x6x6x10 1x6x6x10
+    # RESHAPE     MEAN           DMA OP     MEAN
+    # |                             |
+    # 1x20x3x6                   1x20x3x6
+    # MEAN                         MEAN
+    #
+    # If the DMA IFM and the DMA OFM end up in the same memory area,
+    # the DMA op will be removed when the command stream is generated.
-    # Check if operator ifm/ofm are sg ifm/ofm
-    ifm_is_sg_ifm = op.ifm.ops[0].type in (Op.Placeholder, Op.SubgraphInput, Op.Const)
-    ifm_is_sg_ofm = any(ifm_cons is None for ifm_cons in op.ifm.consumer_list)
-    ofm_is_sg_ofm = any(ofm_cons is None for ofm_cons in op.ofm.consumer_list)
-    # Check if the ifm is produced, or the ofm consumed, by the CPU
+    ifm_has_multiple_cons = len(op.ifm.consumer_list) > 1
    ifm_is_cpu_produced = any(ifm_prod is not None and not ifm_prod.run_on_npu for ifm_prod in op.ifm.ops)
-    ofm_is_cpu_consumed = any(ofm_cons is not None and not ofm_cons.run_on_npu for ofm_cons in op.ofm.consumer_list)
-
-    if bypass_need_to_keep_ofm_shape(op):
-        # Bypass needs to keep the OFM shape
-        if ifm_has_multiple_cons:
-            # Rare case:
-            # The IFM needs to persist due to multiple consumers, so a copy op is needed.
-            # The OFM will replace the IFM for the memory only op.
-            insert_copy_op_before_op(op)
-            # One copy added, so no need to check for another copy further down
-            return op
-        elif not (ofm_is_sg_ofm or ofm_is_cpu_consumed):
-            # Only one consumer, and the OFM is not a subgraph output or cpu consumed,
-            # so it is safe to replace ifm.shape with ofm.shape.
-            # The IFM can then replace the OFM for the memory only op and no copy op is needed.
-            op.ifm.shape = op.ofm.shape
-
-    # Special case when the OFM is sg_ofm or cpu_consumed
-    if (ifm_is_sg_ofm or ifm_is_sg_ifm or ifm_is_cpu_produced) and (ofm_is_sg_ofm or ofm_is_cpu_consumed):
-        # Both ifm and ofm need to persist, but only the ifm needs a copy, in order to remove the memory only operator.
-        insert_copy_op_after_tens(op.ifm)
+
+    if ifm_has_multiple_cons or ifm_is_cpu_produced:
+        # Convert to a memcpy op
+        op.type = Op.Memcpy
+        DebugDatabase.add_optimised(op, op)
+    else:
+        # Bypass op
+        ofm = op.ofm
+        ifm = op.ifm
+        ofm.ops = []
+        for prev_op in ifm.ops:
+            prev_op.outputs = [ofm]
+            ofm.ops.append(prev_op)
    return op
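For reference, the rewiring performed by the new bypass_memory_only_ops can be shown as a minimal standalone sketch. The Tensor and Operation classes below are simplified stand-ins invented for this illustration (Vela's real classes carry far more state), and bypass_or_convert condenses the decision above:

```python
# Toy stand-ins for Vela's Tensor/Operation classes (illustration only).
class Tensor:
    def __init__(self, name, shape):
        self.name = name
        self.shape = shape
        self.ops = []            # producing ops
        self.consumer_list = []  # consuming ops


class Operation:
    def __init__(self, op_type, run_on_npu=True):
        self.type = op_type
        self.run_on_npu = run_on_npu
        self.inputs = []
        self.outputs = []

    def add_input(self, tens):
        self.inputs.append(tens)
        tens.consumer_list.append(self)

    def set_output(self, tens):
        self.outputs = [tens]
        tens.ops = [self]

    @property
    def ifm(self):
        return self.inputs[0]

    @property
    def ofm(self):
        return self.outputs[0]


MEMORY_ONLY_TYPES = ("Reshape", "Squeeze", "ExpandDims", "Identity")


def bypass_or_convert(op, arch=None, nng=None):
    # Same decision as the diff: keep the op as a plain copy if the IFM
    # must be preserved, otherwise rewire the producer to the OFM.
    if not op.run_on_npu or op.type not in MEMORY_ONLY_TYPES:
        return op
    ifm, ofm = op.ifm, op.ofm
    ifm_has_multiple_cons = len(ifm.consumer_list) > 1
    ifm_is_cpu_produced = any(not prod.run_on_npu for prod in ifm.ops)

    if ifm_has_multiple_cons or ifm_is_cpu_produced:
        op.type = "Memcpy"  # stand-in for Op.Memcpy
    else:
        ofm.ops = []
        for prev_op in ifm.ops:
            prev_op.outputs = [ofm]
            ofm.ops.append(prev_op)
    return op


# ADD -> RESHAPE -> MEAN, as in the comment's diagram
t0 = Tensor("add_ofm", [1, 6, 6, 10])
t1 = Tensor("reshape_ofm", [1, 20, 3, 6])
add, reshape, mean = Operation("Add"), Operation("Reshape"), Operation("Mean")
add.set_output(t0)
reshape.add_input(t0)
reshape.set_output(t1)
mean.add_input(t1)

bypass_or_convert(reshape)
assert add.outputs == [t1]  # ADD now writes the reshape OFM directly
```

With a second consumer on the ADD OFM, the same call would instead flip the op type to a copy, matching the "Converts to" diagram above.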
diff --git a/ethosu/vela/tflite_graph_optimiser.py b/ethosu/vela/tflite_graph_optimiser.py
index 3a49309d..a1cbb3e2 100644
--- a/ethosu/vela/tflite_graph_optimiser.py
+++ b/ethosu/vela/tflite_graph_optimiser.py
@@ -35,7 +35,6 @@ from .graph_optimiser_util import bypass_memory_only_ops
from .graph_optimiser_util import calc_explicit_padding
from .graph_optimiser_util import convert_depthwise_to_conv
from .graph_optimiser_util import convert_to_lut
-from .graph_optimiser_util import fix_sg_input_output
from .graph_optimiser_util import memory_only_ops
from .graph_optimiser_util import move_splitsliceread_to_consumer
from .graph_optimiser_util import needed_total_padding
@@ -1362,11 +1361,6 @@ def convert_tanh_sigmoid_to_lut(op, arch, nng):
    return op
-def remove_memory_only_ops(op, arch):
-    if op.run_on_npu and op.type in memory_only_ops:
-        bypass_memory_only_ops(op)
-
-
def fuse_activation_function_with_prev(op, arch, nng):
    # if op is a no-op: attempts to move the activation function to the preceding op
    if not op.attrs.get("is_nop", False) or op.activation is None:
@@ -1954,22 +1948,17 @@ def tflite_optimise_graph(nng, arch, force_symmetric_int_weights):
            rewrite_unsupported=False,
        )
-    # Handle sg input output
+    # Bypass or rewrite memory only operators
    for idx, sg in enumerate(nng.subgraphs):
        nng.subgraphs[idx] = rewrite_graph.rewrite_graph_pre_order(
            nng,
            sg,
            arch,
            [],
-            [fix_sg_input_output],
+            [bypass_memory_only_ops],
            rewrite_unsupported=False,
        )
-    # Removal of memory only operators
-    for sg in nng.subgraphs:
-        rewrite_graph.visit_graph_post_order(sg.output_tensors, arch, [], [remove_memory_only_ops])
-        sg.refresh_after_modification()
-
    # Rewrite of operators
    op_rewrite_list = [
        set_tensor_equivalence,
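On the driver side, the only contract visible in this hunk is that rewrite_graph.rewrite_graph_pre_order calls each registered function as fn(op, arch, nng) and keeps the returned op. Below is a hypothetical, much-simplified sketch of such a pre-order pass, reusing the toy classes and graph from the sketch above; Vela's real driver also maintains error lists, refreshes subgraph state, and more:

```python
def rewrite_graph_pre_order_sketch(nng, output_tensors, arch, op_rewrite_list):
    # Walk from the subgraph outputs towards the inputs, rewriting each op
    # once, before its producers are visited (hence "pre-order").
    visited = set()

    def visit(op):
        if id(op) in visited:
            return
        visited.add(id(op))
        for fn in op_rewrite_list:
            op = fn(op, arch, nng)  # a pass may rewrite or replace the op
        # Because the op is rewritten first, a bypass pass has already
        # rewired the graph before its producers are walked.
        for tens in op.inputs:
            for producer in tens.ops:
                visit(producer)

    for tens in output_tensors:
        for producer in tens.ops:
            visit(producer)


# Applying the bypass pass from the sketch above to its toy graph:
# rewrite_graph_pre_order_sketch(None, [t1], None, [bypass_or_convert])
```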