| author | Fredrik Svedberg <fredrik.svedberg@arm.com> | 2020-11-06 13:48:15 +0100 |
|---|---|---|
| committer | patrik.gustavsson <patrik.gustavsson@arm.com> | 2020-11-10 10:09:06 +0000 |
| commit | fd31428db9985fe31811063428ebc609a2b42d05 (patch) | |
| tree | d6c659e7e39a51dbd23aa96a3e198ef6fb48d6fd /ethosu | |
| parent | 219ec070cb0d1aab6b2ea8d0428d6aaa9c5c07fb (diff) | |
| download | ethos-u-vela-fd31428db9985fe31811063428ebc609a2b42d05.tar.gz | |
[MLBEDSW-3227] Improve u65 softmax performance
Improve u65 softmax performance by selecting more feature map
tensors as SRAM candidates.
Signed-off-by: Fredrik Svedberg <fredrik.svedberg@arm.com>
Change-Id: I239c9dbebbf2a929004eb01bb0f3efe77f5b97aa
Diffstat (limited to 'ethosu')
-rw-r--r-- | ethosu/vela/scheduler.py | 40 |
1 file changed, 25 insertions(+), 15 deletions(-)
```diff
diff --git a/ethosu/vela/scheduler.py b/ethosu/vela/scheduler.py
index 59c2b58f..56f4aaae 100644
--- a/ethosu/vela/scheduler.py
+++ b/ethosu/vela/scheduler.py
@@ -958,6 +958,9 @@ class DynamicProgrammingScheduler:
         # to be moved to fast storage
         fast_storage_tensor_rewrites = {}
         last_op_in_subgraph = self.sg.cascaded_passes[-1].passes[-1].primary_op
+        # Memory only passes have no primary_op, so use the last op in ops
+        if last_op_in_subgraph is None:
+            last_op_in_subgraph = self.sg.cascaded_passes[-1].passes[-1].ops[-1]
         for ps in self.sg.cascaded_passes:
             if ps.placement != PassPlacement.Npu:
                 continue
@@ -976,25 +979,32 @@ class DynamicProgrammingScheduler:
                     if op.type == Op.ReduceSum and output.dtype == DataType.int32:
                         use_NHCWB16 = False
                     elif op.type == Op.Reshape:
-                        # Detect no-op reshapes by comparing their full input and output tensor shapes.
-                        inshape = full_shape(4, op.inputs[0].shape, 1)
-                        outshape = full_shape(4, op.outputs[0].shape, 1)
                         # Using NHCWB16 format for a no-op reshape is only an option if subsequent
                         # consumers do not also need to perform a reshape or if the OFM is going to
                         # be processed by CPU operations. No-op reshape consumers with empty lists
                         # (those that have no consumers, or null-consumers used as list terminators)
                         # must use normal NHWC output.
-                        incompatible_consumers = [
-                            (
-                                not consumer.run_on_npu
-                                or consumer.type == Op.Reshape
-                                or (consumer is last_op_in_subgraph)
-                            )
-                            for consumer in op.outputs[0].consumer_list
-                            if consumer is not None
-                        ]
-                        if (outshape == inshape) and incompatible_consumers and not any(incompatible_consumers):
-                            rewrites.append(op)
+                        def incompatible_consumers(oper):
+                            if oper and oper.type == Op.Reshape:
+                                for consumer in oper.outputs[0].consumer_list:
+                                    yield from incompatible_consumers(consumer)
+                            yield not oper or not oper.run_on_npu or oper is last_op_in_subgraph
+
+                        if not any(incompatible_consumers(op)):
+
+                            def get_rewrites(oper):
+                                if oper and oper.type == Op.Reshape:
+                                    for consumer in oper.outputs[0].consumer_list:
+                                        yield from get_rewrites(consumer)
+                                    yield oper
+
+                            rewrites.extend(get_rewrites(op))
+                            # Detect no-op reshapes by comparing their full input and output tensor shapes.
+                            inshape = full_shape(4, op.inputs[0].shape, 1)
+                            compatible_shape = [
+                                (inshape == full_shape(4, oper.outputs[0].shape, 1)) for oper in get_rewrites(op)
+                            ]
+                            use_NHCWB16 = compatible_shape and all(compatible_shape)
                     else:
                         use_NHCWB16 = False
                         use_fast_storage = False
@@ -1069,7 +1079,7 @@ def use_fast_storage_for_feature_maps(sg, sram_limit, arch):
     tens_list = sorted([(len(tens_to_cps[tens]), -tens.storage_size(), tens.name, tens) for tens in tens_to_cps])
     for _, _, _, tens in tens_list:
         cps_list = tens_to_cps[tens]
-        if len(cps_list) <= 1:
+        if len(cps_list) < 1:
             continue
         sz = tens.storage_size()
         fits_in_fast_storage = all([cps.sram_used + sz <= sram_limit for cps in cps_list])
```
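The core technique in the second hunk is a pair of recursive generators that walk an entire chain of no-op Reshape consumers instead of inspecting only the immediate consumers: `incompatible_consumers` vetoes the rewrite if any operator reachable through the chain cannot run on the NPU (or is the last op in the subgraph), while `get_rewrites` collects every Reshape in the chain so they can all switch to NHCWB16 together. Below is a minimal standalone sketch of that pattern; the `Node` class and its `is_reshape`, `run_on_npu`, and `consumers` fields are hypothetical stand-ins for Vela's operator objects, and the `last_op_in_subgraph` check is omitted for brevity.

```python
# Sketch of the recursive-generator traversal used in the hunk above.
# `Node` is an illustrative stand-in, not part of the Vela API.


class Node:
    def __init__(self, name, is_reshape=False, run_on_npu=True, consumers=None):
        self.name = name
        self.is_reshape = is_reshape
        self.run_on_npu = run_on_npu
        self.consumers = consumers if consumers is not None else []


def incompatible_consumers(node):
    # Recurse through chained reshapes, then report whether this node
    # (or a None list terminator) rules out the format rewrite.
    if node and node.is_reshape:
        for consumer in node.consumers:
            yield from incompatible_consumers(consumer)
    yield node is None or not node.run_on_npu


def get_rewrites(node):
    # Yield every reshape in the consumer chain, depth-first, so the
    # whole chain can be rewritten as a unit.
    if node and node.is_reshape:
        for consumer in node.consumers:
            yield from get_rewrites(consumer)
        yield node


conv = Node("conv")
inner = Node("reshape2", is_reshape=True, consumers=[conv])
outer = Node("reshape1", is_reshape=True, consumers=[inner])

if not any(incompatible_consumers(outer)):
    print([n.name for n in get_rewrites(outer)])  # ['reshape2', 'reshape1']
```

The actual "more SRAM candidates" effect named in the commit message comes from the last hunk: relaxing `len(cps_list) <= 1` to `< 1` in `use_fast_storage_for_feature_maps` means feature maps referenced by only a single cascaded pass are no longer skipped when considering tensors for fast storage.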