From 458a208c44f70a9848f1e8e2e91f28ce3641c48f Mon Sep 17 00:00:00 2001
From: Patrik Gustavsson
Date: Thu, 13 Aug 2020 13:41:05 +0200
Subject: MLBEDSW-2570 Avoid usage of NHCWB16 for some cases

Avoid usage of NHCWB16 when Stack/Pack/Concat is performed in axis 3,
and the "concat start" of each slice to be combined is not a multiple
of 16.

Signed-off-by: Patrik Gustavsson
Change-Id: If3f7b4a3424be3c86fc2dc48e8649ce4c4f49485
---
 ethosu/vela/graph_optimiser.py | 10 ++++++++++
 ethosu/vela/scheduler.py       | 33 +++++++++++++++++++--------------
 ethosu/vela/tensor.py          |  4 ++++
 3 files changed, 33 insertions(+), 14 deletions(-)

diff --git a/ethosu/vela/graph_optimiser.py b/ethosu/vela/graph_optimiser.py
index 582924c4..3fe703e1 100644
--- a/ethosu/vela/graph_optimiser.py
+++ b/ethosu/vela/graph_optimiser.py
@@ -69,6 +69,16 @@ def rewrite_concat(tens, arch):
             tens.ops.append(new_op)
         assert tens.shape[axis] == offset
 
+        # If axis = 3, NHCWB16 can only be used in the output if all the concat_start's are a multiple of 16,
+        # as it is only then the address offset for the ofm, for all operations, will be 16 byte aligned
+        # For other values of axis the address offsets will be 16 byte aligned, as they are all based on c = 0
+        # and those addresses are always 16 byte aligned due to the NHCWB16 format.
+        if axis == 3:
+            for op in tens.ops:
+                if op.attrs["concat_start"] % 16 != 0:
+                    tens.avoid_NHCWB16 = True
+                    break
+
     return tens
 
 
diff --git a/ethosu/vela/scheduler.py b/ethosu/vela/scheduler.py
index cc9278fd..f3b3a79c 100644
--- a/ethosu/vela/scheduler.py
+++ b/ethosu/vela/scheduler.py
@@ -670,14 +670,16 @@ class DynamicProgrammingScheduler:
         for pred_candidate in ps.dag_predecessors:
             if len(pred_candidate.outputs) == 1 and pred_candidate.outputs[0] == ifm_tensor:
                 # we found a predecessor that produces this IFM tensor
-                if len(pred_candidate.successors) == 1 and pred_candidate.successors[0] == ps:
-                    # and it only has one successor, namely us
-                    if pred_candidate.placement == PassPlacement.Npu:
-                        if pred_candidate.npu_block_type in self.ifm_stream_npu_blocks:
-                            # and it is on the Npu
-                            if not self.avoid_for_spilling(pred_candidate):
-                                # and fusable - it's a candidate
-                                pred_pass_list.append(pred_candidate)
+                if not ifm_tensor.avoid_NHCWB16:
+                    # and NHCWB16 format is not to be avoided
+                    if len(pred_candidate.successors) == 1 and pred_candidate.successors[0] == ps:
+                        # and it only has one successor, namely us
+                        if pred_candidate.placement == PassPlacement.Npu:
+                            if pred_candidate.npu_block_type in self.ifm_stream_npu_blocks:
+                                # and it is on the Npu
+                                if not self.avoid_for_spilling(pred_candidate):
+                                    # and fusable - it's a candidate
+                                    pred_pass_list.append(pred_candidate)
 
         if not pred_pass_list:
             return ABORT_SEARCH
@@ -953,12 +955,15 @@ class DynamicProgrammingScheduler:
             if output.purpose != TensorPurpose.FeatureMap:
                 continue
 
-            use_NHCWB16 = True
-            for op in output.consumer_list:
-                if op is None or op.type == "Reshape":
-                    use_NHCWB16 = False
-                else:
-                    use_NHCWB16 &= op.run_on_npu
+            use_NHCWB16 = not output.avoid_NHCWB16
+
+            if use_NHCWB16:
+                # Check consumers, to see if NHCWB16 can be used in the output
+                for op in output.consumer_list:
+                    if op is None or op.type == "Reshape":
+                        use_NHCWB16 = False
+                    else:
+                        use_NHCWB16 &= op.run_on_npu
 
             if use_NHCWB16:
                 output.set_format(TensorFormat.NHCWB16, arch)
diff --git a/ethosu/vela/tensor.py b/ethosu/vela/tensor.py
index 35749709..ecca0e0e 100644
--- a/ethosu/vela/tensor.py
+++ b/ethosu/vela/tensor.py
@@ -300,6 +300,7 @@ class Tensor:
         "npu_tensor",
         "equivalence_id",
         "resampling_mode",
+        "avoid_NHCWB16",
     )
 
     AllocationQuantum = 16
@@ -346,6 +347,8 @@ class Tensor:
         self.block_traversal = TensorBlockTraversal.Default
         self.resampling_mode = resampling_mode.NONE
 
+        self.avoid_NHCWB16 = False
+
     def element_size(self):
         if self.element_size_bytes == 0:
             return self.dtype.size_in_bits() / 8
@@ -380,6 +383,7 @@ class Tensor:
         res.resampling_mode = self.resampling_mode
 
         res.copy_compressed_weight_info(self)
+        res.avoid_NHCWB16 = self.avoid_NHCWB16
         return res
 
     def clone_into_fast_storage(self, arch):
--
cgit v1.2.1