author    Patrik Gustavsson <patrik.gustavsson@arm.com>  2020-08-13 13:41:05 +0200
committer Fredrik Knutsson <fredrik.knutsson.hunnebo@gmail.com>  2020-08-14 10:49:15 +0000
commit    458a208c44f70a9848f1e8e2e91f28ce3641c48f (patch)
tree      37f23561f75d61746383dafc987b411646baaed8
parent    be733cf04bb262d4eee791d76f01cecd64ff9255 (diff)
MLBEDSW-2570 Avoid usage of NHCWB16 for some cases
Avoid usage of NHCWB16 when Stack/Pack/Concat is performed in axis 3,
and the "concat start" of each slice to be combined is not a multiple
of 16.

Signed-off-by: Patrik Gustavsson <patrik.gustavsson@arm.com>
Change-Id: If3f7b4a3424be3c86fc2dc48e8649ce4c4f49485
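To see why the 16-channel boundary matters, here is a rough, self-contained
sketch of a brick-format address calculation. The exact layout below is an
illustrative assumption, not vela's address generation: a slice whose first
channel is concat_start lands mid-brick, and hence at an unaligned byte
offset, whenever concat_start % 16 != 0.

    BRICK = 16  # NHCWB16 packs channels in bricks of 16

    def nhcwb16_offset(h, c, w, width, channels, elem_bytes=1):
        # Illustrative layout: H, C//16, W, 16 innermost (single batch).
        n_bricks = -(-channels // BRICK)  # ceil(channels / 16)
        brick, lane = divmod(c, BRICK)
        idx = ((h * n_bricks + brick) * width + w) * BRICK + lane
        return idx * elem_bytes

    # A slice written at concat_start c0 begins at lane c0 % 16 of a brick,
    # so its base address is 16-byte aligned only when c0 % 16 == 0:
    for c0 in (0, 8, 16, 24):
        print(c0, nhcwb16_offset(0, c0, 0, width=4, channels=32) % 16 == 0)
    # -> True, False, True, False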
-rw-r--r--  ethosu/vela/graph_optimiser.py  10
-rw-r--r--  ethosu/vela/scheduler.py        33
-rw-r--r--  ethosu/vela/tensor.py            4

3 files changed, 33 insertions(+), 14 deletions(-)
diff --git a/ethosu/vela/graph_optimiser.py b/ethosu/vela/graph_optimiser.py
index 582924c4..3fe703e1 100644
--- a/ethosu/vela/graph_optimiser.py
+++ b/ethosu/vela/graph_optimiser.py
@@ -69,6 +69,16 @@ def rewrite_concat(tens, arch):
         tens.ops.append(new_op)
 
     assert tens.shape[axis] == offset
 
+    # If axis = 3, NHCWB16 can only be used in the output if all the concat_start's are a multiple of 16,
+    # as it is only then the address offset for the ofm, for all operations, will be 16 byte aligned
+    # For other values of axis the address offsets will be 16 byte aligned, as they are all based on c = 0
+    # and those addresses are always 16 byte aligned due to the NHCWB16 format.
+    if axis == 3:
+        for op in tens.ops:
+            if op.attrs["concat_start"] % 16 != 0:
+                tens.avoid_NHCWB16 = True
+                break
+
     return tens
 
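As a quick worked example of the check above, with hypothetical input depths:
concatenating slices of depth 8, 8 and 16 along axis 3 yields concat_start
values 0, 8 and 16, and the slice starting at 8 trips the guard.

    depths = [8, 8, 16]                                     # hypothetical slice depths
    starts = [sum(depths[:i]) for i in range(len(depths))]  # [0, 8, 16]
    print(any(s % 16 != 0 for s in starts))                 # -> True: avoid NHCWB16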
diff --git a/ethosu/vela/scheduler.py b/ethosu/vela/scheduler.py
index cc9278fd..f3b3a79c 100644
--- a/ethosu/vela/scheduler.py
+++ b/ethosu/vela/scheduler.py
@@ -670,14 +670,16 @@ class DynamicProgrammingScheduler:
         for pred_candidate in ps.dag_predecessors:
             if len(pred_candidate.outputs) == 1 and pred_candidate.outputs[0] == ifm_tensor:
                 # we found a predecessor that produces this IFM tensor
-                if len(pred_candidate.successors) == 1 and pred_candidate.successors[0] == ps:
-                    # and it only has one successor, namely us
-                    if pred_candidate.placement == PassPlacement.Npu:
-                        if pred_candidate.npu_block_type in self.ifm_stream_npu_blocks:
-                            # and it is on the Npu
-                            if not self.avoid_for_spilling(pred_candidate):
-                                # and fusable - it's a candidate
-                                pred_pass_list.append(pred_candidate)
+                if not ifm_tensor.avoid_NHCWB16:
+                    # and NHCWB16 format is not to be avoided
+                    if len(pred_candidate.successors) == 1 and pred_candidate.successors[0] == ps:
+                        # and it only has one successor, namely us
+                        if pred_candidate.placement == PassPlacement.Npu:
+                            if pred_candidate.npu_block_type in self.ifm_stream_npu_blocks:
+                                # and it is on the Npu
+                                if not self.avoid_for_spilling(pred_candidate):
+                                    # and fusable - it's a candidate
+                                    pred_pass_list.append(pred_candidate)
 
         if not pred_pass_list:
             return ABORT_SEARCH
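The nested guards above read in sequence; an equivalent flattened predicate
(a sketch only, reusing vela's names such as PassPlacement and
avoid_for_spilling, not a proposed change) makes the fusion conditions
easier to scan:

    def is_fusion_candidate(self, ps, ifm_tensor, pred_candidate):
        # All conditions from the loop body above, combined with `and`;
        # `not ifm_tensor.avoid_NHCWB16` is the new producer-side veto.
        return (
            not ifm_tensor.avoid_NHCWB16
            and len(pred_candidate.successors) == 1
            and pred_candidate.successors[0] == ps
            and pred_candidate.placement == PassPlacement.Npu
            and pred_candidate.npu_block_type in self.ifm_stream_npu_blocks
            and not self.avoid_for_spilling(pred_candidate)
        )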
@@ -953,12 +955,15 @@ class DynamicProgrammingScheduler:
             if output.purpose != TensorPurpose.FeatureMap:
                 continue
 
-            use_NHCWB16 = True
-            for op in output.consumer_list:
-                if op is None or op.type == "Reshape":
-                    use_NHCWB16 = False
-                else:
-                    use_NHCWB16 &= op.run_on_npu
+            use_NHCWB16 = not output.avoid_NHCWB16
+
+            if use_NHCWB16:
+                # Check consumers, to see if NHCWB16 can be used in the output
+                for op in output.consumer_list:
+                    if op is None or op.type == "Reshape":
+                        use_NHCWB16 = False
+                    else:
+                        use_NHCWB16 &= op.run_on_npu
 
             if use_NHCWB16:
                 output.set_format(TensorFormat.NHCWB16, arch)
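To illustrate the consumer scan with stand-in objects (SimpleNamespace here
replaces vela's operation objects): any missing consumer, any Reshape, or any
CPU-placed consumer forces the output back to a linear format.

    from types import SimpleNamespace as Op

    consumers = [Op(type="Conv2D", run_on_npu=True), Op(type="Reshape", run_on_npu=True)]
    use_NHCWB16 = True  # producer did not set avoid_NHCWB16
    for op in consumers:
        if op is None or op.type == "Reshape":
            use_NHCWB16 = False
        else:
            use_NHCWB16 &= op.run_on_npu
    print(use_NHCWB16)  # -> False: the Reshape consumer rules out NHCWB16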
diff --git a/ethosu/vela/tensor.py b/ethosu/vela/tensor.py
index 35749709..ecca0e0e 100644
--- a/ethosu/vela/tensor.py
+++ b/ethosu/vela/tensor.py
@@ -300,6 +300,7 @@ class Tensor:
         "npu_tensor",
         "equivalence_id",
         "resampling_mode",
+        "avoid_NHCWB16",
     )
 
     AllocationQuantum = 16
@@ -346,6 +347,8 @@ class Tensor:
         self.block_traversal = TensorBlockTraversal.Default
         self.resampling_mode = resampling_mode.NONE
 
+        self.avoid_NHCWB16 = False
+
     def element_size(self):
         if self.element_size_bytes == 0:
             return self.dtype.size_in_bits() / 8
@@ -380,6 +383,7 @@ class Tensor:
         res.resampling_mode = self.resampling_mode
         res.copy_compressed_weight_info(self)
+        res.avoid_NHCWB16 = self.avoid_NHCWB16
 
         return res
 
     def clone_into_fast_storage(self, arch):
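The tensor.py hunks follow the usual pattern for a __slots__ class: declare
the field, default it in __init__, and copy it explicitly in clone(). A
minimal standalone sketch of that pattern (illustrative names, not vela's
full Tensor):

    class MiniTensor:
        __slots__ = ("name", "avoid_NHCWB16")

        def __init__(self, name):
            self.name = name
            self.avoid_NHCWB16 = False  # default: NHCWB16 permitted

        def clone(self):
            res = MiniTensor(self.name)
            # __slots__ classes have no __dict__, so each new field must be
            # copied explicitly; omitting this would silently reset the flag.
            res.avoid_NHCWB16 = self.avoid_NHCWB16
            return res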