author     Raul Farkas <raul.farkas@arm.com>   2023-03-16 16:38:05 +0000
committer  Raul Farkas <raul.farkas@arm.com>   2023-03-27 16:35:56 +0100
commit     72c6a2414205e033279f80b622cdf479c05a4f5b (patch)
tree       35dedce67cedd2fe5533cf0beb2942a7f31199e3
parent     430002df36f79d035e31e8304fb8b176129cd3cc (diff)
download   ethos-u-vela-72c6a2414205e033279f80b622cdf479c05a4f5b.tar.gz
MLBEDSW-6343: Remove op_index constraint
Remove the op_index constraint and force linear format for all Conv2D ops
that have strides that can be optimised.

Change-Id: Idef3508ab074ea9abeacac030eaaa15a00ad1211
Signed-off-by: Raul Farkas <raul.farkas@arm.com>
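In effect the patch replaces the boolean needs_linear_format with a tri-state
force_linear_format (None = no decision yet, True = linear format enforced,
False = NHCWB16 explicitly allowed), and the new use_linear_format property
treats the undecided case conservatively as linear. A minimal sketch of that
behaviour, using a hypothetical stand-in class rather than the real
ethosu/vela Tensor:

from typing import Optional


class FormatSketch:
    """Hypothetical stand-in for the format flags this patch adds to Tensor."""

    def __init__(self) -> None:
        # None: no decision yet, True: linear format is enforced,
        # False: the non-linear NHCWB16 format is explicitly allowed.
        self.force_linear_format: Optional[bool] = None

    @property
    def use_linear_format(self) -> bool:
        """Linear format is used unless NHCWB16 was explicitly allowed."""
        return self.force_linear_format in (True, None)


tens = FormatSketch()
assert tens.use_linear_format         # undecided, so stays linear
tens.force_linear_format = False      # check_format_restrictions() found no blocker
assert not tens.use_linear_format     # the scheduler may now pick NHCWB16
tens.force_linear_format = True       # e.g. fixup_strided_conv() forces a linear IFM
assert tens.use_linear_format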
-rw-r--r--  ethosu/vela/extract_npu_subgraphs.py     4
-rw-r--r--  ethosu/vela/graph_optimiser_util.py      7
-rw-r--r--  ethosu/vela/scheduler.py                10
-rw-r--r--  ethosu/vela/tensor.py                   12
-rw-r--r--  ethosu/vela/tflite_graph_optimiser.py   21
5 files changed, 33 insertions, 21 deletions
diff --git a/ethosu/vela/extract_npu_subgraphs.py b/ethosu/vela/extract_npu_subgraphs.py
index bf637b83..5e9a5b53 100644
--- a/ethosu/vela/extract_npu_subgraphs.py
+++ b/ethosu/vela/extract_npu_subgraphs.py
@@ -1,4 +1,4 @@
-# SPDX-FileCopyrightText: Copyright 2020-2022 Arm Limited and/or its affiliates <open-source-office@arm.com>
+# SPDX-FileCopyrightText: Copyright 2020-2023 Arm Limited and/or its affiliates <open-source-office@arm.com>
#
# SPDX-License-Identifier: Apache-2.0
#
@@ -250,7 +250,7 @@ def extract_subgraph(nng, orig_sg, arch):
for tens in curr_sg.output_tensors:
# ofm can depend on multiple ops. These ops can be divided into different NPU
# nodes due to CPU nodes. If that is the case the ofm must be NHWC.
- tens.needs_linear_format = True
+ tens.force_linear_format = True
return new_subgraphs
diff --git a/ethosu/vela/graph_optimiser_util.py b/ethosu/vela/graph_optimiser_util.py
index 8b24eaf9..e1341d82 100644
--- a/ethosu/vela/graph_optimiser_util.py
+++ b/ethosu/vela/graph_optimiser_util.py
@@ -30,6 +30,7 @@ from .operation import Op
from .shape4d import Shape4D
from .tensor import create_const_tensor
from .tensor import QuantizationParameters
+from .tensor import Tensor
memory_only_ops = (
Op.Reshape,
@@ -90,7 +91,9 @@ def _avoid_nhcwb16_for_memory_only(tens):
# Check if non linear format can be used
-def check_format_restrictions(tens, arch):
+def check_format_restrictions(tens: Tensor, arch):
+ if tens.force_linear_format:
+ return
if len(tens.ops) < 1:
return
if tens.ops[0].type in (Op.Placeholder, Op.SubgraphInput, Op.Const) or any(
@@ -161,7 +164,7 @@ def check_format_restrictions(tens, arch):
else:
return
- tens.needs_linear_format = False
+ tens.force_linear_format = False
def calc_explicit_padding(input_size, stride, filter_size, pad_before, pad_after) -> Tuple[int, int]:
diff --git a/ethosu/vela/scheduler.py b/ethosu/vela/scheduler.py
index 83e19bc6..6fcb6c1d 100644
--- a/ethosu/vela/scheduler.py
+++ b/ethosu/vela/scheduler.py
@@ -467,11 +467,11 @@ class Scheduler:
if output in self.sg.output_tensors or output.purpose != TensorPurpose.FeatureMap:
continue
- if output.needs_linear_format:
+ if output.use_linear_format:
continue
if self.avoid_nhcwb16_for_ofm(output, ps, arch):
- output.needs_linear_format = True
+ output.force_linear_format = True
continue
output.set_format(TensorFormat.NHCWB16, arch)
@@ -504,11 +504,11 @@ class Scheduler:
if ps.ofm_tensor in self.sg.output_tensors:
# This Op produces a subgraph output
op.requires_full_ofm = True
- if ps.ifm_tensor.needs_linear_format:
+ if ps.ifm_tensor.use_linear_format:
op.requires_full_ifm = True
- if ps.ifm2_tensor and ps.ifm2_tensor.needs_linear_format:
+ if ps.ifm2_tensor and ps.ifm2_tensor.use_linear_format:
op.requires_full_ifm2 = True
- if ps.ofm_tensor.needs_linear_format or ps.primary_op.memory_function == Op.ConcatSliceWrite:
+ if ps.ofm_tensor.use_linear_format or ps.primary_op.memory_function == Op.ConcatSliceWrite:
op.requires_full_ofm = True
if len(ps.primary_op.outputs) > 1 or len(ps.primary_op.outputs[0].consumer_list) > 1:
# Op has multiple outputs or consumers - requires full OFM
diff --git a/ethosu/vela/tensor.py b/ethosu/vela/tensor.py
index 86306cad..51c7592e 100644
--- a/ethosu/vela/tensor.py
+++ b/ethosu/vela/tensor.py
@@ -397,7 +397,7 @@ class Tensor:
"block_traversal",
"equivalence_id",
"src_tensor",
- "needs_linear_format",
+ "force_linear_format",
"ifm_write_protected",
)
AllocationQuantum = 16
@@ -444,13 +444,19 @@ class Tensor:
self.quantization: Optional[QuantizationParameters] = None
self.block_traversal: TensorBlockTraversal = TensorBlockTraversal.Default
- self.needs_linear_format = True
+ # Keep track of whether the linear format should be enforced
+ self.force_linear_format: Optional[bool] = None
self.ifm_write_protected = False
# Reference to parent-tensor if this tensor is a clone
self.src_tensor: Optional[Tensor] = None
@property
+ def use_linear_format(self) -> bool:
+ """Return whether the tensor should use linear format or not."""
+ return self.force_linear_format in (True, None)
+
+ @property
def original_shape(self):
return self._original_shape
@@ -545,7 +551,7 @@ class Tensor:
if shape_len > 4:
return
- assert not (self.needs_linear_format and fmt == TensorFormat.NHCWB16)
+ assert not (self.use_linear_format and fmt == TensorFormat.NHCWB16)
self.storage_rounding_quantum = arch.storage_rounding_quantums[self.format]
self.storage_rounding_quantum = tuple(self.storage_rounding_quantum[-shape_len:])
self.brick_size = arch.brick_sizes[self.format]
diff --git a/ethosu/vela/tflite_graph_optimiser.py b/ethosu/vela/tflite_graph_optimiser.py
index 44f5d6ae..518b6db0 100644
--- a/ethosu/vela/tflite_graph_optimiser.py
+++ b/ethosu/vela/tflite_graph_optimiser.py
@@ -942,19 +942,21 @@ def reorder_depthwise_weights(op, arch, nng):
return op
-def fixup_strided_conv(op, arch, nng):
+def fixup_strided_conv(op: Operation, arch, nng):
+ """Optimize or fixup strided Conv2DBias
+ Optimization:
+ Reduce, when possible, the Conv2DBias stride from 2 to 1 by re-shaping
+ both IFM and filter.
+
+ Fixup:
+ Introduce software support for Conv2DBias with stride_width = 4 by
+ reducing it to 1 when possible by re-shaping both IFM and filter.
+ """
if op.type != Op.Conv2DBias:
return op
stride_x, stride_y = op.get_kernel_stride()
weight_tensor = op.weights
ifm_shape = op.ifm_shapes[0]
-
- # Do not optimize if op is not the first in the network and stride is
- # supported by the hardware
- if op.op_index != 0 and stride_x < 4:
- return op
- op.ifm.needs_linear_format = True
-
if (
(stride_x == 2 or stride_x == 4)
and ifm_shape.depth <= 4
@@ -1004,6 +1006,7 @@ def fixup_strided_conv(op, arch, nng):
stride_x = 1
op.attrs.update({"stride_w": stride_x, "stride_h": stride_y, "strides": (1, stride_y, stride_x, 1)})
+ op.ifm.force_linear_format = True
return op
@@ -2125,7 +2128,6 @@ def tflite_optimise_graph(nng, arch, force_symmetric_int_weights):
convert_prelu,
convert_mul_max_to_abs_or_lrelu,
convert_lrelu,
- fixup_strided_conv,
convert_hardswish_to_lut,
rewrite_fully_connected_input,
convert_batched_fc_shape,
@@ -2139,6 +2141,7 @@ def tflite_optimise_graph(nng, arch, force_symmetric_int_weights):
convert_tanh_sigmoid_to_lut,
replace_pad_by_hw_pad,
fixup_dilation_gt2,
+ fixup_strided_conv,
]
for idx, sg in enumerate(nng.subgraphs):
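
For reference, the optimisation described in the new fixup_strided_conv
docstring rests on a standard equivalence: folding pairs of IFM columns into
the channel dimension and flattening the filter width the same way turns a
stride_w = 2 convolution into a stride_w = 1 convolution. The numpy sketch
below checks that equivalence for the simplest case (kernel width 2, even IFM
width, no padding). It is an illustration under those assumptions only; the
real pass operates on the vela graph, handles padding and other kernel widths,
and also covers stride_w = 4. The helper conv2d_valid is invented for this
sketch.

import numpy as np


def conv2d_valid(x, w, stride_y, stride_x):
    """Naive valid-padding convolution: x is (H, W, C), w is (kh, kw, C, O)."""
    H, W, C = x.shape
    kh, kw, _, O = w.shape
    oh = (H - kh) // stride_y + 1
    ow = (W - kw) // stride_x + 1
    out = np.zeros((oh, ow, O))
    for i in range(oh):
        for j in range(ow):
            patch = x[i * stride_y:i * stride_y + kh, j * stride_x:j * stride_x + kw, :]
            out[i, j] = np.tensordot(patch, w, axes=([0, 1, 2], [0, 1, 2]))
    return out


rng = np.random.default_rng(0)
H, W, C, O, kh = 6, 8, 3, 4, 3
ifm = rng.standard_normal((H, W, C))
weights = rng.standard_normal((kh, 2, C, O))   # kernel width 2

# Reference: stride_w = 2, the case the pass wants to avoid.
ref = conv2d_valid(ifm, weights, stride_y=1, stride_x=2)

# Re-shaped form: fold pairs of IFM columns into the channel dimension and
# flatten the kernel width in the same order, then run with stride_w = 1.
ifm_folded = ifm.reshape(H, W // 2, 2 * C)
weights_folded = weights.reshape(kh, 1, 2 * C, O)
opt = conv2d_valid(ifm_folded, weights_folded, stride_y=1, stride_x=1)

assert np.allclose(ref, opt)   # same result, stride_w reduced from 2 to 1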