author     Charles Xu <charles.xu@arm.com>    2020-05-13 10:15:26 +0200
committer  Tim Hall <tim.hall@arm.com>        2020-06-18 17:53:52 +0100
commit     78792223369fa34dacd0e69e189af035283da2ae (patch)
tree       ac3826df5528866319fd65d7a99eef8e87cd4084
parent     620d88c60482bad4d96da4d32cc4cca5561cca9e (diff)
download   ethos-u-vela-78792223369fa34dacd0e69e189af035283da2ae.tar.gz
Add elementwise vector scalars support
Write the constant scalars into flash. When a scalar is placed in Dram or
OffChipFlash, DMA it from flash to SRAM.

Signed-off-by: Charles Xu <charles.xu@arm.com>
Change-Id: I42300a05dfe968d623b8aec8549644549e0f54b5
-rw-r--r--  ethosu/vela/graph_optimiser.py                        13
-rw-r--r--  ethosu/vela/high_level_command_stream_generator.py    27
-rw-r--r--  ethosu/vela/insert_dma.py                              5
-rw-r--r--  ethosu/vela/npu_serialisation.py                      30
-rw-r--r--  ethosu/vela/supported_operators.py                     7
5 files changed, 58 insertions(+), 24 deletions(-)
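
For orientation, a minimal sketch of the case this commit enables; the shapes
and int8 quantisation below are illustrative assumptions, not taken from the
patch:

    # A binary elementwise ADD whose second operand is a compile-time constant
    # scalar. Vela now writes the scalar into the flash image and, when it is
    # placed in Dram or OffChipFlash, DMAs it to SRAM before the stripe runs.
    import numpy as np

    ifm = np.zeros((1, 8, 8, 16), dtype=np.int8)  # feature-map operand
    scalar = np.int8(3)                           # constant scalar operand

    ofm = ifm + scalar  # elementwise broadcast, as the NPU computes it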
diff --git a/ethosu/vela/graph_optimiser.py b/ethosu/vela/graph_optimiser.py
index 913b9a6a..351716e0 100644
--- a/ethosu/vela/graph_optimiser.py
+++ b/ethosu/vela/graph_optimiser.py
@@ -25,6 +25,7 @@ from .data_type import DataType
from .operation import NpuBlockType
from .operation import Operation
from .tensor import Tensor
+from .numeric_util import full_shape
passthrough_nodes = set(("Identity",))
@@ -313,6 +314,7 @@ fc_op = set(
depthwise_op = set(("DepthwiseConv2dNative", "DepthwiseConv2dBiasAct",))
pool_op = set(("AvgPool", "MaxPool", "QuantizedAvgPool", "QuantizedMaxPool", "AvgPoolAct", "MaxPoolAct", "ResizeBilinear",))
elementwise_op = set(("AddAct", "MulAct", "SubAct", "Maximum", "Minimum", "LeakyRelu", "Abs"))
+binary_elementwise_op = set(("AddAct", "MulAct", "SubAct", "Maximum", "Minimum"))
activation_ops = set(("Relu", "Relu6", "ReluN1To1", "Sigmoid", "Tanh"))
memory_only_ops = set(("Reshape",))
@@ -399,6 +401,16 @@ def fixup_act_reorder(op, arch):
            op.type = "Identity"
    return op
+def fixup_elementwise_with_scalars(op, arch):
+    if op.type in binary_elementwise_op:
+        ifm_tensor, ifm2_tensor, _, ofm_tensor = op.get_ifm_ifm2_weights_ofm()
+        if ifm2_tensor.shape != [] and ifm_tensor.shape != []:
+            diff = len(ifm_tensor.shape) - len(ifm2_tensor.shape)
+            if diff > 0:
+                ifm2_tensor.shape = full_shape(len(ifm_tensor.shape), ifm2_tensor.shape, 1)
+            elif diff < 0:
+                ifm_tensor.shape = full_shape(len(ifm2_tensor.shape), ifm_tensor.shape, 1)
+    return op
# Set input/output tensor equivalence to the same id for memory operations
def set_tensor_equivalence(op, arch):
@@ -492,6 +504,7 @@ def optimise_graph_a(nng, arch, verbose_graph=False):
        fixup_act_reorder,
        add_padding_fields,
        mark_npu_block_type,
+        fixup_elementwise_with_scalars,
        # convert_mul_max_to_abs_or_lrelu # TODO: enable optimisation once quantisation issues are resolved
    ]
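
The shape fix-up above relies on full_shape from numeric_util; a minimal
re-implementation for illustration (the real helper may differ in edge-case
handling):

    def full_shape(dim, shape, fill):
        # Left-pad `shape` with `fill` until it has `dim` dimensions.
        return [fill] * (dim - len(shape)) + shape

    # A rank-1 scalar operand is padded to the rank of a 4D feature map, so
    # both inputs of the binary elementwise op end up rank-matched:
    assert full_shape(4, [1], 1) == [1, 1, 1, 1]
    assert full_shape(4, [8, 16], 1) == [1, 1, 8, 16]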
diff --git a/ethosu/vela/high_level_command_stream_generator.py b/ethosu/vela/high_level_command_stream_generator.py
index ef21e06c..0cc70a7f 100644
--- a/ethosu/vela/high_level_command_stream_generator.py
+++ b/ethosu/vela/high_level_command_stream_generator.py
@@ -24,17 +24,18 @@ from .high_level_command_stream import NpuStripe
from .nn_graph import PassPlacement
from .nn_graph import SchedulingStrategy
from .operation import NpuBlockType
+from .tensor import TensorPurpose
def need_dma(tens):
    return len(tens.ops) == 1 and tens.ops[0].type == "DMA"

-def dma_weights_if_necessary(ps, box, weight_tensor):
-    if need_dma(weight_tensor):
-        dma_op = weight_tensor.ops[0]
+def dma_if_necessary(ps, box, tensor):
+    if need_dma(tensor):
+        dma_op = tensor.ops[0]
        in_tensor = dma_op.inputs[0]
-        yield DMA(in_tensor, weight_tensor, box)
+        yield DMA(in_tensor, tensor, box)
def generate_high_level_command_stream_for_pass(strat, passes, block_configs, idx):
@@ -115,6 +116,13 @@ def generate_high_level_command_stream_for_pass(strat, passes, block_configs, id
            else:
                ifm2_box = Box([], [])

+            for intermediate in ps.intermediates:
+                if intermediate != None and intermediate.shape != [] and intermediate.purpose == TensorPurpose.FeatureMap:
+                    intermediate_box, _, _ = ofm_box.transform_with_strides_and_skirt(
+                        strides, skirt, intermediate.shape, npu_block_type, concat_axis, concat_offset, split_offsets[0]
+                    )
+                    yield from dma_if_necessary(ps, intermediate_box, intermediate)
+
            weight_box = None
            if weight_tensor is not None:
                weight_oc_start = start
@@ -130,7 +138,7 @@ def generate_high_level_command_stream_for_pass(strat, passes, block_configs, id
                    weight_oc_end,
                    weight_tensor.weight_transpose_depthwise,
                )
-                yield from dma_weights_if_necessary(ps, weight_box, weight_tensor)
+                yield from dma_if_necessary(ps, weight_box, weight_tensor)

            yield NpuStripe(
                ps,
@@ -201,6 +209,13 @@ def generate_high_level_command_stream_for_pass(strat, passes, block_configs, id
                strides, skirt, ifm_tensor.shape, npu_block_type, concat_axis, concat_offset, split_offsets[0], k_height
            )

+            for intermediate in ps.intermediates:
+                if intermediate != None and intermediate.shape != [] and intermediate.purpose == TensorPurpose.FeatureMap:
+                    intermediate_box, _, _ = ofm_box.transform_with_strides_and_skirt(
+                        strides, skirt, intermediate.shape, npu_block_type, concat_axis, concat_offset, split_offsets[0]
+                    )
+                    yield from dma_if_necessary(ps, intermediate_box, intermediate)
+
            ifm_y_needed = 1
            if len(ifm_box.end_coord) >= 3:
                ifm_y_needed = ifm_box.end_coord[-3]
@@ -217,7 +232,7 @@ def generate_high_level_command_stream_for_pass(strat, passes, block_configs, id
                weight_box = Box.make_weight_box(
                    weight_tensor.shape, npu_block_type, weights_transposed=weight_tensor.weight_transpose_depthwise
                )
-                yield from dma_weights_if_necessary(ps, weight_box, weight_tensor)
+                yield from dma_if_necessary(ps, weight_box, weight_tensor)

            # Check if first/last stripe in pass
            is_first_h_stripe = start == y_start
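
dma_if_necessary is a generator, so callers splice its commands into the
high-level command stream with yield from, and tensors that need no DMA
contribute nothing. A toy sketch of that pattern, with plain tuples standing
in for Vela's DMA and NpuStripe command classes:

    def dma_if_necessary(tensor, box):
        if tensor.get("needs_dma"):  # stand-in for need_dma(tensor)
            yield ("DMA", tensor["name"], box)

    def commands_for_stripe(tensors, box):
        for t in tensors:
            yield from dma_if_necessary(t, box)
        yield ("NpuStripe", box)

    cmds = list(commands_for_stripe(
        [{"name": "scalar", "needs_dma": True}, {"name": "ifm"}], (0, 0)))
    assert cmds == [("DMA", "scalar", (0, 0)), ("NpuStripe", (0, 0))]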
diff --git a/ethosu/vela/insert_dma.py b/ethosu/vela/insert_dma.py
index 703ab9d7..b1b89856 100644
--- a/ethosu/vela/insert_dma.py
+++ b/ethosu/vela/insert_dma.py
@@ -21,6 +21,7 @@ from .operation import Operation
from .tensor import MemArea
from .tensor import TensorPurpose
+binary_elementwise_op = set(("AddAct", "MulAct", "SubAct", "Maximum", "Minimum"))
def insert_dma_cmd(op, arch):
    if op.type == "DMA":
@@ -28,7 +29,9 @@ def insert_dma_cmd(op, arch):
    for idx, tens in enumerate(op.inputs):
        if tens.mem_area in (MemArea.Dram, MemArea.OffChipFlash) and tens.mem_area != arch.fast_storage_mem_area:
-            if tens.purpose == TensorPurpose.Weights:
+            if (tens.purpose == TensorPurpose.Weights or
+                    (tens.purpose == TensorPurpose.FeatureMap and
+                     op.type in binary_elementwise_op)):
                only_vector_product_consumers = True
                for oper in tens.consumers():
                    if oper is None or oper.attrs.get("npu_block_type") != NpuBlockType.VectorProduct:
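
Condensed, the test above asks two things of a tensor: that it lives in a
slow memory which is not the configured fast storage, and that it is either
weights or a feature-map input (such as a constant scalar) of a binary
elementwise op. A sketch with strings standing in for Vela's enum values:

    BINARY_ELEMENTWISE_OPS = {"AddAct", "MulAct", "SubAct", "Maximum", "Minimum"}

    def wants_dma_to_fast_storage(purpose, mem_area, op_type, fast_storage="Sram"):
        in_slow_memory = mem_area in ("Dram", "OffChipFlash") and mem_area != fast_storage
        eligible = purpose == "Weights" or (
            purpose == "FeatureMap" and op_type in BINARY_ELEMENTWISE_OPS
        )
        return in_slow_memory and eligible

    assert wants_dma_to_fast_storage("FeatureMap", "OffChipFlash", "AddAct")
    assert not wants_dma_to_fast_storage("FeatureMap", "Dram", "Conv2DBiasAct")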
diff --git a/ethosu/vela/npu_serialisation.py b/ethosu/vela/npu_serialisation.py
index b8ac20f3..0cb40ed0 100644
--- a/ethosu/vela/npu_serialisation.py
+++ b/ethosu/vela/npu_serialisation.py
@@ -46,6 +46,10 @@ def copy_compressed_values_to_memory_tensor(memory_tensor, src_tensor):
        memory_tensor.values[start_addr:end_addr] = compressed_values
        start_addr = end_addr

+def copy_ifm_values_to_memory_tensor(memory_tensor, src_tensor):
+    start_addr = src_tensor.address
+    end_addr = start_addr + src_tensor.quant_values.size
+    memory_tensor.values[start_addr:end_addr] = src_tensor.quant_values
def serialise_npu_subgraph_into_tensors(nng, sg, arch, scratch_tens, flash_tens):
    if sg.placement != PassPlacement.Npu:
@@ -90,16 +94,22 @@ def serialise_npu_subgraph_into_tensors(nng, sg, arch, scratch_tens, flash_tens)
    for cps in sg.cascaded_passes:
        for ps in cps.passes:
-            if ps.placement == PassPlacement.Npu and ps.weight_tensor is not None:
-                # For DMA ops, ps.weight_tensor is referring to the SRAM weight tensor and therefore the address
-                # is pointing at the destination address of where the weights should be placed in SRAM.
-                # This ensures that the Flash weight tensor is used instead and thus gets the correct address.
-                if ps.weight_tensor.ops[0].type == "DMA":
-                    copy_compressed_values_to_memory_tensor(sg.flash_tensor, ps.weight_tensor.ops[0].inputs[0])
-                else:
-                    copy_compressed_values_to_memory_tensor(sg.flash_tensor, ps.weight_tensor)
-
-                copy_compressed_values_to_memory_tensor(sg.flash_tensor, ps.scale_tensor)
+            if ps.placement == PassPlacement.Npu:
+                if ps.weight_tensor != None:
+                    # For DMA ops, ps.weight_tensor is referring to the SRAM weight tensor and therefore the address
+                    # is pointing at the destination address of where the weights should be placed in SRAM.
+                    # This ensures that the Flash weight tensor is used instead and thus gets the correct address.
+                    if ps.weight_tensor.ops[0].type == "DMA":
+                        copy_compressed_values_to_memory_tensor(sg.flash_tensor, ps.weight_tensor.ops[0].inputs[0])
+                    else:
+                        copy_compressed_values_to_memory_tensor(sg.flash_tensor, ps.weight_tensor)
+
+                    copy_compressed_values_to_memory_tensor(sg.flash_tensor, ps.scale_tensor)
+
+                if ps.ifm_tensor != None and ps.ifm_tensor.mem_area != MemArea.Sram:
+                    copy_ifm_values_to_memory_tensor(sg.flash_tensor, ps.ifm_tensor)
+                if ps.ifm2_tensor != None and ps.ifm2_tensor.mem_area != MemArea.Sram:
+                    copy_ifm_values_to_memory_tensor(sg.flash_tensor, ps.ifm2_tensor)
    sg.command_stream_tensor = make_memory_tensor(
        sg.name + "_command_stream", flash_area, command_stream_size_bytes, True, arch
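
copy_ifm_values_to_memory_tensor is a flat, address-indexed copy into the
flash image; a toy numpy equivalent, with the buffer size, address, and value
invented for illustration:

    import numpy as np

    flash_image = np.zeros(64, dtype=np.uint8)    # stand-in for sg.flash_tensor.values
    quant_values = np.array([3], dtype=np.uint8)  # quantised scalar constant
    address = 16                                  # assigned during tensor allocation

    flash_image[address:address + quant_values.size] = quant_values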
diff --git a/ethosu/vela/supported_operators.py b/ethosu/vela/supported_operators.py
index e5271450..574b3a49 100644
--- a/ethosu/vela/supported_operators.py
+++ b/ethosu/vela/supported_operators.py
@@ -229,13 +229,6 @@ class SupportedOperators:
        if op.type in self.binary_elem_wise_main_ops:  # if op type is unary, ifm2_tensor is None
            if len(ifm2_tensor.shape) > 2 and ifm2_tensor.shape[0] != 1:
                return False
-
-        # check scalar size
-        if hasattr(ifm_tensor.values, "__len__") and len(ifm_tensor.values) > 1:
-            return False
-        if op.type in self.binary_elem_wise_main_ops:  # same as above
-            if hasattr(ifm2_tensor.values, "__len__") and len(ifm2_tensor.values) > 1:
-                return False
        return True

    def check_memory_only_restrictions(self, op):