aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Diqing Zhong <diqing.zhong@arm.com> 2020-09-24 09:53:48 +0200
committer patrik.gustavsson <patrik.gustavsson@arm.com> 2020-10-21 12:09:45 +0000
commit e8887a3e6ed6638b06ecac9581deaaa89b8059c0 (patch)
tree 9bdb11e1e1318a81d14dda1c00b4acf37089d2e8
parent 17afa2837ad366f2da32e2bc0e2659ebb35bd1d5 (diff)
download ethos-u-vela-e8887a3e6ed6638b06ecac9581deaaa89b8059c0.tar.gz
MLBEDSW-603: Improve cycle estimation in elementwise ops
Signed-off-by: Diqing Zhong <diqing.zhong@arm.com>
Change-Id: I9f3671041c2b1497519cf42b5f52e3cd01d9c10a
(cherry picked from commit e8c989f5236cce12d07a6644329935dbbf0ee8e6)
-rw-r--r-- ethosu/vela/architecture_features.py 36
-rw-r--r-- ethosu/vela/npu_performance.py 72
2 files changed, 93 insertions, 15 deletions
diff --git a/ethosu/vela/architecture_features.py b/ethosu/vela/architecture_features.py
index 3ef4d1bf..04c1c62e 100644
--- a/ethosu/vela/architecture_features.py
+++ b/ethosu/vela/architecture_features.py
@@ -136,15 +136,14 @@ class Accelerator(enum.Enum):
class ArchitectureFeatures:
"""This class is a container for various parameters of the Ethos-U55 core
-and system configuration that can be tuned, either by command line
-parameters or by the Ethos-U55 architects. The class is often passed
-around to passes that need to do architecture-dependent actions.
+ and system configuration that can be tuned, either by command line
+ parameters or by the Ethos-U55 architects. The class is often passed
+ around to passes that need to do architecture-dependent actions.
-Note the difference between ArchitectureFeatures and CompilerOptions
-- ArchitectureFeatures is for changing the Ethos-U55 and system architecture
-- CompilerOptions is for changing the behaviour of the compiler
-
-"""
+ Note the difference between ArchitectureFeatures and CompilerOptions
+ - ArchitectureFeatures is for changing the Ethos-U55 and system architecture
+ - CompilerOptions is for changing the behaviour of the compiler
+ """
ArchitectureConfig = namedtuple(
"ArchitectureConfig", "macs cores ofm_ublock ifm_ublock shram_banks shram_granules elem_units"
@@ -239,6 +238,9 @@ Note the difference between ArchitectureFeatures and CompilerOptions
self.memory_bandwidths_per_second = self.memory_bandwidths_per_cycle * self.npu_clock
+ # Get output/activation performance numbers
+ self._generate_output_perf_tables(self.accelerator_config)
+
# sizes as N x H x W x C. we need to round up to these when allocating storage
self.storage_rounding_quantums = {
TensorFormat.Unknown: (1, 1, 1, 1),
@@ -374,6 +376,24 @@ Note the difference between ArchitectureFeatures and CompilerOptions
key = ArchitectureFeatures.make_block_config_key(w, h, c)
self.block_config_map[key] = self.generate_block_config(w, h, c)
+ def _generate_output_perf_tables(self, accel_config):  # Set the two per-element cycle-cost tables on self for the given accelerator config
+ if accel_config == Accelerator.Ethos_U55_32:
+ self.output_cycles_per_elem = (2.0, 3.0, 3.0, 3.0, 4.0, 6.0, 1.0, 2.0)  # 8 entries; indexed by output_perf_index (0..7) in npu_performance.get_output_cycle_estimate
+ self.activation_cycles_per_elem = (1.0, 1.0, 0.0)  # 3 entries; indexed by activation_perf_index: 0 = Sigmoid/Tanh/LUT, 1 = Relu/Relu6/ReluN1To1, 2 = anything else
+ elif accel_config == Accelerator.Ethos_U55_64:
+ self.output_cycles_per_elem = (1.0, 1.5, 1.5, 1.5, 2.0, 3.0, 0.5, 1.0)  # each entry is half of the Ethos_U55_32 entry
+ self.activation_cycles_per_elem = (1.0, 1.0, 0.0)  # same as Ethos_U55_32
+ elif accel_config == Accelerator.Ethos_U55_128:
+ self.output_cycles_per_elem = (0.75, 1.25, 0.75, 0.75, 1.0, 1.5, 0.25, 0.5)
+ self.activation_cycles_per_elem = (1.0, 0.5, 0.0)
+ elif accel_config in (Accelerator.Ethos_U55_256, Accelerator.Yoda_256):  # both 256 configs share one set of tables
+ self.output_cycles_per_elem = (0.625, 1.125, 0.5, 0.375, 0.5, 0.75, 0.125, 0.25)
+ self.activation_cycles_per_elem = (1.0, 0.25, 0.0)
+ else:
+ assert accel_config == Accelerator.Yoda_512  # NOTE(review): asserts are removed under python -O, so an unrecognised config would then silently get the Yoda_512 tables
+ self.output_cycles_per_elem = (0.3125, 0.5625, 0.25, 0.1875, 0.25, 0.375, 0.0625, 0.125)  # each entry is half of the 256-config entry
+ self.activation_cycles_per_elem = (0.5, 0.125, 0.0)
+
def calc_ifm_block_depth(self, ifm_depth, ifm_bits):
assert ifm_bits in (8, 16, 32)
assert ifm_depth > 0
diff --git a/ethosu/vela/npu_performance.py b/ethosu/vela/npu_performance.py
index fc148f38..e71e95b1 100644
--- a/ethosu/vela/npu_performance.py
+++ b/ethosu/vela/npu_performance.py
@@ -25,9 +25,12 @@ import numpy as np
from . import numeric_util
from .architecture_features import Block
+from .architecture_features import SHRAMElements
+from .data_type import DataType
from .nn_graph import PassPlacement
from .nn_graph import SchedulerRewrite
from .operation import NpuBlockType
+from .operation import Op
from .register_command_stream_generator import get_op_kernel
from .tensor import MemArea
from .tensor import shape_num_elements
@@ -210,6 +213,66 @@ def get_n_blocks_and_area(
return total_blocks, total_area, block_setup
+def get_output_cycle_estimate(arch, ps):  # Return (elements in ps.outputs[0]) * (larger of the output and activation per-element costs looked up on arch)
+ primary_op = ps.primary_op
+ assert primary_op  # NOTE(review): removed under python -O; a missing primary_op would then fail on the attribute access below
+ npu_block_type = primary_op.type.npu_block_type
+ faf = primary_op.activation  # activation attached to the primary op; selects activation_perf_index further down
+
+ if npu_block_type == NpuBlockType.ElementWise and ps.ifm_tensor.dtype == DataType.int32:  # int32-input elementwise ops use table slots 0/1
+ if ps.ifm2_tensor is None:
+ # Unary op
+ output_perf_index = 0
+ else:
+ # Binary op
+ output_perf_index = 1
+ elif ps.primary_op.type == Op.Mul and ps.ofm_tensor.dtype == DataType.int32:  # Mul with int32 output; int32-input elementwise Mul was already caught above
+ output_perf_index = 2
+ elif ps.primary_op.type == Op.Mul or (  # any remaining Mul, or one of the listed block types when the accumulator element is Acc40
+ npu_block_type
+ in (
+ NpuBlockType.ConvolutionMxN,
+ NpuBlockType.ConvolutionDepthWise,
+ NpuBlockType.Pooling,
+ NpuBlockType.ReduceSum,
+ NpuBlockType.VectorProduct,
+ )
+ and ps.shared_buffer.use_accumulator_element == SHRAMElements.Acc40
+ ):
+ output_perf_index = 3
+ elif ps.primary_op.type in (Op.Add, Op.Sub):
+ input_scale = ps.ifm_tensor.quantization.scale_f32
+ input2_scale = ps.ifm2_tensor.quantization.scale_f32  # NOTE(review): assumes Add/Sub always has a second input here; raises AttributeError if ifm2_tensor is None - confirm
+ output_scale = ps.ofm_tensor.quantization.scale_f32
+
+ if "resizebilinear" in primary_op.attrs:
+ output_scale = input2_scale  # output_scale is only consulted by the None check below, so this only affects that check
+
+ if None in (input_scale, input2_scale, output_scale) or input_scale == input2_scale:  # any scale missing, or both input scales equal
+ # Simple Add/Sub
+ output_perf_index = 4
+ else:
+ # Advanced Add/Sub
+ output_perf_index = 5
+ elif ps.primary_op.type.is_maxpool_op():  # reached only by max-pool ops not already matched by the Acc40 branch above
+ output_perf_index = 6
+ else:
+ output_perf_index = 7  # fallback slot for every op not matched above
+
+ if faf in (Op.Sigmoid, Op.Tanh, Op.LUT):
+ activation_perf_index = 0
+ elif faf in (Op.Relu, Op.Relu6, Op.ReluN1To1):
+ activation_perf_index = 1
+ else:
+ activation_perf_index = 2  # every other value of faf, including no activation
+
+ num_elems = ps.outputs[0].elements()  # only the first output of the pass is counted
+ cycle_per_elem = max(  # the larger of the two per-element costs is used, not their sum
+ arch.output_cycles_per_elem[output_perf_index], arch.activation_cycles_per_elem[activation_perf_index]
+ )
+ return num_elems * cycle_per_elem  # may be fractional: the table entries are floats
+
+
def performance_metrics_for_pass(arch, ps, block_config=None, rewrite_list=[], force_outputs_to_fast_storage=False):
if block_config is None:
block_config = ps.block_config
@@ -385,14 +448,9 @@ def performance_metrics_for_pass(arch, ps, block_config=None, rewrite_list=[], f
replacement_read_bws[weight_tensor] = weight_tensor.bandwidth() * non_zero_fraction
ifm_read_multiple = 1
weight_read_multiple = non_zero_fraction
- else:
- if ps.placement == PassPlacement.Npu and len(ps.outputs):
- # Assume element-wise operation going through the element pipelines.
+ elif npu_block_type == NpuBlockType.ElementWise:
# Work out how many elements we have and calculate performance.
- out = ps.outputs[0]
- elms = out.elements()
-
- cycles[PassCycles.ElementWise] = numeric_util.round_up_divide(elms, arch.num_elem_wise_units)
+ cycles[PassCycles.ElementWise] = get_output_cycle_estimate(arch, ps)
# apply the desired rewrites
for rewrite_op, tens, _, _, _, ps_to_rewrite in rewrite_list: