From e8887a3e6ed6638b06ecac9581deaaa89b8059c0 Mon Sep 17 00:00:00 2001 From: Diqing Zhong Date: Thu, 24 Sep 2020 09:53:48 +0200 Subject: MLBEDSW-603: Improve cycle estimation in elementwise ops Signed-off-by: Diqing Zhong Change-Id: I9f3671041c2b1497519cf42b5f52e3cd01d9c10a (cherry picked from commit e8c989f5236cce12d07a6644329935dbbf0ee8e6) --- ethosu/vela/architecture_features.py | 36 ++++++++++++++---- ethosu/vela/npu_performance.py | 72 ++++++++++++++++++++++++++++++++---- 2 files changed, 93 insertions(+), 15 deletions(-) diff --git a/ethosu/vela/architecture_features.py b/ethosu/vela/architecture_features.py index 3ef4d1bf..04c1c62e 100644 --- a/ethosu/vela/architecture_features.py +++ b/ethosu/vela/architecture_features.py @@ -136,15 +136,14 @@ class Accelerator(enum.Enum): class ArchitectureFeatures: """This class is a container for various parameters of the Ethos-U55 core -and system configuration that can be tuned, either by command line -parameters or by the Ethos-U55 architects. The class is often passed -around to passes that need to do architecture-dependent actions. + and system configuration that can be tuned, either by command line + parameters or by the Ethos-U55 architects. The class is often passed + around to passes that need to do architecture-dependent actions. -Note the difference between ArchitectureFeatures and CompilerOptions -- ArchitectureFeatures is for changing the Ethos-U55 and system architecture -- CompilerOptions is for changing the behaviour of the compiler - -""" + Note the difference between ArchitectureFeatures and CompilerOptions + - ArchitectureFeatures is for changing the Ethos-U55 and system architecture + - CompilerOptions is for changing the behaviour of the compiler + """ ArchitectureConfig = namedtuple( "ArchitectureConfig", "macs cores ofm_ublock ifm_ublock shram_banks shram_granules elem_units" @@ -239,6 +238,9 @@ Note the difference between ArchitectureFeatures and CompilerOptions self.memory_bandwidths_per_second = self.memory_bandwidths_per_cycle * self.npu_clock + # Get output/activation performance numbers + self._generate_output_perf_tables(self.accelerator_config) + # sizes as N x H x W x C. we need to round up to these when allocating storage self.storage_rounding_quantums = { TensorFormat.Unknown: (1, 1, 1, 1), @@ -374,6 +376,24 @@ Note the difference between ArchitectureFeatures and CompilerOptions key = ArchitectureFeatures.make_block_config_key(w, h, c) self.block_config_map[key] = self.generate_block_config(w, h, c) + def _generate_output_perf_tables(self, accel_config): + if accel_config == Accelerator.Ethos_U55_32: + self.output_cycles_per_elem = (2.0, 3.0, 3.0, 3.0, 4.0, 6.0, 1.0, 2.0) + self.activation_cycles_per_elem = (1.0, 1.0, 0.0) + elif accel_config == Accelerator.Ethos_U55_64: + self.output_cycles_per_elem = (1.0, 1.5, 1.5, 1.5, 2.0, 3.0, 0.5, 1.0) + self.activation_cycles_per_elem = (1.0, 1.0, 0.0) + elif accel_config == Accelerator.Ethos_U55_128: + self.output_cycles_per_elem = (0.75, 1.25, 0.75, 0.75, 1.0, 1.5, 0.25, 0.5) + self.activation_cycles_per_elem = (1.0, 0.5, 0.0) + elif accel_config in (Accelerator.Ethos_U55_256, Accelerator.Yoda_256): + self.output_cycles_per_elem = (0.625, 1.125, 0.5, 0.375, 0.5, 0.75, 0.125, 0.25) + self.activation_cycles_per_elem = (1.0, 0.25, 0.0) + else: + assert accel_config == Accelerator.Yoda_512 + self.output_cycles_per_elem = (0.3125, 0.5625, 0.25, 0.1875, 0.25, 0.375, 0.0625, 0.125) + self.activation_cycles_per_elem = (0.5, 0.125, 0.0) + def calc_ifm_block_depth(self, ifm_depth, ifm_bits): assert ifm_bits in (8, 16, 32) assert ifm_depth > 0 diff --git a/ethosu/vela/npu_performance.py b/ethosu/vela/npu_performance.py index fc148f38..e71e95b1 100644 --- a/ethosu/vela/npu_performance.py +++ b/ethosu/vela/npu_performance.py @@ -25,9 +25,12 @@ import numpy as np from . import numeric_util from .architecture_features import Block +from .architecture_features import SHRAMElements +from .data_type import DataType from .nn_graph import PassPlacement from .nn_graph import SchedulerRewrite from .operation import NpuBlockType +from .operation import Op from .register_command_stream_generator import get_op_kernel from .tensor import MemArea from .tensor import shape_num_elements @@ -210,6 +213,66 @@ def get_n_blocks_and_area( return total_blocks, total_area, block_setup +def get_output_cycle_estimate(arch, ps): + primary_op = ps.primary_op + assert primary_op + npu_block_type = primary_op.type.npu_block_type + faf = primary_op.activation + + if npu_block_type == NpuBlockType.ElementWise and ps.ifm_tensor.dtype == DataType.int32: + if ps.ifm2_tensor is None: + # Unary op + output_perf_index = 0 + else: + # Binary op + output_perf_index = 1 + elif ps.primary_op.type == Op.Mul and ps.ofm_tensor.dtype == DataType.int32: + output_perf_index = 2 + elif ps.primary_op.type == Op.Mul or ( + npu_block_type + in ( + NpuBlockType.ConvolutionMxN, + NpuBlockType.ConvolutionDepthWise, + NpuBlockType.Pooling, + NpuBlockType.ReduceSum, + NpuBlockType.VectorProduct, + ) + and ps.shared_buffer.use_accumulator_element == SHRAMElements.Acc40 + ): + output_perf_index = 3 + elif ps.primary_op.type in (Op.Add, Op.Sub): + input_scale = ps.ifm_tensor.quantization.scale_f32 + input2_scale = ps.ifm2_tensor.quantization.scale_f32 + output_scale = ps.ofm_tensor.quantization.scale_f32 + + if "resizebilinear" in primary_op.attrs: + output_scale = input2_scale + + if None in (input_scale, input2_scale, output_scale) or input_scale == input2_scale: + # Simple Add/Sub + output_perf_index = 4 + else: + # Advanced Add/Sub + output_perf_index = 5 + elif ps.primary_op.type.is_maxpool_op(): + output_perf_index = 6 + else: + output_perf_index = 7 + + if faf in (Op.Sigmoid, Op.Tanh, Op.LUT): + activation_perf_index = 0 + elif faf in (Op.Relu, Op.Relu6, Op.ReluN1To1): + activation_perf_index = 1 + else: + activation_perf_index = 2 + + num_elems = ps.outputs[0].elements() + cycle_per_elem = max( + arch.output_cycles_per_elem[output_perf_index], arch.activation_cycles_per_elem[activation_perf_index] + ) + return num_elems * cycle_per_elem + + def performance_metrics_for_pass(arch, ps, block_config=None, rewrite_list=[], force_outputs_to_fast_storage=False): if block_config is None: block_config = ps.block_config @@ -385,14 +448,9 @@ def performance_metrics_for_pass(arch, ps, block_config=None, rewrite_list=[], f replacement_read_bws[weight_tensor] = weight_tensor.bandwidth() * non_zero_fraction ifm_read_multiple = 1 weight_read_multiple = non_zero_fraction - else: - if ps.placement == PassPlacement.Npu and len(ps.outputs): - # Assume element-wise operation going through the element pipelines. + elif npu_block_type == NpuBlockType.ElementWise: # Work out how many elements we have and calculate performance. - out = ps.outputs[0] - elms = out.elements() - - cycles[PassCycles.ElementWise] = numeric_util.round_up_divide(elms, arch.num_elem_wise_units) + cycles[PassCycles.ElementWise] = get_output_cycle_estimate(arch, ps) # apply the desired rewrites for rewrite_op, tens, _, _, _, ps_to_rewrite in rewrite_list: -- cgit v1.2.1