From b9fc33c194036973273604d5fd7af9e814133238 Mon Sep 17 00:00:00 2001
From: Louis Verhaard
Date: Thu, 13 Aug 2020 11:47:36 +0200
Subject: MLBEDSW-2688: LeakyRelu rewrite to LUT or MUL/MAX

Replaces LeakyRelu operations with a LUT activation function when
possible, otherwise with a combination of multiplication/maximization.

Signed-off-by: Louis Verhaard
Change-Id: I3d2eb2dba7145997c3cc711d0ef18ab355fbb416
---
 ethosu/vela/graph_optimiser.py | 159 ++++++++++++++++++++++++++++++++++++++++-
 ethosu/vela/lut.py             |  16 +++++
 ethosu/vela/mark_tensors.py    |   2 +-
 ethosu/vela/tensor.py          |   3 +
 4 files changed, 176 insertions(+), 4 deletions(-)

diff --git a/ethosu/vela/graph_optimiser.py b/ethosu/vela/graph_optimiser.py
index 78c0dcd4..8d920d83 100644
--- a/ethosu/vela/graph_optimiser.py
+++ b/ethosu/vela/graph_optimiser.py
@@ -20,6 +20,7 @@ import math
 
 import numpy as np
 
+from . import lut
 from . import rewrite_graph
 from .data_type import DataType
 from .errors import UnsupportedFeatureError
@@ -585,6 +586,12 @@ def convert_mul_max_to_abs_or_lrelu(op, arch):
         # make sure the Mul doesn't have a faf
         if mul.attrs["fused_activation_function"]:
             return op
+        ifm, _, _, ofm = op.get_ifm_weights_biases_ofm()
+        if ifm.dtype not in (DataType.uint8, DataType.int8) or ifm.dtype != ofm.dtype:
+            return op
+        if not ifm.is_scaling_equal(ofm):
+            # rewrite to LeakyRelu currently only makes sense if the quantization is identical
+            return op
 
         # finds the branched input that goes to both the Max and the Mul
         shared = set(op.inputs) & set(mul.inputs)
@@ -599,6 +606,8 @@ def convert_mul_max_to_abs_or_lrelu(op, arch):
             # check that it is a constant
             if const.type != "Const":
                 return op
+            # Remove the Mul from the shared input's consumers
+            shared_in.consumer_list.remove(mul)
         else:
             return op
 
@@ -618,6 +627,147 @@ def convert_mul_max_to_abs_or_lrelu(op, arch):
     return op
 
 
+def convert_lrelu_to_mul_max(op, arch):
+    # Converts LeakyRelu to Max(alpha * IFM, identity * IFM)
+    # (the opposite of convert_mul_max_to_abs_or_lrelu)
+    ifm, _, _, ofm = op.get_ifm_weights_biases_ofm()
+
+    # Add multiplication with alpha
+    mul_alpha = Operation("MulAct", op.name + "_mul_alpha")
+    mul_alpha.add_input_tensor(ifm)
+    # Create const tensor containing alpha as scalar
+    alpha = op.attrs["alpha"]
+    quantization = ifm.quantization.clone()
+    quantization.min = 0
+    quantization.max = alpha * (quantization.quant_max - quantization.quant_min)
+    quantization.scale_f32 = alpha
+    quantization.zero_point = 0
+    alpha_tens = create_const_tensor(op.name + "_alpha_scalar", [], ifm.dtype, [1], np.int8, quantization=quantization)
+    mul_alpha.add_input_tensor(alpha_tens)
+    fm_alpha = ofm.clone(op.name + "_alpha")
+    mul_alpha.set_output_tensor(fm_alpha)
+
+    if ifm.is_scaling_equal(ofm):
+        # No identity multiplication is needed
+        fm_id = ifm
+    else:
+        # Add multiplication with identity
+        mul_identity = Operation("MulAct", op.name + "_mul_identity")
+        mul_identity.add_input_tensor(ifm)
+        # Create const tensor containing identity as scalar
+        quantization = ifm.quantization.clone()
+        quantization.min = 0
+        quantization.max = quantization.quant_max - quantization.quant_min
+        quantization.scale_f32 = 1
+        quantization.zero_point = 0
+        identity_tens = create_const_tensor(
+            op.name + "_id_scalar", [], ifm.dtype, [1], np.uint8, quantization=quantization
+        )
+        mul_identity.add_input_tensor(identity_tens)
+        fm_id = ofm.clone(op.name + "_id")
+        mul_identity.set_output_tensor(fm_id)
+
+    # Convert LeakyRelu to Max, add the results of the multiplication(s) as inputs
op.type = "Maximum" + op.name = op.name.replace("LeakyRelu", "Maximum") + op.inputs = [] + ifm.consumer_list.remove(op) + op.add_input_tensor(fm_alpha) + op.add_input_tensor(fm_id) + return op + + +def convert_lrelu_to_lut(op, arch): + ifm, _, _, ofm = op.get_ifm_weights_biases_ofm() + # Rewrite LeakyRelu by Add with scalar 0 + LUT activation + op.type = "AddAct" + op.name = op.name + "_add" + op.attrs.update({"npu_block_type": NpuBlockType.ElementWise}) + # Mark as no-op to enable potential fusing optimizations + op.attrs["is_nop"] = True + # Create an input tensor containing scalar zero + quantization = QuantizationParameters(0.0, 255.0) + quantization.scale_f32 = 1.0 + quantization.zero_point = 0 + tens = create_const_tensor(op.inputs[0].name + "_add", [], ifm.dtype, [0], np.uint8, quantization=quantization) + op.add_input_tensor(tens) + alpha = op.attrs["alpha"] + zp = ofm.quantization.zero_point + # Generate the LUT + if ifm.dtype.size_in_bytes() == 1: + dtype = DataType.int8 + ix = range(256) if ifm.dtype == DataType.uint8 else range(-128, 128) + values = [int(x) if x >= zp else int(round(zp - alpha * (zp - x))) for x in ix] + else: + # int16 + dtype = DataType.int32 + values = [] + for ix in range(512): + x = (ix - 256) * 128 + if x >= zp: + base = x + slope = 128 + else: + base = int(round(zp - alpha * (zp - x))) + next_base = int(round(zp - alpha * (zp - (x + 127)))) + slope = int(round(128 * (next_base - base) / 127)) + value = ((slope << 16) & 0xFFFF0000) + (base & 0xFFFF) + values.append(value) + lut_tensor = lut.create_lut_tensor(op.name + "_lut", values, dtype) + op.set_activation_lut(lut_tensor) + return op + + +def convert_lrelu(op, arch): + # Converts LeakyRelu to a LUT based solution if possible, otherwise a mul + max + if op.type != "LeakyRelu": + return op + ifm, _, _, ofm = op.get_ifm_weights_biases_ofm() + use_lut = (ifm.is_scaling_equal(ofm)) and (ifm.dtype == ofm.dtype) and ifm.dtype in (DataType.uint8, DataType.int8) + if use_lut: + return convert_lrelu_to_lut(op, arch) + return convert_lrelu_to_mul_max(op, arch) + + +def fuse_activation_function_with_prev(op, arch): + # if op is a no-op: attempts to move the activation function to the preceding op + if not op.attrs.get("is_nop", False) or op.attrs.get("fused_activation_function", None) is None: + return op + ifm, _, _, ofm = op.get_ifm_weights_biases_ofm() + # finds the input(s) to the operation + prev_op = ifm.ops[0] + # Note: the below checks on prev_op require that a first optimize pass on the full graph has been performed + fuse = ( + prev_op.run_on_npu + and prev_op.attrs["npu_block_type"] != NpuBlockType.Default + and len(ifm.ops) == 1 + and len(prev_op.outputs[0].consumers()) == 1 + and prev_op.attrs.get("fused_activation_function", None) is None + and ifm.is_scaling_equal(ofm) + ) + if op.activation_lut is not None and arch.shram_reserved_unused_banks == 0: + # TODO: if SHRAM LUT space is shared with SHRAM ACC (32, 64 MAC), + # LUT currently only works correctly for elementwise ops + fuse = False + if fuse and op.activation_lut is not None: + # Check if LUT can be used with prev_op + prev_ifm, prev_ifm2, _, _ = prev_op.get_ifm_ifm2_weights_ofm() + fuse = prev_ifm is not None and prev_ifm.quantization is not None and prev_ifm.is_scaling_equal(ifm) + if prev_ifm2 is not None: + fuse = fuse and prev_ifm2.quantization is not None and prev_ifm2.is_scaling_equal(ifm) + if not fuse: + return op + # Move the fused activation function + corresponding info to prev_op + for attr in ("fused_activation_function", 
"alpha"): + if attr in op.attrs: + prev_op.attrs[attr] = op.attrs[attr] + if op.activation_lut is not None: + prev_op.set_activation_lut(op.activation_lut) + # Bypass op + prev_op.set_output_tensor(op.outputs[0]) + return op + + def add_attrs_to_resizebilinear(op, arch): if op.type == "ResizeBilinear" and op.run_on_npu: input_tensor = op.inputs[0] @@ -679,7 +829,8 @@ def optimise_graph_a(nng, arch, verbose_graph=False): reorder_depthwise_weights, fixup_resizebilinear, add_bias_tensor, - # convert_mul_max_to_abs_or_lrelu # TODO: enable optimisation once quantisation issues are resolved + convert_mul_max_to_abs_or_lrelu, + convert_lrelu, ] for idx, sg in enumerate(nng.subgraphs): @@ -689,8 +840,10 @@ def optimise_graph_a(nng, arch, verbose_graph=False): ) for idx, sg in enumerate(nng.subgraphs): - # remove passthrough tensors - nng.subgraphs[idx] = rewrite_graph.rewrite_graph_pre_order(sg, arch, [remove_passthrough_tensor], []) + # remove passthrough tensors and attempt further optimizations + nng.subgraphs[idx] = rewrite_graph.rewrite_graph_pre_order( + sg, arch, [remove_passthrough_tensor], [fuse_activation_function_with_prev] + ) if verbose_graph: nng.print_graph() diff --git a/ethosu/vela/lut.py b/ethosu/vela/lut.py index 39101fac..0e8dcc95 100644 --- a/ethosu/vela/lut.py +++ b/ethosu/vela/lut.py @@ -18,8 +18,11 @@ import uuid from functools import lru_cache +import numpy as np + from . import numeric_util from .high_level_command_stream import CommandType +from .tensor import create_const_tensor from .tensor import TensorPurpose @@ -85,6 +88,19 @@ def get_lut_index(arch, lut_tensor): return slot +def create_lut_tensor(name, values, dtype): + # Creates constant LUT tensor with the given values as lookup table. + # The tensor's equivalence_id is based on these values, so if multiple + # LUT tensors are created with identical values, they will get the same + # address in constant memory, and unnecessary DMA operations can be avoided. + sz = len(values) + assert sz in (256, 512) + ntype = np.uint8 if dtype.size_in_bytes() == 1 else np.uint32 + tens = create_const_tensor(name, [1, 1, 1, sz], dtype, values, ntype, TensorPurpose.LUT) + tens.equivalence_id = create_equivalence_id(tuple(values)) + return tens + + def optimize_high_level_cmd_stream(sg, arch): # - Allocates SHRAM address/lut index to LUT tensors # - Removes unnecessary DMA operations of LUT-s that are already present in SHRAM from sg's command stream diff --git a/ethosu/vela/mark_tensors.py b/ethosu/vela/mark_tensors.py index 40ce467b..03ab83fe 100644 --- a/ethosu/vela/mark_tensors.py +++ b/ethosu/vela/mark_tensors.py @@ -284,7 +284,7 @@ def mark_tensor_purpose(nng, arch, verbose_tensor_purpose=False): ) for idx, tens in enumerate(op.inputs): - purpose = input_purpose(op, idx) + purpose = input_purpose(op, idx) if tens.purpose == TensorPurpose.Unknown else tens.purpose mark_tensor_helper(tens, purpose) if op.type == "Reshape": diff --git a/ethosu/vela/tensor.py b/ethosu/vela/tensor.py index 5fdea979..f0e7ea44 100644 --- a/ethosu/vela/tensor.py +++ b/ethosu/vela/tensor.py @@ -728,6 +728,9 @@ class Tensor: return True return False + def is_scaling_equal(self, tens): + return self.quantization.is_scaling_equal(tens.quantization) + def equivalent(self, tens): return self.equivalence_id == tens.equivalence_id -- cgit v1.2.1