From 880e73543120648f08886365a45e8b2ce32d5ff1 Mon Sep 17 00:00:00 2001
From: Fredrik Svedberg
Date: Tue, 25 Aug 2020 11:31:47 +0200
Subject: [MLBEDSW-2846] Do not use NHCWB16 for reduce_sum int32

Added checks for not using NHCWB16 for reduce_sum int32,
which makes int8/uint8 softmax work. Also enabled the softmax
graph rewrite by default and fixed a saturation problem.

Change-Id: Ic01bd9ece7e5c3edb2900b7915cc747efe9e5760
Signed-off-by: Fredrik Svedberg
---
 ethosu/vela/architecture_features.py |  3 +--
 ethosu/vela/scheduler.py             |  3 ++-
 ethosu/vela/softmax.py               | 24 ++++++++++++++----------
 ethosu/vela/supported_operators.py   |  6 +-----
 ethosu/vela/test/testutil.py         |  1 -
 ethosu/vela/vela.py                  |  9 +--------
 6 files changed, 19 insertions(+), 27 deletions(-)

diff --git a/ethosu/vela/architecture_features.py b/ethosu/vela/architecture_features.py
index 8b968a3e..fd0e5c06 100644
--- a/ethosu/vela/architecture_features.py
+++ b/ethosu/vela/architecture_features.py
@@ -183,7 +183,6 @@ Note the difference between ArchitectureFeatures and CompilerOptions
         block_config_limit,
         global_memory_clock_scale,
         max_blockdep,
-        softmax_support,
         weight_estimation_scaling,
     ):
         accelerator_config = accelerator_config.lower()
@@ -332,7 +331,7 @@ Note the difference between ArchitectureFeatures and CompilerOptions
         self.generate_block_config_map(Block(ifm_block_max.width, ifm_block_max.height, 128))

         # Setup supported operators and restriction checkers class
-        self.supported_operators = SupportedOperators(softmax_support)
+        self.supported_operators = SupportedOperators()

     # Returns available number of SHRAM banks depending on activation lookup table
     # being used or not
diff --git a/ethosu/vela/scheduler.py b/ethosu/vela/scheduler.py
index 9b492f01..41902d67 100644
--- a/ethosu/vela/scheduler.py
+++ b/ethosu/vela/scheduler.py
@@ -24,6 +24,7 @@ import numpy as np
 from . import live_range
 from . import npu_performance
 from . import stats_writer
+from .data_type import DataType
 from .high_level_command_stream_generator import calc_allowed_ofm_ifm_overlap_for_pass_list
 from .nn_graph import CascadedPass
 from .nn_graph import PassPlacement
@@ -963,7 +964,7 @@ class DynamicProgrammingScheduler:
             use_NHCWB16 = True
             rewrites = []
             for op in output.consumer_list:
-                if op is None:
+                if op is None or (op.type == "ReduceSum" and output.dtype == DataType.int32):
                     use_NHCWB16 = False
                 elif op.type == "Reshape":
                     # Detect no-op reshapes by comparing their full input and output tensor shapes.
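The scheduler hunk above carries the core of the fix: a tensor whose consumer
is a ReduceSum operating on int32 data must stay in NHWC, since NHCWB16 is not
usable for that case. A minimal sketch of the gating predicate follows; the Op
and FeatureMap classes are hypothetical stand-ins for Vela's Operation and
Tensor types, and the real loop also handles Reshape and other consumers not
shown here:

    from dataclasses import dataclass, field
    from typing import List, Optional

    # Hypothetical stand-ins for Vela's Operation and Tensor classes,
    # reduced to the fields the check actually reads.
    @dataclass
    class Op:
        type: str

    @dataclass
    class FeatureMap:
        dtype: str  # simplified; Vela uses a DataType object
        consumer_list: List[Optional[Op]] = field(default_factory=list)

    def may_use_nhcwb16(fm: FeatureMap) -> bool:
        # Mirrors the patched condition: NHCWB16 is rejected as soon as any
        # consumer is unknown (None) or is a ReduceSum reading int32 data.
        for op in fm.consumer_list:
            if op is None or (op.type == "ReduceSum" and fm.dtype == "int32"):
                return False
        return True

    # The int32 intermediates created by the softmax rewrite feed ReduceSum,
    # so they keep plain NHWC:
    assert not may_use_nhcwb16(FeatureMap("int32", [Op("ReduceSum")]))
    # Ordinary int8 feature maps are unaffected:
    assert may_use_nhcwb16(FeatureMap("int8", [Op("Conv2DBiasAct")]))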
diff --git a/ethosu/vela/softmax.py b/ethosu/vela/softmax.py
index eb97c792..7c23f472 100644
--- a/ethosu/vela/softmax.py
+++ b/ethosu/vela/softmax.py
@@ -391,7 +391,9 @@ class SoftMax:
         F2_one = create_const_tensor(
             "F2_one_const", [1, 1, 1, 1], DataType.int32, [(1 << 29)], np.int32, quantization=no_scale_quant
         )
-        two = create_const_tensor("two_const", [1, 1, 1, 1], DataType.int32, [2], np.int32, quantization=no_scale_quant)
+        four = create_const_tensor(
+            "four_const", [1, 1, 1, 1], DataType.int32, [4], np.int32, quantization=no_scale_quant
+        )
         for i in range(3):
             # PASS 13, 18, 23 - MUL
             mul_op = Operation("MulAct", self.op.name + "_mul%d" % (13 + i * 5))
@@ -416,10 +418,10 @@
             to_rescale.quantization = one_scale_quant.clone()
             to_rescale.quantization.scale_f32 = 2.0
             mul_op.set_output_tensor(to_rescale)
-            # PASS 16, 21, 26 - SHL
-            shl_op = Operation("SHL", self.op.name + "_shl%d" % (16 + i * 5))
+            # PASS 16, 21, 26 - MUL
+            shl_op = Operation("MulAct", self.op.name + "_mul%d" % (16 + i * 5))
             shl_op.add_input_tensor(to_rescale)
-            shl_op.add_input_tensor(two)
+            shl_op.add_input_tensor(four)
             to_add = Tensor(reduce_sum_shape, DataType.int32, shl_op.name + "_0")
             to_add.quantization = no_scale_quant
             shl_op.set_output_tensor(to_add)
@@ -431,13 +433,15 @@
             nr_x.quantization = one_scale_quant
             add_op.set_output_tensor(nr_x)

-        # PASS 28 - SHL
-        shl28_op = Operation("SHL", self.op.name + "_shl28")
-        shl28_op.add_input_tensor(nr_x)
-        shl28_op.add_input_tensor(one)
-        scale_factor = Tensor(reduce_sum_shape, DataType.int32, shl28_op.name + "_0")
+        # PASS 28 - Multiply
+        mul28_op = Operation("MulAct", self.op.name + "_mul28")
+        mul28_op.add_input_tensor(nr_x)
+        mul28_op.add_input_tensor(
+            create_const_tensor("two_const", [1, 1, 1, 1], DataType.int32, [2], np.int32, quantization=no_scale_quant)
+        )
+        scale_factor = Tensor(reduce_sum_shape, DataType.int32, mul28_op.name + "_0")
         scale_factor.quantization = one_scale_quant
-        shl28_op.set_output_tensor(scale_factor)
+        mul28_op.set_output_tensor(scale_factor)

         # PASS 29 - Multiply
         mul_op = Operation("MulAct", self.op.name + "_mul29")
diff --git a/ethosu/vela/supported_operators.py b/ethosu/vela/supported_operators.py
index 567c05ca..f57cbee2 100644
--- a/ethosu/vela/supported_operators.py
+++ b/ethosu/vela/supported_operators.py
@@ -22,8 +22,7 @@ from .data_type import DataType


 class SupportedOperators:
-    def __init__(self, softmax_support):
-        self.softmax_support = softmax_support
+    def __init__(self):
         # Categorised lists of supported operators
         self.npu_pre_ops = set(("QuantizedResizeBilinear", "SplitSliceRead",))
         self.convolution_ops = set(("Conv2DBiasAct", "Conv2D", "QuantizedConv2D",))
@@ -393,9 +392,6 @@

     def check_activation_ops(self, op):
         if op.type == "Softmax":
-            if not self.softmax_support:
-                return False
-
             ifm_tensor = op.inputs[0]
             ofm_tensor = op.outputs[0]

diff --git a/ethosu/vela/test/testutil.py b/ethosu/vela/test/testutil.py
index 68866fc7..fb6ca591 100644
--- a/ethosu/vela/test/testutil.py
+++ b/ethosu/vela/test/testutil.py
@@ -37,7 +37,6 @@ def create_arch():
         block_config_limit=None,
         global_memory_clock_scale=1.0,
         max_blockdep=0,
-        softmax_support=True,
         weight_estimation_scaling=1.0,
     )

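The softmax.py hunks above replace the SHL passes with MulAct multiplications
by constants of the same power of two (shift by 2 becomes multiply by 4, shift
by 1 becomes multiply by 2). The computed value is unchanged; judging by the
commit message, the point is that the elementwise multiply path saturates on
int32 overflow where the shift wraps. That reading is an assumption, not
something the patch states. A small numpy illustration of the numerical
difference (not Vela code):

    import numpy as np

    INT32_MAX = np.iinfo(np.int32).max
    x = np.int32(0x60000000)  # 1610612736, more than half of INT32_MAX

    # Doubling by shifting left wraps around on int32 overflow:
    wrapped = np.left_shift(x, 1)  # -1073741824: the sign bit flipped

    # Doubling by a saturating multiply clamps at the int32 limit instead:
    saturated = np.int32(min(int(x) * 2, INT32_MAX))  # 2147483647

    print(wrapped, saturated)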
diff --git a/ethosu/vela/vela.py b/ethosu/vela/vela.py
index 19080926..91899c28 100644
--- a/ethosu/vela/vela.py
+++ b/ethosu/vela/vela.py
@@ -253,19 +253,13 @@ def main(args=None):
         choices=[True, False],
         help="Control if NHCWB16 or NHWC should be used in between cascaded passes (default: %(default)s)",
     )
-    parser.add_argument(
-        "--softmax-support",
-        type=ast.literal_eval,
-        default=False,
-        choices=[True, False],
-        help="Control if Softmax should be transformed into a set of npu operations (default: %(default)s)",
-    )
     parser.add_argument(
         "--weight-estimation-scaling",
         type=float,
         default=1.0,
         help=("Performs an additional scaling of weight compression scale estimate (default: %(default)s)"),
     )
+
     args = parser.parse_args(args=args)

     # Read configuration file
@@ -295,7 +289,6 @@ def main(args=None):
         block_config_limit=args.block_config_limit,
         global_memory_clock_scale=args.global_memory_clock_scale,
         max_blockdep=args.max_block_dependency,
-        softmax_support=args.softmax_support,
         weight_estimation_scaling=args.weight_estimation_scaling,
     )

-- 
cgit v1.2.1