From 347c57bb88c1286bcd1c2775e7c67296410e2e6d Mon Sep 17 00:00:00 2001
From: Johan Alfven
Date: Mon, 3 Apr 2023 15:29:13 +0200
Subject: MLBEDSW-7442: Removed ofm quantization for ArgMax

- Quantization for the OFM was added for the ArgMax operator as a
  workaround to avoid a crash in the weight compressor. This
  quantization is now removed.
- The weight compressor expects that all tensors have a quantization.
  Updated code to use scale = 1.0 and zero point = 0 for tensors
  without a quantization.

Change-Id: I6816dce2db55f7d795d19f88d7fbe7ee419347fc
Signed-off-by: Johan Alfven
---
 ethosu/vela/tensor.py                 |  6 ++++--
 ethosu/vela/tflite_graph_optimiser.py |  2 --
 ethosu/vela/weight_compressor.py      | 29 ++++++++++++++++++++++-------
 3 files changed, 26 insertions(+), 11 deletions(-)

diff --git a/ethosu/vela/tensor.py b/ethosu/vela/tensor.py
index 51c7592e..9ba6ab77 100644
--- a/ethosu/vela/tensor.py
+++ b/ethosu/vela/tensor.py
@@ -229,6 +229,8 @@ class QuantizationParameters:
         max: Union[float, np.ndarray, None] = None,
         num_bits=None,
         narrow_range=None,
+        scale_f32: Union[float, np.ndarray, None] = None,
+        zero_point: Union[int, np.ndarray, None] = None,
     ):
         self.min = min
         self.max = max
@@ -240,8 +242,8 @@ class QuantizationParameters:
         # natural rounding to perform rounding away from zero. This only affects the ofm scale and bias tensor, it has
         # no affect on global scaling i.e. the ofm_scale register
         self.next_after = False
-        self.scale_f32: Union[float, np.ndarray, None] = None
-        self.zero_point: Union[int, np.ndarray, None] = None
+        self.scale_f32: Union[float, np.ndarray, None] = scale_f32
+        self.zero_point: Union[int, np.ndarray, None] = zero_point
         self.quant_min: Optional[float] = None
         self.quant_max: Optional[float] = None
         self.quant_dim: Optional[int] = None
diff --git a/ethosu/vela/tflite_graph_optimiser.py b/ethosu/vela/tflite_graph_optimiser.py
index e0c7fd2c..5b0e2fb3 100644
--- a/ethosu/vela/tflite_graph_optimiser.py
+++ b/ethosu/vela/tflite_graph_optimiser.py
@@ -501,8 +501,6 @@ def convert_argmax_to_depthwise_conv_and_max_pool(op, arch, nng):
         identity_quant = QuantizationParameters()
         identity_quant.zero_point = 0
         identity_quant.scale_f32 = 1.0
-        if ofm.quantization is None:
-            ofm.quantization = identity_quant
         # Add last dimension to ofm shape
         ofm.shape += [1]
         ofm.ops = []
diff --git a/ethosu/vela/weight_compressor.py b/ethosu/vela/weight_compressor.py
index ab22e94f..a37ff6af 100644
--- a/ethosu/vela/weight_compressor.py
+++ b/ethosu/vela/weight_compressor.py
@@ -34,6 +34,7 @@ from .operation import NpuBlockType
 from .operation import Op
 from .scaling import quantise_scale
 from .scaling import reduced_quantise_scale
+from .tensor import QuantizationParameters
 from .tensor import Tensor
 from .tensor import TensorFormat
 from .tensor import TensorPurpose
@@ -235,6 +236,20 @@ def core_deinterleave(hwio, core, ncores):
     return ohwi[core : ohwi.shape[0] : ncores]
 
 
+def _get_input_quantization(op):
+    quant = op.get_input_quantization()
+    if not quant:
+        quant = QuantizationParameters(scale_f32=1.0, zero_point=0)
+    return quant
+
+
+def _get_output_quantization(op):
+    quant = op.get_output_quantization()
+    if not quant:
+        quant = QuantizationParameters(scale_f32=1.0, zero_point=0)
+    return quant
+
+
 def _prepare_scale_and_bias(arch, tens, rescale_for_faf, explicit_scaling):
     assert tens.purpose in [TensorPurpose.FeatureMap, TensorPurpose.FSBias]
     assert tens.format == TensorFormat.NHWC
@@ -250,14 +265,14 @@ def _prepare_scale_and_bias(arch, tens, rescale_for_faf, explicit_scaling):
 
     first_consumer_op = tens.consumer_list[0]
     ifm_dtype = first_consumer_op.inputs[0].dtype
-    ifm_scale = first_consumer_op.get_input_quantization().scale_f32
-    ofm_scale = first_consumer_op.get_output_quantization().scale_f32
+    ifm_scale = _get_input_quantization(first_consumer_op).scale_f32
+    ofm_scale = _get_output_quantization(first_consumer_op).scale_f32
     weight_scales = first_consumer_op.inputs[1].quantization.scale_f32
 
     # biases can have multiple consumers for rnn cells. if so, then check that they are all the same
     for op in tens.consumer_list[1:]:
-        assert ifm_scale == op.get_input_quantization().scale_f32
-        assert ofm_scale == op.get_output_quantization().scale_f32
+        assert ifm_scale == _get_input_quantization(op).scale_f32
+        assert ofm_scale == _get_output_quantization(op).scale_f32
         assert weight_scales == op.inputs[1].quantization.scale_f32
 
     if not hasattr(weight_scales, "__iter__"):
@@ -298,7 +313,7 @@ def _prepare_scale_and_bias(arch, tens, rescale_for_faf, explicit_scaling):
         quantised_scales = [quantise_scale(scale) for scale in scales]
 
     # Check the output quantisation to see if the scale value needs increasing to the next one
-    if first_consumer_op.get_output_quantization().next_after:
+    if _get_output_quantization(first_consumer_op).next_after:
         for i, quant_scale in enumerate(quantised_scales):
             q_scale, q_shift = quant_scale
             quantised_scales[i] = (q_scale + 1, q_shift)
@@ -315,8 +330,8 @@ def encode_weight_and_scale_tensor(
 ) -> Tuple[Optional[NpuWeightTensor], Optional[NpuWeightTensor]]:
     npu_block_type = op.type.npu_block_type
 
-    ifm_scale = scale_tens and scale_tens.consumer_list[0].get_input_quantization().scale_f32
-    ofm_scale = scale_tens and scale_tens.consumer_list[0].get_output_quantization().scale_f32
+    ifm_scale = scale_tens and _get_input_quantization(scale_tens.consumer_list[0]).scale_f32
+    ofm_scale = scale_tens and _get_output_quantization(scale_tens.consumer_list[0]).scale_f32
 
     wcc = create_weight_compression_config(
         weight_tens, npu_block_type, block_config.ofm_block.depth, hash(str(depth_offsets)), kernel.dilation
-- 
cgit v1.2.1
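
Editor's note: the fallback pattern introduced by the _get_input_quantization / _get_output_quantization
helpers above can be illustrated with a minimal standalone sketch, not part of the patch.
QuantizationParameters below is a simplified stand-in for the class in ethosu/vela/tensor.py, and FakeOp
is a hypothetical operator used only to show the call pattern. When an operator reports no quantization,
the helper substitutes scale_f32 = 1.0 and zero_point = 0, which is what allows the weight compressor to
handle the now-unquantized ArgMax OFM without crashing.

    from dataclasses import dataclass
    from typing import Optional


    @dataclass
    class QuantizationParameters:
        # Simplified stand-in for ethosu.vela.tensor.QuantizationParameters
        scale_f32: Optional[float] = None
        zero_point: Optional[int] = None


    def _get_input_quantization(op):
        # Same shape as the helper added in weight_compressor.py: use the op's
        # own quantization if present, otherwise a neutral 1.0 / 0 default.
        quant = op.get_input_quantization()
        if not quant:
            quant = QuantizationParameters(scale_f32=1.0, zero_point=0)
        return quant


    class FakeOp:
        # Hypothetical operator, only to demonstrate the call pattern.
        def __init__(self, quant=None):
            self._quant = quant

        def get_input_quantization(self):
            return self._quant


    print(_get_input_quantization(FakeOp()).scale_f32)  # 1.0 (fallback used)
    print(_get_input_quantization(FakeOp(QuantizationParameters(0.5, 3))).scale_f32)  # 0.5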