author    Johan Alfven <johan.alfven@arm.com>  2023-04-03 15:29:13 +0200
committer Johan Alfven <johan.alfven@arm.com>  2023-04-04 15:44:15 +0200
commit    347c57bb88c1286bcd1c2775e7c67296410e2e6d (patch)
tree      98c209e597be597b67853bfc0ee50c255dac1370
parent    56811e6d3c62ae017f6eb298fb553f7d1e77cc96 (diff)
download  ethos-u-vela-347c57bb88c1286bcd1c2775e7c67296410e2e6d.tar.gz
MLBEDSW-7442: Removed ofm quantization for ArgMax
- Quantization for the OFM was added for the ArgMax operator as a workaround
  in order to avoid a crash in the weight compressor. This quantization is
  now removed.
- The weight compressor expects that all tensors have a quantization. Updated
  the code to use scale = 1.0 and zero point = 0 for tensors without
  quantization.

Change-Id: I6816dce2db55f7d795d19f88d7fbe7ee419347fc
Signed-off-by: Johan Alfven <johan.alfven@arm.com>
-rw-r--r--  ethosu/vela/tensor.py                  |  6
-rw-r--r--  ethosu/vela/tflite_graph_optimiser.py  |  2
-rw-r--r--  ethosu/vela/weight_compressor.py       | 29
3 files changed, 26 insertions, 11 deletions
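
The sketch below illustrates the fallback pattern this change introduces: a
missing quantization is treated as scale 1.0 / zero point 0 inside the weight
compressor instead of being attached to the OFM by the graph optimiser. It
mirrors the _get_input_quantization/_get_output_quantization helpers added in
weight_compressor.py further down; the helper name quantization_or_identity is
hypothetical and used only for illustration.

    from ethosu.vela.tensor import QuantizationParameters

    def quantization_or_identity(quant):
        # The weight compressor expects every tensor to carry a quantization;
        # fall back to unit scale and zero offset when none is present.
        if not quant:
            quant = QuantizationParameters(scale_f32=1.0, zero_point=0)
        return quant
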
diff --git a/ethosu/vela/tensor.py b/ethosu/vela/tensor.py
index 51c7592e..9ba6ab77 100644
--- a/ethosu/vela/tensor.py
+++ b/ethosu/vela/tensor.py
@@ -229,6 +229,8 @@ class QuantizationParameters:
max: Union[float, np.ndarray, None] = None,
num_bits=None,
narrow_range=None,
+ scale_f32: Union[float, np.ndarray, None] = None,
+ zero_point: Union[int, np.ndarray, None] = None,
):
self.min = min
self.max = max
@@ -240,8 +242,8 @@ class QuantizationParameters:
# natural rounding to perform rounding away from zero. This only affects the ofm scale and bias tensor, it has
# no affect on global scaling i.e. the ofm_scale register
self.next_after = False
- self.scale_f32: Union[float, np.ndarray, None] = None
- self.zero_point: Union[int, np.ndarray, None] = None
+ self.scale_f32: Union[float, np.ndarray, None] = scale_f32
+ self.zero_point: Union[int, np.ndarray, None] = zero_point
self.quant_min: Optional[float] = None
self.quant_max: Optional[float] = None
self.quant_dim: Optional[int] = None
diff --git a/ethosu/vela/tflite_graph_optimiser.py b/ethosu/vela/tflite_graph_optimiser.py
index e0c7fd2c..5b0e2fb3 100644
--- a/ethosu/vela/tflite_graph_optimiser.py
+++ b/ethosu/vela/tflite_graph_optimiser.py
@@ -501,8 +501,6 @@ def convert_argmax_to_depthwise_conv_and_max_pool(op, arch, nng):
identity_quant = QuantizationParameters()
identity_quant.zero_point = 0
identity_quant.scale_f32 = 1.0
- if ofm.quantization is None:
- ofm.quantization = identity_quant
# Add last dimension to ofm shape
ofm.shape += [1]
ofm.ops = []
diff --git a/ethosu/vela/weight_compressor.py b/ethosu/vela/weight_compressor.py
index ab22e94f..a37ff6af 100644
--- a/ethosu/vela/weight_compressor.py
+++ b/ethosu/vela/weight_compressor.py
@@ -34,6 +34,7 @@ from .operation import NpuBlockType
from .operation import Op
from .scaling import quantise_scale
from .scaling import reduced_quantise_scale
+from .tensor import QuantizationParameters
from .tensor import Tensor
from .tensor import TensorFormat
from .tensor import TensorPurpose
@@ -235,6 +236,20 @@ def core_deinterleave(hwio, core, ncores):
return ohwi[core : ohwi.shape[0] : ncores]
+def _get_input_quantization(op):
+ quant = op.get_input_quantization()
+ if not quant:
+ quant = QuantizationParameters(scale_f32=1.0, zero_point=0)
+ return quant
+
+
+def _get_output_quantization(op):
+ quant = op.get_output_quantization()
+ if not quant:
+ quant = QuantizationParameters(scale_f32=1.0, zero_point=0)
+ return quant
+
+
def _prepare_scale_and_bias(arch, tens, rescale_for_faf, explicit_scaling):
assert tens.purpose in [TensorPurpose.FeatureMap, TensorPurpose.FSBias]
assert tens.format == TensorFormat.NHWC
@@ -250,14 +265,14 @@ def _prepare_scale_and_bias(arch, tens, rescale_for_faf, explicit_scaling):
first_consumer_op = tens.consumer_list[0]
ifm_dtype = first_consumer_op.inputs[0].dtype
- ifm_scale = first_consumer_op.get_input_quantization().scale_f32
- ofm_scale = first_consumer_op.get_output_quantization().scale_f32
+ ifm_scale = _get_input_quantization(first_consumer_op).scale_f32
+ ofm_scale = _get_output_quantization(first_consumer_op).scale_f32
weight_scales = first_consumer_op.inputs[1].quantization.scale_f32
# biases can have multiple consumers for rnn cells. if so, then check that they are all the same
for op in tens.consumer_list[1:]:
- assert ifm_scale == op.get_input_quantization().scale_f32
- assert ofm_scale == op.get_output_quantization().scale_f32
+ assert ifm_scale == _get_input_quantization(op).scale_f32
+ assert ofm_scale == _get_output_quantization(op).scale_f32
assert weight_scales == op.inputs[1].quantization.scale_f32
if not hasattr(weight_scales, "__iter__"):
@@ -298,7 +313,7 @@ def _prepare_scale_and_bias(arch, tens, rescale_for_faf, explicit_scaling):
quantised_scales = [quantise_scale(scale) for scale in scales]
# Check the output quantisation to see if the scale value needs increasing to the next one
- if first_consumer_op.get_output_quantization().next_after:
+ if _get_output_quantization(first_consumer_op).next_after:
for i, quant_scale in enumerate(quantised_scales):
q_scale, q_shift = quant_scale
quantised_scales[i] = (q_scale + 1, q_shift)
@@ -315,8 +330,8 @@ def encode_weight_and_scale_tensor(
) -> Tuple[Optional[NpuWeightTensor], Optional[NpuWeightTensor]]:
npu_block_type = op.type.npu_block_type
- ifm_scale = scale_tens and scale_tens.consumer_list[0].get_input_quantization().scale_f32
- ofm_scale = scale_tens and scale_tens.consumer_list[0].get_output_quantization().scale_f32
+ ifm_scale = scale_tens and _get_input_quantization(scale_tens.consumer_list[0]).scale_f32
+ ofm_scale = scale_tens and _get_output_quantization(scale_tens.consumer_list[0]).scale_f32
wcc = create_weight_compression_config(
weight_tens, npu_block_type, block_config.ofm_block.depth, hash(str(depth_offsets)), kernel.dilation