Diffstat (limited to 'python/pyarmnn/examples/speech_recognition/audio_utils.py')
-rw-r--r-- | python/pyarmnn/examples/speech_recognition/audio_utils.py | 126
1 file changed, 126 insertions, 0 deletions
diff --git a/python/pyarmnn/examples/speech_recognition/audio_utils.py b/python/pyarmnn/examples/speech_recognition/audio_utils.py
new file mode 100644
index 0000000000..a522a0e2a7
--- /dev/null
+++ b/python/pyarmnn/examples/speech_recognition/audio_utils.py
@@ -0,0 +1,126 @@
+# Copyright © 2020 Arm Ltd and Contributors. All rights reserved.
+# SPDX-License-Identifier: MIT
+
+"""Utilities for speech recognition apps."""
+
+import numpy as np
+import pyarmnn as ann
+
+
+def decode(model_output: np.ndarray, labels: dict) -> str:
+    """Decodes the integer-encoded results from inference into a string.
+
+    Args:
+        model_output: Results from running inference.
+        labels: Dictionary of labels keyed on the classification index.
+
+    Returns:
+        Decoded string.
+    """
+    top1_results = [labels[np.argmax(row[0])] for row in model_output]
+    return filter_characters(top1_results)
+
+
+def filter_characters(results: list) -> str:
+    """Filters unwanted and duplicate characters.
+
+    Args:
+        results: List of top-1 results from inference.
+
+    Returns:
+        Final output string to present to the user.
+    """
+    text = ""
+    for i in range(len(results)):
+        if results[i] == "$":
+            continue
+        elif i + 1 < len(results) and results[i] == results[i + 1]:
+            continue
+        else:
+            text += results[i]
+    return text
+
+
+def display_text(text: str):
+    """Presents the results on the console.
+
+    Args:
+        text: Results of performing ASR on the input audio data.
+    """
+    print(text, sep="", end="", flush=True)
+
+
+def quantize_input(data, input_binding_info):
+    """Quantizes the float input to (u)int8, ready for input to the model."""
+    if data.ndim != 2:
+        raise RuntimeError("Audio data must have 2 dimensions for quantization")
+
+    quant_scale = input_binding_info[1].GetQuantizationScale()
+    quant_offset = input_binding_info[1].GetQuantizationOffset()
+    data_type = input_binding_info[1].GetDataType()
+
+    if data_type == ann.DataType_QAsymmS8:
+        data_type = np.int8
+    elif data_type == ann.DataType_QAsymmU8:
+        data_type = np.uint8
+    else:
+        raise ValueError("Could not quantize data to required data type")
+
+    d_min = np.iinfo(data_type).min
+    d_max = np.iinfo(data_type).max
+
+    for row in range(data.shape[0]):
+        for col in range(data.shape[1]):
+            data[row, col] = (data[row, col] / quant_scale) + quant_offset
+            data[row, col] = np.clip(data[row, col], d_min, d_max)
+    data = data.astype(data_type)
+    return data
+
+
+def decode_text(is_first_window, labels, output_result):
+    """
+    Slices the model output according to the window position and decodes it for wav2letter output.
+        * First run: take the left context and the inner context.
+        * Every other run: take the inner context only.
+    Stores the current right context and updates it on each inference; it is consumed after the last inference.
+
+    Args:
+        is_first_window: Boolean indicating whether this is the first window of inference
+        labels: The label set
+        output_result: The output from the inference
+    Returns:
+        current_r_context: The current right context
+        text: The decoded text for this window, to be appended to the running transcript
+    """
+
+    if is_first_window:
+        # Since it's the first inference, keep the left context and the inner context, and decode
+        text = decode(output_result[0][0:472], labels)
+    else:
+        # Only decode the inner context
+        text = decode(output_result[0][49:472], labels)
+
+    # Store the right context; we will need it after the last inference
+    current_r_context = decode(output_result[0][473:521], labels)
+    return current_r_context, text
+
+
+def prepare_input_tensors(audio_data, input_binding_info, mfcc_preprocessor):
+    """
+    Takes a block of audio data, extracts the MFCC features, quantizes the array, and uses ArmNN to create the
+    input tensors.
+
+    Args:
+        audio_data: The audio data to process
+        input_binding_info: The model input binding info
+        mfcc_preprocessor: The MFCC preprocessor instance
+    Returns:
+        input_tensors: The prepared input tensors, ready to be consumed by the ArmNN NetworkExecutor
+    """
+
+    data_type = input_binding_info[1].GetDataType()
+    input_tensor = mfcc_preprocessor.extract_features(audio_data)
+    if data_type != ann.DataType_Float32:
+        input_tensor = quantize_input(input_tensor, input_binding_info)
+    input_tensors = ann.make_input_tensors([input_binding_info], [input_tensor])
+    return input_tensors
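Note on quantize_input(): it implements standard affine quantization, q = clip(x / scale + offset, type_min, type_max), element by element. A vectorized NumPy equivalent is sketched below; this is illustrative only, not code from the patch. Note that the final cast truncates toward zero rather than rounding, matching the element-wise loop above.

import numpy as np

def quantize_reference(data, scale, offset, dtype):
    # q = clip(x / scale + offset, type_min, type_max), then cast;
    # astype() truncates toward zero, just like the loop in quantize_input()
    info = np.iinfo(dtype)
    return np.clip(data / scale + offset, info.min, info.max).astype(dtype)

x = np.array([[1.0, -2.0], [0.25, 64.0]], dtype=np.float32)
print(quantize_reference(x, scale=0.25, offset=0, dtype=np.int8))
# [[  4  -8]
#  [  1 127]]   <- 64.0 / 0.25 = 256 is clipped to the int8 maximum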
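For reference, a minimal usage sketch of decode() and decode_text() follows. It is not part of the patch: the 29-symbol label map, the import from audio_utils, and the fake one-hot model output are assumptions for illustration. wav2letter emits "$" as the blank symbol, which filter_characters() strips along with repeated characters.

import numpy as np
from audio_utils import decode, decode_text  # assumes audio_utils.py is on the path

# Hypothetical label map: 0-25 -> a-z, 26 -> "'", 27 -> " ", 28 -> "$" (blank)
alphabet = "abcdefghijklmnopqrstuvwxyz' $"
labels = dict(enumerate(alphabet))

def one_hot(char):
    # One (1, n_classes) row per time step, peaked at `char`
    row = np.zeros((1, len(alphabet)), dtype=np.float32)
    row[0, alphabet.index(char)] = 1.0
    return row

# Six time steps: blanks drop and adjacent duplicates collapse, so "hh$ii$" -> "hi"
model_output = np.array([one_hot(c) for c in "hh$ii$"])
print(decode(model_output, labels))  # hi

# decode_text() slices 522 time steps out of a batched output; fill a fake
# output with blanks and place "hey" inside the inner context (steps 49-471)
fake_output = np.zeros((1, 522, 1, len(alphabet)), dtype=np.float32)
fake_output[..., alphabet.index("$")] = 1.0
for t, ch in zip((100, 101, 102), "hey"):
    fake_output[0, t, 0, :] = 0.0
    fake_output[0, t, 0, alphabet.index(ch)] = 1.0

right_context, text = decode_text(True, labels, fake_output)
print(text)  # hey
print(repr(right_context))  # '' -- steps 473-520 are all blank here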