Diffstat (limited to 'python/pyarmnn/examples/speech_recognition/audio_utils.py')
-rw-r--r--  python/pyarmnn/examples/speech_recognition/audio_utils.py  128
1 file changed, 128 insertions(+), 0 deletions(-)
diff --git a/python/pyarmnn/examples/speech_recognition/audio_utils.py b/python/pyarmnn/examples/speech_recognition/audio_utils.py
new file mode 100644
index 0000000000..a522a0e2a7
--- /dev/null
+++ b/python/pyarmnn/examples/speech_recognition/audio_utils.py
@@ -0,0 +1,128 @@
+# Copyright © 2020 Arm Ltd and Contributors. All rights reserved.
+# SPDX-License-Identifier: MIT
+
+"""Utilities for speech recognition apps."""
+
+import numpy as np
+import pyarmnn as ann
+
+
+def decode(model_output: np.ndarray, labels: dict) -> str:
+ """Decodes the integer encoded results from inference into a string.
+
+ Args:
+ model_output: Results from running inference.
+ labels: Dictionary of labels keyed on the classification index.
+
+ Returns:
+ Decoded string.
+ """
+ top1_results = [labels[np.argmax(row[0])] for row in model_output]
+ return filter_characters(top1_results)
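+
+# A minimal usage sketch (illustrative only; `toy_labels` and `toy_output` are
+# made-up stand-ins for the wav2letter label set and inference output, where
+# "$" is the blank symbol):
+# >>> toy_labels = {0: "a", 1: "b", 2: "$"}
+# >>> toy_output = np.array([[[0.1, 0.9, 0.0]],
+# ...                        [[0.1, 0.9, 0.0]],
+# ...                        [[0.8, 0.1, 0.1]]])
+# >>> decode(toy_output, toy_labels)
+# 'ba'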
+
+
+def filter_characters(results: list) -> str:
+ """Filters unwanted and duplicate characters.
+
+    Args:
+        results: List of top-1 results from inference.
+
+    Returns:
+        Final output string to present to the user.
+    """
+ text = ""
+ for i in range(len(results)):
+ if results[i] == "$":
+ continue
+ elif i + 1 < len(results) and results[i] == results[i + 1]:
+ continue
+ else:
+ text += results[i]
+ return text
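+
+# For example (illustrative): repeated characters collapse into one unless they
+# are separated by the "$" blank, which is how genuine doubles survive.
+# >>> filter_characters(["h", "h", "$", "i"])
+# 'hi'
+# >>> filter_characters(["l", "$", "l"])
+# 'll'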
+
+
+def display_text(text: str):
+ """Presents the results on the console.
+
+ Args:
+ text: Results of performing ASR on the input audio data.
+ """
+ print(text, sep="", end="", flush=True)
+
+
+def quantize_input(data, input_binding_info):
+ """Quantize the float input to (u)int8 ready for inputting to model."""
+ if data.ndim != 2:
+ raise RuntimeError("Audio data must have 2 dimensions for quantization")
+
+ quant_scale = input_binding_info[1].GetQuantizationScale()
+ quant_offset = input_binding_info[1].GetQuantizationOffset()
+ data_type = input_binding_info[1].GetDataType()
+
+ if data_type == ann.DataType_QAsymmS8:
+ data_type = np.int8
+ elif data_type == ann.DataType_QAsymmU8:
+ data_type = np.uint8
+ else:
+ raise ValueError("Could not quantize data to required data type")
+
+ d_min = np.iinfo(data_type).min
+ d_max = np.iinfo(data_type).max
+
+    # Affine quantization: scale and offset each value, then clip it to the
+    # representable range of the target data type before the final cast
+    for row in range(data.shape[0]):
+        for col in range(data.shape[1]):
+            data[row, col] = (data[row, col] / quant_scale) + quant_offset
+            data[row, col] = np.clip(data[row, col], d_min, d_max)
+    data = data.astype(data_type)
+ return data
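+
+# The loop above applies the usual affine quantization
+# q = clip(x / scale + offset, d_min, d_max); e.g. with a (made-up) scale of
+# 0.1 and offset of 128 on uint8, an input of 0.5 maps to 0.5 / 0.1 + 128 = 133.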
+
+
+def decode_text(is_first_window, labels, output_result):
+ """
+ Slices the text appropriately depending on the window, and decodes for wav2letter output.
+ * First run, take the left context, and inner context.
+ * Every other run, take the inner context.
+ Stores the current right context, and updates it for each inference. Will get used after last inference
+
+ Args:
+ is_first_window: Boolean to show if it is the first window we are running inference on
+ labels: the label set
+ output_result: the output from the inference
+ text: the current text string, to be displayed at the end
+ Returns:
+ current_r_context: the current right context
+ text: the current text string, with the latest output decoded and appended
+ """
+
+ if is_first_window:
+        # First inference: decode the left context together with the inner context
+ text = decode(output_result[0][0:472], labels)
+ else:
+ # Only decode the inner context
+ text = decode(output_result[0][49:472], labels)
+
+    # Store the right context; we will need it after the last inference
+ current_r_context = decode(output_result[0][473:521], labels)
+ return current_r_context, text
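+
+# Example across two windows (illustrative sketch; `labels`, `first_output` and
+# `second_output` are assumed to come from the calling application):
+# >>> r_ctx, text = decode_text(True, labels, first_output)
+# >>> r_ctx, chunk = decode_text(False, labels, second_output)
+# >>> text += chunk + r_ctx  # right context appended after the last inference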
+
+
+def prepare_input_tensors(audio_data, input_binding_info, mfcc_preprocessor):
+ """
+ Takes a block of audio data, extracts the MFCC features, quantizes the array, and uses ArmNN to create the
+ input tensors.
+
+ Args:
+ audio_data: The audio data to process
+ mfcc_instance: the mfcc class instance
+ input_binding_info: the model input binding info
+ mfcc_preprocessor: the mfcc preprocessor instance
+ Returns:
+ input_tensors: the prepared input tensors, ready to be consumed by the ArmNN NetworkExecutor
+ """
+
+ data_type = input_binding_info[1].GetDataType()
+ input_tensor = mfcc_preprocessor.extract_features(audio_data)
+ if data_type != ann.DataType_Float32:
+ input_tensor = quantize_input(input_tensor, input_binding_info)
+ input_tensors = ann.make_input_tensors([input_binding_info], [input_tensor])
+ return input_tensors
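+
+# Hedged end-to-end sketch (`network_executor`, `audio_block` and `mfcc` are
+# assumptions standing in for objects created by the calling application, e.g.
+# a NetworkExecutor wrapper and an MFCC preprocessor):
+# >>> input_tensors = prepare_input_tensors(audio_block, input_binding_info, mfcc)
+# >>> output_result = network_executor.run(input_tensors)
+# >>> text = decode(output_result[0], labels)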