From f42f56870c6201a876f025a423eb5540d7438e83 Mon Sep 17 00:00:00 2001
From: alexander
Date: Fri, 16 Jul 2021 11:30:56 +0100
Subject: MLECO-2079 Adding the python KWS example

Signed-off-by: Eanna O Cathain
Change-Id: Ie1463aaeb5e3cade22df8f560ae99a8e1c4a9c17
---
 .../examples/speech_recognition/audio_utils.py | 53 +---------------------
 1 file changed, 1 insertion(+), 52 deletions(-)

(limited to 'python/pyarmnn/examples/speech_recognition/audio_utils.py')

diff --git a/python/pyarmnn/examples/speech_recognition/audio_utils.py b/python/pyarmnn/examples/speech_recognition/audio_utils.py
index f03d2e1290..1ac78e8074 100644
--- a/python/pyarmnn/examples/speech_recognition/audio_utils.py
+++ b/python/pyarmnn/examples/speech_recognition/audio_utils.py
@@ -1,10 +1,9 @@
-# Copyright © 2020 Arm Ltd and Contributors. All rights reserved.
+# Copyright © 2021 Arm Ltd and Contributors. All rights reserved.
 # SPDX-License-Identifier: MIT
 
 """Utilities for speech recognition apps."""
 
 import numpy as np
-import pyarmnn as ann
 
 
 def decode(model_output: np.ndarray, labels: dict) -> str:
@@ -50,33 +49,6 @@ def display_text(text: str):
     print(text, sep="", end="", flush=True)
 
 
-def quantize_input(data, input_binding_info):
-    """Quantize the float input to (u)int8 ready for inputting to model."""
-    if data.ndim != 2:
-        raise RuntimeError("Audio data must have 2 dimensions for quantization")
-
-    quant_scale = input_binding_info[1].GetQuantizationScale()
-    quant_offset = input_binding_info[1].GetQuantizationOffset()
-    data_type = input_binding_info[1].GetDataType()
-
-    if data_type == ann.DataType_QAsymmS8:
-        data_type = np.int8
-    elif data_type == ann.DataType_QAsymmU8:
-        data_type = np.uint8
-    else:
-        raise ValueError("Could not quantize data to required data type")
-
-    d_min = np.iinfo(data_type).min
-    d_max = np.iinfo(data_type).max
-
-    for row in range(data.shape[0]):
-        for col in range(data.shape[1]):
-            data[row, col] = (data[row, col] / quant_scale) + quant_offset
-            data[row, col] = np.clip(data[row, col], d_min, d_max)
-    data = data.astype(data_type)
-    return data
-
-
 def decode_text(is_first_window, labels, output_result):
     """
     Slices the text appropriately depending on the window, and decodes for wav2letter output.
@@ -88,7 +60,6 @@ def decode_text(is_first_window, labels, output_result):
         is_first_window: Boolean to show if it is the first window we are running inference on
         labels: the label set
         output_result: the output from the inference
-        text: the current text string, to be displayed at the end
     Returns:
         current_r_context: the current right context
         text: the current text string, with the latest output decoded and appended
@@ -109,25 +80,3 @@
     # Store the right context, we will need it after the last inference
     current_r_context = decode(output_result[0][0][0][right_context_start:], labels)
     return current_r_context, text
-
-
-def prepare_input_tensors(audio_data, input_binding_info, mfcc_preprocessor):
-    """
-    Takes a block of audio data, extracts the MFCC features, quantizes the array, and uses ArmNN to create the
-    input tensors.
-
-    Args:
-        audio_data: The audio data to process
-        mfcc_instance: the mfcc class instance
-        input_binding_info: the model input binding info
-        mfcc_preprocessor: the mfcc preprocessor instance
-    Returns:
-        input_tensors: the prepared input tensors, ready to be consumed by the ArmNN NetworkExecutor
-    """
-
-    data_type = input_binding_info[1].GetDataType()
-    input_tensor = mfcc_preprocessor.extract_features(audio_data)
-    if data_type != ann.DataType_Float32:
-        input_tensor = quantize_input(input_tensor, input_binding_info)
-    input_tensors = ann.make_input_tensors([input_binding_info], [input_tensor])
-    return input_tensors
-- 
cgit v1.2.1
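
For readers skimming the removed code: quantize_input implements standard affine quantization, mapping each float feature onto the integer grid via q = clip(x / scale + offset, type_min, type_max), with scale and offset taken from the model's input binding info. Below is a minimal vectorized NumPy sketch of that mapping; the function name, the example scale/offset values, and the use of round-to-nearest (the deleted helper relied on the truncating astype cast instead) are illustrative assumptions, not part of this patch.

import numpy as np

def affine_quantize(x: np.ndarray, scale: float, offset: int, dtype=np.int8) -> np.ndarray:
    # q = x / scale + offset, rounded and clipped to the target type's range.
    # (Illustrative helper: the removed quantize_input looped per element and
    # truncated via the final astype cast rather than rounding.)
    info = np.iinfo(dtype)
    q = np.round(x / scale) + offset
    return np.clip(q, info.min, info.max).astype(dtype)

# Dummy MFCC-style feature block; the shape and the scale/offset are made-up values.
features = np.random.randn(296, 39).astype(np.float32)
print(affine_quantize(features, scale=0.05, offset=0).dtype)  # int8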