diff options
author | Éanna Ó Catháin <eanna.ocathain@arm.com> | 2020-11-16 14:12:11 +0000 |
---|---|---|
committer | Jim Flynn <jim.flynn@arm.com> | 2020-11-17 12:23:56 +0000 |
commit | 145c88f851d12d2cadc2f080d232c1d5963d6e47 (patch) | |
tree | 6ae197d74782cd2c7ef8965f4b36acabc65ce453 /python/pyarmnn/examples/speech_recognition/run_audio_file.py | |
parent | aa41d5d2f43790938f3a32586626be5ef55b6ca9 (diff) | |
download | armnn-145c88f851d12d2cadc2f080d232c1d5963d6e47.tar.gz |
MLECO-1253 Adding ASR sample application using the PyArmNN api
Change-Id: I450b23800ca316a5bfd4608c8559cf4f11271c21
Signed-off-by: Éanna Ó Catháin <eanna.ocathain@arm.com>
Diffstat (limited to 'python/pyarmnn/examples/speech_recognition/run_audio_file.py')
-rw-r--r-- | python/pyarmnn/examples/speech_recognition/run_audio_file.py | 94 |
1 files changed, 94 insertions, 0 deletions
diff --git a/python/pyarmnn/examples/speech_recognition/run_audio_file.py b/python/pyarmnn/examples/speech_recognition/run_audio_file.py
new file mode 100644
index 0000000000..c7e4c6bc31
--- /dev/null
+++ b/python/pyarmnn/examples/speech_recognition/run_audio_file.py
@@ -0,0 +1,94 @@
+# Copyright © 2020 Arm Ltd and Contributors. All rights reserved.
+# SPDX-License-Identifier: MIT
+
+"""Automatic speech recognition with PyArmNN demo for processing audio clips to text."""
+
+import sys
+import os
+from argparse import ArgumentParser
+
+script_dir = os.path.dirname(__file__)
+sys.path.insert(1, os.path.join(script_dir, '..', 'common'))
+
+from network_executor import ArmnnNetworkExecutor
+from utils import dict_labels
+from preprocess import MFCCParams, Preprocessor, MFCC
+from audio_capture import AudioCapture, ModelParams
+from audio_utils import decode_text, prepare_input_tensors, display_text
+
+
+def parse_args():
+    parser = ArgumentParser(description="ASR with PyArmNN")
+    parser.add_argument(
+        "--audio_file_path",
+        required=True,
+        type=str,
+        help="Path to the audio file to perform ASR",
+    )
+    parser.add_argument(
+        "--model_file_path",
+        required=True,
+        type=str,
+        help="Path to ASR model to use",
+    )
+    parser.add_argument(
+        "--labels_file_path",
+        required=True,
+        type=str,
+        help="Path to text file containing labels to map to model output",
+    )
+    parser.add_argument(
+        "--preferred_backends",
+        type=str,
+        nargs="+",
+        default=["CpuAcc", "CpuRef"],
+        help="""List of backends in order of preference for optimizing
+        subgraphs, falling back to the next backend in the list on unsupported
+        layers. Defaults to [CpuAcc, CpuRef]""",
+    )
+    return parser.parse_args()
+
+
+def main(args):
+    # Read command line args
+    audio_file = args.audio_file_path
+    model = ModelParams(args.model_file_path)
+    labels = dict_labels(args.labels_file_path)
+
+    # Create the ArmNN inference runner
+    network = ArmnnNetworkExecutor(model.path, args.preferred_backends)
+
+    audio_capture = AudioCapture(model)
+    buffer = audio_capture.from_audio_file(audio_file)
+
+    # Create the preprocessor
+    mfcc_params = MFCCParams(sampling_freq=16000, num_fbank_bins=128, mel_lo_freq=0, mel_hi_freq=8000,
+                             num_mfcc_feats=13, frame_len=512, use_htk_method=False, n_FFT=512)
+    mfcc = MFCC(mfcc_params)
+    preprocessor = Preprocessor(mfcc, model_input_size=1044, stride=160)
+
+    text = ""
+    current_r_context = ""
+    is_first_window = True
+
+    print("Processing Audio Frames...")
+    for audio_data in buffer:
+        # Prepare the input Tensors
+        input_tensors = prepare_input_tensors(audio_data, network.input_binding_info, preprocessor)
+
+        # Run inference
+        output_result = network.run(input_tensors)
+
+        # Slice and Decode the text, and store the right context
+        current_r_context, text = decode_text(is_first_window, labels, output_result)
+
+        is_first_window = False
+
+        display_text(text)
+
+    print(current_r_context, flush=True)
+
+
+if __name__ == "__main__":
+    args = parse_args()
+    main(args)