From 4018b21cd41437f1e1b2e528d5521136f39ff2b1 Mon Sep 17 00:00:00 2001
From: Nina Drozd
Date: Tue, 2 Feb 2021 17:49:17 +0000
Subject: MLECO-1253: update ASR example python app

* add link to new wav2letter model from ModelZoo in Readme
* update model input size
* update to match new model's output

Signed-off-by: Nina Drozd
Change-Id: I8e85d025610a458e9ae7be93fd7179c71bac5b18
---
 python/pyarmnn/examples/speech_recognition/README.md      |  9 +++++++++
 .../pyarmnn/examples/speech_recognition/audio_capture.py  |  2 +-
 python/pyarmnn/examples/speech_recognition/audio_utils.py | 15 ++++++++++-----
 .../pyarmnn/examples/speech_recognition/run_audio_file.py |  2 +-
 4 files changed, 21 insertions(+), 7 deletions(-)

diff --git a/python/pyarmnn/examples/speech_recognition/README.md b/python/pyarmnn/examples/speech_recognition/README.md
index 2323eac656..5ccf003c77 100644
--- a/python/pyarmnn/examples/speech_recognition/README.md
+++ b/python/pyarmnn/examples/speech_recognition/README.md
@@ -35,6 +35,15 @@ Install the required Python modules:
 $ pip install -r requirements.txt
 ```
 
+### Model
+
+The model for this can be found in the Arm Model Zoo repository:
+https://github.com/ARM-software/ML-zoo/tree/master/models
+
+The model we're looking for:
+https://github.com/ARM-software/ML-zoo/tree/master/models/speech_recognition/wav2letter/tflite_int8
+
+
 ## Performing Automatic Speech Recognition
 
 ### Processing Audio Files
diff --git a/python/pyarmnn/examples/speech_recognition/audio_capture.py b/python/pyarmnn/examples/speech_recognition/audio_capture.py
index 9f28d1006e..0c899208a4 100644
--- a/python/pyarmnn/examples/speech_recognition/audio_capture.py
+++ b/python/pyarmnn/examples/speech_recognition/audio_capture.py
@@ -20,7 +20,7 @@ class ModelParams:
         self.mono = True
         self.dtype = np.float32
         self.samplerate = 16000
-        self.min_samples = 167392
+        self.min_samples = 47712  # (model_input_size-1)*stride + frame_len
 
 
 class AudioCapture:
diff --git a/python/pyarmnn/examples/speech_recognition/audio_utils.py b/python/pyarmnn/examples/speech_recognition/audio_utils.py
index a522a0e2a7..f03d2e1290 100644
--- a/python/pyarmnn/examples/speech_recognition/audio_utils.py
+++ b/python/pyarmnn/examples/speech_recognition/audio_utils.py
@@ -17,7 +17,7 @@ def decode(model_output: np.ndarray, labels: dict) -> str:
     Returns:
         Decoded string.
     """
-    top1_results = [labels[np.argmax(row[0])] for row in model_output]
+    top1_results = [labels[np.argmax(row)] for row in model_output]
     return filter_characters(top1_results)
 
 
@@ -82,7 +82,7 @@ def decode_text(is_first_window, labels, output_result):
     Slices the text appropriately depending on the window, and decodes for wav2letter output.
         * First run, take the left context, and inner context.
         * Every other run, take the inner context.
-    Stores the current right context, and updates it for each inference. Will get used after last inference
+    Stores the current right context, and updates it for each inference. Will get used after last inference.
 
     Args:
         is_first_window: Boolean to show if it is the first window we are running inference on
@@ -93,16 +93,21 @@ def decode_text(is_first_window, labels, output_result):
         current_r_context: the current right context
         text: the current text string, with the latest output decoded and appended
     """
+    # For wav2letter with 148 output steps:
+    # Left context is index 0-48, inner context 49-99, right context 100-147
+    inner_context_start = 49
+    inner_context_end = 99
+    right_context_start = 100
 
     if is_first_window:
         # Since it's the first inference, keep the left context, and inner context, and decode
-        text = decode(output_result[0][0:472], labels)
+        text = decode(output_result[0][0][0][0:inner_context_end], labels)
     else:
         # Only decode the inner context
-        text = decode(output_result[0][49:472], labels)
+        text = decode(output_result[0][0][0][inner_context_start:inner_context_end], labels)
 
     # Store the right context, we will need it after the last inference
-    current_r_context = decode(output_result[0][473:521], labels)
+    current_r_context = decode(output_result[0][0][0][right_context_start:], labels)
 
     return current_r_context, text
 
diff --git a/python/pyarmnn/examples/speech_recognition/run_audio_file.py b/python/pyarmnn/examples/speech_recognition/run_audio_file.py
index c7e4c6bc31..942de2081c 100644
--- a/python/pyarmnn/examples/speech_recognition/run_audio_file.py
+++ b/python/pyarmnn/examples/speech_recognition/run_audio_file.py
@@ -65,7 +65,7 @@ def main(args):
     mfcc_params = MFCCParams(sampling_freq=16000, num_fbank_bins=128, mel_lo_freq=0, mel_hi_freq=8000,
                              num_mfcc_feats=13, frame_len=512, use_htk_method=False, n_FFT=512)
     mfcc = MFCC(mfcc_params)
-    preprocessor = Preprocessor(mfcc, model_input_size=1044, stride=160)
+    preprocessor = Preprocessor(mfcc, model_input_size=296, stride=160)
     text = ""
     current_r_context = ""
 
-- 
cgit v1.2.1
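
Note on the updated constants (a sketch only, not part of the patch): the Python below shows how the new `model_input_size`, `min_samples`, and context-slice values from the hunks above fit together, assuming the MFCC settings set in `run_audio_file.py` (`frame_len=512`, `stride=160`). Variable names here are illustrative.

```python
# Values taken from the patch; names are illustrative.
model_input_size = 296   # MFCC frames fed to the model per inference window
stride = 160             # audio samples between consecutive MFCC frames
frame_len = 512          # audio samples per MFCC frame

# Minimum raw audio samples needed to fill one input window,
# matching the new ModelParams.min_samples value:
min_samples = (model_input_size - 1) * stride + frame_len
assert min_samples == 47712

# The int8 wav2letter model emits 148 output steps per window; the patch
# splits them into left / inner / right context when stitching windows:
inner_context_start = 49
inner_context_end = 99
right_context_start = 100

first_window = slice(0, inner_context_end)                     # left + inner context
other_windows = slice(inner_context_start, inner_context_end)  # inner context only
right_context = slice(right_context_start, None)               # kept until after the last window
```

Per the docstring and comments in `decode_text`, only the inner context is decoded for every window so overlapping windows do not duplicate characters, and the stored right context is appended once after the final inference.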