author     Nina Drozd <nina.drozd@arm.com>    2021-02-02 17:49:17 +0000
committer  Richard <richard.burton@arm.com>   2021-02-23 21:20:02 +0000
commit     4018b21cd41437f1e1b2e528d5521136f39ff2b1 (patch)
tree       5c1b790d1ebae59240281b49ef833d24f5aa7ef1
parent     0909c5602e2cbe7ac73a7db6787f7bdb1facd2ab (diff)
download   armnn-4018b21cd41437f1e1b2e528d5521136f39ff2b1.tar.gz
MLECO-1253: update ASR example python app
* add link to new wav2letter model from ModelZoo in Readme
* update model input size
* update to match new model's output

Signed-off-by: Nina Drozd <nina.drozd@arm.com>
Change-Id: I8e85d025610a458e9ae7be93fd7179c71bac5b18
-rw-r--r--  python/pyarmnn/examples/speech_recognition/README.md          |  9
-rw-r--r--  python/pyarmnn/examples/speech_recognition/audio_capture.py   |  2
-rw-r--r--  python/pyarmnn/examples/speech_recognition/audio_utils.py     | 15
-rw-r--r--  python/pyarmnn/examples/speech_recognition/run_audio_file.py  |  2
4 files changed, 21 insertions(+), 7 deletions(-)
diff --git a/python/pyarmnn/examples/speech_recognition/README.md b/python/pyarmnn/examples/speech_recognition/README.md
index 2323eac656..5ccf003c77 100644
--- a/python/pyarmnn/examples/speech_recognition/README.md
+++ b/python/pyarmnn/examples/speech_recognition/README.md
@@ -35,6 +35,15 @@ Install the required Python modules:
$ pip install -r requirements.txt
```
+### Model
+
+The model used by this application can be found in the Arm Model Zoo repository:
+https://github.com/ARM-software/ML-zoo/tree/master/models
+
+The specific model used is the int8 quantized wav2letter model:
+https://github.com/ARM-software/ML-zoo/tree/master/models/speech_recognition/wav2letter/tflite_int8
+
+
## Performing Automatic Speech Recognition
### Processing Audio Files
diff --git a/python/pyarmnn/examples/speech_recognition/audio_capture.py b/python/pyarmnn/examples/speech_recognition/audio_capture.py
index 9f28d1006e..0c899208a4 100644
--- a/python/pyarmnn/examples/speech_recognition/audio_capture.py
+++ b/python/pyarmnn/examples/speech_recognition/audio_capture.py
@@ -20,7 +20,7 @@ class ModelParams:
         self.mono = True
         self.dtype = np.float32
         self.samplerate = 16000
-        self.min_samples = 167392
+        self.min_samples = 47712  # (model_input_size - 1) * stride + frame_len
class AudioCapture:
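For reference, the new constant follows directly from the MFCC framing parameters used in run_audio_file.py and the new model input size; a minimal sanity-check sketch of that arithmetic (the variable names here are illustrative, not part of the example app):

```python
# Sanity check of the new min_samples value, assuming the MFCC parameters
# used in run_audio_file.py: frame_len=512, stride=160, and the new
# model_input_size of 296 feature frames.
model_input_size = 296  # MFCC frames expected by the new wav2letter model
stride = 160            # hop between successive MFCC frames, in audio samples
frame_len = 512         # length of a single MFCC frame, in audio samples

min_samples = (model_input_size - 1) * stride + frame_len
print(min_samples)  # 47712, the value now set in audio_capture.py
```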
diff --git a/python/pyarmnn/examples/speech_recognition/audio_utils.py b/python/pyarmnn/examples/speech_recognition/audio_utils.py
index a522a0e2a7..f03d2e1290 100644
--- a/python/pyarmnn/examples/speech_recognition/audio_utils.py
+++ b/python/pyarmnn/examples/speech_recognition/audio_utils.py
@@ -17,7 +17,7 @@ def decode(model_output: np.ndarray, labels: dict) -> str:
     Returns:
         Decoded string.
     """
-    top1_results = [labels[np.argmax(row[0])] for row in model_output]
+    top1_results = [labels[np.argmax(row)] for row in model_output]
     return filter_characters(top1_results)
@@ -82,7 +82,7 @@ def decode_text(is_first_window, labels, output_result):
     Slices the text appropriately depending on the window, and decodes for wav2letter output.
     * First run, take the left context, and inner context.
     * Every other run, take the inner context.
-    Stores the current right context, and updates it for each inference. Will get used after last inference
+    Stores the current right context, and updates it for each inference. Will get used after the last inference.

     Args:
         is_first_window: Boolean to show if it is the first window we are running inference on
@@ -93,16 +93,21 @@ def decode_text(is_first_window, labels, output_result):
         current_r_context: the current right context
         text: the current text string, with the latest output decoded and appended
     """
+    # For wav2letter with 148 output steps:
+    # Left context is index 0-48, inner context 49-99, right context 100-147
+    inner_context_start = 49
+    inner_context_end = 100  # exclusive slice bound, so the inner context covers steps 49-99
+    right_context_start = 100
     if is_first_window:
         # Since it's the first inference, keep the left context, and inner context, and decode
-        text = decode(output_result[0][0:472], labels)
+        text = decode(output_result[0][0][0][0:inner_context_end], labels)
     else:
         # Only decode the inner context
-        text = decode(output_result[0][49:472], labels)
+        text = decode(output_result[0][0][0][inner_context_start:inner_context_end], labels)

     # Store the right context, we will need it after the last inference
-    current_r_context = decode(output_result[0][473:521], labels)
+    current_r_context = decode(output_result[0][0][0][right_context_start:], labels)

     return current_r_context, text
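To make the new slicing easier to follow, below is a small, self-contained sketch of how one output tensor would be split into contexts and greedily decoded. It assumes an output of shape [1, 1, 148, 29] (which is what the `output_result[0][0][0]` indexing above implies); the label map and random logits are placeholders rather than the app's real label file, and the CTC-style `filter_characters` step is omitted.

```python
import numpy as np

# Placeholder label map with 29 entries (26 letters plus three extra symbols);
# the real mapping comes from the example app's label file.
labels = {i: chr(ord('a') + i) for i in range(26)}
labels.update({26: "'", 27: ' ', 28: '$'})

# Fake model output with the assumed shape [1, 1, 148, 29].
output_tensor = np.random.rand(1, 1, 148, 29).astype(np.float32)
steps = output_tensor[0][0]  # -> [148, 29]: one row of class scores per time step

inner_context_start = 49     # steps 0-48 form the left context
inner_context_end = 100      # exclusive bound: inner context is steps 49-99
right_context_start = 100    # steps 100-147 form the right context

def greedy_decode(rows):
    # Take the argmax class of every time step, as the updated decode() does.
    return ''.join(labels[int(np.argmax(row))] for row in rows)

first_window_text = greedy_decode(steps[0:inner_context_end])                    # left + inner
other_window_text = greedy_decode(steps[inner_context_start:inner_context_end])  # inner only
right_context = greedy_decode(steps[right_context_start:])                       # kept for the final window
```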
diff --git a/python/pyarmnn/examples/speech_recognition/run_audio_file.py b/python/pyarmnn/examples/speech_recognition/run_audio_file.py
index c7e4c6bc31..942de2081c 100644
--- a/python/pyarmnn/examples/speech_recognition/run_audio_file.py
+++ b/python/pyarmnn/examples/speech_recognition/run_audio_file.py
@@ -65,7 +65,7 @@ def main(args):
     mfcc_params = MFCCParams(sampling_freq=16000, num_fbank_bins=128, mel_lo_freq=0, mel_hi_freq=8000,
                              num_mfcc_feats=13, frame_len=512, use_htk_method=False, n_FFT=512)
     mfcc = MFCC(mfcc_params)
-    preprocessor = Preprocessor(mfcc, model_input_size=1044, stride=160)
+    preprocessor = Preprocessor(mfcc, model_input_size=296, stride=160)
     text = ""
     current_r_context = ""
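As a cross-check of the new `model_input_size`, a window of `min_samples` audio samples should yield exactly 296 MFCC frames under the parameters above, assuming the usual framing rule of one frame per `stride` once the first full frame fits (an assumption about the example's MFCC front end, not something stated in this change). The 148 output steps referenced in audio_utils.py are then exactly half of those 296 input frames.

```python
# Cross-check: 47712 samples -> 296 MFCC frames -> 148 output steps,
# assuming one frame every `stride` samples once a full `frame_len` window fits.
min_samples = 47712
frame_len = 512
stride = 160

num_frames = (min_samples - frame_len) // stride + 1
print(num_frames)        # 296, the model_input_size passed to Preprocessor
print(num_frames // 2)   # 148, the number of output steps used in audio_utils.py
```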