From f42f56870c6201a876f025a423eb5540d7438e83 Mon Sep 17 00:00:00 2001
From: alexander
Date: Fri, 16 Jul 2021 11:30:56 +0100
Subject: MLECO-2079 Adding the python KWS example

Signed-off-by: Eanna O Cathain
Change-Id: Ie1463aaeb5e3cade22df8f560ae99a8e1c4a9c17
---
 .../examples/speech_recognition/run_audio_file.py | 48 ++++++++++++----------
 1 file changed, 26 insertions(+), 22 deletions(-)

(limited to 'python/pyarmnn/examples/speech_recognition/run_audio_file.py')

diff --git a/python/pyarmnn/examples/speech_recognition/run_audio_file.py b/python/pyarmnn/examples/speech_recognition/run_audio_file.py
index 942de2081c..0430f68c16 100644
--- a/python/pyarmnn/examples/speech_recognition/run_audio_file.py
+++ b/python/pyarmnn/examples/speech_recognition/run_audio_file.py
@@ -1,20 +1,29 @@
-# Copyright © 2020 Arm Ltd and Contributors. All rights reserved.
+# Copyright © 2021 Arm Ltd and Contributors. All rights reserved.
 # SPDX-License-Identifier: MIT
 
 """Automatic speech recognition with PyArmNN demo for processing audio clips to text."""
 
 import sys
 import os
-from argparse import ArgumentParser
+import numpy as np
 
 script_dir = os.path.dirname(__file__)
 sys.path.insert(1, os.path.join(script_dir, '..', 'common'))
 
+from argparse import ArgumentParser
 from network_executor import ArmnnNetworkExecutor
-from utils import dict_labels
-from preprocess import MFCCParams, Preprocessor, MFCC
-from audio_capture import AudioCapture, ModelParams
-from audio_utils import decode_text, prepare_input_tensors, display_text
+from utils import prepare_input_tensors
+from audio_capture import AudioCaptureParams, capture_audio
+from audio_utils import decode_text, display_text
+from wav2letter_mfcc import Wav2LetterMFCC, W2LAudioPreprocessor
+from mfcc import MFCCParams
+
+# Model Specific Labels
+labels = {0: 'a', 1: 'b', 2: 'c', 3: 'd', 4: 'e', 5: 'f', 6: 'g', 7: 'h', 8: 'i', 9: 'j', 10: 'k', 11: 'l', 12: 'm',
+          13: 'n',
+          14: 'o', 15: 'p', 16: 'q', 17: 'r', 18: 's', 19: 't', 20: 'u', 21: 'v', 22: 'w', 23: 'x', 24: 'y',
+          25: 'z',
+          26: "'", 27: ' ', 28: '$'}
 
 
 def parse_args():
@@ -31,12 +40,6 @@ def parse_args():
         type=str,
         help="Path to ASR model to use",
     )
-    parser.add_argument(
-        "--labels_file_path",
-        required=True,
-        type=str,
-        help="Path to text file containing labels to map to model output",
-    )
     parser.add_argument(
         "--preferred_backends",
         type=str,
@@ -52,22 +55,23 @@ def parse_args()
 def main(args):
     # Read command line args
     audio_file = args.audio_file_path
-    model = ModelParams(args.model_file_path)
-    labels = dict_labels(args.labels_file_path)
 
     # Create the ArmNN inference runner
-    network = ArmnnNetworkExecutor(model.path, args.preferred_backends)
+    network = ArmnnNetworkExecutor(args.model_file_path, args.preferred_backends)
+
+    # Specify model specific audio data requirements
+    audio_capture_params = AudioCaptureParams(dtype=np.float32, overlap=31712, min_samples=47712, sampling_freq=16000,
+                                              mono=True)
+
+    buffer = capture_audio(audio_file, audio_capture_params)
 
-    audio_capture = AudioCapture(model)
-    buffer = audio_capture.from_audio_file(audio_file)
+    # Extract features and create the preprocessor
 
-    # Create the preprocessor
     mfcc_params = MFCCParams(sampling_freq=16000, num_fbank_bins=128, mel_lo_freq=0, mel_hi_freq=8000,
-                             num_mfcc_feats=13, frame_len=512, use_htk_method=False, n_FFT=512)
-    mfcc = MFCC(mfcc_params)
-    preprocessor = Preprocessor(mfcc, model_input_size=296, stride=160)
+                             num_mfcc_feats=13, frame_len=512, use_htk_method=False, n_fft=512)
 
-    text = ""
+    wmfcc = Wav2LetterMFCC(mfcc_params)
+    preprocessor = W2LAudioPreprocessor(wmfcc, model_input_size=296, stride=160)
     current_r_context = ""
     is_first_window = True
 
--
cgit v1.2.1
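
Example invocation after this change (a sketch, not part of the patch; the
audio and model paths below are illustrative placeholders). Because the labels
are now hard-coded in the script, the removed --labels_file_path argument is no
longer passed; only the audio clip and the ASR model are required:

    python run_audio_file.py \
        --audio_file_path /path/to/clip.wav \
        --model_file_path /path/to/wav2letter_asr.tflite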