MLECO-1253 Adding ASR sample application using the PyArmNN api

Change-Id: I450b23800ca316a5bfd4608c8559cf4f11271c21
Signed-off-by: Éanna Ó Catháin <eanna.ocathain@arm.com>
author: Éanna Ó Catháin <eanna.ocathain@arm.com> 2020-11-16 14:12:11 +0000
committer: Jim Flynn <jim.flynn@arm.com> 2020-11-17 12:23:56 +0000
commit: 145c88f851d12d2cadc2f080d232c1d5963d6e47 (patch)
tree: 6ae197d74782cd2c7ef8965f4b36acabc65ce453 /python/pyarmnn/examples/speech_recognition/audio_capture.py
parent: aa41d5d2f43790938f3a32586626be5ef55b6ca9 (diff)
download: armnn-145c88f851d12d2cadc2f080d232c1d5963d6e47.tar.gz
1 files changed, 56 insertions, 0 deletions
diff --git a/python/pyarmnn/examples/speech_recognition/audio_capture.py b/python/pyarmnn/examples/speech_recognition/audio_capture.py
new file mode 100644
index 0000000000..9f28d1006e
--- /dev/null
+++ b/python/pyarmnn/examples/speech_recognition/audio_capture.py
@@ -0,0 +1,56 @@
+# Copyright © 2020 Arm Ltd and Contributors. All rights reserved.
+# SPDX-License-Identifier: MIT
+
+"""Contains AudioCapture class for capturing chunks of audio data from file."""
+
+from typing import Generator
+
+import numpy as np
+import soundfile as sf
+
+
+class ModelParams:
+    """Sampling parameters describing the audio input expected by an ASR model."""
+
+    def __init__(self, model_file_path: str, *, mono: bool = True, dtype=np.float32,
+                 samplerate: int = 16000, min_samples: int = 167392):
+        """Defines sampling parameters for model used.
+
+        The keyword-only arguments default to the values this class has always used,
+        so ``ModelParams(path)`` produces the same attributes as before.
+
+        Args:
+            model_file_path: Path to ASR model to use.
+            mono: When True, AudioCapture averages the channels of each block it yields.
+            dtype: Sample dtype requested when audio blocks are read from file.
+            samplerate: Sample rate associated with the model (presumably in Hz). It is
+                    only stored here; AudioCapture in this file does not read it.
+            min_samples: Number of samples in each block; AudioCapture uses it as the
+                    block size when reading a file.
+        """
+        self.path = model_file_path
+        self.mono = mono
+        self.dtype = dtype
+        self.samplerate = samplerate
+        self.min_samples = min_samples
+
+
+class AudioCapture:
+    """Reads audio from a file in fixed-size, overlapping blocks."""
+
+    def __init__(self, model_params):
+        """Stores the sampling parameters used when reading audio.
+
+        Args:
+            model_params: Object providing the `min_samples`, `dtype` and `mono`
+                    attributes read by from_audio_file (e.g. a ModelParams instance).
+        """
+        self.model_params = model_params
+
+    def from_audio_file(self, audio_file_path, overlap: int = 31712) -> Generator[np.ndarray, None, None]:
+        """Creates a generator that yields audio data from a file. Data is padded with
+        zeros if necessary to make up minimum number of samples.
+
+        The sample rate of the file is neither checked nor converted here; samples are
+        yielded at whatever rate the file was recorded.
+
+        Args:
+            audio_file_path: Path to audio file provided by user.
+            overlap: The overlap with previous buffer. We need the offset to be the same as the inner context
+                    of the mfcc output, which is sized as 100 x 39. Each mfcc compute produces 1 x 39 vector,
+                    and consumes 160 audio samples. The default overlap is then calculated to be 47712 - (160 x 100)
+                    where 47712 is the min_samples needed for 1 inference of wav2letter.
+                    NOTE(review): the block size actually used is `model_params.min_samples`, and
+                    ModelParams in this file sets it to 167392, not 47712 - confirm that the default
+                    overlap is still the intended value.
+
+        Yields:
+            Blocks of audio data of minimum sample size. When `model_params.mono` is set each block
+            is a 1-D array of channel-averaged samples; otherwise it is 2-D, shaped (frames, channels).
+        """
+        with sf.SoundFile(audio_file_path) as audio_file:
+            for block in audio_file.blocks(
+                    blocksize=self.model_params.min_samples,
+                    dtype=self.model_params.dtype,
+                    always_2d=True,
+                    fill_value=0,
+                    overlap=overlap
+            ):
+                # always_2d=True makes every block (frames, channels), so channels live on
+                # axis 1. Down-mix regardless of the frame count: averaging a single-channel
+                # block simply drops the channel axis, so mono output is always 1-D.
+                if self.model_params.mono:
+                    block = np.mean(block, axis=1)
+                yield block
author	Éanna Ó Catháin <eanna.ocathain@arm.com>	2020-11-16 14:12:11 +0000
committer	Jim Flynn <jim.flynn@arm.com>	2020-11-17 12:23:56 +0000
commit	145c88f851d12d2cadc2f080d232c1d5963d6e47 (patch)
tree	6ae197d74782cd2c7ef8965f4b36acabc65ce453 /python/pyarmnn/examples/speech_recognition/audio_capture.py
parent	aa41d5d2f43790938f3a32586626be5ef55b6ca9 (diff)
download	armnn-145c88f851d12d2cadc2f080d232c1d5963d6e47.tar.gz