Diffstat (limited to 'python/pyarmnn/examples/speech_recognition/tests')
-rw-r--r--  python/pyarmnn/examples/speech_recognition/tests/conftest.py                           |  58
-rw-r--r--  python/pyarmnn/examples/speech_recognition/tests/context.py                            |  13
-rw-r--r--  python/pyarmnn/examples/speech_recognition/tests/test_audio_file.py                    |  17
-rw-r--r--  python/pyarmnn/examples/speech_recognition/tests/test_decoder.py                       |  15
-rw-r--r--  python/pyarmnn/examples/speech_recognition/tests/test_mfcc.py                          | 286
-rw-r--r--  python/pyarmnn/examples/speech_recognition/tests/testdata/inf_out.npy                  | bin 4420 -> 0 bytes
-rw-r--r--  python/pyarmnn/examples/speech_recognition/tests/testdata/inference_output.npy         | bin 0 -> 2999 bytes
-rw-r--r--  python/pyarmnn/examples/speech_recognition/tests/testdata/quick_brown_fox_16000khz.wav | bin 196728 -> 0 bytes
-rw-r--r--  python/pyarmnn/examples/speech_recognition/tests/testdata/wav2letter_labels.txt        |  29
9 files changed, 32 insertions(+), 386 deletions(-)
diff --git a/python/pyarmnn/examples/speech_recognition/tests/conftest.py b/python/pyarmnn/examples/speech_recognition/tests/conftest.py
index 730c291cfa..151816e919 100644
--- a/python/pyarmnn/examples/speech_recognition/tests/conftest.py
+++ b/python/pyarmnn/examples/speech_recognition/tests/conftest.py
@@ -1,34 +1,24 @@
-# Copyright © 2020 Arm Ltd and Contributors. All rights reserved.
-# SPDX-License-Identifier: MIT
-
-import os
-import ntpath
-
-import urllib.request
-
-import pytest
-
-script_dir = os.path.dirname(__file__)
-
-@pytest.fixture(scope="session")
-def test_data_folder(request):
- """
- This fixture returns path to folder with shared test resources among all tests
- """
-
- data_dir = os.path.join(script_dir, "testdata")
-
- if not os.path.exists(data_dir):
- os.mkdir(data_dir)
-
- files_to_download = ["https://raw.githubusercontent.com/Azure-Samples/cognitive-services-speech-sdk/master"
- "/sampledata/audiofiles/myVoiceIsMyPassportVerifyMe04.wav"]
-
- for file in files_to_download:
- path, filename = ntpath.split(file)
- file_path = os.path.join(script_dir, "testdata", filename)
- if not os.path.exists(file_path):
- print("\nDownloading test file: " + file_path + "\n")
- urllib.request.urlretrieve(file, file_path)
-
- return data_dir
+# Copyright © 2020 Arm Ltd and Contributors. All rights reserved.
+# SPDX-License-Identifier: MIT
+
+import os
+import ntpath
+
+import urllib.request
+
+import pytest
+
+script_dir = os.path.dirname(__file__)
+
+@pytest.fixture(scope="session")
+def test_data_folder(request):
+ """
+ This fixture returns the path to the folder of test resources shared among the ASR tests
+ """
+
+ data_dir = os.path.join(script_dir, "testdata")
+
+ if not os.path.exists(data_dir):
+ os.mkdir(data_dir)
+
+ return data_dir
\ No newline at end of file
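The trimmed conftest.py keeps only a session-scoped fixture that guarantees the testdata/ directory exists; pytest injects it into any test that names it as a parameter. A minimal sketch of a consuming test (hypothetical, not part of this commit):

    import os

    def test_testdata_folder_exists(test_data_folder):
        # pytest resolves the argument against the session-scoped fixture
        # defined in conftest.py, so the directory is created on first use.
        assert os.path.isdir(test_data_folder)
        assert os.path.basename(test_data_folder) == "testdata"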
diff --git a/python/pyarmnn/examples/speech_recognition/tests/context.py b/python/pyarmnn/examples/speech_recognition/tests/context.py
deleted file mode 100644
index a810010e9f..0000000000
--- a/python/pyarmnn/examples/speech_recognition/tests/context.py
+++ /dev/null
@@ -1,13 +0,0 @@
-# Copyright © 2020 Arm Ltd and Contributors. All rights reserved.
-# SPDX-License-Identifier: MIT
-
-import os
-import sys
-
-sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..', '..', 'common'))
-import utils as common_utils
-
-sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..'))
-import audio_capture
-import audio_utils
-import preprocess
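The deleted context.py implemented the usual sys.path shim for unpackaged example code: prepend the example and common directories to the import path so that test modules can write from context import audio_utils. The same pattern in isolation (a sketch assuming this directory layout):

    import os
    import sys

    # Make ../ (the example code) importable before the tests run.
    example_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
    sys.path.insert(0, example_dir)

    import audio_utils  # now resolves to the example's audio_utils.py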
diff --git a/python/pyarmnn/examples/speech_recognition/tests/test_audio_file.py b/python/pyarmnn/examples/speech_recognition/tests/test_audio_file.py
deleted file mode 100644
index 281d0df587..0000000000
--- a/python/pyarmnn/examples/speech_recognition/tests/test_audio_file.py
+++ /dev/null
@@ -1,17 +0,0 @@
-# Copyright © 2020 Arm Ltd and Contributors. All rights reserved.
-# SPDX-License-Identifier: MIT
-
-import os
-
-import numpy as np
-
-from context import audio_capture
-
-
-def test_audio_file(test_data_folder):
- audio_file = os.path.join(test_data_folder, "myVoiceIsMyPassportVerifyMe04.wav")
- capture = audio_capture.AudioCapture(audio_capture.ModelParams(""))
- buffer = capture.from_audio_file(audio_file)
- audio_data = next(buffer)
- assert audio_data.shape == (47712,)
- assert audio_data.dtype == np.float32
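The deleted test exercised the example's AudioCapture wrapper, asserting a 47712-sample float32 buffer. Reading a wav into normalized float32 can also be done directly; a sketch assuming the third-party soundfile package, not the example's AudioCapture API:

    import numpy as np
    import soundfile as sf  # assumption: the pysoundfile package is installed

    def load_pcm_float32(path):
        # Decode the whole file to float32 PCM in [-1.0, 1.0].
        data, sample_rate = sf.read(path, dtype="float32")
        assert data.dtype == np.float32
        return data, sample_rate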
diff --git a/python/pyarmnn/examples/speech_recognition/tests/test_decoder.py b/python/pyarmnn/examples/speech_recognition/tests/test_decoder.py
index 1db71a47b8..14db7f2064 100644
--- a/python/pyarmnn/examples/speech_recognition/tests/test_decoder.py
+++ b/python/pyarmnn/examples/speech_recognition/tests/test_decoder.py
@@ -5,13 +5,16 @@ import os
import numpy as np
-from context import common_utils
from context import audio_utils
+labels = {0: 'a', 1: 'b', 2: 'c', 3: 'd', 4: 'e', 5: 'f', 6: 'g', 7: 'h', 8: 'i', 9: 'j', 10: 'k', 11: 'l', 12: 'm',
+ 13: 'n',
+ 14: 'o', 15: 'p', 16: 'q', 17: 'r', 18: 's', 19: 't', 20: 'u', 21: 'v', 22: 'w', 23: 'x', 24: 'y',
+ 25: 'z',
+ 26: "'", 27: ' ', 28: '$'}
+
def test_labels(test_data_folder):
- labels_file = os.path.join(test_data_folder, "wav2letter_labels.txt")
- labels = common_utils.dict_labels(labels_file)
assert len(labels) == 29
assert labels[26] == "\'"
assert labels[27] == r" "
@@ -19,10 +22,8 @@ def test_labels(test_data_folder):
def test_decoder(test_data_folder):
- labels_file = os.path.join(test_data_folder, "wav2letter_labels.txt")
- labels = common_utils.dict_labels(labels_file)
- output_tensor = os.path.join(test_data_folder, "inf_out.npy")
+ output_tensor = os.path.join(test_data_folder, "inference_output.npy")
encoded = np.load(output_tensor)
decoded_text = audio_utils.decode(encoded, labels)
- assert decoded_text == "and they walkd immediately out of the apartiment by anothe"
+ assert decoded_text == "my voice is my pass"
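audio_utils.decode itself is not shown in this diff. For a wav2letter-style output tensor, where '$' (index 28 in the labels above) conventionally acts as the CTC blank, a greedy decode collapses repeated frames and then drops blanks; a sketch under those assumptions:

    import numpy as np

    def greedy_ctc_decode(logits, labels, blank_idx=28):
        # logits: (time_steps, num_classes) network output.
        best = np.argmax(logits, axis=1)  # best class index per frame
        text, prev = [], blank_idx
        for idx in best:
            # Skip repeats of the previous frame, then skip blanks.
            if idx != prev and idx != blank_idx:
                text.append(labels[int(idx)])
            prev = idx
        return "".join(text)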
diff --git a/python/pyarmnn/examples/speech_recognition/tests/test_mfcc.py b/python/pyarmnn/examples/speech_recognition/tests/test_mfcc.py
deleted file mode 100644
index d692ab51c8..0000000000
--- a/python/pyarmnn/examples/speech_recognition/tests/test_mfcc.py
+++ /dev/null
@@ -1,286 +0,0 @@
-# Copyright © 2020 Arm Ltd and Contributors. All rights reserved.
-# SPDX-License-Identifier: MIT
-
-import numpy as np
-
-from context import preprocess
-
-test_wav = [
- -3,0,1,-1,2,3,-2,2,
- 1,-2,0,3,-1,8,3,2,
- -1,-1,2,7,3,5,6,6,
- 6,12,5,6,3,3,5,4,
- 4,6,7,7,7,3,7,2,
- 8,4,4,2,-4,-1,-1,-4,
- 2,1,-1,-4,0,-7,-6,-2,
- -5,1,-5,-1,-7,-3,-3,-7,
- 0,-3,3,-5,0,1,-2,-2,
- -3,-3,-7,-3,-2,-6,-5,-8,
- -2,-8,4,-9,-4,-9,-5,-5,
- -3,-9,-3,-9,-1,-7,-4,1,
- -3,2,-8,-4,-4,-5,1,-3,
- -1,0,-1,-2,-3,-2,-4,-1,
- 1,-1,3,0,3,2,0,0,
- 0,-3,1,1,0,8,3,4,
- 1,5,6,4,7,3,3,0,
- 3,6,7,6,4,5,9,9,
- 5,5,8,1,6,9,6,6,
- 7,1,8,1,5,0,5,5,
- 0,3,2,7,2,-3,3,0,
- 3,0,0,0,2,0,-1,-1,
- -2,-3,-8,0,1,0,-3,-3,
- -3,-2,-3,-3,-4,-6,-2,-8,
- -9,-4,-1,-5,-3,-3,-4,-3,
- -6,3,0,-1,-2,-9,-4,-2,
- 2,-1,3,-5,-5,-2,0,-2,
- 0,-1,-3,1,-2,9,4,5,
- 2,2,1,0,-6,-2,0,0,
- 0,-1,4,-4,3,-7,-1,5,
- -6,-1,-5,4,3,9,-2,1,
- 3,0,0,-2,1,2,1,1,
- 0,3,2,-1,3,-3,7,0,
- 0,3,2,2,-2,3,-2,2,
- -3,4,-1,-1,-5,-1,-3,-2,
- 1,-1,3,2,4,1,2,-2,
- 0,2,7,0,8,-3,6,-3,
- 6,1,2,-3,-1,-1,-1,1,
- -2,2,1,2,0,-2,3,-2,
- 3,-2,1,0,-3,-1,-2,-4,
- -6,-5,-8,-1,-4,0,-3,-1,
- -1,-1,0,-2,-3,-7,-1,0,
- 1,5,0,5,1,1,-3,0,
- -6,3,-8,4,-8,6,-6,1,
- -6,-2,-5,-6,0,-5,4,-1,
- 4,-2,1,2,1,0,-2,0,
- 0,2,-2,2,-5,2,0,-2,
- 1,-2,0,5,1,0,1,5,
- 0,8,3,2,2,0,5,-2,
- 3,1,0,1,0,-2,-1,-3,
- 1,-1,3,0,3,0,-2,-1,
- -4,-4,-4,-1,-4,-4,-3,-6,
- -3,-7,-3,-1,-2,0,-5,-4,
- -7,-3,-2,-2,1,2,2,8,
- 5,4,2,4,3,5,0,3,
- 3,6,4,2,2,-2,4,-2,
- 3,3,2,1,1,4,-5,2,
- -3,0,-1,1,-2,2,5,1,
- 4,2,3,1,-1,1,0,6,
- 0,-2,-1,1,-1,2,-5,-1,
- -5,-1,-6,-3,-3,2,4,0,
- -1,-5,3,-4,-1,-3,-4,1,
- -4,1,-1,-1,0,-5,-4,-2,
- -1,-1,-3,-7,-3,-3,4,4,
-]
-
-def test_mel_scale_function_with_htk_true():
- samp_freq = 16000
- frame_len_ms = 32
- frame_len_samples = samp_freq * frame_len_ms * 0.001
- num_mfcc_feats = 13
- num_fbank_bins = 128
- mel_lo_freq = 0
- mil_hi_freq = 8000
- use_htk = False
- n_FFT = 512
-
- mfcc_params = preprocess.MFCCParams(samp_freq, num_fbank_bins, mel_lo_freq, mil_hi_freq, num_mfcc_feats,
- frame_len_samples, use_htk, n_FFT)
-
- mfcc_inst = preprocess.MFCC(mfcc_params)
-
- mel = mfcc_inst.mel_scale(16, True)
-
- assert np.isclose(mel, 25.470010570730597)
-
-
-def test_mel_scale_function_with_htk_false():
- samp_freq = 16000
- frame_len_ms = 32
- frame_len_samples = samp_freq * frame_len_ms * 0.001
- num_mfcc_feats = 13
- num_fbank_bins = 128
- mel_lo_freq = 0
- mil_hi_freq = 8000
- use_htk = False
- n_FFT = 512
-
- mfcc_params = preprocess.MFCCParams(samp_freq, num_fbank_bins, mel_lo_freq, mil_hi_freq, num_mfcc_feats,
- frame_len_samples, use_htk, n_FFT)
-
- mfcc_inst = preprocess.MFCC(mfcc_params)
-
- mel = mfcc_inst.mel_scale(16, False)
-
- assert np.isclose(mel, 0.24)
-
-
-def test_inverse_mel_scale_function_with_htk_true():
- samp_freq = 16000
- frame_len_ms = 32
- frame_len_samples = samp_freq * frame_len_ms * 0.001
- num_mfcc_feats = 13
- num_fbank_bins = 128
- mel_lo_freq = 0
- mil_hi_freq = 8000
- use_htk = False
- n_FFT = 512
-
- mfcc_params = preprocess.MFCCParams(samp_freq, num_fbank_bins, mel_lo_freq, mil_hi_freq, num_mfcc_feats,
- frame_len_samples, use_htk, n_FFT)
-
- mfcc_inst = preprocess.MFCC(mfcc_params)
-
- mel = mfcc_inst.inv_mel_scale(16, True)
-
- assert np.isclose(mel, 10.008767240008943)
-
-
-def test_inverse_mel_scale_function_with_htk_false():
- samp_freq = 16000
- frame_len_ms = 32
- frame_len_samples = samp_freq * frame_len_ms * 0.001
- num_mfcc_feats = 13
- num_fbank_bins = 128
- mel_lo_freq = 0
- mil_hi_freq = 8000
- use_htk = False
- n_FFT = 512
-
- mfcc_params = preprocess.MFCCParams(samp_freq, num_fbank_bins, mel_lo_freq, mil_hi_freq, num_mfcc_feats,
- frame_len_samples, use_htk, n_FFT)
-
- mfcc_inst = preprocess.MFCC(mfcc_params)
-
- mel = mfcc_inst.inv_mel_scale(16, False)
-
- assert np.isclose(mel, 1071.170287494467)
-
-
-def test_create_mel_filter_bank():
- samp_freq = 16000
- frame_len_ms = 32
- frame_len_samples = samp_freq * frame_len_ms * 0.001
- num_mfcc_feats = 13
- num_fbank_bins = 128
- mel_lo_freq = 0
- mil_hi_freq = 8000
- use_htk = False
- n_FFT = 512
-
- mfcc_params = preprocess.MFCCParams(samp_freq, num_fbank_bins, mel_lo_freq, mil_hi_freq, num_mfcc_feats,
- frame_len_samples, use_htk, n_FFT)
-
- mfcc_inst = preprocess.MFCC(mfcc_params)
-
- mel_filter_bank = mfcc_inst.create_mel_filter_bank()
-
- assert len(mel_filter_bank) == 128
-
- assert str(mel_filter_bank[0]) == "[0.02837754]"
- assert str(mel_filter_bank[1]) == "[0.01438901 0.01398853]"
- assert str(mel_filter_bank[2]) == "[0.02877802]"
- assert str(mel_filter_bank[3]) == "[0.04236608]"
- assert str(mel_filter_bank[4]) == "[0.00040047 0.02797707]"
- assert str(mel_filter_bank[5]) == "[0.01478948 0.01358806]"
- assert str(mel_filter_bank[50]) == "[0.03298853]"
- assert str(mel_filter_bank[100]) == "[0.00260166 0.00588759 0.00914814 0.00798015 0.00476919 0.00158245]"
-
-
-def test_mfcc_compute():
- samp_freq = 16000
- frame_len_ms = 32
- frame_len_samples = samp_freq * frame_len_ms * 0.001
- num_mfcc_feats = 13
- num_fbank_bins = 128
- mel_lo_freq = 0
- mil_hi_freq = 8000
- use_htk = False
- n_FFT = 512
-
- audio_data = np.array(test_wav) / (2 ** 15)
-
- mfcc_params = preprocess.MFCCParams(samp_freq, num_fbank_bins, mel_lo_freq, mil_hi_freq, num_mfcc_feats,
- frame_len_samples, use_htk, n_FFT)
- mfcc_inst = preprocess.MFCC(mfcc_params)
- mfcc_feats = mfcc_inst.mfcc_compute(audio_data)
-
- assert np.isclose((mfcc_feats[0]), -834.9656973095651)
- assert np.isclose((mfcc_feats[1]), 21.026915475076322)
- assert np.isclose((mfcc_feats[2]), 18.628541708201688)
- assert np.isclose((mfcc_feats[3]), 7.341153529494758)
- assert np.isclose((mfcc_feats[4]), 18.907974386153214)
- assert np.isclose((mfcc_feats[5]), -5.360387487466194)
- assert np.isclose((mfcc_feats[6]), 6.523572638527085)
- assert np.isclose((mfcc_feats[7]), -11.270643644983316)
- assert np.isclose((mfcc_feats[8]), 8.375177203773777)
- assert np.isclose((mfcc_feats[9]), 12.06721844362991)
- assert np.isclose((mfcc_feats[10]), 8.30815892468875)
- assert np.isclose((mfcc_feats[11]), -13.499911910889917)
- assert np.isclose((mfcc_feats[12]), -18.176121251436165)
-
-
-def test_sliding_window_for_small_num_samples():
- samp_freq = 16000
- frame_len_ms = 32
- frame_len_samples = samp_freq * frame_len_ms * 0.001
- num_mfcc_feats = 13
- mode_input_size = 9
- stride = 160
- num_fbank_bins = 128
- mel_lo_freq = 0
- mil_hi_freq = 8000
- use_htk = False
- n_FFT = 512
-
- audio_data = np.array(test_wav) / (2 ** 15)
-
- full_audio_data = np.tile(audio_data, 9)
-
- mfcc_params = preprocess.MFCCParams(samp_freq, num_fbank_bins, mel_lo_freq, mil_hi_freq, num_mfcc_feats,
- frame_len_samples, use_htk, n_FFT)
- mfcc_inst = preprocess.MFCC(mfcc_params)
- preprocessor = preprocess.Preprocessor(mfcc_inst, mode_input_size, stride)
-
- input_tensor = preprocessor.extract_features(full_audio_data)
-
- assert np.isclose(input_tensor[0][0], -3.4660944830426454)
- assert np.isclose(input_tensor[0][1], 0.3587718932127629)
- assert np.isclose(input_tensor[0][2], 0.3480551325669172)
- assert np.isclose(input_tensor[0][3], 0.2976191917228921)
- assert np.isclose(input_tensor[0][4], 0.3493037340849936)
- assert np.isclose(input_tensor[0][5], 0.2408643285767937)
- assert np.isclose(input_tensor[0][6], 0.2939659585037282)
- assert np.isclose(input_tensor[0][7], 0.2144552669573928)
- assert np.isclose(input_tensor[0][8], 0.302239565899944)
- assert np.isclose(input_tensor[0][9], 0.3187368787077345)
- assert np.isclose(input_tensor[0][10], 0.3019401051295793)
- assert np.isclose(input_tensor[0][11], 0.20449412797602678)
-
- assert np.isclose(input_tensor[0][38], -0.18751440767749533)
-
-
-def test_sliding_window_for_wav_2_letter_sized_input():
- samp_freq = 16000
- frame_len_ms = 32
- frame_len_samples = samp_freq * frame_len_ms * 0.001
- num_mfcc_feats = 13
- mode_input_size = 296
- stride = 160
- num_fbank_bins = 128
- mel_lo_freq = 0
- mil_hi_freq = 8000
- use_htk = False
- n_FFT = 512
-
- audio_data = np.zeros(47712, dtype=int)
-
- mfcc_params = preprocess.MFCCParams(samp_freq, num_fbank_bins, mel_lo_freq, mil_hi_freq, num_mfcc_feats,
- frame_len_samples, use_htk, n_FFT)
-
- mfcc_inst = preprocess.MFCC(mfcc_params)
- preprocessor = preprocess.Preprocessor(mfcc_inst, mode_input_size, stride)
-
- input_tensor = preprocessor.extract_features(audio_data)
-
- assert len(input_tensor[0]) == 39
- assert len(input_tensor) == 296
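The expected values in the deleted mel-scale tests match the two standard mel formulas: the HTK form when the flag is True, and Slaney's piecewise form (linear below 1 kHz, logarithmic above) when it is False. A self-contained sketch of both, not the example's preprocess.MFCC API:

    import math

    def mel_scale(freq_hz, use_htk):
        if use_htk:
            # HTK: mel = 1127 * ln(1 + f / 700)
            return 1127.0 * math.log(1.0 + freq_hz / 700.0)
        # Slaney: 3/200 mel per Hz below 1 kHz, logarithmic above.
        if freq_hz < 1000.0:
            return 3.0 * freq_hz / 200.0
        return 15.0 + 27.0 * math.log(freq_hz / 1000.0) / math.log(6.4)

    def inv_mel_scale(mel, use_htk):
        if use_htk:
            return 700.0 * (math.exp(mel / 1127.0) - 1.0)
        if mel < 15.0:
            return 200.0 * mel / 3.0
        return 1000.0 * math.exp((mel - 15.0) * math.log(6.4) / 27.0)

    # Reproduces the deleted tests' expectations:
    #   mel_scale(16, True)      ~= 25.4700
    #   mel_scale(16, False)     == 0.24
    #   inv_mel_scale(16, True)  ~= 10.0088
    #   inv_mel_scale(16, False) ~= 1071.1703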
diff --git a/python/pyarmnn/examples/speech_recognition/tests/testdata/inf_out.npy b/python/pyarmnn/examples/speech_recognition/tests/testdata/inf_out.npy
deleted file mode 100644
index a6f9ec0c70..0000000000
--- a/python/pyarmnn/examples/speech_recognition/tests/testdata/inf_out.npy
+++ /dev/null
Binary files differ
diff --git a/python/pyarmnn/examples/speech_recognition/tests/testdata/inference_output.npy b/python/pyarmnn/examples/speech_recognition/tests/testdata/inference_output.npy
new file mode 100644
index 0000000000..88c42e0b70
--- /dev/null
+++ b/python/pyarmnn/examples/speech_recognition/tests/testdata/inference_output.npy
Binary files differ
diff --git a/python/pyarmnn/examples/speech_recognition/tests/testdata/quick_brown_fox_16000khz.wav b/python/pyarmnn/examples/speech_recognition/tests/testdata/quick_brown_fox_16000khz.wav
deleted file mode 100644
index 761c36062e..0000000000
--- a/python/pyarmnn/examples/speech_recognition/tests/testdata/quick_brown_fox_16000khz.wav
+++ /dev/null
Binary files differ
diff --git a/python/pyarmnn/examples/speech_recognition/tests/testdata/wav2letter_labels.txt b/python/pyarmnn/examples/speech_recognition/tests/testdata/wav2letter_labels.txt
deleted file mode 100644
index d7485b7da2..0000000000
--- a/python/pyarmnn/examples/speech_recognition/tests/testdata/wav2letter_labels.txt
+++ /dev/null
@@ -1,29 +0,0 @@
-a
-b
-c
-d
-e
-f
-g
-h
-i
-j
-k
-l
-m
-n
-o
-p
-q
-r
-s
-t
-u
-v
-w
-x
-y
-z
-'
-
-$
\ No newline at end of file
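The deleted label file held one output symbol per line, indexed by line number, including the apostrophe, a space, and the '$' symbol; the hard-coded dict in test_decoder.py now replaces it. A dict_labels-style loader for such a file might look like this (hypothetical; the actual common_utils.dict_labels is not shown in this diff):

    def dict_labels(path):
        # Map line number -> symbol, stripping only the newline so that
        # the space label survives intact.
        with open(path, "r") as f:
            return {i: line.rstrip("\n") for i, line in enumerate(f)}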