aboutsummaryrefslogtreecommitdiff
path: root/samples/SpeechRecognition/src/AudioCapture.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'samples/SpeechRecognition/src/AudioCapture.cpp')
-rw-r--r--samples/SpeechRecognition/src/AudioCapture.cpp104
1 files changed, 104 insertions, 0 deletions
diff --git a/samples/SpeechRecognition/src/AudioCapture.cpp b/samples/SpeechRecognition/src/AudioCapture.cpp
new file mode 100644
index 0000000000..f3b9092218
--- /dev/null
+++ b/samples/SpeechRecognition/src/AudioCapture.cpp
@@ -0,0 +1,104 @@
+//
+// Copyright © 2020 Arm Ltd and Contributors. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#include "AudioCapture.hpp"
+
+#include <alsa/asoundlib.h>
+#include <sndfile.h>
+#include <samplerate.h>
+
+#include <cmath>
+#include <memory>
+#include <stdexcept>
+
+namespace asr
+{
+ /**
+  * @brief Loads an audio file, down-mixes it to mono and resamples it to 16 kHz.
+  *
+  * The file is decoded with libsndfile as interleaved float samples, the channels of each
+  * frame are averaged into one mono sample, and the mono signal is resampled with
+  * libsamplerate's src_simple() using SRC_SINC_BEST_QUALITY.
+  *
+  * @param[in] filePath  Path of the audio file to open for reading.
+  * @return The resampled mono samples; empty when no complete frame could be decoded.
+  * @throws std::runtime_error if the file cannot be opened, reports a non-positive
+  *         sample rate, or the resampler returns an error.
+  */
+ std::vector<float> AudioCapture::LoadAudioFile(std::string filePath)
+ {
+     // libsndfile requires SF_INFO::format to be zero when a file is opened for reading.
+     SF_INFO inputSoundFileInfo{};
+
+     // Own the handle so it is closed on every exit path, including exceptions.
+     std::unique_ptr<SNDFILE, decltype(&sf_close)> infile(
+         sf_open(filePath.c_str(), SFM_READ, &inputSoundFileInfo), &sf_close);
+     if (!infile)
+     {
+         // sf_strerror(nullptr) reports the error of the most recent failed open.
+         throw std::runtime_error("Error, unable to open audio file '" + filePath + "': " +
+                                  sf_strerror(nullptr));
+     }
+
+     if (inputSoundFileInfo.samplerate <= 0)
+     {
+         throw std::runtime_error("Error, audio file '" + filePath + "' has an invalid sample rate.");
+     }
+
+     const sf_count_t numChannels = inputSoundFileInfo.channels;
+     if (numChannels <= 0 || inputSoundFileInfo.frames <= 0)
+     {
+         return {};
+     }
+
+     // Heap storage: the sample count is only known at run time and can be far larger than
+     // what fits on the stack.
+     std::vector<float> audioIn(static_cast<size_t>(numChannels * inputSoundFileInfo.frames));
+     const sf_count_t itemsRead =
+         sf_read_float(infile.get(), audioIn.data(), static_cast<sf_count_t>(audioIn.size()));
+     infile.reset();
+
+     // Only whole frames that were actually decoded are processed.
+     const sf_count_t framesRead = itemsRead / numChannels;
+     if (framesRead <= 0)
+     {
+         return {};
+     }
+     const size_t frames   = static_cast<size_t>(framesRead);
+     const size_t channels = static_cast<size_t>(numChannels);
+
+     // Convert to mono: average the interleaved channel samples of every frame.
+     std::vector<float> monoData(frames);
+     for (size_t i = 0; i < frames; ++i)
+     {
+         float val = 0.0f;
+         for (size_t j = 0; j < channels; ++j)
+         {
+             val += audioIn[i * channels + j];
+         }
+         monoData[i] = val / static_cast<float>(channels);
+     }
+
+     // Resample
+     const float sampleRate = 16000.0f;
+     const float srcRatio   = sampleRate / static_cast<float>(inputSoundFileInfo.samplerate);
+     // The product is formed in double so long recordings do not lose frames to float rounding.
+     const long outputFrames =
+         static_cast<long>(std::ceil(static_cast<double>(frames) * static_cast<double>(srcRatio)));
+     std::vector<float> dataOut(static_cast<size_t>(outputFrames));
+
+     SRC_DATA srcData{};
+     srcData.data_in       = monoData.data();
+     srcData.input_frames  = static_cast<long>(frames);
+     srcData.data_out      = dataOut.data();
+     srcData.output_frames = outputFrames;
+     srcData.src_ratio     = srcRatio;
+
+     const int srcError = src_simple(&srcData, SRC_SINC_BEST_QUALITY, 1);
+     if (srcError != 0)
+     {
+         throw std::runtime_error(std::string("Error, unable to resample audio: ") +
+                                  src_strerror(srcError));
+     }
+
+     // Keep only the frames the resampler actually produced.
+     dataOut.resize(static_cast<size_t>(srcData.output_frames_gen));
+     return dataOut;
+ }
+
+ /**
+  * @brief Replaces the member sliding window with a newly constructed one.
+  *
+  * All four arguments are passed unchanged, in this order, to the
+  * SlidingWindow<const float> constructor.
+  *
+  * @param[in] data        Pointer to the first audio sample.
+  *                        NOTE(review): presumably the window only references this buffer,
+  *                        so it must outlive the window - confirm against SlidingWindow.
+  * @param[in] dataSize    Size argument passed to the window (presumably the number of
+  *                        samples in data - confirm).
+  * @param[in] minSamples  Third constructor argument (presumably the window size in
+  *                        samples - confirm).
+  * @param[in] stride      Fourth constructor argument (presumably the step between
+  *                        consecutive windows - confirm).
+  */
+ void AudioCapture::InitSlidingWindow(float* data, size_t dataSize, int minSamples, size_t stride)
+ {
+     m_window = SlidingWindow<const float>(data, dataSize, minSamples, stride);
+ }
+
+ /**
+  * @brief Reports whether the sliding window can provide another window of audio.
+  *
+  * @return The result of HasNext() on the member sliding window.
+  */
+ bool AudioCapture::HasNext()
+ {
+     const bool moreWindows = this->m_window.HasNext();
+     return moreWindows;
+ }
+
+ /**
+  * @brief Advances the sliding window and returns its contents as a vector.
+  *
+  * The returned vector always holds GetWindowSize() samples. If fewer samples than that
+  * remained before advancing, only the remaining samples are copied and the tail of the
+  * vector is left zero-filled.
+  *
+  * @return A copy of the next window of audio samples, zero-padded when needed.
+  * @throws std::out_of_range if the sliding window has no further window to provide.
+  */
+ std::vector<float> AudioCapture::Next()
+ {
+     if (!m_window.HasNext())
+     {
+         throw std::out_of_range("Error, end of audio data reached.");
+     }
+
+     // The remaining count has to be sampled before Next() advances the window.
+     const int samplesLeft    = m_window.RemainingData();
+     const float* windowStart = m_window.Next();
+     const size_t frameLength = m_window.GetWindowSize();
+
+     // Full window available: copy it straight out.
+     if (static_cast<size_t>(samplesLeft) >= frameLength)
+     {
+         return std::vector<float>(windowStart, windowStart + frameLength);
+     }
+
+     // Partial final window: copy what is left, the rest stays at 0.0f.
+     std::vector<float> paddedWindow(frameLength, 0.0f);
+     for (int idx = 0; idx < samplesLeft; ++idx)
+     {
+         paddedWindow[idx] = windowStart[idx];
+     }
+     return paddedWindow;
+ }
+} //namespace asr
+