diff options
Diffstat (limited to 'samples/SpeechRecognition/src/Wav2LetterPreprocessor.cpp')
-rw-r--r-- | samples/SpeechRecognition/src/Wav2LetterPreprocessor.cpp | 187 |
1 files changed, 187 insertions, 0 deletions
diff --git a/samples/SpeechRecognition/src/Wav2LetterPreprocessor.cpp b/samples/SpeechRecognition/src/Wav2LetterPreprocessor.cpp new file mode 100644 index 0000000000..9329d5e4d5 --- /dev/null +++ b/samples/SpeechRecognition/src/Wav2LetterPreprocessor.cpp @@ -0,0 +1,187 @@ +// +// Copyright © 2021 Arm Ltd and Contributors. All rights reserved. +// SPDX-License-Identifier: MIT +// +#include "MathUtils.hpp" +#include <cstring> +#include <cmath> +#include <numeric> +#include <algorithm> +#include <memory> +#include "Wav2LetterPreprocessor.hpp" +#include "Wav2LetterMFCC.hpp" + +float Wav2LetterPreprocessor::GetMean(Array2d<float>& vec) +{ + return MathUtils::MeanF32(vec.begin(), vec.totalSize()); +} + +float Wav2LetterPreprocessor::GetStdDev(Array2d<float>& vec, const float mean) +{ + return MathUtils::StdDevF32(vec.begin(), vec.totalSize(), mean); +} + +void Wav2LetterPreprocessor::NormaliseVec(Array2d<float>& vec) +{ + auto mean = Wav2LetterPreprocessor::GetMean(vec); + auto stddev = Wav2LetterPreprocessor::GetStdDev(vec, mean); + + if (stddev == 0) + { + std::fill(vec.begin(), vec.end(), 0); + } + else + { + const float stddevInv = 1.f/stddev; + const float normalisedMean = mean/stddev; + + auto NormalisingFunction = [=](float &value) { + value = value * stddevInv - normalisedMean; + }; + std::for_each(vec.begin(), vec.end(), NormalisingFunction); + } +} + +void Wav2LetterPreprocessor::Normalise() +{ + Wav2LetterPreprocessor::NormaliseVec(this->m_mfccBuf); + Wav2LetterPreprocessor::NormaliseVec(this->m_delta1Buf); + Wav2LetterPreprocessor::NormaliseVec(this->m_delta2Buf); +} + +float Wav2LetterPreprocessor::GetQuantElem( + const float elem, + const float quantScale, + const int quantOffset, + const float minVal, + const float maxVal) +{ + float val = std::round((elem/quantScale) + quantOffset); + float returnVal = std::min<float>(std::max<float>(val, minVal), maxVal); + return returnVal; +} + +bool Wav2LetterPreprocessor::Invoke(const float* audioData, const uint32_t audioDataLen, std::vector<int8_t>& output, + int quantOffset, float quantScale) +{ + this->m_window = SlidingWindow<const float>( + audioData, audioDataLen, + this->m_windowLen, this->m_windowStride); + + uint32_t mfccBufIdx = 0; + + // Init buffers with 0 + std::fill(m_mfccBuf.begin(), m_mfccBuf.end(), 0.f); + std::fill(m_delta1Buf.begin(), m_delta1Buf.end(), 0.f); + std::fill(m_delta2Buf.begin(), m_delta2Buf.end(), 0.f); + + // While we can slide over the window + while (this->m_window.HasNext()) + { + const float* mfccWindow = this->m_window.Next(); + auto mfccAudioData = std::vector<float>( + mfccWindow, + mfccWindow + this->m_windowLen); + + auto mfcc = this->m_mfcc->MfccCompute(mfccAudioData); + for (size_t i = 0; i < this->m_mfccBuf.size(0); ++i) + { + this->m_mfccBuf(i, mfccBufIdx) = mfcc[i]; + } + ++mfccBufIdx; + } + + // Pad MFCC if needed by repeating last feature vector + while (mfccBufIdx != this->m_mfcc->m_params.m_numMfccVectors) + { + memcpy(&this->m_mfccBuf(0, mfccBufIdx), + &this->m_mfccBuf(0, mfccBufIdx - 1), sizeof(float) * this->m_mfcc->m_params.m_numMfccFeatures); + ++mfccBufIdx; + } + + // Compute first and second order deltas from MFCCs + Wav2LetterPreprocessor::ComputeDeltas(this->m_mfccBuf, + this->m_delta1Buf, + this->m_delta2Buf); + + // Normalise + this->Normalise(); + + return this->Quantise<int8_t>(output.data(), quantOffset, quantScale); +} + +bool Wav2LetterPreprocessor::ComputeDeltas(Array2d<float>& mfcc, + Array2d<float>& delta1, + Array2d<float>& delta2) +{ + const std::vector <float> delta1Coeffs = + {6.66666667e-02, 5.00000000e-02, 3.33333333e-02, + 1.66666667e-02, -3.46944695e-18, -1.66666667e-02, + -3.33333333e-02, -5.00000000e-02, -6.66666667e-02}; + + const std::vector <float> delta2Coeffs = + {0.06060606, 0.01515152, -0.01731602, + -0.03679654, -0.04329004, -0.03679654, + -0.01731602, 0.01515152, 0.06060606}; + + if (delta1.size(0) == 0 || delta2.size(0) != delta1.size(0) || + mfcc.size(0) == 0 || mfcc.size(1) == 0) + { + return false; + } + + // Get the middle index; coeff vec len should always be odd + const size_t coeffLen = delta1Coeffs.size(); + const size_t fMidIdx = (coeffLen - 1)/2; + const size_t numFeatures = mfcc.size(0); + const size_t numFeatVectors = mfcc.size(1); + + // iterate through features in MFCC vector + for (size_t i = 0; i < numFeatures; ++i) + { + /* for each feature, iterate through time (t) samples representing feature evolution and + * calculate d/dt and d^2/dt^2, using 1d convolution with differential kernels. + * Convolution padding = valid, result size is `time length - kernel length + 1`. + * The result is padded with 0 from both sides to match the size of initial time samples data. + * + * For the small filter, conv1d implementation as a simple loop is efficient enough. + * Filters of a greater size would need CMSIS-DSP functions to be used, like arm_fir_f32. + */ + + for (size_t j = fMidIdx; j < numFeatVectors - fMidIdx; ++j) + { + float d1 = 0; + float d2 = 0; + const size_t mfccStIdx = j - fMidIdx; + + for (size_t k = 0, m = coeffLen - 1; k < coeffLen; ++k, --m) + { + + d1 += mfcc(i,mfccStIdx + k) * delta1Coeffs[m]; + d2 += mfcc(i,mfccStIdx + k) * delta2Coeffs[m]; + } + + delta1(i,j) = d1; + delta2(i,j) = d2; + } + } + + return true; +} + +Wav2LetterPreprocessor::Wav2LetterPreprocessor(const uint32_t windowLen, + const uint32_t windowStride, + std::unique_ptr<Wav2LetterMFCC> mfccInst): + m_mfcc(std::move(mfccInst)), + m_mfccBuf(m_mfcc->m_params.m_numMfccFeatures, m_mfcc->m_params.m_numMfccVectors), + m_delta1Buf(m_mfcc->m_params.m_numMfccFeatures, m_mfcc->m_params.m_numMfccVectors), + m_delta2Buf(m_mfcc->m_params.m_numMfccFeatures, m_mfcc->m_params.m_numMfccVectors), + m_windowLen(windowLen), + m_windowStride(windowStride) +{ + if (m_mfcc->m_params.m_numMfccFeatures > 0 && windowLen > 0) + { + this->m_mfcc->Init(); + } + std::fill(m_mfccBuf.begin(), m_mfccBuf.end(), 0.f); +}
\ No newline at end of file |