// // Copyright © 2020 Arm Ltd and Contributors. All rights reserved. // SPDX-License-Identifier: MIT // #pragma once #include "DataStructures.hpp" #include "SlidingWindow.hpp" #include #include "MFCC.hpp" /* Class to facilitate pre-processing calculation for Wav2Letter model * for ASR */ using AudioWindow = SlidingWindow ; class Preprocess { public: MFCC _m_mfcc; /* MFCC instance */ /* Actual buffers to be populated */ Array2d _m_mfccBuf; /* Contiguous buffer 1D: MFCC */ Array2d _m_delta1Buf; /* Contiguous buffer 1D: Delta 1 */ Array2d _m_delta2Buf; /* Contiguous buffer 1D: Delta 2 */ uint32_t _m_windowLen; /* Window length for MFCC */ uint32_t _m_windowStride; /* Window stride len for MFCC */ AudioWindow _m_window; /* Sliding window */ /** * @brief Constructor * @param[in] numMfccFeatures number of MFCC features per window * @param[in] windowLen number of elements in a window * @param[in] windowStride stride (in number of elements) for * moving the window * @param[in] numMfccVectors number of MFCC vectors per window */ Preprocess( const uint32_t windowLen, const uint32_t windowStride, const MFCC mfccInst); Preprocess() = delete; ~Preprocess(); /** * @brief Calculates the features required from audio data. This * includes MFCC, first and second order deltas, * normalisation and finally, quantisation. The tensor is * populated with feature from a given window placed along * in a single row. * @param[in] audioData pointer to the first element of audio data * @param[in] audioDataLen number of elements in the audio data * @param[in] tensor tensor to be populated * @return true if successful, false in case of error. */ bool Invoke(const float* audioData, const uint32_t audioDataLen, std::vector& output, int quantOffset, float quantScale); protected: /** * @brief Computes the first and second order deltas for the * MFCC buffers - they are assumed to be populated. * * @param[in] mfcc MFCC buffers * @param[out] delta1 result of the first diff computation * @param[out] delta2 result of the second diff computation * * @return true if successful, false otherwise */ static bool _ComputeDeltas(Array2d& mfcc, Array2d& delta1, Array2d& delta2); /** * @brief Given a 2D vector of floats, computes the mean * @param[in] vec vector of vector of floats * @return mean value */ static float _GetMean(Array2d& vec); /** * @brief Given a 2D vector of floats, computes the stddev * @param[in] vec vector of vector of floats * @param[in] mean mean value of the vector passed in * @return stddev value */ static float _GetStdDev(Array2d& vec, const float mean); /** * @brief Given a 2D vector of floats, normalises it using * the mean and the stddev * @param[in/out] vec vector of vector of floats * @return */ static void _NormaliseVec(Array2d& vec); /** * @brief Normalises the MFCC and delta buffers * @return */ void _Normalise(); /** * @brief Given the quantisation and data type limits, computes * the quantised values of a floating point input data. * @param[in] elem Element to be quantised * @param[in] quantScale Scale * @param[in] quantOffset Offset * @param[in] minVal Numerical limit - minimum * @param[in] maxVal Numerical limit - maximum * @return floating point quantised value */ static float _GetQuantElem( const float elem, const float quantScale, const int quantOffset, const float minVal, const float maxVal); /** * @brief Quantises the MFCC and delta buffers, and places them * in the output buffer. While doing so, it transposes * the data. Reason: Buffers in this class are arranged * for "time" axis to be row major. Primary reason for * this being the convolution speed up (as we can use * contiguous memory). The output, however, requires the * time axis to be in column major arrangement. * @param[in] outputBuf pointer to the output buffer * @param[in] outputBufSz output buffer's size * @param[in] quantScale quantisation scale * @param[in] quantOffset quantisation offset */ template bool _Quantise(T* outputBuf, int quantOffset, float quantScale) { /* Populate */ T* outputBufMfcc = outputBuf; T* outputBufD1 = outputBuf + this->_m_mfcc._m_params.m_numMfccFeatures; T* outputBufD2 = outputBufD1 + this->_m_mfcc._m_params.m_numMfccFeatures; const uint32_t ptrIncr = this->_m_mfcc._m_params.m_numMfccFeatures * 2; /* (3 vectors - 1 vector) */ const float minVal = std::numeric_limits::min(); const float maxVal = std::numeric_limits::max(); /* We need to do a transpose while copying and concatenating * the tensor*/ for (uint32_t j = 0; j < this->_m_mfcc._m_params.m_numMfccVectors; ++j) { for (uint32_t i = 0; i < this->_m_mfcc._m_params.m_numMfccFeatures; ++i) { *outputBufMfcc++ = static_cast(this->_GetQuantElem( this->_m_mfccBuf(i, j), quantScale, quantOffset, minVal, maxVal)); *outputBufD1++ = static_cast(this->_GetQuantElem( this->_m_delta1Buf(i, j), quantScale, quantOffset, minVal, maxVal)); *outputBufD2++ = static_cast(this->_GetQuantElem( this->_m_delta2Buf(i, j), quantScale, quantOffset, minVal, maxVal)); } outputBufMfcc += ptrIncr; outputBufD1 += ptrIncr; outputBufD2 += ptrIncr; } return true; } };