Diffstat (limited to 'samples/SpeechRecognition/include/Preprocess.hpp')
-rw-r--r--  samples/SpeechRecognition/include/Preprocess.hpp  175
1 file changed, 175 insertions(+), 0 deletions(-)
diff --git a/samples/SpeechRecognition/include/Preprocess.hpp b/samples/SpeechRecognition/include/Preprocess.hpp
new file mode 100644
index 0000000000..80c568439b
--- /dev/null
+++ b/samples/SpeechRecognition/include/Preprocess.hpp
@@ -0,0 +1,175 @@
+//
+// Copyright © 2020 Arm Ltd and Contributors. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#pragma once
+
+#include "DataStructures.hpp"
+#include "MFCC.hpp"
+#include "SlidingWindow.hpp"
+
+#include <numeric>
+
+using AudioWindow = SlidingWindow<const float>;
+
+/* Class to facilitate the pre-processing calculations for the Wav2Letter
+ * ASR model */
+class Preprocess
+{
+public:
+
+    MFCC _m_mfcc;                    /* MFCC feature extractor instance */
+
+    /* Feature buffers to be populated (contiguously allocated 2D arrays) */
+    Array2d<float> _m_mfccBuf;       /* MFCC coefficients */
+    Array2d<float> _m_delta1Buf;     /* First order deltas */
+    Array2d<float> _m_delta2Buf;     /* Second order deltas */
+
+    uint32_t _m_windowLen;           /* Window length for MFCC */
+    uint32_t _m_windowStride;        /* Window stride (in elements) for MFCC */
+    AudioWindow _m_window;           /* Sliding window over the audio data */
+
+    /**
+     * @brief Constructor
+     * @param[in] windowLen     number of elements in a window
+     * @param[in] windowStride  stride (in number of elements) for
+     *                          moving the window
+     * @param[in] mfccInst      MFCC feature extractor instance to be used
+     */
+ Preprocess(
+ const uint32_t windowLen,
+ const uint32_t windowStride,
+ const MFCC mfccInst);
+ Preprocess() = delete;
+ ~Preprocess();
+
+    /**
+     * @brief Calculates the features required from audio data. This
+     *        includes MFCC, first and second order deltas,
+     *        normalisation and finally, quantisation. The output
+     *        vector is populated with the features from a given window,
+     *        placed along a single row.
+     * @param[in]  audioData    pointer to the first element of the audio data
+     * @param[in]  audioDataLen number of elements in the audio data
+     * @param[out] output       output vector to be populated with the
+     *                          quantised features
+     * @param[in]  quantOffset  quantisation offset for the output
+     * @param[in]  quantScale   quantisation scale for the output
+     * @return true if successful, false in case of error.
+     */
+ bool Invoke(const float* audioData,
+ const uint32_t audioDataLen,
+ std::vector<int8_t>& output,
+ int quantOffset,
+ float quantScale);
+
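+    /* Illustrative usage sketch (not part of the sample): the variable
+     * names, window sizes and quantisation parameters below are
+     * assumptions; in the sample they are derived from the model and its
+     * input tensor.
+     *
+     * @code
+     *     MFCC mfccInst = ...;  // MFCC instance configured for the model
+     *     Preprocess preprocessor(windowLen, windowStride, mfccInst);
+     *
+     *     std::vector<int8_t> modelInput(inputTensorSize);
+     *     bool ok = preprocessor.Invoke(audioData.data(), audioData.size(),
+     *                                   modelInput, inputQuantOffset,
+     *                                   inputQuantScale);
+     * @endcode
+     */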
+
+protected:
+ /**
+ * @brief Computes the first and second order deltas for the
+ * MFCC buffers - they are assumed to be populated.
+ *
+ * @param[in] mfcc MFCC buffers
+ * @param[out] delta1 result of the first diff computation
+ * @param[out] delta2 result of the second diff computation
+ *
+ * @return true if successful, false otherwise
+ */
+ static bool _ComputeDeltas(Array2d<float>& mfcc,
+ Array2d<float>& delta1,
+ Array2d<float>& delta2);
+
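+    /* For reference, first order deltas of this kind are commonly computed
+     * as a regression over neighbouring MFCC frames (a standard formulation,
+     * not necessarily the exact kernel used in Preprocess.cpp):
+     *
+     *     delta[t] = sum_{n=1..N} n * (mfcc[t+n] - mfcc[t-n])
+     *                / (2 * sum_{n=1..N} n^2)
+     *
+     * Second order deltas are obtained by applying the same operation to
+     * the first order deltas. */
+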
+ /**
+ * @brief Given a 2D vector of floats, computes the mean
+ * @param[in] vec vector of vector of floats
+ * @return mean value
+ */
+ static float _GetMean(Array2d<float>& vec);
+
+ /**
+ * @brief Given a 2D vector of floats, computes the stddev
+ * @param[in] vec vector of vector of floats
+ * @param[in] mean mean value of the vector passed in
+ * @return stddev value
+ */
+ static float _GetStdDev(Array2d<float>& vec,
+ const float mean);
+
+    /**
+     * @brief Given a 2D vector of floats, normalises it using
+     *        the mean and the stddev
+     * @param[in,out] vec vector of vector of floats
+     */
+    static void _NormaliseVec(Array2d<float>& vec);
+
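+    /* For reference, the intent of the normalisation helpers above is the
+     * usual zero mean, unit variance transform (a sketch; see Preprocess.cpp
+     * for the exact handling of edge cases such as a zero stddev):
+     *
+     *     vec(i, j) = (vec(i, j) - mean) / stddev
+     */
+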
+    /**
+     * @brief Normalises the MFCC and delta buffers
+     */
+    void _Normalise();
+
+    /**
+     * @brief Given the quantisation parameters and data type limits,
+     *        computes the quantised value of a floating point input.
+     * @param[in] elem        element to be quantised
+     * @param[in] quantScale  quantisation scale
+     * @param[in] quantOffset quantisation offset
+     * @param[in] minVal      numerical limit - minimum
+     * @param[in] maxVal      numerical limit - maximum
+     * @return quantised value, as a float
+     */
+ static float _GetQuantElem(
+ const float elem,
+ const float quantScale,
+ const int quantOffset,
+ const float minVal,
+ const float maxVal);
+
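+    /* For reference, affine quantisation of this kind typically maps a
+     * floating point value onto the integer grid as (a sketch of the
+     * intent; the exact rounding behaviour is defined in Preprocess.cpp):
+     *
+     *     quantised = clamp(round(elem / quantScale) + quantOffset,
+     *                       minVal, maxVal)
+     */
+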
+    /**
+     * @brief Quantises the MFCC and delta buffers, and places them
+     *        in the output buffer. While doing so, it transposes
+     *        the data. Reason: buffers in this class are arranged
+     *        with the "time" axis as the row major axis, primarily
+     *        for the convolution speed up (contiguous memory access).
+     *        The output, however, requires the time axis to be in
+     *        column major arrangement.
+     * @param[out] outputBuf   pointer to the output buffer
+     * @param[in]  quantOffset quantisation offset
+     * @param[in]  quantScale  quantisation scale
+     * @return true if successful, false otherwise
+     */
+ template <typename T>
+ bool _Quantise(T* outputBuf, int quantOffset, float quantScale)
+ {
+        /* Set up write pointers for the three feature blocks of a set:
+         * MFCC, first order deltas, second order deltas. */
+        T* outputBufMfcc = outputBuf;
+        T* outputBufD1 = outputBuf + this->_m_mfcc._m_params.m_numMfccFeatures;
+        T* outputBufD2 = outputBufD1 + this->_m_mfcc._m_params.m_numMfccFeatures;
+
+        /* After a block has been written, skip over the other two blocks
+         * to reach the same block in the next set (3 vectors - 1 vector). */
+        const uint32_t ptrIncr = this->_m_mfcc._m_params.m_numMfccFeatures * 2;
+
+ const float minVal = std::numeric_limits<T>::min();
+ const float maxVal = std::numeric_limits<T>::max();
+
+        /* We need to do a transpose while copying and concatenating
+         * the tensor. */
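+        /* Resulting layout per set of outputs, with F = m_numMfccFeatures
+         * (illustrative; follows from the pointer arithmetic above):
+         *     [ mfcc_0 .. mfcc_{F-1} | d1_0 .. d1_{F-1} | d2_0 .. d2_{F-1} ]
+         * repeated once for each of the m_numMfccVectors input columns. */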
+ for (uint32_t j = 0; j < this->_m_mfcc._m_params.m_numMfccVectors; ++j) {
+ for (uint32_t i = 0; i < this->_m_mfcc._m_params.m_numMfccFeatures; ++i)
+ {
+ *outputBufMfcc++ = static_cast<T>(this->_GetQuantElem(
+ this->_m_mfccBuf(i, j), quantScale,
+ quantOffset, minVal, maxVal));
+ *outputBufD1++ = static_cast<T>(this->_GetQuantElem(
+ this->_m_delta1Buf(i, j), quantScale,
+ quantOffset, minVal, maxVal));
+ *outputBufD2++ = static_cast<T>(this->_GetQuantElem(
+ this->_m_delta2Buf(i, j), quantScale,
+ quantOffset, minVal, maxVal));
+ }
+ outputBufMfcc += ptrIncr;
+ outputBufD1 += ptrIncr;
+ outputBufD2 += ptrIncr;
+ }
+
+ return true;
+ }
+};
+