aboutsummaryrefslogtreecommitdiff
path: root/samples/SpeechRecognition/include
diff options
context:
space:
mode:
Diffstat (limited to 'samples/SpeechRecognition/include')
-rw-r--r--samples/SpeechRecognition/include/AudioCapture.hpp62
-rw-r--r--samples/SpeechRecognition/include/DataStructures.hpp102
-rw-r--r--samples/SpeechRecognition/include/Decoder.hpp63
-rw-r--r--samples/SpeechRecognition/include/MFCC.hpp244
-rw-r--r--samples/SpeechRecognition/include/MathUtils.hpp85
-rw-r--r--samples/SpeechRecognition/include/Preprocess.hpp175
-rw-r--r--samples/SpeechRecognition/include/SlidingWindow.hpp161
-rw-r--r--samples/SpeechRecognition/include/SpeechRecognitionPipeline.hpp139
8 files changed, 1031 insertions, 0 deletions
diff --git a/samples/SpeechRecognition/include/AudioCapture.hpp b/samples/SpeechRecognition/include/AudioCapture.hpp
new file mode 100644
index 0000000000..90c2eccacf
--- /dev/null
+++ b/samples/SpeechRecognition/include/AudioCapture.hpp
@@ -0,0 +1,62 @@
+//
+// Copyright © 2020 Arm Ltd and Contributors. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#pragma once
+
+#include <string>
+#include <iostream>
+
+#include <math.h>
+
+#include <vector>
+
+#include <exception>
+
+#include "SlidingWindow.hpp"
+
+namespace asr
+{
+
+/**
+ * @brief Class used to capture the audio data loaded from file, and to provide a method of
+ *        extracting correctly positioned and appropriately sized audio windows.
+ *
+ * Only the declarations live in this header; the member functions are defined elsewhere.
+ */
+ class AudioCapture
+ {
+ public:
+
+ /* Sliding window over read-only float samples; configured through InitSlidingWindow(). */
+ SlidingWindow<const float> m_window;
+ /* Initialised to 0. NOTE(review): presumably the sample index reached by the last read - confirm against the implementation. */
+ int lastReadIdx= 0;
+
+ /**
+ * @brief Default constructor. Leaves m_window default-constructed and lastReadIdx at 0.
+ */
+ AudioCapture()
+ {};
+
+ /**
+ * @brief Function to load the audio data captured from the
+ * input file to memory.
+ * @param[in] filePath path of the audio file to load (taken by value).
+ * @return the loaded audio samples as a vector of floats.
+ */
+ std::vector<float> LoadAudioFile(std::string filePath);
+
+ /**
+ * @brief Function to initialize the sliding window. This will set its position in memory, its
+ * window size and its stride.
+ * @param[in] data pointer to the first audio sample the window slides over.
+ * @param[in] dataSize number of samples available at data.
+ * @param[in] minSamples NOTE(review): presumably the window size in samples - confirm against the implementation.
+ * @param[in] stride distance, in samples, between the starts of consecutive windows.
+ */
+ void InitSlidingWindow(float* data, size_t dataSize, int minSamples, size_t stride);
+
+ /**
+ * Checks whether there is another block of audio in memory to read
+ * @return true if another block can be read, false otherwise.
+ */
+ bool HasNext();
+
+ /**
+ * Retrieves the next block of audio if it is available
+ * @return the next block of audio samples, returned by value.
+ */
+ std::vector<float> Next();
+ };
+} // namespace asr
\ No newline at end of file
diff --git a/samples/SpeechRecognition/include/DataStructures.hpp b/samples/SpeechRecognition/include/DataStructures.hpp
new file mode 100644
index 0000000000..9922265299
--- /dev/null
+++ b/samples/SpeechRecognition/include/DataStructures.hpp
@@ -0,0 +1,102 @@
+//
+// Copyright © 2020 Arm Ltd and Contributors. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+#pragma once
+
+#include <stdio.h>
+#include <iterator>
+
+/**
+ * Class Array2d is a data structure that represents a two dimensional array.
+ * The data is allocated in contiguous memory, arranged row-wise
+ * and individual elements can be accessed with the () operator.
+ * For example a two dimensional array D of size (M, N) can be accessed:
+ *
+ *               _|<------------- col size = N  -------->|
+ *               |  D(r=0, c=0) D(r=0, c=1)... D(r=0, c=N-1)
+ *               |  D(r=1, c=0) D(r=1, c=1)... D(r=1, c=N-1)
+ *               |  ...
+ * row size = M     ...
+ *               |  ...
+ *               _  D(r=M-1, c=0) D(r=M-1, c=1)... D(r=M-1, c=N-1)
+ *
+ * The array owns its buffer. Copies are deep and moves transfer the buffer,
+ * so the buffer is released exactly once.
+ */
+template<typename T>
+class Array2d
+{
+private:
+    /* In-class initialisers keep the object consistent on every constructor path. */
+    size_t m_rows = 0;
+    size_t m_cols = 0;
+    T* m_data = nullptr;
+
+public:
+    /**
+     * Creates the array2d with the given sizes.
+     * If either size is 0 a message is printed and the array is left empty
+     * (no allocation, 0 rows, 0 columns).
+     *
+     * @param rows  number of rows.
+     * @param cols  number of columns.
+     */
+    Array2d(unsigned rows, unsigned cols)
+    {
+        if (rows == 0 || cols == 0) {
+            printf("Array2d constructor has 0 size.\n");
+            return;
+        }
+        m_rows = rows;
+        m_cols = cols;
+        /* Multiply as size_t so the element count cannot wrap in unsigned arithmetic. */
+        m_data = new T[m_rows * m_cols];
+    }
+
+    /**
+     * Deep-copies another array2d.
+     *
+     * @param other array to copy from.
+     */
+    Array2d(const Array2d& other)
+    {
+        if (other.m_data == nullptr) {
+            return;
+        }
+        const size_t count = other.m_rows * other.m_cols;
+        T* copy = new T[count];
+        try {
+            for (size_t i = 0; i < count; ++i) {
+                copy[i] = other.m_data[i];
+            }
+        } catch (...) {
+            /* Element assignment threw: release the new buffer before propagating. */
+            delete[] copy;
+            throw;
+        }
+        m_rows = other.m_rows;
+        m_cols = other.m_cols;
+        m_data = copy;
+    }
+
+    /**
+     * Takes over the buffer of another array2d, leaving it empty.
+     *
+     * @param other array to move from.
+     */
+    Array2d(Array2d&& other) noexcept
+        : m_rows(other.m_rows), m_cols(other.m_cols), m_data(other.m_data)
+    {
+        other.m_rows = 0;
+        other.m_cols = 0;
+        other.m_data = nullptr;
+    }
+
+    /**
+     * Copy/move assignment. The argument is taken by value (so it is already
+     * a copy or a moved-from temporary) and its contents are swapped in; the
+     * previous buffer is released when the argument is destroyed.
+     *
+     * @param other array to assign from.
+     * @return reference to this array.
+     */
+    Array2d& operator=(Array2d other) noexcept
+    {
+        const size_t rows = m_rows;
+        const size_t cols = m_cols;
+        T* const data = m_data;
+        m_rows = other.m_rows;
+        m_cols = other.m_cols;
+        m_data = other.m_data;
+        other.m_rows = rows;
+        other.m_cols = cols;
+        other.m_data = data;
+        return *this;
+    }
+
+    ~Array2d()
+    {
+        delete[] m_data;
+    }
+
+    /**
+     * Mutable element access. No bounds checking is performed.
+     *
+     * @param row row index.
+     * @param col column index.
+     * @return reference to the element at (row, col).
+     */
+    T& operator() (unsigned int row, unsigned int col)
+    {
+        return m_data[m_cols * row + col];
+    }
+
+    /**
+     * Read-only element access. No bounds checking is performed.
+     *
+     * @param row row index.
+     * @param col column index.
+     * @return copy of the element at (row, col).
+     */
+    T operator() (unsigned int row, unsigned int col) const
+    {
+        return m_data[m_cols * row + col];
+    }
+
+    /**
+     * Gets the extent of the array2d along one dimension.
+     * @param dim 0 for rows, 1 for columns.
+     * @return number of rows or columns; 0 for any other dim.
+     */
+    size_t size(size_t dim) const
+    {
+        switch (dim)
+        {
+            case 0:
+                return m_rows;
+            case 1:
+                return m_cols;
+            default:
+                return 0;
+        }
+    }
+
+    /**
+     * Gets the array2d total size.
+     * @return number of elements (rows * columns).
+     */
+    size_t totalSize() const
+    {
+        return m_rows * m_cols;
+    }
+
+    /**
+     * array2d iterator.
+     */
+    using iterator=T*;
+    using const_iterator=T const*;
+
+    iterator begin() { return m_data; }
+    iterator end() { return m_data + totalSize(); }
+    const_iterator begin() const { return m_data; }
+    const_iterator end() const { return m_data + totalSize(); }
+};
diff --git a/samples/SpeechRecognition/include/Decoder.hpp b/samples/SpeechRecognition/include/Decoder.hpp
new file mode 100644
index 0000000000..69d97ccf64
--- /dev/null
+++ b/samples/SpeechRecognition/include/Decoder.hpp
@@ -0,0 +1,63 @@
+//
+// Copyright © 2020 Arm Ltd and Contributors. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#include <string>
+#include <map>
+#include <vector>
+#include <algorithm>
+#include <cmath>
+
+# pragma once
+
+namespace asr
+{
+/**
+* @brief Class used to Decode the output of the ASR inference
+*
+*/
+    class Decoder
+    {
+    public:
+        std::map<int, std::string> m_labels;
+        /**
+        * @brief Constructor
+        * @param[in]    labels - map of labels to be used for decoding to text.
+        */
+        Decoder(std::map<int, std::string>& labels);
+
+        /**
+        * @brief Function to decode the output into a text string.
+        *
+        * The input is read as consecutive rows of 29 scores. For each complete row the
+        * index of the largest score is looked up in m_labels and the first character of
+        * that label is collected; the collected characters are then passed through
+        * FilterCharacters(). Trailing elements that do not fill a whole row are ignored.
+        * Scores are compared in their own type T, without any narrowing.
+        *
+        * @param[in]    contextToProcess - the output vector to decode.
+        * @return       the decoded and filtered text.
+        * @throws       std::out_of_range (from std::map::at) if a winning index has no label.
+        */
+        template<typename T>
+        std::string DecodeOutput(std::vector<T>& contextToProcess)
+        {
+            using Offset = typename std::vector<T>::difference_type;
+
+            const size_t rowLength = 29;
+            const size_t rowCount = contextToProcess.size() / rowLength;
+
+            std::vector<char> unfilteredText;
+            unfilteredText.reserve(rowCount);
+
+            for (size_t row = 0; row < rowCount; ++row)
+            {
+                /* Argmax directly over the row: no per-row copy and no lossy cast. */
+                const auto rowBegin = contextToProcess.begin() + static_cast<Offset>(row * rowLength);
+                const auto rowEnd = rowBegin + static_cast<Offset>(rowLength);
+                const int maxIndex = static_cast<int>(
+                    std::distance(rowBegin, std::max_element(rowBegin, rowEnd)));
+                unfilteredText.emplace_back(this->m_labels.at(maxIndex)[0]);
+            }
+
+            return FilterCharacters(unfilteredText);
+        }
+
+        /**
+        * @brief Function to filter out unwanted characters
+        * @param[in]    unfiltered - the unfiltered output to be processed.
+        * @return       the filtered text.
+        */
+        std::string FilterCharacters(std::vector<char>& unfiltered);
+    };
+} // namespace asr
diff --git a/samples/SpeechRecognition/include/MFCC.hpp b/samples/SpeechRecognition/include/MFCC.hpp
new file mode 100644
index 0000000000..14b6d9fe79
--- /dev/null
+++ b/samples/SpeechRecognition/include/MFCC.hpp
@@ -0,0 +1,244 @@
+//
+// Copyright © 2020 Arm Ltd and Contributors. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#pragma once
+
+#include <vector>
+#include <cstdint>
+#include <cmath>
+#include <limits>
+#include <string>
+
+/* MFCC's consolidated parameters.
+ * Plain holder for the settings read by the MFCC class; all members are public.
+ * The constructor and Str() are only declared here and defined elsewhere. */
+class MfccParams
+{
+public:
+ float m_samplingFreq; /* sampling frequency - NOTE(review): presumably in Hz, confirm */
+ int m_numFbankBins; /* number of filter bank bins */
+ float m_melLoFreq; /* lower frequency bound of the Mel filter bank */
+ float m_melHiFreq; /* upper frequency bound of the Mel filter bank */
+ int m_numMfccFeatures; /* number of MFCC features per vector */
+ int m_frameLen; /* frame length - NOTE(review): presumably in samples, confirm */
+ int m_frameLenPadded; /* not a constructor argument - NOTE(review): presumably derived from frameLen, confirm */
+ bool m_useHtkMethod; /* whether the HTK method is used for Mel calculations */
+ int m_numMfccVectors; /* number of MFCC vectors */
+
+ /** @brief Constructor
+ * NOTE(review): the arguments presumably initialise the identically named members;
+ * the definition is not in this header - confirm. */
+ MfccParams(const float samplingFreq, const int numFbankBins,
+ const float melLoFreq, const float melHiFreq,
+ const int numMfccFeats, const int frameLen,
+ const bool useHtkMethod, const int numMfccVectors);
+
+ /* Delete the default constructor */
+ MfccParams() = delete;
+
+ /* Default destructor */
+ ~MfccParams() = default;
+
+ /** @brief String representation of parameters
+ * @return the parameters formatted as a string */
+ std::string Str();
+};
+
+/**
+ * @brief   Class for MFCC feature extraction.
+ *          Based on https://github.com/ARM-software/ML-KWS-for-MCU/blob/master/Deployment/Source/MFCC/mfcc.cpp
+ *          This class is designed to be generic and self-sufficient but
+ *          certain calculation routines can be overridden to accommodate
+ *          use-case specific requirements.
+ */
+class MFCC
+{
+
+public:
+
+    /**
+    * @brief        Extract MFCC features for one single small frame of
+    *               audio data e.g. 640 samples.
+    * @param[in]    audioData - Vector of audio samples to calculate
+    *               features for.
+    * @return       Vector of extracted MFCC features.
+    **/
+    std::vector<float> MfccCompute(const std::vector<float>& audioData);
+
+    MfccParams _m_params;
+
+    /**
+     * @brief       Constructor
+     * @param[in]   params - MFCC parameters
+    */
+    MFCC(const MfccParams& params);
+
+    /* Delete the default constructor */
+    MFCC() = delete;
+
+    /**
+     * @brief   Default destructor. Virtual because this class exposes
+     *          overridable routines, so a derived object can be destroyed
+     *          safely through an MFCC pointer.
+     */
+    virtual ~MFCC() = default;
+
+    /** @brief  Initialise */
+    void Init();
+
+    /**
+     * @brief        Extract MFCC features and quantise for one single small
+     *               frame of audio data e.g. 640 samples.
+     * @param[in]    audioData - Vector of audio samples to calculate
+     *               features for.
+     * @param[in]    quantScale - quantisation scale.
+     * @param[in]    quantOffset - quantisation offset
+     * @return       Vector of extracted quantised MFCC features.
+     **/
+    template<typename T>
+    std::vector<T> MfccComputeQuant(const std::vector<float>& audioData,
+                                    const float quantScale,
+                                    const int quantOffset)
+    {
+        this->_MfccComputePreFeature(audioData);
+
+        /* lowest() is the most negative finite value for every arithmetic T;
+         * min() would be the smallest positive value for floating point T. */
+        const float minVal = std::numeric_limits<T>::lowest();
+        const float maxVal = std::numeric_limits<T>::max();
+
+        std::vector<T> mfccOut(this->_m_params.m_numMfccFeatures);
+        const size_t numFbankBins = this->_m_params.m_numFbankBins;
+
+        /* Take DCT. Uses matrix mul: row i of the flattened DCT matrix
+         * starts at offset i * numFbankBins. */
+        for (size_t i = 0, rowStart = 0; i < mfccOut.size(); ++i, rowStart += numFbankBins)
+        {
+            float sum = 0;
+            for (size_t k = 0; k < numFbankBins; ++k)
+            {
+                sum += this->_m_dctMatrix[rowStart + k] * this->_m_melEnergies[k];
+            }
+            /* Quantize to T, saturating at the limits of T. */
+            sum = std::round((sum / quantScale) + quantOffset);
+            mfccOut[i] = static_cast<T>(std::min<float>(std::max<float>(sum, minVal), maxVal));
+        }
+
+        return mfccOut;
+    }
+
+    /* Constants */
+    static constexpr float logStep = 1.8562979903656 / 27.0;
+    static constexpr float freqStep = 200.0 / 3;
+    static constexpr float minLogHz = 1000.0;
+    static constexpr float minLogMel = minLogHz / freqStep;
+
+protected:
+    /**
+     * @brief       Project input frequency to Mel Scale.
+     * @param[in]   freq - input frequency in floating point
+     * @param[in]   useHTKMethod - bool to signal if HTK method is to be
+     *              used for calculation
+     * @return      Mel transformed frequency in floating point
+     **/
+    static float MelScale(const float    freq,
+                          const bool     useHTKMethod = true);
+
+    /**
+     * @brief       Inverse Mel transform - convert MEL warped frequency
+     *              back to normal frequency
+     * @param[in]   melFreq - Mel frequency in floating point
+     * @param[in]   useHTKMethod - bool to signal if HTK method is to be
+     *              used for calculation
+     * @return      Real world frequency in floating point
+     **/
+    static float InverseMelScale(const float melFreq,
+                                 const bool  useHTKMethod = true);
+
+    /**
+     * @brief       Populates MEL energies after applying the MEL filter
+     *              bank weights and adding them up to be placed into
+     *              bins, according to the filter bank's first and last
+     *              indices (pre-computed for each filter bank element
+     *              by _CreateMelFilterBank function).
+     * @param[in]   fftVec                  Vector populated with FFT magnitudes
+     * @param[in]   melFilterBank           2D Vector with filter bank weights
+     * @param[in]   filterBankFilterFirst   Vector containing the first indices of filter bank
+     *                                      to be used for each bin.
+     * @param[in]   filterBankFilterLast    Vector containing the last indices of filter bank
+     *                                      to be used for each bin.
+     * @param[out]  melEnergies             Pre-allocated vector of MEL energies to be
+     *                                      populated.
+     * @return      true if successful, false otherwise
+     */
+    virtual bool ApplyMelFilterBank(
+            std::vector<float>&                 fftVec,
+            std::vector<std::vector<float>>&    melFilterBank,
+            std::vector<int32_t>&               filterBankFilterFirst,
+            std::vector<int32_t>&               filterBankFilterLast,
+            std::vector<float>&                 melEnergies);
+
+    /**
+     * @brief           Converts the Mel energies for logarithmic scale
+     * @param[in/out]   melEnergies - 1D vector of Mel energies
+     **/
+    virtual void ConvertToLogarithmicScale(std::vector<float>& melEnergies);
+
+    /**
+     * @brief       Create a matrix used to calculate Discrete Cosine
+     *              Transform.
+     * @param[in]   inputLength - input length of the buffer on which
+     *              DCT will be performed
+     * @param[in]   coefficientCount - Total coefficients per input
+     *              length
+     * @return      1D vector with inputLength x coefficientCount elements
+     *              populated with DCT coefficients.
+     */
+    virtual std::vector<float> CreateDCTMatrix(
+            const int32_t inputLength,
+            const int32_t coefficientCount);
+
+    /**
+     * @brief       Given the low and high Mel values, get the normaliser
+     *              for weights to be applied when populating the filter
+     *              bank.
+     * @param[in]   leftMel - low Mel frequency value
+     * @param[in]   rightMel - high Mel frequency value
+     * @param[in]   useHTKMethod - bool to signal if HTK method is to be
+     *              used for calculation
+     * @return      normaliser value
+     */
+    virtual float GetMelFilterBankNormaliser(
+            const float&   leftMel,
+            const float&   rightMel,
+            const bool     useHTKMethod);
+
+private:
+
+    std::vector<float>              _m_frame;
+    std::vector<float>              _m_buffer;
+    std::vector<float>              _m_melEnergies;
+    std::vector<float>              _m_windowFunc;
+    std::vector<std::vector<float>> _m_melFilterBank;
+    std::vector<float>              _m_dctMatrix;
+    std::vector<int32_t>            _m_filterBankFilterFirst;
+    std::vector<int32_t>            _m_filterBankFilterLast;
+    bool                            _m_filterBankInitialised;
+
+    /**
+     * @brief       Initialises the filter banks and the DCT matrix **/
+    void _InitMelFilterBank();
+
+    /**
+     * @brief       Signals whether the instance of MFCC has had its
+     *              required buffers initialised
+     * @return      True if initialised, false otherwise
+     **/
+    bool _IsMelFilterBankInited();
+
+    /**
+     * @brief       Create mel filter banks for MFCC calculation.
+     * @return      2D vector of floats
+     **/
+    std::vector<std::vector<float>> _CreateMelFilterBank();
+
+    /**
+     * @brief       Computes and populates internal member buffers used
+     *              in MFCC feature calculation
+     * @param[in]   audioData - 1D vector of floating point audio samples
+     */
+    void _MfccComputePreFeature(const std::vector<float>& audioData);
+
+    /** @brief       Computes the magnitude from an interleaved complex array */
+    void _ConvertToPowerSpectrum();
+
+};
+
diff --git a/samples/SpeechRecognition/include/MathUtils.hpp b/samples/SpeechRecognition/include/MathUtils.hpp
new file mode 100644
index 0000000000..5f81fb6507
--- /dev/null
+++ b/samples/SpeechRecognition/include/MathUtils.hpp
@@ -0,0 +1,85 @@
+//
+// Copyright © 2020 Arm Ltd and Contributors. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#pragma once
+
+#include <vector>
+#include <cmath>
+#include <cstdint>
+#include <numeric>
+
+/**
+ * @brief   Collection of static numerical helpers. Only declarations are
+ *          provided in this header.
+ */
+class MathUtils
+{
+
+public:
+
+    /**
+     * @brief       Computes the FFT for the input vector
+     * @param[in]   input       Floating point vector of input elements
+     * @param[out]  fftOutput   Output buffer to be populated by computed
+     *                          FFTs
+     * @return      none
+     */
+    static void FftF32(std::vector<float>& input,
+                       std::vector<float>& fftOutput);
+
+
+    /**
+     * @brief       Computes the dot product of two 1D floating point
+     *              vectors.
+     *              result = sum(srcA[0]*srcB[0] + srcA[1]*srcB[1] + ..)
+     * @param[in]   srcPtrA     pointer to the first element of first
+     *                          array
+     * @param[in]   srcPtrB     pointer to the first element of second
+     *                          array
+     * @param[in]   srcLen      Number of elements in the array/vector
+     * @return      dot product
+     */
+    static float DotProductF32(float* srcPtrA, float* srcPtrB,
+                               const int srcLen);
+
+    /**
+     * @brief       Computes the squared magnitude of floating point
+     *              complex number array.
+     * @param[in]   ptrSrc      pointer to the first element of input
+     *                          array
+     * @param[in]   srcLen      Number of elements in the array/vector
+     * @param[out]  ptrDst      Output buffer to be populated
+     * @param[in]   dstLen      output buffer len (for sanity check only)
+     * @return      true if successful, false otherwise
+     */
+    static bool ComplexMagnitudeSquaredF32(float* ptrSrc,
+                                           const int srcLen,
+                                           float* ptrDst,
+                                           const int dstLen);
+
+    /**
+     * @brief       Computes the natural logarithms of input floating point
+     *              vector
+     * @param[in]   input       Floating point input vector
+     * @param[out]  output      Pre-allocated buffer to be populated with
+     *                          natural log values of each input element
+     * @return      none
+     */
+    static void VecLogarithmF32(std::vector <float>& input,
+                                std::vector <float>& output);
+
+    /**
+     * @brief       Gets the mean of a floating point array of elements
+     * @param[in]   ptrSrc      pointer to the first element
+     * @param[in]   srcLen      Number of elements in the array/vector
+     * @return      average value
+     */
+    static float MeanF32(float* ptrSrc, const uint32_t srcLen);
+
+    /**
+     * @brief       Gets the standard deviation of a floating point array
+     *              of elements
+     * @param[in]   ptrSrc      pointer to the first element
+     * @param[in]   srcLen      Number of elements in the array/vector
+     * @param[in]   mean        pre-computed mean value
+     * @return      standard deviation value
+     */
+    static float StdDevF32(float* ptrSrc, const uint32_t srcLen,
+                           const float mean);
+};
diff --git a/samples/SpeechRecognition/include/Preprocess.hpp b/samples/SpeechRecognition/include/Preprocess.hpp
new file mode 100644
index 0000000000..80c568439b
--- /dev/null
+++ b/samples/SpeechRecognition/include/Preprocess.hpp
@@ -0,0 +1,175 @@
+//
+// Copyright © 2020 Arm Ltd and Contributors. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#pragma once
+
+#include "DataStructures.hpp"
+#include "SlidingWindow.hpp"
+#include <numeric>
+#include "MFCC.hpp"
+
+/* Sliding window over read-only float audio samples */
+using AudioWindow = SlidingWindow <const float>;
+
+/* Class to facilitate pre-processing calculation for Wav2Letter model
+ * for ASR */
+class Preprocess
+{
+public:
+
+    MFCC                _m_mfcc;            /* MFCC instance */
+
+    /* Actual buffers to be populated */
+    Array2d<float>      _m_mfccBuf;         /* Contiguous buffer 1D: MFCC */
+    Array2d<float>      _m_delta1Buf;       /* Contiguous buffer 1D: Delta 1 */
+    Array2d<float>      _m_delta2Buf;       /* Contiguous buffer 1D: Delta 2 */
+
+    uint32_t            _m_windowLen;       /* Window length for MFCC */
+    uint32_t            _m_windowStride;    /* Window stride len for MFCC */
+    AudioWindow         _m_window;          /* Sliding window */
+
+    /**
+     * @brief       Constructor
+     * @param[in]   windowLen       number of elements in a window
+     * @param[in]   windowStride    stride (in number of elements) for
+     *                              moving the window
+     * @param[in]   mfccInst        MFCC instance, taken by value
+     */
+    Preprocess(
+        const uint32_t  windowLen,
+        const uint32_t  windowStride,
+        const MFCC mfccInst);
+    Preprocess() = delete;
+    ~Preprocess();
+
+    /**
+     * @brief       Calculates the features required from audio data. This
+     *              includes MFCC, first and second order deltas,
+     *              normalisation and finally, quantisation. The output is
+     *              populated with feature from a given window placed along
+     *              in a single row.
+     * @param[in]   audioData       pointer to the first element of audio data
+     * @param[in]   audioDataLen    number of elements in the audio data
+     * @param[out]  output          buffer to be populated with quantised features
+     * @param[in]   quantOffset     quantisation offset
+     * @param[in]   quantScale      quantisation scale
+     * @return      true if successful, false in case of error.
+     */
+    bool Invoke(const float* audioData,
+                const uint32_t  audioDataLen,
+                std::vector<int8_t>& output,
+                int quantOffset,
+                float quantScale);
+
+
+protected:
+    /**
+     * @brief       Computes the first and second order deltas for the
+     *              MFCC buffers - they are assumed to be populated.
+     *
+     * @param[in]  mfcc     MFCC buffers
+     * @param[out] delta1   result of the first diff computation
+     * @param[out] delta2   result of the second diff computation
+     *
+     * @return true if successful, false otherwise
+     */
+    static bool _ComputeDeltas(Array2d<float>& mfcc,
+                               Array2d<float>& delta1,
+                               Array2d<float>& delta2);
+
+    /**
+     * @brief       Given a 2D array of floats, computes the mean
+     * @param[in]   vec      2D array of floats
+     * @return      mean value
+     */
+    static float _GetMean(Array2d<float>& vec);
+
+    /**
+     * @brief       Given a 2D array of floats, computes the stddev
+     * @param[in]   vec     2D array of floats
+     * @param[in]   mean    mean value of the array passed in
+     * @return      stddev value
+     */
+    static float _GetStdDev(Array2d<float>& vec,
+                            const float mean);
+
+    /**
+     * @brief           Given a 2D array of floats, normalises it using
+     *                  the mean and the stddev
+     * @param[in/out]   vec      2D array of floats
+     */
+    static void _NormaliseVec(Array2d<float>& vec);
+
+    /**
+     * @brief       Normalises the MFCC and delta buffers
+     */
+    void _Normalise();
+
+    /**
+     * @brief       Given the quantisation and data type limits, computes
+     *              the quantised values of a floating point input data.
+     * @param[in]   elem            Element to be quantised
+     * @param[in]   quantScale      Scale
+     * @param[in]   quantOffset     Offset
+     * @param[in]   minVal          Numerical limit - minimum
+     * @param[in]   maxVal          Numerical limit - maximum
+     * @return      floating point quantised value
+     */
+    static float _GetQuantElem(
+            const float     elem,
+            const float     quantScale,
+            const int       quantOffset,
+            const float     minVal,
+            const float     maxVal);
+
+    /**
+     * @brief       Quantises the MFCC and delta buffers, and places them
+     *              in the output buffer. While doing so, it transposes
+     *              the data. Reason: Buffers in this class are arranged
+     *              for "time" axis to be row major. Primary reason for
+     *              this being the convolution speed up (as we can use
+     *              contiguous memory). The output, however, requires the
+     *              time axis to be in column major arrangement.
+     *
+     *              For every MFCC vector the output receives, back to back,
+     *              m_numMfccFeatures MFCC values, then as many delta 1
+     *              values, then as many delta 2 values. The caller must
+     *              provide room for m_numMfccVectors * m_numMfccFeatures * 3
+     *              elements; the buffer size is not checked here.
+     * @param[out]  outputBuf       pointer to the output buffer
+     * @param[in]   quantOffset     quantisation offset
+     * @param[in]   quantScale      quantisation scale
+     * @return      always true
+     */
+    template <typename T>
+    bool _Quantise(T* outputBuf, int quantOffset, float quantScale)
+    {
+        /* Read the counts once as the signed type they are stored in, so a
+         * non-positive count yields no iterations instead of wrapping to a
+         * huge unsigned loop bound. */
+        const int numFeatures = this->_m_mfcc._m_params.m_numMfccFeatures;
+        const int numVectors = this->_m_mfcc._m_params.m_numMfccVectors;
+
+        /* Populate */
+        T* outputBufMfcc = outputBuf;
+        T* outputBufD1 = outputBuf + numFeatures;
+        T* outputBufD2 = outputBufD1 + numFeatures;
+        const int ptrIncr = numFeatures * 2;  /* (3 vectors - 1 vector) */
+
+        /* lowest() is the most negative finite value for every arithmetic T;
+         * min() would be the smallest positive value for floating point T. */
+        const float minVal = std::numeric_limits<T>::lowest();
+        const float maxVal = std::numeric_limits<T>::max();
+
+        /* We need to do a transpose while copying and concatenating
+         * the tensor*/
+        for (int j = 0; j < numVectors; ++j)
+        {
+            const unsigned int col = static_cast<unsigned int>(j);
+            for (int i = 0; i < numFeatures; ++i)
+            {
+                const unsigned int row = static_cast<unsigned int>(i);
+                *outputBufMfcc++ = static_cast<T>(this->_GetQuantElem(
+                        this->_m_mfccBuf(row, col), quantScale,
+                        quantOffset, minVal, maxVal));
+                *outputBufD1++ = static_cast<T>(this->_GetQuantElem(
+                        this->_m_delta1Buf(row, col), quantScale,
+                        quantOffset, minVal, maxVal));
+                *outputBufD2++ = static_cast<T>(this->_GetQuantElem(
+                        this->_m_delta2Buf(row, col), quantScale,
+                        quantOffset, minVal, maxVal));
+            }
+            /* Skip over the two other feature groups written for this vector. */
+            outputBufMfcc += ptrIncr;
+            outputBufD1 += ptrIncr;
+            outputBufD2 += ptrIncr;
+        }
+
+        return true;
+    }
+};
+
diff --git a/samples/SpeechRecognition/include/SlidingWindow.hpp b/samples/SpeechRecognition/include/SlidingWindow.hpp
new file mode 100644
index 0000000000..791a0b7fc0
--- /dev/null
+++ b/samples/SpeechRecognition/include/SlidingWindow.hpp
@@ -0,0 +1,161 @@
+//
+// Copyright © 2020 Arm Ltd and Contributors. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#pragma once
+
+/**
+ * Non-owning view that steps a fixed-size window through a contiguous block
+ * of T elements, advancing by a fixed stride on every call to Next().
+ */
+template<class T>
+class SlidingWindow
+{
+protected:
+    T* m_start = nullptr;
+    size_t m_dataSize = 0;
+    size_t m_size = 0;
+    size_t m_stride = 0;
+    size_t m_count = 0;
+public:
+
+    /**
+     * Creates the window slider through the given data.
+     *
+     * @param data          pointer to the data to slide through.
+     * @param dataSize      size in T type elements wise.
+     * @param windowSize    sliding window size in T type wise elements.
+     * @param stride        stride size in T type wise elements.
+     */
+    SlidingWindow(T* data, size_t dataSize,
+                  size_t windowSize, size_t stride)
+    {
+        m_start = data;
+        m_dataSize = dataSize;
+        m_size = windowSize;
+        m_stride = stride;
+    }
+
+    SlidingWindow() = default;
+
+    /**
+     * Virtual because this class has virtual members, so a derived window
+     * can be destroyed safely through a SlidingWindow pointer.
+     */
+    virtual ~SlidingWindow() = default;
+
+    /**
+     * Get the next data window.
+     * @return pointer to the next window, if next window is not available nullptr is returned.
+     */
+    virtual T* Next()
+    {
+        if (!HasNext())
+        {
+            return nullptr;
+        }
+        m_count++;
+        return m_start + Index() * m_stride;
+    }
+
+    /**
+     * Checks if the next data portion is available.
+     * @return true if next data portion is available
+     */
+    bool HasNext()
+    {
+        return this->m_count < 1 + this->FractionalTotalStrides() && (this->NextWindowStartIndex() < this->m_dataSize);
+    }
+
+    /**
+     * Resets the slider to the initial position.
+     */
+    virtual void Reset()
+    {
+        m_count = 0;
+    }
+
+    /**
+     * Gets the size of the sliding window.
+     * @return window size in T type wise elements.
+     */
+    virtual size_t GetWindowSize()
+    {
+        return m_size;
+    }
+
+    /**
+     * Resets the slider to the start of the new data.
+     * New data size MUST be the same as the old one.
+     * @param newStart pointer to the new data to slide through.
+     */
+    virtual void Reset(T* newStart)
+    {
+        m_start = newStart;
+        Reset();
+    }
+
+    /**
+     * Gets current index of the sliding window.
+     * @return current position of the sliding window in number of strides
+     */
+    size_t Index() const
+    {
+        return m_count == 0? 0: m_count - 1;
+    }
+
+    /**
+     * Gets the index from the start of the data where the next window will begin.
+     * While Index() returns the index of sliding window itself this function returns the index of the data
+     * element itself.
+     * @return Index from the start of the data where the next sliding window will begin.
+     */
+    virtual size_t NextWindowStartIndex()
+    {
+        return m_count == 0? 0: ((m_count) * m_stride);
+    }
+
+    /**
+     * Go to given sliding window index.
+     * @param index new position of the sliding window. if index is invalid (greater than possible range of strides)
+     *              then next call to Next() will return nullptr.
+     */
+    void FastForward(size_t index)
+    {
+        m_count = index;
+    }
+
+    /**
+     * Calculates whole number of times the window can stride through the given data.
+     * @return maximum number of strides; 0 when the window is larger than the data
+     *         or when the stride is 0 (e.g. a default-constructed window).
+     */
+    size_t TotalStrides() const
+    {
+        /* The stride check avoids an integer division by zero. */
+        if (m_size > m_dataSize || m_stride == 0)
+        {
+            return 0;
+        }
+        return ((m_dataSize - m_size)/m_stride);
+    }
+
+    /**
+     * Calculates number of times the window can stride through the given data. May not be a whole number.
+     * When the window is larger than the data, the division below is performed on integers,
+     * so the result is 0 in that case.
+     * @return Number of strides to cover all data.
+     */
+    float FractionalTotalStrides() const
+    {
+        if(this->m_size > this->m_dataSize)
+        {
+            return this->m_dataSize / this->m_size;
+        }
+        return ((this->m_dataSize - this->m_size)/ static_cast<float>(this->m_stride));
+    }
+
+    /**
+     * Calculates the remaining data left to be processed
+     * @return The remaining unprocessed data
+     */
+    int RemainingData()
+    {
+        return static_cast<int>(this->m_dataSize - this->NextWindowStartIndex());
+    }
+};
\ No newline at end of file
diff --git a/samples/SpeechRecognition/include/SpeechRecognitionPipeline.hpp b/samples/SpeechRecognition/include/SpeechRecognitionPipeline.hpp
new file mode 100644
index 0000000000..47ce30416f
--- /dev/null
+++ b/samples/SpeechRecognition/include/SpeechRecognitionPipeline.hpp
@@ -0,0 +1,139 @@
+//
+// Copyright © 2020 Arm Ltd and Contributors. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#pragma once
+
+#include "ArmnnNetworkExecutor.hpp"
+#include "Decoder.hpp"
+#include "MFCC.hpp"
+#include "Preprocess.hpp"
+
+namespace asr
+{
+/**
+ * Generic Speech Recognition pipeline with 3 steps: data pre-processing, inference execution and inference
+ * result post-processing.
+ *
+ */
+class ASRPipeline
+{
+public:
+
+    /**
+     * Creates speech recognition pipeline with given network executor and decoder.
+     * @param executor - unique pointer to inference runner
+     * @param decoder - unique pointer to inference results decoder
+     */
+    ASRPipeline(std::unique_ptr<common::ArmnnNetworkExecutor<int8_t>> executor,
+                std::unique_ptr<Decoder> decoder);
+
+    /**
+     * @brief Standard audio pre-processing implementation.
+     *
+     * Preprocesses and prepares the data for inference by
+     * extracting the MFCC features.
+     *
+     * The number of samples handed to the preprocessor is
+     * windowLen + (numMfccVectors - 1) * windowStride; the size of audio is
+     * not checked against that amount here.
+     *
+     * @param[in] audio - the raw audio data
+     * @param[in] preprocessor - the preprocessor object, which handles the data preparation
+     * @return buffer of numMfccVectors * numMfccFeatures * 3 elements filled by the preprocessor
+     */
+    template<typename Tin,typename Tout>
+    std::vector<Tout> PreProcessing(std::vector<Tin>& audio, Preprocess& preprocessor)
+    {
+        const auto& mfccParams = preprocessor._m_mfcc._m_params;
+
+        int audioDataToPreProcess = preprocessor._m_windowLen +
+                ((mfccParams.m_numMfccVectors - 1) * preprocessor._m_windowStride);
+        /* Three feature groups per vector: MFCC, delta 1 and delta 2. */
+        int outputBufferSize = mfccParams.m_numMfccVectors * mfccParams.m_numMfccFeatures * 3;
+
+        std::vector<Tout> outputBuffer(outputBufferSize);
+        preprocessor.Invoke(audio.data(), audioDataToPreProcess, outputBuffer,
+                            m_executor->GetQuantizationOffset(),
+                            m_executor->GetQuantizationScale());
+        return outputBuffer;
+    }
+
+    /**
+     * @brief Executes inference
+     *
+     * Calls inference runner provided during instance construction.
+     *
+     * @param[in] preprocessedData - input inference data. Data type should be aligned with input tensor.
+     * @param[out] result - raw inference results.
+     */
+    template<typename T>
+    void Inference(const std::vector<T>& preprocessedData, common::InferenceResults<int8_t>& result)
+    {
+        /* Size of the payload only: the size of the std::vector object itself
+         * is not part of the buffer that data() points to. */
+        const size_t dataBytes = sizeof(T) * preprocessedData.size();
+        m_executor->Run(preprocessedData.data(), dataBytes, result);
+    }
+
+    /**
+     * @brief Standard inference results post-processing implementation.
+     *
+     * Decodes inference results using decoder provided during construction
+     * and writes the decoded text to std::cout.
+     *
+     * @param[in] inferenceResult - inference results to be decoded.
+     * @param[in,out] isFirstWindow - for checking if this is the first window of the sliding window;
+     *                                always set to false before returning.
+     * @param[in] isLastWindow - for checking if this is the last window of the sliding window.
+     * @param[in] currentRContext - the right context of the output text. To be output if it is the last window.
+     *                              Taken by value, so the caller's string is not modified.
+     */
+    template<typename T>
+    void PostProcessing(common::InferenceResults<int8_t>& inferenceResult,
+                        bool& isFirstWindow,
+                        bool isLastWindow,
+                        std::string currentRContext)
+    {
+        const int rowLength = 29;
+        const int middleContextStart = 49;
+        const int middleContextEnd = 99;
+        const int leftContextStart = 0;
+        const int rightContextStart = 100;
+        const int rightContextEnd = 148;
+
+        /* Range bounds are formed by pointer arithmetic from the first element,
+         * so an end bound equal to the element count never goes through operator[]. */
+        const auto* firstElement = &inferenceResult[0][0];
+
+        // If isFirstWindow we keep the left context of the output,
+        // else we only keep the middle context of the output
+        const int firstRow = isFirstWindow ? leftContextStart : middleContextStart;
+        std::vector<T> contextToProcess(firstElement + firstRow * rowLength,
+                                        firstElement + middleContextEnd * rowLength);
+
+        std::string output = this->m_decoder->DecodeOutput<T>(contextToProcess);
+        isFirstWindow = false;
+        std::cout << output << std::flush;
+
+        // If this is the last window, we print the right context of the output
+        if(isLastWindow)
+        {
+            std::vector<T> rContext(firstElement + rightContextStart * rowLength,
+                                    firstElement + rightContextEnd * rowLength);
+            currentRContext = this->m_decoder->DecodeOutput(rContext);
+            std::cout << currentRContext << std::endl;
+        }
+    }
+
+protected:
+    std::unique_ptr<common::ArmnnNetworkExecutor<int8_t>> m_executor;
+    std::unique_ptr<Decoder> m_decoder;
+};
+
+/* Owning pointer to a speech recognition pipeline. */
+using IPipelinePtr = std::unique_ptr<asr::ASRPipeline>;
+
+/**
+ * Constructs speech recognition pipeline based on configuration provided.
+ * Only declared in this header; the definition lives elsewhere.
+ *
+ * @param[in] config - speech recognition pipeline configuration.
+ * @param[in] labels - asr labels, keyed by integer index.
+ *
+ * @return unique pointer to asr pipeline.
+ */
+IPipelinePtr CreatePipeline(common::PipelineOptions& config, std::map<int, std::string>& labels);
+
+}// namespace asr
\ No newline at end of file