Diffstat (limited to 'samples/SpeechRecognition/include')
-rw-r--r--   samples/SpeechRecognition/include/AudioCapture.hpp                 62
-rw-r--r--   samples/SpeechRecognition/include/DataStructures.hpp              102
-rw-r--r--   samples/SpeechRecognition/include/Decoder.hpp                       4
-rw-r--r--   samples/SpeechRecognition/include/MFCC.hpp                        244
-rw-r--r--   samples/SpeechRecognition/include/MathUtils.hpp                    85
-rw-r--r--   samples/SpeechRecognition/include/SlidingWindow.hpp               161
-rw-r--r--   samples/SpeechRecognition/include/SpeechRecognitionPipeline.hpp    59
-rw-r--r--   samples/SpeechRecognition/include/Wav2LetterMFCC.hpp               78
-rw-r--r--   samples/SpeechRecognition/include/Wav2LetterPreprocessor.hpp      123   (renamed from samples/SpeechRecognition/include/Preprocess.hpp)
9 files changed, 161 insertions, 757 deletions
diff --git a/samples/SpeechRecognition/include/AudioCapture.hpp b/samples/SpeechRecognition/include/AudioCapture.hpp
deleted file mode 100644
index 90c2eccacf..0000000000
--- a/samples/SpeechRecognition/include/AudioCapture.hpp
+++ /dev/null
@@ -1,62 +0,0 @@
-//
-// Copyright © 2020 Arm Ltd and Contributors. All rights reserved.
-// SPDX-License-Identifier: MIT
-//
-
-#pragma once
-
-#include <string>
-#include <iostream>
-
-#include <math.h>
-
-#include <vector>
-
-#include <exception>
-
-#include "SlidingWindow.hpp"
-
-namespace asr
-{
-
-/**
-* @brief Class used to capture the audio data loaded from file, and to provide a method of
- * extracting correctly positioned and appropriately sized audio windows
-*
-*/
- class AudioCapture
- {
- public:
-
- SlidingWindow<const float> m_window;
- int lastReadIdx= 0;
-
- /**
- * @brief Default constructor
- */
- AudioCapture()
- {};
-
- /**
- * @brief Function to load the audio data captured from the
- * input file to memory.
- */
- std::vector<float> LoadAudioFile(std::string filePath);
-
- /**
- * @brief Function to initialize the sliding window. This will set its position in memory, its
- * window size and its stride.
- */
- void InitSlidingWindow(float* data, size_t dataSize, int minSamples, size_t stride);
-
- /**
- * Checks whether there is another block of audio in memory to read
- */
- bool HasNext();
-
- /**
-     * Retrieves the next block of audio if it is available
- */
- std::vector<float> Next();
- };
-} // namespace asr \ No newline at end of file
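
For reference, a minimal sketch of how the removed asr::AudioCapture API was typically driven; the window length (512) and stride (160) below are illustrative placeholders, not values taken from this change.

// Sketch only: exercises the removed asr::AudioCapture interface end to end.
#include <string>
#include <vector>
#include "AudioCapture.hpp"

std::vector<std::vector<float>> CollectAudioWindows(const std::string& wavPath)
{
    asr::AudioCapture capture;
    std::vector<float> audio = capture.LoadAudioFile(wavPath);       // load the whole clip into memory
    capture.InitSlidingWindow(audio.data(), audio.size(),
                              /*minSamples=*/512, /*stride=*/160);   // illustrative sizes

    std::vector<std::vector<float>> windows;
    while (capture.HasNext())                  // true while another block remains
    {
        windows.emplace_back(capture.Next());  // next correctly positioned, correctly sized block
    }
    return windows;
}
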
diff --git a/samples/SpeechRecognition/include/DataStructures.hpp b/samples/SpeechRecognition/include/DataStructures.hpp
deleted file mode 100644
index 9922265299..0000000000
--- a/samples/SpeechRecognition/include/DataStructures.hpp
+++ /dev/null
@@ -1,102 +0,0 @@
-//
-// Copyright © 2020 Arm Ltd and Contributors. All rights reserved.
-// SPDX-License-Identifier: MIT
-//
-#pragma once
-
-#include <stdio.h>
-#include <iterator>
-
-/**
- * Class Array2d is a data structure that represents a two dimensional array.
- * The data is allocated in contiguous memory, arranged row-wise
- * and individual elements can be accessed with the () operator.
- * For example a two dimensional array D of size (M, N) can be accessed:
- *
- * _|<------------- col size = N -------->|
- * | D(r=0, c=0) D(r=0, c=1)... D(r=0, c=N)
- * | D(r=1, c=0) D(r=1, c=1)... D(r=1, c=N)
- * | ...
- * row size = M ...
- * | ...
- * _ D(r=M, c=0) D(r=M, c=1)... D(r=M, c=N)
- *
- */
-template<typename T>
-class Array2d
-{
-private:
- size_t m_rows;
- size_t m_cols;
- T* m_data;
-
-public:
- /**
- * Creates the array2d with the given sizes.
- *
- * @param rows number of rows.
- * @param cols number of columns.
- */
- Array2d(unsigned rows, unsigned cols)
- {
- if (rows == 0 || cols == 0) {
- printf("Array2d constructor has 0 size.\n");
- m_data = nullptr;
- return;
- }
- m_rows = rows;
- m_cols = cols;
- m_data = new T[rows * cols];
- }
-
- ~Array2d()
- {
- delete[] m_data;
- }
-
- T& operator() (unsigned int row, unsigned int col)
- {
- return m_data[m_cols * row + col];
- }
-
- T operator() (unsigned int row, unsigned int col) const
- {
- return m_data[m_cols * row + col];
- }
-
- /**
-     * Gets the size of the requested dimension (0: rows, 1: columns).
-     * @return size of the given dimension, or 0 for an invalid dimension.
- */
- size_t size(size_t dim)
- {
- switch (dim)
- {
- case 0:
- return m_rows;
- case 1:
- return m_cols;
- default:
- return 0;
- }
- }
-
- /**
- * Gets the array2d total size.
- */
- size_t totalSize()
- {
- return m_rows * m_cols;
- }
-
- /**
- * array2d iterator.
- */
- using iterator=T*;
- using const_iterator=T const*;
-
- iterator begin() { return m_data; }
- iterator end() { return m_data + totalSize(); }
- const_iterator begin() const { return m_data; }
- const_iterator end() const { return m_data + totalSize(); };
-};
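
A minimal sketch of how the removed Array2d<T> container is used: row-major storage, element access through operator(), and iteration over the contiguous buffer. The helper functions are hypothetical.

// Sketch only: basic Array2d<float> usage (assumes the removed DataStructures.hpp).
#include <numeric>
#include "DataStructures.hpp"

float SumOfRow(Array2d<float>& features, unsigned row)
{
    float sum = 0.0f;
    for (unsigned col = 0; col < features.size(1); ++col)   // size(1) == number of columns
    {
        sum += features(row, col);                           // row-major element access
    }
    return sum;
}

float SumOfAll(Array2d<float>& features)
{
    // begin()/end() expose the contiguous row-major buffer directly.
    return std::accumulate(features.begin(), features.end(), 0.0f);
}
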
diff --git a/samples/SpeechRecognition/include/Decoder.hpp b/samples/SpeechRecognition/include/Decoder.hpp
index 69d97ccf64..9dd484a5d1 100644
--- a/samples/SpeechRecognition/include/Decoder.hpp
+++ b/samples/SpeechRecognition/include/Decoder.hpp
@@ -46,8 +46,8 @@ namespace asr
rowVector.emplace_back(static_cast<int16_t>(contextToProcess[row * rowLength + j]));
}
- int max_index = std::distance(rowVector.begin(),std::max_element(rowVector.begin(), rowVector.end()));
- unfilteredText.emplace_back(this->m_labels.at(max_index)[0]);
+ int maxIndex = std::distance(rowVector.begin(), std::max_element(rowVector.begin(), rowVector.end()));
+ unfilteredText.emplace_back(this->m_labels.at(maxIndex)[0]);
}
std::string filteredText = FilterCharacters(unfilteredText);
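
The renamed maxIndex above performs greedy (argmax) decoding: for each output row, the index of the largest logit selects the emitted label. A standalone sketch of that pattern, independent of the Decoder class:

// Sketch only: per-row argmax as used by the greedy decoder above.
#include <algorithm>
#include <cstdint>
#include <iterator>
#include <vector>

int ArgMax(const std::vector<int16_t>& row)
{
    // Index of the largest logit in this time step's row.
    return static_cast<int>(std::distance(row.begin(),
                                          std::max_element(row.begin(), row.end())));
}
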
diff --git a/samples/SpeechRecognition/include/MFCC.hpp b/samples/SpeechRecognition/include/MFCC.hpp
deleted file mode 100644
index 14b6d9fe79..0000000000
--- a/samples/SpeechRecognition/include/MFCC.hpp
+++ /dev/null
@@ -1,244 +0,0 @@
-//
-// Copyright © 2020 Arm Ltd and Contributors. All rights reserved.
-// SPDX-License-Identifier: MIT
-//
-
-#pragma once
-
-#include <vector>
-#include <cstdint>
-#include <cmath>
-#include <limits>
-#include <string>
-
-/* MFCC's consolidated parameters */
-class MfccParams
-{
-public:
- float m_samplingFreq;
- int m_numFbankBins;
- float m_melLoFreq;
- float m_melHiFreq;
- int m_numMfccFeatures;
- int m_frameLen;
- int m_frameLenPadded;
- bool m_useHtkMethod;
- int m_numMfccVectors;
-
- /** @brief Constructor */
- MfccParams(const float samplingFreq, const int numFbankBins,
- const float melLoFreq, const float melHiFreq,
- const int numMfccFeats, const int frameLen,
- const bool useHtkMethod, const int numMfccVectors);
-
- /* Delete the default constructor */
- MfccParams() = delete;
-
- /* Default destructor */
- ~MfccParams() = default;
-
- /** @brief String representation of parameters */
- std::string Str();
-};
-
-/**
- * @brief Class for MFCC feature extraction.
- * Based on https://github.com/ARM-software/ML-KWS-for-MCU/blob/master/Deployment/Source/MFCC/mfcc.cpp
- * This class is designed to be generic and self-sufficient but
- * certain calculation routines can be overridden to accommodate
- * use-case specific requirements.
- */
-class MFCC
-{
-
-public:
-
- /**
- * @brief Extract MFCC features for one single small frame of
- * audio data e.g. 640 samples.
- * @param[in] audioData - Vector of audio samples to calculate
- * features for.
- * @return Vector of extracted MFCC features.
- **/
- std::vector<float> MfccCompute(const std::vector<float>& audioData);
-
- MfccParams _m_params;
-
- /**
- * @brief Constructor
- * @param[in] params - MFCC parameters
- */
- MFCC(const MfccParams& params);
-
- /* Delete the default constructor */
- MFCC() = delete;
-
- /** @brief Default destructor */
- ~MFCC() = default;
-
- /** @brief Initialise */
- void Init();
-
- /**
- * @brief Extract MFCC features and quantise for one single small
- * frame of audio data e.g. 640 samples.
- * @param[in] audioData - Vector of audio samples to calculate
- * features for.
- * @param[in] quantScale - quantisation scale.
- * @param[in] quantOffset - quantisation offset
- * @return Vector of extracted quantised MFCC features.
- **/
- template<typename T>
- std::vector<T> MfccComputeQuant(const std::vector<float>& audioData,
- const float quantScale,
- const int quantOffset)
- {
- this->_MfccComputePreFeature(audioData);
- float minVal = std::numeric_limits<T>::min();
- float maxVal = std::numeric_limits<T>::max();
-
- std::vector<T> mfccOut(this->_m_params.m_numMfccFeatures);
- const size_t numFbankBins = this->_m_params.m_numFbankBins;
-
- /* Take DCT. Uses matrix mul. */
- for (size_t i = 0, j = 0; i < mfccOut.size(); ++i, j += numFbankBins)
- {
- float sum = 0;
- for (size_t k = 0; k < numFbankBins; ++k)
- {
- sum += this->_m_dctMatrix[j + k] * this->_m_melEnergies[k];
- }
- /* Quantize to T. */
- sum = std::round((sum / quantScale) + quantOffset);
- mfccOut[i] = static_cast<T>(std::min<float>(std::max<float>(sum, minVal), maxVal));
- }
-
- return mfccOut;
- }
-
- /* Constants */
- static constexpr float logStep = 1.8562979903656 / 27.0;
- static constexpr float freqStep = 200.0 / 3;
- static constexpr float minLogHz = 1000.0;
- static constexpr float minLogMel = minLogHz / freqStep;
-
-protected:
- /**
- * @brief Project input frequency to Mel Scale.
- * @param[in] freq - input frequency in floating point
- * @param[in] useHTKmethod - bool to signal if HTK method is to be
- * used for calculation
- * @return Mel transformed frequency in floating point
- **/
- static float MelScale(const float freq,
- const bool useHTKMethod = true);
-
- /**
- * @brief Inverse Mel transform - convert MEL warped frequency
- * back to normal frequency
- * @param[in] freq - Mel frequency in floating point
- * @param[in] useHTKmethod - bool to signal if HTK method is to be
- * used for calculation
- * @return Real world frequency in floating point
- **/
- static float InverseMelScale(const float melFreq,
- const bool useHTKMethod = true);
-
- /**
- * @brief Populates MEL energies after applying the MEL filter
- * bank weights and adding them up to be placed into
- * bins, according to the filter bank's first and last
- * indices (pre-computed for each filter bank element
- * by _CreateMelFilterBank function).
- * @param[in] fftVec Vector populated with FFT magnitudes
- * @param[in] melFilterBank 2D Vector with filter bank weights
- * @param[in] filterBankFilterFirst Vector containing the first indices of filter bank
- * to be used for each bin.
- * @param[in] filterBankFilterLast Vector containing the last indices of filter bank
- * to be used for each bin.
- * @param[out] melEnergies Pre-allocated vector of MEL energies to be
- * populated.
- * @return true if successful, false otherwise
- */
- virtual bool ApplyMelFilterBank(
- std::vector<float>& fftVec,
- std::vector<std::vector<float>>& melFilterBank,
- std::vector<int32_t>& filterBankFilterFirst,
- std::vector<int32_t>& filterBankFilterLast,
- std::vector<float>& melEnergies);
-
- /**
- * @brief Converts the Mel energies for logarithmic scale
- * @param[in/out] melEnergies - 1D vector of Mel energies
- **/
- virtual void ConvertToLogarithmicScale(std::vector<float>& melEnergies);
-
- /**
- * @brief Create a matrix used to calculate Discrete Cosine
- * Transform.
- * @param[in] inputLength - input length of the buffer on which
- * DCT will be performed
- * @param[in] coefficientCount - Total coefficients per input
- * length
- * @return 1D vector with inputLength x coefficientCount elements
- * populated with DCT coefficients.
- */
- virtual std::vector<float> CreateDCTMatrix(
- const int32_t inputLength,
- const int32_t coefficientCount);
-
- /**
- * @brief Given the low and high Mel values, get the normaliser
- * for weights to be applied when populating the filter
- * bank.
- * @param[in] leftMel - low Mel frequency value
- * @param[in] rightMel - high Mel frequency value
- * @param[in] useHTKMethod - bool to signal if HTK method is to be
- * used for calculation
- */
- virtual float GetMelFilterBankNormaliser(
- const float& leftMel,
- const float& rightMel,
- const bool useHTKMethod);
-
-private:
-
- std::vector<float> _m_frame;
- std::vector<float> _m_buffer;
- std::vector<float> _m_melEnergies;
- std::vector<float> _m_windowFunc;
- std::vector<std::vector<float>> _m_melFilterBank;
- std::vector<float> _m_dctMatrix;
- std::vector<int32_t> _m_filterBankFilterFirst;
- std::vector<int32_t> _m_filterBankFilterLast;
- bool _m_filterBankInitialised;
-
- /**
- * @brief Initialises the filter banks and the DCT matrix **/
- void _InitMelFilterBank();
-
- /**
- * @brief Signals whether the instance of MFCC has had its
- * required buffers initialised
- * @return True if initialised, false otherwise
- **/
- bool _IsMelFilterBankInited();
-
- /**
- * @brief Create mel filter banks for MFCC calculation.
- * @return 2D vector of floats
- **/
- std::vector<std::vector<float>> _CreateMelFilterBank();
-
- /**
-     * @brief Computes and populates internal member buffers used
- * in MFCC feature calculation
- * @param[in] audioData - 1D vector of 16-bit audio data
- */
- void _MfccComputePreFeature(const std::vector<float>& audioData);
-
- /** @brief Computes the magnitude from an interleaved complex array */
- void _ConvertToPowerSpectrum();
-
-};
-
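
For context, a sketch of how the removed MFCC class was instantiated and used to produce quantised features for a single frame. All parameter values are illustrative; the real ones come from the model configuration.

// Sketch only: quantised MFCC extraction for one frame of frameLen samples.
#include <cstdint>
#include <vector>
#include "MFCC.hpp"

std::vector<int8_t> ExtractQuantisedMfcc(const std::vector<float>& frame,   // frameLen samples
                                         float quantScale, int quantOffset)
{
    MfccParams params(/*samplingFreq=*/16000.0f, /*numFbankBins=*/128,
                      /*melLoFreq=*/0.0f,        /*melHiFreq=*/8000.0f,
                      /*numMfccFeats=*/13,       /*frameLen=*/512,
                      /*useHtkMethod=*/false,    /*numMfccVectors=*/1);

    MFCC mfcc(params);
    mfcc.Init();   // builds filter banks, window function and DCT matrix

    // Each coefficient is quantised as round(value / quantScale) + quantOffset,
    // clamped to the int8_t range.
    return mfcc.MfccComputeQuant<int8_t>(frame, quantScale, quantOffset);
}
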
diff --git a/samples/SpeechRecognition/include/MathUtils.hpp b/samples/SpeechRecognition/include/MathUtils.hpp
deleted file mode 100644
index 5f81fb6507..0000000000
--- a/samples/SpeechRecognition/include/MathUtils.hpp
+++ /dev/null
@@ -1,85 +0,0 @@
-//
-// Copyright © 2020 Arm Ltd and Contributors. All rights reserved.
-// SPDX-License-Identifier: MIT
-//
-
-#include <vector>
-#include <cmath>
-#include <cstdint>
-#include <numeric>
-
-class MathUtils
-{
-
-public:
-
- /**
- * @brief Computes the FFT for the input vector
- * @param[in] input Floating point vector of input elements
- * @param[out] fftOutput Output buffer to be populated by computed
- * FFTs
- * @return none
- */
- static void FftF32(std::vector<float>& input,
- std::vector<float>& fftOutput);
-
-
- /**
- * @brief Computes the dot product of two 1D floating point
- * vectors.
- * result = sum(srcA[0]*srcB[0] + srcA[1]*srcB[1] + ..)
- * @param[in] srcPtrA pointer to the first element of first
- * array
- * @param[in] srcPtrB pointer to the first element of second
- * array
- * @param[in] srcLen Number of elements in the array/vector
- * @return dot product
- */
- static float DotProductF32(float* srcPtrA, float* srcPtrB,
- const int srcLen);
-
- /**
- * @brief Computes the squared magnitude of floating point
- * complex number array.
- * @param[in] ptrSrc pointer to the first element of input
- * array
- * @param[in] srcLen Number of elements in the array/vector
- * @param[out] ptrDst Output buffer to be populated
- * @param[in] dstLen output buffer len (for sanity check only)
- * @return true if successful, false otherwise
- */
- static bool ComplexMagnitudeSquaredF32(float* ptrSrc,
- const int srcLen,
- float* ptrDst,
- const int dstLen);
-
- /**
- * @brief Computes the natural logarithms of input floating point
- * vector
- * @param[in] input Floating point input vector
- * @param[out] output Pre-allocated buffer to be populated with
- * natural log values of each input element
- * @return none
- */
- static void VecLogarithmF32(std::vector <float>& input,
- std::vector <float>& output);
-
- /**
- * @brief Gets the mean of a floating point array of elements
- * @param[in] ptrSrc pointer to the first element
- * @param[in] srcLen Number of elements in the array/vector
- * @return average value
- */
- static float MeanF32(float* ptrSrc, const uint32_t srcLen);
-
- /**
- * @brief Gets the standard deviation of a floating point array
- * of elements
- * @param[in] ptrSrc pointer to the first element
- * @param[in] srcLen Number of elements in the array/vector
- * @param[in] mean pre-computed mean value
- * @return standard deviation value
- */
- static float StdDevF32(float* ptrSrc, const uint32_t srcLen,
- const float mean);
-};
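
MathUtils only declares its helpers here. As an illustration of the intended semantics, one plausible implementation of the mean and standard-deviation routines; the authoritative versions live in the corresponding .cpp file.

// Sketch only: plausible MeanF32/StdDevF32 semantics, using a population std-dev.
#include <cmath>
#include <cstdint>

static float MeanF32Sketch(const float* ptrSrc, uint32_t srcLen)
{
    if (srcLen == 0) { return 0.0f; }
    float sum = 0.0f;
    for (uint32_t i = 0; i < srcLen; ++i) { sum += ptrSrc[i]; }
    return sum / static_cast<float>(srcLen);
}

static float StdDevF32Sketch(const float* ptrSrc, uint32_t srcLen, float mean)
{
    if (srcLen == 0) { return 0.0f; }
    float sumSq = 0.0f;
    for (uint32_t i = 0; i < srcLen; ++i)
    {
        const float diff = ptrSrc[i] - mean;
        sumSq += diff * diff;
    }
    return std::sqrt(sumSq / static_cast<float>(srcLen));
}
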
diff --git a/samples/SpeechRecognition/include/SlidingWindow.hpp b/samples/SpeechRecognition/include/SlidingWindow.hpp
deleted file mode 100644
index 791a0b7fc0..0000000000
--- a/samples/SpeechRecognition/include/SlidingWindow.hpp
+++ /dev/null
@@ -1,161 +0,0 @@
-//
-// Copyright © 2020 Arm Ltd and Contributors. All rights reserved.
-// SPDX-License-Identifier: MIT
-//
-
-#pragma once
-
-template<class T>
-class SlidingWindow
-{
-protected:
- T* m_start = nullptr;
- size_t m_dataSize = 0;
- size_t m_size = 0;
- size_t m_stride = 0;
- size_t m_count = 0;
-public:
-
- /**
- * Creates the window slider through the given data.
- *
- * @param data pointer to the data to slide through.
-     * @param dataSize size of the data, in elements of type T.
-     * @param windowSize sliding window size, in elements of type T.
-     * @param stride stride size, in elements of type T.
- */
- SlidingWindow(T* data, size_t dataSize,
- size_t windowSize, size_t stride)
- {
- m_start = data;
- m_dataSize = dataSize;
- m_size = windowSize;
- m_stride = stride;
- }
-
- SlidingWindow() = default;
-
- ~SlidingWindow() = default;
-
- /**
- * Get the next data window.
- * @return pointer to the next window, if next window is not available nullptr is returned.
- */
- virtual T* Next()
- {
- if (HasNext())
- {
- m_count++;
- return m_start + Index() * m_stride;
- }
- else
- {
- return nullptr;
- }
- }
-
- /**
- * Checks if the next data portion is available.
- * @return true if next data portion is available
- */
- bool HasNext()
- {
- return this->m_count < 1 + this->FractionalTotalStrides() && (this->NextWindowStartIndex() < this->m_dataSize);
- }
-
- /**
-     * Resets the slider to the initial position.
- */
- virtual void Reset()
- {
- m_count = 0;
- }
-
- /**
-     * Gets the sliding window size.
- */
- virtual size_t GetWindowSize()
- {
- return m_size;
- }
-
- /**
- * Resets the slider to the start of the new data.
- * New data size MUST be the same as the old one.
- * @param newStart pointer to the new data to slide through.
- */
- virtual void Reset(T* newStart)
- {
- m_start = newStart;
- Reset();
- }
-
- /**
- * Gets current index of the sliding window.
- * @return current position of the sliding window in number of strides
- */
- size_t Index()
- {
- return m_count == 0? 0: m_count - 1;
- }
-
- /**
- * Gets the index from the start of the data where the next window will begin.
- * While Index() returns the index of sliding window itself this function returns the index of the data
- * element itself.
- * @return Index from the start of the data where the next sliding window will begin.
- */
- virtual size_t NextWindowStartIndex()
- {
- return m_count == 0? 0: ((m_count) * m_stride);
- }
-
- /**
- * Go to given sliding window index.
- * @param index new position of the sliding window. if index is invalid (greater than possible range of strides)
- * then next call to Next() will return nullptr.
- */
- void FastForward(size_t index)
- {
- m_count = index;
- }
-
- /**
- * Calculates whole number of times the window can stride through the given data.
- * @return maximum number of strides.
- */
- size_t TotalStrides()
- {
- if (m_size > m_dataSize)
- {
- return 0;
- }
- return ((m_dataSize - m_size)/m_stride);
- }
-
- /**
- * Calculates number of times the window can stride through the given data. May not be a whole number.
- * @return Number of strides to cover all data.
- */
- float FractionalTotalStrides()
- {
- if(this->m_size > this->m_dataSize)
- {
- return this->m_dataSize / this->m_size;
- }
- else
- {
- return ((this->m_dataSize - this->m_size)/ static_cast<float>(this->m_stride));
- }
-
- }
-
- /**
- * Calculates the remaining data left to be processed
- * @return The remaining unprocessed data
- */
- int RemainingData()
- {
- return this->m_dataSize - this->NextWindowStartIndex();
- }
-}; \ No newline at end of file
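
A minimal sketch of sliding a window over a buffer with the removed SlidingWindow<T> template; the window and stride sizes are illustrative.

// Sketch only: iterate fixed-size, overlapping windows over an audio buffer.
#include <vector>
#include "SlidingWindow.hpp"

void VisitWindows(const std::vector<float>& audio)
{
    SlidingWindow<const float> window(audio.data(), audio.size(),
                                      /*windowSize=*/160, /*stride=*/80);

    while (window.HasNext())
    {
        const float* start = window.Next();   // pointer to the next window's first element
        (void)start;                          // process window.GetWindowSize() elements from 'start'
    }
    window.Reset();                           // rewind to the start of the same buffer
}
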
diff --git a/samples/SpeechRecognition/include/SpeechRecognitionPipeline.hpp b/samples/SpeechRecognition/include/SpeechRecognitionPipeline.hpp
index 47ce30416f..bc3fbfe151 100644
--- a/samples/SpeechRecognition/include/SpeechRecognitionPipeline.hpp
+++ b/samples/SpeechRecognition/include/SpeechRecognitionPipeline.hpp
@@ -8,16 +8,16 @@
#include "ArmnnNetworkExecutor.hpp"
#include "Decoder.hpp"
#include "MFCC.hpp"
-#include "Preprocess.hpp"
+#include "Wav2LetterPreprocessor.hpp"
-namespace asr
+namespace asr
{
/**
* Generic Speech Recognition pipeline with 3 steps: data pre-processing, inference execution and inference
* result post-processing.
*
*/
-class ASRPipeline
+class ASRPipeline
{
public:
@@ -27,7 +27,7 @@ public:
* @param decoder - unique pointer to inference results decoder
*/
ASRPipeline(std::unique_ptr<common::ArmnnNetworkExecutor<int8_t>> executor,
- std::unique_ptr<Decoder> decoder);
+ std::unique_ptr<Decoder> decoder, std::unique_ptr<Wav2LetterPreprocessor> preprocessor);
/**
* @brief Standard audio pre-processing implementation.
@@ -36,20 +36,16 @@ public:
* extracting the MFCC features.
* @param[in] audio - the raw audio data
- * @param[out] preprocessor - the preprocessor object, which handles the data prepreration
+ * @param[out] preprocessor - the preprocessor object, which handles the data preparation
*/
- template<typename Tin,typename Tout>
- std::vector<Tout> PreProcessing(std::vector<Tin>& audio, Preprocess& preprocessor)
- {
- int audioDataToPreProcess = preprocessor._m_windowLen +
- ((preprocessor._m_mfcc._m_params.m_numMfccVectors -1) *preprocessor._m_windowStride);
- int outputBufferSize = preprocessor._m_mfcc._m_params.m_numMfccVectors
- * preprocessor._m_mfcc._m_params.m_numMfccFeatures * 3;
- std::vector<Tout> outputBuffer(outputBufferSize);
- preprocessor.Invoke(audio.data(), audioDataToPreProcess, outputBuffer, m_executor->GetQuantizationOffset(),
- m_executor->GetQuantizationScale());
- return outputBuffer;
- }
+ std::vector<int8_t> PreProcessing(std::vector<float>& audio);
+
+ int getInputSamplesSize();
+ int getSlidingWindowOffset();
+
+    // This constant is exposed here because it can only be derived from knowledge about the model,
+    // not from the model file itself. It should eventually be refactored into the model settings.
+ int SLIDING_WINDOW_OFFSET;
/**
* @brief Executes inference
@@ -60,9 +56,9 @@ public:
* @param[out] result - raw inference results.
*/
template<typename T>
- void Inference(const std::vector<T>& preprocessedData, common::InferenceResults<int8_t>& result)
+ void Inference(const std::vector<T>& preprocessedData, common::InferenceResults<int8_t>& result)
{
- size_t data_bytes = sizeof(std::vector<T>) + (sizeof(T) * preprocessedData.size());
+ size_t data_bytes = sizeof(T) * preprocessedData.size();
m_executor->Run(preprocessedData.data(), data_bytes, result);
}
@@ -78,9 +74,9 @@ public:
*/
template<typename T>
void PostProcessing(common::InferenceResults<int8_t>& inferenceResult,
- bool& isFirstWindow,
- bool isLastWindow,
- std::string currentRContext)
+ bool& isFirstWindow,
+ bool isLastWindow,
+ std::string currentRContext)
{
int rowLength = 29;
int middleContextStart = 49;
@@ -92,17 +88,17 @@ public:
std::vector<T> contextToProcess;
// If isFirstWindow we keep the left context of the output
- if(isFirstWindow)
+ if (isFirstWindow)
{
std::vector<T> chunk(&inferenceResult[0][leftContextStart],
- &inferenceResult[0][middleContextEnd * rowLength]);
+ &inferenceResult[0][middleContextEnd * rowLength]);
contextToProcess = chunk;
}
- // Else we only keep the middle context of the output
- else
+ else
{
+ // Else we only keep the middle context of the output
std::vector<T> chunk(&inferenceResult[0][middleContextStart * rowLength],
- &inferenceResult[0][middleContextEnd * rowLength]);
+ &inferenceResult[0][middleContextEnd * rowLength]);
contextToProcess = chunk;
}
std::string output = this->m_decoder->DecodeOutput<T>(contextToProcess);
@@ -110,10 +106,10 @@ public:
std::cout << output << std::flush;
// If this is the last window, we print the right context of the output
- if(isLastWindow)
+ if (isLastWindow)
{
- std::vector<T> rContext(&inferenceResult[0][rightContextStart*rowLength],
- &inferenceResult[0][rightContextEnd * rowLength]);
+ std::vector<T> rContext(&inferenceResult[0][rightContextStart * rowLength],
+ &inferenceResult[0][rightContextEnd * rowLength]);
currentRContext = this->m_decoder->DecodeOutput(rContext);
std::cout << currentRContext << std::endl;
}
@@ -122,6 +118,7 @@ public:
protected:
std::unique_ptr<common::ArmnnNetworkExecutor<int8_t>> m_executor;
std::unique_ptr<Decoder> m_decoder;
+ std::unique_ptr<Wav2LetterPreprocessor> m_preProcessor;
};
using IPipelinePtr = std::unique_ptr<asr::ASRPipeline>;
@@ -136,4 +133,4 @@ using IPipelinePtr = std::unique_ptr<asr::ASRPipeline>;
*/
IPipelinePtr CreatePipeline(common::PipelineOptions& config, std::map<int, std::string>& labels);
-}// namespace asr \ No newline at end of file
+} // namespace asr \ No newline at end of file
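
To show how the reworked pipeline interface fits together, a sketch of a driver loop over a full clip. The windowing arithmetic, the assumption that common::InferenceResults<int8_t> is default-constructible, and the manual isFirstWindow update are simplifications, not code from this change.

// Sketch only: end-to-end use of the refactored ASRPipeline interface.
#include <cstddef>
#include <cstdint>
#include <map>
#include <string>
#include <vector>
#include "SpeechRecognitionPipeline.hpp"

void RunAsr(common::PipelineOptions& options,
            std::map<int, std::string>& labels,
            std::vector<float>& wholeClip)
{
    asr::IPipelinePtr pipeline = asr::CreatePipeline(options, labels);

    const size_t windowSamples = static_cast<size_t>(pipeline->getInputSamplesSize());    // samples per window
    const size_t windowOffset  = static_cast<size_t>(pipeline->getSlidingWindowOffset()); // hop between windows
    bool isFirstWindow = true;
    std::string rightContext;

    for (size_t begin = 0; begin + windowSamples <= wholeClip.size(); begin += windowOffset)
    {
        std::vector<float> window(wholeClip.begin() + begin,
                                  wholeClip.begin() + begin + windowSamples);
        const bool isLastWindow = (begin + windowOffset + windowSamples > wholeClip.size());

        std::vector<int8_t> input = pipeline->PreProcessing(window);   // MFCC + deltas, quantised
        common::InferenceResults<int8_t> results;
        pipeline->Inference<int8_t>(input, results);
        pipeline->PostProcessing<int8_t>(results, isFirstWindow, isLastWindow, rightContext);
        isFirstWindow = false;   // assumed to be cleared after the first window
    }
}
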
diff --git a/samples/SpeechRecognition/include/Wav2LetterMFCC.hpp b/samples/SpeechRecognition/include/Wav2LetterMFCC.hpp
new file mode 100644
index 0000000000..aa88aafb3b
--- /dev/null
+++ b/samples/SpeechRecognition/include/Wav2LetterMFCC.hpp
@@ -0,0 +1,78 @@
+//
+// Copyright © 2020 Arm Ltd and Contributors. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+#pragma once
+
+#include "MFCC.hpp"
+
+/* Class to provide Wav2Letter specific MFCC calculation requirements. */
+class Wav2LetterMFCC : public MFCC
+{
+
+public:
+ explicit Wav2LetterMFCC(const MfccParams& params)
+ : MFCC(params)
+ {}
+
+ Wav2LetterMFCC() = delete;
+ ~Wav2LetterMFCC() = default;
+
+protected:
+
+ /**
+ * @brief Overrides base class implementation of this function.
+ * @param[in] fftVec Vector populated with FFT magnitudes
+ * @param[in] melFilterBank 2D Vector with filter bank weights
+ * @param[in] filterBankFilterFirst Vector containing the first indices of filter bank
+ * to be used for each bin.
+ * @param[in] filterBankFilterLast Vector containing the last indices of filter bank
+ * to be used for each bin.
+ * @param[out] melEnergies Pre-allocated vector of MEL energies to be
+ * populated.
+ * @return true if successful, false otherwise
+ */
+ bool ApplyMelFilterBank(
+ std::vector<float>& fftVec,
+ std::vector<std::vector<float>>& melFilterBank,
+ std::vector<uint32_t>& filterBankFilterFirst,
+ std::vector<uint32_t>& filterBankFilterLast,
+ std::vector<float>& melEnergies) override;
+
+ /**
+ * @brief Override for the base class implementation convert mel
+ * energies to logarithmic scale. The difference from
+ * default behaviour is that the power is converted to dB
+ * and subsequently clamped.
+ * @param[in,out] melEnergies 1D vector of Mel energies
+ **/
+ void ConvertToLogarithmicScale(std::vector<float>& melEnergies) override;
+
+ /**
+ * @brief Create a matrix used to calculate Discrete Cosine
+ * Transform. Override for the base class' default
+ * implementation as the first and last elements
+ * use a different normaliser.
+ * @param[in] inputLength input length of the buffer on which
+ * DCT will be performed
+ * @param[in] coefficientCount Total coefficients per input length.
+ * @return 1D vector with inputLength x coefficientCount elements
+ * populated with DCT coefficients.
+ */
+ std::vector<float> CreateDCTMatrix(int32_t inputLength,
+ int32_t coefficientCount) override;
+
+ /**
+ * @brief Given the low and high Mel values, get the normaliser
+ * for weights to be applied when populating the filter
+ * bank. Override for the base class implementation.
+ * @param[in] leftMel Low Mel frequency value.
+ * @param[in] rightMel High Mel frequency value.
+ * @param[in] useHTKMethod bool to signal if HTK method is to be
+ * used for calculation.
+ * @return Value to use for normalising.
+ */
+ float GetMelFilterBankNormaliser(const float& leftMel,
+ const float& rightMel,
+ bool useHTKMethod) override;
+}; \ No newline at end of file
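
The ConvertToLogarithmicScale override is documented above as converting power to dB and then clamping. Written out for a single vector, that behaviour looks roughly like the following; the 10 * log10 form, the floor guard and the 80 dB dynamic range are assumptions, not values taken from the implementation.

// Sketch only: power-to-dB conversion followed by clamping to a fixed dynamic range.
#include <algorithm>
#include <cmath>
#include <vector>

void PowerToDbClamped(std::vector<float>& melEnergies, float dynamicRangeDb = 80.0f)
{
    float maxDb = -1.0e30f;
    for (float& e : melEnergies)
    {
        e = 10.0f * std::log10(std::max(e, 1.0e-10f));   // power to dB, guarding against log(0)
        maxDb = std::max(maxDb, e);
    }
    for (float& e : melEnergies)
    {
        e = std::max(e, maxDb - dynamicRangeDb);         // clamp everything below (peak - range)
    }
}
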
diff --git a/samples/SpeechRecognition/include/Preprocess.hpp b/samples/SpeechRecognition/include/Wav2LetterPreprocessor.hpp
index 80c568439b..ebc9e864e3 100644
--- a/samples/SpeechRecognition/include/Preprocess.hpp
+++ b/samples/SpeechRecognition/include/Wav2LetterPreprocessor.hpp
@@ -1,48 +1,23 @@
//
-// Copyright © 2020 Arm Ltd and Contributors. All rights reserved.
+// Copyright © 2022 Arm Ltd and Contributors. All rights reserved.
// SPDX-License-Identifier: MIT
//
+#ifndef SPEECH_RECOGNITION_EXAMPLE_WAV2LETTERPREPROCESSOR_HPP
+#define SPEECH_RECOGNITION_EXAMPLE_WAV2LETTERPREPROCESSOR_HPP
-#pragma once
-
+#include <numeric>
#include "DataStructures.hpp"
#include "SlidingWindow.hpp"
-#include <numeric>
#include "MFCC.hpp"
+#include "Wav2LetterMFCC.hpp"
+// Class to facilitate pre-processing calculation for Wav2Letter model for ASR
+using AudioWindow = SlidingWindow<const float>;
-/* Class to facilitate pre-processing calculation for Wav2Letter model
- * for ASR */
-using AudioWindow = SlidingWindow <const float>;
-
-class Preprocess
+class Wav2LetterPreprocessor
{
public:
-
- MFCC _m_mfcc; /* MFCC instance */
-
- /* Actual buffers to be populated */
- Array2d<float> _m_mfccBuf; /* Contiguous buffer 1D: MFCC */
- Array2d<float> _m_delta1Buf; /* Contiguous buffer 1D: Delta 1 */
- Array2d<float> _m_delta2Buf; /* Contiguous buffer 1D: Delta 2 */
-
- uint32_t _m_windowLen; /* Window length for MFCC */
- uint32_t _m_windowStride; /* Window stride len for MFCC */
- AudioWindow _m_window; /* Sliding window */
-
- /**
- * @brief Constructor
- * @param[in] numMfccFeatures number of MFCC features per window
- * @param[in] windowLen number of elements in a window
- * @param[in] windowStride stride (in number of elements) for
- * moving the window
- * @param[in] numMfccVectors number of MFCC vectors per window
- */
- Preprocess(
- const uint32_t windowLen,
- const uint32_t windowStride,
- const MFCC mfccInst);
- Preprocess() = delete;
- ~Preprocess();
+ Wav2LetterPreprocessor(uint32_t windowLen, uint32_t windowStride,
+ std::unique_ptr<Wav2LetterMFCC> mfccInst);
/**
* @brief Calculates the features required from audio data. This
@@ -55,12 +30,19 @@ public:
* @param[in] tensor tensor to be populated
* @return true if successful, false in case of error.
*/
- bool Invoke(const float* audioData,
- const uint32_t audioDataLen,
- std::vector<int8_t>& output,
- int quantOffset,
+ bool Invoke(const float* audioData, uint32_t audioDataLen, std::vector<int8_t>& output, int quantOffset,
float quantScale);
+ std::unique_ptr<MFCC> m_mfcc;
+
+ // Actual buffers to be populated
+ Array2d<float> m_mfccBuf; // Contiguous buffer 1D: MFCC
+ Array2d<float> m_delta1Buf; // Contiguous buffer 1D: Delta 1
+ Array2d<float> m_delta2Buf; // Contiguous buffer 1D: Delta 2
+
+ uint32_t m_windowLen; // Window length for MFCC
+ uint32_t m_windowStride; // Window stride len for MFCC
+ AudioWindow m_window; // Sliding window
protected:
/**
@@ -73,16 +55,18 @@ protected:
*
* @return true if successful, false otherwise
*/
- static bool _ComputeDeltas(Array2d<float>& mfcc,
- Array2d<float>& delta1,
- Array2d<float>& delta2);
+ static bool ComputeDeltas(Array2d<float>& mfcc,
+ Array2d<float>& delta1,
+ Array2d<float>& delta2);
+
+protected:
/**
* @brief Given a 2D vector of floats, computes the mean
* @param[in] vec vector of vector of floats
* @return mean value
*/
- static float _GetMean(Array2d<float>& vec);
+ static float GetMean(Array2d<float>& vec);
/**
* @brief Given a 2D vector of floats, computes the stddev
@@ -90,8 +74,7 @@ protected:
* @param[in] mean mean value of the vector passed in
* @return stddev value
*/
- static float _GetStdDev(Array2d<float>& vec,
- const float mean);
+ static float GetStdDev(Array2d<float>& vec, float mean);
/**
* @brief Given a 2D vector of floats, normalises it using
@@ -99,13 +82,13 @@ protected:
* @param[in/out] vec vector of vector of floats
* @return
*/
- static void _NormaliseVec(Array2d<float>& vec);
+ static void NormaliseVec(Array2d<float>& vec);
/**
* @brief Normalises the MFCC and delta buffers
* @return
*/
- void _Normalise();
+ void Normalise();
/**
* @brief Given the quantisation and data type limits, computes
@@ -117,12 +100,12 @@ protected:
* @param[in] maxVal Numerical limit - maximum
* @return floating point quantised value
*/
- static float _GetQuantElem(
- const float elem,
- const float quantScale,
- const int quantOffset,
- const float minVal,
- const float maxVal);
+ static float GetQuantElem(
+ float elem,
+ float quantScale,
+ int quantOffset,
+ float minVal,
+ float maxVal);
/**
* @brief Quantises the MFCC and delta buffers, and places them
@@ -137,39 +120,39 @@ protected:
* @param[in] quantScale quantisation scale
* @param[in] quantOffset quantisation offset
*/
- template <typename T>
- bool _Quantise(T* outputBuf, int quantOffset, float quantScale)
+ template<typename T>
+    bool Quantise(T* outputBuf, int quantOffset, float quantScale)
{
- /* Populate */
+ // Populate
T* outputBufMfcc = outputBuf;
- T* outputBufD1 = outputBuf + this->_m_mfcc._m_params.m_numMfccFeatures;
- T* outputBufD2 = outputBufD1 + this->_m_mfcc._m_params.m_numMfccFeatures;
- const uint32_t ptrIncr = this->_m_mfcc._m_params.m_numMfccFeatures * 2; /* (3 vectors - 1 vector) */
+ T* outputBufD1 = outputBuf + this->m_mfcc->m_params.m_numMfccFeatures;
+ T* outputBufD2 = outputBufD1 + this->m_mfcc->m_params.m_numMfccFeatures;
+ const uint32_t ptrIncr = this->m_mfcc->m_params.m_numMfccFeatures * 2; // (3 vectors - 1 vector)
const float minVal = std::numeric_limits<T>::min();
const float maxVal = std::numeric_limits<T>::max();
- /* We need to do a transpose while copying and concatenating
- * the tensor*/
- for (uint32_t j = 0; j < this->_m_mfcc._m_params.m_numMfccVectors; ++j) {
- for (uint32_t i = 0; i < this->_m_mfcc._m_params.m_numMfccFeatures; ++i)
+ // We need to do a transpose while copying and concatenating the tensor
+ for (uint32_t j = 0; j < this->m_mfcc->m_params.m_numMfccVectors; ++j)
+ {
+ for (uint32_t i = 0; i < this->m_mfcc->m_params.m_numMfccFeatures; ++i)
{
- *outputBufMfcc++ = static_cast<T>(this->_GetQuantElem(
- this->_m_mfccBuf(i, j), quantScale,
+ *outputBufMfcc++ = static_cast<T>(Wav2LetterPreprocessor::GetQuantElem(
+ this->m_mfccBuf(i, j), quantScale,
quantOffset, minVal, maxVal));
- *outputBufD1++ = static_cast<T>(this->_GetQuantElem(
- this->_m_delta1Buf(i, j), quantScale,
+ *outputBufD1++ = static_cast<T>(Wav2LetterPreprocessor::GetQuantElem(
+ this->m_delta1Buf(i, j), quantScale,
quantOffset, minVal, maxVal));
- *outputBufD2++ = static_cast<T>(this->_GetQuantElem(
- this->_m_delta2Buf(i, j), quantScale,
+ *outputBufD2++ = static_cast<T>(Wav2LetterPreprocessor::GetQuantElem(
+ this->m_delta2Buf(i, j), quantScale,
quantOffset, minVal, maxVal));
}
outputBufMfcc += ptrIncr;
outputBufD1 += ptrIncr;
outputBufD2 += ptrIncr;
}
-
return true;
}
};
+#endif //SPEECH_RECOGNITION_EXAMPLE_WAV2LETTERPREPROCESSOR_HPP
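
The Quantise() template above leans on GetQuantElem() for the per-element maths. A plausible definition, mirroring the round/offset/clamp pattern used by MFCC::MfccComputeQuant; the real one lives in the preprocessor's .cpp file.

// Sketch only: per-element affine quantisation with clamping to the target range.
#include <algorithm>
#include <cmath>

float GetQuantElemSketch(float elem, float quantScale, int quantOffset,
                         float minVal, float maxVal)
{
    // Scale, shift, round, then clamp to the numeric limits of the output type.
    const float quantised = std::round((elem / quantScale) + quantOffset);
    return std::min(std::max(quantised, minVal), maxVal);
}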