aboutsummaryrefslogtreecommitdiff
path: root/samples/common/include/Audio
diff options
context:
space:
mode:
Diffstat (limited to 'samples/common/include/Audio')
-rw-r--r--samples/common/include/Audio/AudioCapture.hpp57
-rw-r--r--samples/common/include/Audio/DataStructures.hpp102
-rw-r--r--samples/common/include/Audio/MFCC.hpp234
-rw-r--r--samples/common/include/Audio/MathUtils.hpp85
-rw-r--r--samples/common/include/Audio/SlidingWindow.hpp161
5 files changed, 639 insertions, 0 deletions
diff --git a/samples/common/include/Audio/AudioCapture.hpp b/samples/common/include/Audio/AudioCapture.hpp
new file mode 100644
index 0000000000..898bf911f4
--- /dev/null
+++ b/samples/common/include/Audio/AudioCapture.hpp
@@ -0,0 +1,57 @@
+//
+// Copyright © 2021 Arm Ltd and Contributors. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#pragma once
+
+#include <string>
+#include <iostream>
+#include <cmath>
+#include <vector>
+#include <exception>
+
+#include "SlidingWindow.hpp"
+
+namespace audio
+{
+
+/**
+ * @brief Holds a sliding window over audio samples and declares the helpers used to load
+ *        audio from a file and to step through it one window at a time.
+ *
+ * Only declarations are visible in this header; the member functions are defined elsewhere.
+ */
+ class AudioCapture
+ {
+ public:
+
+ SlidingWindow<const float> m_window; /* Public, read-only (const float) window over the samples. */
+
+ /**
+ * @brief Default constructor. m_window is default-constructed, so it refers to no data
+ *        until the sliding window has been initialised.
+ */
+ AudioCapture() = default;
+
+ /**
+ * @brief Function to load the audio data captured from the
+ * input file to memory.
+ * @param[in] filePath path of the input file (taken by value).
+ * @return the loaded samples as floats.
+ * NOTE(review): supported file formats and error reporting are not visible
+ * from this header - confirm against the implementation.
+ */
+ static std::vector<float> LoadAudioFile(std::string filePath);
+
+ /**
+ * @brief Function to initialize the sliding window. This will set its position in memory, its
+ * window size and its stride.
+ * @param[in] data pointer to the first sample to slide over.
+ * @param[in] dataSize number of samples available at data.
+ * @param[in] minSamples presumably the window size in samples - TODO confirm in the
+ * implementation (note it is an int while the other sizes are size_t).
+ * @param[in] stride distance, in samples, between the starts of consecutive windows.
+ */
+ void InitSlidingWindow(float* data, size_t dataSize, int minSamples, size_t stride);
+
+ /**
+ * Checks whether there is another block of audio in memory to read.
+ * @return true if another block is available.
+ */
+ bool HasNext();
+
+ /**
+ * Retrieves the next block of audio if it's available.
+ * @return the block as a vector of floats.
+ * NOTE(review): what is returned when no block is left is not visible here - confirm.
+ */
+ std::vector<float> Next();
+ };
+} // namespace audio \ No newline at end of file
diff --git a/samples/common/include/Audio/DataStructures.hpp b/samples/common/include/Audio/DataStructures.hpp
new file mode 100644
index 0000000000..9922265299
--- /dev/null
+++ b/samples/common/include/Audio/DataStructures.hpp
@@ -0,0 +1,102 @@
+//
+// Copyright © 2020 Arm Ltd and Contributors. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+#pragma once
+
+#include <stdio.h>
+#include <iterator>
+
+/**
+ * Class Array2d is a data structure that represents a two dimensional array.
+ * The data is allocated in contiguous memory, arranged row-wise
+ * and individual elements can be accessed with the () operator.
+ * For example a two dimensional array D of size (M, N) can be accessed:
+ *
+ *               _|<------------- col size = N ---------->|
+ *               |  D(r=0, c=0)   D(r=0, c=1)...   D(r=0, c=N-1)
+ *               |  D(r=1, c=0)   D(r=1, c=1)...   D(r=1, c=N-1)
+ *               |  ...
+ *    row size = M  ...
+ *               |  ...
+ *               _  D(r=M-1, c=0) D(r=M-1, c=1)... D(r=M-1, c=N-1)
+ *
+ * The array owns its storage: copies are deep, moves transfer the buffer and
+ * leave the source empty. Element access is not bounds checked.
+ */
+template<typename T>
+class Array2d
+{
+private:
+    /* In-class initialisers keep the object well defined on the zero-size path. */
+    size_t m_rows = 0;
+    size_t m_cols = 0;
+    T* m_data = nullptr;
+
+public:
+    /**
+     * Creates the array2d with the given sizes.
+     * If either size is 0 a message is printed and the array is left empty
+     * (both dimensions report 0 and no memory is allocated).
+     *
+     * @param rows  number of rows.
+     * @param cols  number of columns.
+     */
+    Array2d(unsigned rows, unsigned cols)
+    {
+        if (rows == 0 || cols == 0) {
+            printf("Array2d constructor has 0 size.\n");
+            return;
+        }
+        m_rows = rows;
+        m_cols = cols;
+        /* Multiply as size_t so the element count cannot wrap in unsigned arithmetic. */
+        m_data = new T[m_rows * m_cols];
+    }
+
+    /**
+     * Deep copy. The implicitly generated copy would share m_data and
+     * delete it twice.
+     */
+    Array2d(const Array2d& other)
+        : m_rows(other.m_rows),
+          m_cols(other.m_cols),
+          m_data(other.m_data != nullptr ? new T[other.m_rows * other.m_cols] : nullptr)
+    {
+        if (m_data == nullptr) {
+            return;
+        }
+        try {
+            const size_t count = m_rows * m_cols;
+            for (size_t i = 0; i < count; ++i) {
+                m_data[i] = other.m_data[i];
+            }
+        } catch (...) {
+            /* The destructor does not run for a partially constructed object. */
+            delete[] m_data;
+            throw;
+        }
+    }
+
+    /** Move constructor: takes over the buffer and leaves other empty. */
+    Array2d(Array2d&& other) noexcept
+        : m_rows(other.m_rows), m_cols(other.m_cols), m_data(other.m_data)
+    {
+        other.m_rows = 0;
+        other.m_cols = 0;
+        other.m_data = nullptr;
+    }
+
+    /**
+     * Unified copy/move assignment. The argument is already a private copy
+     * (or a moved-from temporary), so its buffer can simply be adopted.
+     */
+    Array2d& operator=(Array2d other) noexcept
+    {
+        delete[] m_data;
+        m_rows = other.m_rows;
+        m_cols = other.m_cols;
+        m_data = other.m_data;
+        other.m_rows = 0;
+        other.m_cols = 0;
+        other.m_data = nullptr;
+        return *this;
+    }
+
+    ~Array2d()
+    {
+        delete[] m_data;
+    }
+
+    /** Mutable access to element (row, col); indices are not range checked. */
+    T& operator() (unsigned int row, unsigned int col)
+    {
+        return m_data[m_cols * row + col];
+    }
+
+    /** Read access to element (row, col), returned by value; indices are not range checked. */
+    T operator() (unsigned int row, unsigned int col) const
+    {
+        return m_data[m_cols * row + col];
+    }
+
+    /**
+     * Gets the extent of the array2d along one dimension.
+     * @param dim 0 for the number of rows, 1 for the number of columns.
+     * @return the requested extent, or 0 for any other dim.
+     */
+    size_t size(size_t dim) const
+    {
+        switch (dim)
+        {
+            case 0:
+                return m_rows;
+            case 1:
+                return m_cols;
+            default:
+                return 0;
+        }
+    }
+
+    /**
+     * Gets the array2d total size (rows * cols).
+     * Const so that the const end() below can call it.
+     */
+    size_t totalSize() const
+    {
+        return m_rows * m_cols;
+    }
+
+    /**
+     * array2d iterator: plain pointers over the row-major storage.
+     */
+    using iterator=T*;
+    using const_iterator=T const*;
+
+    iterator begin() { return m_data; }
+    iterator end() { return m_data + totalSize(); }
+    const_iterator begin() const { return m_data; }
+    const_iterator end() const { return m_data + totalSize(); }
+};
diff --git a/samples/common/include/Audio/MFCC.hpp b/samples/common/include/Audio/MFCC.hpp
new file mode 100644
index 0000000000..468bf92fae
--- /dev/null
+++ b/samples/common/include/Audio/MFCC.hpp
@@ -0,0 +1,234 @@
+//
+// Copyright © 2020 Arm Ltd and Contributors. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+#pragma once
+
+
+#include <algorithm>
+#include <cmath>
+#include <cstddef>
+#include <cstdint>
+#include <limits>
+#include <string>
+#include <vector>
+
+/* MFCC's consolidated parameters.
+ * Plain bundle of public fields set up through the single constructor below.
+ * NOTE(review): units and valid ranges are not stated in this header - the
+ * per-field notes marked "presumably" are inferred from names and need confirming. */
+class MfccParams
+{
+public:
+ float m_samplingFreq; /* presumably the audio sampling frequency - TODO confirm units */
+ int m_numFbankBins; /* number of Mel energies combined per output feature in MFCC::MfccComputeQuant */
+ float m_melLoFreq; /* presumably the lower edge of the Mel filter bank - TODO confirm */
+ float m_melHiFreq; /* presumably the upper edge of the Mel filter bank - TODO confirm */
+ int m_numMfccFeatures; /* length of the feature vector returned by MFCC::MfccComputeQuant */
+ int m_frameLen; /* presumably the frame length in samples - TODO confirm */
+ int m_frameLenPadded; /* not a constructor argument; presumably derived from frameLen - confirm in the implementation */
+ bool m_useHtkMethod; /* presumably selects the HTK variant of the Mel scale - TODO confirm */
+ int m_numMfccVectors; /* presumably the number of MFCC vectors per inference - TODO confirm */
+ /** @brief Constructor. Takes one argument per field except m_frameLenPadded;
+ * defined outside this header. */
+ MfccParams(const float samplingFreq, const int numFbankBins,
+ const float melLoFreq, const float melHiFreq,
+ const int numMfccFeats, const int frameLen,
+ const bool useHtkMethod, const int numMfccVectors);
+ /* No default constructor: the parameters must always be supplied explicitly. */
+ MfccParams() = delete;
+ /* Default destructor */
+ ~MfccParams() = default;
+ /** @brief String representation of parameters
+ * @return the parameters rendered as a std::string (exact format defined by the implementation). */
+ std::string Str();
+};
+
+/**
+ * @brief   Class for MFCC feature extraction.
+ *          Based on https://github.com/ARM-software/ML-KWS-for-MCU/blob/master/Deployment/Source/MFCC/mfcc.cpp
+ *          This class is designed to be generic and self-sufficient but
+ *          certain calculation routines can be overridden to accommodate
+ *          use-case specific requirements.
+ */
+class MFCC {
+public:
+    /**
+     * @brief       Constructor
+     * @param[in]   params   MFCC parameters
+     */
+    explicit MFCC(const MfccParams& params);
+
+    MFCC() = delete;
+
+    ~MFCC() = default;
+
+    /**
+     * @brief       Extract MFCC features for one single small frame of
+     *              audio data e.g. 640 samples.
+     * @param[in]   audioData   Vector of audio samples to calculate
+     *                          features for.
+     * @return      Vector of extracted MFCC features.
+     **/
+    std::vector<float> MfccCompute(const std::vector<float>& audioData);
+
+    /** @brief  Initialise. */
+    void Init();
+
+    /**
+     * @brief        Extract MFCC features and quantise for one single small
+     *               frame of audio data e.g. 640 samples.
+     * @tparam       T             Quantised output type; each value is clamped to
+     *                             [numeric_limits<T>::lowest(), numeric_limits<T>::max()].
+     * @param[in]    audioData     Vector of audio samples to calculate
+     *                             features for.
+     * @param[in]    quantScale    Quantisation scale (the DCT output is divided by it).
+     * @param[in]    quantOffset   Quantisation offset (added after scaling).
+     * @return       Vector of m_params.m_numMfccFeatures quantised MFCC features.
+     **/
+    template<typename T>
+    std::vector<T> MfccComputeQuant(const std::vector<float>& audioData,
+                                    const float quantScale,
+                                    const int quantOffset)
+    {
+        this->MfccComputePreFeature(audioData);
+        /* lowest(), not min(): for a floating point T, min() is the smallest
+         * positive value and would clamp every negative result up to ~0.
+         * For integral T the two are identical. */
+        const float minVal = std::numeric_limits<T>::lowest();
+        const float maxVal = std::numeric_limits<T>::max();
+
+        std::vector<T> mfccOut(this->m_params.m_numMfccFeatures);
+        const size_t numFbankBins = this->m_params.m_numFbankBins;
+
+        /* Take DCT. Uses matrix mul: row i of the flattened DCT matrix starts at
+         * offset j = i * numFbankBins. */
+        for (size_t i = 0, j = 0; i < mfccOut.size(); ++i, j += numFbankBins)
+        {
+            float sum = 0;
+            for (size_t k = 0; k < numFbankBins; ++k)
+            {
+                sum += this->m_dctMatrix[j + k] * this->m_melEnergies[k];
+            }
+            /* Quantize to T. */
+            sum = std::round((sum / quantScale) + quantOffset);
+            mfccOut[i] = static_cast<T>(std::min<float>(std::max<float>(sum, minVal), maxVal));
+        }
+
+        return mfccOut;
+    }
+
+    MfccParams m_params;
+
+    /* Constants */
+    static constexpr float ms_logStep = /*logf(6.4)*/ 1.8562979903656 / 27.0;
+    static constexpr float ms_freqStep = 200.0 / 3;
+    static constexpr float ms_minLogHz = 1000.0;
+    static constexpr float ms_minLogMel = ms_minLogHz / ms_freqStep;
+
+protected:
+    /**
+     * @brief       Project input frequency to Mel Scale.
+     * @param[in]   freq           Input frequency in floating point.
+     * @param[in]   useHTKMethod   bool to signal if HTK method is to be
+     *                             used for calculation.
+     * @return      Mel transformed frequency in floating point.
+     **/
+    static float MelScale(float freq,
+                          bool useHTKMethod = true);
+
+    /**
+     * @brief       Inverse Mel transform - convert MEL warped frequency
+     *              back to normal frequency.
+     * @param[in]   melFreq        Mel frequency in floating point.
+     * @param[in]   useHTKMethod   bool to signal if HTK method is to be
+     *                             used for calculation.
+     * @return      Real world frequency in floating point.
+     **/
+    static float InverseMelScale(float melFreq,
+                                 bool useHTKMethod = true);
+
+    /**
+     * @brief       Populates MEL energies after applying the MEL filter
+     *              bank weights and adding them up to be placed into
+     *              bins, according to the filter bank's first and last
+     *              indices (pre-computed for each filter bank element
+     *              by CreateMelFilterBank function).
+     * @param[in]   fftVec                  Vector populated with FFT magnitudes.
+     * @param[in]   melFilterBank           2D Vector with filter bank weights.
+     * @param[in]   filterBankFilterFirst   Vector containing the first indices of filter bank
+     *                                      to be used for each bin.
+     * @param[in]   filterBankFilterLast    Vector containing the last indices of filter bank
+     *                                      to be used for each bin.
+     * @param[out]  melEnergies             Pre-allocated vector of MEL energies to be
+     *                                      populated.
+     * @return      true if successful, false otherwise.
+     */
+    virtual bool ApplyMelFilterBank(
+            std::vector<float>&                 fftVec,
+            std::vector<std::vector<float>>&    melFilterBank,
+            std::vector<uint32_t>&              filterBankFilterFirst,
+            std::vector<uint32_t>&              filterBankFilterLast,
+            std::vector<float>&                 melEnergies);
+
+    /**
+     * @brief           Converts the Mel energies for logarithmic scale.
+     * @param[in,out]   melEnergies   1D vector of Mel energies.
+     **/
+    virtual void ConvertToLogarithmicScale(std::vector<float>& melEnergies);
+
+    /**
+     * @brief       Create a matrix used to calculate Discrete Cosine
+     *              Transform.
+     * @param[in]   inputLength        Input length of the buffer on which
+     *                                 DCT will be performed.
+     * @param[in]   coefficientCount   Total coefficients per input length.
+     * @return      1D vector with inputLength x coefficientCount elements
+     *              populated with DCT coefficients.
+     */
+    virtual std::vector<float> CreateDCTMatrix(
+            int32_t inputLength,
+            int32_t coefficientCount);
+
+    /**
+     * @brief       Given the low and high Mel values, get the normaliser
+     *              for weights to be applied when populating the filter
+     *              bank.
+     * @param[in]   leftMel        Low Mel frequency value.
+     * @param[in]   rightMel       High Mel frequency value.
+     * @param[in]   useHTKMethod   bool to signal if HTK method is to be
+     *                             used for calculation.
+     * @return      Value to use for normalizing.
+     */
+    virtual float GetMelFilterBankNormaliser(
+            const float&   leftMel,
+            const float&   rightMel,
+            bool     useHTKMethod);
+
+private:
+
+    std::vector<float>              m_frame;
+    std::vector<float>              m_buffer;
+    std::vector<float>              m_melEnergies;
+    std::vector<float>              m_windowFunc;
+    std::vector<std::vector<float>> m_melFilterBank;
+    std::vector<float>              m_dctMatrix;
+    std::vector<uint32_t>           m_filterBankFilterFirst;
+    std::vector<uint32_t>           m_filterBankFilterLast;
+    bool                            m_filterBankInitialised;
+
+    /**
+     * @brief       Initialises the filter banks and the DCT matrix. **/
+    void InitMelFilterBank();
+
+    /**
+     * @brief       Signals whether the instance of MFCC has had its
+     *              required buffers initialised.
+     * @return      true if initialised, false otherwise.
+     **/
+    bool IsMelFilterBankInited() const;
+
+    /**
+     * @brief       Create mel filter banks for MFCC calculation.
+     * @return      2D vector of floats.
+     **/
+    std::vector<std::vector<float>> CreateMelFilterBank();
+
+    /**
+     * @brief       Computes and populates internal member buffers used
+     *              in MFCC feature calculation
+     * @param[in]   audioData   1D vector of audio samples (floats).
+     */
+    void MfccComputePreFeature(const std::vector<float>& audioData);
+
+    /** @brief       Computes the magnitude from an interleaved complex array. */
+    void ConvertToPowerSpectrum();
+
+};
diff --git a/samples/common/include/Audio/MathUtils.hpp b/samples/common/include/Audio/MathUtils.hpp
new file mode 100644
index 0000000000..1d8b0d31cc
--- /dev/null
+++ b/samples/common/include/Audio/MathUtils.hpp
@@ -0,0 +1,85 @@
+//
+// Copyright © 2020 Arm Ltd and Contributors. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#include <vector>
+#include <cmath>
+#include <cstdint>
+#include <numeric>
+
+class MathUtils /* Static-only maths helpers. NOTE(review): unlike its sibling headers, this one has no #pragma once / include guard. */
+{
+
+public:
+
+ /**
+ * @brief Computes the FFT for the input vector
+ * @param[in] input Floating point vector of input elements
+ * (passed by non-const reference).
+ * @param[out] fftOutput Output buffer to be populated by computed
+ * FFTs
+ * NOTE(review): the required size and layout of fftOutput
+ * are not visible from this header - confirm against the
+ * implementation.
+ * @return none
+ */
+ static void FftF32(std::vector<float>& input,
+ std::vector<float>& fftOutput);
+
+
+ /**
+ * @brief Computes the dot product of two 1D floating point
+ * vectors.
+ * result = srcA[0]*srcB[0] + srcA[1]*srcB[1] + ..
+ * @param[in] srcPtrA pointer to the first element of first
+ * array
+ * @param[in] srcPtrB pointer to the first element of second
+ * array. NOTE(review): declared non-const although
+ * documented as an input - presumably could be
+ * const float*; confirm against the definition.
+ * @param[in] srcLen Number of elements in each array/vector
+ * @return dot product
+ */
+ static float DotProductF32(const float* srcPtrA, float* srcPtrB,
+ int srcLen);
+
+ /**
+ * @brief Computes the squared magnitude of floating point
+ * complex number array.
+ * @param[in] ptrSrc pointer to the first element of input
+ * array
+ * @param[in] srcLen Number of elements in the array/vector
+ * NOTE(review): not visible here whether this counts
+ * floats or complex pairs - confirm.
+ * @param[out] ptrDst Output buffer to be populated
+ * @param[in] dstLen output buffer len (for sanity check only)
+ * @return true if successful, false otherwise
+ */
+ static bool ComplexMagnitudeSquaredF32(const float* ptrSrc,
+ int srcLen,
+ float* ptrDst,
+ int dstLen);
+
+ /**
+ * @brief Computes the natural logarithms of input floating point
+ * vector
+ * @param[in] input Floating point input vector
+ * @param[out] output Pre-allocated buffer to be populated with
+ * natural log values of each input element
+ * @return none
+ */
+ static void VecLogarithmF32(std::vector <float>& input,
+ std::vector <float>& output);
+
+ /**
+ * @brief Gets the mean of a floating point array of elements
+ * @param[in] ptrSrc pointer to the first element
+ * @param[in] srcLen Number of elements in the array/vector
+ * NOTE(review): behaviour for srcLen == 0 is not
+ * visible from this header - confirm.
+ * @return average value
+ */
+ static float MeanF32(const float* ptrSrc, uint32_t srcLen);
+
+ /**
+ * @brief Gets the standard deviation of a floating point array
+ * of elements
+ * @param[in] ptrSrc pointer to the first element
+ * @param[in] srcLen Number of elements in the array/vector
+ * @param[in] mean pre-computed mean value of the same elements
+ * @return standard deviation value
+ */
+ static float StdDevF32(const float* ptrSrc, uint32_t srcLen,
+ float mean);
+};
diff --git a/samples/common/include/Audio/SlidingWindow.hpp b/samples/common/include/Audio/SlidingWindow.hpp
new file mode 100644
index 0000000000..77498c6338
--- /dev/null
+++ b/samples/common/include/Audio/SlidingWindow.hpp
@@ -0,0 +1,161 @@
+//
+// Copyright © 2021 Arm Ltd and Contributors. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#pragma once
+
+/**
+ * Non-owning cursor that steps a fixed-size window through a contiguous buffer of T.
+ * The buffer is only referenced through a raw pointer and is never freed here, so it
+ * must outlive the window.
+ *
+ * Note: this header uses size_t but includes nothing itself, so it relies on the
+ * including file having pulled in a header that declares it.
+ */
+template<class T>
+class SlidingWindow
+{
+protected:
+    T* m_start = nullptr;
+    size_t m_dataSize = 0;
+    size_t m_size = 0;
+    size_t m_stride = 0;
+    size_t m_count = 0;
+public:
+
+    /**
+     * Creates the window slider through the given data.
+     *
+     * @param data          pointer to the data to slide through.
+     * @param dataSize      size in T type elements wise.
+     * @param windowSize    sliding window size in T type wise elements.
+     * @param stride        stride size in T type wise elements.
+     */
+    SlidingWindow(T* data, size_t dataSize,
+                  size_t windowSize, size_t stride)
+        : m_start(data),
+          m_dataSize(dataSize),
+          m_size(windowSize),
+          m_stride(stride)
+    {
+    }
+
+    SlidingWindow() = default;
+
+    /* Virtual because the class has virtual members and may be destroyed through a base pointer. */
+    virtual ~SlidingWindow() = default;
+
+    /**
+     * Get the next data window.
+     * @return pointer to the next window, if next window is not available nullptr is returned.
+     */
+    virtual T* Next()
+    {
+        if (!HasNext())
+        {
+            return nullptr;
+        }
+        m_count++;
+        /* After the increment Index() is the zero-based number of the window being returned. */
+        return m_start + Index() * m_stride;
+    }
+
+    /**
+     * Checks if the next data portion is available.
+     * A window is offered while fewer than 1 + FractionalTotalStrides() windows have been
+     * handed out and the next window would still start inside the data, so the last
+     * window may extend past the end of the data.
+     * @return true if next data portion is available
+     */
+    bool HasNext()
+    {
+        return m_count < 1 + FractionalTotalStrides() && (NextWindowStartIndex() < m_dataSize);
+    }
+
+    /**
+     * Resets the slider to the initial position.
+     */
+    virtual void Reset()
+    {
+        m_count = 0;
+    }
+
+    /**
+     * Gets the size of the sliding window.
+     * @return window size in T type elements.
+     */
+    virtual size_t GetWindowSize()
+    {
+        return m_size;
+    }
+
+    /**
+     * Resets the slider to the start of the new data.
+     * New data size MUST be the same as the old one.
+     * @param newStart pointer to the new data to slide through.
+     */
+    virtual void Reset(T* newStart)
+    {
+        m_start = newStart;
+        Reset();
+    }
+
+    /**
+     * Gets current index of the sliding window.
+     * @return current position of the sliding window in number of strides
+     *         (0 both before the first call to Next() and after it).
+     */
+    size_t Index()
+    {
+        return m_count == 0 ? 0 : m_count - 1;
+    }
+
+    /**
+     * Gets the index from the start of the data where the next window will begin.
+     * While Index() returns the index of sliding window itself this function returns the index of the data
+     * element itself.
+     * @return Index from the start of the data where the next sliding window will begin.
+     */
+    virtual size_t NextWindowStartIndex()
+    {
+        return m_count * m_stride;
+    }
+
+    /**
+     * Go to given sliding window index.
+     * @param index new position of the sliding window. if index is invalid (greater than possible range of strides)
+     *              then next call to Next() will return nullptr.
+     */
+    void FastForward(size_t index)
+    {
+        m_count = index;
+    }
+
+    /**
+     * Calculates whole number of times the window can stride through the given data.
+     * @return maximum number of strides; 0 when the window is larger than the data or
+     *         when the stride is 0 (which includes a default-constructed window).
+     */
+    size_t TotalStrides()
+    {
+        /* The stride check avoids an integer division by zero. */
+        if (m_size > m_dataSize || m_stride == 0)
+        {
+            return 0;
+        }
+        return (m_dataSize - m_size) / m_stride;
+    }
+
+    /**
+     * Calculates number of times the window can stride through the given data. May not be a whole number.
+     * @return Number of strides to cover all data.
+     */
+    float FractionalTotalStrides()
+    {
+        if (m_size > m_dataSize)
+        {
+            /* Integer division, kept as is: it is always 0 in this branch, which makes
+             * HasNext() offer exactly one window when the data is shorter than the window
+             * (and not empty). */
+            return static_cast<float>(m_dataSize / m_size);
+        }
+        return static_cast<float>(m_dataSize - m_size) / static_cast<float>(m_stride);
+    }
+
+    /**
+     * Calculates the remaining data left to be processed
+     * @return The remaining unprocessed data, i.e. the data size minus NextWindowStartIndex(),
+     *         narrowed to int.
+     */
+    int RemainingData()
+    {
+        return static_cast<int>(m_dataSize - NextWindowStartIndex());
+    }
+}; \ No newline at end of file