samples/SpeechRecognition/src/Wav2LetterMFCC.cpp


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126

//
// Copyright © 2020 Arm Ltd and Contributors. All rights reserved.
// SPDX-License-Identifier: MIT
//
#include "Wav2LetterMFCC.hpp"
#include "MathUtils.hpp"

#include <cfloat>

// Applies every mel filter to the FFT values and stores one energy per filter.
//
// The number of filters is taken from melEnergies.size(). For filter `bin`,
// the weights in melFilterBank[bin] are multiplied element by element with
// fftVec[filterBankFilterFirst[bin] .. filterBankFilterLast[bin]] (the upper
// index is clamped to the last valid element of fftVec) and accumulated into
// melEnergies[bin].
//
// Returns false, after printing a message, when the first/last index vectors
// do not hold exactly one entry per filter or when melFilterBank holds fewer
// weight vectors than there are filters. Returns true otherwise.
bool Wav2LetterMFCC::ApplyMelFilterBank(
        std::vector<float>&                 fftVec,
        std::vector<std::vector<float>>&    melFilterBank,
        std::vector<uint32_t>&               filterBankFilterFirst,
        std::vector<uint32_t>&               filterBankFilterLast,
        std::vector<float>&                 melEnergies)
{
    const size_t numBanks = melEnergies.size();

    // melFilterBank is indexed by bin below, so it has to be validated too;
    // otherwise a short filter bank would be read out of bounds.
    if (numBanks != filterBankFilterFirst.size() ||
            numBanks != filterBankFilterLast.size() ||
            melFilterBank.size() < numBanks)
    {
        printf("Unexpected filter bank lengths\n");
        return false;
    }

    const size_t fftSize = fftVec.size();

    for (size_t bin = 0; bin < numBanks; ++bin)
    {
        const std::vector<float>& weights = melFilterBank[bin];
        auto weightIter = weights.begin();
        const auto weightsEnd = weights.end();

        // Avoid log of zero at later stages, same value used in librosa.
        // The number was used during our default wav2letter model training.
        float melEnergy = 1e-10;

        const size_t firstIndex = filterBankFilterFirst[bin];
        const size_t lastIndex = filterBankFilterLast[bin];

        // Exclusive upper bound, clamped to the FFT length. Written this way
        // so that an empty fftVec yields zero iterations instead of wrapping
        // around on `size() - 1`, and so that `lastIndex + 1` cannot overflow.
        const size_t endIndex = (lastIndex < fftSize) ? lastIndex + 1 : fftSize;

        for (size_t i = firstIndex; i < endIndex && weightIter != weightsEnd; ++i)
        {
            melEnergy += (*weightIter++ * fftVec[i]);
        }

        melEnergies[bin] = melEnergy;
    }

    return true;
}

// Converts the mel energies, in place, to a clamped logarithmic scale.
//
// Steps:
//   1. Take the logarithm of every energy via MathUtils::VecLogarithmF32.
//   2. Scale each log value by 10 * log10(e) and remember the largest result.
//   3. Raise every value that lies more than 80 below that maximum up to
//      (maximum - 80).
void Wav2LetterMFCC::ConvertToLogarithmicScale(std::vector<float>& melEnergies)
{
    // The logs taken below are natural logs, so they are multiplied by
    // log10(e); for the wav2letter model the log10 values are scaled by 10.
    constexpr float multiplier = 10.0 *  // Default scalar.
                                  0.4342944819032518;  // log10f(std::exp(1.0))

    // Width of the window kept below the maximum value.
    constexpr float maxDb = 80.0;

    // Scratch buffer receiving the logarithm of each mel energy.
    std::vector<float> naturalLogs(melEnergies.size(), 0.f);
    MathUtils::VecLogarithmF32(melEnergies, naturalLogs);

    // Only walk the part both vectors have in common.
    const size_t count = (naturalLogs.size() < melEnergies.size())
                             ? naturalLogs.size()
                             : melEnergies.size();

    // Scale the logs and track the largest scaled value seen.
    float peak = -FLT_MAX;
    for (size_t idx = 0; idx < count; ++idx)
    {
        const float scaled = naturalLogs[idx] * multiplier;
        melEnergies[idx] = scaled;
        peak = std::max(peak, scaled);
    }

    // Clamp everything that falls below the allowed window.
    const float lowestAllowed = peak - maxDb;
    for (size_t idx = 0; idx < melEnergies.size(); ++idx)
    {
        melEnergies[idx] = std::max(melEnergies[idx], lowestAllowed);
    }
}

// Builds the DCT matrix as a flat vector of coefficientCount rows, each
// inputLength elements long (row k starts at index k * inputLength).
//
// Row 0 is constant (cos(0) = 1) and uses the K0 normalizer. Row k >= 1
// holds normalizer * cos((n + 0.5) * angle_k) for n in [0, inputLength),
// where angle_k is obtained by adding pi / inputLength once per row.
//
// Returns an empty vector when either dimension is not positive.
std::vector<float> Wav2LetterMFCC::CreateDCTMatrix(
                                    const int32_t inputLength,
                                    const int32_t coefficientCount)
{
    // Without this guard a zero coefficientCount would allocate an empty
    // matrix and then write inputLength elements into it, and negative
    // dimensions would produce a bogus allocation size.
    if (inputLength <= 0 || coefficientCount <= 0)
    {
        return {};
    }

    // Compute the element count in size_t so the product cannot overflow
    // the 32-bit signed range.
    const size_t rowLength = static_cast<size_t>(inputLength);
    const size_t rowCount = static_cast<size_t>(coefficientCount);
    std::vector<float> dctMatrix(rowLength * rowCount);

    // Orthonormal normalization.
    const float normalizerK0 = 2 * sqrtf(1.0f /
                                    static_cast<float>(4 * inputLength));
    const float normalizer = 2 * sqrtf(1.0f /
                                    static_cast<float>(2 * inputLength));

    const float angleIncr = M_PI / inputLength;
    float angle = angleIncr;  // We start using it at k = 1 loop.

    // First row of DCT will use normalizer K0.
    for (size_t n = 0; n < rowLength; ++n)
    {
        dctMatrix[n] = normalizerK0;  // cos(0) = 1
    }

    // Second row (index = 1) onwards, we use standard normalizer.
    // The angle is accumulated row by row (rather than recomputed as
    // k * angleIncr) to keep the generated values identical to before.
    for (size_t k = 1, rowStart = rowLength; k < rowCount; ++k, rowStart += rowLength)
    {
        for (int32_t n = 0; n < inputLength; ++n)
        {
            dctMatrix[rowStart + static_cast<size_t>(n)] =
                normalizer * cosf((n + 0.5f) * angle);
        }
        angle += angleIncr;
    }
    return dctMatrix;
}

// Returns the weight normaliser for one mel filter: 2 divided by the
// difference between the inverse-mel-scaled right and left edges.
// (Slaney normalization for mel weights.)
float Wav2LetterMFCC::GetMelFilterBankNormaliser(
                                const float&    leftMel,
                                const float&    rightMel,
                                const bool      useHTKMethod)
{
    // Map both filter edges back through the inverse mel scale.
    const auto upperEdge = MFCC::InverseMelScale(rightMel, useHTKMethod);
    const auto lowerEdge = MFCC::InverseMelScale(leftMel, useHTKMethod);

    const auto bandWidth = upperEdge - lowerEdge;
    return (2.0f / bandWidth);
}