/* Path: source/application/api/use_case/asr/include/Wav2LetterMfcc.hpp
 * Blob: b05d79bbfbe4d12d3ffefaadc4e069bb1fc1e263 */
/*
 * SPDX-FileCopyrightText: Copyright 2021 Arm Limited and/or its affiliates <open-source-office@arm.com>
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#ifndef ASR_WAV2LETTER_MFCC_HPP
#define ASR_WAV2LETTER_MFCC_HPP

#include "Mfcc.hpp"

namespace arm {
namespace app {
namespace audio {

    /**
     * @brief   MFCC calculator tailored to the Wav2Letter use case.
     *
     *          Derives from MFCC and overrides the processing steps listed
     *          in the protected section below.
     */
    class Wav2LetterMFCC : public MFCC {

    public:
        /* Defaults handed to MfccParams when the base class is constructed. */
        static constexpr uint32_t ms_defaultSamplingFreq = 16000;
        static constexpr uint32_t ms_defaultNumFbankBins = 128;
        static constexpr uint32_t ms_defaultMelLoFreq = 0;
        static constexpr uint32_t ms_defaultMelHiFreq = 8000;
        static constexpr bool ms_defaultUseHtkMethod = false;

        /* Feature count and frame length must always be supplied. */
        Wav2LetterMFCC() = delete;

        /**
         * @brief       Constructs the object by passing the class defaults,
         *              together with the caller supplied values, to the
         *              base class via MfccParams.
         * @param[in]   numFeats   Forwarded to MfccParams after the Mel
         *                         frequency limits.
         * @param[in]   frameLen   Forwarded to MfccParams after numFeats.
         */
        explicit Wav2LetterMFCC(const size_t numFeats, const size_t frameLen)
            : MFCC(MfccParams(ms_defaultSamplingFreq,
                              ms_defaultNumFbankBins,
                              ms_defaultMelLoFreq,
                              ms_defaultMelHiFreq,
                              numFeats,
                              frameLen,
                              ms_defaultUseHtkMethod))
        {
        }

        ~Wav2LetterMFCC() = default;

    protected:

        /**
         * @brief       Replaces the base class implementation of this
         *              function.
         * @param[in]   fftVec                  Vector holding the FFT magnitudes.
         * @param[in]   melFilterBank           2D vector of filter bank weights.
         * @param[in]   filterBankFilterFirst   For each bin, the first filter
         *                                      bank index to be used.
         * @param[in]   filterBankFilterLast    For each bin, the last filter
         *                                      bank index to be used.
         * @param[out]  melEnergies             Pre-allocated vector that
         *                                      receives the MEL energies.
         * @return      true on success, false otherwise.
         */
        bool ApplyMelFilterBank(std::vector<float>& fftVec,
                                std::vector<std::vector<float>>& melFilterBank,
                                std::vector<uint32_t>& filterBankFilterFirst,
                                std::vector<uint32_t>& filterBankFilterLast,
                                std::vector<float>& melEnergies) override;

        /**
         * @brief           Replaces the base class implementation that
         *                  converts Mel energies to a logarithmic scale.
         *                  Unlike the default behaviour, the power is
         *                  converted to dB and then clamped.
         * @param[in,out]   melEnergies   1D vector of Mel energies.
         */
        void ConvertToLogarithmicScale(std::vector<float>& melEnergies) override;

        /**
         * @brief       Builds the matrix used for computing the Discrete
         *              Cosine Transform. The base class default is replaced
         *              because the first and last elements use a different
         *              normaliser.
         * @param[in]   inputLength        Length of the buffer the DCT will
         *                                 be performed on.
         * @param[in]   coefficientCount   Total coefficients per input length.
         * @return      1D vector of inputLength x coefficientCount elements
         *              filled with DCT coefficients.
         */
        std::vector<float> CreateDCTMatrix(int32_t inputLength,
                                           int32_t coefficientCount) override;

        /**
         * @brief       Returns the normaliser for the weights applied while
         *              populating the filter bank, given the low and high
         *              Mel values. Replaces the base class implementation.
         * @param[in]   leftMel        Low Mel frequency value.
         * @param[in]   rightMel       High Mel frequency value.
         * @param[in]   useHTKMethod   Signals whether the HTK method is to
         *                             be used for the calculation.
         * @return      Value to use for normalising.
         */
        float GetMelFilterBankNormaliser(const float& leftMel,
                                         const float& rightMel,
                                         bool useHTKMethod) override;
    };

} /* namespace audio */
} /* namespace app */
} /* namespace arm */

#endif /* ASR_WAV2LETTER_MFCC_HPP */