/*
 * Copyright (c) 2022 Arm Limited. All rights reserved.
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#include "KwsProcessing.hpp"
#include "log_macros.h"
#include "MicroNetKwsModel.hpp"

#include <cstring>    /* For std::memcpy(). */

namespace arm {
namespace app {

    KwsPreProcess::KwsPreProcess(TfLiteTensor* inputTensor, size_t numFeatures, size_t numMfccFrames,
                                 int mfccFrameLength, int mfccFrameStride):
        m_inputTensor{inputTensor},
        m_mfccFrameLength{mfccFrameLength},
        m_mfccFrameStride{mfccFrameStride},
        m_numMfccFrames{numMfccFrames},
        m_mfcc{audio::MicroNetKwsMFCC(numFeatures, mfccFrameLength)}
    {
        this->m_mfcc.Init();

        /* Deduce the data length required for 1 inference from the network parameters. */
        this->m_audioDataWindowSize = this->m_numMfccFrames * this->m_mfccFrameStride +
                (this->m_mfccFrameLength - this->m_mfccFrameStride);

        /* Create an MFCC feature sliding window for the data required for 1 inference. */
        this->m_mfccSlidingWindow = audio::SlidingWindow<const int16_t>(nullptr,
                this->m_audioDataWindowSize, this->m_mfccFrameLength, this->m_mfccFrameStride);

        /* For longer audio clips we choose to move by half the audio window size
         * => for a 1 second window size there is an overlap of 0.5 seconds. */
        this->m_audioDataStride = this->m_audioDataWindowSize / 2;

        /* To make previously calculated features reusable, the audio stride must be a
         * multiple of the MFCC frame stride. Reduce the stride through audio if needed. */
        if (0 != this->m_audioDataStride % this->m_mfccFrameStride) {
            this->m_audioDataStride -= this->m_audioDataStride % this->m_mfccFrameStride;
        }

        this->m_numMfccVectorsInAudioStride = this->m_audioDataStride / this->m_mfccFrameStride;

        /* Calculate the number of feature vectors in the window overlap region.
         * These feature vectors will be reused. */
        this->m_numReusedMfccVectors = this->m_mfccSlidingWindow.TotalStrides() + 1
                - this->m_numMfccVectorsInAudioStride;

        /* Construct the feature calculation function. */
        this->m_mfccFeatureCalculator = GetFeatureCalculator(this->m_mfcc, this->m_inputTensor,
                                                             this->m_numReusedMfccVectors);

        if (!this->m_mfccFeatureCalculator) {
            printf_err("Feature calculator not initialized.");
        }
    }

    bool KwsPreProcess::DoPreProcess(const void* data, size_t inputSize)
    {
        UNUSED(inputSize);
        if (data == nullptr) {
            printf_err("Data pointer is null");
            return false;
        }

        /* Set the features sliding window to the new address. */
        auto input = static_cast<const int16_t*>(data);
        this->m_mfccSlidingWindow.Reset(input);

        /* Cache is only usable if we have more than 1 inference in an audio clip. */
        bool useCache = this->m_audioWindowIndex > 0 && this->m_numReusedMfccVectors > 0;

        /* Use a sliding window to calculate MFCC features frame by frame. */
        while (this->m_mfccSlidingWindow.HasNext()) {
            const int16_t* mfccWindow = this->m_mfccSlidingWindow.Next();

            std::vector<int16_t> mfccFrameAudioData = std::vector<int16_t>(mfccWindow,
                    mfccWindow + this->m_mfccFrameLength);

            /* Compute features for this window and write them to the input tensor. */
            this->m_mfccFeatureCalculator(mfccFrameAudioData, this->m_mfccSlidingWindow.Index(),
                                          useCache, this->m_numMfccVectorsInAudioStride);
        }

        debug("Input tensor populated \n");

        return true;
    }

    /**
     * @brief Generic feature calculator factory.
     *
     * Returns a lambda function to compute features using a features cache.
     * The real features math is done by a lambda function provided as a parameter.
     * Features are written to the input tensor memory.
     *
     * @tparam T                Feature vector type.
     * @param[in] inputTensor   Model input tensor pointer.
     * @param[in] cacheSize     Number of feature vectors to cache.
     *                          Defined by the sliding window overlap.
     * @param[in] compute       Features calculator function.
     * @return                  Lambda function to compute features.
     */
    template<class T>
    std::function<void (std::vector<int16_t>&, size_t, bool, size_t)>
    KwsPreProcess::FeatureCalc(TfLiteTensor* inputTensor, size_t cacheSize,
                               std::function<std::vector<T> (std::vector<int16_t>&)> compute)
    {
        /* Feature cache to be captured by the lambda function. */
        static std::vector<std::vector<T>> featureCache = std::vector<std::vector<T>>(cacheSize);

        return [=](std::vector<int16_t>& audioDataWindow,
                   size_t index,
                   bool useCache,
                   size_t featuresOverlapIndex)
        {
            T* tensorData = tflite::GetTensorData<T>(inputTensor);
            std::vector<T> features;

            /* Reuse features from the cache if the cache is ready and the sliding windows
             * overlap. The overlap is at the beginning of the sliding window and has the
             * size of the feature cache. */
            if (useCache && index < featureCache.size()) {
                features = std::move(featureCache[index]);
            } else {
                features = std::move(compute(audioDataWindow));
            }
            auto size = features.size();
            auto sizeBytes = sizeof(T) * size;
            std::memcpy(tensorData + (index * size), features.data(), sizeBytes);

            /* Start renewing the cache as soon as the iteration goes out of the window overlap. */
            if (index >= featuresOverlapIndex) {
                featureCache[index - featuresOverlapIndex] = std::move(features);
            }
        };
    }

    template std::function<void (std::vector<int16_t>&, size_t, bool, size_t)>
    KwsPreProcess::FeatureCalc<int8_t>(TfLiteTensor* inputTensor,
                                       size_t cacheSize,
                                       std::function<std::vector<int8_t> (std::vector<int16_t>&)> compute);

    template std::function<void (std::vector<int16_t>&, size_t, bool, size_t)>
    KwsPreProcess::FeatureCalc<float>(TfLiteTensor* inputTensor,
                                      size_t cacheSize,
                                      std::function<std::vector<float> (std::vector<int16_t>&)> compute);

    std::function<void (std::vector<int16_t>&, int, bool, size_t)>
    KwsPreProcess::GetFeatureCalculator(audio::MicroNetKwsMFCC& mfcc, TfLiteTensor* inputTensor,
                                        size_t cacheSize)
    {
        std::function<void (std::vector<int16_t>&, size_t, bool, size_t)> mfccFeatureCalc;

        TfLiteQuantization quant = inputTensor->quantization;

        if (kTfLiteAffineQuantization == quant.type) {
            auto* quantParams = (TfLiteAffineQuantization*) quant.params;
            const float quantScale = quantParams->scale->data[0];
            const int quantOffset = quantParams->zero_point->data[0];

            switch (inputTensor->type) {
                case kTfLiteInt8: {
                    mfccFeatureCalc = this->FeatureCalc<int8_t>(inputTensor, cacheSize,
                            [=, &mfcc](std::vector<int16_t>& audioDataWindow) {
                                return mfcc.MfccComputeQuant<int8_t>(audioDataWindow,
                                                                     quantScale, quantOffset);
                            });
                    break;
                }
                default:
                    printf_err("Tensor type %s not supported\n", TfLiteTypeGetName(inputTensor->type));
            }
        } else {
            mfccFeatureCalc = this->FeatureCalc<float>(inputTensor, cacheSize,
                    [&mfcc](std::vector<int16_t>& audioDataWindow) {
                        return mfcc.MfccCompute(audioDataWindow);
                    });
        }
        return mfccFeatureCalc;
    }

    KwsPostProcess::KwsPostProcess(TfLiteTensor* outputTensor, Classifier& classifier,
                                   const std::vector<std::string>& labels,
                                   std::vector<ClassificationResult>& results)
        : m_outputTensor{outputTensor},
          m_kwsClassifier{classifier},
          m_labels{labels},
          m_results{results}
    {}

    bool KwsPostProcess::DoPostProcess()
    {
        return this->m_kwsClassifier.GetClassificationResults(
                this->m_outputTensor, this->m_results,
                this->m_labels, 1, true);
    }

} /* namespace app */
} /* namespace arm */
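
/* ------------------------------------------------------------------------
 * Usage sketch (illustrative only, compiled out via #if 0): how KwsPreProcess
 * and KwsPostProcess are typically wired around an inference call. The Model
 * accessors (GetInputTensor(), GetOutputTensor(), RunInference()), the public
 * window/stride/index members of KwsPreProcess, and the MicroNet KWS
 * parameters used below (10 MFCC features, 49 frames, 640-sample frames,
 * 320-sample stride at 16 kHz) are assumptions taken from the wider
 * evaluation-kit application code, not definitions made in this file.
 * ------------------------------------------------------------------------ */
#if 0
static void ExampleKwsPipeline(arm::app::Model& model,
                               arm::app::Classifier& classifier,
                               const std::vector<std::string>& labels,
                               const int16_t* audioClip,
                               size_t clipLength)
{
    using namespace arm::app;

    std::vector<ClassificationResult> results;

    KwsPreProcess preProcess{model.GetInputTensor(0),
                             10,    /* numFeatures: MFCC coefficients per frame (assumed). */
                             49,    /* numMfccFrames: feature frames per inference (assumed). */
                             640,   /* mfccFrameLength: 40 ms at 16 kHz (assumed). */
                             320};  /* mfccFrameStride: 20 ms at 16 kHz (assumed). */
    KwsPostProcess postProcess{model.GetOutputTensor(0), classifier, labels, results};

    /* Step through the clip by the stride derived in the constructor. Advancing
     * m_audioWindowIndex lets DoPreProcess() reuse the cached MFCC vectors from
     * the overlap between consecutive audio windows. */
    for (size_t offset = 0;
         offset + preProcess.m_audioDataWindowSize <= clipLength;
         offset += preProcess.m_audioDataStride, ++preProcess.m_audioWindowIndex) {
        preProcess.DoPreProcess(audioClip + offset, preProcess.m_audioDataWindowSize);
        model.RunInference();
        postProcess.DoPostProcess();
    }
}
#endif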