summaryrefslogtreecommitdiff
path: root/source/use_case/asr/src
diff options
context:
space:
mode:
Diffstat (limited to 'source/use_case/asr/src')
-rw-r--r--source/use_case/asr/src/AsrClassifier.cc130
-rw-r--r--source/use_case/asr/src/MainLoop.cc230
-rw-r--r--source/use_case/asr/src/OutputDecode.cc47
-rw-r--r--source/use_case/asr/src/UseCaseHandler.cc288
-rw-r--r--source/use_case/asr/src/Wav2LetterMfcc.cc137
-rw-r--r--source/use_case/asr/src/Wav2LetterModel.cc56
-rw-r--r--source/use_case/asr/src/Wav2LetterPostprocess.cc172
-rw-r--r--source/use_case/asr/src/Wav2LetterPreprocess.cc228
8 files changed, 1288 insertions, 0 deletions
diff --git a/source/use_case/asr/src/AsrClassifier.cc b/source/use_case/asr/src/AsrClassifier.cc
new file mode 100644
index 0000000..7377d30
--- /dev/null
+++ b/source/use_case/asr/src/AsrClassifier.cc
@@ -0,0 +1,130 @@
+/*
+ * Copyright (c) 2021 Arm Limited. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "AsrClassifier.hpp"
+
+#include "hal.h"
+#include "TensorFlowLiteMicro.hpp"
+#include "Wav2LetterModel.hpp"
+
+/**
+ * @brief Picks the highest-valued column of every output row and stores it
+ *        as that row's classification result.
+ *
+ * Row and column counts are read from the tensor dimensions selected by
+ * Wav2LetterModel::ms_outputRowsIdx and Wav2LetterModel::ms_outputColsIdx.
+ * The caller is expected to have checked that the column count matches the
+ * number of labels.
+ *
+ * @tparam T              Element type of the tensor data.
+ * @param[in]  tensor     Output tensor to read.
+ * @param[out] vecResults Replaced with one result per row.
+ * @param[in]  labels     Label strings, indexed by column.
+ * @param[in]  scale      Scale applied to the winning value.
+ * @param[in]  zeroPoint  Offset subtracted from the winning value before scaling.
+ * @return false if the tensor reports no columns, true otherwise.
+ */
+template<typename T>
+bool arm::app::AsrClassifier::_GetTopResults(TfLiteTensor* tensor,
+        std::vector<ClassificationResult>& vecResults,
+        const std::vector <std::string>& labels, double scale, double zeroPoint)
+{
+    const uint32_t numRows = tensor->dims->data[arm::app::Wav2LetterModel::ms_outputRowsIdx];
+    const uint32_t numCols = tensor->dims->data[arm::app::Wav2LetterModel::ms_outputColsIdx];
+
+    /* Nothing to pick from when a row has no columns. */
+    if (numCols < 1) {
+        return false;
+    }
+
+    /* One (default constructed) result slot per row. */
+    vecResults = std::vector<ClassificationResult>(numRows);
+
+    T* data = tflite::GetTensorData<T>(tensor);
+
+    uint32_t rowStart = 0;
+    for (uint32_t rowIdx = 0; rowIdx < numRows; ++rowIdx) {
+        /* Linear scan for the largest value; ties keep the earliest column. */
+        T bestVal = data[rowStart];
+        uint32_t bestCol = 0;
+        for (uint32_t col = 1; col < numCols; ++col) {
+            const T candidate = data[rowStart + col];
+            if (bestVal < candidate) {
+                bestVal = candidate;
+                bestCol = col;
+            }
+        }
+
+        /* De-quantise the winning value and record it with its label. */
+        const double rawScore = static_cast<int>(bestVal);
+        ClassificationResult& slot = vecResults[rowIdx];
+        slot.m_normalisedVal = scale * (rawScore - zeroPoint);
+        slot.m_label = labels[bestCol];
+        slot.m_labelIdx = bestCol;
+
+        rowStart += numCols;
+    }
+
+    return true;
+}
+/* Explicit instantiations for the two supported quantised element types. */
+template bool arm::app::AsrClassifier::_GetTopResults<uint8_t>(TfLiteTensor* tensor,
+        std::vector<ClassificationResult>& vecResults,
+        const std::vector <std::string>& labels, double scale, double zeroPoint);
+template bool arm::app::AsrClassifier::_GetTopResults<int8_t>(TfLiteTensor* tensor,
+        std::vector<ClassificationResult>& vecResults,
+        const std::vector <std::string>& labels, double scale, double zeroPoint);
+
+/**
+ * @brief Validates the output tensor and extracts the top result of every
+ *        output row.
+ *
+ * @param[in]  outputTensor Tensor holding the inference output.
+ * @param[out] vecResults   Cleared on entry; on success holds one result per row.
+ * @param[in]  labels       Label strings; must have one entry per tensor column.
+ * @param[in]  topNCount    Only 1 is honoured; other values produce a warning.
+ * @return true on success, false if validation fails or the tensor type is
+ *         neither uint8 nor int8.
+ */
+bool arm::app::AsrClassifier::GetClassificationResults(
+        TfLiteTensor* outputTensor,
+        std::vector<ClassificationResult>& vecResults,
+        const std::vector <std::string>& labels, uint32_t topNCount)
+{
+    vecResults.clear();
+
+    /* dims->data is indexed with both the row and the column index, so the
+     * tensor needs at least (largest index + 1) dimensions. */
+    constexpr int minTensorDims = 1 + static_cast<int>(
+        (arm::app::Wav2LetterModel::ms_outputRowsIdx > arm::app::Wav2LetterModel::ms_outputColsIdx)?
+         arm::app::Wav2LetterModel::ms_outputRowsIdx : arm::app::Wav2LetterModel::ms_outputColsIdx);
+
+    constexpr uint32_t outColsIdx = arm::app::Wav2LetterModel::ms_outputColsIdx;
+
+    /* Sanity checks. */
+    if (outputTensor == nullptr) {
+        printf_err("Output vector is null pointer.\n");
+        return false;
+    } else if (outputTensor->dims == nullptr) {
+        printf_err("Output tensor has no dimension data\n");
+        return false;
+    } else if (outputTensor->dims->size < minTensorDims) {
+        printf_err("Output tensor expected to be %dD\n", minTensorDims);
+        return false;
+    } else if (static_cast<uint32_t>(outputTensor->dims->data[outColsIdx]) < topNCount) {
+        printf_err("Output vectors are smaller than %u\n", topNCount);
+        return false;
+    } else if (static_cast<uint32_t>(outputTensor->dims->data[outColsIdx]) != labels.size()) {
+        /* Reported through the error channel like the other failures. */
+        printf_err("Output size doesn't match the labels' size\n");
+        return false;
+    }
+
+    if (topNCount != 1) {
+        warn("TopNCount value ignored in this implementation\n");
+    }
+
+    /* To return the floating point values, we need quantization parameters. */
+    QuantParams quantParams = GetTensorQuantParams(outputTensor);
+
+    bool resultState = false;
+
+    switch (outputTensor->type) {
+        case kTfLiteUInt8:
+            resultState = this->_GetTopResults<uint8_t>(
+                                outputTensor, vecResults,
+                                labels, quantParams.scale,
+                                quantParams.offset);
+            break;
+        case kTfLiteInt8:
+            resultState = this->_GetTopResults<int8_t>(
+                                outputTensor, vecResults,
+                                labels, quantParams.scale,
+                                quantParams.offset);
+            break;
+        default:
+            printf_err("Tensor type %s not supported by classifier\n",
+                TfLiteTypeGetName(outputTensor->type));
+            return false;
+    }
+
+    if (!resultState) {
+        printf_err("Failed to get sorted set\n");
+        return false;
+    }
+
+    return true;
+} \ No newline at end of file
diff --git a/source/use_case/asr/src/MainLoop.cc b/source/use_case/asr/src/MainLoop.cc
new file mode 100644
index 0000000..ca777be
--- /dev/null
+++ b/source/use_case/asr/src/MainLoop.cc
@@ -0,0 +1,230 @@
+/*
+ * Copyright (c) 2021 Arm Limited. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "hal.h" /* Brings in platform definitions. */
+#include "Labels.hpp" /* For label strings. */
+#include "UseCaseHandler.hpp" /* Handlers for different user options. */
+#include "Wav2LetterModel.hpp" /* Model class for running inference. */
+#include "UseCaseCommonUtils.hpp" /* Utils functions. */
+#include "AsrClassifier.hpp" /* Classifier. */
+#include "InputFiles.hpp" /* Generated audio clip header. */
+#include "Wav2LetterPreprocess.hpp" /* Pre-processing class. */
+#include "Wav2LetterPostprocess.hpp" /* Post-processing class. */
+
+enum opcodes /* Menu choices handled by main_loop(); the values are the numbers printed by DisplayMenu(). */
+{
+ MENU_OPT_RUN_INF_NEXT = 1, /* Run on next vector (the clip at the context's current "clipIndex"). */
+ MENU_OPT_RUN_INF_CHOSEN, /* Run on a user provided vector index. */
+ MENU_OPT_RUN_INF_ALL, /* Run inference on all clips, starting from the current one. */
+ MENU_OPT_SHOW_MODEL_INFO, /* Show model info. */
+ MENU_OPT_LIST_AUDIO_CLIPS /* List the current baked audio clips. */
+};
+
+/** @brief Prints the option menu followed by the input prompt. */
+static void DisplayMenu()
+{
+    /* Heading. */
+    printf("\n\nUser input required\n"
+           "Enter option number from:\n\n");
+
+    /* One numbered line per option, then the prompt (no trailing newline). */
+    printf(" %u. Classify next audio clip\n"
+           " %u. Classify audio clip at chosen index\n"
+           " %u. Run classification on all audio clips\n"
+           " %u. Show NN model info\n"
+           " %u. List audio clips\n\n"
+           " Choice: ",
+           MENU_OPT_RUN_INF_NEXT,
+           MENU_OPT_RUN_INF_CHOSEN,
+           MENU_OPT_RUN_INF_ALL,
+           MENU_OPT_SHOW_MODEL_INFO,
+           MENU_OPT_LIST_AUDIO_CLIPS);
+}
+
+/** @brief Verify that the model's first input and first output tensors report
+ *         the minimum number of dimensions; logs and returns false otherwise. */
+static bool VerifyTensorDimensions(const arm::app::Model& model);
+
+/** @brief Gets the number of MFCC features for a single window: one third of
+ *         the input tensor's column count, never negative. */
+static uint32_t GetNumMfccFeatures(const arm::app::Model& model);
+
+/** @brief Gets the number of MFCC feature vectors to be computed: the input
+ *         tensor's row count, never negative. */
+static uint32_t GetNumMfccFeatureVectors(const arm::app::Model& model);
+
+/** @brief Gets the output context length (left and right) for post-processing
+ *         by scaling inputCtxLen with the output/input row ratio; returns 0
+ *         when the input rows cannot accommodate the requested context. */
+static uint32_t GetOutputContextLen(const arm::app::Model& model,
+ uint32_t inputCtxLen);
+
+/** @brief Gets the output inner length for post-processing: the output rows
+ *         left once outputCtxLen rows are removed from both ends. */
+static uint32_t GetOutputInnerLen(const arm::app::Model& model,
+ uint32_t outputCtxLen);
+
+void main_loop(hal_platform& platform)
+{ /* Sets up the Wav2Letter model, the pre/post-processing objects and the
+   * application context, then serves menu requests until a handler reports
+   * failure (or runs exactly once when the menu is disabled). */
+ arm::app::Wav2LetterModel model; /* Model wrapper object. */
+
+ /* Load the model and check its tensor dimensions; bail out on failure. */
+ if (!model.Init()) {
+ printf_err("Failed to initialise model\n");
+ return;
+ } else if (!VerifyTensorDimensions(model)) {
+ printf_err("Model's input or output dimension verification failed\n");
+ return;
+ }
+
+ /* Initialise pre-processing with sizes derived from the input tensor. */
+ arm::app::audio::asr::Preprocess prep(
+ GetNumMfccFeatures(model),
+ g_FrameLength,
+ g_FrameStride,
+ GetNumMfccFeatureVectors(model));
+
+ /* Initialise post-processing with lengths derived from the output tensor. */
+ const uint32_t outputCtxLen = GetOutputContextLen(model, g_ctxLen);
+ const uint32_t blankTokenIdx = 28; /* NOTE(review): presumably the label index of the blank token - confirm against the labels. */
+ arm::app::audio::asr::Postprocess postp(
+ outputCtxLen,
+ GetOutputInnerLen(model, outputCtxLen),
+ blankTokenIdx);
+
+ /* Instantiate application context. */
+ arm::app::ApplicationContext caseContext;
+ std::vector <std::string> labels;
+ GetLabelsVector(labels);
+ arm::app::AsrClassifier classifier; /* Classifier wrapper object. */
+
+ caseContext.Set<hal_platform&>("platform", platform);
+ caseContext.Set<arm::app::Model&>("model", model);
+ caseContext.Set<uint32_t>("clipIndex", 0);
+ caseContext.Set<uint32_t>("frameLength", g_FrameLength);
+ caseContext.Set<uint32_t>("frameStride", g_FrameStride);
+ caseContext.Set<float>("scoreThreshold", g_ScoreThreshold); /* Score threshold. */
+ caseContext.Set<uint32_t>("ctxLen", g_ctxLen); /* Left and right context length (MFCC feat vectors). */
+ caseContext.Set<const std::vector <std::string>&>("labels", labels);
+ caseContext.Set<arm::app::AsrClassifier&>("classifier", classifier);
+ caseContext.Set<arm::app::audio::asr::Preprocess&>("preprocess", prep);
+ caseContext.Set<arm::app::audio::asr::Postprocess&>("postprocess", postp);
+
+ bool executionSuccessful = true;
+ constexpr bool bUseMenu = NUMBER_OF_FILES > 1 ? true : false; /* The menu is only used when more than one clip is built in. */
+
+ /* Loop: one menu request per iteration. */
+ do {
+ int menuOption = MENU_OPT_RUN_INF_NEXT; /* Used as-is when the menu is disabled. */
+ if (bUseMenu) {
+ DisplayMenu();
+ menuOption = arm::app::ReadUserInputAsInt(platform);
+ printf("\n");
+ }
+ switch (menuOption) {
+ case MENU_OPT_RUN_INF_NEXT:
+ executionSuccessful = ClassifyAudioHandler(
+ caseContext,
+ caseContext.Get<uint32_t>("clipIndex"),
+ false);
+ break;
+ case MENU_OPT_RUN_INF_CHOSEN: {
+ printf(" Enter the audio clip index [0, %d]: ",
+ NUMBER_OF_FILES-1);
+ auto clipIndex = static_cast<uint32_t>(
+ arm::app::ReadUserInputAsInt(platform));
+ executionSuccessful = ClassifyAudioHandler(caseContext,
+ clipIndex,
+ false);
+ break;
+ }
+ case MENU_OPT_RUN_INF_ALL:
+ executionSuccessful = ClassifyAudioHandler(
+ caseContext,
+ caseContext.Get<uint32_t>("clipIndex"),
+ true);
+ break;
+ case MENU_OPT_SHOW_MODEL_INFO:
+ executionSuccessful = model.ShowModelInfoHandler();
+ break;
+ case MENU_OPT_LIST_AUDIO_CLIPS:
+ executionSuccessful = ListFilesHandler(caseContext);
+ break;
+ default:
+ printf("Incorrect choice, try again."); /* executionSuccessful is left unchanged, so the menu is shown again. */
+ break;
+ }
+ } while (executionSuccessful && bUseMenu); /* Stops on the first failed handler, or after one pass without a menu. */
+ info("Main loop terminated.\n");
+}
+
+/**
+ * @brief Checks that the model's first input and first output tensors exist
+ *        and report enough dimensions for the row/column indices that the
+ *        rest of this file reads from them.
+ *
+ * @param[in] model Model whose tensors are inspected.
+ * @return true if both tensors pass, false (after logging) otherwise.
+ */
+static bool VerifyTensorDimensions(const arm::app::Model& model)
+{
+    /* Historical lower bound, kept so the check is never looser than before. */
+    constexpr int baseMinDims = 3;
+
+    /* A tensor indexed at dims->data[idx] needs at least idx + 1 dimensions. */
+    constexpr int inputMaxIdx = static_cast<int>(
+        (arm::app::Wav2LetterModel::ms_inputRowsIdx > arm::app::Wav2LetterModel::ms_inputColsIdx) ?
+         arm::app::Wav2LetterModel::ms_inputRowsIdx : arm::app::Wav2LetterModel::ms_inputColsIdx);
+    constexpr int outputMaxIdx = static_cast<int>(
+        (arm::app::Wav2LetterModel::ms_outputRowsIdx > arm::app::Wav2LetterModel::ms_outputColsIdx) ?
+         arm::app::Wav2LetterModel::ms_outputRowsIdx : arm::app::Wav2LetterModel::ms_outputColsIdx);
+    constexpr int minInputDims =
+        (inputMaxIdx + 1 > baseMinDims) ? inputMaxIdx + 1 : baseMinDims;
+    constexpr int minOutputDims =
+        (outputMaxIdx + 1 > baseMinDims) ? outputMaxIdx + 1 : baseMinDims;
+
+    /* Populate tensor related parameters. */
+    TfLiteTensor* inputTensor = model.GetInputTensor(0);
+    if (!inputTensor || !inputTensor->dims) {
+        printf_err("Invalid input tensor dims\n");
+        return false;
+    } else if (inputTensor->dims->size < minInputDims) {
+        printf_err("Input tensor dimension should be >= %d\n", minInputDims);
+        return false;
+    }
+
+    TfLiteTensor* outputTensor = model.GetOutputTensor(0);
+    if (!outputTensor || !outputTensor->dims) {
+        printf_err("Invalid output tensor dims\n");
+        return false;
+    } else if (outputTensor->dims->size < minOutputDims) {
+        printf_err("Output tensor dimension should be >= %d\n", minOutputDims);
+        return false;
+    }
+
+    return true;
+}
+
+/**
+ * @brief Derives the number of MFCC features per window from the input
+ *        tensor: one third of its column count, clamped at zero.
+ *
+ * A column count that is not a multiple of 3 is reported but not treated as
+ * fatal; the truncated quotient is still returned.
+ */
+static uint32_t GetNumMfccFeatures(const arm::app::Model& model)
+{
+    TfLiteTensor* inTensor = model.GetInputTensor(0);
+    const int numCols = inTensor->dims->data[arm::app::Wav2LetterModel::ms_inputColsIdx];
+
+    const bool isMultipleOfThree = ((numCols % 3) == 0);
+    if (!isMultipleOfThree) {
+        printf_err("Number of input columns is not a multiple of 3\n");
+    }
+
+    const int numFeatures = numCols / 3;
+    return (numFeatures < 0) ? 0 : numFeatures;
+}
+
+/**
+ * @brief Returns the input tensor's row count (clamped at zero), which is the
+ *        number of MFCC feature vectors to compute.
+ */
+static uint32_t GetNumMfccFeatureVectors(const arm::app::Model& model)
+{
+    const int numRows =
+        model.GetInputTensor(0)->dims->data[arm::app::Wav2LetterModel::ms_inputRowsIdx];
+
+    /* Negative dimensions are treated as "no rows". */
+    if (numRows < 0) {
+        return 0;
+    }
+    return numRows;
+}
+
+/**
+ * @brief Translates the input context length into the equivalent number of
+ *        output rows.
+ *
+ * @param[in] model       Model providing the input and output tensors.
+ * @param[in] inputCtxLen Context length expressed in input rows.
+ * @return inputCtxLen scaled by (output rows / input rows) and rounded, or 0
+ *         when the input rows cannot hold the requested context.
+ */
+static uint32_t GetOutputContextLen(const arm::app::Model& model, const uint32_t inputCtxLen)
+{
+    const uint32_t numInputRows = GetNumMfccFeatureVectors(model);
+    const uint32_t numInnerRows = numInputRows - (2 * inputCtxLen);
+    constexpr uint32_t rowsIdx = arm::app::Wav2LetterModel::ms_outputRowsIdx;
+
+    /* The context on both sides must fit strictly inside the input rows and
+     * leave an inner section that is smaller than the whole input. */
+    const bool ctxFits = (numInputRows > 2 * inputCtxLen) &&
+                         (numInputRows > numInnerRows);
+    if (!ctxFits) {
+        printf_err("Input rows not compatible with ctx of %u\n",
+            inputCtxLen);
+        return 0;
+    }
+
+    TfLiteTensor* outTensor = model.GetOutputTensor(0);
+    const uint32_t numOutputRows = std::max(outTensor->dims->data[rowsIdx], 0);
+
+    /* How many input rows correspond to one output row. */
+    const float inToOutRatio = static_cast<float>(numInputRows) /
+                               static_cast<float>(numOutputRows);
+
+    const float scaledCtx = static_cast<float>(inputCtxLen) / inToOutRatio;
+    return std::round(scaledCtx);
+}
+
+/**
+ * @brief Computes how many output rows remain once outputCtxLen rows are
+ *        removed from both ends of the output tensor.
+ *
+ * @param[in] model        Model providing the output tensor.
+ * @param[in] outputCtxLen Context length expressed in output rows.
+ * @return Output rows minus twice outputCtxLen, or 0 (after logging) when
+ *         the two contexts do not fit into the output rows.
+ */
+static uint32_t GetOutputInnerLen(const arm::app::Model& model,
+                                  const uint32_t outputCtxLen)
+{
+    constexpr uint32_t ms_outputRowsIdx = arm::app::Wav2LetterModel::ms_outputRowsIdx;
+    TfLiteTensor* outputTensor = model.GetOutputTensor(0);
+    const uint32_t outputRows = std::max(outputTensor->dims->data[ms_outputRowsIdx], 0);
+
+    /* Equivalent to (2 * outputCtxLen > outputRows) but immune to overflow;
+     * without this guard the unsigned subtraction below would wrap around. */
+    if (outputCtxLen > outputRows / 2) {
+        printf_err("Output rows not compatible with ctx of %u\n", outputCtxLen);
+        return 0;
+    }
+
+    return (outputRows - (2 * outputCtxLen));
+}
diff --git a/source/use_case/asr/src/OutputDecode.cc b/source/use_case/asr/src/OutputDecode.cc
new file mode 100644
index 0000000..41fbe07
--- /dev/null
+++ b/source/use_case/asr/src/OutputDecode.cc
@@ -0,0 +1,47 @@
+/*
+ * Copyright (c) 2021 Arm Limited. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "OutputDecode.hpp"
+
+namespace arm {
+namespace app {
+namespace audio {
+namespace asr {
+
+ /**
+  * @brief Collapses a sequence of per-row labels into the output text.
+  *
+  * Consecutive identical labels are reduced to a single occurrence, and the
+  * "$" label is dropped from the output.
+  *
+  * @param[in] vecResults Classification results in row order.
+  * @return The decoded string.
+  */
+ std::string DecodeOutput(const std::vector<ClassificationResult>& vecResults)
+ {
+     std::string decoded;
+     const size_t total = vecResults.size();
+
+     for (size_t idx = 0; idx < total; ++idx) {
+         const auto& label = vecResults[idx].m_label;
+
+         /* Only the last element of a run of equal labels is considered. */
+         const bool sameAsNext = (idx + 1 < total) &&
+                                 (label == vecResults[idx + 1].m_label);
+         if (sameAsNext) {
+             continue;
+         }
+
+         /* "$" marks unknown/double characters and never reaches the output. */
+         if (label != "$") {
+             decoded += label;
+         }
+     }
+
+     return decoded;
+ }
+
+} /* namespace asr */
+} /* namespace audio */
+} /* namespace app */
+} /* namespace arm */
diff --git a/source/use_case/asr/src/UseCaseHandler.cc b/source/use_case/asr/src/UseCaseHandler.cc
new file mode 100644
index 0000000..e706eb8
--- /dev/null
+++ b/source/use_case/asr/src/UseCaseHandler.cc
@@ -0,0 +1,288 @@
+/*
+ * Copyright (c) 2021 Arm Limited. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "UseCaseHandler.hpp"
+
+#include "InputFiles.hpp"
+#include "AsrClassifier.hpp"
+#include "Wav2LetterModel.hpp"
+#include "hal.h"
+#include "Wav2LetterMfcc.hpp"
+#include "AudioUtils.hpp"
+#include "UseCaseCommonUtils.hpp"
+#include "AsrResult.hpp"
+#include "Wav2LetterPreprocess.hpp"
+#include "Wav2LetterPostprocess.hpp"
+#include "OutputDecode.hpp"
+
+namespace arm {
+namespace app {
+
+ /**
+ * @brief Helper function to increment current audio clip index, wrapping
+ * back to 0 after the last clip.
+ * @param[in,out] ctx Reference to the application context object.
+ **/
+ static void _IncrementAppCtxClipIdx(ApplicationContext& ctx);
+
+ /**
+ * @brief Helper function to set the audio clip index.
+ * @param[in,out] ctx Reference to the application context object.
+ * @param[in] idx Value to be set; must be less than NUMBER_OF_FILES.
+ * @return true if index is set, false otherwise.
+ **/
+ static bool _SetAppCtxClipIdx(ApplicationContext& ctx, uint32_t idx);
+
+ /**
+ * @brief Presents inference results using the data presentation
+ * object.
+ * @param[in] platform Reference to the hal platform object.
+ * @param[in] results Vector of ASR results (one per inference) to be
+ * decoded and displayed.
+ * @return true if successful, false otherwise.
+ **/
+ static bool _PresentInferenceResult(
+ hal_platform& platform,
+ const std::vector<arm::app::asr::AsrResult>& results);
+
+ /* Audio inference classification handler.
+  *
+  * Slides a window over the selected audio clip and, for every window, runs
+  * pre-processing, inference, post-processing and classification. The
+  * per-window results are stored in the context under "results", presented
+  * on the display, and the context's "clipIndex" is advanced.
+  *
+  * ctx        Application context holding "platform", "model", "classifier",
+  *            "preprocess", "postprocess", "labels" and the numeric settings
+  *            read below.
+  * clipIndex  Clip to process. Values >= NUMBER_OF_FILES are ignored and the
+  *            index already stored in the context is used instead.
+  * runAll     When true, keeps processing clips until the index wraps back
+  *            to the clip this call started with.
+  *
+  * Returns false if the model is not initialised, the audio array is null,
+  * the clip is shorter than one MFCC window or presenting the results fails;
+  * true otherwise.
+  *
+  * NOTE(review): the return values of prep.Invoke(), RunInference(),
+  * postp.Invoke() and GetClassificationResults() are not checked here. */
+ bool ClassifyAudioHandler(ApplicationContext& ctx, uint32_t clipIndex, bool runAll)
+ {
+ constexpr uint32_t dataPsnTxtInfStartX = 20;
+ constexpr uint32_t dataPsnTxtInfStartY = 40;
+
+ auto& platform = ctx.Get<hal_platform&>("platform");
+ platform.data_psn->clear(COLOR_BLACK);
+
+ /* If the request has a valid size, set the audio index. */
+ if (clipIndex < NUMBER_OF_FILES) {
+ if (!_SetAppCtxClipIdx(ctx, clipIndex)) {
+ return false;
+ }
+ }
+
+ /* Get model reference. */
+ auto& model = ctx.Get<Model&>("model");
+ if (!model.IsInited()) {
+ printf_err("Model is not initialised! Terminating processing.\n");
+ return false;
+ }
+
+ /* Get score threshold to be applied for the classifier (post-inference). */
+ auto scoreThreshold = ctx.Get<float>("scoreThreshold");
+
+ /* Get tensors. Dimensions of the tensor should have been verified by
+ * the caller. */
+ TfLiteTensor* inputTensor = model.GetInputTensor(0);
+ TfLiteTensor* outputTensor = model.GetOutputTensor(0);
+ const uint32_t inputRows = inputTensor->dims->data[arm::app::Wav2LetterModel::ms_inputRowsIdx];
+
+ /* Populate MFCC related parameters. */
+ auto mfccParamsWinLen = ctx.Get<uint32_t>("frameLength");
+ auto mfccParamsWinStride = ctx.Get<uint32_t>("frameStride");
+
+ /* Populate ASR inference context and inner lengths for input. */
+ auto inputCtxLen = ctx.Get<uint32_t>("ctxLen");
+ const uint32_t inputInnerLen = inputRows - (2 * inputCtxLen);
+
+ /* Audio data stride corresponds to inputInnerLen feature vectors. */
+ const uint32_t audioParamsWinLen = (inputRows - 1) * mfccParamsWinStride + (mfccParamsWinLen);
+ const uint32_t audioParamsWinStride = inputInnerLen * mfccParamsWinStride;
+ const float audioParamsSecondsPerSample = (1.0/audio::Wav2LetterMFCC::ms_defaultSamplingFreq);
+
+ /* Get pre/post-processing objects. */
+ auto& prep = ctx.Get<audio::asr::Preprocess&>("preprocess");
+ auto& postp = ctx.Get<audio::asr::Postprocess&>("postprocess");
+
+ /* Set default reduction axis for post-processing. */
+ const uint32_t reductionAxis = arm::app::Wav2LetterModel::ms_outputRowsIdx;
+
+ /* Audio clip start index; used to detect a full cycle when runAll is set. */
+ auto startClipIdx = ctx.Get<uint32_t>("clipIndex");
+
+ /* Loop to process audio clips. */
+ do {
+ /* Get current audio clip index. */
+ auto currentIndex = ctx.Get<uint32_t>("clipIndex");
+
+ /* Get the current audio buffer and respective size. */
+ const int16_t* audioArr = get_audio_array(currentIndex);
+ const uint32_t audioArrSize = get_audio_array_size(currentIndex);
+
+ if (!audioArr) {
+ printf_err("Invalid audio array pointer\n");
+ return false;
+ }
+
+ /* Audio clip must have enough samples to produce 1 MFCC feature. */
+ if (audioArrSize < mfccParamsWinLen) {
+ printf_err("Not enough audio samples, minimum needed is %u\n", mfccParamsWinLen);
+ return false;
+ }
+
+ /* Initialise an audio slider. */
+ auto audioDataSlider = audio::ASRSlidingWindow<const int16_t>(
+ audioArr,
+ audioArrSize,
+ audioParamsWinLen,
+ audioParamsWinStride);
+
+ /* Declare a container for results (one entry per inference window). */
+ std::vector<arm::app::asr::AsrResult> results;
+
+ /* Display message on the LCD - inference running. */
+ std::string str_inf{"Running inference... "};
+ platform.data_psn->present_data_text(
+ str_inf.c_str(), str_inf.size(),
+ dataPsnTxtInfStartX, dataPsnTxtInfStartY, 0);
+
+ info("Running inference on audio clip %u => %s\n", currentIndex,
+ get_filename(currentIndex));
+
+ size_t inferenceWindowLen = audioParamsWinLen;
+
+ /* Start sliding through audio clip. */
+ while (audioDataSlider.HasNext()) {
+
+ /* If not enough audio see how much can be sent for processing. */
+ size_t nextStartIndex = audioDataSlider.NextWindowStartIndex();
+ if (nextStartIndex + audioParamsWinLen > audioArrSize) {
+ inferenceWindowLen = audioArrSize - nextStartIndex;
+ }
+
+ const int16_t* inferenceWindow = audioDataSlider.Next();
+
+ info("Inference %zu/%zu\n", audioDataSlider.Index() + 1,
+ static_cast<size_t>(ceilf(audioDataSlider.FractionalTotalStrides() + 1)));
+
+ Profiler prepProfiler{&platform, "pre-processing"};
+ prepProfiler.StartProfiling();
+
+ /* Calculate MFCCs, deltas and populate the input tensor. */
+ prep.Invoke(inferenceWindow, inferenceWindowLen, inputTensor);
+
+ prepProfiler.StopProfiling();
+ std::string prepProfileResults = prepProfiler.GetResultsAndReset();
+ info("%s\n", prepProfileResults.c_str());
+
+ /* Run inference over this audio clip sliding window. */
+ arm::app::RunInference(platform, model);
+
+ /* Post-process; the last argument flags the final window of the clip. */
+ postp.Invoke(outputTensor, reductionAxis, !audioDataSlider.HasNext());
+
+ /* Get results. */
+ std::vector<ClassificationResult> classificationResult;
+ auto& classifier = ctx.Get<AsrClassifier&>("classifier");
+ classifier.GetClassificationResults(
+ outputTensor, classificationResult,
+ ctx.Get<std::vector<std::string>&>("labels"), 1);
+
+ results.emplace_back(asr::AsrResult(classificationResult,
+ (audioDataSlider.Index() *
+ audioParamsSecondsPerSample *
+ audioParamsWinStride),
+ audioDataSlider.Index(), scoreThreshold));
+
+#if VERIFY_TEST_OUTPUT
+ arm::app::DumpTensor(outputTensor,
+ outputTensor->dims->data[arm::app::Wav2LetterModel::ms_outputColsIdx]);
+#endif /* VERIFY_TEST_OUTPUT */
+
+ }
+
+ /* Erase the "Running inference" message by overwriting it with spaces. */
+ str_inf = std::string(str_inf.size(), ' ');
+ platform.data_psn->present_data_text(
+ str_inf.c_str(), str_inf.size(),
+ dataPsnTxtInfStartX, dataPsnTxtInfStartY, 0);
+
+ ctx.Set<std::vector<arm::app::asr::AsrResult>>("results", results);
+
+ if (!_PresentInferenceResult(platform, results)) {
+ return false;
+ }
+
+ _IncrementAppCtxClipIdx(ctx);
+
+ } while (runAll && ctx.Get<uint32_t>("clipIndex") != startClipIdx);
+
+ return true;
+ }
+
+ /* Advances the context's "clipIndex" by one, wrapping to 0 once the next
+  * index would reach NUMBER_OF_FILES. */
+ static void _IncrementAppCtxClipIdx(ApplicationContext& ctx)
+ {
+     const auto currentIdx = ctx.Get<uint32_t>("clipIndex");
+     const uint32_t followingIdx = currentIdx + 1;
+
+     if (followingIdx < NUMBER_OF_FILES) {
+         ctx.Set<uint32_t>("clipIndex", followingIdx);
+     } else {
+         ctx.Set<uint32_t>("clipIndex", 0);
+     }
+ }
+
+ /* Stores idx as the context's "clipIndex" when it addresses an existing
+  * clip; otherwise logs an error and leaves the context untouched. */
+ static bool _SetAppCtxClipIdx(ApplicationContext& ctx, const uint32_t idx)
+ {
+     const bool idxInRange = (idx < NUMBER_OF_FILES);
+
+     if (idxInRange) {
+         ctx.Set<uint32_t>("clipIndex", idx);
+         return true;
+     }
+
+     printf_err("Invalid idx %u (expected less than %u)\n",
+                idx, NUMBER_OF_FILES);
+     return false;
+ }
+
+ /* Logs the decoded text of every inference, then decodes the concatenation
+  * of all inferences and shows that final text on the display (in green)
+  * and in the log. Always returns true. */
+ static bool _PresentInferenceResult(hal_platform& platform,
+                                     const std::vector<arm::app::asr::AsrResult>& results)
+ {
+     constexpr uint32_t textStartX = 20;
+     constexpr uint32_t textStartY = 60;
+     constexpr bool wrapOntoMultipleLines = true;
+
+     platform.data_psn->set_text_color(COLOR_GREEN);
+
+     const size_t numInferences = results.size();
+
+     /* Results from multiple inferences should be combined before processing. */
+     std::vector<arm::app::ClassificationResult> mergedResults;
+     for (size_t i = 0; i < numInferences; ++i) {
+         const auto& perInference = results[i].m_resultVec;
+         mergedResults.insert(mergedResults.end(),
+                              perInference.begin(),
+                              perInference.end());
+     }
+
+     /* Log the decoded string of each individual inference. */
+     for (size_t i = 0; i < numInferences; ++i) {
+         const std::string decoded = audio::asr::DecodeOutput(results[i].m_resultVec);
+
+         info("Result for inf %u: %s\n", results[i].m_inferenceNumber,
+              decoded.c_str());
+     }
+
+     /* Decode and present the combined result. */
+     const std::string finalText = audio::asr::DecodeOutput(mergedResults);
+
+     platform.data_psn->present_data_text(
+         finalText.c_str(), finalText.size(),
+         textStartX, textStartY,
+         wrapOntoMultipleLines);
+
+     info("Final result: %s\n", finalText.c_str());
+     return true;
+ }
+
+} /* namespace app */
+} /* namespace arm */ \ No newline at end of file
diff --git a/source/use_case/asr/src/Wav2LetterMfcc.cc b/source/use_case/asr/src/Wav2LetterMfcc.cc
new file mode 100644
index 0000000..92c91bc
--- /dev/null
+++ b/source/use_case/asr/src/Wav2LetterMfcc.cc
@@ -0,0 +1,137 @@
+/*
+ * Copyright (c) 2021 Arm Limited. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "Wav2LetterMfcc.hpp"
+
+#include "PlatformMath.hpp"
+
+#include <cfloat>
+
+namespace arm {
+namespace app {
+namespace audio {
+
+ /**
+  * @brief Applies every mel filter to the FFT vector and stores the
+  *        resulting energy per filter bank.
+  *
+  * @param[in]  fftVec                FFT values the filters are applied to.
+  * @param[in]  melFilterBank         Filter weights, one vector per bank.
+  * @param[in]  filterBankFilterFirst First FFT index covered by each bank.
+  * @param[in]  filterBankFilterLast  Last FFT index (inclusive) covered by each bank.
+  * @param[out] melEnergies           Receives one energy per bank; its size
+  *                                   defines the number of banks.
+  * @return false if the containers disagree on the number of banks, true
+  *         otherwise.
+  */
+ bool Wav2LetterMFCC::ApplyMelFilterBank(
+         std::vector<float>& fftVec,
+         std::vector<std::vector<float>>& melFilterBank,
+         std::vector<int32_t>& filterBankFilterFirst,
+         std::vector<int32_t>& filterBankFilterLast,
+         std::vector<float>& melEnergies)
+ {
+     const size_t numBanks = melEnergies.size();
+
+     /* melFilterBank is indexed by bank as well, so it must provide at
+      * least numBanks entries. */
+     if (numBanks != filterBankFilterFirst.size() ||
+         numBanks != filterBankFilterLast.size() ||
+         numBanks > melFilterBank.size()) {
+         printf_err("Unexpected filter bank lengths\n");
+         return false;
+     }
+
+     const size_t fftLen = fftVec.size();
+
+     for (size_t bin = 0; bin < numBanks; ++bin) {
+         auto filterBankIter = melFilterBank[bin].begin();
+         const auto filterBankEnd = melFilterBank[bin].end();
+         float melEnergy = 1e-10f; /* Avoid log of zero at later stages, same value used in librosa. */
+         const int32_t firstIndex = filterBankFilterFirst[bin];
+         const int32_t lastIndex = filterBankFilterLast[bin];
+
+         /* Stop as soon as either the filter weights or the FFT vector
+          * run out, instead of reading past their ends. */
+         for (int32_t i = firstIndex;
+              i <= lastIndex && filterBankIter != filterBankEnd; ++i) {
+             if (i < 0 || static_cast<size_t>(i) >= fftLen) {
+                 break;
+             }
+             melEnergy += (*filterBankIter++ * fftVec[i]);
+         }
+
+         melEnergies[bin] = melEnergy;
+     }
+
+     return true;
+ }
+
+ /**
+  * @brief Converts mel energies in place to a scaled log10 representation
+  *        and clamps them to within 80 of the largest converted value.
+  *
+  * @param[in,out] melEnergies Energies to convert.
+  */
+ void Wav2LetterMFCC::ConvertToLogarithmicScale(
+         std::vector<float>& melEnergies)
+ {
+     /* Natural log -> log10 conversion factor, combined with the x10
+      * scaling used for the wav2letter model. */
+     constexpr float multiplier = 10.0 *                /* Default scalar. */
+                                  0.4342944819032518;   /* log10f(std::exp(1.0)) */
+     constexpr float maxDb = 80.0;
+
+     /* Natural logarithms of all mel energies. */
+     std::vector <float> naturalLogs(melEnergies.size(), 0.f);
+     math::MathUtils::VecLogarithmF32(melEnergies, naturalLogs);
+
+     /* Scale each log value and track the largest result. */
+     float peak = -FLT_MAX;
+     const size_t count = melEnergies.size();
+     for (size_t i = 0; i < count; ++i) {
+         const float scaled = naturalLogs[i] * multiplier;
+         melEnergies[i] = scaled;
+         peak = std::max(peak, scaled);
+     }
+
+     /* Raise everything below (peak - maxDb) up to that floor. */
+     const float floorLevel = peak - maxDb;
+     for (float& energy : melEnergies) {
+         if (energy < floorLevel) {
+             energy = floorLevel;
+         }
+     }
+ }
+
+ /**
+  * @brief Builds the DCT matrix as a flat, row-major vector with
+  *        coefficientCount rows of inputLength values each.
+  *
+  * @param[in] inputLength      Number of values per row.
+  * @param[in] coefficientCount Number of rows.
+  * @return The matrix, or an empty vector (after logging) when either
+  *         argument is not positive.
+  */
+ std::vector<float> Wav2LetterMFCC::CreateDCTMatrix(
+         const int32_t inputLength,
+         const int32_t coefficientCount)
+ {
+     /* Non-positive sizes would either divide by zero below or make the
+      * first-row loop write into an empty vector. */
+     if (inputLength <= 0 || coefficientCount <= 0) {
+         printf_err("Invalid DCT matrix dimensions\n");
+         return std::vector<float>();
+     }
+
+     std::vector<float> dctMatrix(static_cast<size_t>(inputLength) *
+                                  static_cast<size_t>(coefficientCount));
+
+     /* Orthonormal normalization. */
+     const float normalizerK0 = 2 * math::MathUtils::SqrtF32(1.0f /
+                                     static_cast<float>(4*inputLength));
+     const float normalizer = 2 * math::MathUtils::SqrtF32(1.0f /
+                                     static_cast<float>(2*inputLength));
+
+     const float angleIncr = M_PI / inputLength;
+     float angle = angleIncr; /* We start using it at k = 1 loop. */
+
+     /* First row of DCT will use normalizer K0. */
+     for (int32_t n = 0; n < inputLength; ++n) {
+         dctMatrix[n] = normalizerK0 /* cos(0) = 1 */;
+     }
+
+     /* Second row (index = 1) onwards, we use standard normalizer. */
+     for (int32_t k = 1, m = inputLength; k < coefficientCount; ++k, m += inputLength) {
+         for (int32_t n = 0; n < inputLength; ++n) {
+             dctMatrix[m+n] = normalizer *
+                 math::MathUtils::CosineF32((n + 0.5f) * angle);
+         }
+         angle += angleIncr;
+     }
+     return dctMatrix;
+ }
+
+ /**
+  * @brief Returns the normalisation factor for a mel filter: 2 divided by
+  *        the difference of the inverse-mel-scaled right and left edges.
+  *
+  * @param[in] leftMel      Left edge on the mel scale.
+  * @param[in] rightMel     Right edge on the mel scale.
+  * @param[in] useHTKMethod Forwarded to MFCC::InverseMelScale.
+  */
+ float Wav2LetterMFCC::GetMelFilterBankNormaliser(
+         const float& leftMel,
+         const float& rightMel,
+         const bool useHTKMethod)
+ {
+     /* Slaney normalization for mel weights. */
+     const auto rightEdge = MFCC::InverseMelScale(rightMel, useHTKMethod);
+     const auto leftEdge = MFCC::InverseMelScale(leftMel, useHTKMethod);
+
+     return (2.0f / (rightEdge - leftEdge));
+ }
+
+} /* namespace audio */
+} /* namespace app */
+} /* namespace arm */
diff --git a/source/use_case/asr/src/Wav2LetterModel.cc b/source/use_case/asr/src/Wav2LetterModel.cc
new file mode 100644
index 0000000..5aefecd
--- /dev/null
+++ b/source/use_case/asr/src/Wav2LetterModel.cc
@@ -0,0 +1,56 @@
+/*
+ * Copyright (c) 2021 Arm Limited. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "Wav2LetterModel.hpp"
+
+#include "hal.h"
+
+/** @brief Gives access to the op resolver owned by this model object. */
+const tflite::MicroOpResolver& arm::app::Wav2LetterModel::GetOpResolver()
+{
+    const tflite::MicroOpResolver& resolver = this->_m_opResolver;
+    return resolver;
+}
+
+/**
+ * @brief Registers the operators needed by the model with the op resolver
+ *        (plus the Ethos-U operator when built with ARM_NPU).
+ *
+ * @return true if every operator was registered, false otherwise.
+ */
+bool arm::app::Wav2LetterModel::EnlistOperations()
+{
+    /* Each Add*() call reports a status; a failed registration would
+     * otherwise only surface later as an unresolved operator. */
+    const bool builtinsAdded =
+        (kTfLiteOk == this->_m_opResolver.AddConv2D()) &&
+        (kTfLiteOk == this->_m_opResolver.AddMul()) &&
+        (kTfLiteOk == this->_m_opResolver.AddMaximum()) &&
+        (kTfLiteOk == this->_m_opResolver.AddReshape());
+
+    if (!builtinsAdded) {
+        printf_err("Failed to add operators to op resolver.\n");
+        return false;
+    }
+
+#if defined(ARM_NPU)
+    if (kTfLiteOk == this->_m_opResolver.AddEthosU()) {
+        info("Added %s support to op resolver\n",
+            tflite::GetString_ETHOSU());
+    } else {
+        printf_err("Failed to add Arm NPU support to op resolver.\n");
+        return false;
+    }
+#endif /* ARM_NPU */
+
+    return true;
+}
+
+/* Defined outside this file; returns the address of the model data. */
+extern uint8_t* GetModelPointer();
+
+/** @brief Returns the pointer supplied by GetModelPointer(). */
+const uint8_t* arm::app::Wav2LetterModel::ModelPointer()
+{
+    const uint8_t* modelData = GetModelPointer();
+    return modelData;
+}
+
+/* Defined outside this file; returns the length of the model data. */
+extern size_t GetModelLen();
+
+/** @brief Returns the length supplied by GetModelLen(). */
+size_t arm::app::Wav2LetterModel::ModelSize()
+{
+    const size_t modelLen = GetModelLen();
+    return modelLen;
+} \ No newline at end of file
diff --git a/source/use_case/asr/src/Wav2LetterPostprocess.cc b/source/use_case/asr/src/Wav2LetterPostprocess.cc
new file mode 100644
index 0000000..60ee51e
--- /dev/null
+++ b/source/use_case/asr/src/Wav2LetterPostprocess.cc
@@ -0,0 +1,172 @@
+/*
+ * Copyright (c) 2021 Arm Limited. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "Wav2LetterPostprocess.hpp"
+
+#include "Wav2LetterModel.hpp"
+
+
+namespace arm {
+namespace app {
+namespace audio {
+namespace asr {
+
+ /**
+  * @brief       Constructor; stores the section lengths and resets the
+  *              iteration counter to zero.
+  * @param[in]   contextLen      Length of each of the left and right context
+  *                              sections (counted twice in the total length).
+  * @param[in]   innerLen        Length of the section between the two contexts.
+  * @param[in]   blankTokenIdx   Index written as the "blank" token when a
+  *                              context section is erased.
+  */
+ Postprocess::Postprocess(const uint32_t contextLen,
+                          const uint32_t innerLen,
+                          const uint32_t blankTokenIdx)
+     : _m_contextLen(contextLen),
+       _m_innerLen(innerLen),
+       /* Computed from the arguments, not from the members: members are
+        * initialised in declaration order, so reading them here would only
+        * be safe if they happen to be declared before _m_totalLen. */
+       _m_totalLen(2 * contextLen + innerLen),
+       _m_countIterations(0),
+       _m_blankTokenIdx(blankTokenIdx)
+ {}
+
+ /**
+  * @brief       Validates the tensor and erases the context sections along
+  *              the requested axis.
+  * @param[in,out] tensor        Tensor whose data is modified in place.
+  * @param[in]   axisIdx         Axis to process; must be the model's output
+  *                              rows or columns index.
+  * @param[in]   lastIteration   Forwarded to the erase routine.
+  * @return      Result of the erase routine, or false if validation fails or
+  *              the axis is unsupported.
+  */
+ bool Postprocess::Invoke(TfLiteTensor* tensor,
+                          const uint32_t axisIdx,
+                          const bool lastIteration)
+ {
+     /* Tensor pointer, axis index and axis length are checked first. */
+     if (!this->_IsInputValid(tensor, axisIdx)) {
+         return false;
+     }
+
+     /* A zero element size means the tensor type is not handled. */
+     const uint32_t bytesPerElem = this->_GetTensorElementSize(tensor);
+     if (0 == bytesPerElem) {
+         printf_err("Tensor type not supported for post processing\n");
+         return false;
+     }
+
+     if (bytesPerElem * this->_m_totalLen > tensor->bytes) {
+         printf_err("Insufficient number of tensor bytes\n");
+         return false;
+     }
+
+     /* Whatever the tensor type, the data is addressed as raw bytes. */
+     uint8_t* const rawData = tflite::GetTensorData<uint8_t>(tensor);
+     bool status = false;
+
+     switch (axisIdx) {
+         case arm::app::Wav2LetterModel::ms_outputRowsIdx:
+             status = this->_EraseSectionsRowWise(
+                 rawData,
+                 bytesPerElem * tensor->dims->data[arm::app::Wav2LetterModel::ms_outputColsIdx],
+                 lastIteration);
+             break;
+         case arm::app::Wav2LetterModel::ms_outputColsIdx:
+             status = this->_EraseSectionsColWise(
+                 rawData,
+                 bytesPerElem * tensor->dims->data[arm::app::Wav2LetterModel::ms_outputRowsIdx],
+                 lastIteration);
+             break;
+         default:
+             printf_err("Unsupported axis index: %u\n", axisIdx);
+             break;
+     }
+
+     return status;
+ }
+
+ /**
+  * @brief       Checks that the tensor can be post-processed along an axis.
+  * @param[in]   tensor    Tensor to check; may be nullptr.
+  * @param[in]   axisIdx   Index of the axis that will be processed.
+  * @return      true if the tensor and its dimensions are non-null, the axis
+  *              index is within the tensor's rank, and the axis length equals
+  *              the total length (2 * context + inner); false otherwise.
+  */
+ bool Postprocess::_IsInputValid(TfLiteTensor* tensor,
+                                 const uint32_t axisIdx) const
+ {
+     if (nullptr == tensor || nullptr == tensor->dims) {
+         return false;
+     }
+
+     /* Compare as unsigned so that a very large axisIdx cannot turn
+      * negative in a cast and slip past the bound check. */
+     const int numDims = tensor->dims->size;
+     if (numDims <= 0 || axisIdx >= static_cast<uint32_t>(numDims)) {
+         printf_err("Invalid axis index: %u; Max: %d\n",
+             axisIdx, numDims);
+         return false;
+     }
+
+     const int axisLen = tensor->dims->data[axisIdx];
+     if (axisLen < 0 ||
+         static_cast<uint32_t>(axisLen) != this->_m_totalLen) {
+         /* Report the axis, the expected length and the actual length. */
+         printf_err("Unexpected tensor dimension for axis %u: expected %u, got %d\n",
+             axisIdx, this->_m_totalLen, axisLen);
+         return false;
+     }
+
+     return true;
+ }
+
+ /**
+  * @brief       Gets the size in bytes of one element of the tensor.
+  * @param[in]   tensor   Tensor whose type is inspected (must not be null).
+  * @return      1 for (u)int8, 2 for int16, 4 for int32 and float32;
+  *              0 for any other type, after logging an error.
+  */
+ uint32_t Postprocess::_GetTensorElementSize(TfLiteTensor* tensor)
+ {
+     uint32_t numBytes = 0;
+
+     switch (tensor->type) {
+         case kTfLiteUInt8:
+         case kTfLiteInt8:
+             numBytes = 1;
+             break;
+         case kTfLiteInt16:
+             numBytes = 2;
+             break;
+         case kTfLiteInt32:
+         case kTfLiteFloat32:
+             numBytes = 4;
+             break;
+         default:
+             printf_err("Unsupported tensor type %s\n",
+                 TfLiteTypeGetName(tensor->type));
+             break;
+     }
+
+     return numBytes;
+ }
+
+ /**
+  * @brief       Overwrites the left and/or right context rows so that each
+  *              row is zeroed and only the blank-token position is set to 1.
+  *              The left context is skipped while the iteration counter is
+  *              zero; the right context is skipped on the last iteration.
+  * @param[in,out] ptrData       Start of the tensor data, addressed as bytes.
+  * @param[in]   strideSzBytes   Size in bytes of one row.
+  * @param[in]   lastIteration   If true the counter is reset to zero,
+  *                              otherwise it is incremented.
+  * @return      Always true.
+  */
+ bool Postprocess::_EraseSectionsRowWise(
+     uint8_t* ptrData,
+     const uint32_t strideSzBytes,
+     const bool lastIteration)
+ {
+     /* Rows of one context section are contiguous, so a section can be
+      * cleared with a single memset. */
+     const uint32_t contextBytes = strideSzBytes * this->_m_contextLen;
+
+     /* NOTE(review): the blank token offset is a byte offset that is not
+      * scaled by the element size - looks like this assumes one-byte
+      * elements; confirm for wider tensor types. */
+     auto setSectionToBlank = [this, strideSzBytes, contextBytes](uint8_t* section) {
+         std::memset(section, 0, contextBytes);
+         for (size_t row = 0; row < this->_m_contextLen; row++) {
+             section[row*strideSzBytes + this->_m_blankTokenIdx] = 1;
+         }
+     };
+
+     /* Left context is kept only for the very first iteration. */
+     if (this->_m_countIterations > 0) {
+         setSectionToBlank(ptrData);
+     }
+
+     /* Right context is kept only for the last iteration. */
+     if (!lastIteration) {
+         setSectionToBlank(
+             ptrData + (strideSzBytes * (this->_m_contextLen + this->_m_innerLen)));
+     }
+
+     if (!lastIteration) {
+         ++this->_m_countIterations;
+     } else {
+         this->_m_countIterations = 0;
+     }
+
+     return true;
+ }
+
+ /**
+  * @brief       Column-wise counterpart of the row-wise erase routine.
+  *              Not implemented: no argument is used.
+  * @param[in]   ptrData         Unused.
+  * @param[in]   strideSzBytes   Unused.
+  * @param[in]   lastIteration   Unused.
+  * @return      Always false.
+  */
+ bool Postprocess::_EraseSectionsColWise(
+     uint8_t* ptrData,
+     const uint32_t strideSzBytes,
+     const bool lastIteration)
+ {
+     /* No column-wise support: mark the arguments as unused and fail. */
+     UNUSED(lastIteration);
+     UNUSED(strideSzBytes);
+     UNUSED(ptrData);
+
+     return false;
+ }
+
+} /* namespace asr */
+} /* namespace audio */
+} /* namespace app */
+} /* namespace arm */ \ No newline at end of file
diff --git a/source/use_case/asr/src/Wav2LetterPreprocess.cc b/source/use_case/asr/src/Wav2LetterPreprocess.cc
new file mode 100644
index 0000000..e46cca3
--- /dev/null
+++ b/source/use_case/asr/src/Wav2LetterPreprocess.cc
@@ -0,0 +1,228 @@
+/*
+ * Copyright (c) 2021 Arm Limited. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "Wav2LetterPreprocess.hpp"
+
+#include "PlatformMath.hpp"
+#include "TensorFlowLiteMicro.hpp"
+
+#include <algorithm>
+#include <cmath>
+
+namespace arm {
+namespace app {
+namespace audio {
+namespace asr {
+
+ /**
+  * @brief       Constructor; sizes the MFCC and delta buffers and initialises
+  *              the MFCC object when its parameters are non-zero.
+  * @param[in]   numMfccFeatures   First dimension of the three buffers.
+  * @param[in]   windowLen         Window length passed to the MFCC object
+  *                                and used for the sliding window.
+  * @param[in]   windowStride      Stride used for the sliding window.
+  * @param[in]   numMfccVectors    Second dimension of the three buffers.
+  */
+ Preprocess::Preprocess(
+     const uint32_t numMfccFeatures,
+     const uint32_t windowLen,
+     const uint32_t windowStride,
+     const uint32_t numMfccVectors)
+     : _m_mfcc(numMfccFeatures, windowLen),
+       _m_mfccBuf(numMfccFeatures, numMfccVectors),
+       _m_delta1Buf(numMfccFeatures, numMfccVectors),
+       _m_delta2Buf(numMfccFeatures, numMfccVectors),
+       _m_windowLen(windowLen),
+       _m_windowStride(windowStride),
+       _m_numMfccFeats(numMfccFeatures),
+       _m_numFeatVectors(numMfccVectors),
+       _m_window()
+ {
+     /* Init() is skipped when either parameter is zero. */
+     const bool mfccParamsNonZero = (numMfccFeatures > 0) && (windowLen > 0);
+     if (mfccParamsNonZero) {
+         this->_m_mfcc.Init();
+     }
+ }
+
+ /**
+  * @brief       Computes MFCC features and their first and second order
+  *              deltas for the audio, normalises them and quantises the
+  *              result into the tensor.
+  * @param[in]   audioData      Audio samples to slide the window over.
+  * @param[in]   audioDataLen   Number of audio samples.
+  * @param[out]  tensor         Destination tensor; must be uint8 or int8
+  *                             with a non-zero quantisation scale.
+  * @return      Result of the quantisation step; false if the tensor is
+  *              null, the deltas cannot be computed, the scale is zero or
+  *              the tensor type is unsupported.
+  */
+ bool Preprocess::Invoke(
+     const int16_t* audioData,
+     const uint32_t audioDataLen,
+     TfLiteTensor* tensor)
+ {
+     /* The tensor is dereferenced below; fail early rather than crash. */
+     if (nullptr == tensor) {
+         printf_err("Invalid tensor for pre-processing\n");
+         return false;
+     }
+
+     this->_m_window = SlidingWindow<const int16_t>(
+         audioData, audioDataLen,
+         this->_m_windowLen, this->_m_windowStride);
+
+     std::fill(_m_mfccBuf.begin(), _m_mfccBuf.end(), 0.f);
+     std::fill(_m_delta1Buf.begin(), _m_delta1Buf.end(), 0.f);
+     std::fill(_m_delta2Buf.begin(), _m_delta2Buf.end(), 0.f);
+
+     const size_t numFeatures = this->_m_mfccBuf.size(0);
+     uint32_t mfccBufIdx = 0;
+
+     /* One sample buffer is reused for every window instead of allocating
+      * a new vector per iteration. */
+     std::vector<int16_t> mfccAudioData;
+
+     /* Slide over the audio, but never past the number of feature vectors
+      * the buffers were sized for: extra windows are ignored rather than
+      * written out of bounds. */
+     while (mfccBufIdx < this->_m_numFeatVectors && this->_m_window.HasNext()) {
+         const int16_t* mfccWindow = this->_m_window.Next();
+         mfccAudioData.assign(mfccWindow, mfccWindow + this->_m_windowLen);
+
+         auto mfcc = this->_m_mfcc.MfccCompute(mfccAudioData);
+         for (size_t i = 0; i < numFeatures; ++i) {
+             this->_m_mfccBuf(i, mfccBufIdx) = mfcc[i];
+         }
+         ++mfccBufIdx;
+     }
+
+     /* Pad the remaining feature vectors with the MFCC of a silent window. */
+     if (mfccBufIdx < this->_m_numFeatVectors) {
+         std::vector<int16_t> zerosWindow(this->_m_windowLen, 0);
+         const std::vector<float> mfccZeros = this->_m_mfcc.MfccCompute(zerosWindow);
+
+         /* Written element by element through operator(), exactly like the
+          * loop above, so padding does not rely on the features of one
+          * vector being contiguous in the buffer's storage. */
+         for (; mfccBufIdx < this->_m_numFeatVectors; ++mfccBufIdx) {
+             for (size_t i = 0; i < numFeatures; ++i) {
+                 this->_m_mfccBuf(i, mfccBufIdx) = mfccZeros[i];
+             }
+         }
+     }
+
+     /* Compute first and second order deltas from MFCCs. */
+     if (!this->_ComputeDeltas(this->_m_mfccBuf,
+                               this->_m_delta1Buf,
+                               this->_m_delta2Buf)) {
+         printf_err("Failed to compute MFCC deltas\n");
+         return false;
+     }
+
+     /* Normalise. */
+     this->_Normalise();
+
+     /* Quantise. */
+     QuantParams quantParams = GetTensorQuantParams(tensor);
+
+     if (0 == quantParams.scale) {
+         printf_err("Quantisation scale can't be 0\n");
+         return false;
+     }
+
+     switch(tensor->type) {
+         case kTfLiteUInt8:
+             return this->_Quantise<uint8_t>(
+                 tflite::GetTensorData<uint8_t>(tensor), tensor->bytes,
+                 quantParams.scale, quantParams.offset);
+         case kTfLiteInt8:
+             return this->_Quantise<int8_t>(
+                 tflite::GetTensorData<int8_t>(tensor), tensor->bytes,
+                 quantParams.scale, quantParams.offset);
+         default:
+             printf_err("Unsupported tensor type %s\n",
+                 TfLiteTypeGetName(tensor->type));
+     }
+
+     return false;
+ }
+
+ /**
+  * @brief       Computes the first and second order deltas of the MFCC
+  *              features with a 9-tap 1D convolution along the second
+  *              (time) dimension.
+  * @param[in]   mfcc     Features, indexed as (feature, time).
+  * @param[out]  delta1   Receives the first order deltas.
+  * @param[out]  delta2   Receives the second order deltas.
+  * @return      false if any of the checked dimensions is zero or the delta
+  *              buffers disagree on their first dimension; true otherwise.
+  *              Positions closer than half a kernel to either end of the
+  *              time axis are not written.
+  */
+ bool Preprocess::_ComputeDeltas(Array2d<float>& mfcc,
+                                 Array2d<float>& delta1,
+                                 Array2d<float>& delta2)
+ {
+     /* Kept as constant static tables so that no heap allocation happens
+      * on each call. */
+     static constexpr float delta1Coeffs[] =
+         {6.66666667e-02,  5.00000000e-02,  3.33333333e-02,
+          1.66666667e-02, -3.46944695e-18, -1.66666667e-02,
+         -3.33333333e-02, -5.00000000e-02, -6.66666667e-02};
+
+     static constexpr float delta2Coeffs[] =
+         {0.06060606,  0.01515152, -0.01731602,
+         -0.03679654, -0.04329004, -0.03679654,
+         -0.01731602,  0.01515152,  0.06060606};
+
+     static_assert(sizeof(delta1Coeffs) == sizeof(delta2Coeffs),
+                   "Delta kernels must have the same length");
+
+     if (delta1.size(0) == 0 || delta2.size(0) != delta1.size(0) ||
+         mfcc.size(0) == 0 || mfcc.size(1) == 0) {
+         return false;
+     }
+
+     /* Get the middle index; coeff vec len should always be odd. */
+     constexpr size_t coeffLen = sizeof(delta1Coeffs)/sizeof(delta1Coeffs[0]);
+     constexpr size_t fMidIdx = (coeffLen - 1)/2;
+     const size_t numFeatures = mfcc.size(0);
+     const size_t numFeatVectors = mfcc.size(1);
+
+     /* With fewer time samples than kernel taps a "valid" convolution has
+      * no output. Returning here also stops the unsigned subtraction
+      * `numFeatVectors - fMidIdx` from wrapping for very short inputs. */
+     if (numFeatVectors < coeffLen) {
+         return true;
+     }
+
+     /* Iterate through features in MFCC vector. */
+     for (size_t i = 0; i < numFeatures; ++i) {
+         /* For each feature, iterate through time (t) samples representing feature evolution and
+          * calculate d/dt and d^2/dt^2, using 1D convolution with differential kernels.
+          * Convolution padding = valid, result size is `time length - kernel length + 1`.
+          * Positions outside that range are left as they were on entry.
+          *
+          * For the small filter, conv1D implementation as a simple loop is efficient enough.
+          * Filters of a greater size would need CMSIS-DSP functions to be used, like arm_fir_f32.
+          */
+         for (size_t j = fMidIdx; j + fMidIdx < numFeatVectors; ++j) {
+             float d1 = 0;
+             float d2 = 0;
+             const size_t mfccStIdx = j - fMidIdx;
+
+             /* The kernel is applied flipped: sample k meets coefficient
+              * (coeffLen - 1 - k). */
+             for (size_t k = 0; k < coeffLen; ++k) {
+                 const size_t m = coeffLen - 1 - k;
+                 d1 += mfcc(i, mfccStIdx + k) * delta1Coeffs[m];
+                 d2 += mfcc(i, mfccStIdx + k) * delta2Coeffs[m];
+             }
+
+             delta1(i, j) = d1;
+             delta2(i, j) = d2;
+         }
+     }
+
+     return true;
+ }
+
+ /**
+  * @brief       Gets the mean of all elements of the array.
+  * @param[in]   vec   Array passed to MathUtils::MeanF32 as (start, count).
+  * @return      Value returned by MathUtils::MeanF32.
+  */
+ float Preprocess::_GetMean(Array2d<float>& vec)
+ {
+     const auto numElements = vec.totalSize();
+     return math::MathUtils::MeanF32(vec.begin(), numElements);
+ }
+
+ /**
+  * @brief       Gets the standard deviation of all elements of the array.
+  * @param[in]   vec    Array passed to MathUtils::StdDevF32 as (start, count).
+  * @param[in]   mean   Mean value forwarded to MathUtils::StdDevF32.
+  * @return      Value returned by MathUtils::StdDevF32.
+  */
+ float Preprocess::_GetStdDev(Array2d<float>& vec, const float mean)
+ {
+     const auto numElements = vec.totalSize();
+     return math::MathUtils::StdDevF32(vec.begin(), numElements, mean);
+ }
+
+ /**
+  * @brief         Normalises the array in place: each element becomes
+  *                value/stddev - mean/stddev. If the standard deviation is
+  *                zero, every element is set to zero instead.
+  * @param[in,out] vec   Array to normalise.
+  */
+ void Preprocess::_NormaliseVec(Array2d<float>& vec)
+ {
+     const auto mean = Preprocess::_GetMean(vec);
+     const auto stddev = Preprocess::_GetStdDev(vec, mean);
+
+     debug("Mean: %f, Stddev: %f\n", mean, stddev);
+
+     /* Nothing to scale by: zero the array and stop. */
+     if (stddev == 0) {
+         std::fill(vec.begin(), vec.end(), 0);
+         return;
+     }
+
+     /* Both factors are computed once, outside the loop. */
+     const float scale = 1.f/stddev;
+     const float shift = mean/stddev;
+
+     for (auto it = vec.begin(); it != vec.end(); ++it) {
+         *it = (*it) * scale - shift;
+     }
+ }
+
+ /**
+  * @brief   Normalises the MFCC buffer, then the first order delta buffer,
+  *          then the second order delta buffer, each independently.
+  */
+ void Preprocess::_Normalise()
+ {
+     Array2d<float>* const buffers[] = {
+         &this->_m_mfccBuf,
+         &this->_m_delta1Buf,
+         &this->_m_delta2Buf
+     };
+
+     for (Array2d<float>* buffer : buffers) {
+         Preprocess::_NormaliseVec(*buffer);
+     }
+ }
+
+ /**
+  * @brief       Quantises one value and clamps it to the given range.
+  * @param[in]   elem          Value to quantise.
+  * @param[in]   quantScale    Scale the value is divided by.
+  * @param[in]   quantOffset   Offset added after the division.
+  * @param[in]   minVal        Lower clamp bound.
+  * @param[in]   maxVal        Upper clamp bound.
+  * @return      round(elem/quantScale + quantOffset), limited to
+  *              [minVal, maxVal].
+  */
+ float Preprocess::_GetQuantElem(
+     const float elem,
+     const float quantScale,
+     const int quantOffset,
+     const float minVal,
+     const float maxVal)
+ {
+     const float rounded = std::round((elem/quantScale) + quantOffset);
+
+     /* Same comparisons as max() followed by min(). */
+     const float lowerBounded = (rounded < minVal) ? minVal : rounded;
+     return (maxVal < lowerBounded) ? maxVal : lowerBounded;
+ }
+
+} /* namespace asr */
+} /* namespace audio */
+} /* namespace app */
+} /* namespace arm */ \ No newline at end of file