From 23c26277086c78704a17f0dae86da947816320c0 Mon Sep 17 00:00:00 2001
From: George Gekov
Date: Mon, 16 Aug 2021 11:32:10 +0100
Subject: MLECO-2079 Adding the C++ KWS example

Signed-off-by: Eanna O Cathain
Change-Id: I81899bbfaada32f478c2e2fc6441eabb94d8d0fc
---
 samples/SpeechRecognition/src/Main.cpp | 137 ++++++++++++++-------------------
 1 file changed, 56 insertions(+), 81 deletions(-)

(limited to 'samples/SpeechRecognition/src/Main.cpp')

diff --git a/samples/SpeechRecognition/src/Main.cpp b/samples/SpeechRecognition/src/Main.cpp
index de37e23b40..e2d293001f 100644
--- a/samples/SpeechRecognition/src/Main.cpp
+++ b/samples/SpeechRecognition/src/Main.cpp
@@ -1,5 +1,5 @@
 //
-// Copyright © 2020 Arm Ltd and Contributors. All rights reserved.
+// Copyright © 2021 Arm Ltd and Contributors. All rights reserved.
 // SPDX-License-Identifier: MIT
 //
 #include
@@ -11,10 +11,8 @@
 #include "CmdArgsParser.hpp"
 #include "ArmnnNetworkExecutor.hpp"
 #include "AudioCapture.hpp"
-#include "Preprocess.hpp"
-#include "Decoder.hpp"
 #include "SpeechRecognitionPipeline.hpp"
-
+#include "Wav2LetterMFCC.hpp"
 
 using InferenceResult = std::vector;
 using InferenceResults = std::vector;
@@ -25,101 +23,77 @@ const std::string LABEL_PATH = "--label-path";
 const std::string PREFERRED_BACKENDS = "--preferred-backends";
 const std::string HELP = "--help";
 
-std::map labels = {
-    {0, "a" },
-    {1, "b" },
-    {2, "c" },
-    {3, "d" },
-    {4, "e" },
-    {5, "f" },
-    {6, "g" },
-    {7, "h" },
-    {8, "i" },
-    {9, "j" },
-    {10,"k" },
-    {11,"l" },
-    {12,"m" },
-    {13,"n" },
-    {14,"o" },
-    {15,"p" },
-    {16,"q" },
-    {17,"r" },
-    {18,"s" },
-    {19,"t" },
-    {20,"u" },
-    {21,"v" },
-    {22,"w" },
-    {23,"x" },
-    {24,"y" },
-    {25,"z" },
-    {26, "\'" },
-    {27, " "},
-    {28,"$" }
+std::map labels =
+{
+    {0, "a"},
+    {1, "b"},
+    {2, "c"},
+    {3, "d"},
+    {4, "e"},
+    {5, "f"},
+    {6, "g"},
+    {7, "h"},
+    {8, "i"},
+    {9, "j"},
+    {10, "k"},
+    {11, "l"},
+    {12, "m"},
+    {13, "n"},
+    {14, "o"},
+    {15, "p"},
+    {16, "q"},
+    {17, "r"},
+    {18, "s"},
+    {19, "t"},
+    {20, "u"},
+    {21, "v"},
+    {22, "w"},
+    {23, "x"},
+    {24, "y"},
+    {25, "z"},
+    {26, "\'"},
+    {27, " "},
+    {28, "$"}
 };
 
 /*
  * The accepted options for this Speech Recognition executable
 */
-static std::map CMD_OPTIONS = {
-    {AUDIO_FILE_PATH, "[REQUIRED] Path to the Audio file to run speech recognition on"},
-    {MODEL_FILE_PATH, "[REQUIRED] Path to the Speech Recognition model to use"},
-    {PREFERRED_BACKENDS, "[OPTIONAL] Takes the preferred backends in preference order, separated by comma."
-                         " For example: CpuAcc,GpuAcc,CpuRef. Accepted options: [CpuAcc, CpuRef, GpuAcc]."
-                         " Defaults to CpuAcc,CpuRef"}
+static std::map CMD_OPTIONS =
+{
+    {AUDIO_FILE_PATH, "[REQUIRED] Path to the Audio file to run speech recognition on"},
+    {MODEL_FILE_PATH, "[REQUIRED] Path to the Speech Recognition model to use"},
+    {PREFERRED_BACKENDS, "[OPTIONAL] Takes the preferred backends in preference order, separated by comma."
+                         " For example: CpuAcc,GpuAcc,CpuRef. Accepted options: [CpuAcc, CpuRef, GpuAcc]."
+                         " Defaults to CpuAcc,CpuRef"}
 };
 
 /*
  * Reads the user supplied backend preference, splits it by comma, and returns an ordered vector
 */
-std::vector GetPreferredBackendList(const std::string& preferredBackends)
+std::vector GetPreferredBackendList(const std::string& preferredBackends)
 {
     std::vector backends;
     std::stringstream ss(preferredBackends);
 
-    while(ss.good())
+    while (ss.good())
     {
         std::string backend;
-        std::getline( ss, backend, ',' );
+        std::getline(ss, backend, ',');
         backends.emplace_back(backend);
     }
     return backends;
 }
 
-int main(int argc, char *argv[])
+int main(int argc, char* argv[])
 {
-    // Wav2Letter ASR SETTINGS
-    int SAMP_FREQ = 16000;
-    int FRAME_LEN_MS = 32;
-    int FRAME_LEN_SAMPLES = SAMP_FREQ * FRAME_LEN_MS * 0.001;
-    int NUM_MFCC_FEATS = 13;
-    int MFCC_WINDOW_LEN = 512;
-    int MFCC_WINDOW_STRIDE = 160;
-    const int NUM_MFCC_VECTORS = 296;
-    int SAMPLES_PER_INFERENCE = MFCC_WINDOW_LEN + ((NUM_MFCC_VECTORS -1) * MFCC_WINDOW_STRIDE);
-    int MEL_LO_FREQ = 0;
-    int MEL_HI_FREQ = 8000;
-    int NUM_FBANK_BIN = 128;
-    int INPUT_WINDOW_LEFT_CONTEXT = 98;
-    int INPUT_WINDOW_RIGHT_CONTEXT = 98;
-    int INPUT_WINDOW_INNER_CONTEXT = NUM_MFCC_VECTORS -
-                                     (INPUT_WINDOW_LEFT_CONTEXT + INPUT_WINDOW_RIGHT_CONTEXT);
-    int SLIDING_WINDOW_OFFSET = INPUT_WINDOW_INNER_CONTEXT * MFCC_WINDOW_STRIDE;
-
-
-    MfccParams mfccParams(SAMP_FREQ, NUM_FBANK_BIN,
-                          MEL_LO_FREQ, MEL_HI_FREQ, NUM_MFCC_FEATS, FRAME_LEN_SAMPLES, false, NUM_MFCC_VECTORS);
-
-    MFCC mfccInst = MFCC(mfccParams);
-
-    Preprocess preprocessor(MFCC_WINDOW_LEN, MFCC_WINDOW_STRIDE, mfccInst);
-
     bool isFirstWindow = true;
-    std::string currentRContext = "";
+    std::string currentRContext = "";
 
-    std::map options;
+    std::map options;
     int result = ParseOptions(options, CMD_OPTIONS, argv, argc);
-    if (result != 0)
+    if (result != 0)
     {
         return result;
     }
@@ -127,28 +101,29 @@ int main(int argc, char *argv[])
 
     // Create the network options
     common::PipelineOptions pipelineOptions;
     pipelineOptions.m_ModelFilePath = GetSpecifiedOption(options, MODEL_FILE_PATH);
-
-    if (CheckOptionSpecified(options, PREFERRED_BACKENDS))
+    pipelineOptions.m_ModelName = "Wav2Letter";
+    if (CheckOptionSpecified(options, PREFERRED_BACKENDS))
     {
         pipelineOptions.m_backends = GetPreferredBackendList((GetSpecifiedOption(options, PREFERRED_BACKENDS)));
-    }
-    else
+    }
+    else
     {
         pipelineOptions.m_backends = {"CpuAcc", "CpuRef"};
     }
 
     asr::IPipelinePtr asrPipeline = asr::CreatePipeline(pipelineOptions, labels);
 
-    asr::AudioCapture capture;
-    std::vector audioData = capture.LoadAudioFile(GetSpecifiedOption(options, AUDIO_FILE_PATH));
-    capture.InitSlidingWindow(audioData.data(), audioData.size(), SAMPLES_PER_INFERENCE, SLIDING_WINDOW_OFFSET);
+    audio::AudioCapture capture;
+    std::vector audioData = audio::AudioCapture::LoadAudioFile(GetSpecifiedOption(options, AUDIO_FILE_PATH));
+    capture.InitSlidingWindow(audioData.data(), audioData.size(), asrPipeline->getInputSamplesSize(),
+                              asrPipeline->getSlidingWindowOffset());
 
-    while (capture.HasNext())
+    while (capture.HasNext())
     {
         std::vector audioBlock = capture.Next();
         InferenceResults results;
-        std::vector preprocessedData = asrPipeline->PreProcessing(audioBlock, preprocessor);
+        std::vector preprocessedData = asrPipeline->PreProcessing(audioBlock);
         asrPipeline->Inference(preprocessedData, results);
         asrPipeline->PostProcessing(results, isFirstWindow, !capture.HasNext(), currentRContext);
     }
--
cgit v1.2.1
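For reference, the two values the new code queries from the pipeline (asrPipeline->getInputSamplesSize() and asrPipeline->getSlidingWindowOffset()) occupy the same argument positions that SAMPLES_PER_INFERENCE and SLIDING_WINDOW_OFFSET held in the old InitSlidingWindow call. The standalone sketch below only reproduces the arithmetic from the Wav2Letter settings block this patch deletes from main(); whether the pipeline derives exactly these numbers internally is an assumption based on those removed constants, not something the patch itself shows.

// Standalone sketch: reproduces the sliding-window arithmetic from the
// Wav2Letter settings removed above. Constants are copied from the deleted
// code; the mapping onto getInputSamplesSize()/getSlidingWindowOffset() is
// an assumption, not verified here.
#include <iostream>

int main()
{
    const int sampFreq         = 16000; // SAMP_FREQ
    const int mfccWindowLen    = 512;   // MFCC_WINDOW_LEN
    const int mfccWindowStride = 160;   // MFCC_WINDOW_STRIDE
    const int numMfccVectors   = 296;   // NUM_MFCC_VECTORS
    const int leftContext      = 98;    // INPUT_WINDOW_LEFT_CONTEXT
    const int rightContext     = 98;    // INPUT_WINDOW_RIGHT_CONTEXT

    // One full analysis window plus one stride per additional MFCC vector:
    // 512 + (296 - 1) * 160 = 47712 samples fed to each inference.
    const int samplesPerInference = mfccWindowLen + (numMfccVectors - 1) * mfccWindowStride;

    // Only the inner (non-context) vectors advance the window between
    // inferences: (296 - (98 + 98)) * 160 = 16000 samples.
    const int innerContext        = numMfccVectors - (leftContext + rightContext);
    const int slidingWindowOffset = innerContext * mfccWindowStride;

    std::cout << "samples per inference: " << samplesPerInference << '\n'
              << "sliding window offset: " << slidingWindowOffset
              << " (" << static_cast<float>(slidingWindowOffset) / sampFreq << " s)\n";
}

Running the sketch prints 47712 and 16000 (one second of audio at the 16 kHz sample rate), which is the window geometry the old code configured explicitly before it was moved behind the pipeline interface.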