From 23c26277086c78704a17f0dae86da947816320c0 Mon Sep 17 00:00:00 2001
From: George Gekov
Date: Mon, 16 Aug 2021 11:32:10 +0100
Subject: MLECO-2079 Adding the C++ KWS example

Signed-off-by: Eanna O Cathain
Change-Id: I81899bbfaada32f478c2e2fc6441eabb94d8d0fc
---
 samples/SpeechRecognition/src/Main.cpp | 137 ++++++++++++++-------------------
 1 file changed, 56 insertions(+), 81 deletions(-)

(limited to 'samples/SpeechRecognition/src/Main.cpp')

diff --git a/samples/SpeechRecognition/src/Main.cpp b/samples/SpeechRecognition/src/Main.cpp
index de37e23b40..e2d293001f 100644
--- a/samples/SpeechRecognition/src/Main.cpp
+++ b/samples/SpeechRecognition/src/Main.cpp
@@ -1,5 +1,5 @@
 //
-// Copyright © 2020 Arm Ltd and Contributors. All rights reserved.
+// Copyright © 2021 Arm Ltd and Contributors. All rights reserved.
 // SPDX-License-Identifier: MIT
 //
 #include
@@ -11,10 +11,8 @@
 #include "CmdArgsParser.hpp"
 #include "ArmnnNetworkExecutor.hpp"
 #include "AudioCapture.hpp"
-#include "Preprocess.hpp"
-#include "Decoder.hpp"
 #include "SpeechRecognitionPipeline.hpp"
-
+#include "Wav2LetterMFCC.hpp"
 
 using InferenceResult = std::vector;
 using InferenceResults = std::vector;
@@ -25,101 +23,77 @@ const std::string LABEL_PATH = "--label-path";
 const std::string PREFERRED_BACKENDS = "--preferred-backends";
 const std::string HELP = "--help";
 
-std::map labels = {
-    {0, "a" },
-    {1, "b" },
-    {2, "c" },
-    {3, "d" },
-    {4, "e" },
-    {5, "f" },
-    {6, "g" },
-    {7, "h" },
-    {8, "i" },
-    {9, "j" },
-    {10,"k" },
-    {11,"l" },
-    {12,"m" },
-    {13,"n" },
-    {14,"o" },
-    {15,"p" },
-    {16,"q" },
-    {17,"r" },
-    {18,"s" },
-    {19,"t" },
-    {20,"u" },
-    {21,"v" },
-    {22,"w" },
-    {23,"x" },
-    {24,"y" },
-    {25,"z" },
-    {26, "\'" },
-    {27, " "},
-    {28,"$" }
+std::map labels =
+{
+    {0, "a"},
+    {1, "b"},
+    {2, "c"},
+    {3, "d"},
+    {4, "e"},
+    {5, "f"},
+    {6, "g"},
+    {7, "h"},
+    {8, "i"},
+    {9, "j"},
+    {10, "k"},
+    {11, "l"},
+    {12, "m"},
+    {13, "n"},
+    {14, "o"},
+    {15, "p"},
+    {16, "q"},
+    {17, "r"},
+    {18, "s"},
+    {19, "t"},
+    {20, "u"},
+    {21, "v"},
+    {22, "w"},
+    {23, "x"},
+    {24, "y"},
+    {25, "z"},
+    {26, "\'"},
+    {27, " "},
+    {28, "$"}
 };
 
 /*
  * The accepted options for this Speech Recognition executable
 */
-static std::map CMD_OPTIONS = {
-    {AUDIO_FILE_PATH, "[REQUIRED] Path to the Audio file to run speech recognition on"},
-    {MODEL_FILE_PATH, "[REQUIRED] Path to the Speech Recognition model to use"},
-    {PREFERRED_BACKENDS, "[OPTIONAL] Takes the preferred backends in preference order, separated by comma."
-                         " For example: CpuAcc,GpuAcc,CpuRef. Accepted options: [CpuAcc, CpuRef, GpuAcc]."
-                         " Defaults to CpuAcc,CpuRef"}
+static std::map CMD_OPTIONS =
+{
+    {AUDIO_FILE_PATH, "[REQUIRED] Path to the Audio file to run speech recognition on"},
+    {MODEL_FILE_PATH, "[REQUIRED] Path to the Speech Recognition model to use"},
+    {PREFERRED_BACKENDS, "[OPTIONAL] Takes the preferred backends in preference order, separated by comma."
+                         " For example: CpuAcc,GpuAcc,CpuRef. Accepted options: [CpuAcc, CpuRef, GpuAcc]."
+                         " Defaults to CpuAcc,CpuRef"}
 };
 
 /*
  * Reads the user supplied backend preference, splits it by comma, and returns an ordered vector
 */
-std::vector GetPreferredBackendList(const std::string& preferredBackends)
+std::vector GetPreferredBackendList(const std::string& preferredBackends)
 {
     std::vector backends;
     std::stringstream ss(preferredBackends);
 
-    while(ss.good())
+    while (ss.good())
     {
         std::string backend;
-        std::getline( ss, backend, ',' );
+        std::getline(ss, backend, ',');
         backends.emplace_back(backend);
     }
     return backends;
 }
 
-int main(int argc, char *argv[])
+int main(int argc, char* argv[])
 {
-    // Wav2Letter ASR SETTINGS
-    int SAMP_FREQ = 16000;
-    int FRAME_LEN_MS = 32;
-    int FRAME_LEN_SAMPLES = SAMP_FREQ * FRAME_LEN_MS * 0.001;
-    int NUM_MFCC_FEATS = 13;
-    int MFCC_WINDOW_LEN = 512;
-    int MFCC_WINDOW_STRIDE = 160;
-    const int NUM_MFCC_VECTORS = 296;
-    int SAMPLES_PER_INFERENCE = MFCC_WINDOW_LEN + ((NUM_MFCC_VECTORS -1) * MFCC_WINDOW_STRIDE);
-    int MEL_LO_FREQ = 0;
-    int MEL_HI_FREQ = 8000;
-    int NUM_FBANK_BIN = 128;
-    int INPUT_WINDOW_LEFT_CONTEXT = 98;
-    int INPUT_WINDOW_RIGHT_CONTEXT = 98;
-    int INPUT_WINDOW_INNER_CONTEXT = NUM_MFCC_VECTORS -
-                                     (INPUT_WINDOW_LEFT_CONTEXT + INPUT_WINDOW_RIGHT_CONTEXT);
-    int SLIDING_WINDOW_OFFSET = INPUT_WINDOW_INNER_CONTEXT * MFCC_WINDOW_STRIDE;
-
-
-    MfccParams mfccParams(SAMP_FREQ, NUM_FBANK_BIN,
-                          MEL_LO_FREQ, MEL_HI_FREQ, NUM_MFCC_FEATS, FRAME_LEN_SAMPLES, false, NUM_MFCC_VECTORS);
-
-    MFCC mfccInst = MFCC(mfccParams);
-
-    Preprocess preprocessor(MFCC_WINDOW_LEN, MFCC_WINDOW_STRIDE, mfccInst);
-
     bool isFirstWindow = true;
-    std::string currentRContext = "";
+    std::string currentRContext = "";
 
-    std::map options;
+    std::map options;
     int result = ParseOptions(options, CMD_OPTIONS, argv, argc);
-    if (result != 0)
+    if (result != 0)
     {
         return result;
     }
@@ -127,28 +101,29 @@ int main(int argc, char *argv[])
 
     // Create the network options
     common::PipelineOptions pipelineOptions;
     pipelineOptions.m_ModelFilePath = GetSpecifiedOption(options, MODEL_FILE_PATH);
-
-    if (CheckOptionSpecified(options, PREFERRED_BACKENDS))
+    pipelineOptions.m_ModelName = "Wav2Letter";
+    if (CheckOptionSpecified(options, PREFERRED_BACKENDS))
     {
         pipelineOptions.m_backends = GetPreferredBackendList((GetSpecifiedOption(options, PREFERRED_BACKENDS)));
-    }
-    else
+    }
+    else
     {
         pipelineOptions.m_backends = {"CpuAcc", "CpuRef"};
     }
 
     asr::IPipelinePtr asrPipeline = asr::CreatePipeline(pipelineOptions, labels);
 
-    asr::AudioCapture capture;
-    std::vector audioData = capture.LoadAudioFile(GetSpecifiedOption(options, AUDIO_FILE_PATH));
-    capture.InitSlidingWindow(audioData.data(), audioData.size(), SAMPLES_PER_INFERENCE, SLIDING_WINDOW_OFFSET);
+    audio::AudioCapture capture;
+    std::vector audioData = audio::AudioCapture::LoadAudioFile(GetSpecifiedOption(options, AUDIO_FILE_PATH));
+    capture.InitSlidingWindow(audioData.data(), audioData.size(), asrPipeline->getInputSamplesSize(),
+                              asrPipeline->getSlidingWindowOffset());
 
-    while (capture.HasNext())
+    while (capture.HasNext())
     {
         std::vector audioBlock = capture.Next();
         InferenceResults results;
-        std::vector preprocessedData = asrPipeline->PreProcessing(audioBlock, preprocessor);
+        std::vector preprocessedData = asrPipeline->PreProcessing(audioBlock);
         asrPipeline->Inference(preprocessedData, results);
         asrPipeline->PostProcessing(results, isFirstWindow, !capture.HasNext(), currentRContext);
     }
--
cgit v1.2.1
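For reference, the two values the new code queries from the pipeline (asrPipeline->getInputSamplesSize() and asrPipeline->getSlidingWindowOffset()) occupy the same argument positions that SAMPLES_PER_INFERENCE and SLIDING_WINDOW_OFFSET held in the old InitSlidingWindow call. The standalone sketch below only reproduces the arithmetic from the Wav2Letter settings block this patch deletes from main(); whether the pipeline derives exactly these numbers internally is an assumption based on those removed constants, not something the patch itself shows.

// Standalone sketch: reproduces the sliding-window arithmetic from the
// Wav2Letter settings removed above. Constants are copied from the deleted
// code; the mapping onto getInputSamplesSize()/getSlidingWindowOffset() is
// an assumption, not verified here.
#include <iostream>

int main()
{
    const int sampFreq         = 16000; // SAMP_FREQ
    const int mfccWindowLen    = 512;   // MFCC_WINDOW_LEN
    const int mfccWindowStride = 160;   // MFCC_WINDOW_STRIDE
    const int numMfccVectors   = 296;   // NUM_MFCC_VECTORS
    const int leftContext      = 98;    // INPUT_WINDOW_LEFT_CONTEXT
    const int rightContext     = 98;    // INPUT_WINDOW_RIGHT_CONTEXT

    // One full analysis window plus one stride per additional MFCC vector:
    // 512 + (296 - 1) * 160 = 47712 samples fed to each inference.
    const int samplesPerInference = mfccWindowLen + (numMfccVectors - 1) * mfccWindowStride;

    // Only the inner (non-context) vectors advance the window between
    // inferences: (296 - (98 + 98)) * 160 = 16000 samples.
    const int innerContext        = numMfccVectors - (leftContext + rightContext);
    const int slidingWindowOffset = innerContext * mfccWindowStride;

    std::cout << "samples per inference: " << samplesPerInference << '\n'
              << "sliding window offset: " << slidingWindowOffset
              << " (" << static_cast<float>(slidingWindowOffset) / sampFreq << " s)\n";
}

Running the sketch prints 47712 and 16000 (one second of audio at the 16 kHz sample rate), which is the window geometry the old code configured explicitly before it was moved behind the pipeline interface.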