about summary refs log tree commit diff
path: root/samples/SpeechRecognition/src/Main.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'samples/SpeechRecognition/src/Main.cpp')
-rw-r--r--  samples/SpeechRecognition/src/Main.cpp  137
1 files changed, 56 insertions, 81 deletions
diff --git a/samples/SpeechRecognition/src/Main.cpp b/samples/SpeechRecognition/src/Main.cpp
index de37e23b40..e2d293001f 100644
--- a/samples/SpeechRecognition/src/Main.cpp
+++ b/samples/SpeechRecognition/src/Main.cpp
@@ -1,5 +1,5 @@
//
-// Copyright © 2020 Arm Ltd and Contributors. All rights reserved.
+// Copyright © 2021 Arm Ltd and Contributors. All rights reserved.
// SPDX-License-Identifier: MIT
//
#include <iostream>
@@ -11,10 +11,8 @@
#include "CmdArgsParser.hpp"
#include "ArmnnNetworkExecutor.hpp"
#include "AudioCapture.hpp"
-#include "Preprocess.hpp"
-#include "Decoder.hpp"
#include "SpeechRecognitionPipeline.hpp"
-
+#include "Wav2LetterMFCC.hpp"
using InferenceResult = std::vector<int8_t>;
using InferenceResults = std::vector<InferenceResult>;
@@ -25,101 +23,77 @@ const std::string LABEL_PATH = "--label-path";
const std::string PREFERRED_BACKENDS = "--preferred-backends";
const std::string HELP = "--help";
-std::map<int, std::string> labels = {
- {0, "a" },
- {1, "b" },
- {2, "c" },
- {3, "d" },
- {4, "e" },
- {5, "f" },
- {6, "g" },
- {7, "h" },
- {8, "i" },
- {9, "j" },
- {10,"k" },
- {11,"l" },
- {12,"m" },
- {13,"n" },
- {14,"o" },
- {15,"p" },
- {16,"q" },
- {17,"r" },
- {18,"s" },
- {19,"t" },
- {20,"u" },
- {21,"v" },
- {22,"w" },
- {23,"x" },
- {24,"y" },
- {25,"z" },
- {26, "\'" },
+std::map<int, std::string> labels =
+{
+ {0, "a"},
+ {1, "b"},
+ {2, "c"},
+ {3, "d"},
+ {4, "e"},
+ {5, "f"},
+ {6, "g"},
+ {7, "h"},
+ {8, "i"},
+ {9, "j"},
+ {10, "k"},
+ {11, "l"},
+ {12, "m"},
+ {13, "n"},
+ {14, "o"},
+ {15, "p"},
+ {16, "q"},
+ {17, "r"},
+ {18, "s"},
+ {19, "t"},
+ {20, "u"},
+ {21, "v"},
+ {22, "w"},
+ {23, "x"},
+ {24, "y"},
+ {25, "z"},
+ {26, "\'"},
{27, " "},
- {28,"$" }
+ {28, "$"}
};
/*
* The accepted options for this Speech Recognition executable
*/
-static std::map<std::string, std::string> CMD_OPTIONS = {
- {AUDIO_FILE_PATH, "[REQUIRED] Path to the Audio file to run speech recognition on"},
- {MODEL_FILE_PATH, "[REQUIRED] Path to the Speech Recognition model to use"},
- {PREFERRED_BACKENDS, "[OPTIONAL] Takes the preferred backends in preference order, separated by comma."
- " For example: CpuAcc,GpuAcc,CpuRef. Accepted options: [CpuAcc, CpuRef, GpuAcc]."
- " Defaults to CpuAcc,CpuRef"}
+static std::map<std::string, std::string> CMD_OPTIONS =
+{
+ {AUDIO_FILE_PATH, "[REQUIRED] Path to the Audio file to run speech recognition on"},
+ {MODEL_FILE_PATH, "[REQUIRED] Path to the Speech Recognition model to use"},
+ {PREFERRED_BACKENDS, "[OPTIONAL] Takes the preferred backends in preference order, separated by comma."
+ " For example: CpuAcc,GpuAcc,CpuRef. Accepted options: [CpuAcc, CpuRef, GpuAcc]."
+ " Defaults to CpuAcc,CpuRef"}
};
/*
* Reads the user supplied backend preference, splits it by comma, and returns an ordered vector
*/
-std::vector<armnn::BackendId> GetPreferredBackendList(const std::string& preferredBackends)
+std::vector<armnn::BackendId> GetPreferredBackendList(const std::string& preferredBackends)
{
std::vector<armnn::BackendId> backends;
std::stringstream ss(preferredBackends);
- while(ss.good())
+ while (ss.good())
{
std::string backend;
- std::getline( ss, backend, ',' );
+ std::getline(ss, backend, ',');
backends.emplace_back(backend);
}
return backends;
}
-int main(int argc, char *argv[])
+int main(int argc, char* argv[])
{
- // Wav2Letter ASR SETTINGS
- int SAMP_FREQ = 16000;
- int FRAME_LEN_MS = 32;
- int FRAME_LEN_SAMPLES = SAMP_FREQ * FRAME_LEN_MS * 0.001;
- int NUM_MFCC_FEATS = 13;
- int MFCC_WINDOW_LEN = 512;
- int MFCC_WINDOW_STRIDE = 160;
- const int NUM_MFCC_VECTORS = 296;
- int SAMPLES_PER_INFERENCE = MFCC_WINDOW_LEN + ((NUM_MFCC_VECTORS -1) * MFCC_WINDOW_STRIDE);
- int MEL_LO_FREQ = 0;
- int MEL_HI_FREQ = 8000;
- int NUM_FBANK_BIN = 128;
- int INPUT_WINDOW_LEFT_CONTEXT = 98;
- int INPUT_WINDOW_RIGHT_CONTEXT = 98;
- int INPUT_WINDOW_INNER_CONTEXT = NUM_MFCC_VECTORS -
- (INPUT_WINDOW_LEFT_CONTEXT + INPUT_WINDOW_RIGHT_CONTEXT);
- int SLIDING_WINDOW_OFFSET = INPUT_WINDOW_INNER_CONTEXT * MFCC_WINDOW_STRIDE;
-
-
- MfccParams mfccParams(SAMP_FREQ, NUM_FBANK_BIN,
- MEL_LO_FREQ, MEL_HI_FREQ, NUM_MFCC_FEATS, FRAME_LEN_SAMPLES, false, NUM_MFCC_VECTORS);
-
- MFCC mfccInst = MFCC(mfccParams);
-
- Preprocess preprocessor(MFCC_WINDOW_LEN, MFCC_WINDOW_STRIDE, mfccInst);
-
bool isFirstWindow = true;
- std::string currentRContext = "";
+ std::string currentRContext = "";
- std::map <std::string, std::string> options;
+ std::map<std::string, std::string> options;
int result = ParseOptions(options, CMD_OPTIONS, argv, argc);
- if (result != 0)
+ if (result != 0)
{
return result;
}
@@ -127,28 +101,29 @@ int main(int argc, char *argv[])
// Create the network options
common::PipelineOptions pipelineOptions;
pipelineOptions.m_ModelFilePath = GetSpecifiedOption(options, MODEL_FILE_PATH);
-
- if (CheckOptionSpecified(options, PREFERRED_BACKENDS))
+ pipelineOptions.m_ModelName = "Wav2Letter";
+ if (CheckOptionSpecified(options, PREFERRED_BACKENDS))
{
pipelineOptions.m_backends = GetPreferredBackendList((GetSpecifiedOption(options, PREFERRED_BACKENDS)));
- }
- else
+ }
+ else
{
pipelineOptions.m_backends = {"CpuAcc", "CpuRef"};
}
asr::IPipelinePtr asrPipeline = asr::CreatePipeline(pipelineOptions, labels);
- asr::AudioCapture capture;
- std::vector<float> audioData = capture.LoadAudioFile(GetSpecifiedOption(options, AUDIO_FILE_PATH));
- capture.InitSlidingWindow(audioData.data(), audioData.size(), SAMPLES_PER_INFERENCE, SLIDING_WINDOW_OFFSET);
+ audio::AudioCapture capture;
+ std::vector<float> audioData = audio::AudioCapture::LoadAudioFile(GetSpecifiedOption(options, AUDIO_FILE_PATH));
+ capture.InitSlidingWindow(audioData.data(), audioData.size(), asrPipeline->getInputSamplesSize(),
+ asrPipeline->getSlidingWindowOffset());
- while (capture.HasNext())
+ while (capture.HasNext())
{
std::vector<float> audioBlock = capture.Next();
InferenceResults results;
- std::vector<int8_t> preprocessedData = asrPipeline->PreProcessing<float, int8_t>(audioBlock, preprocessor);
+ std::vector<int8_t> preprocessedData = asrPipeline->PreProcessing(audioBlock);
asrPipeline->Inference<int8_t>(preprocessedData, results);
asrPipeline->PostProcessing<int8_t>(results, isFirstWindow, !capture.HasNext(), currentRContext);
}