4 files changed, 297 insertions, 0 deletions
diff --git a/samples/KeywordSpotting/src/Decoder.cpp b/samples/KeywordSpotting/src/Decoder.cpp
new file mode 100644
index 0000000000..107e25caa9
--- /dev/null
+++ b/samples/KeywordSpotting/src/Decoder.cpp
@@ -0,0 +1,46 @@
+//
+// Copyright © 2021 Arm Ltd and Contributors. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#include "Decoder.hpp"
+
+#include <algorithm>
+#include <cmath>
+#include <iterator>
+
+// Dequantises the raw int8 model output and reports its largest element.
+//
+// Every value is mapped to quantisationScale * (value - quantisationOffset). The position of the
+// first maximum of the dequantised values is returned together with that maximum.
+//
+// @param modelOutput quantised values produced by the model.
+// @return pair of (index of the largest dequantised value, that value rounded to two decimal
+//         places). For an empty modelOutput there is no largest value and (-1, 0.0f) is returned.
+std::pair<int, float> kws::Decoder::decodeOutput(std::vector<int8_t>& modelOutput)
+{
+    // std::max_element returns the end iterator for an empty range, which must not be dereferenced.
+    if (modelOutput.empty())
+    {
+        return std::make_pair(-1, 0.0f);
+    }
+
+    // Dequantise into a buffer sized up front so that it is allocated only once.
+    std::vector<float> dequantisedOutput;
+    dequantisedOutput.reserve(modelOutput.size());
+    for (const int8_t value : modelOutput)
+    {
+        const float dequantisedValue = this->quantisationScale *
+            (static_cast<float>(value) - static_cast<float>(this->quantisationOffset));
+        dequantisedOutput.push_back(dequantisedValue);
+    }
+
+    // Locate the largest dequantised value; its position is the key used to look up the label.
+    const auto maxElementIterator = std::max_element(dequantisedOutput.begin(), dequantisedOutput.end());
+    const int labelMapIndex = static_cast<int>(std::distance(dequantisedOutput.begin(), maxElementIterator));
+
+    // Round to two decimal places.
+    const float maxModelOutputProbability = std::roundf((*maxElementIterator) * 100) / 100;
+
+    return std::make_pair(labelMapIndex, maxModelOutputProbability);
+}
diff --git a/samples/KeywordSpotting/src/DsCNNPreprocessor.cpp b/samples/KeywordSpotting/src/DsCNNPreprocessor.cpp
new file mode 100644
index 0000000000..8215feeeb5
--- /dev/null
+++ b/samples/KeywordSpotting/src/DsCNNPreprocessor.cpp
@@ -0,0 +1,54 @@
+//
+// Copyright © 2021 Arm Ltd and Contributors. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+#include <cmath>
+#include <numeric>
+#include <algorithm>
+#include <memory>
+#include "MathUtils.hpp"
+#include "SlidingWindow.hpp"
+#include "DsCNNPreprocessor.hpp"
+
+// Converts raw audio samples into quantised MFCC features.
+//
+// A window of m_windowLen samples is moved over the audio with a stride of m_windowStride. The
+// quantised MFCC features of every window are appended, in order, to the returned buffer.
+//
+// @param audioData   pointer to the first audio sample.
+// @param dataSize    number of samples available at audioData.
+// @param quantOffset quantisation offset forwarded to the MFCC computation.
+// @param quantScale  quantisation scale forwarded to the MFCC computation.
+// @return the concatenated int8 MFCC features of all windows.
+std::vector<int8_t> kws::DsCNNPreprocessor::Invoke(const float* audioData, size_t dataSize,
+                                                   int quantOffset, float quantScale)
+{
+    auto window = SlidingWindow<const float>(
+            audioData, dataSize,
+            this->m_windowLen, this->m_windowStride);
+
+    std::vector<int8_t> outputBuffer;
+    // While we can slide over the window
+    while (window.HasNext())
+    {
+        // Copy the current window so the MFCC computation receives it as a vector.
+        const float* mfccWindow = window.Next();
+        auto mfccAudioData = std::vector<float>(mfccWindow, mfccWindow + this->m_windowLen);
+
+        auto mfcc = this->m_mfcc->MfccComputeQuant<int8_t>(mfccAudioData, quantScale, quantOffset);
+
+        // Append all features of this window with a single range insert.
+        outputBuffer.insert(outputBuffer.end(), mfcc.begin(), mfcc.end());
+    }
+
+    return outputBuffer;
+}
+
+// Creates a preprocessor that takes ownership of the given MFCC calculator.
+//
+// @param windowLen    number of audio samples in each MFCC window.
+// @param windowStride stride of the sliding window over the audio samples.
+// @param mfccInst     MFCC calculator used by Invoke; moved into the preprocessor.
+kws::DsCNNPreprocessor::DsCNNPreprocessor(const uint32_t windowLen, const uint32_t windowStride,
+                                          std::unique_ptr<DsCnnMFCC> mfccInst) :
+        m_windowLen{windowLen}, m_windowStride{windowStride}, m_mfcc{std::move(mfccInst)} {}
diff --git a/samples/KeywordSpotting/src/KeywordSpottingPipeline.cpp b/samples/KeywordSpotting/src/KeywordSpottingPipeline.cpp
new file mode 100644
index 0000000000..e32d9476e3
--- /dev/null
+++ b/samples/KeywordSpotting/src/KeywordSpottingPipeline.cpp
@@ -0,0 +1,127 @@
+//
+// Copyright © 2020 Arm Ltd and Contributors. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#include "KeywordSpottingPipeline.hpp"
+#include "ArmnnNetworkExecutor.hpp"
+#include "DsCNNPreprocessor.hpp"
+
+#include <stdexcept>
+
+namespace kws
+{
+// Builds a pipeline from its three stages, taking ownership of each of them.
+//
+// @param executor     runs the network on the preprocessed data.
+// @param decoder      decodes the network output into a label index and a score.
+// @param preProcessor converts raw audio into the network input.
+KWSPipeline::KWSPipeline(std::unique_ptr<common::ArmnnNetworkExecutor<int8_t>> executor,
+                         std::unique_ptr<Decoder> decoder,
+                         std::unique_ptr<DsCNNPreprocessor> preProcessor
+                         ) :
+        m_executor(std::move(executor)),
+        m_decoder(std::move(decoder)),
+        m_preProcessor(std::move(preProcessor)) {}
+
+// Runs the preprocessor on the audio samples, using the executor's quantisation offset and scale.
+//
+// @param audio audio samples to convert.
+// @return the quantised features produced by the preprocessor.
+std::vector<int8_t> KWSPipeline::PreProcessing(std::vector<float>& audio)
+{
+    return m_preProcessor->Invoke(audio.data(), audio.size(), m_executor->GetQuantizationOffset(),
+                                  m_executor->GetQuantizationScale());
+}
+
+// Runs the network executor on the preprocessed data.
+//
+// @param preprocessedData input data handed to the executor.
+// @param result           passed to the executor to receive the inference results.
+void KWSPipeline::Inference(const std::vector<int8_t>& preprocessedData,
+                            common::InferenceResults<int8_t>& result)
+{
+    m_executor->Run(preprocessedData.data(), preprocessedData.size(), result);
+}
+
+// Decodes the first element of the inference results and reports the detected keyword.
+//
+// @param inferenceResults results of the inference; only element 0 is decoded.
+// @param labels           maps a label index to its keyword. The map is not modified.
+// @param callback         invoked once with (label index, keyword, decoded score). The keyword is
+//                         an empty string when labels has no entry for the index.
+void KWSPipeline::PostProcessing(common::InferenceResults<int8_t>& inferenceResults,
+                    std::map<int, std::string>& labels,
+                    const std::function<void (int, std::string&, float)>& callback)
+{
+    const std::pair<int, float> decoded = this->m_decoder->decodeOutput(inferenceResults[0]);
+    const int keywordIndex = decoded.first;
+
+    // Look the label up with find(): operator[] would insert an empty entry into the caller's
+    // map for every index it does not contain.
+    std::string output;
+    const auto labelIt = labels.find(keywordIndex);
+    if (labelIt != labels.end())
+    {
+        output = labelIt->second;
+    }
+
+    callback(keywordIndex, output, decoded.second);
+}
+
+// Returns the number of audio samples needed for one inference: one full MFCC window plus one
+// window stride for each of the remaining (m_numMfccVectors - 1) MFCC vectors.
+int KWSPipeline::getInputSamplesSize()
+{
+    return this->m_preProcessor->m_windowLen +
+            ((this->m_preProcessor->m_mfcc->m_params.m_numMfccVectors - 1) *
+              this->m_preProcessor->m_windowStride);
+}
+
+// Creates the keyword spotting pipeline for the model named in the configuration.
+//
+// @param config pipeline options; m_ModelName selects the model settings, m_ModelFilePath and
+//               m_backends are passed to the network executor.
+// @return the assembled pipeline.
+// @throws std::invalid_argument if config.m_ModelName is not "DS_CNN_CLUSTERED_INT8".
+IPipelinePtr CreatePipeline(common::PipelineOptions& config)
+{
+    if (config.m_ModelName != "DS_CNN_CLUSTERED_INT8")
+    {
+        throw std::invalid_argument("Unknown Model name: " + config.m_ModelName + " .");
+    }
+
+    // DS-CNN model settings
+    const float SAMP_FREQ = 16000;
+    const int MFCC_WINDOW_LEN = 640;
+    const int MFCC_WINDOW_STRIDE = 320;
+    const int NUM_MFCC_FEATS = 10;
+    const int NUM_MFCC_VECTORS = 49;
+    const float MEL_LO_FREQ = 20;
+    const float MEL_HI_FREQ = 4000;
+    const int NUM_FBANK_BIN = 40;
+
+    MfccParams mfccParams(SAMP_FREQ,
+                          NUM_FBANK_BIN,
+                          MEL_LO_FREQ,
+                          MEL_HI_FREQ,
+                          NUM_MFCC_FEATS,
+                          MFCC_WINDOW_LEN, false,
+                          NUM_MFCC_VECTORS);
+
+    std::unique_ptr<DsCnnMFCC> mfccInst = std::make_unique<DsCnnMFCC>(mfccParams);
+    auto preprocessor = std::make_unique<kws::DsCNNPreprocessor>(
+        MFCC_WINDOW_LEN, MFCC_WINDOW_STRIDE, std::move(mfccInst));
+
+    auto executor = std::make_unique<common::ArmnnNetworkExecutor<int8_t>>(
+        config.m_ModelFilePath, config.m_backends);
+
+    // The decoder is constructed with the quantisation offset and scale of output 0.
+    auto decoder = std::make_unique<kws::Decoder>(executor->GetOutputQuantizationOffset(0),
+                                                  executor->GetOutputQuantizationScale(0));
+
+    return std::make_unique<kws::KWSPipeline>(std::move(executor),
+                                              std::move(decoder), std::move(preprocessor));
+}
+
+} // namespace kws
diff --git a/samples/KeywordSpotting/src/Main.cpp b/samples/KeywordSpotting/src/Main.cpp
new file mode 100644
index 0000000000..10efcd8ce7
--- /dev/null
+++ b/samples/KeywordSpotting/src/Main.cpp
@@ -0,0 +1,140 @@
+//
+// Copyright © 2021 Arm Ltd and Contributors. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+#include <iostream>
+#include <map>
+#include <vector>
+#include <algorithm>
+#include <cmath>
+#include <cstdio>
+#include <sstream>
+#include <string>
+#include "KeywordSpottingPipeline.hpp"
+#include "CmdArgsParser.hpp"
+#include "ArmnnNetworkExecutor.hpp"
+#include "AudioCapture.hpp"
+
+const std::string AUDIO_FILE_PATH = "--audio-file-path";
+const std::string MODEL_FILE_PATH = "--model-file-path";
+const std::string LABEL_PATH = "--label-path";
+const std::string PREFERRED_BACKENDS = "--preferred-backends";
+const std::string HELP = "--help";
+
+/*
+ * The accepted options for this Keyword Spotting executable
+ */
+static std::map<std::string, std::string> CMD_OPTIONS =
+{
+        {AUDIO_FILE_PATH,    "[REQUIRED] Path to the Audio file to run speech recognition on"},
+        {MODEL_FILE_PATH,    "[REQUIRED] Path to the Speech Recognition model to use"},
+        {PREFERRED_BACKENDS, "[OPTIONAL] Takes the preferred backends in preference order, separated by comma."
+                             " For example: CpuAcc,GpuAcc,CpuRef. Accepted options: [CpuAcc, CpuRef, GpuAcc]."
+                             " Defaults to CpuAcc,CpuRef"}
+};
+
+/*
+ * Reads the user supplied backend preference, splits it by comma, and returns an ordered vector.
+ * Empty entries, e.g. from an empty string or a trailing comma, are skipped.
+ */
+std::vector<armnn::BackendId> GetPreferredBackendList(const std::string& preferredBackends)
+{
+    std::vector<armnn::BackendId> backends;
+    std::stringstream ss(preferredBackends);
+
+    // Test the stream after each read rather than before it: checking good() first lets a final
+    // failed read append an empty backend.
+    std::string backend;
+    while (std::getline(ss, backend, ','))
+    {
+        if (!backend.empty())
+        {
+            backends.emplace_back(backend);
+        }
+    }
+    return backends;
+}
+
+//Labels for this model
+std::map<int, std::string> labels =
+{
+        {0,  "silence"},
+        {1,  "unknown"},
+        {2,  "yes"},
+        {3,  "no"},
+        {4,  "up"},
+        {5,  "down"},
+        {6,  "left"},
+        {7,  "right"},
+        {8,  "on"},
+        {9,  "off"},
+        {10, "stop"},
+        {11, "go"}
+};
+
+/*
+ * Loads an audio file and runs keyword spotting over it block by block, printing the keyword
+ * decoded for each block. Returns the result of ParseOptions if it is non-zero, otherwise 0.
+ */
+int main(int argc, char* argv[])
+{
+    printf("ArmNN major version: %d\n", ARMNN_MAJOR_VERSION);
+    std::map<std::string, std::string> options;
+
+    //Read command line args
+    int result = ParseOptions(options, CMD_OPTIONS, argv, argc);
+    if (result != 0)
+    {
+        return result;
+    }
+
+    // Create the ArmNN inference runner
+    common::PipelineOptions pipelineOptions;
+    pipelineOptions.m_ModelName = "DS_CNN_CLUSTERED_INT8";
+    pipelineOptions.m_ModelFilePath = GetSpecifiedOption(options, MODEL_FILE_PATH);
+    if (CheckOptionSpecified(options, PREFERRED_BACKENDS))
+    {
+        pipelineOptions.m_backends = GetPreferredBackendList(
+            (GetSpecifiedOption(options, PREFERRED_BACKENDS)));
+    }
+    else
+    {
+        pipelineOptions.m_backends = {"CpuAcc", "CpuRef"};
+    }
+
+    kws::IPipelinePtr kwsPipeline = kws::CreatePipeline(pipelineOptions);
+
+    //Extract audio data from sound file
+    auto filePath = GetSpecifiedOption(options, AUDIO_FILE_PATH);
+    std::vector<float> audioData = audio::AudioCapture::LoadAudioFile(filePath);
+
+    // The pipeline defines how many samples one inference needs; the stride is half of that.
+    const auto samplesPerInference = kwsPipeline->getInputSamplesSize();
+    audio::AudioCapture capture;
+    capture.InitSlidingWindow(audioData.data(),
+                              audioData.size(),
+                              samplesPerInference,
+                              samplesPerInference / 2);
+
+    //Loop through audio data buffer
+    while (capture.HasNext())
+    {
+        std::vector<float> audioBlock = capture.Next();
+        common::InferenceResults<int8_t> results;
+
+        //Prepare input tensors
+        std::vector<int8_t> preprocessedData = kwsPipeline->PreProcessing(audioBlock);
+        //Run inference
+        kwsPipeline->Inference(preprocessedData, results);
+        //Decode output
+        kwsPipeline->PostProcessing(results, labels,
+                                    [](int index, std::string& label, float prob) -> void {
+                                        printf("Keyword \"%s\", index %d:, probability %f\n",
+                                               label.c_str(),
+                                               index,
+                                               prob);
+                                    });
+    }
+
+    return 0;
+}