From aa4bcb14d0cbee910331545dd2fc086b58c37170 Mon Sep 17 00:00:00 2001
From: Kshitij Sisodia
Date: Fri, 6 May 2022 09:13:03 +0100
Subject: MLECO-3183: Refactoring application sources

Platform-agnostic application sources are moved into the application
API module, each with its own independent CMake project. Changes for
MLECO-3080 are also included: they create CMake projects for the
individual use-case APIs (again, platform agnostic) that depend on the
common logic. The "joint" KWS and ASR API has been removed; the use
case now relies on the individual KWS and ASR API libraries.

Change-Id: I1f7748dc767abb3904634a04e0991b74ac7b756d
Signed-off-by: Kshitij Sisodia
---
 source/application/api/use_case/ad/CMakeLists.txt  |  41 +
 .../api/use_case/ad/include/AdMelSpectrogram.hpp   |  97 +++
 .../api/use_case/ad/include/AdModel.hpp            |  55 ++
 .../api/use_case/ad/include/AdProcessing.hpp       | 231 ++++++
 .../api/use_case/ad/include/MelSpectrogram.hpp     | 234 ++++++
 .../api/use_case/ad/src/AdMelSpectrogram.cc        |  93 +++
 source/application/api/use_case/ad/src/AdModel.cc  |  41 +
 .../api/use_case/ad/src/AdProcessing.cc            | 210 +++++
 .../api/use_case/ad/src/MelSpectrogram.cc          | 316 ++++++++
 source/application/api/use_case/asr/CMakeLists.txt |  43 +
 .../api/use_case/asr/include/AsrClassifier.hpp     |  63 ++
 .../api/use_case/asr/include/AsrResult.hpp         |  63 ++
 .../api/use_case/asr/include/OutputDecode.hpp      |  40 +
 .../api/use_case/asr/include/Wav2LetterMfcc.hpp    | 109 +++
 .../api/use_case/asr/include/Wav2LetterModel.hpp   |  67 ++
 .../use_case/asr/include/Wav2LetterPostprocess.hpp | 109 +++
 .../use_case/asr/include/Wav2LetterPreprocess.hpp  | 182 +++++
 .../api/use_case/asr/src/AsrClassifier.cc          | 144 ++++
 .../api/use_case/asr/src/OutputDecode.cc           |  47 ++
 .../api/use_case/asr/src/Wav2LetterMfcc.cc         | 141 ++++
 .../api/use_case/asr/src/Wav2LetterModel.cc        |  42 +
 .../api/use_case/asr/src/Wav2LetterPostprocess.cc  | 214 +++++
 .../api/use_case/asr/src/Wav2LetterPreprocess.cc   | 208 +++++
 .../api/use_case/img_class/CMakeLists.txt          |  39 +
 .../img_class/include/ImgClassProcessing.hpp       |  91 +++
 .../use_case/img_class/include/MobileNetModel.hpp  |  51 ++
 .../use_case/img_class/src/ImgClassProcessing.cc   |  66 ++
 .../api/use_case/img_class/src/MobileNetModel.cc   |  42 +
 .../api/use_case/inference_runner/CMakeLists.txt   |  37 +
 .../inference_runner/include/TestModel.hpp         |  43 +
 .../api/use_case/inference_runner/src/TestModel.cc |  23 +
 source/application/api/use_case/kws/CMakeLists.txt |  39 +
 .../api/use_case/kws/include/KwsProcessing.hpp     | 137 ++++
 .../api/use_case/kws/include/KwsResult.hpp         |  63 ++
 .../api/use_case/kws/include/MicroNetKwsMfcc.hpp   |  50 ++
 .../api/use_case/kws/include/MicroNetKwsModel.hpp  |  63 ++
 .../api/use_case/kws/src/KwsProcessing.cc          | 211 +++++
 .../api/use_case/kws/src/MicroNetKwsModel.cc       |  42 +
 .../api/use_case/noise_reduction/CMakeLists.txt    |  40 +
 .../include/RNNoiseFeatureProcessor.hpp            | 341 ++++++++
 .../noise_reduction/include/RNNoiseModel.hpp       |  78 ++
 .../noise_reduction/include/RNNoiseProcessing.hpp  | 113 +++
 .../noise_reduction/src/RNNoiseFeatureProcessor.cc | 892 +++++++++++++++++++++
 .../use_case/noise_reduction/src/RNNoiseModel.cc   |  96 +++
 .../noise_reduction/src/RNNoiseProcessing.cc       | 100 +++
 .../api/use_case/object_detection/CMakeLists.txt   |  40 +
 .../object_detection/include/DetectionResult.hpp   |  61 ++
 .../include/DetectorPostProcessing.hpp             | 125 +++
 .../include/DetectorPreProcessing.hpp              |  60 ++
 .../object_detection/include/YoloFastestModel.hpp  |  56 ++
 .../object_detection/src/DetectorPostProcessing.cc | 240 ++++++
 .../object_detection/src/DetectorPreProcessing.cc  |  52 ++
 .../object_detection/src/YoloFastestModel.cc       |  45 ++
 source/application/api/use_case/vww/CMakeLists.txt |  39 +
 .../use_case/vww/include/VisualWakeWordModel.hpp   |  50 ++
 .../vww/include/VisualWakeWordProcessing.hpp       |  93 +++
 .../api/use_case/vww/src/VisualWakeWordModel.cc    |  42 +
 .../use_case/vww/src/VisualWakeWordProcessing.cc   |  80 ++
 58 files changed, 6430 insertions(+)
 create mode 100644 source/application/api/use_case/ad/CMakeLists.txt
 create mode 100644 source/application/api/use_case/ad/include/AdMelSpectrogram.hpp
 create mode 100644 source/application/api/use_case/ad/include/AdModel.hpp
 create mode 100644 source/application/api/use_case/ad/include/AdProcessing.hpp
 create mode 100644 source/application/api/use_case/ad/include/MelSpectrogram.hpp
 create mode 100644 source/application/api/use_case/ad/src/AdMelSpectrogram.cc
 create mode 100644 source/application/api/use_case/ad/src/AdModel.cc
 create mode 100644 source/application/api/use_case/ad/src/AdProcessing.cc
 create mode 100644 source/application/api/use_case/ad/src/MelSpectrogram.cc
 create mode 100644 source/application/api/use_case/asr/CMakeLists.txt
 create mode 100644 source/application/api/use_case/asr/include/AsrClassifier.hpp
 create mode 100644 source/application/api/use_case/asr/include/AsrResult.hpp
 create mode 100644 source/application/api/use_case/asr/include/OutputDecode.hpp
 create mode 100644 source/application/api/use_case/asr/include/Wav2LetterMfcc.hpp
 create mode 100644 source/application/api/use_case/asr/include/Wav2LetterModel.hpp
 create mode 100644 source/application/api/use_case/asr/include/Wav2LetterPostprocess.hpp
 create mode 100644 source/application/api/use_case/asr/include/Wav2LetterPreprocess.hpp
 create mode 100644 source/application/api/use_case/asr/src/AsrClassifier.cc
 create mode 100644 source/application/api/use_case/asr/src/OutputDecode.cc
 create mode 100644 source/application/api/use_case/asr/src/Wav2LetterMfcc.cc
 create mode 100644 source/application/api/use_case/asr/src/Wav2LetterModel.cc
 create mode 100644 source/application/api/use_case/asr/src/Wav2LetterPostprocess.cc
 create mode 100644 source/application/api/use_case/asr/src/Wav2LetterPreprocess.cc
 create mode 100644 source/application/api/use_case/img_class/CMakeLists.txt
 create mode 100644 source/application/api/use_case/img_class/include/ImgClassProcessing.hpp
 create mode 100644 source/application/api/use_case/img_class/include/MobileNetModel.hpp
 create mode 100644 source/application/api/use_case/img_class/src/ImgClassProcessing.cc
 create mode 100644 source/application/api/use_case/img_class/src/MobileNetModel.cc
 create mode 100644 source/application/api/use_case/inference_runner/CMakeLists.txt
 create mode 100644 source/application/api/use_case/inference_runner/include/TestModel.hpp
 create mode 100644 source/application/api/use_case/inference_runner/src/TestModel.cc
 create mode 100644 source/application/api/use_case/kws/CMakeLists.txt
 create mode 100644 source/application/api/use_case/kws/include/KwsProcessing.hpp
 create mode 100644 source/application/api/use_case/kws/include/KwsResult.hpp
 create mode 100644 source/application/api/use_case/kws/include/MicroNetKwsMfcc.hpp
 create mode 100644 source/application/api/use_case/kws/include/MicroNetKwsModel.hpp
 create mode 100644 source/application/api/use_case/kws/src/KwsProcessing.cc
 create mode 100644 source/application/api/use_case/kws/src/MicroNetKwsModel.cc
 create mode 100644 source/application/api/use_case/noise_reduction/CMakeLists.txt
 create mode 100644 source/application/api/use_case/noise_reduction/include/RNNoiseFeatureProcessor.hpp
 create mode 100644 source/application/api/use_case/noise_reduction/include/RNNoiseModel.hpp
 create mode 100644 source/application/api/use_case/noise_reduction/include/RNNoiseProcessing.hpp
 create mode 100644 source/application/api/use_case/noise_reduction/src/RNNoiseFeatureProcessor.cc
 create mode 100644 source/application/api/use_case/noise_reduction/src/RNNoiseModel.cc
 create mode 100644 source/application/api/use_case/noise_reduction/src/RNNoiseProcessing.cc
 create mode 100644 source/application/api/use_case/object_detection/CMakeLists.txt
 create mode 100644 source/application/api/use_case/object_detection/include/DetectionResult.hpp
 create mode 100644 source/application/api/use_case/object_detection/include/DetectorPostProcessing.hpp
 create mode 100644 source/application/api/use_case/object_detection/include/DetectorPreProcessing.hpp
 create mode 100644 source/application/api/use_case/object_detection/include/YoloFastestModel.hpp
 create mode 100644 source/application/api/use_case/object_detection/src/DetectorPostProcessing.cc
 create mode 100644 source/application/api/use_case/object_detection/src/DetectorPreProcessing.cc
 create mode 100644 source/application/api/use_case/object_detection/src/YoloFastestModel.cc
 create mode 100644 source/application/api/use_case/vww/CMakeLists.txt
 create mode 100644 source/application/api/use_case/vww/include/VisualWakeWordModel.hpp
 create mode 100644 source/application/api/use_case/vww/include/VisualWakeWordProcessing.hpp
 create mode 100644 source/application/api/use_case/vww/src/VisualWakeWordModel.cc
 create mode 100644 source/application/api/use_case/vww/src/VisualWakeWordProcessing.cc

diff --git a/source/application/api/use_case/ad/CMakeLists.txt b/source/application/api/use_case/ad/CMakeLists.txt
new file mode 100644
index 0000000..224816f
--- /dev/null
+++ b/source/application/api/use_case/ad/CMakeLists.txt
@@ -0,0 +1,41 @@
+#----------------------------------------------------------------------------
+# Copyright (c) 2022 Arm Limited. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#----------------------------------------------------------------------------
+#########################################################
+#           ANOMALY DETECTION API library               #
+#########################################################
+cmake_minimum_required(VERSION 3.15.6)
+
+set(AD_API_TARGET ad_api)
+project(${AD_API_TARGET}
+        DESCRIPTION "Anomaly detection use case API library"
+        LANGUAGES   C CXX)
+
+# Create static library
+add_library(${AD_API_TARGET} STATIC
+    src/AdModel.cc
+    src/AdProcessing.cc
+    src/AdMelSpectrogram.cc
+    src/MelSpectrogram.cc)
+
+target_include_directories(${AD_API_TARGET} PUBLIC include)
+
+target_link_libraries(${AD_API_TARGET} PUBLIC common_api)
+
+message(STATUS "*******************************************************")
+message(STATUS "Library                : " ${AD_API_TARGET})
+message(STATUS "CMAKE_SYSTEM_PROCESSOR : " ${CMAKE_SYSTEM_PROCESSOR})
+message(STATUS "*******************************************************")
diff --git a/source/application/api/use_case/ad/include/AdMelSpectrogram.hpp b/source/application/api/use_case/ad/include/AdMelSpectrogram.hpp
new file mode 100644
index 0000000..05c5bfc
--- /dev/null
+++ b/source/application/api/use_case/ad/include/AdMelSpectrogram.hpp
@@ -0,0 +1,97 @@
+/*
+ * Copyright (c) 2021 Arm Limited. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef ADMELSPECTROGRAM_HPP
+#define ADMELSPECTROGRAM_HPP
+
+#include "MelSpectrogram.hpp"
+
+namespace arm {
+namespace app {
+namespace audio {
+
+    /* Class to provide anomaly detection specific Mel Spectrogram calculation requirements */
+    class AdMelSpectrogram : public MelSpectrogram {
+
+    public:
+        static constexpr uint32_t ms_defaultSamplingFreq = 16000;
+        static constexpr uint32_t ms_defaultNumFbankBins =    64;
+        static constexpr uint32_t ms_defaultMelLoFreq    =     0;
+        static constexpr uint32_t ms_defaultMelHiFreq    =  8000;
+        static constexpr bool     ms_defaultUseHtkMethod = false;
+
+        explicit AdMelSpectrogram(const size_t frameLen)
+            : MelSpectrogram(MelSpecParams(
+                  ms_defaultSamplingFreq, ms_defaultNumFbankBins,
+                  ms_defaultMelLoFreq, ms_defaultMelHiFreq,
+                  frameLen, ms_defaultUseHtkMethod))
+        {}
+
+        AdMelSpectrogram()  = delete;
+        ~AdMelSpectrogram() = default;
+
+    protected:
+
+        /**
+         * @brief       Overrides base class implementation of this function.
+         * @param[in]   fftVec                 Vector populated with FFT magnitudes
+         * @param[in]   melFilterBank          2D Vector with filter bank weights
+         * @param[in]   filterBankFilterFirst  Vector containing the first indices of filter bank
+         *                                     to be used for each bin.
+         * @param[in]   filterBankFilterLast   Vector containing the last indices of filter bank
+         *                                     to be used for each bin.
+         * @param[out]  melEnergies            Pre-allocated vector of MEL energies to be
+         *                                     populated.
+         * @return      true if successful, false otherwise
+         */
+        virtual bool ApplyMelFilterBank(
+                std::vector<float>& fftVec,
+                std::vector<std::vector<float>>& melFilterBank,
+                std::vector<uint32_t>& filterBankFilterFirst,
+                std::vector<uint32_t>& filterBankFilterLast,
+                std::vector<float>& melEnergies) override;
+
+        /**
+         * @brief       Override for the base class implementation to convert mel
+         *              energies to logarithmic scale. The difference from
+         *              default behaviour is that the power is converted to dB
+         *              and subsequently clamped.
+         * @param[in,out]   melEnergies - 1D vector of Mel energies
+         **/
+        virtual void ConvertToLogarithmicScale(std::vector<float>& melEnergies) override;
+
+        /**
+         * @brief       Given the low and high Mel values, get the normaliser
+         *              for weights to be applied when populating the filter
+         *              bank. Override for the base class implementation.
+         * @param[in]   leftMel      - low Mel frequency value
+         * @param[in]   rightMel     - high Mel frequency value
+         * @param[in]   useHTKMethod - bool to signal if HTK method is to be
+         *                             used for calculation
+         * @return      Return float value to be applied
+         *              when populating the filter bank.
+         */
+        virtual float GetMelFilterBankNormaliser(
+                const float& leftMel,
+                const float& rightMel,
+                const bool   useHTKMethod) override;
+    };
+
+} /* namespace audio */
+} /* namespace app */
+} /* namespace arm */
+
+#endif /* ADMELSPECTROGRAM_HPP */
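A minimal usage sketch for the class above, assuming a 1024-sample frame; the frame length and the zero training mean here are illustrative choices, not values fixed by this API:

    #include "AdMelSpectrogram.hpp"

    #include <cstdint>
    #include <vector>

    void ExampleAdMelSpec()
    {
        constexpr size_t frameLen = 1024;        /* Illustrative frame length. */
        arm::app::audio::AdMelSpectrogram melSpec(frameLen);
        melSpec.Init();                          /* Builds the mel filter bank. */

        std::vector<int16_t> frame(frameLen, 0); /* One frame of audio samples. */
        /* 64 mel energies (ms_defaultNumFbankBins), converted to dB and mean-subtracted. */
        std::vector<float> energies = melSpec.ComputeMelSpec(frame, 0.f);
    }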
diff --git a/source/application/api/use_case/ad/include/AdModel.hpp b/source/application/api/use_case/ad/include/AdModel.hpp
new file mode 100644
index 0000000..0436a89
--- /dev/null
+++ b/source/application/api/use_case/ad/include/AdModel.hpp
@@ -0,0 +1,55 @@
+/*
+ * Copyright (c) 2021-2022 Arm Limited. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef AD_MODEL_HPP
+#define AD_MODEL_HPP
+
+#include "Model.hpp"
+
+extern const int g_FrameLength;
+extern const int g_FrameStride;
+extern const float g_ScoreThreshold;
+extern const float g_TrainingMean;
+
+namespace arm {
+namespace app {
+
+    class AdModel : public Model {
+
+    public:
+        /* Indices for the expected model - based on input tensor shape */
+        static constexpr uint32_t ms_inputRowsIdx = 1;
+        static constexpr uint32_t ms_inputColsIdx = 2;
+
+    protected:
+        /** @brief   Gets the reference to op resolver interface class */
+        const tflite::MicroOpResolver& GetOpResolver() override;
+
+        /** @brief   Adds operations to the op resolver instance */
+        bool EnlistOperations() override;
+
+    private:
+        /* Maximum number of individual operations that can be enlisted */
+        static constexpr int ms_maxOpCnt = 6;
+
+        /* A mutable op resolver instance */
+        tflite::MicroMutableOpResolver<ms_maxOpCnt> m_opResolver;
+    };
+
+} /* namespace app */
+} /* namespace arm */
+
+#endif /* AD_MODEL_HPP */
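The ms_maxOpCnt constant above sizes the op resolver: the template parameter of tflite::MicroMutableOpResolver is the maximum number of operator registrations it can hold, so it must be at least the number of Add*() calls made in EnlistOperations() (six here, matching the five builtin ops plus the Ethos-U custom op registered in AdModel.cc further down). A sketch of the same pattern for a hypothetical two-operator model:

    class MyModel : public arm::app::Model {
    protected:
        const tflite::MicroOpResolver& GetOpResolver() override { return this->m_opResolver; }

        bool EnlistOperations() override
        {
            /* Each Add*() call consumes one of the ms_maxOpCnt slots. */
            this->m_opResolver.AddConv2D();
            this->m_opResolver.AddReshape();
            return true;
        }

    private:
        static constexpr int ms_maxOpCnt = 2; /* Must cover every Add*() call above. */
        tflite::MicroMutableOpResolver<ms_maxOpCnt> m_opResolver;
    };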
diff --git a/source/application/api/use_case/ad/include/AdProcessing.hpp b/source/application/api/use_case/ad/include/AdProcessing.hpp
new file mode 100644
index 0000000..abee75e
--- /dev/null
+++ b/source/application/api/use_case/ad/include/AdProcessing.hpp
@@ -0,0 +1,231 @@
+/*
+ * Copyright (c) 2022 Arm Limited. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef AD_PROCESSING_HPP
+#define AD_PROCESSING_HPP
+
+#include "BaseProcessing.hpp"
+#include "TensorFlowLiteMicro.hpp"
+#include "AudioUtils.hpp"
+#include "AdMelSpectrogram.hpp"
+#include "log_macros.h"
+
+namespace arm {
+namespace app {
+
+    /**
+     * @brief   Pre-processing class for anomaly detection use case.
+     *          Implements methods declared by BasePreProcess and anything else needed
+     *          to populate input tensors ready for inference.
+     */
+    class AdPreProcess : public BasePreProcess {
+
+    public:
+        /**
+         * @brief Constructor for AdPreProcess class objects
+         * @param[in] inputTensor                input tensor pointer from the tensor arena.
+         * @param[in] melSpectrogramFrameLen     MEL spectrogram's frame length
+         * @param[in] melSpectrogramFrameStride  MEL spectrogram's frame stride
+         * @param[in] adModelTrainingMean        Training mean for the Anomaly detection model being used.
+         */
+        explicit AdPreProcess(TfLiteTensor* inputTensor,
+                              uint32_t melSpectrogramFrameLen,
+                              uint32_t melSpectrogramFrameStride,
+                              float adModelTrainingMean);
+
+        ~AdPreProcess() = default;
+
+        /**
+         * @brief Function to invoke pre-processing and populate the input vector
+         * @param input      pointer to input data. For anomaly detection, this is the pointer to
+         *                   the audio data.
+         * @param inputSize  Size of the data being passed in for pre-processing.
+         * @return True if successful, false otherwise.
+         */
+        bool DoPreProcess(const void* input, size_t inputSize) override;
+
+        /**
+         * @brief Getter function for audio window size computed when constructing
+         *        the class object.
+         * @return Audio window size as 32 bit unsigned integer.
+         */
+        uint32_t GetAudioWindowSize();
+
+        /**
+         * @brief Getter function for audio window stride computed when constructing
+         *        the class object.
+         * @return Audio window stride as 32 bit unsigned integer.
+         */
+        uint32_t GetAudioDataStride();
+
+        /**
+         * @brief Setter function for current audio index. This is only used for evaluating
+         *        if previously computed features can be re-used from cache.
+         */
+        void SetAudioWindowIndex(uint32_t idx);
+
+    private:
+        bool     m_validInstance{false};             /**< Indicates the current object is valid. */
+        uint32_t m_melSpectrogramFrameLen{};         /**< MEL spectrogram's window frame length */
+        uint32_t m_melSpectrogramFrameStride{};      /**< MEL spectrogram's window frame stride */
+        uint8_t  m_inputResizeScale{};               /**< Downscaling factor for the MEL energy matrix. */
+        uint32_t m_numMelSpecVectorsInAudioStride{}; /**< Number of frames to move across the audio. */
+        uint32_t m_audioDataWindowSize{};            /**< Audio window size computed based on other parameters. */
+        uint32_t m_audioDataStride{};                /**< Audio window stride computed. */
+        uint32_t m_numReusedFeatureVectors{};        /**< Number of MEL vectors that can be re-used */
+        uint32_t m_audioWindowIndex{};               /**< Current audio window index (from audio's sliding window) */
+
+        audio::SlidingWindow<const int16_t> m_melWindowSlider; /**< Internal MEL spectrogram window slider */
+        audio::AdMelSpectrogram m_melSpec;                     /**< MEL spectrogram computation object */
+        std::function<void (std::vector<int16_t>&, int, bool, size_t, size_t)> m_featureCalc; /**< Feature calculator object */
+    };
+
+    class AdPostProcess : public BasePostProcess {
+    public:
+        /**
+         * @brief Constructor for AdPostProcess object.
+         * @param[in] outputTensor Output tensor pointer.
+         */
+        explicit AdPostProcess(TfLiteTensor* outputTensor);
+
+        ~AdPostProcess() = default;
+
+        /**
+         * @brief Function to do the post-processing on the output tensor.
+         * @return True if successful, false otherwise.
+         */
+        bool DoPostProcess() override;
+
+        /**
+         * @brief Getter function for an element from the de-quantised output vector.
+         * @param index Index of the element to be retrieved.
+         * @return Value at the given index, as a 32 bit floating point number.
+         */
+        float GetOutputValue(uint32_t index);
+
+    private:
+        TfLiteTensor* m_outputTensor{};              /**< Output tensor pointer */
+        std::vector<float> m_dequantizedOutputVec{}; /**< Internal output vector */
+
+        /**
+         * @brief De-quantizes and flattens the output tensor into a vector.
+         * @tparam T template parameter to indicate data type.
+         * @return True if successful, false otherwise.
+         */
+        template <typename T>
+        bool Dequantize()
+        {
+            TfLiteTensor* tensor = this->m_outputTensor;
+            if (tensor == nullptr) {
+                printf_err("Invalid output tensor.\n");
+                return false;
+            }
+            T* tensorData = tflite::GetTensorData<T>(tensor);
+
+            uint32_t totalOutputSize = 1;
+            for (int inputDim = 0; inputDim < tensor->dims->size; inputDim++){
+                totalOutputSize *= tensor->dims->data[inputDim];
+            }
+
+            /* For getting the floating point values, we need quantization parameters */
+            QuantParams quantParams = GetTensorQuantParams(tensor);
+
+            this->m_dequantizedOutputVec = std::vector<float>(totalOutputSize, 0);
+
+            for (size_t i = 0; i < totalOutputSize; ++i) {
+                this->m_dequantizedOutputVec[i] = quantParams.scale * (tensorData[i] - quantParams.offset);
+            }
+
+            return true;
+        }
+    };
+
+    /* Templated instances available: */
+    template bool AdPostProcess::Dequantize<float>();
+
+    /**
+     * @brief Generic feature calculator factory.
+     *
+     *        Returns lambda function to compute features using features cache.
+     *        Real features math is done by a lambda function provided as a parameter.
+     *        Features are written to input tensor memory.
+     *
+     * @tparam T            feature vector type.
+     * @param inputTensor   model input tensor pointer.
+     * @param cacheSize     number of feature vectors to cache. Defined by the sliding window overlap.
+     * @param compute       features calculator function.
+     * @return              lambda function to compute features.
+     */
+    template <class T>
+    std::function<void (std::vector<int16_t>&, size_t, bool, size_t, size_t)>
+    FeatureCalc(TfLiteTensor* inputTensor, size_t cacheSize,
+                std::function<std::vector<T> (std::vector<int16_t>&)> compute)
+    {
+        /* Feature cache to be captured by lambda function */
+        static std::vector<std::vector<T>> featureCache = std::vector<std::vector<T>>(cacheSize);
+
+        return [=](std::vector<int16_t>& audioDataWindow,
+                   size_t index,
+                   bool useCache,
+                   size_t featuresOverlapIndex,
+                   size_t resizeScale)
+        {
+            T* tensorData = tflite::GetTensorData<T>(inputTensor);
+            std::vector<T> features;
+
+            /* Reuse features from cache if cache is ready and sliding windows overlap.
+             * Overlap is in the beginning of sliding window with a size of a feature cache. */
+            if (useCache && index < featureCache.size()) {
+                features = std::move(featureCache[index]);
+            } else {
+                features = std::move(compute(audioDataWindow));
+            }
+            auto size = features.size() / resizeScale;
+            auto sizeBytes = sizeof(T);
+
+            /* Input should be transposed and "resized" by skipping elements. */
+            for (size_t outIndex = 0; outIndex < size; outIndex++) {
+                std::memcpy(tensorData + (outIndex*size) + index, &features[outIndex*resizeScale], sizeBytes);
+            }
+
+            /* Start renewing cache as soon as iteration goes out of the windows' overlap. */
+            if (index >= featuresOverlapIndex / resizeScale) {
+                featureCache[index - featuresOverlapIndex / resizeScale] = std::move(features);
+            }
+        };
+    }
+
+    template std::function<void (std::vector<int16_t>&, size_t, bool, size_t, size_t)>
+    FeatureCalc<int8_t>(TfLiteTensor* inputTensor,
+                        size_t cacheSize,
+                        std::function<std::vector<int8_t> (std::vector<int16_t>&)> compute);
+
+    template std::function<void (std::vector<int16_t>&, size_t, bool, size_t, size_t)>
+    FeatureCalc<float>(TfLiteTensor* inputTensor,
+                       size_t cacheSize,
+                       std::function<std::vector<float> (std::vector<int16_t>&)> compute);
+
+    std::function<void (std::vector<int16_t>&, int, bool, size_t, size_t)>
+    GetFeatureCalculator(audio::AdMelSpectrogram& melSpec,
+                         TfLiteTensor* inputTensor,
+                         size_t cacheSize,
+                         float trainingMean);
+
+} /* namespace app */
+} /* namespace arm */
+
+#endif /* AD_PROCESSING_HPP */
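To illustrate the caching contract of FeatureCalc above: the returned lambda is invoked once per MEL frame index inside an audio window; for indices below cacheSize it can re-use feature vectors cached from the previous window's overlapping region instead of recomputing them. A sketch with a stand-in compute function (the identity promotion to float, the 640-sample frame and the argument values are assumptions for illustration):

    void ExampleFeatureCalc(TfLiteTensor* inputTensor) /* A valid input tensor is assumed. */
    {
        auto featureCalc = arm::app::FeatureCalc<float>(
            inputTensor,
            /* cacheSize */ 10,
            [](std::vector<int16_t>& window) {
                /* Stand-in feature math: promote samples to float. */
                return std::vector<float>(window.begin(), window.end());
            });

        std::vector<int16_t> frame(640, 0);
        /* Arguments: data, frame index, use cache?, overlap start index, resize scale. */
        featureCalc(frame, 0, false, 20, 2);
    }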
diff --git a/source/application/api/use_case/ad/include/MelSpectrogram.hpp b/source/application/api/use_case/ad/include/MelSpectrogram.hpp
new file mode 100644
index 0000000..d3ea3f7
--- /dev/null
+++ b/source/application/api/use_case/ad/include/MelSpectrogram.hpp
@@ -0,0 +1,234 @@
+/*
+ * Copyright (c) 2021 Arm Limited. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef MELSPECTROGRAM_HPP
+#define MELSPECTROGRAM_HPP
+
+#include "PlatformMath.hpp"
+
+#include <vector>
+#include <cstdint>
+#include <cmath>
+#include <limits>
+#include <string>
+
+namespace arm {
+namespace app {
+namespace audio {
+
+    /* Mel Spectrogram consolidated parameters */
+    class MelSpecParams {
+    public:
+        float    m_samplingFreq;
+        uint32_t m_numFbankBins;
+        float    m_melLoFreq;
+        float    m_melHiFreq;
+        uint32_t m_frameLen;
+        uint32_t m_frameLenPadded;
+        bool     m_useHtkMethod;
+
+        /** @brief  Constructor */
+        MelSpecParams(const float samplingFreq, const uint32_t numFbankBins,
+                      const float melLoFreq, const float melHiFreq,
+                      const uint32_t frameLen, const bool useHtkMethod);
+
+        MelSpecParams()  = delete;
+        ~MelSpecParams() = default;
+
+        /** @brief  String representation of parameters */
+        std::string Str() const;
+    };
+
+    /**
+     * @brief   Class for Mel Spectrogram feature extraction.
+     *          Based on https://github.com/ARM-software/ML-KWS-for-MCU/blob/master/Deployment/Source/MFCC/mfcc.cpp
+     *          This class is designed to be generic and self-sufficient but
+     *          certain calculation routines can be overridden to accommodate
+     *          use-case specific requirements.
+     */
+    class MelSpectrogram {
+
+    public:
+        /**
+         * @brief       Extract Mel Spectrogram for one single small frame of
+         *              audio data e.g. 640 samples.
+         * @param[in]   audioData     Vector of audio samples to calculate
+         *                            features for.
+         * @param[in]   trainingMean  Value to subtract from the computed mel spectrogram, default 0.
+         * @return      Vector of extracted Mel Spectrogram features.
+         **/
+        std::vector<float> ComputeMelSpec(const std::vector<int16_t>& audioData, float trainingMean = 0);
+
+        /**
+         * @brief       Constructor
+         * @param[in]   params  Mel Spectrogram parameters
+         */
+        explicit MelSpectrogram(const MelSpecParams& params);
+
+        MelSpectrogram() = delete;
+        ~MelSpectrogram() = default;
+
+        /** @brief  Initialise */
+        void Init();
+
+        /**
+         * @brief       Extract Mel Spectrogram features and quantise for one single small
+         *              frame of audio data e.g. 640 samples.
+         * @param[in]   audioData     Vector of audio samples to calculate
+         *                            features for.
+         * @param[in]   quantScale    quantisation scale.
+         * @param[in]   quantOffset   quantisation offset.
+         * @param[in]   trainingMean  training mean.
+         * @return      Vector of extracted quantised Mel Spectrogram features.
+         **/
+        template<typename T>
+        std::vector<T> MelSpecComputeQuant(const std::vector<int16_t>& audioData,
+                                           const float quantScale,
+                                           const int quantOffset,
+                                           float trainingMean = 0)
+        {
+            this->ComputeMelSpec(audioData, trainingMean);
+            float minVal = std::numeric_limits<T>::min();
+            float maxVal = std::numeric_limits<T>::max();
+
+            std::vector<T> melSpecOut(this->m_params.m_numFbankBins);
+            const size_t numFbankBins = this->m_params.m_numFbankBins;
+
+            /* Quantize to T. */
+            for (size_t k = 0; k < numFbankBins; ++k) {
+                auto quantizedEnergy = std::round(((this->m_melEnergies[k]) / quantScale) + quantOffset);
+                melSpecOut[k] = static_cast<T>(std::min(std::max(quantizedEnergy, minVal), maxVal));
+            }
+
+            return melSpecOut;
+        }
+
+        /* Constants */
+        static constexpr float ms_logStep = /*logf(6.4)*/ 1.8562979903656 / 27.0;
+        static constexpr float ms_freqStep = 200.0 / 3;
+        static constexpr float ms_minLogHz = 1000.0;
+        static constexpr float ms_minLogMel = ms_minLogHz / ms_freqStep;
+
+    protected:
+        /**
+         * @brief       Project input frequency to Mel Scale.
+         * @param[in]   freq          input frequency in floating point
+         * @param[in]   useHTKMethod  bool to signal if HTK method is to be
+         *                            used for calculation
+         * @return      Mel transformed frequency in floating point
+         **/
+        static float MelScale(const float freq,
+                              const bool useHTKMethod = true);
+
+        /**
+         * @brief       Inverse Mel transform - convert MEL warped frequency
+         *              back to normal frequency
+         * @param[in]   melFreq       Mel frequency in floating point
+         * @param[in]   useHTKMethod  bool to signal if HTK method is to be
+         *                            used for calculation
+         * @return      Real world frequency in floating point
+         **/
+        static float InverseMelScale(const float melFreq,
+                                     const bool useHTKMethod = true);
+
+        /**
+         * @brief       Populates MEL energies after applying the MEL filter
+         *              bank weights and adding them up to be placed into
+         *              bins, according to the filter bank's first and last
+         *              indices (pre-computed for each filter bank element
+         *              by CreateMelFilterBank function).
+         * @param[in]   fftVec                 Vector populated with FFT magnitudes
+         * @param[in]   melFilterBank          2D Vector with filter bank weights
+         * @param[in]   filterBankFilterFirst  Vector containing the first indices of filter bank
+         *                                     to be used for each bin.
+         * @param[in]   filterBankFilterLast   Vector containing the last indices of filter bank
+         *                                     to be used for each bin.
+         * @param[out]  melEnergies            Pre-allocated vector of MEL energies to be
+         *                                     populated.
+         * @return      true if successful, false otherwise
+         */
+        virtual bool ApplyMelFilterBank(
+                std::vector<float>& fftVec,
+                std::vector<std::vector<float>>& melFilterBank,
+                std::vector<uint32_t>& filterBankFilterFirst,
+                std::vector<uint32_t>& filterBankFilterLast,
+                std::vector<float>& melEnergies);
+
+        /**
+         * @brief           Converts the Mel energies to logarithmic scale
+         * @param[in,out]   melEnergies  1D vector of Mel energies
+         **/
+        virtual void ConvertToLogarithmicScale(std::vector<float>& melEnergies);
+
+        /**
+         * @brief       Given the low and high Mel values, get the normaliser
+         *              for weights to be applied when populating the filter
+         *              bank.
+         * @param[in]   leftMel       low Mel frequency value
+         * @param[in]   rightMel      high Mel frequency value
+         * @param[in]   useHTKMethod  bool to signal if HTK method is to be
+         *                            used for calculation
+         * @return      Return float value to be applied
+         *              when populating the filter bank.
+         */
+        virtual float GetMelFilterBankNormaliser(
+                const float& leftMel,
+                const float& rightMel,
+                const bool useHTKMethod);
+
+    private:
+        MelSpecParams                   m_params;
+        std::vector<float>              m_frame;
+        std::vector<float>              m_buffer;
+        std::vector<float>              m_melEnergies;
+        std::vector<float>              m_windowFunc;
+        std::vector<std::vector<float>> m_melFilterBank;
+        std::vector<uint32_t>           m_filterBankFilterFirst;
+        std::vector<uint32_t>           m_filterBankFilterLast;
+        bool                            m_filterBankInitialised;
+        arm::app::math::FftInstance     m_fftInstance;
+
+        /**
+         * @brief       Initialises the filter banks.
+         **/
+        void InitMelFilterBank();
+
+        /**
+         * @brief       Signals whether the instance of MelSpectrogram has had its
+         *              required buffers initialised
+         * @return      True if initialised, false otherwise
+         **/
+        bool IsMelFilterBankInited() const;
+
+        /**
+         * @brief       Create mel filter banks for Mel Spectrogram calculation.
+         * @return      2D vector of floats
+         **/
+        std::vector<std::vector<float>> CreateMelFilterBank();
+
+        /**
+         * @brief       Computes the magnitude from an interleaved complex array
+         **/
+        void ConvertToPowerSpectrum();
+
+    };
+
+} /* namespace audio */
+} /* namespace app */
+} /* namespace arm */
+
+
+#endif /* MELSPECTROGRAM_HPP */
diff --git a/source/application/api/use_case/ad/src/AdMelSpectrogram.cc b/source/application/api/use_case/ad/src/AdMelSpectrogram.cc
new file mode 100644
index 0000000..14b9323
--- /dev/null
+++ b/source/application/api/use_case/ad/src/AdMelSpectrogram.cc
@@ -0,0 +1,93 @@
+/*
+ * Copyright (c) 2021 Arm Limited. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "AdMelSpectrogram.hpp"
+#include "PlatformMath.hpp"
+#include "log_macros.h"
+
+#include <cfloat>
+
+namespace arm {
+namespace app {
+namespace audio {
+
+    bool AdMelSpectrogram::ApplyMelFilterBank(
+            std::vector<float>& fftVec,
+            std::vector<std::vector<float>>& melFilterBank,
+            std::vector<uint32_t>& filterBankFilterFirst,
+            std::vector<uint32_t>& filterBankFilterLast,
+            std::vector<float>& melEnergies)
+    {
+        const size_t numBanks = melEnergies.size();
+
+        if (numBanks != filterBankFilterFirst.size() ||
+            numBanks != filterBankFilterLast.size()) {
+            printf_err("unexpected filter bank lengths\n");
+            return false;
+        }
+
+        for (size_t bin = 0; bin < numBanks; ++bin) {
+            auto filterBankIter = melFilterBank[bin].begin();
+            auto end = melFilterBank[bin].end();
+            float melEnergy = FLT_MIN; /* Avoid log of zero at later stages. */
+            const uint32_t firstIndex = filterBankFilterFirst[bin];
+            const uint32_t lastIndex = std::min<uint32_t>(filterBankFilterLast[bin], fftVec.size() - 1);
+
+            for (uint32_t i = firstIndex; i <= lastIndex && filterBankIter != end; ++i) {
+                melEnergy += (*filterBankIter++ * fftVec[i]);
+            }
+
+            melEnergies[bin] = melEnergy;
+        }
+
+        return true;
+    }
+
+    void AdMelSpectrogram::ConvertToLogarithmicScale(
+            std::vector<float>& melEnergies)
+    {
+        /* Container for natural logarithms of mel energies */
+        std::vector<float> vecLogEnergies(melEnergies.size(), 0.f);
+
+        /* Because we are taking natural logs, we need to multiply by log10(e).
+         * Also, for wav2letter model, we scale our log10 values by 10 */
+        constexpr float multiplier = 10.0 * /* default scalar */
+                                     0.4342944819032518; /* log10f(std::exp(1.0)) */
+
+        /* Take log of the whole vector */
+        math::MathUtils::VecLogarithmF32(melEnergies, vecLogEnergies);
+
+        /* Scale the log values. */
+        for (auto iterM = melEnergies.begin(), iterL = vecLogEnergies.begin();
+             iterM != melEnergies.end() && iterL != vecLogEnergies.end(); ++iterM, ++iterL) {
+
+            *iterM = *iterL * multiplier;
+        }
+    }
+
+    float AdMelSpectrogram::GetMelFilterBankNormaliser(
+            const float& leftMel,
+            const float& rightMel,
+            const bool useHTKMethod)
+    {
+        /* Slaney normalization for mel weights.
+         */
+        return (2.0f / (AdMelSpectrogram::InverseMelScale(rightMel, useHTKMethod) -
+                        AdMelSpectrogram::InverseMelScale(leftMel, useHTKMethod)));
+    }
+
+} /* namespace audio */
+} /* namespace app */
+} /* namespace arm */
diff --git a/source/application/api/use_case/ad/src/AdModel.cc b/source/application/api/use_case/ad/src/AdModel.cc
new file mode 100644
index 0000000..961c260
--- /dev/null
+++ b/source/application/api/use_case/ad/src/AdModel.cc
@@ -0,0 +1,41 @@
+/*
+ * Copyright (c) 2021 Arm Limited. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "AdModel.hpp"
+#include "log_macros.h"
+
+const tflite::MicroOpResolver& arm::app::AdModel::GetOpResolver()
+{
+    return this->m_opResolver;
+}
+
+bool arm::app::AdModel::EnlistOperations()
+{
+    this->m_opResolver.AddAveragePool2D();
+    this->m_opResolver.AddConv2D();
+    this->m_opResolver.AddDepthwiseConv2D();
+    this->m_opResolver.AddRelu6();
+    this->m_opResolver.AddReshape();
+
+    if (kTfLiteOk == this->m_opResolver.AddEthosU()) {
+        info("Added %s support to op resolver\n",
+            tflite::GetString_ETHOSU());
+    } else {
+        printf_err("Failed to add Arm NPU support to op resolver.");
+        return false;
+    }
+    return true;
+}
diff --git a/source/application/api/use_case/ad/src/AdProcessing.cc b/source/application/api/use_case/ad/src/AdProcessing.cc
new file mode 100644
index 0000000..fb26a83
--- /dev/null
+++ b/source/application/api/use_case/ad/src/AdProcessing.cc
@@ -0,0 +1,210 @@
+/*
+ * Copyright (c) 2022 Arm Limited. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "AdProcessing.hpp"
+
+#include "AdModel.hpp"
+
+namespace arm {
+namespace app {
+
+AdPreProcess::AdPreProcess(TfLiteTensor* inputTensor,
+                           uint32_t melSpectrogramFrameLen,
+                           uint32_t melSpectrogramFrameStride,
+                           float adModelTrainingMean):
+    m_validInstance{false},
+    m_melSpectrogramFrameLen{melSpectrogramFrameLen},
+    m_melSpectrogramFrameStride{melSpectrogramFrameStride},
+    /**< Model is trained on features downsampled 2x */
+    m_inputResizeScale{2},
+    /**< We are choosing to move by 20 frames across the audio for each inference.
+     */
+    m_numMelSpecVectorsInAudioStride{20},
+    m_audioDataStride{m_numMelSpecVectorsInAudioStride * melSpectrogramFrameStride},
+    m_melSpec{melSpectrogramFrameLen}
+{
+    UNUSED(this->m_melSpectrogramFrameStride);
+
+    if (!inputTensor) {
+        printf_err("Invalid input tensor provided to pre-process\n");
+        return;
+    }
+
+    TfLiteIntArray* inputShape = inputTensor->dims;
+
+    if (!inputShape) {
+        printf_err("Invalid input tensor dims\n");
+        return;
+    }
+
+    const uint32_t kNumRows = inputShape->data[AdModel::ms_inputRowsIdx];
+    const uint32_t kNumCols = inputShape->data[AdModel::ms_inputColsIdx];
+
+    /* Deduce the data length required for 1 inference from the network parameters. */
+    this->m_audioDataWindowSize = (((this->m_inputResizeScale * kNumCols) - 1) *
+                                    melSpectrogramFrameStride) +
+                                    melSpectrogramFrameLen;
+    this->m_numReusedFeatureVectors = kNumRows -
+                                      (this->m_numMelSpecVectorsInAudioStride /
+                                       this->m_inputResizeScale);
+    this->m_melSpec.Init();
+
+    /* Creating a Mel Spectrogram sliding window for the data required for 1 inference.
+     * "resizing" done here by multiplying stride by resize scale. */
+    this->m_melWindowSlider = audio::SlidingWindow<const int16_t>(
+            nullptr, /* to be populated later. */
+            this->m_audioDataWindowSize,
+            melSpectrogramFrameLen,
+            melSpectrogramFrameStride * this->m_inputResizeScale);
+
+    /* Construct feature calculation function. */
+    this->m_featureCalc = GetFeatureCalculator(this->m_melSpec, inputTensor,
+                                               this->m_numReusedFeatureVectors,
+                                               adModelTrainingMean);
+    this->m_validInstance = true;
+}
+
+bool AdPreProcess::DoPreProcess(const void* input, size_t inputSize)
+{
+    /* Check that we have a valid instance. */
+    if (!this->m_validInstance) {
+        printf_err("Invalid pre-processor instance\n");
+        return false;
+    }
+
+    /* We expect to be able to traverse the size with which the MEL spectrogram
+     * sliding window was initialised. */
+    if (!input || inputSize < this->m_audioDataWindowSize) {
+        printf_err("Invalid input provided for pre-processing\n");
+        return false;
+    }
+
+    /* We moved to the next window - set the features sliding window to the new address. */
+    this->m_melWindowSlider.Reset(static_cast<const int16_t*>(input));
+
+    /* The first window does not have cache ready. */
+    const bool useCache = this->m_audioWindowIndex > 0 && this->m_numReusedFeatureVectors > 0;
+
+    /* Start calculating features inside one audio sliding window. */
+    while (this->m_melWindowSlider.HasNext()) {
+        const int16_t* melSpecWindow = this->m_melWindowSlider.Next();
+        std::vector<int16_t> melSpecAudioData = std::vector<int16_t>(
+                melSpecWindow,
+                melSpecWindow + this->m_melSpectrogramFrameLen);
+
+        /* Compute features for this window and write them to input tensor.
+         */
+        this->m_featureCalc(melSpecAudioData,
+                            this->m_melWindowSlider.Index(),
+                            useCache,
+                            this->m_numMelSpecVectorsInAudioStride,
+                            this->m_inputResizeScale);
+    }
+
+    return true;
+}
+
+uint32_t AdPreProcess::GetAudioWindowSize()
+{
+    return this->m_audioDataWindowSize;
+}
+
+uint32_t AdPreProcess::GetAudioDataStride()
+{
+    return this->m_audioDataStride;
+}
+
+void AdPreProcess::SetAudioWindowIndex(uint32_t idx)
+{
+    this->m_audioWindowIndex = idx;
+}
+
+AdPostProcess::AdPostProcess(TfLiteTensor* outputTensor) :
+    m_outputTensor {outputTensor}
+{}
+
+bool AdPostProcess::DoPostProcess()
+{
+    switch (this->m_outputTensor->type) {
+        case kTfLiteInt8:
+            this->Dequantize<int8_t>();
+            break;
+        default:
+            printf_err("Unsupported tensor type\n");
+            return false;
+    }
+
+    math::MathUtils::SoftmaxF32(this->m_dequantizedOutputVec);
+    return true;
+}
+
+float AdPostProcess::GetOutputValue(uint32_t index)
+{
+    if (index < this->m_dequantizedOutputVec.size()) {
+        return this->m_dequantizedOutputVec[index];
+    }
+    printf_err("Invalid index for output\n");
+    return 0.0;
+}
+
+std::function<void (std::vector<int16_t>&, int, bool, size_t, size_t)>
+GetFeatureCalculator(audio::AdMelSpectrogram& melSpec,
+                     TfLiteTensor* inputTensor,
+                     size_t cacheSize,
+                     float trainingMean)
+{
+    std::function<void (std::vector<int16_t>&, size_t, bool, size_t, size_t)> melSpecFeatureCalc;
+
+    TfLiteQuantization quant = inputTensor->quantization;
+
+    if (kTfLiteAffineQuantization == quant.type) {
+
+        auto* quantParams = static_cast<TfLiteAffineQuantization*>(quant.params);
+        const float quantScale = quantParams->scale->data[0];
+        const int quantOffset = quantParams->zero_point->data[0];
+
+        switch (inputTensor->type) {
+            case kTfLiteInt8: {
+                melSpecFeatureCalc = FeatureCalc<int8_t>(
+                        inputTensor,
+                        cacheSize,
+                        [=, &melSpec](std::vector<int16_t>& audioDataWindow) {
+                            return melSpec.MelSpecComputeQuant<int8_t>(
+                                    audioDataWindow,
+                                    quantScale,
+                                    quantOffset,
+                                    trainingMean);
+                        }
+                );
+                break;
+            }
+            default:
+                printf_err("Tensor type %s not supported\n", TfLiteTypeGetName(inputTensor->type));
+        }
+    } else {
+        melSpecFeatureCalc = FeatureCalc<float>(
+                inputTensor,
+                cacheSize,
+                [=, &melSpec](
+                        std::vector<int16_t>& audioDataWindow) {
+                    return melSpec.ComputeMelSpec(
+                            audioDataWindow,
+                            trainingMean);
+                });
+    }
+    return melSpecFeatureCalc;
+}
+
+} /* namespace app */
+} /* namespace arm */
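Taken together, the two classes above are intended to be driven roughly as follows. This is a sketch only: "model" is assumed to be an initialised arm::app::AdModel, "audio" a buffer of samples, and GetInputTensor/GetOutputTensor/RunInference the methods exposed by the common Model API that the use-case applications build upon:

    void RunAnomalyDetection(arm::app::AdModel& model, const std::vector<int16_t>& audio)
    {
        arm::app::AdPreProcess preProcess(model.GetInputTensor(0),
                                          g_FrameLength,
                                          g_FrameStride,
                                          g_TrainingMean);
        arm::app::AdPostProcess postProcess(model.GetOutputTensor(0));

        const uint32_t windowSize = preProcess.GetAudioWindowSize();
        const uint32_t stride     = preProcess.GetAudioDataStride();

        for (uint32_t i = 0, offset = 0; offset + windowSize <= audio.size(); ++i, offset += stride) {
            preProcess.SetAudioWindowIndex(i); /* Enables feature cache re-use after window 0. */
            if (!preProcess.DoPreProcess(audio.data() + offset, windowSize) ||
                !model.RunInference() ||
                !postProcess.DoPostProcess()) {
                break;
            }
            const float score = postProcess.GetOutputValue(0); /* De-quantised, softmaxed output. */
            UNUSED(score);
        }
    }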
diff --git a/source/application/api/use_case/ad/src/MelSpectrogram.cc b/source/application/api/use_case/ad/src/MelSpectrogram.cc
new file mode 100644
index 0000000..ff0c536
--- /dev/null
+++ b/source/application/api/use_case/ad/src/MelSpectrogram.cc
@@ -0,0 +1,316 @@
+/*
+ * Copyright (c) 2021 Arm Limited. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "MelSpectrogram.hpp"
+
+#include "PlatformMath.hpp"
+#include "log_macros.h"
+
+#include <cfloat>
+#include <cinttypes>
+
+namespace arm {
+namespace app {
+namespace audio {
+
+    MelSpecParams::MelSpecParams(
+            const float samplingFreq,
+            const uint32_t numFbankBins,
+            const float melLoFreq,
+            const float melHiFreq,
+            const uint32_t frameLen,
+            const bool useHtkMethod):
+            m_samplingFreq(samplingFreq),
+            m_numFbankBins(numFbankBins),
+            m_melLoFreq(melLoFreq),
+            m_melHiFreq(melHiFreq),
+            m_frameLen(frameLen),
+
+            /* Smallest power of 2 >= frame length. */
+            m_frameLenPadded(pow(2, ceil((log(frameLen)/log(2))))),
+            m_useHtkMethod(useHtkMethod)
+    {}
+
+    std::string MelSpecParams::Str() const
+    {
+        char strC[1024];
+        snprintf(strC, sizeof(strC) - 1, "\n\
+            \n\t Sampling frequency:         %f\
+            \n\t Number of filter banks:     %" PRIu32 "\
+            \n\t Mel frequency limit (low):  %f\
+            \n\t Mel frequency limit (high): %f\
+            \n\t Frame length:               %" PRIu32 "\
+            \n\t Padded frame length:        %" PRIu32 "\
+            \n\t Using HTK for Mel scale:    %s\n",
+                 this->m_samplingFreq, this->m_numFbankBins, this->m_melLoFreq,
+                 this->m_melHiFreq, this->m_frameLen,
+                 this->m_frameLenPadded, this->m_useHtkMethod ? "yes" : "no");
+        return std::string{strC};
+    }
+
+    MelSpectrogram::MelSpectrogram(const MelSpecParams& params):
+            m_params(params),
+            m_filterBankInitialised(false)
+    {
+        this->m_buffer = std::vector<float>(
+                this->m_params.m_frameLenPadded, 0.0);
+        this->m_frame = std::vector<float>(
+                this->m_params.m_frameLenPadded, 0.0);
+        this->m_melEnergies = std::vector<float>(
+                this->m_params.m_numFbankBins, 0.0);
+
+        this->m_windowFunc = std::vector<float>(this->m_params.m_frameLen);
+        const auto multiplier = static_cast<float>(2 * M_PI / this->m_params.m_frameLen);
+
+        /* Create window function. */
+        for (size_t i = 0; i < this->m_params.m_frameLen; ++i) {
+            this->m_windowFunc[i] = (0.5 - (0.5 *
+                math::MathUtils::CosineF32(static_cast<float>(i) * multiplier)));
+        }
+
+        math::MathUtils::FftInitF32(this->m_params.m_frameLenPadded, this->m_fftInstance);
+        debug("Instantiated Mel Spectrogram object: %s\n", this->m_params.Str().c_str());
+    }
+
+    void MelSpectrogram::Init()
+    {
+        this->InitMelFilterBank();
+    }
+
+    float MelSpectrogram::MelScale(const float freq, const bool useHTKMethod)
+    {
+        if (useHTKMethod) {
+            return 1127.0f * logf (1.0f + freq / 700.0f);
+        } else {
+            /* Slaney formula for mel scale. */
+            float mel = freq / ms_freqStep;
+
+            if (freq >= ms_minLogHz) {
+                mel = ms_minLogMel + logf(freq / ms_minLogHz) / ms_logStep;
+            }
+            return mel;
+        }
+    }
+
+    float MelSpectrogram::InverseMelScale(const float melFreq, const bool useHTKMethod)
+    {
+        if (useHTKMethod) {
+            return 700.0f * (expf (melFreq / 1127.0f) - 1.0f);
+        } else {
+            /* Slaney formula for inverse mel scale.
+             */
+            float freq = ms_freqStep * melFreq;
+
+            if (melFreq >= ms_minLogMel) {
+                freq = ms_minLogHz * expf(ms_logStep * (melFreq - ms_minLogMel));
+            }
+            return freq;
+        }
+    }
+
+    bool MelSpectrogram::ApplyMelFilterBank(
+            std::vector<float>& fftVec,
+            std::vector<std::vector<float>>& melFilterBank,
+            std::vector<uint32_t>& filterBankFilterFirst,
+            std::vector<uint32_t>& filterBankFilterLast,
+            std::vector<float>& melEnergies)
+    {
+        const size_t numBanks = melEnergies.size();
+
+        if (numBanks != filterBankFilterFirst.size() ||
+            numBanks != filterBankFilterLast.size()) {
+            printf_err("unexpected filter bank lengths\n");
+            return false;
+        }
+
+        for (size_t bin = 0; bin < numBanks; ++bin) {
+            auto filterBankIter = melFilterBank[bin].begin();
+            auto end = melFilterBank[bin].end();
+            float melEnergy = FLT_MIN; /* Avoid log of zero at later stages */
+            const uint32_t firstIndex = filterBankFilterFirst[bin];
+            const uint32_t lastIndex = std::min<uint32_t>(filterBankFilterLast[bin], fftVec.size() - 1);
+
+            for (uint32_t i = firstIndex; i <= lastIndex && filterBankIter != end; ++i) {
+                float energyRep = math::MathUtils::SqrtF32(fftVec[i]);
+                melEnergy += (*filterBankIter++ * energyRep);
+            }
+
+            melEnergies[bin] = melEnergy;
+        }
+
+        return true;
+    }
+
+    void MelSpectrogram::ConvertToLogarithmicScale(std::vector<float>& melEnergies)
+    {
+        for (float& melEnergy : melEnergies) {
+            melEnergy = logf(melEnergy);
+        }
+    }
+
+    void MelSpectrogram::ConvertToPowerSpectrum()
+    {
+        const uint32_t halfDim = this->m_buffer.size() / 2;
+
+        /* Handle this special case. */
+        float firstEnergy = this->m_buffer[0] * this->m_buffer[0];
+        float lastEnergy = this->m_buffer[1] * this->m_buffer[1];
+
+        math::MathUtils::ComplexMagnitudeSquaredF32(
+                this->m_buffer.data(),
+                this->m_buffer.size(),
+                this->m_buffer.data(),
+                this->m_buffer.size()/2);
+
+        this->m_buffer[0] = firstEnergy;
+        this->m_buffer[halfDim] = lastEnergy;
+    }
+
+    float MelSpectrogram::GetMelFilterBankNormaliser(
+            const float& leftMel,
+            const float& rightMel,
+            const bool useHTKMethod)
+    {
+        UNUSED(leftMel);
+        UNUSED(rightMel);
+        UNUSED(useHTKMethod);
+
+        /* By default, no normalisation => return 1 */
+        return 1.f;
+    }
+
+    void MelSpectrogram::InitMelFilterBank()
+    {
+        if (!this->IsMelFilterBankInited()) {
+            this->m_melFilterBank = this->CreateMelFilterBank();
+            this->m_filterBankInitialised = true;
+        }
+    }
+
+    bool MelSpectrogram::IsMelFilterBankInited() const
+    {
+        return this->m_filterBankInitialised;
+    }
+
+    std::vector<float> MelSpectrogram::ComputeMelSpec(const std::vector<int16_t>& audioData, float trainingMean)
+    {
+        this->InitMelFilterBank();
+
+        /* TensorFlow way of normalizing .wav data to (-1, 1). */
+        constexpr float normaliser = 1.0/(1<<15);
+        for (size_t i = 0; i < this->m_params.m_frameLen; ++i) {
+            this->m_frame[i] = static_cast<float>(audioData[i]) * normaliser;
+        }
+
+        /* Apply window function to input frame. */
+        for(size_t i = 0; i < this->m_params.m_frameLen; ++i) {
+            this->m_frame[i] *= this->m_windowFunc[i];
+        }
+
+        /* Set remaining frame values to 0. */
+        std::fill(this->m_frame.begin() + this->m_params.m_frameLen, this->m_frame.end(), 0);
+
+        /* Compute FFT. */
+        math::MathUtils::FftF32(this->m_frame, this->m_buffer, this->m_fftInstance);
+
+        /* Convert to power spectrum. */
+        this->ConvertToPowerSpectrum();
+
+        /* Apply mel filterbanks.
+         */
+        if (!this->ApplyMelFilterBank(this->m_buffer,
+                                      this->m_melFilterBank,
+                                      this->m_filterBankFilterFirst,
+                                      this->m_filterBankFilterLast,
+                                      this->m_melEnergies)) {
+            printf_err("Failed to apply MEL filter banks\n");
+        }
+
+        /* Convert to logarithmic scale */
+        this->ConvertToLogarithmicScale(this->m_melEnergies);
+
+        /* Perform mean subtraction. */
+        for (auto& energy : this->m_melEnergies) {
+            energy -= trainingMean;
+        }
+
+        return this->m_melEnergies;
+    }
+
+    std::vector<std::vector<float>> MelSpectrogram::CreateMelFilterBank()
+    {
+        size_t numFftBins = this->m_params.m_frameLenPadded / 2;
+        float fftBinWidth = static_cast<float>(this->m_params.m_samplingFreq) / this->m_params.m_frameLenPadded;
+
+        float melLowFreq = MelSpectrogram::MelScale(this->m_params.m_melLoFreq,
+                                                    this->m_params.m_useHtkMethod);
+        float melHighFreq = MelSpectrogram::MelScale(this->m_params.m_melHiFreq,
+                                                     this->m_params.m_useHtkMethod);
+        float melFreqDelta = (melHighFreq - melLowFreq) / (this->m_params.m_numFbankBins + 1);
+
+        std::vector<float> thisBin = std::vector<float>(numFftBins);
+        std::vector<std::vector<float>> melFilterBank(
+                this->m_params.m_numFbankBins);
+        this->m_filterBankFilterFirst =
+                std::vector<uint32_t>(this->m_params.m_numFbankBins);
+        this->m_filterBankFilterLast =
+                std::vector<uint32_t>(this->m_params.m_numFbankBins);
+
+        for (size_t bin = 0; bin < this->m_params.m_numFbankBins; bin++) {
+            float leftMel = melLowFreq + bin * melFreqDelta;
+            float centerMel = melLowFreq + (bin + 1) * melFreqDelta;
+            float rightMel = melLowFreq + (bin + 2) * melFreqDelta;
+
+            uint32_t firstIndex = 0;
+            uint32_t lastIndex = 0;
+            bool firstIndexFound = false;
+            const float normaliser = this->GetMelFilterBankNormaliser(leftMel, rightMel, this->m_params.m_useHtkMethod);
+
+            for (size_t i = 0; i < numFftBins; ++i) {
+                float freq = (fftBinWidth * i); /* Center freq of this fft bin. */
+                float mel = MelSpectrogram::MelScale(freq, this->m_params.m_useHtkMethod);
+                thisBin[i] = 0.0;
+
+                if (mel > leftMel && mel < rightMel) {
+                    float weight;
+                    if (mel <= centerMel) {
+                        weight = (mel - leftMel) / (centerMel - leftMel);
+                    } else {
+                        weight = (rightMel - mel) / (rightMel - centerMel);
+                    }
+
+                    thisBin[i] = weight * normaliser;
+                    if (!firstIndexFound) {
+                        firstIndex = i;
+                        firstIndexFound = true;
+                    }
+                    lastIndex = i;
+                }
+            }
+
+            this->m_filterBankFilterFirst[bin] = firstIndex;
+            this->m_filterBankFilterLast[bin] = lastIndex;
+
+            /* Copy the part we care about. */
+            for (uint32_t i = firstIndex; i <= lastIndex; ++i) {
+                melFilterBank[bin].push_back(thisBin[i]);
+            }
+        }
+
+        return melFilterBank;
+    }
+
+} /* namespace audio */
+} /* namespace app */
+} /* namespace arm */
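As a spot check of the two branches of MelScale() above: with the HTK formula, mel(1000 Hz) = 1127 * ln(1 + 1000/700) ≈ 1000; with the Slaney variant, 1000 Hz sits exactly at the linear/log boundary, so mel(1000 Hz) = 1000 / (200/3) = 15. In plain math (MelScale() itself is a protected member, so this standalone snippet recomputes the formulas):

    #include <cmath>
    #include <cstdio>

    int main()
    {
        const float f = 1000.0f;
        const float htkMel    = 1127.0f * logf(1.0f + f / 700.0f); /* ~= 1000.0 */
        const float slaneyMel = f / (200.0f / 3.0f);               /* == 15.0   */
        printf("HTK: %f, Slaney: %f\n", htkMel, slaneyMel);
        return 0;
    }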
diff --git a/source/application/api/use_case/asr/CMakeLists.txt b/source/application/api/use_case/asr/CMakeLists.txt
new file mode 100644
index 0000000..77e3d6a
--- /dev/null
+++ b/source/application/api/use_case/asr/CMakeLists.txt
@@ -0,0 +1,43 @@
+#----------------------------------------------------------------------------
+# Copyright (c) 2022 Arm Limited. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#----------------------------------------------------------------------------
+#########################################################
+#       AUTOMATIC SPEECH RECOGNITION API library        #
+#########################################################
+cmake_minimum_required(VERSION 3.15.6)
+
+set(ASR_API_TARGET asr_api)
+project(${ASR_API_TARGET}
+        DESCRIPTION "Automatic speech recognition use case API library"
+        LANGUAGES   C CXX)
+
+# Create static library
+add_library(${ASR_API_TARGET} STATIC
+    src/Wav2LetterPreprocess.cc
+    src/Wav2LetterPostprocess.cc
+    src/Wav2LetterMfcc.cc
+    src/AsrClassifier.cc
+    src/OutputDecode.cc
+    src/Wav2LetterModel.cc)
+
+target_include_directories(${ASR_API_TARGET} PUBLIC include)
+
+target_link_libraries(${ASR_API_TARGET} PUBLIC common_api)
+
+message(STATUS "*******************************************************")
+message(STATUS "Library                : " ${ASR_API_TARGET})
+message(STATUS "CMAKE_SYSTEM_PROCESSOR : " ${CMAKE_SYSTEM_PROCESSOR})
+message(STATUS "*******************************************************")
diff --git a/source/application/api/use_case/asr/include/AsrClassifier.hpp b/source/application/api/use_case/asr/include/AsrClassifier.hpp
new file mode 100644
index 0000000..a07a721
--- /dev/null
+++ b/source/application/api/use_case/asr/include/AsrClassifier.hpp
@@ -0,0 +1,63 @@
+/*
+ * Copyright (c) 2021 Arm Limited. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef ASR_CLASSIFIER_HPP
+#define ASR_CLASSIFIER_HPP
+
+#include "Classifier.hpp"
+
+namespace arm {
+namespace app {
+
+    class AsrClassifier : public Classifier {
+    public:
+        /**
+         * @brief       Gets the top N classification results from the
+         *              output vector.
+         * @param[in]   outputTensor  Inference output tensor from an NN model.
+         * @param[out]  vecResults    A vector of classification results
+         *                            populated by this function.
+         * @param[in]   labels        Labels vector to match classified classes
+         * @param[in]   topNCount     Number of top classifications to pick.
+         * @param[in]   use_softmax   Whether softmax scaling should be applied to model output.
+         * @return      true if successful, false otherwise.
+         **/
+        bool GetClassificationResults(TfLiteTensor* outputTensor,
+                                      std::vector<ClassificationResult>& vecResults,
+                                      const std::vector<std::string>& labels,
+                                      uint32_t topNCount, bool use_softmax = false) override;
+
+    private:
+        /**
+         * @brief       Utility function that gets the top 1 classification results from the
+         *              output tensor (vector of vector).
+         * @param[in]   tensor      Inference output tensor from an NN model.
+         * @param[out]  vecResults  Vector of classification results populated by this function.
+         * @param[in]   labels      Labels vector to match classified classes.
+         * @param[in]   scale       Quantization scale.
+         * @param[in]   zeroPoint   Quantization zero point.
+         * @return      true if successful, false otherwise.
+         **/
+        template<typename T>
+        bool GetTopResults(TfLiteTensor* tensor,
+                           std::vector<ClassificationResult>& vecResults,
+                           const std::vector<std::string>& labels, double scale, double zeroPoint);
+    };
+
+} /* namespace app */
+} /* namespace arm */
+
+#endif /* ASR_CLASSIFIER_HPP */
\ No newline at end of file
diff --git a/source/application/api/use_case/asr/include/AsrResult.hpp b/source/application/api/use_case/asr/include/AsrResult.hpp
new file mode 100644
index 0000000..ed826d0
--- /dev/null
+++ b/source/application/api/use_case/asr/include/AsrResult.hpp
@@ -0,0 +1,63 @@
+/*
+ * Copyright (c) 2021 Arm Limited. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef ASR_RESULT_HPP
+#define ASR_RESULT_HPP
+
+#include "ClassificationResult.hpp"
+
+#include <vector>
+
+namespace arm {
+namespace app {
+namespace asr {
+
+    using ResultVec = std::vector<arm::app::ClassificationResult>;
+
+    /* Structure for holding ASR result. */
+    class AsrResult {
+
+    public:
+        ResultVec    m_resultVec;        /* Container for "thresholded" classification results. */
+        float        m_timeStamp;        /* Audio timestamp for this result. */
+        uint32_t     m_inferenceNumber;  /* Corresponding inference number. */
+        float        m_threshold;        /* Threshold value for `m_resultVec`. */
+
+        AsrResult() = delete;
+        AsrResult(ResultVec& resultVec,
+                  const float timestamp,
+                  const uint32_t inferenceIdx,
+                  const float scoreThreshold) {
+
+            this->m_threshold = scoreThreshold;
+            this->m_timeStamp = timestamp;
+            this->m_inferenceNumber = inferenceIdx;
+
+            this->m_resultVec = ResultVec();
+            for (auto& i : resultVec) {
+                if (i.m_normalisedVal >= this->m_threshold) {
+                    this->m_resultVec.emplace_back(i);
+                }
+            }
+        }
+        ~AsrResult() = default;
+    };
+
+} /* namespace asr */
+} /* namespace app */
+} /* namespace arm */
+
+#endif /* ASR_RESULT_HPP */
\ No newline at end of file
diff --git a/source/application/api/use_case/asr/include/OutputDecode.hpp b/source/application/api/use_case/asr/include/OutputDecode.hpp
new file mode 100644
index 0000000..9d39057
--- /dev/null
+++ b/source/application/api/use_case/asr/include/OutputDecode.hpp
@@ -0,0 +1,40 @@
+/*
+ * Copyright (c) 2021 Arm Limited. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef ASR_OUTPUT_DECODE_HPP
+#define ASR_OUTPUT_DECODE_HPP
+
+#include "AsrClassifier.hpp"
+
+namespace arm {
+namespace app {
+namespace audio {
+namespace asr {
+
+    /**
+     * @brief       Decodes the classifier results into a text string,
+     *              collapsing repeated characters.
+     * @param[in]   vecResults   Label output from classifier.
+     * @return      String of the decoded output.
+     **/
+    std::string DecodeOutput(const std::vector<ClassificationResult>& vecResults);
+
+} /* namespace asr */
+} /* namespace audio */
+} /* namespace app */
+} /* namespace arm */
+
+#endif /* ASR_OUTPUT_DECODE_HPP */
\ No newline at end of file
diff --git a/source/application/api/use_case/asr/include/Wav2LetterMfcc.hpp b/source/application/api/use_case/asr/include/Wav2LetterMfcc.hpp
new file mode 100644
index 0000000..b5a21d3
--- /dev/null
+++ b/source/application/api/use_case/asr/include/Wav2LetterMfcc.hpp
@@ -0,0 +1,109 @@
+/*
+ * Copyright (c) 2021 Arm Limited. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef ASR_WAV2LETTER_MFCC_HPP
+#define ASR_WAV2LETTER_MFCC_HPP
+
+#include "Mfcc.hpp"
+
+namespace arm {
+namespace app {
+namespace audio {
+
+    /* Class to provide Wav2Letter specific MFCC calculation requirements. */
+    class Wav2LetterMFCC : public MFCC {
+
+    public:
+        static constexpr uint32_t  ms_defaultSamplingFreq = 16000;
+        static constexpr uint32_t  ms_defaultNumFbankBins =   128;
+        static constexpr uint32_t  ms_defaultMelLoFreq    =     0;
+        static constexpr uint32_t  ms_defaultMelHiFreq    =  8000;
+        static constexpr bool      ms_defaultUseHtkMethod = false;
+
+        explicit Wav2LetterMFCC(const size_t numFeats, const size_t frameLen)
+            :  MFCC(MfccParams(
+                        ms_defaultSamplingFreq, ms_defaultNumFbankBins,
+                        ms_defaultMelLoFreq, ms_defaultMelHiFreq,
+                        numFeats, frameLen, ms_defaultUseHtkMethod))
+        {}
+
+        Wav2LetterMFCC()  = delete;
+        ~Wav2LetterMFCC() = default;
+
+    protected:
+
+        /**
+         * @brief       Overrides base class implementation of this function.
+         * @param[in]   fftVec                  Vector populated with FFT magnitudes.
+         * @param[in]   melFilterBank           2D vector with filter bank weights.
+         * @param[in]   filterBankFilterFirst   Vector containing the first indices of filter bank
+         *                                      to be used for each bin.
+         * @param[in]   filterBankFilterLast    Vector containing the last indices of filter bank
+         *                                      to be used for each bin.
+         * @param[out]  melEnergies             Pre-allocated vector of MEL energies to be
+         *                                      populated.
+         * @return      true if successful, false otherwise.
+         */
+        bool ApplyMelFilterBank(
+                std::vector<float>&                 fftVec,
+                std::vector<std::vector<float>>&    melFilterBank,
+                std::vector<uint32_t>&              filterBankFilterFirst,
+                std::vector<uint32_t>&              filterBankFilterLast,
+                std::vector<float>&                 melEnergies) override;
+
+        /**
+         * @brief           Override for the base class implementation to convert mel
+         *                  energies to logarithmic scale. The difference from
+         *                  default behaviour is that the power is converted to dB
+         *                  and subsequently clamped.
+         * @param[in,out]   melEnergies   1D vector of Mel energies.
+         **/
+        void ConvertToLogarithmicScale(std::vector<float>& melEnergies) override;
+
+        /**
+         * @brief       Create a matrix used to calculate Discrete Cosine
+         *              Transform. Override for the base class' default
+         *              implementation as the first and last elements
+         *              use a different normaliser.
+         * @param[in]   inputLength        Input length of the buffer on which
+         *                                 DCT will be performed.
+         * @param[in]   coefficientCount   Total coefficients per input length.
+         * @return      1D vector with inputLength x coefficientCount elements
+         *              populated with DCT coefficients.
+         */
+        std::vector<float> CreateDCTMatrix(int32_t inputLength,
+                                           int32_t coefficientCount) override;
+
+        /**
+         * @brief       Given the low and high Mel values, get the normaliser
+         *              for weights to be applied when populating the filter
+         *              bank. Override for the base class implementation.
+         * @param[in]   leftMel        Low Mel frequency value.
+         * @param[in]   rightMel       High Mel frequency value.
+         * @param[in]   useHTKMethod   bool to signal if HTK method is to be
+         *                             used for calculation.
+         * @return      Value to use for normalising.
+         */
+        float GetMelFilterBankNormaliser(const float&   leftMel,
+                                         const float&   rightMel,
+                                         bool           useHTKMethod) override;
+    };
+
+} /* namespace audio */
+} /* namespace app */
+} /* namespace arm */
+
+#endif /* ASR_WAV2LETTER_MFCC_HPP */
\ No newline at end of file
diff --git a/source/application/api/use_case/asr/include/Wav2LetterModel.hpp b/source/application/api/use_case/asr/include/Wav2LetterModel.hpp
new file mode 100644
index 0000000..a02eed1
--- /dev/null
+++ b/source/application/api/use_case/asr/include/Wav2LetterModel.hpp
@@ -0,0 +1,67 @@
+/*
+ * Copyright (c) 2021 Arm Limited. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef ASR_WAV2LETTER_MODEL_HPP
+#define ASR_WAV2LETTER_MODEL_HPP
+
+#include "Model.hpp"
+
+namespace arm {
+namespace app {
+namespace asr {
+    extern const int g_FrameLength;
+    extern const int g_FrameStride;
+    extern const float g_ScoreThreshold;
+    extern const int g_ctxLen;
+} /* namespace asr */
+} /* namespace app */
+} /* namespace arm */
+
+namespace arm {
+namespace app {
+
+    class Wav2LetterModel : public Model {
+
+    public:
+        /* Indices for the expected model - based on input and output tensor shapes. */
+        static constexpr uint32_t ms_inputRowsIdx  = 1;
+        static constexpr uint32_t ms_inputColsIdx  = 2;
+        static constexpr uint32_t ms_outputRowsIdx = 2;
+        static constexpr uint32_t ms_outputColsIdx = 3;
+
+        /* Model specific constants. */
+        static constexpr uint32_t ms_blankTokenIdx   = 28;
+        static constexpr uint32_t ms_numMfccFeatures = 13;
+
+    protected:
+        /** @brief   Gets the reference to op resolver interface class. */
+        const tflite::MicroOpResolver& GetOpResolver() override;
+
+        /** @brief   Adds operations to the op resolver instance. */
+        bool EnlistOperations() override;
+
+    private:
+        /* Maximum number of individual operations that can be enlisted. */
+        static constexpr int ms_maxOpCnt = 5;
+
+        /* A mutable op resolver instance. */
+        tflite::MicroMutableOpResolver<ms_maxOpCnt> m_opResolver;
+    };
+
+} /* namespace app */
+} /* namespace arm */
+
+#endif /* ASR_WAV2LETTER_MODEL_HPP */
diff --git a/source/application/api/use_case/asr/include/Wav2LetterPostprocess.hpp b/source/application/api/use_case/asr/include/Wav2LetterPostprocess.hpp
new file mode 100644
index 0000000..02738bc
--- /dev/null
+++ b/source/application/api/use_case/asr/include/Wav2LetterPostprocess.hpp
@@ -0,0 +1,109 @@
+/*
+ * Copyright (c) 2021-2022 Arm Limited. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef ASR_WAV2LETTER_POSTPROCESS_HPP
+#define ASR_WAV2LETTER_POSTPROCESS_HPP
+
+#include "TensorFlowLiteMicro.hpp"   /* TensorFlow headers. */
+#include "BaseProcessing.hpp"
+#include "Model.hpp"
+#include "AsrClassifier.hpp"
+#include "AsrResult.hpp"
+#include "log_macros.h"
+
+namespace arm {
+namespace app {
+
+    /**
+     * @brief   Helper class to manage tensor post-processing for "wav2letter"
+     *          output.
+     */
+    class AsrPostProcess : public BasePostProcess {
+    public:
+        bool m_lastIteration = false;   /* Flag to set if processing the last set of data for a clip. */
+
+        /**
+         * @brief           Constructor.
+         * @param[in]       outputTensor       Pointer to the TFLite Micro output Tensor.
+         * @param[in]       classifier         Object used to get top N results from classification.
+         * @param[in]       labels             Vector of string labels to identify each output of the model.
+         * @param[in/out]   result             Vector of classification results to store decoded outputs.
+         * @param[in]       outputContextLen   Left/right context length for output tensor.
+         * @param[in]       blankTokenIdx      Index in the labels that the "blank token" takes.
+         * @param[in]       reductionAxis      The axis that the logits of each time step are on.
+         **/
+        AsrPostProcess(TfLiteTensor* outputTensor, AsrClassifier& classifier,
+                       const std::vector<std::string>& labels, asr::ResultVec& result,
+                       uint32_t outputContextLen,
+                       uint32_t blankTokenIdx, uint32_t reductionAxis);
+
+        /**
+         * @brief    Performs post-processing of the result of inference, then
+         *           populates the ASR result data for any later use.
+         * @return   true if successful, false otherwise.
+         **/
+        bool DoPostProcess() override;
+
+        /** @brief   Gets the output inner length for post-processing. */
+        static uint32_t GetOutputInnerLen(const TfLiteTensor*, uint32_t outputCtxLen);
+
+        /** @brief   Gets the output context length (left/right) for post-processing. */
+        static uint32_t GetOutputContextLen(const Model& model, uint32_t inputCtxLen);
+
+        /** @brief   Gets the number of feature vectors to be computed. */
+        static uint32_t GetNumFeatureVectors(const Model& model);
+
+    private:
+        AsrClassifier&                   m_classifier;        /* ASR classifier object. */
+        TfLiteTensor*                    m_outputTensor;      /* Model output tensor. */
+        const std::vector<std::string>&  m_labels;            /* ASR labels. */
+        asr::ResultVec&                  m_results;           /* Results vector for a single inference. */
+        uint32_t                         m_outputContextLen;  /* Lengths of left/right contexts for output. */
+        uint32_t                         m_outputInnerLen;    /* Length of output inner context. */
+        uint32_t                         m_totalLen;          /* Total length of the required axis. */
+        uint32_t                         m_countIterations;   /* Current number of iterations. */
+        uint32_t                         m_blankTokenIdx;     /* Index of the labels blank token. */
+        uint32_t                         m_reductionAxisIdx;  /* Axis containing output logits for a single step. */
+
+        /**
+         * @brief    Checks if the tensor and axis index are valid
+         *           inputs to the object - based on how it has been initialised.
+         * @return   true if valid, false otherwise.
+         */
+        bool IsInputValid(TfLiteTensor* tensor,
+                          uint32_t axisIdx) const;
+
+        /**
+         * @brief    Gets the tensor data element size in bytes based
+         *           on the tensor type.
+         * @return   Size in bytes, 0 if not supported.
+         */
+        static uint32_t GetTensorElementSize(TfLiteTensor* tensor);
+
+        /**
+         * @brief    Erases sections from the data assuming row-wise
+         *           arrangement along the context axis.
+         * @return   true if successful, false otherwise.
+         */
+        bool EraseSectionsRowWise(uint8_t* ptrData,
+                                  uint32_t strideSzBytes,
+                                  bool lastIteration);
+    };
+
+} /* namespace app */
+} /* namespace arm */
+
+#endif /* ASR_WAV2LETTER_POSTPROCESS_HPP */
\ No newline at end of file
diff --git a/source/application/api/use_case/asr/include/Wav2LetterPreprocess.hpp b/source/application/api/use_case/asr/include/Wav2LetterPreprocess.hpp
new file mode 100644
index 0000000..9943946
--- /dev/null
+++ b/source/application/api/use_case/asr/include/Wav2LetterPreprocess.hpp
@@ -0,0 +1,182 @@
+/*
+ * Copyright (c) 2021-2022 Arm Limited. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef ASR_WAV2LETTER_PREPROCESS_HPP
+#define ASR_WAV2LETTER_PREPROCESS_HPP
+
+#include "TensorFlowLiteMicro.hpp"
+#include "Wav2LetterMfcc.hpp"
+#include "AudioUtils.hpp"
+#include "DataStructures.hpp"
+#include "BaseProcessing.hpp"
+#include "log_macros.h"
+
+namespace arm {
+namespace app {
+
+    /* Class to facilitate pre-processing calculation for Wav2Letter model
+     * for ASR. */
+    using AudioWindow = audio::SlidingWindow<const int16_t>;
+
+    class AsrPreProcess : public BasePreProcess {
+    public:
+        /**
+         * @brief       Constructor.
+         * @param[in]   inputTensor        Pointer to the TFLite Micro input Tensor.
+         * @param[in]   numMfccFeatures    Number of MFCC features per window.
+         * @param[in]   numFeatureFrames   Number of MFCC vectors that need to be calculated
+         *                                 for an inference.
+         * @param[in]   mfccWindowLen      Number of audio elements to calculate MFCC features per window.
+         * @param[in]   mfccWindowStride   Stride (in number of elements) for moving the MFCC window.
+         */
+        AsrPreProcess(TfLiteTensor* inputTensor,
+                      uint32_t numMfccFeatures,
+                      uint32_t numFeatureFrames,
+                      uint32_t mfccWindowLen,
+                      uint32_t mfccWindowStride);
+
+        /**
+         * @brief       Calculates the features required from audio data. This
+         *              includes MFCC, first and second order deltas,
+         *              normalisation and finally, quantisation. The tensor is
+         *              populated with features from a given window placed along
+         *              in a single row.
+         * @param[in]   audioData       Pointer to the first element of audio data.
+         * @param[in]   audioDataLen    Number of elements in the audio data.
+         * @return      true if successful, false in case of error.
+         */
+        bool DoPreProcess(const void* audioData, size_t audioDataLen) override;
+
+    protected:
+        /**
+         * @brief       Computes the first and second order deltas for the
+         *              MFCC buffers - they are assumed to be populated.
+         *
+         * @param[in]   mfcc     MFCC buffers.
+         * @param[out]  delta1   Result of the first diff computation.
+         * @param[out]  delta2   Result of the second diff computation.
+         * @return      true if successful, false otherwise.
+         */
+        static bool ComputeDeltas(Array2d<float>& mfcc,
+                                  Array2d<float>& delta1,
+                                  Array2d<float>& delta2);
+
+        /**
+         * @brief           Given a 2D vector of floats, rescale it to have mean of 0 and
+         *                  standard deviation of 1.
+         * @param[in,out]   vec   Vector of vector of floats.
+         */
+        static void StandardizeVecF32(Array2d<float>& vec);
+
+        /**
+         * @brief   Standardizes all the MFCC and delta buffers to have mean 0 and std. dev 1.
+         */
+        void Standarize();
+
+        /**
+         * @brief       Given the quantisation and data type limits, computes
+         *              the quantised values of a floating point input data.
+         * @param[in]   elem          Element to be quantised.
+         * @param[in]   quantScale    Scale.
+         * @param[in]   quantOffset   Offset.
+         * @param[in]   minVal        Numerical limit - minimum.
+         * @param[in]   maxVal        Numerical limit - maximum.
+         * @return      Floating point quantised value.
+         */
+        static float GetQuantElem(
+                float     elem,
+                float     quantScale,
+                int       quantOffset,
+                float     minVal,
+                float     maxVal);
+
+        /**
+         * @brief       Quantises the MFCC and delta buffers, and places them
+         *              in the output buffer. While doing so, it transposes
+         *              the data. Reason: buffers in this class are arranged
+         *              for "time" axis to be row major. Primary reason for
+         *              this being the convolution speed up (as we can use
+         *              contiguous memory). The output, however, requires the
+         *              time axis to be in column major arrangement.
+         * @param[in]   outputBuf     Pointer to the output buffer.
+         * @param[in]   outputBufSz   Output buffer's size.
+         * @param[in]   quantScale    Quantisation scale.
+         * @param[in]   quantOffset   Quantisation offset.
+         */
+        template <typename T>
+        bool Quantise(
+                T*              outputBuf,
+                const uint32_t  outputBufSz,
+                const float     quantScale,
+                const int       quantOffset)
+        {
+            /* Check the output size will fit everything. */
+            if (outputBufSz < (this->m_mfccBuf.size(0) * 3 * sizeof(T))) {
+                printf_err("Tensor size too small for features\n");
+                return false;
+            }
+
+            /* Populate. */
+            T* outputBufMfcc = outputBuf;
+            T* outputBufD1   = outputBuf + this->m_numMfccFeats;
+            T* outputBufD2   = outputBufD1 + this->m_numMfccFeats;
+            const uint32_t ptrIncr = this->m_numMfccFeats * 2;  /* (3 vectors - 1 vector) */
+
+            const float minVal = std::numeric_limits<T>::min();
+            const float maxVal = std::numeric_limits<T>::max();
+
+            /* Need to transpose while copying and concatenating the tensor. */
+            for (uint32_t j = 0; j < this->m_numFeatureFrames; ++j) {
+                for (uint32_t i = 0; i < this->m_numMfccFeats; ++i) {
+                    *outputBufMfcc++ = static_cast<T>(AsrPreProcess::GetQuantElem(
+                            this->m_mfccBuf(i, j), quantScale,
+                            quantOffset, minVal, maxVal));
+                    *outputBufD1++ = static_cast<T>(AsrPreProcess::GetQuantElem(
+                            this->m_delta1Buf(i, j), quantScale,
+                            quantOffset, minVal, maxVal));
+                    *outputBufD2++ = static_cast<T>(AsrPreProcess::GetQuantElem(
+                            this->m_delta2Buf(i, j), quantScale,
+                            quantOffset, minVal, maxVal));
+                }
+                outputBufMfcc += ptrIncr;
+                outputBufD1   += ptrIncr;
+                outputBufD2   += ptrIncr;
+            }
+
+            return true;
+        }
+
+    private:
+        audio::Wav2LetterMFCC   m_mfcc;          /* MFCC instance. */
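
The transpose that `Quantise()` performs above is easier to see with concrete numbers. The following standalone sketch (illustrative only: plain ints, made-up values, numMfccFeats = 2 and numFeatureFrames = 3, no quantisation) reproduces the same pointer walk, so each output row ends up as [mfcc | delta1 | delta2] for one time step:

    #include <cstdio>

    /* Illustrative only: mirrors the Quantise() pointer arithmetic. The
     * source buffers are feature-major, i.e. buf[i][j] is feature i at
     * time step j; the output is time-major. */
    int main()
    {
        const int mfcc[2][3] = {{10, 11, 12}, {20, 21, 22}};
        const int d1[2][3]   = {{30, 31, 32}, {40, 41, 42}};
        const int d2[2][3]   = {{50, 51, 52}, {60, 61, 62}};

        int out[3 * 2 * 3] = {0};          /* 3 frames x (3 blocks x 2 feats) */
        int* pMfcc = out;
        int* pD1   = out + 2;              /* + numMfccFeats */
        int* pD2   = pD1 + 2;
        const int ptrIncr = 2 * 2;         /* skip over the other two blocks */

        for (int j = 0; j < 3; ++j) {      /* time steps */
            for (int i = 0; i < 2; ++i) {  /* features */
                *pMfcc++ = mfcc[i][j];
                *pD1++   = d1[i][j];
                *pD2++   = d2[i][j];
            }
            pMfcc += ptrIncr;
            pD1   += ptrIncr;
            pD2   += ptrIncr;
        }

        /* Prints: 10 20 30 40 50 60 | 11 21 31 41 51 61 | 12 22 32 42 52 62 */
        for (int n = 0; n < 18; ++n) {
            printf("%d ", out[n]);
        }
        printf("\n");
        return 0;
    }
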
+        TfLiteTensor*           m_inputTensor;   /* Model input tensor. */
+
+        /* Actual buffers to be populated. */
+        Array2d<float>          m_mfccBuf;       /* Contiguous buffer 1D: MFCC */
+        Array2d<float>          m_delta1Buf;     /* Contiguous buffer 1D: Delta 1 */
+        Array2d<float>          m_delta2Buf;     /* Contiguous buffer 1D: Delta 2 */
+
+        uint32_t                m_mfccWindowLen;     /* Window length for MFCC. */
+        uint32_t                m_mfccWindowStride;  /* Window stride len for MFCC. */
+        uint32_t                m_numMfccFeats;      /* Number of MFCC features per window. */
+        uint32_t                m_numFeatureFrames;  /* How many sets of m_numMfccFeats. */
+        AudioWindow             m_mfccSlidingWindow; /* Sliding window to calculate MFCCs. */
+
+    };
+
+} /* namespace app */
+} /* namespace arm */
+
+#endif /* ASR_WAV2LETTER_PREPROCESS_HPP */
\ No newline at end of file
diff --git a/source/application/api/use_case/asr/src/AsrClassifier.cc b/source/application/api/use_case/asr/src/AsrClassifier.cc
new file mode 100644
index 0000000..4ba8c7b
--- /dev/null
+++ b/source/application/api/use_case/asr/src/AsrClassifier.cc
@@ -0,0 +1,144 @@
+/*
+ * Copyright (c) 2021 Arm Limited. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "AsrClassifier.hpp"
+
+#include "log_macros.h"
+#include "TensorFlowLiteMicro.hpp"
+#include "Wav2LetterModel.hpp"
+
+namespace arm {
+namespace app {
+
+    template<typename T>
+    bool AsrClassifier::GetTopResults(TfLiteTensor* tensor,
+                                      std::vector<ClassificationResult>& vecResults,
+                                      const std::vector<std::string>& labels,
+                                      double scale, double zeroPoint)
+    {
+        const uint32_t nElems   = tensor->dims->data[Wav2LetterModel::ms_outputRowsIdx];
+        const uint32_t nLetters = tensor->dims->data[Wav2LetterModel::ms_outputColsIdx];
+
+        if (nLetters != labels.size()) {
+            printf_err("Output size doesn't match the labels' size\n");
+            return false;
+        }
+
+        /* NOTE: tensor's size verification against labels should be
+         *       checked by the calling/public function. */
+        if (nLetters < 1) {
+            return false;
+        }
+
+        /* Final results' container. */
+        vecResults = std::vector<ClassificationResult>(nElems);
+
+        T* tensorData = tflite::GetTensorData<T>(tensor);
+
+        /* Get the top 1 results. */
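
Before the loop below runs, the output tensor is interpreted as a row-major [nElems x nLetters] matrix, one row per time step; each row yields a single (score, label index) pair. A minimal standalone sketch of that selection step, with made-up int8 scores and a hypothetical scale/zero-point:

    #include <cstdint>
    #include <cstdio>
    #include <utility>

    /* Illustrative only: per-time-step argmax over a 2x3 logits matrix. */
    int main()
    {
        const unsigned nElems = 2, nLetters = 3;
        const int8_t tensorData[nElems * nLetters] = { 5, -2, 9,   1, 7, 0 };
        const double scale = 0.05, zeroPoint = 0.0;  /* made-up quant params */

        for (unsigned i = 0, row = 0; i < nElems; ++i, row += nLetters) {
            std::pair<int8_t, unsigned> top1 = { tensorData[row], 0 };
            for (unsigned j = 1; j < nLetters; ++j) {
                if (top1.first < tensorData[row + j]) {
                    top1 = { tensorData[row + j], j };
                }
            }
            /* Dequantise: normalised = scale * (score - zeroPoint). */
            printf("t=%u: label %u, score %.2f\n",
                   i, top1.second, scale * (top1.first - zeroPoint));
        }
        return 0;
    }
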
+        for (uint32_t i = 0, row = 0; i < nElems; ++i, row += nLetters) {
+            std::pair<T, uint32_t> top_1 = std::make_pair(tensorData[row + 0], 0);
+
+            for (uint32_t j = 1; j < nLetters; ++j) {
+                if (top_1.first < tensorData[row + j]) {
+                    top_1.first  = tensorData[row + j];
+                    top_1.second = j;
+                }
+            }
+
+            double score = static_cast<double>(top_1.first);
+            vecResults[i].m_normalisedVal = scale * (score - zeroPoint);
+            vecResults[i].m_label = labels[top_1.second];
+            vecResults[i].m_labelIdx = top_1.second;
+        }
+
+        return true;
+    }
+    template bool AsrClassifier::GetTopResults<uint8_t>(TfLiteTensor* tensor,
+                                                        std::vector<ClassificationResult>& vecResults,
+                                                        const std::vector<std::string>& labels,
+                                                        double scale, double zeroPoint);
+    template bool AsrClassifier::GetTopResults<int8_t>(TfLiteTensor* tensor,
+                                                       std::vector<ClassificationResult>& vecResults,
+                                                       const std::vector<std::string>& labels,
+                                                       double scale, double zeroPoint);
+
+    bool AsrClassifier::GetClassificationResults(
+            TfLiteTensor* outputTensor,
+            std::vector<ClassificationResult>& vecResults,
+            const std::vector<std::string>& labels, uint32_t topNCount, bool use_softmax)
+    {
+        UNUSED(use_softmax);
+        vecResults.clear();
+
+        constexpr int minTensorDims = static_cast<int>(
+            (Wav2LetterModel::ms_outputRowsIdx > Wav2LetterModel::ms_outputColsIdx)?
+             Wav2LetterModel::ms_outputRowsIdx : Wav2LetterModel::ms_outputColsIdx);
+
+        constexpr uint32_t outColsIdx = Wav2LetterModel::ms_outputColsIdx;
+
+        /* Sanity checks. */
+        if (outputTensor == nullptr) {
+            printf_err("Output vector is null pointer.\n");
+            return false;
+        } else if (outputTensor->dims->size < minTensorDims) {
+            printf_err("Output tensor expected to be %dD\n", minTensorDims);
+            return false;
+        } else if (static_cast<uint32_t>(outputTensor->dims->data[outColsIdx]) < topNCount) {
+            printf_err("Output vectors are smaller than %" PRIu32 "\n", topNCount);
+            return false;
+        } else if (static_cast<uint32_t>(outputTensor->dims->data[outColsIdx]) != labels.size()) {
+            printf_err("Output size doesn't match the labels' size\n");
+            return false;
+        }
+
+        if (topNCount != 1) {
+            warn("TopNCount value ignored in this implementation\n");
+        }
+
+        /* To return the floating point values, we need quantization parameters. */
+        QuantParams quantParams = GetTensorQuantParams(outputTensor);
+
+        bool resultState;
+
+        switch (outputTensor->type) {
+            case kTfLiteUInt8:
+                resultState = this->GetTopResults<uint8_t>(
+                        outputTensor, vecResults,
+                        labels, quantParams.scale,
+                        quantParams.offset);
+                break;
+            case kTfLiteInt8:
+                resultState = this->GetTopResults<int8_t>(
+                        outputTensor, vecResults,
+                        labels, quantParams.scale,
+                        quantParams.offset);
+                break;
+            default:
+                printf_err("Tensor type %s not supported by classifier\n",
+                           TfLiteTypeGetName(outputTensor->type));
+                return false;
+        }
+
+        if (!resultState) {
+            printf_err("Failed to get sorted set\n");
+            return false;
+        }
+
+        return true;
+    }
+
+} /* namespace app */
+} /* namespace arm */
\ No newline at end of file
diff --git a/source/application/api/use_case/asr/src/OutputDecode.cc b/source/application/api/use_case/asr/src/OutputDecode.cc
new file mode 100644
index 0000000..41fbe07
--- /dev/null
+++ b/source/application/api/use_case/asr/src/OutputDecode.cc
@@ -0,0 +1,47 @@
+/*
+ * Copyright (c) 2021 Arm Limited. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "OutputDecode.hpp"
+
+namespace arm {
+namespace app {
+namespace audio {
+namespace asr {
+
+    std::string DecodeOutput(const std::vector<ClassificationResult>& vecResults)
+    {
+        std::string CleanOutputBuffer;
+
+        for (size_t i = 0; i < vecResults.size(); ++i)  /* For all elements in vector. */
+        {
+            while (i + 1 < vecResults.size() &&
+                   vecResults[i].m_label == vecResults[i + 1].m_label)  /* While the current element is equal to the next, ignore it and move on. */
+            {
+                ++i;
+            }
+            if (vecResults[i].m_label != "$")  /* $ is a character used to represent unknown and double characters so should not be in output. */
+            {
+                CleanOutputBuffer += vecResults[i].m_label;  /* If the element is different to the next, it will be appended to CleanOutputBuffer. */
+            }
+        }
+
+        return CleanOutputBuffer;  /* Return string type containing clean output. */
+    }
+
+} /* namespace asr */
+} /* namespace audio */
+} /* namespace app */
+} /* namespace arm */
diff --git a/source/application/api/use_case/asr/src/Wav2LetterMfcc.cc b/source/application/api/use_case/asr/src/Wav2LetterMfcc.cc
new file mode 100644
index 0000000..bb29b0f
--- /dev/null
+++ b/source/application/api/use_case/asr/src/Wav2LetterMfcc.cc
@@ -0,0 +1,141 @@
+/*
+ * Copyright (c) 2021 Arm Limited. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "Wav2LetterMfcc.hpp"
+
+#include "PlatformMath.hpp"
+#include "log_macros.h"
+
+#include <cfloat>
+
+namespace arm {
+namespace app {
+namespace audio {
+
+    bool Wav2LetterMFCC::ApplyMelFilterBank(
+            std::vector<float>&                 fftVec,
+            std::vector<std::vector<float>>&    melFilterBank,
+            std::vector<uint32_t>&              filterBankFilterFirst,
+            std::vector<uint32_t>&              filterBankFilterLast,
+            std::vector<float>&                 melEnergies)
+    {
+        const size_t numBanks = melEnergies.size();
+
+        if (numBanks != filterBankFilterFirst.size() ||
+            numBanks != filterBankFilterLast.size()) {
+            printf_err("Unexpected filter bank lengths\n");
+            return false;
+        }
+
+        for (size_t bin = 0; bin < numBanks; ++bin) {
+            auto filterBankIter = melFilterBank[bin].begin();
+            auto end = melFilterBank[bin].end();
+            /* Avoid log of zero at later stages, same value used in librosa.
+             * The number was used during our default wav2letter model training. */
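
Each mel filter stores only its non-zero triangular band of weights, together with the first and last FFT bin it touches, so the accumulation below reduces to a short dot product per filter. A toy version of that sparse accumulation (all values made up):

    #include <cstdio>
    #include <vector>

    /* Illustrative only: one filter whose non-zero weights cover FFT
     * bins [2, 4]; a small floor avoids log(0) later in the pipeline. */
    int main()
    {
        const std::vector<float> fftVec  = {0.f, 1.f, 2.f, 3.f, 4.f, 5.f};
        const std::vector<float> weights = {0.25f, 0.5f, 0.25f};  /* bins 2..4 */
        const unsigned firstIndex = 2, lastIndex = 4;

        float melEnergy = 1e-10f;  /* floor, as in the real code */
        auto it = weights.begin();
        for (unsigned i = firstIndex; i <= lastIndex && it != weights.end(); ++i) {
            melEnergy += (*it++ * fftVec[i]);
        }
        printf("mel energy: %f\n", melEnergy);  /* 0.25*2 + 0.5*3 + 0.25*4 = 3.0 */
        return 0;
    }
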
+            float melEnergy = 1e-10;
+            const uint32_t firstIndex = filterBankFilterFirst[bin];
+            const uint32_t lastIndex = std::min<uint32_t>(filterBankFilterLast[bin], fftVec.size() - 1);
+
+            for (uint32_t i = firstIndex; i <= lastIndex && filterBankIter != end; ++i) {
+                melEnergy += (*filterBankIter++ * fftVec[i]);
+            }
+
+            melEnergies[bin] = melEnergy;
+        }
+
+        return true;
+    }
+
+    void Wav2LetterMFCC::ConvertToLogarithmicScale(
+            std::vector<float>& melEnergies)
+    {
+        float maxMelEnergy = -FLT_MAX;
+
+        /* Container for natural logarithms of mel energies. */
+        std::vector<float> vecLogEnergies(melEnergies.size(), 0.f);
+
+        /* Because we are taking natural logs, we need to multiply by log10(e).
+         * Also, for wav2letter model, we scale our log10 values by 10. */
+        constexpr float multiplier = 10.0 *               /* Default scalar. */
+                                     0.4342944819032518;  /* log10f(std::exp(1.0)) */
+
+        /* Take log of the whole vector. */
+        math::MathUtils::VecLogarithmF32(melEnergies, vecLogEnergies);
+
+        /* Scale the log values and get the max. */
+        for (auto iterM = melEnergies.begin(), iterL = vecLogEnergies.begin();
+             iterM != melEnergies.end() && iterL != vecLogEnergies.end(); ++iterM, ++iterL) {
+
+            *iterM = *iterL * multiplier;
+
+            /* Save the max mel energy. */
+            if (*iterM > maxMelEnergy) {
+                maxMelEnergy = *iterM;
+            }
+        }
+
+        /* Clamp the mel energies. */
+        constexpr float maxDb = 80.0;
+        const float clampLevelLowdB = maxMelEnergy - maxDb;
+        for (float& melEnergy : melEnergies) {
+            melEnergy = std::max(melEnergy, clampLevelLowdB);
+        }
+    }
+
+    std::vector<float> Wav2LetterMFCC::CreateDCTMatrix(
+            const int32_t inputLength,
+            const int32_t coefficientCount)
+    {
+        std::vector<float> dctMatrix(inputLength * coefficientCount);
+
+        /* Orthonormal normalization. */
+        const float normalizerK0 = 2 * math::MathUtils::SqrtF32(1.0f /
+                                        static_cast<float>(4 * inputLength));
+        const float normalizer = 2 * math::MathUtils::SqrtF32(1.0f /
+                                        static_cast<float>(2 * inputLength));
+
+        const float angleIncr = M_PI / inputLength;
+        float angle = angleIncr;  /* We start using it at k = 1 loop. */
+
+        /* First row of DCT will use normalizer K0. */
+        for (int32_t n = 0; n < inputLength; ++n) {
+            dctMatrix[n] = normalizerK0;  /* cos(0) = 1 */
+        }
+
+        /* Second row (index = 1) onwards, we use standard normalizer. */
+        for (int32_t k = 1, m = inputLength; k < coefficientCount; ++k, m += inputLength) {
+            for (int32_t n = 0; n < inputLength; ++n) {
+                dctMatrix[m + n] = normalizer *
+                    math::MathUtils::CosineF32((n + 0.5f) * angle);
+            }
+            angle += angleIncr;
+        }
+        return dctMatrix;
+    }
+
+    float Wav2LetterMFCC::GetMelFilterBankNormaliser(
+            const float&    leftMel,
+            const float&    rightMel,
+            const bool      useHTKMethod)
+    {
+        /* Slaney normalization for mel weights. */
+        return (2.0f / (MFCC::InverseMelScale(rightMel, useHTKMethod) -
+                        MFCC::InverseMelScale(leftMel, useHTKMethod)));
+    }
+
+} /* namespace audio */
+} /* namespace app */
+} /* namespace arm */
diff --git a/source/application/api/use_case/asr/src/Wav2LetterModel.cc b/source/application/api/use_case/asr/src/Wav2LetterModel.cc
new file mode 100644
index 0000000..7b1e521
--- /dev/null
+++ b/source/application/api/use_case/asr/src/Wav2LetterModel.cc
@@ -0,0 +1,42 @@
+/*
+ * Copyright (c) 2021 Arm Limited. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "Wav2LetterModel.hpp"
+
+#include "log_macros.h"
+
+
+const tflite::MicroOpResolver& arm::app::Wav2LetterModel::GetOpResolver()
+{
+    return this->m_opResolver;
+}
+
+bool arm::app::Wav2LetterModel::EnlistOperations()
+{
+    this->m_opResolver.AddConv2D();
+    this->m_opResolver.AddReshape();
+    this->m_opResolver.AddLeakyRelu();
+    this->m_opResolver.AddSoftmax();
+
+    if (kTfLiteOk == this->m_opResolver.AddEthosU()) {
+        info("Added %s support to op resolver\n",
+            tflite::GetString_ETHOSU());
+    } else {
+        printf_err("Failed to add Arm NPU support to op resolver.\n");
+        return false;
+    }
+    return true;
+}
diff --git a/source/application/api/use_case/asr/src/Wav2LetterPostprocess.cc b/source/application/api/use_case/asr/src/Wav2LetterPostprocess.cc
new file mode 100644
index 0000000..00e689b
--- /dev/null
+++ b/source/application/api/use_case/asr/src/Wav2LetterPostprocess.cc
@@ -0,0 +1,214 @@
+/*
+ * Copyright (c) 2021-2022 Arm Limited. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "Wav2LetterPostprocess.hpp"
+
+#include "Wav2LetterModel.hpp"
+#include "log_macros.h"
+
+#include <cmath>
+
+namespace arm {
+namespace app {
+
+    AsrPostProcess::AsrPostProcess(TfLiteTensor* outputTensor, AsrClassifier& classifier,
+            const std::vector<std::string>& labels, std::vector<ClassificationResult>& results,
+            const uint32_t outputContextLen,
+            const uint32_t blankTokenIdx, const uint32_t reductionAxisIdx
+            ):
+            m_classifier(classifier),
+            m_outputTensor(outputTensor),
+            m_labels{labels},
+            m_results(results),
+            m_outputContextLen(outputContextLen),
+            m_countIterations(0),
+            m_blankTokenIdx(blankTokenIdx),
+            m_reductionAxisIdx(reductionAxisIdx)
+    {
+        this->m_outputInnerLen = AsrPostProcess::GetOutputInnerLen(this->m_outputTensor, this->m_outputContextLen);
+        this->m_totalLen = (2 * this->m_outputContextLen + this->m_outputInnerLen);
+    }
+
+    bool AsrPostProcess::DoPostProcess()
+    {
+        /* Basic checks. */
+        if (!this->IsInputValid(this->m_outputTensor, this->m_reductionAxisIdx)) {
+            return false;
+        }
+
+        /* Irrespective of tensor type, we use an unsigned "byte" pointer. */
+        auto* ptrData = tflite::GetTensorData<uint8_t>(this->m_outputTensor);
+        const uint32_t elemSz = AsrPostProcess::GetTensorElementSize(this->m_outputTensor);
+
+        /* Other sanity checks. */
+        if (0 == elemSz) {
+            printf_err("Tensor type not supported for post processing\n");
+            return false;
+        } else if (elemSz * this->m_totalLen > this->m_outputTensor->bytes) {
+            printf_err("Insufficient number of tensor bytes\n");
+            return false;
+        }
+
+        /* Which axis do we need to process? */
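
The erasure dispatched below is easiest to picture on a small row-major matrix: rows are time steps, the context rows at either end are zeroed, and their blank-token column is forced high so the decode stage ignores them. A self-contained sketch with made-up sizes:

    #include <cstdio>
    #include <cstring>

    /* Illustrative only: rows = time steps, columns = classes. The left
     * and right context rows are overwritten with a one-hot blank token. */
    int main()
    {
        const unsigned ctxLen = 2, innerLen = 3, nCols = 4, blankIdx = 3;
        const unsigned totalRows = 2 * ctxLen + innerLen;
        unsigned char data[7][4];
        std::memset(data, 7, sizeof(data));  /* pretend inference output */

        /* Erase left context (skipped on the first iteration in the real code). */
        std::memset(data, 0, ctxLen * nCols);
        for (unsigned r = 0; r < ctxLen; ++r) { data[r][blankIdx] = 1; }

        /* Erase right context (skipped on the last iteration). */
        std::memset(data[ctxLen + innerLen], 0, ctxLen * nCols);
        for (unsigned r = ctxLen + innerLen; r < totalRows; ++r) { data[r][blankIdx] = 1; }

        for (unsigned r = 0; r < totalRows; ++r) {
            printf("row %u: %d %d %d %d\n", r,
                   data[r][0], data[r][1], data[r][2], data[r][3]);
        }
        return 0;
    }

Only the three middle (inner) rows keep their original scores; everything else decodes to the blank token.
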
+        switch (this->m_reductionAxisIdx) {
+            case Wav2LetterModel::ms_outputRowsIdx:
+                this->EraseSectionsRowWise(
+                        ptrData, elemSz * this->m_outputTensor->dims->data[Wav2LetterModel::ms_outputColsIdx],
+                        this->m_lastIteration);
+                break;
+            default:
+                printf_err("Unsupported axis index: %" PRIu32 "\n", this->m_reductionAxisIdx);
+                return false;
+        }
+        this->m_classifier.GetClassificationResults(this->m_outputTensor,
+                this->m_results, this->m_labels, 1);
+
+        return true;
+    }
+
+    bool AsrPostProcess::IsInputValid(TfLiteTensor* tensor, const uint32_t axisIdx) const
+    {
+        if (nullptr == tensor) {
+            return false;
+        }
+
+        if (static_cast<int>(axisIdx) >= tensor->dims->size) {
+            printf_err("Invalid axis index: %" PRIu32 "; Max: %d\n",
+                axisIdx, tensor->dims->size);
+            return false;
+        }
+
+        if (static_cast<int>(this->m_totalLen) !=
+                             tensor->dims->data[axisIdx]) {
+            printf_err("Unexpected tensor dimension for axis %" PRIu32 ", got %d.\n",
+                axisIdx, tensor->dims->data[axisIdx]);
+            return false;
+        }
+
+        return true;
+    }
+
+    uint32_t AsrPostProcess::GetTensorElementSize(TfLiteTensor* tensor)
+    {
+        switch (tensor->type) {
+            case kTfLiteUInt8:
+            case kTfLiteInt8:
+                return 1;
+            case kTfLiteInt16:
+                return 2;
+            case kTfLiteInt32:
+            case kTfLiteFloat32:
+                return 4;
+            default:
+                printf_err("Unsupported tensor type %s\n",
+                    TfLiteTypeGetName(tensor->type));
+        }
+
+        return 0;
+    }
+
+    bool AsrPostProcess::EraseSectionsRowWise(
+            uint8_t*         ptrData,
+            const uint32_t   strideSzBytes,
+            const bool       lastIteration)
+    {
+        /* In this case, the "zero-ing" is quite simple as the region
+         * to be zeroed sits in contiguous memory (row-major). */
+        const uint32_t eraseLen = strideSzBytes * this->m_outputContextLen;
+
+        /* Erase left context? */
+        if (this->m_countIterations > 0) {
+            /* Set output of each classification window to the blank token. */
+            std::memset(ptrData, 0, eraseLen);
+            for (size_t windowIdx = 0; windowIdx < this->m_outputContextLen; windowIdx++) {
+                ptrData[windowIdx * strideSzBytes + this->m_blankTokenIdx] = 1;
+            }
+        }
+
+        /* Erase right context? */
+        if (false == lastIteration) {
+            uint8_t* rightCtxPtr = ptrData + (strideSzBytes * (this->m_outputContextLen + this->m_outputInnerLen));
+            /* Set output of each classification window to the blank token. */
+            std::memset(rightCtxPtr, 0, eraseLen);
+            for (size_t windowIdx = 0; windowIdx < this->m_outputContextLen; windowIdx++) {
+                rightCtxPtr[windowIdx * strideSzBytes + this->m_blankTokenIdx] = 1;
+            }
+        }
+
+        if (lastIteration) {
+            this->m_countIterations = 0;
+        } else {
+            ++this->m_countIterations;
+        }
+
+        return true;
+    }
+
+    uint32_t AsrPostProcess::GetNumFeatureVectors(const Model& model)
+    {
+        TfLiteTensor* inputTensor = model.GetInputTensor(0);
+        const int inputRows = std::max(inputTensor->dims->data[Wav2LetterModel::ms_inputRowsIdx], 0);
+        if (inputRows == 0) {
+            printf_err("Error getting number of input rows for axis: %" PRIu32 "\n",
+                Wav2LetterModel::ms_inputRowsIdx);
+        }
+        return inputRows;
+    }
+
+    uint32_t AsrPostProcess::GetOutputInnerLen(const TfLiteTensor* outputTensor, const uint32_t outputCtxLen)
+    {
+        const uint32_t outputRows = std::max(outputTensor->dims->data[Wav2LetterModel::ms_outputRowsIdx], 0);
+        if (outputRows == 0) {
+            printf_err("Error getting number of output rows for axis: %" PRIu32 "\n",
+                Wav2LetterModel::ms_outputRowsIdx);
+        }
+
+        /* Watching for underflow. */
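
A worked example of the computation that follows, using hypothetical sizes (not taken from the patch): with 148 output rows and a context length of 49 on each side, the usable inner region is 148 - 2*49 = 50 rows, and the std::max guards against a context length too large for the tensor:

    #include <algorithm>
    #include <cstdio>

    int main()
    {
        const int outputRows = 148, outputCtxLen = 49;  /* made-up values */
        const int innerLen = outputRows - (2 * outputCtxLen);
        printf("inner length: %d\n", std::max(innerLen, 0));  /* prints 50 */
        return 0;
    }
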
+        int innerLen = (outputRows - (2 * outputCtxLen));
+
+        return std::max(innerLen, 0);
+    }
+
+    uint32_t AsrPostProcess::GetOutputContextLen(const Model& model, const uint32_t inputCtxLen)
+    {
+        const uint32_t inputRows = AsrPostProcess::GetNumFeatureVectors(model);
+        const uint32_t inputInnerLen = inputRows - (2 * inputCtxLen);
+        constexpr uint32_t ms_outputRowsIdx = Wav2LetterModel::ms_outputRowsIdx;
+
+        /* Check to make sure that the input tensor supports the above
+         * context and inner lengths. */
+        if (inputRows <= 2 * inputCtxLen || inputRows <= inputInnerLen) {
+            printf_err("Input rows not compatible with ctx of %" PRIu32 "\n",
+                inputCtxLen);
+            return 0;
+        }
+
+        TfLiteTensor* outputTensor = model.GetOutputTensor(0);
+        const uint32_t outputRows = std::max(outputTensor->dims->data[ms_outputRowsIdx], 0);
+        if (outputRows == 0) {
+            printf_err("Error getting number of output rows for axis: %" PRIu32 "\n",
+                Wav2LetterModel::ms_outputRowsIdx);
+            return 0;
+        }
+
+        const float inOutRowRatio = static_cast<float>(inputRows) /
+                                    static_cast<float>(outputRows);
+
+        return std::round(static_cast<float>(inputCtxLen) / inOutRowRatio);
+    }
+
+} /* namespace app */
+} /* namespace arm */
diff --git a/source/application/api/use_case/asr/src/Wav2LetterPreprocess.cc b/source/application/api/use_case/asr/src/Wav2LetterPreprocess.cc
new file mode 100644
index 0000000..92b0631
--- /dev/null
+++ b/source/application/api/use_case/asr/src/Wav2LetterPreprocess.cc
@@ -0,0 +1,208 @@
+/*
+ * Copyright (c) 2021-2022 Arm Limited. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "Wav2LetterPreprocess.hpp"
+
+#include "PlatformMath.hpp"
+#include "TensorFlowLiteMicro.hpp"
+
+#include <algorithm>
+#include <cmath>
+
+namespace arm {
+namespace app {
+
+    AsrPreProcess::AsrPreProcess(TfLiteTensor* inputTensor, const uint32_t numMfccFeatures,
+                                 const uint32_t numFeatureFrames, const uint32_t mfccWindowLen,
+                                 const uint32_t mfccWindowStride
+            ):
+            m_mfcc(numMfccFeatures, mfccWindowLen),
+            m_inputTensor(inputTensor),
+            m_mfccBuf(numMfccFeatures, numFeatureFrames),
+            m_delta1Buf(numMfccFeatures, numFeatureFrames),
+            m_delta2Buf(numMfccFeatures, numFeatureFrames),
+            m_mfccWindowLen(mfccWindowLen),
+            m_mfccWindowStride(mfccWindowStride),
+            m_numMfccFeats(numMfccFeatures),
+            m_numFeatureFrames(numFeatureFrames)
+    {
+        if (numMfccFeatures > 0 && mfccWindowLen > 0) {
+            this->m_mfcc.Init();
+        }
+    }
+
+    bool AsrPreProcess::DoPreProcess(const void* audioData, const size_t audioDataLen)
+    {
+        this->m_mfccSlidingWindow = audio::SlidingWindow<const int16_t>(
+                static_cast<const int16_t*>(audioData), audioDataLen,
+                this->m_mfccWindowLen, this->m_mfccWindowStride);
+
+        uint32_t mfccBufIdx = 0;
+
+        std::fill(m_mfccBuf.begin(), m_mfccBuf.end(), 0.f);
+        std::fill(m_delta1Buf.begin(), m_delta1Buf.end(), 0.f);
+        std::fill(m_delta2Buf.begin(), m_delta2Buf.end(), 0.f);
+
+        /* While we can slide over the audio. */
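
How many windows the loop below visits is fixed by the window length and stride. A worked example under assumed sizes (none of these numbers come from the patch):

    #include <cstddef>
    #include <cstdio>

    /* Illustrative only: a 47600-sample clip, a 512-sample MFCC window and
     * a 160-sample stride give 1 + (47600 - 512) / 160 = 295 full windows. */
    int main()
    {
        const size_t audioLen = 47600, winLen = 512, stride = 160;
        const size_t numWindows =
            (audioLen < winLen) ? 0 : 1 + (audioLen - winLen) / stride;
        printf("MFCC windows: %zu\n", numWindows);  /* prints 295 */
        return 0;
    }

If this count falls short of the required number of feature frames, the padding branch after the loop tops the buffer up with the MFCC of an all-zeros window.
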
+        while (this->m_mfccSlidingWindow.HasNext()) {
+            const int16_t* mfccWindow = this->m_mfccSlidingWindow.Next();
+            auto mfccAudioData = std::vector<int16_t>(
+                    mfccWindow,
+                    mfccWindow + this->m_mfccWindowLen);
+            auto mfcc = this->m_mfcc.MfccCompute(mfccAudioData);
+            for (size_t i = 0; i < this->m_mfccBuf.size(0); ++i) {
+                this->m_mfccBuf(i, mfccBufIdx) = mfcc[i];
+            }
+            ++mfccBufIdx;
+        }
+
+        /* Pad MFCC if needed by adding MFCC for zeros. */
+        if (mfccBufIdx != this->m_numFeatureFrames) {
+            std::vector<int16_t> zerosWindow = std::vector<int16_t>(this->m_mfccWindowLen, 0);
+            std::vector<float> mfccZeros = this->m_mfcc.MfccCompute(zerosWindow);
+
+            while (mfccBufIdx != this->m_numFeatureFrames) {
+                memcpy(&this->m_mfccBuf(0, mfccBufIdx),
+                       mfccZeros.data(), sizeof(float) * m_numMfccFeats);
+                ++mfccBufIdx;
+            }
+        }
+
+        /* Compute first and second order deltas from MFCCs. */
+        AsrPreProcess::ComputeDeltas(this->m_mfccBuf, this->m_delta1Buf, this->m_delta2Buf);
+
+        /* Standardize calculated features. */
+        this->Standarize();
+
+        /* Quantise. */
+        QuantParams quantParams = GetTensorQuantParams(this->m_inputTensor);
+
+        if (0 == quantParams.scale) {
+            printf_err("Quantisation scale can't be 0\n");
+            return false;
+        }
+
+        switch (this->m_inputTensor->type) {
+            case kTfLiteUInt8:
+                return this->Quantise<uint8_t>(
+                        tflite::GetTensorData<uint8_t>(this->m_inputTensor), this->m_inputTensor->bytes,
+                        quantParams.scale, quantParams.offset);
+            case kTfLiteInt8:
+                return this->Quantise<int8_t>(
+                        tflite::GetTensorData<int8_t>(this->m_inputTensor), this->m_inputTensor->bytes,
+                        quantParams.scale, quantParams.offset);
+            default:
+                printf_err("Unsupported tensor type %s\n",
+                    TfLiteTypeGetName(this->m_inputTensor->type));
+        }
+
+        return false;
+    }
+
+    bool AsrPreProcess::ComputeDeltas(Array2d<float>& mfcc,
+                                      Array2d<float>& delta1,
+                                      Array2d<float>& delta2)
+    {
+        const std::vector<float> delta1Coeffs =
+            {6.66666667e-02,  5.00000000e-02,  3.33333333e-02,
+             1.66666667e-02, -3.46944695e-18, -1.66666667e-02,
+            -3.33333333e-02, -5.00000000e-02, -6.66666667e-02};
+
+        const std::vector<float> delta2Coeffs =
+            {0.06060606,  0.01515152, -0.01731602,
+            -0.03679654, -0.04329004, -0.03679654,
+            -0.01731602,  0.01515152,  0.06060606};
+
+        if (delta1.size(0) == 0 || delta2.size(0) != delta1.size(0) ||
+            mfcc.size(0) == 0 || mfcc.size(1) == 0) {
+            return false;
+        }
+
+        /* Get the middle index; coeff vec len should always be odd. */
+        const size_t coeffLen = delta1Coeffs.size();
+        const size_t fMidIdx = (coeffLen - 1) / 2;
+        const size_t numFeatures = mfcc.size(0);
+        const size_t numFeatVectors = mfcc.size(1);
+
+        /* Iterate through features in MFCC vector. */
+        for (size_t i = 0; i < numFeatures; ++i) {
+            /* For each feature, iterate through time (t) samples representing feature evolution and
+             * calculate d/dt and d^2/dt^2, using 1D convolution with differential kernels.
+             * Convolution padding = valid, result size is `time length - kernel length + 1`.
+             * The result is padded with 0 from both sides to match the size of initial time samples data.
+             *
+             * For the small filter, conv1D implementation as a simple loop is efficient enough.
+             * Filters of a greater size would need CMSIS-DSP functions to be used, like arm_fir_f32. */
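
The "valid" 1D convolution described above can be tried in isolation. This sketch uses a made-up antisymmetric 9-tap kernel (not the real delta coefficients listed earlier) over a linear ramp, with the same flipped-kernel indexing and zero padding as the loop that follows:

    #include <cstdio>
    #include <vector>

    int main()
    {
        const std::vector<float> x = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11};
        const std::vector<float> k = {4, 3, 2, 1, 0, -1, -2, -3, -4};  /* toy kernel */
        const size_t mid = (k.size() - 1) / 2;

        std::vector<float> y(x.size(), 0.f);  /* zero-padded at both ends */
        for (size_t j = mid; j < x.size() - mid; ++j) {
            float acc = 0.f;
            /* Flipped kernel indexing, as in the loop below. */
            for (size_t n = 0, m = k.size() - 1; n < k.size(); ++n, --m) {
                acc += x[j - mid + n] * k[m];
            }
            y[j] = acc;
        }
        /* Prints: 0 0 0 0 60 60 60 60 0 0 0 0 (constant, as the input slope is constant). */
        for (float v : y) { printf("%.0f ", v); }
        printf("\n");
        return 0;
    }
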
+
+            for (size_t j = fMidIdx; j < numFeatVectors - fMidIdx; ++j) {
+                float d1 = 0;
+                float d2 = 0;
+                const size_t mfccStIdx = j - fMidIdx;
+
+                for (size_t k = 0, m = coeffLen - 1; k < coeffLen; ++k, --m) {
+
+                    d1 += mfcc(i, mfccStIdx + k) * delta1Coeffs[m];
+                    d2 += mfcc(i, mfccStIdx + k) * delta2Coeffs[m];
+                }
+
+                delta1(i, j) = d1;
+                delta2(i, j) = d2;
+            }
+        }
+
+        return true;
+    }
+
+    void AsrPreProcess::StandardizeVecF32(Array2d<float>& vec)
+    {
+        auto mean = math::MathUtils::MeanF32(vec.begin(), vec.totalSize());
+        auto stddev = math::MathUtils::StdDevF32(vec.begin(), vec.totalSize(), mean);
+
+        debug("Mean: %f, Stddev: %f\n", mean, stddev);
+        if (stddev == 0) {
+            std::fill(vec.begin(), vec.end(), 0);
+        } else {
+            const float stddevInv = 1.f / stddev;
+            const float normalisedMean = mean / stddev;
+
+            auto NormalisingFunction = [=](float& value) {
+                value = value * stddevInv - normalisedMean;
+            };
+            std::for_each(vec.begin(), vec.end(), NormalisingFunction);
+        }
+    }
+
+    void AsrPreProcess::Standarize()
+    {
+        AsrPreProcess::StandardizeVecF32(this->m_mfccBuf);
+        AsrPreProcess::StandardizeVecF32(this->m_delta1Buf);
+        AsrPreProcess::StandardizeVecF32(this->m_delta2Buf);
+    }
+
+    float AsrPreProcess::GetQuantElem(
+            const float  elem,
+            const float  quantScale,
+            const int    quantOffset,
+            const float  minVal,
+            const float  maxVal)
+    {
+        float val = std::round((elem / quantScale) + quantOffset);
+        return std::min(std::max(val, minVal), maxVal);
+    }
+
+} /* namespace app */
+} /* namespace arm */
\ No newline at end of file
diff --git a/source/application/api/use_case/img_class/CMakeLists.txt b/source/application/api/use_case/img_class/CMakeLists.txt
new file mode 100644
index 0000000..f4818d8
--- /dev/null
+++ b/source/application/api/use_case/img_class/CMakeLists.txt
@@ -0,0 +1,39 @@
+#----------------------------------------------------------------------------
+# Copyright (c) 2022 Arm Limited. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#---------------------------------------------------------------------------- +######################################################### +# IMG CLASS API library # +######################################################### +cmake_minimum_required(VERSION 3.15.6) + +set(IMG_CLASS_API_TARGET img_class_api) +project(${IMG_CLASS_API_TARGET} + DESCRIPTION "Image classification use case API library" + LANGUAGES C CXX) + +# Create static library +add_library(${IMG_CLASS_API_TARGET} STATIC + src/ImgClassProcessing.cc + src/MobileNetModel.cc) + +target_include_directories(${IMG_CLASS_API_TARGET} PUBLIC include) + +target_link_libraries(${IMG_CLASS_API_TARGET} PUBLIC common_api) + +message(STATUS "*******************************************************") +message(STATUS "Library : " ${IMG_CLASS_API_TARGET}) +message(STATUS "CMAKE_SYSTEM_PROCESSOR : " ${CMAKE_SYSTEM_PROCESSOR}) +message(STATUS "*******************************************************") diff --git a/source/application/api/use_case/img_class/include/ImgClassProcessing.hpp b/source/application/api/use_case/img_class/include/ImgClassProcessing.hpp new file mode 100644 index 0000000..55b5ce1 --- /dev/null +++ b/source/application/api/use_case/img_class/include/ImgClassProcessing.hpp @@ -0,0 +1,91 @@ +/* + * Copyright (c) 2022 Arm Limited. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef IMG_CLASS_PROCESSING_HPP +#define IMG_CLASS_PROCESSING_HPP + +#include "BaseProcessing.hpp" +#include "Classifier.hpp" + +namespace arm { +namespace app { + + /** + * @brief Pre-processing class for Image Classification use case. + * Implements methods declared by BasePreProcess and anything else needed + * to populate input tensors ready for inference. + */ + class ImgClassPreProcess : public BasePreProcess { + + public: + /** + * @brief Constructor + * @param[in] inputTensor Pointer to the TFLite Micro input Tensor. + * @param[in] convertToInt8 Should the image be converted to Int8 range. + **/ + explicit ImgClassPreProcess(TfLiteTensor* inputTensor, bool convertToInt8); + + /** + * @brief Should perform pre-processing of 'raw' input image data and load it into + * TFLite Micro input tensors ready for inference + * @param[in] input Pointer to the data that pre-processing will work on. + * @param[in] inputSize Size of the input data. + * @return true if successful, false otherwise. + **/ + bool DoPreProcess(const void* input, size_t inputSize) override; + + private: + TfLiteTensor* m_inputTensor; + bool m_convertToInt8; + }; + + /** + * @brief Post-processing class for Image Classification use case. + * Implements methods declared by BasePostProcess and anything else needed + * to populate result vector. + */ + class ImgClassPostProcess : public BasePostProcess { + + public: + /** + * @brief Constructor + * @param[in] outputTensor Pointer to the TFLite Micro output Tensor. + * @param[in] classifier Classifier object used to get top N results from classification. 
+         * @param[in]   labels         Vector of string labels to identify each output of the model.
+         * @param[in]   results        Vector of classification results to store decoded outputs.
+         **/
+        ImgClassPostProcess(TfLiteTensor* outputTensor, Classifier& classifier,
+                            const std::vector<std::string>& labels,
+                            std::vector<ClassificationResult>& results);
+
+        /**
+         * @brief    Performs post-processing of the result of inference, then
+         *           populates classification result data for any later use.
+         * @return   true if successful, false otherwise.
+         **/
+        bool DoPostProcess() override;
+
+    private:
+        TfLiteTensor*                       m_outputTensor;
+        Classifier&                         m_imgClassifier;
+        const std::vector<std::string>&     m_labels;
+        std::vector<ClassificationResult>&  m_results;
+    };
+
+} /* namespace app */
+} /* namespace arm */
+
+#endif /* IMG_CLASS_PROCESSING_HPP */
\ No newline at end of file
diff --git a/source/application/api/use_case/img_class/include/MobileNetModel.hpp b/source/application/api/use_case/img_class/include/MobileNetModel.hpp
new file mode 100644
index 0000000..adaa9c2
--- /dev/null
+++ b/source/application/api/use_case/img_class/include/MobileNetModel.hpp
@@ -0,0 +1,51 @@
+/*
+ * Copyright (c) 2021 Arm Limited. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef IMG_CLASS_MOBILENETMODEL_HPP
+#define IMG_CLASS_MOBILENETMODEL_HPP
+
+#include "Model.hpp"
+
+namespace arm {
+namespace app {
+
+    class MobileNetModel : public Model {
+
+    public:
+        /* Indices for the expected model - based on input tensor shape. */
+        static constexpr uint32_t ms_inputRowsIdx     = 1;
+        static constexpr uint32_t ms_inputColsIdx     = 2;
+        static constexpr uint32_t ms_inputChannelsIdx = 3;
+
+    protected:
+        /** @brief   Gets the reference to op resolver interface class. */
+        const tflite::MicroOpResolver& GetOpResolver() override;
+
+        /** @brief   Adds operations to the op resolver instance. */
+        bool EnlistOperations() override;
+
+    private:
+        /* Maximum number of individual operations that can be enlisted. */
+        static constexpr int ms_maxOpCnt = 7;
+
+        /* A mutable op resolver instance. */
+        tflite::MicroMutableOpResolver<ms_maxOpCnt> m_opResolver;
+    };
+
+} /* namespace app */
+} /* namespace arm */
+
+#endif /* IMG_CLASS_MOBILENETMODEL_HPP */
diff --git a/source/application/api/use_case/img_class/src/ImgClassProcessing.cc b/source/application/api/use_case/img_class/src/ImgClassProcessing.cc
new file mode 100644
index 0000000..491e751
--- /dev/null
+++ b/source/application/api/use_case/img_class/src/ImgClassProcessing.cc
@@ -0,0 +1,66 @@
+/*
+ * Copyright (c) 2022 Arm Limited. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "ImgClassProcessing.hpp"
+
+#include "ImageUtils.hpp"
+#include "log_macros.h"
+
+namespace arm {
+namespace app {
+
+    ImgClassPreProcess::ImgClassPreProcess(TfLiteTensor* inputTensor, bool convertToInt8)
+        : m_inputTensor{inputTensor},
+          m_convertToInt8{convertToInt8}
+    {}
+
+    bool ImgClassPreProcess::DoPreProcess(const void* data, size_t inputSize)
+    {
+        if (data == nullptr) {
+            printf_err("Data pointer is null\n");
+            return false;
+        }
+
+        auto input = static_cast<const uint8_t*>(data);
+
+        std::memcpy(this->m_inputTensor->data.data, input, inputSize);
+        debug("Input tensor populated\n");
+
+        if (this->m_convertToInt8) {
+            image::ConvertImgToInt8(this->m_inputTensor->data.data, this->m_inputTensor->bytes);
+        }
+
+        return true;
+    }
+
+    ImgClassPostProcess::ImgClassPostProcess(TfLiteTensor* outputTensor, Classifier& classifier,
+                                             const std::vector<std::string>& labels,
+                                             std::vector<ClassificationResult>& results)
+        : m_outputTensor{outputTensor},
+          m_imgClassifier{classifier},
+          m_labels{labels},
+          m_results{results}
+    {}
+
+    bool ImgClassPostProcess::DoPostProcess()
+    {
+        return this->m_imgClassifier.GetClassificationResults(
+                this->m_outputTensor, this->m_results,
+                this->m_labels, 5, false);
+    }
+
+} /* namespace app */
+} /* namespace arm */
\ No newline at end of file
diff --git a/source/application/api/use_case/img_class/src/MobileNetModel.cc b/source/application/api/use_case/img_class/src/MobileNetModel.cc
new file mode 100644
index 0000000..b700d70
--- /dev/null
+++ b/source/application/api/use_case/img_class/src/MobileNetModel.cc
@@ -0,0 +1,42 @@
+/*
+ * Copyright (c) 2021 Arm Limited. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
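
A hypothetical caller can make the intended flow of the two classes above concrete. This sketch is illustrative glue only, not part of the change; it assumes a `MobileNetModel` already initialised elsewhere, and `Model` methods (`GetInputTensor`, `GetOutputTensor`, `IsDataSigned`, `RunInference`) as used by the use-case handlers in this codebase:

    #include "ImgClassProcessing.hpp"
    #include "MobileNetModel.hpp"

    #include <cstdint>
    #include <string>
    #include <vector>

    /* Sketch: pre-process an image, run inference, decode the top-5 results. */
    bool ClassifyImage(arm::app::MobileNetModel& model,
                       const uint8_t* image, size_t imageSize,
                       const std::vector<std::string>& labels,
                       std::vector<arm::app::ClassificationResult>& results)
    {
        arm::app::Classifier classifier;
        arm::app::ImgClassPreProcess preProcess(model.GetInputTensor(0),
                                                model.IsDataSigned());
        arm::app::ImgClassPostProcess postProcess(model.GetOutputTensor(0),
                                                  classifier, labels, results);

        return preProcess.DoPreProcess(image, imageSize)
            && model.RunInference()
            && postProcess.DoPostProcess();
    }
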
+ */ +#include "MobileNetModel.hpp" +#include "log_macros.h" + +const tflite::MicroOpResolver& arm::app::MobileNetModel::GetOpResolver() +{ + return this->m_opResolver; +} + +bool arm::app::MobileNetModel::EnlistOperations() +{ + this->m_opResolver.AddDepthwiseConv2D(); + this->m_opResolver.AddConv2D(); + this->m_opResolver.AddAveragePool2D(); + this->m_opResolver.AddAdd(); + this->m_opResolver.AddReshape(); + this->m_opResolver.AddSoftmax(); + + if (kTfLiteOk == this->m_opResolver.AddEthosU()) { + info("Added %s support to op resolver\n", + tflite::GetString_ETHOSU()); + } else { + printf_err("Failed to add Arm NPU support to op resolver."); + return false; + } + return true; +} diff --git a/source/application/api/use_case/inference_runner/CMakeLists.txt b/source/application/api/use_case/inference_runner/CMakeLists.txt new file mode 100644 index 0000000..d0fe629 --- /dev/null +++ b/source/application/api/use_case/inference_runner/CMakeLists.txt @@ -0,0 +1,37 @@ +#---------------------------------------------------------------------------- +# Copyright (c) 2022 Arm Limited. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +#---------------------------------------------------------------------------- +######################################################### +# INFERENCE RUNNER API library # +######################################################### +cmake_minimum_required(VERSION 3.15.6) + +set(INFERENCE_RUNNER_API_TARGET inference_runner_api) +project(${INFERENCE_RUNNER_API_TARGET} + DESCRIPTION "Inference runner use case API library" + LANGUAGES C CXX) + +# Create static library +add_library(${INFERENCE_RUNNER_API_TARGET} STATIC src/TestModel.cc) + +target_include_directories(${INFERENCE_RUNNER_API_TARGET} PUBLIC include) + +target_link_libraries(${INFERENCE_RUNNER_API_TARGET} PUBLIC common_api) + +message(STATUS "*******************************************************") +message(STATUS "Library : " ${INFERENCE_RUNNER_API_TARGET}) +message(STATUS "CMAKE_SYSTEM_PROCESSOR : " ${CMAKE_SYSTEM_PROCESSOR}) +message(STATUS "*******************************************************") diff --git a/source/application/api/use_case/inference_runner/include/TestModel.hpp b/source/application/api/use_case/inference_runner/include/TestModel.hpp new file mode 100644 index 0000000..648198c --- /dev/null +++ b/source/application/api/use_case/inference_runner/include/TestModel.hpp @@ -0,0 +1,43 @@ +/* + * Copyright (c) 2021 Arm Limited. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef INF_RUNNER_TESTMODEL_HPP +#define INF_RUNNER_TESTMODEL_HPP + +#include "Model.hpp" + +namespace arm { +namespace app { + + class TestModel : public Model { + + protected: + /** @brief Gets the reference to op resolver interface class. */ + const tflite::AllOpsResolver& GetOpResolver() override; + + /** @brief Adds operations to the op resolver instance, not needed as using AllOpsResolver. */ + bool EnlistOperations() override {return false;} + + private: + + /* No need to define individual ops at the cost of extra memory. */ + tflite::AllOpsResolver m_opResolver; + }; + +} /* namespace app */ +} /* namespace arm */ + +#endif /* INF_RUNNER_TESTMODEL_HPP */ diff --git a/source/application/api/use_case/inference_runner/src/TestModel.cc b/source/application/api/use_case/inference_runner/src/TestModel.cc new file mode 100644 index 0000000..1891e44 --- /dev/null +++ b/source/application/api/use_case/inference_runner/src/TestModel.cc @@ -0,0 +1,23 @@ +/* + * Copyright (c) 2021 Arm Limited. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "TestModel.hpp" +#include "log_macros.h" + +const tflite::AllOpsResolver& arm::app::TestModel::GetOpResolver() +{ + return this->m_opResolver; +} diff --git a/source/application/api/use_case/kws/CMakeLists.txt b/source/application/api/use_case/kws/CMakeLists.txt new file mode 100644 index 0000000..3256d03 --- /dev/null +++ b/source/application/api/use_case/kws/CMakeLists.txt @@ -0,0 +1,39 @@ +#---------------------------------------------------------------------------- +# Copyright (c) 2022 Arm Limited. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+#----------------------------------------------------------------------------
+#########################################################
+#              KEYWORD SPOTTING API library             #
+#########################################################
+cmake_minimum_required(VERSION 3.15.6)
+
+set(KWS_API_TARGET kws_api)
+project(${KWS_API_TARGET}
+        DESCRIPTION     "Keyword spotting use case API library"
+        LANGUAGES       C CXX)
+
+# Create static library
+add_library(${KWS_API_TARGET} STATIC
+        src/KwsProcessing.cc
+        src/MicroNetKwsModel.cc)
+
+target_include_directories(${KWS_API_TARGET} PUBLIC include)
+
+target_link_libraries(${KWS_API_TARGET} PUBLIC common_api)
+
+message(STATUS "*******************************************************")
+message(STATUS "Library                                : " ${KWS_API_TARGET})
+message(STATUS "CMAKE_SYSTEM_PROCESSOR                 : " ${CMAKE_SYSTEM_PROCESSOR})
+message(STATUS "*******************************************************")
diff --git a/source/application/api/use_case/kws/include/KwsProcessing.hpp b/source/application/api/use_case/kws/include/KwsProcessing.hpp
new file mode 100644
index 0000000..0ede425
--- /dev/null
+++ b/source/application/api/use_case/kws/include/KwsProcessing.hpp
@@ -0,0 +1,137 @@
+/*
+ * Copyright (c) 2022 Arm Limited. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef KWS_PROCESSING_HPP
+#define KWS_PROCESSING_HPP
+
+#include "AudioUtils.hpp"
+#include "BaseProcessing.hpp"
+#include "Classifier.hpp"
+#include "MicroNetKwsMfcc.hpp"
+
+#include <functional>
+
+namespace arm {
+namespace app {
+
+    /**
+     * @brief   Pre-processing class for Keyword Spotting use case.
+     *          Implements methods declared by BasePreProcess and anything else needed
+     *          to populate input tensors ready for inference.
+     */
+    class KwsPreProcess : public BasePreProcess {
+
+    public:
+        /**
+         * @brief       Constructor
+         * @param[in]   inputTensor        Pointer to the TFLite Micro input Tensor.
+         * @param[in]   numFeatures        How many MFCC features to use.
+         * @param[in]   numFeatureFrames   Number of MFCC vectors that need to be calculated
+         *                                 for an inference.
+         * @param[in]   mfccFrameLength    Number of audio samples used to calculate one set of
+         *                                 MFCC values when sliding a window through the audio sample.
+         * @param[in]   mfccFrameStride    Number of audio samples between consecutive windows.
+         **/
+        explicit KwsPreProcess(TfLiteTensor* inputTensor, size_t numFeatures, size_t numFeatureFrames,
+                               int mfccFrameLength, int mfccFrameStride);
+
+        /**
+         * @brief       Should perform pre-processing of 'raw' input audio data and load it into
+         *              TFLite Micro input tensors ready for inference.
+         * @param[in]   input      Pointer to the data that pre-processing will work on.
+         * @param[in]   inputSize  Size of the input data.
+         * @return      true if successful, false otherwise.
+         **/
+        bool DoPreProcess(const void* input, size_t inputSize) override;
+
+        size_t m_audioWindowIndex = 0;  /* Index of audio slider, used when caching features in longer clips. */
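+                                        /* Note: the calling use-case loop is expected to update this
+                                         * index for each successive audio stride; DoPreProcess() only
+                                         * reads it to decide whether cached features can be reused. */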
+        size_t m_audioDataWindowSize;   /* Amount of audio needed for 1 inference. */
+        size_t m_audioDataStride;       /* Amount of audio to stride across if doing >1 inference in longer clips. */
+
+    private:
+        TfLiteTensor* m_inputTensor;    /* Model input tensor. */
+        const int m_mfccFrameLength;
+        const int m_mfccFrameStride;
+        const size_t m_numMfccFrames;   /* How many sets of m_numMfccFeats. */
+
+        audio::MicroNetKwsMFCC m_mfcc;
+        audio::SlidingWindow<const int16_t> m_mfccSlidingWindow;
+        size_t m_numMfccVectorsInAudioStride;
+        size_t m_numReusedMfccVectors;
+        std::function<void (std::vector<int16_t>&, int, bool, size_t)> m_mfccFeatureCalculator;
+
+        /**
+         * @brief Returns a function to perform feature calculation and populates input tensor data with
+         * MFCC data.
+         *
+         * Input tensor data type check is performed to choose the correct MFCC feature data type.
+         * If the tensor has an integer data type then the original features are quantised.
+         *
+         * Warning: the MFCC calculator provided as input must have the same life scope as the returned function.
+         *
+         * @param[in]       mfcc          MFCC feature calculator.
+         * @param[in,out]   inputTensor   Input tensor pointer to store calculated features.
+         * @param[in]       cacheSize     Size of the feature vectors cache (number of feature vectors).
+         * @return          Function to be called providing audio sample and sliding window index.
+         */
+        std::function<void (std::vector<int16_t>&, int, bool, size_t)>
+        GetFeatureCalculator(audio::MicroNetKwsMFCC& mfcc,
+                             TfLiteTensor* inputTensor,
+                             size_t cacheSize);
+
+        template<class T>
+        std::function<void (std::vector<int16_t>&, size_t, bool, size_t)>
+        FeatureCalc(TfLiteTensor* inputTensor, size_t cacheSize,
+                    std::function<std::vector<T> (std::vector<int16_t>&)> compute);
+    };
+
+    /**
+     * @brief   Post-processing class for Keyword Spotting use case.
+     *          Implements methods declared by BasePostProcess and anything else needed
+     *          to populate result vector.
+     */
+    class KwsPostProcess : public BasePostProcess {
+
+    private:
+        TfLiteTensor* m_outputTensor;                   /* Model output tensor. */
+        Classifier& m_kwsClassifier;                    /* KWS Classifier object. */
+        const std::vector<std::string>& m_labels;       /* KWS Labels. */
+        std::vector<ClassificationResult>& m_results;   /* Results vector for a single inference. */
+
+    public:
+        /**
+         * @brief       Constructor
+         * @param[in]   outputTensor   Pointer to the TFLite Micro output Tensor.
+         * @param[in]   classifier     Classifier object used to get top N results from classification.
+         * @param[in]   labels         Vector of string labels to identify each output of the model.
+         * @param[out]  results        Vector of classification results to store decoded outputs.
+         **/
+        KwsPostProcess(TfLiteTensor* outputTensor, Classifier& classifier,
+                       const std::vector<std::string>& labels,
+                       std::vector<ClassificationResult>& results);
+
+        /**
+         * @brief    Should perform post-processing of the result of inference then
+         *           populate KWS result data for any later use.
+         * @return   true if successful, false otherwise.
+         **/
+        bool DoPostProcess() override;
+    };
+
+} /* namespace app */
+} /* namespace arm */
+
+#endif /* KWS_PROCESSING_HPP */
\ No newline at end of file
diff --git a/source/application/api/use_case/kws/include/KwsResult.hpp b/source/application/api/use_case/kws/include/KwsResult.hpp
new file mode 100644
index 0000000..38f32b4
--- /dev/null
+++ b/source/application/api/use_case/kws/include/KwsResult.hpp
@@ -0,0 +1,63 @@
+/*
+ * Copyright (c) 2021 Arm Limited. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef KWS_RESULT_HPP
+#define KWS_RESULT_HPP
+
+#include "ClassificationResult.hpp"
+
+#include <vector>
+
+namespace arm {
+namespace app {
+namespace kws {
+
+    using ResultVec = std::vector<arm::app::ClassificationResult>;
+
+    /* Structure for holding kws result. */
+    class KwsResult {
+
+    public:
+        ResultVec m_resultVec;        /* Container for "thresholded" classification results. */
+        float m_timeStamp;            /* Audio timestamp for this result. */
+        uint32_t m_inferenceNumber;   /* Corresponding inference number. */
+        float m_threshold;            /* Threshold value for `m_resultVec`. */
+
+        KwsResult() = delete;
+        KwsResult(ResultVec& resultVec,
+                  const float timestamp,
+                  const uint32_t inferenceIdx,
+                  const float scoreThreshold) {
+
+            this->m_threshold = scoreThreshold;
+            this->m_timeStamp = timestamp;
+            this->m_inferenceNumber = inferenceIdx;
+
+            this->m_resultVec = ResultVec();
+            for (auto& i : resultVec) {
+                if (i.m_normalisedVal >= this->m_threshold) {
+                    this->m_resultVec.emplace_back(i);
+                }
+            }
+        }
+        ~KwsResult() = default;
+    };
+
+} /* namespace kws */
+} /* namespace app */
+} /* namespace arm */
+
+#endif /* KWS_RESULT_HPP */
\ No newline at end of file
diff --git a/source/application/api/use_case/kws/include/MicroNetKwsMfcc.hpp b/source/application/api/use_case/kws/include/MicroNetKwsMfcc.hpp
new file mode 100644
index 0000000..b2565a3
--- /dev/null
+++ b/source/application/api/use_case/kws/include/MicroNetKwsMfcc.hpp
@@ -0,0 +1,50 @@
+/*
+ * Copyright (c) 2021 Arm Limited. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef KWS_MICRONET_MFCC_HPP
+#define KWS_MICRONET_MFCC_HPP
+
+#include "Mfcc.hpp"
+
+namespace arm {
+namespace app {
+namespace audio {
+
+    /* Class to provide MicroNet specific MFCC calculation requirements.
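+     *
+     * Minimal usage sketch (the feature count and frame length below are
+     * illustrative values, not requirements of this class):
+     *
+     *   arm::app::audio::MicroNetKwsMFCC mfcc(10, 640);
+     *   mfcc.Init();
+     *   std::vector<int16_t> frame(640);                      // one frame of audio
+     *   std::vector<float> coeffs = mfcc.MfccCompute(frame);  // 10 MFCCs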
+     */
+    class MicroNetKwsMFCC : public MFCC {
+
+    public:
+        static constexpr uint32_t ms_defaultSamplingFreq = 16000;
+        static constexpr uint32_t ms_defaultNumFbankBins =    40;
+        static constexpr uint32_t ms_defaultMelLoFreq    =    20;
+        static constexpr uint32_t ms_defaultMelHiFreq    =  4000;
+        static constexpr bool     ms_defaultUseHtkMethod =  true;
+
+        explicit MicroNetKwsMFCC(const size_t numFeats, const size_t frameLen)
+            : MFCC(MfccParams(
+                    ms_defaultSamplingFreq, ms_defaultNumFbankBins,
+                    ms_defaultMelLoFreq, ms_defaultMelHiFreq,
+                    numFeats, frameLen, ms_defaultUseHtkMethod))
+        {}
+        MicroNetKwsMFCC() = delete;
+        ~MicroNetKwsMFCC() = default;
+    };
+
+} /* namespace audio */
+} /* namespace app */
+} /* namespace arm */
+
+#endif /* KWS_MICRONET_MFCC_HPP */
\ No newline at end of file
diff --git a/source/application/api/use_case/kws/include/MicroNetKwsModel.hpp b/source/application/api/use_case/kws/include/MicroNetKwsModel.hpp
new file mode 100644
index 0000000..3d2f3de
--- /dev/null
+++ b/source/application/api/use_case/kws/include/MicroNetKwsModel.hpp
@@ -0,0 +1,63 @@
+/*
+ * Copyright (c) 2021 Arm Limited. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef KWS_MICRONETMODEL_HPP
+#define KWS_MICRONETMODEL_HPP
+
+#include "Model.hpp"
+
+namespace arm {
+namespace app {
+namespace kws {
+    extern const int g_FrameLength;
+    extern const int g_FrameStride;
+    extern const float g_ScoreThreshold;
+    extern const uint32_t g_NumMfcc;
+    extern const uint32_t g_NumAudioWins;
+} /* namespace kws */
+} /* namespace app */
+} /* namespace arm */
+
+namespace arm {
+namespace app {
+
+    class MicroNetKwsModel : public Model {
+    public:
+        /* Indices for the expected model - based on input and output tensor shapes */
+        static constexpr uint32_t ms_inputRowsIdx = 1;
+        static constexpr uint32_t ms_inputColsIdx = 2;
+        static constexpr uint32_t ms_outputRowsIdx = 2;
+        static constexpr uint32_t ms_outputColsIdx = 3;
+
+    protected:
+        /** @brief   Gets the reference to op resolver interface class. */
+        const tflite::MicroOpResolver& GetOpResolver() override;
+
+        /** @brief   Adds operations to the op resolver instance. */
+        bool EnlistOperations() override;
+
+    private:
+        /* Maximum number of individual operations that can be enlisted. */
+        static constexpr int ms_maxOpCnt = 7;
+
+        /* A mutable op resolver instance. */
+        tflite::MicroMutableOpResolver<ms_maxOpCnt> m_opResolver;
+    };
+
+} /* namespace app */
+} /* namespace arm */
+
+#endif /* KWS_MICRONETMODEL_HPP */
diff --git a/source/application/api/use_case/kws/src/KwsProcessing.cc b/source/application/api/use_case/kws/src/KwsProcessing.cc
new file mode 100644
index 0000000..40de498
--- /dev/null
+++ b/source/application/api/use_case/kws/src/KwsProcessing.cc
@@ -0,0 +1,211 @@
+/*
+ * Copyright (c) 2022 Arm Limited. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "KwsProcessing.hpp"
+#include "log_macros.h"
+#include "MicroNetKwsModel.hpp"
+
+#include <cstring>
+
+namespace arm {
+namespace app {
+
+    KwsPreProcess::KwsPreProcess(TfLiteTensor* inputTensor, size_t numFeatures, size_t numMfccFrames,
+                                 int mfccFrameLength, int mfccFrameStride):
+            m_inputTensor{inputTensor},
+            m_mfccFrameLength{mfccFrameLength},
+            m_mfccFrameStride{mfccFrameStride},
+            m_numMfccFrames{numMfccFrames},
+            m_mfcc{audio::MicroNetKwsMFCC(numFeatures, mfccFrameLength)}
+    {
+        this->m_mfcc.Init();
+
+        /* Deduce the data length required for 1 inference from the network parameters. */
+        this->m_audioDataWindowSize = this->m_numMfccFrames * this->m_mfccFrameStride +
+                (this->m_mfccFrameLength - this->m_mfccFrameStride);
+
+        /* Creating an MFCC feature sliding window for the data required for 1 inference. */
+        this->m_mfccSlidingWindow = audio::SlidingWindow<const int16_t>(nullptr, this->m_audioDataWindowSize,
+                this->m_mfccFrameLength, this->m_mfccFrameStride);
+
+        /* For longer audio clips we choose to move by half the audio window size
+         * => for a 1 second window size there is an overlap of 0.5 seconds. */
+        this->m_audioDataStride = this->m_audioDataWindowSize / 2;
+
+        /* To make the previously calculated features re-usable, the stride must be a
+         * multiple of the MFCC features window stride. Reduce the stride through the
+         * audio if needed. */
+        if (0 != this->m_audioDataStride % this->m_mfccFrameStride) {
+            this->m_audioDataStride -= this->m_audioDataStride % this->m_mfccFrameStride;
+        }
+
+        this->m_numMfccVectorsInAudioStride = this->m_audioDataStride / this->m_mfccFrameStride;
+
+        /* Calculate number of the feature vectors in the window overlap region.
+         * These feature vectors will be reused. */
+        this->m_numReusedMfccVectors = this->m_mfccSlidingWindow.TotalStrides() + 1
+                - this->m_numMfccVectorsInAudioStride;
+
+        /* Construct feature calculation function. */
+        this->m_mfccFeatureCalculator = GetFeatureCalculator(this->m_mfcc, this->m_inputTensor,
+                                                             this->m_numReusedMfccVectors);
+
+        if (!this->m_mfccFeatureCalculator) {
+            printf_err("Feature calculator not initialized.");
+        }
+    }
+
+    bool KwsPreProcess::DoPreProcess(const void* data, size_t inputSize)
+    {
+        UNUSED(inputSize);
+        if (data == nullptr) {
+            printf_err("Data pointer is null");
+            return false;
+        }
+
+        /* Set the features sliding window to the new address. */
+        auto input = static_cast<const int16_t*>(data);
+        this->m_mfccSlidingWindow.Reset(input);
+
+        /* Cache is only usable if we have more than 1 inference in an audio clip. */
+        bool useCache = this->m_audioWindowIndex > 0 && this->m_numReusedMfccVectors > 0;
+
+        /* Use a sliding window to calculate MFCC features frame by frame. */
+        while (this->m_mfccSlidingWindow.HasNext()) {
+            const int16_t* mfccWindow = this->m_mfccSlidingWindow.Next();
+
+            std::vector<int16_t> mfccFrameAudioData = std::vector<int16_t>(mfccWindow,
+                    mfccWindow + this->m_mfccFrameLength);
+
+            /* Compute features for this window and write them to the input tensor.
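+             * When useCache is true and this window's index falls inside the
+             * overlap region, the feature calculator returns the cached vector
+             * from the previous stride instead of recomputing the MFCCs.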
+             */
+            this->m_mfccFeatureCalculator(mfccFrameAudioData, this->m_mfccSlidingWindow.Index(),
+                                          useCache, this->m_numMfccVectorsInAudioStride);
+        }
+
+        debug("Input tensor populated\n");
+
+        return true;
+    }
+
+    /**
+     * @brief Generic feature calculator factory.
+     *
+     * Returns a lambda function that computes features using a feature cache.
+     * The actual feature maths is done by the `compute` function supplied as a
+     * parameter. Features are written to the input tensor memory.
+     *
+     * @tparam T                Feature vector type.
+     * @param[in] inputTensor   Model input tensor pointer.
+     * @param[in] cacheSize     Number of feature vectors to cache. Defined by the sliding window overlap.
+     * @param[in] compute       Features calculator function.
+     * @return                  Lambda function to compute features.
+     */
+    template<class T>
+    std::function<void (std::vector<int16_t>&, size_t, bool, size_t)>
+    KwsPreProcess::FeatureCalc(TfLiteTensor* inputTensor, size_t cacheSize,
+                               std::function<std::vector<T> (std::vector<int16_t>&)> compute)
+    {
+        /* Feature cache to be captured by lambda function. */
+        static std::vector<std::vector<T>> featureCache = std::vector<std::vector<T>>(cacheSize);
+
+        return [=](std::vector<int16_t>& audioDataWindow,
+                   size_t index,
+                   bool useCache,
+                   size_t featuresOverlapIndex)
+        {
+            T* tensorData = tflite::GetTensorData<T>(inputTensor);
+            std::vector<T> features;
+
+            /* Reuse features from cache if cache is ready and sliding windows overlap.
+             * Overlap is in the beginning of sliding window with a size of a feature cache. */
+            if (useCache && index < featureCache.size()) {
+                features = std::move(featureCache[index]);
+            } else {
+                features = std::move(compute(audioDataWindow));
+            }
+            auto size = features.size();
+            auto sizeBytes = sizeof(T) * size;
+            std::memcpy(tensorData + (index * size), features.data(), sizeBytes);
+
+            /* Start renewing the cache as soon as the iteration moves out of the windows' overlap.
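+             * For example, with featuresOverlapIndex == N, window N lands in
+             * cache slot 0, window N+1 in slot 1, and so on; the next audio
+             * stride then reads those slots back for its first N windows.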
+             */
+            if (index >= featuresOverlapIndex) {
+                featureCache[index - featuresOverlapIndex] = std::move(features);
+            }
+        };
+    }
+
+    template std::function<void (std::vector<int16_t>&, size_t, bool, size_t)>
+    KwsPreProcess::FeatureCalc<int8_t>(TfLiteTensor* inputTensor,
+                                       size_t cacheSize,
+                                       std::function<std::vector<int8_t> (std::vector<int16_t>&)> compute);
+
+    template std::function<void (std::vector<int16_t>&, size_t, bool, size_t)>
+    KwsPreProcess::FeatureCalc<float>(TfLiteTensor* inputTensor,
+                                      size_t cacheSize,
+                                      std::function<std::vector<float> (std::vector<int16_t>&)> compute);
+
+
+    std::function<void (std::vector<int16_t>&, int, bool, size_t)>
+    KwsPreProcess::GetFeatureCalculator(audio::MicroNetKwsMFCC& mfcc, TfLiteTensor* inputTensor, size_t cacheSize)
+    {
+        std::function<void (std::vector<int16_t>&, size_t, bool, size_t)> mfccFeatureCalc;
+
+        TfLiteQuantization quant = inputTensor->quantization;
+
+        if (kTfLiteAffineQuantization == quant.type) {
+            auto* quantParams = (TfLiteAffineQuantization*) quant.params;
+            const float quantScale = quantParams->scale->data[0];
+            const int quantOffset = quantParams->zero_point->data[0];
+
+            switch (inputTensor->type) {
+                case kTfLiteInt8: {
+                    mfccFeatureCalc = this->FeatureCalc<int8_t>(inputTensor,
+                                                                cacheSize,
+                                                                [=, &mfcc](std::vector<int16_t>& audioDataWindow) {
+                                                                    return mfcc.MfccComputeQuant<int8_t>(audioDataWindow,
+                                                                                                         quantScale,
+                                                                                                         quantOffset);
+                                                                }
+                    );
+                    break;
+                }
+                default:
+                    printf_err("Tensor type %s not supported\n", TfLiteTypeGetName(inputTensor->type));
+            }
+        } else {
+            mfccFeatureCalc = this->FeatureCalc<float>(inputTensor, cacheSize,
+                                                       [&mfcc](std::vector<int16_t>& audioDataWindow) {
+                                                           return mfcc.MfccCompute(audioDataWindow);
+                                                       }
+            );
+        }
+        return mfccFeatureCalc;
+    }
+
+    KwsPostProcess::KwsPostProcess(TfLiteTensor* outputTensor, Classifier& classifier,
+                                   const std::vector<std::string>& labels,
+                                   std::vector<ClassificationResult>& results)
+            :m_outputTensor{outputTensor},
+             m_kwsClassifier{classifier},
+             m_labels{labels},
+             m_results{results}
+    {}
+
+    bool KwsPostProcess::DoPostProcess()
+    {
+        return this->m_kwsClassifier.GetClassificationResults(
+                this->m_outputTensor, this->m_results,
+                this->m_labels, 1, true);
+    }
+
+} /* namespace app */
+} /* namespace arm */
\ No newline at end of file
diff --git a/source/application/api/use_case/kws/src/MicroNetKwsModel.cc b/source/application/api/use_case/kws/src/MicroNetKwsModel.cc
new file mode 100644
index 0000000..bedca99
--- /dev/null
+++ b/source/application/api/use_case/kws/src/MicroNetKwsModel.cc
@@ -0,0 +1,42 @@
+/*
+ * Copyright (c) 2021 Arm Limited. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ +#include "MicroNetKwsModel.hpp" +#include "log_macros.h" + +const tflite::MicroOpResolver& arm::app::MicroNetKwsModel::GetOpResolver() +{ + return this->m_opResolver; +} + +bool arm::app::MicroNetKwsModel::EnlistOperations() +{ + this->m_opResolver.AddReshape(); + this->m_opResolver.AddAveragePool2D(); + this->m_opResolver.AddConv2D(); + this->m_opResolver.AddDepthwiseConv2D(); + this->m_opResolver.AddFullyConnected(); + this->m_opResolver.AddRelu(); + + if (kTfLiteOk == this->m_opResolver.AddEthosU()) { + info("Added %s support to op resolver\n", + tflite::GetString_ETHOSU()); + } else { + printf_err("Failed to add Arm NPU support to op resolver."); + return false; + } + return true; +} diff --git a/source/application/api/use_case/noise_reduction/CMakeLists.txt b/source/application/api/use_case/noise_reduction/CMakeLists.txt new file mode 100644 index 0000000..5fa9a73 --- /dev/null +++ b/source/application/api/use_case/noise_reduction/CMakeLists.txt @@ -0,0 +1,40 @@ +#---------------------------------------------------------------------------- +# Copyright (c) 2022 Arm Limited. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +#---------------------------------------------------------------------------- +######################################################### +# NOISE REDUCTION API library # +######################################################### +cmake_minimum_required(VERSION 3.15.6) + +set(NOISE_REDUCTION_API_TARGET noise_reduction_api) +project(${NOISE_REDUCTION_API_TARGET} + DESCRIPTION "Noise reduction use case API library" + LANGUAGES C CXX) + +# Create static library +add_library(${NOISE_REDUCTION_API_TARGET} STATIC + src/RNNoiseProcessing.cc + src/RNNoiseFeatureProcessor.cc + src/RNNoiseModel.cc) + +target_include_directories(${NOISE_REDUCTION_API_TARGET} PUBLIC include) + +target_link_libraries(${NOISE_REDUCTION_API_TARGET} PUBLIC common_api) + +message(STATUS "*******************************************************") +message(STATUS "Library : " ${NOISE_REDUCTION_API_TARGET}) +message(STATUS "CMAKE_SYSTEM_PROCESSOR : " ${CMAKE_SYSTEM_PROCESSOR}) +message(STATUS "*******************************************************") diff --git a/source/application/api/use_case/noise_reduction/include/RNNoiseFeatureProcessor.hpp b/source/application/api/use_case/noise_reduction/include/RNNoiseFeatureProcessor.hpp new file mode 100644 index 0000000..cbf0e4e --- /dev/null +++ b/source/application/api/use_case/noise_reduction/include/RNNoiseFeatureProcessor.hpp @@ -0,0 +1,341 @@ +/* + * Copyright (c) 2021-2022 Arm Limited. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef RNNOISE_FEATURE_PROCESSOR_HPP
+#define RNNOISE_FEATURE_PROCESSOR_HPP
+
+#include "PlatformMath.hpp"
+
+#include <array>
+#include <cstddef>
+#include <cstdint>
+#include <vector>
+
+namespace arm {
+namespace app {
+namespace rnn {
+
+    using vec1D32F = std::vector<float>;
+    using vec2D32F = std::vector<vec1D32F>;
+    using arrHp    = std::array<float, 2>;
+    using math::FftInstance;
+    using math::FftType;
+
+    class FrameFeatures {
+    public:
+        bool m_silence{false};        /* If frame contains silence or not. */
+        vec1D32F m_featuresVec{};     /* Calculated feature vector to feed to model. */
+        vec1D32F m_fftX{};            /* Vector of floats arranged to represent complex numbers. */
+        vec1D32F m_fftP{};            /* Vector of floats arranged to represent complex numbers. */
+        vec1D32F m_Ex{};              /* Spectral band energy for audio x. */
+        vec1D32F m_Ep{};              /* Spectral band energy for pitch p. */
+        vec1D32F m_Exp{};             /* Correlated spectral energy between x and p. */
+    };
+
+    /**
+     * @brief   RNNoise pre and post processing class based on the 2018 paper from
+     *          Jan-Marc Valin. Recommended reading:
+     *          - https://jmvalin.ca/demo/rnnoise/
+     *          - https://arxiv.org/abs/1709.08243
+     **/
+    class RNNoiseFeatureProcessor {
+    /* Public interface */
+    public:
+        RNNoiseFeatureProcessor();
+        ~RNNoiseFeatureProcessor() = default;
+
+        /**
+         * @brief        Calculates the features from a given audio buffer ready to be sent to the RNNoise model.
+         * @param[in]    audioData   Pointer to the floating point vector
+         *                           with audio data (within the numerical
+         *                           limits of int16_t type).
+         * @param[in]    audioLen    Number of elements in the audio window.
+         * @param[out]   features    FrameFeatures object reference.
+         **/
+        void PreprocessFrame(const float* audioData,
+                             size_t audioLen,
+                             FrameFeatures& features);
+
+        /**
+         * @brief        Use the RNNoise model output gain values with pre-processing features
+         *               to generate audio with noise suppressed.
+         * @param[in]    modelOutput   Output gain values from model.
+         * @param[in]    features      Calculated features from pre-processing step.
+         * @param[out]   outFrame      Output frame to be populated.
+         **/
+        void PostProcessFrame(vec1D32F& modelOutput, FrameFeatures& features, vec1D32F& outFrame);
+
+
+    /* Public constants */
+    public:
+        static constexpr uint32_t FRAME_SIZE_SHIFT{2};
+        static constexpr uint32_t FRAME_SIZE{512};
+        static constexpr uint32_t WINDOW_SIZE{2 * FRAME_SIZE};
+        static constexpr uint32_t FREQ_SIZE{FRAME_SIZE + 1};
+
+        static constexpr uint32_t PITCH_MIN_PERIOD{64};
+        static constexpr uint32_t PITCH_MAX_PERIOD{820};
+        static constexpr uint32_t PITCH_FRAME_SIZE{1024};
+        static constexpr uint32_t PITCH_BUF_SIZE{PITCH_MAX_PERIOD + PITCH_FRAME_SIZE};
+
+        static constexpr uint32_t NB_BANDS{22};
+        static constexpr uint32_t CEPS_MEM{8};
+        static constexpr uint32_t NB_DELTA_CEPS{6};
+
+        static constexpr uint32_t NB_FEATURES{NB_BANDS + 3*NB_DELTA_CEPS + 2};
+
+    /* Private functions */
+    private:
+
+        /**
+         * @brief   Initialises the half window and DCT tables.
+         */
+        void InitTables();
+
+        /**
+         * @brief           Applies a bi-quadratic filter over the audio window.
+         * @param[in]       bHp          Constant coefficient set b (arrHp type).
+         * @param[in]       aHp          Constant coefficient set a (arrHp type).
+         * @param[in,out]   memHpX       Filter state memory, updated by this function.
+         * @param[in,out]   audioWindow  Floating point vector with audio data.
+         **/
+        void BiQuad(
+            const arrHp& bHp,
+            const arrHp& aHp,
+            arrHp& memHpX,
+            vec1D32F& audioWindow);
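+
+        /* Note (reading of the implementation in RNNoiseFeatureProcessor.cc):
+         * BiQuad evaluates the transposed direct-form II recurrence
+         *   y[n] = x[n] + b[0]*x[n-1] + b[1]*x[n-2] - a[0]*y[n-1] - a[1]*y[n-2]
+         * so the bHp = {-2, 1} set used by PreprocessFrame gives the (1, -2, 1)
+         * numerator of a high-pass response. */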
+
+        /**
+         * @brief        Computes features from the "filtered" audio window.
+         * @param[in]    audioWindow  Floating point vector with audio data.
+         * @param[out]   features     FrameFeatures object reference.
+         **/
+        void ComputeFrameFeatures(vec1D32F& audioWindow, FrameFeatures& features);
+
+        /**
+         * @brief        Runs analysis on the audio buffer.
+         * @param[in]    audioWindow  Floating point vector with audio data.
+         * @param[out]   fft          Floating point FFT vector containing real and
+         *                            imaginary pairs of elements. NOTE: this vector
+         *                            does not contain the mirror image (conjugates)
+         *                            part of the spectrum.
+         * @param[out]   energy       Computed energy for each band in the Bark scale.
+         * @param[out]   analysisMem  Buffer that is sequentially (but only partially)
+         *                            populated with new audio data.
+         **/
+        void FrameAnalysis(
+            const vec1D32F& audioWindow,
+            vec1D32F& fft,
+            vec1D32F& energy,
+            vec1D32F& analysisMem);
+
+        /**
+         * @brief           Applies the window function, in-place, over the given
+         *                  floating point buffer.
+         * @param[in,out]   x  Buffer the window will be applied to.
+         **/
+        void ApplyWindow(vec1D32F& x);
+
+        /**
+         * @brief        Computes the FFT for a given vector.
+         * @param[in]    x    Vector to compute the FFT from.
+         * @param[out]   fft  Floating point FFT vector containing real and
+         *                    imaginary pairs of elements. NOTE: this vector
+         *                    does not contain the mirror image (conjugates)
+         *                    part of the spectrum.
+         **/
+        void ForwardTransform(
+            vec1D32F& x,
+            vec1D32F& fft);
+
+        /**
+         * @brief        Computes band energy for each of the 22 Bark scale bands.
+         * @param[in]    fft_X  FFT spectrum (as computed by ForwardTransform).
+         * @param[out]   bandE  Vector with 22 elements populated with energy for
+         *                      each band.
+         **/
+        void ComputeBandEnergy(const vec1D32F& fft_X, vec1D32F& bandE);
+
+        /**
+         * @brief        Computes band energy correlation.
+         * @param[in]    X      FFT vector X.
+         * @param[in]    P      FFT vector P.
+         * @param[out]   bandC  Vector with 22 elements populated with band energy
+         *                      correlation for the two input FFT vectors.
+         **/
+        void ComputeBandCorr(const vec1D32F& X, const vec1D32F& P, vec1D32F& bandC);
+
+        /**
+         * @brief        Performs pitch auto-correlation on a given vector for a
+         *               given lag.
+         * @param[in]    x    Input vector.
+         * @param[out]   ac   Auto-correlation output vector.
+         * @param[in]    lag  Lag value.
+         * @param[in]    n    Number of elements to consider for the correlation
+         *                    computation.
+         **/
+        void AutoCorr(const vec1D32F& x,
+                      vec1D32F& ac,
+                      size_t lag,
+                      size_t n);
+
+        /**
+         * @brief        Computes pitch cross-correlation.
+         * @param[in]    x         Input vector 1.
+         * @param[in]    y         Input vector 2.
+         * @param[out]   xCorr     Cross-correlation output vector.
+         * @param[in]    len       Number of elements to consider for the
+         *                         correlation computation.
+         * @param[in]    maxPitch  Maximum pitch.
+         **/
+        void PitchXCorr(
+            const vec1D32F& x,
+            const vec1D32F& y,
+            vec1D32F& xCorr,
+            size_t len,
+            size_t maxPitch);
+
+        /**
+         * @brief        Computes "Linear Predictor Coefficients".
+         * @param[in]    ac   Correlation vector.
+         * @param[in]    p    Number of elements of input vector to consider.
+         * @param[out]   lpc  Output coefficients vector.
+         **/
+        void LPC(const vec1D32F& ac, int32_t p, vec1D32F& lpc);
+
+        /**
+         * @brief        Custom FIR implementation.
+         * @param[in]    num  FIR coefficient vector.
+         * @param[in]    N    Number of elements.
+         * @param[out]   x    Vector to be processed.
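+         * @note         Applied in place: on return `x` holds the filtered
+         *               samples (see the non-const reference and the call site
+         *               in PitchDownsample).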
+ **/ + void Fir5(const vec1D32F& num, uint32_t N, vec1D32F& x); + + /** + * @brief Down-sample the pitch buffer. + * @param[in,out] pitchBuf Pitch buffer. + * @param[in] pitchBufSz Buffer size. + **/ + void PitchDownsample(vec1D32F& pitchBuf, size_t pitchBufSz); + + /** + * @brief Pitch search function. + * @param[in] xLP Shifted pitch buffer input. + * @param[in] y Pitch buffer input. + * @param[in] len Length to search for. + * @param[in] maxPitch Maximum pitch. + * @return pitch index. + **/ + int PitchSearch(vec1D32F& xLp, vec1D32F& y, uint32_t len, uint32_t maxPitch); + + /** + * @brief Finds the "best" pitch from the buffer. + * @param[in] xCorr Pitch correlation vector. + * @param[in] y Pitch buffer input. + * @param[in] len Length to search for. + * @param[in] maxPitch Maximum pitch. + * @return pitch array (2 elements). + **/ + arrHp FindBestPitch(vec1D32F& xCorr, vec1D32F& y, uint32_t len, uint32_t maxPitch); + + /** + * @brief Remove pitch period doubling errors. + * @param[in,out] pitchBuf Pitch buffer vector. + * @param[in] maxPeriod Maximum period. + * @param[in] minPeriod Minimum period. + * @param[in] frameSize Frame size. + * @param[in] pitchIdx0_ Pitch index 0. + * @return pitch index. + **/ + int RemoveDoubling( + vec1D32F& pitchBuf, + uint32_t maxPeriod, + uint32_t minPeriod, + uint32_t frameSize, + size_t pitchIdx0_); + + /** + * @brief Computes pitch gain. + * @param[in] xy Single xy cross correlation value. + * @param[in] xx Single xx auto correlation value. + * @param[in] yy Single yy auto correlation value. + * @return Calculated pitch gain. + **/ + float ComputePitchGain(float xy, float xx, float yy); + + /** + * @brief Computes DCT vector from the given input. + * @param[in] input Input vector. + * @param[out] output Output vector with DCT coefficients. + **/ + void DCT(vec1D32F& input, vec1D32F& output); + + /** + * @brief Perform inverse fourier transform on complex spectral vector. + * @param[out] out Output vector. + * @param[in] fftXIn Vector of floats arranged to represent complex numbers interleaved. + **/ + void InverseTransform(vec1D32F& out, vec1D32F& fftXIn); + + /** + * @brief Perform pitch filtering. + * @param[in] features Object with pre-processing calculated frame features. + * @param[in] g Gain values. + **/ + void PitchFilter(FrameFeatures& features, vec1D32F& g); + + /** + * @brief Interpolate the band gain values. + * @param[out] g Gain values. + * @param[in] bandE Vector with 22 elements populated with energy for + * each band. + **/ + void InterpBandGain(vec1D32F& g, vec1D32F& bandE); + + /** + * @brief Create de-noised frame. + * @param[out] outFrame Output vector for storing the created audio frame. + * @param[in] fftY Gain adjusted complex spectral vector. + */ + void FrameSynthesis(vec1D32F& outFrame, vec1D32F& fftY); + + /* Private objects */ + private: + FftInstance m_fftInstReal; /* FFT instance for real numbers */ + FftInstance m_fftInstCmplx; /* FFT instance for complex numbers */ + vec1D32F m_halfWindow; /* Window coefficients */ + vec1D32F m_dctTable; /* DCT table */ + vec1D32F m_analysisMem; /* Buffer used for frame analysis */ + vec2D32F m_cepstralMem; /* Cepstral coefficients */ + size_t m_memId; /* memory ID */ + vec1D32F m_synthesisMem; /* Synthesis mem (used by post-processing) */ + vec1D32F m_pitchBuf; /* Pitch buffer */ + float m_lastGain; /* Last gain calculated */ + int m_lastPeriod; /* Last period calculated */ + arrHp m_memHpX; /* HpX coefficients. 
+                                     */
+        vec1D32F m_lastGVec;         /* Last gain vector (used by post-processing) */
+
+        /* Constants */
+        const std::array<uint32_t, NB_BANDS> m_eband5ms {
+             0,  1,  2,  3,  4,  5,  6,  7,  8, 10, 12,
+            14, 16, 20, 24, 28, 34, 40, 48, 60, 78, 100};
+    };
+
+
+} /* namespace rnn */
+} /* namespace app */
+} /* namespace arm */
+
+#endif /* RNNOISE_FEATURE_PROCESSOR_HPP */
diff --git a/source/application/api/use_case/noise_reduction/include/RNNoiseModel.hpp b/source/application/api/use_case/noise_reduction/include/RNNoiseModel.hpp
new file mode 100644
index 0000000..3d2f23c
--- /dev/null
+++ b/source/application/api/use_case/noise_reduction/include/RNNoiseModel.hpp
@@ -0,0 +1,78 @@
+/*
+ * Copyright (c) 2021 Arm Limited. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef RNNOISE_MODEL_HPP
+#define RNNOISE_MODEL_HPP
+
+#include "Model.hpp"
+
+extern const uint32_t g_NumInputFeatures;
+extern const uint32_t g_FrameLength;
+extern const uint32_t g_FrameStride;
+
+namespace arm {
+namespace app {
+
+    class RNNoiseModel : public Model {
+    public:
+        /**
+         * @brief   Runs inference for the RNNoise model.
+         *
+         *          CopyGruStates is called so that the GRU state outputs are copied
+         *          to the GRU state inputs before the inference run.
+         *          Call ResetGruState() to zero the states before starting to
+         *          process a new sequence of logically related data.
+         * @return  True if inference succeeded, False otherwise.
+         */
+        bool RunInference() override;
+
+        /**
+         * @brief   Sets GRU input states to zeros.
+         *          Call this method before starting to process a new sequence of
+         *          logically related data.
+         */
+        void ResetGruState();
+
+        /**
+         * @brief   Copy current GRU output states to input states.
+         *          Call this method before starting to process the next sequence
+         *          of logically related data.
+         */
+        bool CopyGruStates();
+
+        /* Which index of model outputs does the main output (gains) come from. */
+        const size_t m_indexForModelOutput = 1;
+
+    protected:
+        /** @brief   Gets the reference to op resolver interface class. */
+        const tflite::MicroOpResolver& GetOpResolver() override;
+
+        /** @brief   Adds operations to the op resolver instance. */
+        bool EnlistOperations() override;
+
+        /*
+         * Each inference after the first needs to copy 3 GRU states from an
+         * output index to an input index (model dependent): 0 -> 3, 2 -> 2, 3 -> 1.
+         */
+        const std::vector<std::pair<size_t, size_t>> m_gruStateMap = {{0, 3}, {2, 2}, {3, 1}};
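+
+        /* Intended call pattern (sketch; buffer handling and error checks omitted):
+         *
+         *   model.ResetGruState();        // starting a new, unrelated sequence
+         *   for (each audio frame) {
+         *       // ... populate input tensor ...
+         *       model.RunInference();     // GRU states are carried over internally
+         *   }
+         */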
+    private:
+        /* Maximum number of individual operations that can be enlisted. */
+        static constexpr int ms_maxOpCnt = 15;
+
+        /* A mutable op resolver instance. */
+        tflite::MicroMutableOpResolver<ms_maxOpCnt> m_opResolver;
+    };
+
+} /* namespace app */
+} /* namespace arm */
+
+#endif /* RNNOISE_MODEL_HPP */
diff --git a/source/application/api/use_case/noise_reduction/include/RNNoiseProcessing.hpp b/source/application/api/use_case/noise_reduction/include/RNNoiseProcessing.hpp
new file mode 100644
index 0000000..15e62d9
--- /dev/null
+++ b/source/application/api/use_case/noise_reduction/include/RNNoiseProcessing.hpp
@@ -0,0 +1,113 @@
+/*
+ * Copyright (c) 2022 Arm Limited. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef RNNOISE_PROCESSING_HPP
+#define RNNOISE_PROCESSING_HPP
+
+#include "BaseProcessing.hpp"
+#include "Model.hpp"
+#include "RNNoiseFeatureProcessor.hpp"
+
+#include <memory>
+
+namespace arm {
+namespace app {
+
+    /**
+     * @brief   Pre-processing class for Noise Reduction use case.
+     *          Implements methods declared by BasePreProcess and anything else needed
+     *          to populate input tensors ready for inference.
+     */
+    class RNNoisePreProcess : public BasePreProcess {
+
+    public:
+        /**
+         * @brief           Constructor
+         * @param[in]       inputTensor        Pointer to the TFLite Micro input Tensor.
+         * @param[in,out]   featureProcessor   RNNoise specific feature extractor object.
+         * @param[in,out]   frameFeatures      RNNoise specific features shared between pre & post-processing.
+         **/
+        explicit RNNoisePreProcess(TfLiteTensor* inputTensor,
+                                   std::shared_ptr<rnn::RNNoiseFeatureProcessor> featureProcessor,
+                                   std::shared_ptr<rnn::FrameFeatures> frameFeatures);
+
+        /**
+         * @brief        Should perform pre-processing of 'raw' input audio data and load it into
+         *               TFLite Micro input tensors ready for inference.
+         * @param[in]    input      Pointer to the data that pre-processing will work on.
+         * @param[in]    inputSize  Size of the input data.
+         * @return       true if successful, false otherwise.
+         **/
+        bool DoPreProcess(const void* input, size_t inputSize) override;
+
+    private:
+        TfLiteTensor* m_inputTensor;    /* Model input tensor. */
+        std::shared_ptr<rnn::RNNoiseFeatureProcessor> m_featureProcessor;   /* RNNoise feature processor shared between pre & post-processing. */
+        std::shared_ptr<rnn::FrameFeatures> m_frameFeatures;                /* RNNoise features shared between pre & post-processing. */
+        rnn::vec1D32F m_audioFrame;     /* Audio frame cast to FP32 */
+
+        /**
+         * @brief           Quantize the given features and populate the input Tensor.
+         * @param[in]       inputFeatures   Vector of floating point features to quantize.
+         * @param[in]       quantScale      Quantization scale for the inputTensor.
+         * @param[in]       quantOffset     Quantization offset for the inputTensor.
+         * @param[in,out]   inputTensor     TFLite micro tensor to populate.
+         **/
+        static void QuantizeAndPopulateInput(rnn::vec1D32F& inputFeatures,
+                                             float quantScale, int quantOffset,
+                                             TfLiteTensor* inputTensor);
+    };
+
+    /**
+     * @brief   Post-processing class for Noise Reduction use case.
+     *          Implements methods declared by BasePostProcess and anything else needed
+     *          to populate result vector.
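+     *
+     *          A typical frame loop looks like this (sketch; the model object and
+     *          surrounding buffer names are illustrative):
+     *
+     *            RNNoisePreProcess  pre(inputTensor, featureProcessor, frameFeatures);
+     *            RNNoisePostProcess post(outputTensor, denoisedFrame, featureProcessor, frameFeatures);
+     *            pre.DoPreProcess(audioFrame.data(), audioFrame.size());
+     *            model.RunInference();
+     *            post.DoPostProcess();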
+     */
+    class RNNoisePostProcess : public BasePostProcess {
+
+    public:
+        /**
+         * @brief           Constructor
+         * @param[in]       outputTensor         Pointer to the TFLite Micro output Tensor.
+         * @param[out]      denoisedAudioFrame   Vector to store the final denoised audio frame.
+         * @param[in,out]   featureProcessor     RNNoise specific feature extractor object.
+         * @param[in,out]   frameFeatures        RNNoise specific features shared between pre & post-processing.
+         **/
+        RNNoisePostProcess(TfLiteTensor* outputTensor,
+                           std::vector<int16_t>& denoisedAudioFrame,
+                           std::shared_ptr<rnn::RNNoiseFeatureProcessor> featureProcessor,
+                           std::shared_ptr<rnn::FrameFeatures> frameFeatures);
+
+        /**
+         * @brief    Should perform post-processing of the result of inference then
+         *           populate result data for any later use.
+         * @return   true if successful, false otherwise.
+         **/
+        bool DoPostProcess() override;
+
+    private:
+        TfLiteTensor* m_outputTensor;                   /* Model output tensor. */
+        std::vector<int16_t>& m_denoisedAudioFrame;     /* Vector to store the final denoised frame. */
+        rnn::vec1D32F m_denoisedAudioFrameFloat;        /* Internal vector to store the final denoised frame (FP32). */
+        std::shared_ptr<rnn::RNNoiseFeatureProcessor> m_featureProcessor;   /* RNNoise feature processor shared between pre & post-processing. */
+        std::shared_ptr<rnn::FrameFeatures> m_frameFeatures;                /* RNNoise features shared between pre & post-processing. */
+        std::vector<float> m_modelOutputFloat;          /* Internal vector to store de-quantized model output. */
+
+    };
+
+} /* namespace app */
+} /* namespace arm */
+
+#endif /* RNNOISE_PROCESSING_HPP */
\ No newline at end of file
diff --git a/source/application/api/use_case/noise_reduction/src/RNNoiseFeatureProcessor.cc b/source/application/api/use_case/noise_reduction/src/RNNoiseFeatureProcessor.cc
new file mode 100644
index 0000000..036894c
--- /dev/null
+++ b/source/application/api/use_case/noise_reduction/src/RNNoiseFeatureProcessor.cc
@@ -0,0 +1,892 @@
+/*
+ * Copyright (c) 2021-2022 Arm Limited. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "RNNoiseFeatureProcessor.hpp"
+#include "log_macros.h"
+
+#include <algorithm>
+#include <cmath>
+#include <cstdlib>
+
+namespace arm {
+namespace app {
+namespace rnn {
+
+#define VERIFY(x)                                   \
+do {                                                \
+    if (!(x)) {                                     \
+        printf_err("Assert failed:" #x "\n");       \
+        exit(1);                                    \
+    }                                               \
+} while(0)
+
+RNNoiseFeatureProcessor::RNNoiseFeatureProcessor() :
+        m_halfWindow(FRAME_SIZE, 0),
+        m_dctTable(NB_BANDS * NB_BANDS),
+        m_analysisMem(FRAME_SIZE, 0),
+        m_cepstralMem(CEPS_MEM, vec1D32F(NB_BANDS, 0)),
+        m_memId{0},
+        m_synthesisMem(FRAME_SIZE, 0),
+        m_pitchBuf(PITCH_BUF_SIZE, 0),
+        m_lastGain{0.0},
+        m_lastPeriod{0},
+        m_memHpX{},
+        m_lastGVec(NB_BANDS, 0)
+{
+    constexpr uint32_t numFFt = 2 * FRAME_SIZE;
+    static_assert(numFFt != 0, "Num FFT can't be 0");
+
+    math::MathUtils::FftInitF32(numFFt, this->m_fftInstReal, FftType::real);
+    math::MathUtils::FftInitF32(numFFt, this->m_fftInstCmplx, FftType::complex);
+    this->InitTables();
+}
+
+void RNNoiseFeatureProcessor::PreprocessFrame(const float* audioData,
+                                              const size_t audioLen,
+                                              FrameFeatures& features)
+{
+    /* Note: audioWindow is modified in place. */
+    const arrHp aHp {-1.99599, 0.99600};
+    const arrHp bHp {-2.00000, 1.00000};
+
+    vec1D32F audioWindow{audioData, audioData + audioLen};
+
+    this->BiQuad(bHp, aHp, this->m_memHpX, audioWindow);
+    this->ComputeFrameFeatures(audioWindow, features);
+}
+
+void RNNoiseFeatureProcessor::PostProcessFrame(vec1D32F& modelOutput, FrameFeatures& features, vec1D32F& outFrame)
+{
+    std::vector<float> outputBands = modelOutput;
+    std::vector<float> gain(FREQ_SIZE, 0);
+
+    if (!features.m_silence) {
+        PitchFilter(features, outputBands);
+        for (size_t i = 0; i < NB_BANDS; i++) {
+            float alpha = .6f;
+            outputBands[i] = std::max(outputBands[i], alpha * m_lastGVec[i]);
+            m_lastGVec[i] = outputBands[i];
+        }
+        InterpBandGain(gain, outputBands);
+        for (size_t i = 0; i < FREQ_SIZE; i++) {
+            features.m_fftX[2 * i]     *= gain[i];  /* Real part. */
+            features.m_fftX[2 * i + 1] *= gain[i];  /* Imaginary part. */
+        }
+
+    }
+
+    FrameSynthesis(outFrame, features.m_fftX);
+}
+
+void RNNoiseFeatureProcessor::InitTables()
+{
+    constexpr float pi = M_PI;
+    constexpr float halfPi = M_PI / 2;
+    constexpr float halfPiOverFrameSz = halfPi/FRAME_SIZE;
+
+    for (uint32_t i = 0; i < FRAME_SIZE; i++) {
+        const float sinVal = math::MathUtils::SineF32(halfPiOverFrameSz * (i + 0.5f));
+        m_halfWindow[i] = math::MathUtils::SineF32(halfPi * sinVal * sinVal);
+    }
+
+    for (uint32_t i = 0; i < NB_BANDS; i++) {
+        for (uint32_t j = 0; j < NB_BANDS; j++) {
+            m_dctTable[i * NB_BANDS + j] = math::MathUtils::CosineF32((i + 0.5f) * j * pi / NB_BANDS);
+        }
+        m_dctTable[i * NB_BANDS] *= math::MathUtils::SqrtF32(0.5f);
+    }
+}
+
+void RNNoiseFeatureProcessor::BiQuad(
+    const arrHp& bHp,
+    const arrHp& aHp,
+    arrHp& memHpX,
+    vec1D32F& audioWindow)
+{
+    for (float& audioElement : audioWindow) {
+        const auto xi = audioElement;
+        const auto yi = audioElement + memHpX[0];
+        memHpX[0] = memHpX[1] + (bHp[0] * xi - aHp[0] * yi);
+        memHpX[1] = (bHp[1] * xi - aHp[1] * yi);
+        audioElement = yi;
+    }
+}
+
+void RNNoiseFeatureProcessor::ComputeFrameFeatures(vec1D32F& audioWindow,
+                                                   FrameFeatures& features)
+{
+    this->FrameAnalysis(audioWindow,
+                        features.m_fftX,
+                        features.m_Ex,
+                        this->m_analysisMem);
+
+    float energy = 0.0;
+
+    vec1D32F Ly(NB_BANDS, 0);
+    vec1D32F p(WINDOW_SIZE, 0);
+    vec1D32F pitchBuf(PITCH_BUF_SIZE >> 1, 0);
+
+    VERIFY(PITCH_BUF_SIZE >= this->m_pitchBuf.size());
+    std::copy_n(this->m_pitchBuf.begin() + FRAME_SIZE,
+                PITCH_BUF_SIZE - FRAME_SIZE,
+                this->m_pitchBuf.begin());
+
+    VERIFY(FRAME_SIZE <= audioWindow.size() && PITCH_BUF_SIZE > FRAME_SIZE);
+    std::copy_n(audioWindow.begin(),
+                FRAME_SIZE,
+                this->m_pitchBuf.begin() + PITCH_BUF_SIZE - FRAME_SIZE);
+
+    this->PitchDownsample(pitchBuf, PITCH_BUF_SIZE);
+
+    VERIFY(pitchBuf.size() > PITCH_MAX_PERIOD/2);
+    vec1D32F xLp(pitchBuf.size() - PITCH_MAX_PERIOD/2, 0);
+    std::copy_n(pitchBuf.begin() + PITCH_MAX_PERIOD/2, xLp.size(), xLp.begin());
+
+    int pitchIdx = this->PitchSearch(xLp, pitchBuf,
+            PITCH_FRAME_SIZE, (PITCH_MAX_PERIOD - (3*PITCH_MIN_PERIOD)));
+
+    pitchIdx = this->RemoveDoubling(
+            pitchBuf,
+            PITCH_MAX_PERIOD,
+            PITCH_MIN_PERIOD,
+            PITCH_FRAME_SIZE,
+            PITCH_MAX_PERIOD - pitchIdx);
+
+    size_t stIdx = PITCH_BUF_SIZE - WINDOW_SIZE - pitchIdx;
+    VERIFY((static_cast<int>(PITCH_BUF_SIZE) - static_cast<int>(WINDOW_SIZE) - pitchIdx) >= 0);
+    std::copy_n(this->m_pitchBuf.begin() + stIdx, WINDOW_SIZE, p.begin());
+
+    this->ApplyWindow(p);
+    this->ForwardTransform(p, features.m_fftP);
+    this->ComputeBandEnergy(features.m_fftP, features.m_Ep);
+    this->ComputeBandCorr(features.m_fftX, features.m_fftP, features.m_Exp);
+
+    for (uint32_t i = 0; i < NB_BANDS; ++i) {
+        features.m_Exp[i] /= math::MathUtils::SqrtF32(
+            0.001f + features.m_Ex[i] * features.m_Ep[i]);
+    }
+
+    vec1D32F dctVec(NB_BANDS, 0);
+    this->DCT(features.m_Exp, dctVec);
+
+    features.m_featuresVec = vec1D32F(NB_FEATURES, 0);
+    for (uint32_t i = 0; i < NB_DELTA_CEPS; ++i) {
+        features.m_featuresVec[NB_BANDS + 2*NB_DELTA_CEPS + i] = dctVec[i];
+    }
+
+    features.m_featuresVec[NB_BANDS + 2*NB_DELTA_CEPS] -= 1.3;
+    features.m_featuresVec[NB_BANDS + 2*NB_DELTA_CEPS + 1] -= 0.9;
+    features.m_featuresVec[NB_BANDS + 3*NB_DELTA_CEPS] = 0.01 * (static_cast<float>(pitchIdx) - 300);
+
+    float logMax = -2.f;
+    float follow = -2.f;
+    for (uint32_t i = 0; i < NB_BANDS; ++i) {
+        Ly[i] = log10f(1e-2f + features.m_Ex[i]);
+        Ly[i] = std::max(logMax - 7, std::max<float>(follow - 1.5, Ly[i]));
+        logMax = std::max(logMax, Ly[i]);
+        follow = std::max<float>(follow
- 1.5, Ly[i]); + energy += features.m_Ex[i]; + } + + /* If there's no audio avoid messing up the state. */ + features.m_silence = true; + if (energy < 0.04) { + return; + } else { + features.m_silence = false; + } + + this->DCT(Ly, features.m_featuresVec); + features.m_featuresVec[0] -= 12.0; + features.m_featuresVec[1] -= 4.0; + + VERIFY(CEPS_MEM > 2); + uint32_t stIdx1 = this->m_memId < 1 ? CEPS_MEM + this->m_memId - 1 : this->m_memId - 1; + uint32_t stIdx2 = this->m_memId < 2 ? CEPS_MEM + this->m_memId - 2 : this->m_memId - 2; + VERIFY(stIdx1 < this->m_cepstralMem.size()); + VERIFY(stIdx2 < this->m_cepstralMem.size()); + auto ceps1 = this->m_cepstralMem[stIdx1]; + auto ceps2 = this->m_cepstralMem[stIdx2]; + + /* Ceps 0 */ + for (uint32_t i = 0; i < NB_BANDS; ++i) { + this->m_cepstralMem[this->m_memId][i] = features.m_featuresVec[i]; + } + + for (uint32_t i = 0; i < NB_DELTA_CEPS; ++i) { + features.m_featuresVec[i] = this->m_cepstralMem[this->m_memId][i] + ceps1[i] + ceps2[i]; + features.m_featuresVec[NB_BANDS + i] = this->m_cepstralMem[this->m_memId][i] - ceps2[i]; + features.m_featuresVec[NB_BANDS + NB_DELTA_CEPS + i] = + this->m_cepstralMem[this->m_memId][i] - 2 * ceps1[i] + ceps2[i]; + } + + /* Spectral variability features. */ + this->m_memId += 1; + if (this->m_memId == CEPS_MEM) { + this->m_memId = 0; + } + + float specVariability = 0.f; + + VERIFY(this->m_cepstralMem.size() >= CEPS_MEM); + for (size_t i = 0; i < CEPS_MEM; ++i) { + float minDist = 1e15; + for (size_t j = 0; j < CEPS_MEM; ++j) { + float dist = 0.f; + for (size_t k = 0; k < NB_BANDS; ++k) { + VERIFY(this->m_cepstralMem[i].size() >= NB_BANDS); + auto tmp = this->m_cepstralMem[i][k] - this->m_cepstralMem[j][k]; + dist += tmp * tmp; + } + + if (j != i) { + minDist = std::min(minDist, dist); + } + } + specVariability += minDist; + } + + VERIFY(features.m_featuresVec.size() >= NB_BANDS + 3 * NB_DELTA_CEPS + 1); + features.m_featuresVec[NB_BANDS + 3 * NB_DELTA_CEPS + 1] = specVariability / CEPS_MEM - 2.1; +} + +void RNNoiseFeatureProcessor::FrameAnalysis( + const vec1D32F& audioWindow, + vec1D32F& fft, + vec1D32F& energy, + vec1D32F& analysisMem) +{ + vec1D32F x(WINDOW_SIZE, 0); + + /* Move old audio down and populate end with latest audio window. */ + VERIFY(x.size() >= FRAME_SIZE && analysisMem.size() >= FRAME_SIZE); + VERIFY(audioWindow.size() >= FRAME_SIZE); + + std::copy_n(analysisMem.begin(), FRAME_SIZE, x.begin()); + std::copy_n(audioWindow.begin(), x.size() - FRAME_SIZE, x.begin() + FRAME_SIZE); + std::copy_n(audioWindow.begin(), FRAME_SIZE, analysisMem.begin()); + + this->ApplyWindow(x); + + /* Calculate FFT. */ + ForwardTransform(x, fft); + + /* Compute band energy. */ + ComputeBandEnergy(fft, energy); +} + +void RNNoiseFeatureProcessor::ApplyWindow(vec1D32F& x) +{ + if (WINDOW_SIZE != x.size()) { + printf_err("Invalid size for vector to be windowed\n"); + return; + } + + VERIFY(this->m_halfWindow.size() >= FRAME_SIZE); + + /* Multiply input by sinusoidal function. */ + for (size_t i = 0; i < FRAME_SIZE; i++) { + x[i] *= this->m_halfWindow[i]; + x[WINDOW_SIZE - 1 - i] *= this->m_halfWindow[i]; + } +} + +void RNNoiseFeatureProcessor::ForwardTransform( + vec1D32F& x, + vec1D32F& fft) +{ + /* The input vector can be modified by the fft function. */ + fft.reserve(x.size() + 2); + fft.resize(x.size() + 2, 0); + math::MathUtils::FftF32(x, fft, this->m_fftInstReal); + + /* Normalise. 
*/ + for (auto& f : fft) { + f /= this->m_fftInstReal.m_fftLen; + } + + /* Place the last freq element correctly */ + fft[fft.size()-2] = fft[1]; + fft[1] = 0; + + /* NOTE: We don't truncate our FFT vector as it already contains only the + * first half of the FFT. The conjugates are not present. */ +} + +void RNNoiseFeatureProcessor::ComputeBandEnergy(const vec1D32F& fftX, vec1D32F& bandE) +{ + bandE = vec1D32F(NB_BANDS, 0); + + VERIFY(this->m_eband5ms.size() >= NB_BANDS); + for (uint32_t i = 0; i < NB_BANDS - 1; i++) { + const auto bandSize = (this->m_eband5ms[i + 1] - this->m_eband5ms[i]) + << FRAME_SIZE_SHIFT; + + for (uint32_t j = 0; j < bandSize; j++) { + const auto frac = static_cast<float>(j) / bandSize; + const auto idx = (this->m_eband5ms[i] << FRAME_SIZE_SHIFT) + j; + + auto tmp = fftX[2 * idx] * fftX[2 * idx]; /* Real part */ + tmp += fftX[2 * idx + 1] * fftX[2 * idx + 1]; /* Imaginary part */ + + bandE[i] += (1 - frac) * tmp; + bandE[i + 1] += frac * tmp; + } + } + bandE[0] *= 2; + bandE[NB_BANDS - 1] *= 2; +} + +void RNNoiseFeatureProcessor::ComputeBandCorr(const vec1D32F& X, const vec1D32F& P, vec1D32F& bandC) +{ + bandC = vec1D32F(NB_BANDS, 0); + VERIFY(this->m_eband5ms.size() >= NB_BANDS); + + for (uint32_t i = 0; i < NB_BANDS - 1; i++) { + const auto bandSize = (this->m_eband5ms[i + 1] - this->m_eband5ms[i]) << FRAME_SIZE_SHIFT; + + for (uint32_t j = 0; j < bandSize; j++) { + const auto frac = static_cast<float>(j) / bandSize; + const auto idx = (this->m_eband5ms[i] << FRAME_SIZE_SHIFT) + j; + + auto tmp = X[2 * idx] * P[2 * idx]; /* Real part */ + tmp += X[2 * idx + 1] * P[2 * idx + 1]; /* Imaginary part */ + + bandC[i] += (1 - frac) * tmp; + bandC[i + 1] += frac * tmp; + } + } + bandC[0] *= 2; + bandC[NB_BANDS - 1] *= 2; +} + +void RNNoiseFeatureProcessor::DCT(vec1D32F& input, vec1D32F& output) +{ + VERIFY(this->m_dctTable.size() >= NB_BANDS * NB_BANDS); + for (uint32_t i = 0; i < NB_BANDS; ++i) { + float sum = 0; + + for (uint32_t j = 0, k = 0; j < NB_BANDS; ++j, k += NB_BANDS) { + sum += input[j] * this->m_dctTable[k + i]; + } + output[i] = sum * math::MathUtils::SqrtF32(2.0/22); + } +} + +void RNNoiseFeatureProcessor::PitchDownsample(vec1D32F& pitchBuf, size_t pitchBufSz) { + for (size_t i = 1; i < (pitchBufSz >> 1); ++i) { + pitchBuf[i] = 0.5 * ( + 0.5 * (this->m_pitchBuf[2 * i - 1] + this->m_pitchBuf[2 * i + 1]) + + this->m_pitchBuf[2 * i]); + } + + pitchBuf[0] = 0.5*(0.5*(this->m_pitchBuf[1]) + this->m_pitchBuf[0]); + + vec1D32F ac(5, 0); + size_t numLags = 4; + + this->AutoCorr(pitchBuf, ac, numLags, pitchBufSz >> 1); + + /* Noise floor -40db */ + ac[0] *= 1.0001; + + /* Lag windowing. */ + for (size_t i = 1; i < numLags + 1; ++i) { + ac[i] -= ac[i] * (0.008 * i) * (0.008 * i); + } + + vec1D32F lpc(numLags, 0); + this->LPC(ac, numLags, lpc); + + float tmp = 1.0; + for (size_t i = 0; i < numLags; ++i) { + tmp = 0.9f * tmp; + lpc[i] = lpc[i] * tmp; + } + + vec1D32F lpc2(numLags + 1, 0); + float c1 = 0.8; + + /* Add a zero. */ + lpc2[0] = lpc[0] + 0.8; + lpc2[1] = lpc[1] + (c1 * lpc[0]); + lpc2[2] = lpc[2] + (c1 * lpc[1]); + lpc2[3] = lpc[3] + (c1 * lpc[2]); + lpc2[4] = (c1 * lpc[3]); + + this->Fir5(lpc2, pitchBufSz >> 1, pitchBuf); +} + +int RNNoiseFeatureProcessor::PitchSearch(vec1D32F& xLp, vec1D32F& y, uint32_t len, uint32_t maxPitch) { + uint32_t lag = len + maxPitch; + vec1D32F xLp4(len >> 2, 0); + vec1D32F yLp4(lag >> 2, 0); + vec1D32F xCorr(maxPitch >> 1, 0); + + /* Downsample by 2 again. 
*/ + for (size_t j = 0; j < (len >> 2); ++j) { + xLp4[j] = xLp[2*j]; + } + for (size_t j = 0; j < (lag >> 2); ++j) { + yLp4[j] = y[2*j]; + } + + this->PitchXCorr(xLp4, yLp4, xCorr, len >> 2, maxPitch >> 2); + + /* Coarse search with 4x decimation. */ + arrHp bestPitch = this->FindBestPitch(xCorr, yLp4, len >> 2, maxPitch >> 2); + + /* Finer search with 2x decimation. */ + const int maxIdx = (maxPitch >> 1); + for (int i = 0; i < maxIdx; ++i) { + xCorr[i] = 0; + if (std::abs(i - 2*bestPitch[0]) > 2 and std::abs(i - 2*bestPitch[1]) > 2) { + continue; + } + float sum = 0; + for (size_t j = 0; j < len >> 1; ++j) { + sum += xLp[j] * y[i+j]; + } + + xCorr[i] = std::max(-1.0f, sum); + } + + bestPitch = this->FindBestPitch(xCorr, y, len >> 1, maxPitch >> 1); + + int offset; + /* Refine by pseudo-interpolation. */ + if ( 0 < bestPitch[0] && bestPitch[0] < ((maxPitch >> 1) - 1)) { + float a = xCorr[bestPitch[0] - 1]; + float b = xCorr[bestPitch[0]]; + float c = xCorr[bestPitch[0] + 1]; + + if ( (c-a) > 0.7*(b-a) ) { + offset = 1; + } else if ( (a-c) > 0.7*(b-c) ) { + offset = -1; + } else { + offset = 0; + } + } else { + offset = 0; + } + + return 2*bestPitch[0] - offset; +} + +arrHp RNNoiseFeatureProcessor::FindBestPitch(vec1D32F& xCorr, vec1D32F& y, uint32_t len, uint32_t maxPitch) +{ + float Syy = 1; + arrHp bestNum {-1, -1}; + arrHp bestDen {0, 0}; + arrHp bestPitch {0, 1}; + + for (size_t j = 0; j < len; ++j) { + Syy += (y[j] * y[j]); + } + + for (size_t i = 0; i < maxPitch; ++i ) { + if (xCorr[i] > 0) { + float xCorr16 = xCorr[i] * 1e-12f; /* Avoid problems when squaring. */ + + float num = xCorr16 * xCorr16; + if (num*bestDen[1] > bestNum[1]*Syy) { + if (num*bestDen[0] > bestNum[0]*Syy) { + bestNum[1] = bestNum[0]; + bestDen[1] = bestDen[0]; + bestPitch[1] = bestPitch[0]; + bestNum[0] = num; + bestDen[0] = Syy; + bestPitch[0] = i; + } else { + bestNum[1] = num; + bestDen[1] = Syy; + bestPitch[1] = i; + } + } + } + + Syy += (y[i+len]*y[i+len]) - (y[i]*y[i]); + Syy = std::max(1.0f, Syy); + } + + return bestPitch; +} + +int RNNoiseFeatureProcessor::RemoveDoubling( + vec1D32F& pitchBuf, + uint32_t maxPeriod, + uint32_t minPeriod, + uint32_t frameSize, + size_t pitchIdx0_) +{ + constexpr std::array<uint32_t, 16> secondCheck {0, 0, 3, 2, 3, 2, 5, 2, 3, 2, 3, 2, 5, 2, 3, 2}; + uint32_t minPeriod0 = minPeriod; + float lastPeriod = static_cast<float>(this->m_lastPeriod)/2; + float lastGain = static_cast<float>(this->m_lastGain); + + maxPeriod /= 2; + minPeriod /= 2; + pitchIdx0_ /= 2; + frameSize /= 2; + uint32_t xStart = maxPeriod; + + if (pitchIdx0_ >= maxPeriod) { + pitchIdx0_ = maxPeriod - 1; + } + + size_t pitchIdx = pitchIdx0_; + const size_t pitchIdx0 = pitchIdx0_; + + float xx = 0; + for ( size_t i = xStart; i < xStart+frameSize; ++i) { + xx += (pitchBuf[i] * pitchBuf[i]); + } + + float xy = 0; + for ( size_t i = xStart; i < xStart+frameSize; ++i) { + xy += (pitchBuf[i] * pitchBuf[i-pitchIdx0]); + } + + vec1D32F yyLookup(maxPeriod+1, 0); + yyLookup[0] = xx; + float yy = xx; + + for ( size_t i = 1; i < yyLookup.size(); ++i) { + yy = yy + (pitchBuf[xStart-i] * pitchBuf[xStart-i]) - + (pitchBuf[xStart+frameSize-i] * pitchBuf[xStart+frameSize-i]); + yyLookup[i] = std::max(0.0f, yy); + } + + yy = yyLookup[pitchIdx0]; + float bestXy = xy; + float bestYy = yy; + + float g = this->ComputePitchGain(xy, xx, yy); + float g0 = g; + + /* Look for any pitch at pitchIndex/k. 
*/ + for ( size_t k = 2; k < 16; ++k) { + size_t pitchIdx1 = (2*pitchIdx0+k) / (2*k); + if (pitchIdx1 < minPeriod) { + break; + } + + size_t pitchIdx1b; + /* Look for another strong correlation at T1b. */ + if (k == 2) { + if ((pitchIdx1 + pitchIdx0) > maxPeriod) { + pitchIdx1b = pitchIdx0; + } else { + pitchIdx1b = pitchIdx0 + pitchIdx1; + } + } else { + pitchIdx1b = (2*(secondCheck[k])*pitchIdx0 + k) / (2*k); + } + + xy = 0; + for ( size_t i = xStart; i < xStart+frameSize; ++i) { + xy += (pitchBuf[i] * pitchBuf[i-pitchIdx1]); + } + + float xy2 = 0; + for ( size_t i = xStart; i < xStart+frameSize; ++i) { + xy2 += (pitchBuf[i] * pitchBuf[i-pitchIdx1b]); + } + xy = 0.5f * (xy + xy2); + VERIFY(pitchIdx1b < maxPeriod+1); + yy = 0.5f * (yyLookup[pitchIdx1] + yyLookup[pitchIdx1b]); + + float g1 = this->ComputePitchGain(xy, xx, yy); + + float cont; + if (std::abs(pitchIdx1-lastPeriod) <= 1) { + cont = lastGain; + } else if (std::abs(pitchIdx1-lastPeriod) <= 2 and 5*k*k < pitchIdx0) { + cont = 0.5f*lastGain; + } else { + cont = 0.0f; + } + + float thresh = std::max(0.3, 0.7*g0-cont); + + /* Bias against very high pitch (very short period) to avoid false-positives + * due to short-term correlation */ + if (pitchIdx1 < 3*minPeriod) { + thresh = std::max(0.4, 0.85*g0-cont); + } else if (pitchIdx1 < 2*minPeriod) { + thresh = std::max(0.5, 0.9*g0-cont); + } + if (g1 > thresh) { + bestXy = xy; + bestYy = yy; + pitchIdx = pitchIdx1; + g = g1; + } + } + + bestXy = std::max(0.0f, bestXy); + float pg; + if (bestYy <= bestXy) { + pg = 1.0; + } else { + pg = bestXy/(bestYy+1); + } + + std::array<float, 3> xCorr {0}; + for ( size_t k = 0; k < 3; ++k ) { + for ( size_t i = xStart; i < xStart+frameSize; ++i) { + xCorr[k] += (pitchBuf[i] * pitchBuf[i-(pitchIdx+k-1)]); + } + } + + size_t offset; + if ((xCorr[2]-xCorr[0]) > 0.7*(xCorr[1]-xCorr[0])) { + offset = 1; + } else if ((xCorr[0]-xCorr[2]) > 0.7*(xCorr[1]-xCorr[2])) { + offset = -1; + } else { + offset = 0; + } + + if (pg > g) { + pg = g; + } + + pitchIdx0_ = 2*pitchIdx + offset; + + if (pitchIdx0_ < minPeriod0) { + pitchIdx0_ = minPeriod0; + } + + this->m_lastPeriod = pitchIdx0_; + this->m_lastGain = pg; + + return this->m_lastPeriod; +} + +float RNNoiseFeatureProcessor::ComputePitchGain(float xy, float xx, float yy) +{ + return xy / math::MathUtils::SqrtF32(1+xx*yy); +} + +void RNNoiseFeatureProcessor::AutoCorr( + const vec1D32F& x, + vec1D32F& ac, + size_t lag, + size_t n) +{ + if (n < lag) { + printf_err("Invalid parameters for AutoCorr\n"); + return; + } + + auto fastN = n - lag; + + /* Auto-correlation - can be done by PlatformMath functions */ + this->PitchXCorr(x, x, ac, fastN, lag + 1); + + /* Modify auto-correlation by summing with auto-correlation for different lags. 
*/ + for (size_t k = 0; k < lag + 1; k++) { + float d = 0; + for (size_t i = k + fastN; i < n; i++) { + d += x[i] * x[i - k]; + } + ac[k] += d; + } +} + + +void RNNoiseFeatureProcessor::PitchXCorr( + const vec1D32F& x, + const vec1D32F& y, + vec1D32F& xCorr, + size_t len, + size_t maxPitch) +{ + for (size_t i = 0; i < maxPitch; i++) { + float sum = 0; + for (size_t j = 0; j < len; j++) { + sum += x[j] * y[i + j]; + } + xCorr[i] = sum; + } +} + +/* Linear predictor coefficients */ +void RNNoiseFeatureProcessor::LPC( + const vec1D32F& correlation, + int32_t p, + vec1D32F& lpc) +{ + auto error = correlation[0]; + + if (error != 0) { + for (int i = 0; i < p; i++) { + + /* Sum up this iteration's reflection coefficient */ + float rr = 0; + for (int j = 0; j < i; j++) { + rr += lpc[j] * correlation[i - j]; + } + + rr += correlation[i + 1]; + auto r = -rr / error; + + /* Update LP coefficients and total error */ + lpc[i] = r; + for (int j = 0; j < ((i + 1) >> 1); j++) { + auto tmp1 = lpc[j]; + auto tmp2 = lpc[i - 1 - j]; + lpc[j] = tmp1 + (r * tmp2); + lpc[i - 1 - j] = tmp2 + (r * tmp1); + } + + error = error - (r * r * error); + + /* Bail out once we get 30dB gain */ + if (error < (0.001 * correlation[0])) { + break; + } + } + } +} + +void RNNoiseFeatureProcessor::Fir5( + const vec1D32F &num, + uint32_t N, + vec1D32F &x) +{ + auto num0 = num[0]; + auto num1 = num[1]; + auto num2 = num[2]; + auto num3 = num[3]; + auto num4 = num[4]; + float mem0 = 0; + float mem1 = 0; + float mem2 = 0; + float mem3 = 0; + float mem4 = 0; + for (uint32_t i = 0; i < N; i++) + { + auto sum_ = x[i] + (num0 * mem0) + (num1 * mem1) + + (num2 * mem2) + (num3 * mem3) + (num4 * mem4); + mem4 = mem3; + mem3 = mem2; + mem2 = mem1; + mem1 = mem0; + mem0 = x[i]; + x[i] = sum_; + } +} + +void RNNoiseFeatureProcessor::PitchFilter(FrameFeatures &features, vec1D32F &gain) { + std::vector<float> r(NB_BANDS, 0); + std::vector<float> rf(FREQ_SIZE, 0); + std::vector<float> newE(NB_BANDS); + + for (size_t i = 0; i < NB_BANDS; i++) { + if (features.m_Exp[i] > gain[i]) { + r[i] = 1; + } else { + r[i] = std::pow(features.m_Exp[i], 2) * (1 - std::pow(gain[i], 2)) / + (.001 + std::pow(gain[i], 2) * (1 - std::pow(features.m_Exp[i], 2))); + } + + r[i] = math::MathUtils::SqrtF32(std::min(1.0f, std::max(0.0f, r[i]))); + r[i] *= math::MathUtils::SqrtF32(features.m_Ex[i] / (1e-8f + features.m_Ep[i])); + } + + InterpBandGain(rf, r); + for (size_t i = 0; i < FREQ_SIZE - 1; i++) { + features.m_fftX[2 * i] += rf[i] * features.m_fftP[2 * i]; /* Real. */ + features.m_fftX[2 * i + 1] += rf[i] * features.m_fftP[2 * i + 1]; /* Imaginary. */ + } + ComputeBandEnergy(features.m_fftX, newE); + std::vector<float> norm(NB_BANDS); + std::vector<float> normf(FRAME_SIZE, 0); + for (size_t i = 0; i < NB_BANDS; i++) { + norm[i] = math::MathUtils::SqrtF32(features.m_Ex[i] / (1e-8f + newE[i])); + } + + InterpBandGain(normf, norm); + for (size_t i = 0; i < FREQ_SIZE - 1; i++) { + features.m_fftX[2 * i] *= normf[i]; /* Real. */ + features.m_fftX[2 * i + 1] *= normf[i]; /* Imaginary. 
*/ + } +} + +void RNNoiseFeatureProcessor::FrameSynthesis(vec1D32F& outFrame, vec1D32F& fftY) { + std::vector<float> x(WINDOW_SIZE, 0); + InverseTransform(x, fftY); + ApplyWindow(x); + for (size_t i = 0; i < FRAME_SIZE; i++) { + outFrame[i] = x[i] + m_synthesisMem[i]; + } + memcpy((m_synthesisMem.data()), &x[FRAME_SIZE], FRAME_SIZE*sizeof(float)); +} + +void RNNoiseFeatureProcessor::InterpBandGain(vec1D32F& g, vec1D32F& bandE) { + for (size_t i = 0; i < NB_BANDS - 1; i++) { + int bandSize = (m_eband5ms[i + 1] - m_eband5ms[i]) << FRAME_SIZE_SHIFT; + for (int j = 0; j < bandSize; j++) { + float frac = static_cast<float>(j) / bandSize; + g[(m_eband5ms[i] << FRAME_SIZE_SHIFT) + j] = (1 - frac) * bandE[i] + frac * bandE[i + 1]; + } + } +} + +void RNNoiseFeatureProcessor::InverseTransform(vec1D32F& out, vec1D32F& fftXIn) { + + std::vector<float> x(WINDOW_SIZE * 2); /* This is complex. */ + vec1D32F newFFT; /* This is complex. */ + + size_t i; + for (i = 0; i < FREQ_SIZE * 2; i++) { + x[i] = fftXIn[i]; + } + for (i = FREQ_SIZE; i < WINDOW_SIZE; i++) { + x[2 * i] = x[2 * (WINDOW_SIZE - i)]; /* Real. */ + x[2 * i + 1] = -x[2 * (WINDOW_SIZE - i) + 1]; /* Imaginary. */ + } + + constexpr uint32_t numFFt = 2 * FRAME_SIZE; + static_assert(numFFt != 0, "numFFt cannot be 0!"); + + vec1D32F fftOut = vec1D32F(x.size(), 0); + math::MathUtils::FftF32(x, fftOut, m_fftInstCmplx); + + /* Normalize. */ + for (auto &f: fftOut) { + f /= numFFt; + } + + out[0] = WINDOW_SIZE * fftOut[0]; /* Real. */ + for (i = 1; i < WINDOW_SIZE; i++) { + out[i] = WINDOW_SIZE * fftOut[(WINDOW_SIZE * 2) - (2 * i)]; /* Real. */ + } +} + + +} /* namespace rnn */ +} /* namespace app */ +} /* namespace arm */ diff --git a/source/application/api/use_case/noise_reduction/src/RNNoiseModel.cc b/source/application/api/use_case/noise_reduction/src/RNNoiseModel.cc new file mode 100644 index 0000000..457cda9 --- /dev/null +++ b/source/application/api/use_case/noise_reduction/src/RNNoiseModel.cc @@ -0,0 +1,96 @@ +/* + * Copyright (c) 2021 Arm Limited. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include "RNNoiseModel.hpp" +#include "log_macros.h" + +const tflite::MicroOpResolver& arm::app::RNNoiseModel::GetOpResolver() +{ + return this->m_opResolver; +} + +bool arm::app::RNNoiseModel::EnlistOperations() +{ + this->m_opResolver.AddUnpack(); + this->m_opResolver.AddFullyConnected(); + this->m_opResolver.AddSplit(); + this->m_opResolver.AddSplitV(); + this->m_opResolver.AddAdd(); + this->m_opResolver.AddLogistic(); + this->m_opResolver.AddMul(); + this->m_opResolver.AddSub(); + this->m_opResolver.AddTanh(); + this->m_opResolver.AddPack(); + this->m_opResolver.AddReshape(); + this->m_opResolver.AddQuantize(); + this->m_opResolver.AddConcatenation(); + this->m_opResolver.AddRelu(); + + if (kTfLiteOk == this->m_opResolver.AddEthosU()) { + info("Added %s support to op resolver\n", + tflite::GetString_ETHOSU()); + } else { + printf_err("Failed to add Arm NPU support to op resolver."); + return false; + } + return true; +} + +bool arm::app::RNNoiseModel::RunInference() +{ + return Model::RunInference(); +} + +void arm::app::RNNoiseModel::ResetGruState() +{ + for (auto& stateMapping: this->m_gruStateMap) { + TfLiteTensor* inputGruStateTensor = this->GetInputTensor(stateMapping.second); + auto* inputGruState = tflite::GetTensorData(inputGruStateTensor); + /* Initial value of states is 0, but this is affected by quantization zero point. */ + auto quantParams = arm::app::GetTensorQuantParams(inputGruStateTensor); + memset(inputGruState, quantParams.offset, inputGruStateTensor->bytes); + } +} + +bool arm::app::RNNoiseModel::CopyGruStates() +{ + std::vector>> tempOutGruStates; + /* Saving output states before copying them to input states to avoid output states modification in the tensor. + * tflu shares input and output tensors memory, thus writing to input tensor can change output tensor values. */ + for (auto& stateMapping: this->m_gruStateMap) { + TfLiteTensor* outputGruStateTensor = this->GetOutputTensor(stateMapping.first); + std::vector tempOutGruState(outputGruStateTensor->bytes); + auto* outGruState = tflite::GetTensorData(outputGruStateTensor); + memcpy(tempOutGruState.data(), outGruState, outputGruStateTensor->bytes); + /* Index of the input tensor and the data to copy. */ + tempOutGruStates.emplace_back(stateMapping.second, std::move(tempOutGruState)); + } + /* Updating input GRU states with saved GRU output states. */ + for (auto& stateMapping: tempOutGruStates) { + auto outputGruStateTensorData = stateMapping.second; + TfLiteTensor* inputGruStateTensor = this->GetInputTensor(stateMapping.first); + if (outputGruStateTensorData.size() != inputGruStateTensor->bytes) { + printf_err("Unexpected number of bytes for GRU state mapping. Input = %zuz, output = %zuz.\n", + inputGruStateTensor->bytes, + outputGruStateTensorData.size()); + return false; + } + auto* inputGruState = tflite::GetTensorData(inputGruStateTensor); + auto* outGruState = outputGruStateTensorData.data(); + memcpy(inputGruState, outGruState, inputGruStateTensor->bytes); + } + return true; +} diff --git a/source/application/api/use_case/noise_reduction/src/RNNoiseProcessing.cc b/source/application/api/use_case/noise_reduction/src/RNNoiseProcessing.cc new file mode 100644 index 0000000..f6a3ec4 --- /dev/null +++ b/source/application/api/use_case/noise_reduction/src/RNNoiseProcessing.cc @@ -0,0 +1,100 @@ +/* + * Copyright (c) 2022 Arm Limited. All rights reserved. 
+ * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "RNNoiseProcessing.hpp" +#include "log_macros.h" + +namespace arm { +namespace app { + + RNNoisePreProcess::RNNoisePreProcess(TfLiteTensor* inputTensor, + std::shared_ptr<rnn::RNNoiseFeatureProcessor> featureProcessor, std::shared_ptr<rnn::FrameFeatures> frameFeatures) + : m_inputTensor{inputTensor}, + m_featureProcessor{featureProcessor}, + m_frameFeatures{frameFeatures} + {} + + bool RNNoisePreProcess::DoPreProcess(const void* data, size_t inputSize) + { + if (data == nullptr) { + printf_err("Data pointer is null"); + return false; + } + + auto input = static_cast<const int16_t*>(data); + this->m_audioFrame = rnn::vec1D32F(input, input + inputSize); + m_featureProcessor->PreprocessFrame(this->m_audioFrame.data(), inputSize, *this->m_frameFeatures); + + QuantizeAndPopulateInput(this->m_frameFeatures->m_featuresVec, + this->m_inputTensor->params.scale, this->m_inputTensor->params.zero_point, + this->m_inputTensor); + + debug("Input tensor populated \n"); + + return true; + } + + void RNNoisePreProcess::QuantizeAndPopulateInput(rnn::vec1D32F& inputFeatures, + const float quantScale, const int quantOffset, + TfLiteTensor* inputTensor) + { + const float minVal = std::numeric_limits<int8_t>::min(); + const float maxVal = std::numeric_limits<int8_t>::max(); + + auto* inputTensorData = tflite::GetTensorData<int8_t>(inputTensor); + + for (size_t i=0; i < inputFeatures.size(); ++i) { + float quantValue = ((inputFeatures[i] / quantScale) + quantOffset); + inputTensorData[i] = static_cast<int8_t>(std::min(std::max(quantValue, minVal), maxVal)); + } + } + + RNNoisePostProcess::RNNoisePostProcess(TfLiteTensor* outputTensor, + std::vector<int16_t>& denoisedAudioFrame, + std::shared_ptr<rnn::RNNoiseFeatureProcessor> featureProcessor, + std::shared_ptr<rnn::FrameFeatures> frameFeatures) + : m_outputTensor{outputTensor}, + m_denoisedAudioFrame{denoisedAudioFrame}, + m_featureProcessor{featureProcessor}, + m_frameFeatures{frameFeatures} + { + this->m_denoisedAudioFrameFloat.reserve(denoisedAudioFrame.size()); + this->m_modelOutputFloat.resize(outputTensor->bytes); + } + + bool RNNoisePostProcess::DoPostProcess() + { + const auto* outputData = tflite::GetTensorData<int8_t>(this->m_outputTensor); + auto outputQuantParams = GetTensorQuantParams(this->m_outputTensor); + + for (size_t i = 0; i < this->m_outputTensor->bytes; ++i) { + this->m_modelOutputFloat[i] = (static_cast<float>(outputData[i]) - outputQuantParams.offset) + * outputQuantParams.scale; + } + + this->m_featureProcessor->PostProcessFrame(this->m_modelOutputFloat, + *this->m_frameFeatures, this->m_denoisedAudioFrameFloat); + + for (size_t i = 0; i < this->m_denoisedAudioFrame.size(); ++i) { + this->m_denoisedAudioFrame[i] = static_cast<int16_t>( + std::roundf(this->m_denoisedAudioFrameFloat[i])); + } + + return true; + } + +} /* namespace app */ +} /* namespace arm */ \ No newline at end of file diff --git a/source/application/api/use_case/object_detection/CMakeLists.txt b/source/application/api/use_case/object_detection/CMakeLists.txt new file mode 100644 index 0000000..797ff55 --- /dev/null +++ 
b/source/application/api/use_case/object_detection/CMakeLists.txt @@ -0,0 +1,40 @@ +#---------------------------------------------------------------------------- +# Copyright (c) 2022 Arm Limited. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +#---------------------------------------------------------------------------- +######################################################### +# OBJECT DETECTION API library # +######################################################### +cmake_minimum_required(VERSION 3.15.6) + +set(OBJECT_DETECTION_API_TARGET object_detection_api) +project(${OBJECT_DETECTION_API_TARGET} + DESCRIPTION "Object detection use case API library" + LANGUAGES C CXX) + +# Create static library +add_library(${OBJECT_DETECTION_API_TARGET} STATIC + src/DetectorPreProcessing.cc + src/DetectorPostProcessing.cc + src/YoloFastestModel.cc) + +target_include_directories(${OBJECT_DETECTION_API_TARGET} PUBLIC include) + +target_link_libraries(${OBJECT_DETECTION_API_TARGET} PUBLIC common_api) + +message(STATUS "*******************************************************") +message(STATUS "Library : " ${OBJECT_DETECTION_API_TARGET}) +message(STATUS "CMAKE_SYSTEM_PROCESSOR : " ${CMAKE_SYSTEM_PROCESSOR}) +message(STATUS "*******************************************************") diff --git a/source/application/api/use_case/object_detection/include/DetectionResult.hpp b/source/application/api/use_case/object_detection/include/DetectionResult.hpp new file mode 100644 index 0000000..aa74d90 --- /dev/null +++ b/source/application/api/use_case/object_detection/include/DetectionResult.hpp @@ -0,0 +1,61 @@ +/* + * Copyright (c) 2022 Arm Limited. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef DETECTION_RESULT_HPP +#define DETECTION_RESULT_HPP + + +namespace arm { +namespace app { +namespace object_detection { + + /** + * @brief Class representing a single detection result. 
+ */ + class DetectionResult { + public: + /** + * @brief Constructor + * @param[in] normalisedVal Result normalized value + * @param[in] x0 Top corner x starting point + * @param[in] y0 Top corner y starting point + * @param[in] w Detection result width + * @param[in] h Detection result height + **/ + DetectionResult(double normalisedVal, int x0, int y0, int w, int h) : + m_normalisedVal(normalisedVal), + m_x0(x0), + m_y0(y0), + m_w(w), + m_h(h) + { + } + + DetectionResult() = default; + ~DetectionResult() = default; + + double m_normalisedVal{0.0}; + int m_x0{0}; + int m_y0{0}; + int m_w{0}; + int m_h{0}; + }; + +} /* namespace object_detection */ +} /* namespace app */ +} /* namespace arm */ + +#endif /* DETECTION_RESULT_HPP */ diff --git a/source/application/api/use_case/object_detection/include/DetectorPostProcessing.hpp b/source/application/api/use_case/object_detection/include/DetectorPostProcessing.hpp new file mode 100644 index 0000000..30bc123 --- /dev/null +++ b/source/application/api/use_case/object_detection/include/DetectorPostProcessing.hpp @@ -0,0 +1,125 @@ +/* + * Copyright (c) 2022 Arm Limited. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef DETECTOR_POST_PROCESSING_HPP +#define DETECTOR_POST_PROCESSING_HPP + +#include "ImageUtils.hpp" +#include "DetectionResult.hpp" +#include "YoloFastestModel.hpp" +#include "BaseProcessing.hpp" + +#include <forward_list> + +namespace arm { +namespace app { + +namespace object_detection { + + struct Branch { + int resolution; + int numBox; + const float* anchor; + int8_t* modelOutput; + float scale; + int zeroPoint; + size_t size; + }; + + struct Network { + int inputWidth; + int inputHeight; + int numClasses; + std::vector<Branch> branches; + int topN; + }; + +} /* namespace object_detection */ + + /** + * @brief Post-processing class for Object Detection use case. + * Implements methods declared by BasePostProcess and anything else needed + * to populate result vector. + */ + class DetectorPostProcess : public BasePostProcess { + public: + /** + * @brief Constructor. + * @param[in] outputTensor0 Pointer to the TFLite Micro output Tensor at index 0. + * @param[in] outputTensor1 Pointer to the TFLite Micro output Tensor at index 1. + * @param[out] results Vector of detected results. + * @param[in] inputImgRows Number of rows in the input image. + * @param[in] inputImgCols Number of columns in the input image. + * @param[in] threshold Post-processing threshold. + * @param[in] nms Non-maximum Suppression threshold. + * @param[in] numClasses Number of classes. + * @param[in] topN Top N for each class. + **/ + explicit DetectorPostProcess(TfLiteTensor* outputTensor0, + TfLiteTensor* outputTensor1, + std::vector<object_detection::DetectionResult>& results, + int inputImgRows, + int inputImgCols, + float threshold = 0.5f, + float nms = 0.45f, + int numClasses = 1, + int topN = 0); + + /** + * @brief Should perform YOLO post-processing of the result of inference then + * populate Detection result data for any later use. 
+ * @return true if successful, false otherwise. + **/ + bool DoPostProcess() override; + + private: + TfLiteTensor* m_outputTensor0; /* Output tensor index 0 */ + TfLiteTensor* m_outputTensor1; /* Output tensor index 1 */ + std::vector<object_detection::DetectionResult>& m_results; /* Single inference results. */ + int m_inputImgRows; /* Number of rows for model input. */ + int m_inputImgCols; /* Number of cols for model input. */ + float m_threshold; /* Post-processing threshold. */ + float m_nms; /* NMS threshold. */ + int m_numClasses; /* Number of classes. */ + int m_topN; /* TopN. */ + object_detection::Network m_net; /* YOLO network object. */ + + /** + * @brief Insert the given Detection in the list. + * @param[in] detections List of detections. + * @param[in] det Detection to be inserted. + **/ + void InsertTopNDetections(std::forward_list<image::Detection>& detections, image::Detection& det); + + /** + * @brief Given a Network calculate the detection boxes. + * @param[in] net Network. + * @param[in] imageWidth Original image width. + * @param[in] imageHeight Original image height. + * @param[in] threshold Detections threshold. + * @param[out] detections Detection boxes. + **/ + void GetNetworkBoxes(object_detection::Network& net, + int imageWidth, + int imageHeight, + float threshold, + std::forward_list<image::Detection>& detections); + }; + +} /* namespace app */ +} /* namespace arm */ + +#endif /* DETECTOR_POST_PROCESSING_HPP */ diff --git a/source/application/api/use_case/object_detection/include/DetectorPreProcessing.hpp b/source/application/api/use_case/object_detection/include/DetectorPreProcessing.hpp new file mode 100644 index 0000000..4936048 --- /dev/null +++ b/source/application/api/use_case/object_detection/include/DetectorPreProcessing.hpp @@ -0,0 +1,60 @@ +/* + * Copyright (c) 2022 Arm Limited. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef DETECTOR_PRE_PROCESSING_HPP +#define DETECTOR_PRE_PROCESSING_HPP + +#include "BaseProcessing.hpp" +#include "Classifier.hpp" + +namespace arm { +namespace app { + + /** + * @brief Pre-processing class for Object detection use case. + * Implements methods declared by BasePreProcess and anything else needed + * to populate input tensors ready for inference. + */ + class DetectorPreProcess : public BasePreProcess { + + public: + /** + * @brief Constructor + * @param[in] inputTensor Pointer to the TFLite Micro input Tensor. + * @param[in] rgb2Gray Convert image from 3 channel RGB to 1 channel grayscale. + * @param[in] convertToInt8 Convert the image from uint8 to int8 range. + **/ + explicit DetectorPreProcess(TfLiteTensor* inputTensor, bool rgb2Gray, bool convertToInt8); + + /** + * @brief Should perform pre-processing of 'raw' input image data and load it into + * TFLite Micro input tensor ready for inference + * @param[in] input Pointer to the data that pre-processing will work on. + * @param[in] inputSize Size of the input data. + * @return true if successful, false otherwise. 
+ **/ + bool DoPreProcess(const void* input, size_t inputSize) override; + + private: + TfLiteTensor* m_inputTensor; + bool m_rgb2Gray; + bool m_convertToInt8; + }; + +} /* namespace app */ +} /* namespace arm */ + +#endif /* DETECTOR_PRE_PROCESSING_HPP */ \ No newline at end of file diff --git a/source/application/api/use_case/object_detection/include/YoloFastestModel.hpp b/source/application/api/use_case/object_detection/include/YoloFastestModel.hpp new file mode 100644 index 0000000..4c64433 --- /dev/null +++ b/source/application/api/use_case/object_detection/include/YoloFastestModel.hpp @@ -0,0 +1,56 @@ +/* + * Copyright (c) 2022 Arm Limited. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef YOLO_FASTEST_MODEL_HPP +#define YOLO_FASTEST_MODEL_HPP + +#include "Model.hpp" + +extern const int originalImageSize; +extern const int channelsImageDisplayed; +extern const float anchor1[]; +extern const float anchor2[]; + +namespace arm { +namespace app { + + class YoloFastestModel : public Model { + + public: + /* Indices for the expected model - based on input tensor shape */ + static constexpr uint32_t ms_inputRowsIdx = 1; + static constexpr uint32_t ms_inputColsIdx = 2; + static constexpr uint32_t ms_inputChannelsIdx = 3; + + protected: + /** @brief Gets the reference to op resolver interface class. */ + const tflite::MicroOpResolver& GetOpResolver() override; + + /** @brief Adds operations to the op resolver instance. */ + bool EnlistOperations() override; + + private: + /* Maximum number of individual operations that can be enlisted. */ + static constexpr int ms_maxOpCnt = 8; + + /* A mutable op resolver instance. */ + tflite::MicroMutableOpResolver<ms_maxOpCnt> m_opResolver; + }; + +} /* namespace app */ +} /* namespace arm */ + +#endif /* YOLO_FASTEST_MODEL_HPP */ diff --git a/source/application/api/use_case/object_detection/src/DetectorPostProcessing.cc b/source/application/api/use_case/object_detection/src/DetectorPostProcessing.cc new file mode 100644 index 0000000..fb1606a --- /dev/null +++ b/source/application/api/use_case/object_detection/src/DetectorPostProcessing.cc @@ -0,0 +1,240 @@ +/* + * Copyright (c) 2022 Arm Limited. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include "DetectorPostProcessing.hpp" +#include "PlatformMath.hpp" + +#include + +namespace arm { +namespace app { + + DetectorPostProcess::DetectorPostProcess( + TfLiteTensor* modelOutput0, + TfLiteTensor* modelOutput1, + std::vector& results, + int inputImgRows, + int inputImgCols, + const float threshold, + const float nms, + int numClasses, + int topN) + : m_outputTensor0{modelOutput0}, + m_outputTensor1{modelOutput1}, + m_results{results}, + m_inputImgRows{inputImgRows}, + m_inputImgCols{inputImgCols}, + m_threshold(threshold), + m_nms(nms), + m_numClasses(numClasses), + m_topN(topN) +{ + /* Init PostProcessing */ + this->m_net = + object_detection::Network { + .inputWidth = inputImgCols, + .inputHeight = inputImgRows, + .numClasses = numClasses, + .branches = { + object_detection::Branch { + .resolution = inputImgCols/32, + .numBox = 3, + .anchor = anchor1, + .modelOutput = this->m_outputTensor0->data.int8, + .scale = (static_cast( + this->m_outputTensor0->quantization.params))->scale->data[0], + .zeroPoint = (static_cast( + this->m_outputTensor0->quantization.params))->zero_point->data[0], + .size = this->m_outputTensor0->bytes + }, + object_detection::Branch { + .resolution = inputImgCols/16, + .numBox = 3, + .anchor = anchor2, + .modelOutput = this->m_outputTensor1->data.int8, + .scale = (static_cast( + this->m_outputTensor1->quantization.params))->scale->data[0], + .zeroPoint = (static_cast( + this->m_outputTensor1->quantization.params))->zero_point->data[0], + .size = this->m_outputTensor1->bytes + } + }, + .topN = m_topN + }; + /* End init */ +} + +bool DetectorPostProcess::DoPostProcess() +{ + /* Start postprocessing */ + int originalImageWidth = originalImageSize; + int originalImageHeight = originalImageSize; + + std::forward_list detections; + GetNetworkBoxes(this->m_net, originalImageWidth, originalImageHeight, m_threshold, detections); + + /* Do nms */ + CalculateNMS(detections, this->m_net.numClasses, m_nms); + + for (auto& it: detections) { + float xMin = it.bbox.x - it.bbox.w / 2.0f; + float xMax = it.bbox.x + it.bbox.w / 2.0f; + float yMin = it.bbox.y - it.bbox.h / 2.0f; + float yMax = it.bbox.y + it.bbox.h / 2.0f; + + if (xMin < 0) { + xMin = 0; + } + if (yMin < 0) { + yMin = 0; + } + if (xMax > originalImageWidth) { + xMax = originalImageWidth; + } + if (yMax > originalImageHeight) { + yMax = originalImageHeight; + } + + float boxX = xMin; + float boxY = yMin; + float boxWidth = xMax - xMin; + float boxHeight = yMax - yMin; + + for (int j = 0; j < this->m_net.numClasses; ++j) { + if (it.prob[j] > 0) { + + object_detection::DetectionResult tmpResult = {}; + tmpResult.m_normalisedVal = it.prob[j]; + tmpResult.m_x0 = boxX; + tmpResult.m_y0 = boxY; + tmpResult.m_w = boxWidth; + tmpResult.m_h = boxHeight; + + this->m_results.push_back(tmpResult); + } + } + } + return true; +} + +void DetectorPostProcess::InsertTopNDetections(std::forward_list& detections, image::Detection& det) +{ + std::forward_list::iterator it; + std::forward_list::iterator last_it; + for ( it = detections.begin(); it != detections.end(); ++it ) { + if(it->objectness > det.objectness) + break; + last_it = it; + } + if(it != detections.begin()) { + detections.emplace_after(last_it, det); + detections.pop_front(); + } +} + +void DetectorPostProcess::GetNetworkBoxes( + object_detection::Network& net, + int imageWidth, + int imageHeight, + float threshold, + std::forward_list& detections) +{ + int numClasses = net.numClasses; + int num = 0; + auto det_objectness_comparator = [](image::Detection& pa, 
image::Detection& pb) { + return pa.objectness < pb.objectness; + }; + for (size_t i = 0; i < net.branches.size(); ++i) { + int height = net.branches[i].resolution; + int width = net.branches[i].resolution; + int channel = net.branches[i].numBox*(5+numClasses); + + for (int h = 0; h < net.branches[i].resolution; h++) { + for (int w = 0; w < net.branches[i].resolution; w++) { + for (int anc = 0; anc < net.branches[i].numBox; anc++) { + + /* Objectness score */ + int bbox_obj_offset = h * width * channel + w * channel + anc * (numClasses + 5) + 4; + float objectness = math::MathUtils::SigmoidF32( + (static_cast<float>(net.branches[i].modelOutput[bbox_obj_offset]) + - net.branches[i].zeroPoint + ) * net.branches[i].scale); + + if(objectness > threshold) { + image::Detection det; + det.objectness = objectness; + /* Get bbox prediction data for each anchor, each feature point */ + int bbox_x_offset = bbox_obj_offset - 4; + int bbox_y_offset = bbox_x_offset + 1; + int bbox_w_offset = bbox_x_offset + 2; + int bbox_h_offset = bbox_x_offset + 3; + int bbox_scores_offset = bbox_x_offset + 5; + + det.bbox.x = (static_cast<float>(net.branches[i].modelOutput[bbox_x_offset]) + - net.branches[i].zeroPoint) * net.branches[i].scale; + det.bbox.y = (static_cast<float>(net.branches[i].modelOutput[bbox_y_offset]) + - net.branches[i].zeroPoint) * net.branches[i].scale; + det.bbox.w = (static_cast<float>(net.branches[i].modelOutput[bbox_w_offset]) + - net.branches[i].zeroPoint) * net.branches[i].scale; + det.bbox.h = (static_cast<float>(net.branches[i].modelOutput[bbox_h_offset]) + - net.branches[i].zeroPoint) * net.branches[i].scale; + + float bbox_x, bbox_y; + + /* Eliminate grid sensitivity trick involved in YOLOv4 */ + bbox_x = math::MathUtils::SigmoidF32(det.bbox.x); + bbox_y = math::MathUtils::SigmoidF32(det.bbox.y); + det.bbox.x = (bbox_x + w) / width; + det.bbox.y = (bbox_y + h) / height; + + det.bbox.w = std::exp(det.bbox.w) * net.branches[i].anchor[anc*2] / net.inputWidth; + det.bbox.h = std::exp(det.bbox.h) * net.branches[i].anchor[anc*2+1] / net.inputHeight; + + for (int s = 0; s < numClasses; s++) { + float sig = math::MathUtils::SigmoidF32( + (static_cast<float>(net.branches[i].modelOutput[bbox_scores_offset + s]) - + net.branches[i].zeroPoint) * net.branches[i].scale + ) * objectness; + det.prob.emplace_back((sig > threshold) ? sig : 0); + } + + /* Correct_YOLO_boxes */ + det.bbox.x *= imageWidth; + det.bbox.w *= imageWidth; + det.bbox.y *= imageHeight; + det.bbox.h *= imageHeight; + + if (num < net.topN || net.topN <= 0) { + detections.emplace_front(det); + num += 1; + } else if (num == net.topN) { + detections.sort(det_objectness_comparator); + InsertTopNDetections(detections,det); + num += 1; + } else { + InsertTopNDetections(detections,det); + } + } + } + } + } + } + if(num > net.topN) + num -= 1; +} + +} /* namespace app */ +} /* namespace arm */ diff --git a/source/application/api/use_case/object_detection/src/DetectorPreProcessing.cc b/source/application/api/use_case/object_detection/src/DetectorPreProcessing.cc new file mode 100644 index 0000000..7212046 --- /dev/null +++ b/source/application/api/use_case/object_detection/src/DetectorPreProcessing.cc @@ -0,0 +1,52 @@ +/* + * Copyright (c) 2022 Arm Limited. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "DetectorPreProcessing.hpp" +#include "ImageUtils.hpp" +#include "log_macros.h" + +namespace arm { +namespace app { + + DetectorPreProcess::DetectorPreProcess(TfLiteTensor* inputTensor, bool rgb2Gray, bool convertToInt8) + : m_inputTensor{inputTensor}, + m_rgb2Gray{rgb2Gray}, + m_convertToInt8{convertToInt8} + {} + + bool DetectorPreProcess::DoPreProcess(const void* data, size_t inputSize) { + if (data == nullptr) { + printf_err("Data pointer is null"); + return false; + } + + auto input = static_cast<const uint8_t*>(data); + + if (this->m_rgb2Gray) { + image::RgbToGrayscale(input, this->m_inputTensor->data.uint8, this->m_inputTensor->bytes); + } else { + std::memcpy(this->m_inputTensor->data.data, input, inputSize); + } + debug("Input tensor populated \n"); + + if (this->m_convertToInt8) { + image::ConvertImgToInt8(this->m_inputTensor->data.data, this->m_inputTensor->bytes); + } + + return true; + } + +} /* namespace app */ +} /* namespace arm */ \ No newline at end of file diff --git a/source/application/api/use_case/object_detection/src/YoloFastestModel.cc b/source/application/api/use_case/object_detection/src/YoloFastestModel.cc new file mode 100644 index 0000000..e293181 --- /dev/null +++ b/source/application/api/use_case/object_detection/src/YoloFastestModel.cc @@ -0,0 +1,45 @@ +/* + * Copyright (c) 2022 Arm Limited. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "YoloFastestModel.hpp" + +#include "log_macros.h" + +const tflite::MicroOpResolver& arm::app::YoloFastestModel::GetOpResolver() +{ + return this->m_opResolver; +} + +bool arm::app::YoloFastestModel::EnlistOperations() +{ + this->m_opResolver.AddDepthwiseConv2D(); + this->m_opResolver.AddConv2D(); + this->m_opResolver.AddAdd(); + this->m_opResolver.AddResizeNearestNeighbor(); + /* These are needed for UT to work, not needed on FVP. */ + this->m_opResolver.AddPad(); + this->m_opResolver.AddMaxPool2D(); + this->m_opResolver.AddConcatenation(); + + if (kTfLiteOk == this->m_opResolver.AddEthosU()) { + info("Added %s support to op resolver\n", + tflite::GetString_ETHOSU()); + } else { + printf_err("Failed to add Arm NPU support to op resolver."); + return false; + } + return true; +} diff --git a/source/application/api/use_case/vww/CMakeLists.txt b/source/application/api/use_case/vww/CMakeLists.txt new file mode 100644 index 0000000..b933d32 --- /dev/null +++ b/source/application/api/use_case/vww/CMakeLists.txt @@ -0,0 +1,39 @@ +#---------------------------------------------------------------------------- +# Copyright (c) 2022 Arm Limited. 
All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +#---------------------------------------------------------------------------- +######################################################### +# VISUAL WAKE WORD API library # +######################################################### +cmake_minimum_required(VERSION 3.15.6) + +set(VWW_API_TARGET vww_api) +project(${VWW_API_TARGET} + DESCRIPTION "Visual wake word use case API library" + LANGUAGES C CXX) + +# Create static library +add_library(${VWW_API_TARGET} STATIC + src/VisualWakeWordProcessing.cc + src/VisualWakeWordModel.cc) + +target_include_directories(${VWW_API_TARGET} PUBLIC include) + +target_link_libraries(${VWW_API_TARGET} PUBLIC common_api) + +message(STATUS "*******************************************************") +message(STATUS "Library : " ${VWW_API_TARGET}) +message(STATUS "CMAKE_SYSTEM_PROCESSOR : " ${CMAKE_SYSTEM_PROCESSOR}) +message(STATUS "*******************************************************") diff --git a/source/application/api/use_case/vww/include/VisualWakeWordModel.hpp b/source/application/api/use_case/vww/include/VisualWakeWordModel.hpp new file mode 100644 index 0000000..a34b904 --- /dev/null +++ b/source/application/api/use_case/vww/include/VisualWakeWordModel.hpp @@ -0,0 +1,50 @@ +/* + * Copyright (c) 2021 - 2022 Arm Limited. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef VISUAL_WAKE_WORD_MODEL_HPP +#define VISUAL_WAKE_WORD_MODEL_HPP + +#include "Model.hpp" + +namespace arm { +namespace app { + + class VisualWakeWordModel : public Model { + + public: + /* Indices for the expected model - based on input tensor shape */ + static constexpr uint32_t ms_inputRowsIdx = 1; + static constexpr uint32_t ms_inputColsIdx = 2; + static constexpr uint32_t ms_inputChannelsIdx = 3; + + protected: + /** @brief Gets the reference to op resolver interface class. */ + const tflite::MicroOpResolver& GetOpResolver() override; + + /** @brief Adds operations to the op resolver instance. */ + bool EnlistOperations() override; + private: + /* Maximum number of individual operations that can be enlisted. */ + static constexpr int ms_maxOpCnt = 7; + + /* A mutable op resolver instance. 
*/ + tflite::MicroMutableOpResolver<ms_maxOpCnt> m_opResolver; + }; + +} /* namespace app */ +} /* namespace arm */ + +#endif /* VISUAL_WAKE_WORD_MODEL_HPP */ diff --git a/source/application/api/use_case/vww/include/VisualWakeWordProcessing.hpp b/source/application/api/use_case/vww/include/VisualWakeWordProcessing.hpp new file mode 100644 index 0000000..f9f9d72 --- /dev/null +++ b/source/application/api/use_case/vww/include/VisualWakeWordProcessing.hpp @@ -0,0 +1,93 @@ +/* + * Copyright (c) 2022 Arm Limited. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef VWW_PROCESSING_HPP +#define VWW_PROCESSING_HPP + +#include "BaseProcessing.hpp" +#include "Model.hpp" +#include "Classifier.hpp" + +namespace arm { +namespace app { + + /** + * @brief Pre-processing class for Visual Wake Word use case. + * Implements methods declared by BasePreProcess and anything else needed + * to populate input tensors ready for inference. + */ + class VisualWakeWordPreProcess : public BasePreProcess { + + public: + /** + * @brief Constructor + * @param[in] inputTensor Pointer to the TFLite Micro input Tensor. + * @param[in] rgb2Gray Convert image from 3 channel RGB to 1 channel grayscale. + **/ + explicit VisualWakeWordPreProcess(TfLiteTensor* inputTensor, bool rgb2Gray = true); + + /** + * @brief Should perform pre-processing of 'raw' input image data and load it into + * TFLite Micro input tensors ready for inference + * @param[in] input Pointer to the data that pre-processing will work on. + * @param[in] inputSize Size of the input data. + * @return true if successful, false otherwise. + **/ + bool DoPreProcess(const void* input, size_t inputSize) override; + + private: + TfLiteTensor* m_inputTensor; + bool m_rgb2Gray; + }; + + /** + * @brief Post-processing class for Visual Wake Word use case. + * Implements methods declared by BasePostProcess and anything else needed + * to populate result vector. + */ + class VisualWakeWordPostProcess : public BasePostProcess { + + private: + TfLiteTensor* m_outputTensor; + Classifier& m_vwwClassifier; + const std::vector<std::string>& m_labels; + std::vector<ClassificationResult>& m_results; + + public: + /** + * @brief Constructor + * @param[in] outputTensor Pointer to the TFLite Micro output Tensor. + * @param[in] classifier Classifier object used to get top N results from classification. + * @param[in] labels Vector of string labels to identify each output of the model. + * @param[out] results Vector of classification results to store decoded outputs. + **/ + VisualWakeWordPostProcess(TfLiteTensor* outputTensor, Classifier& classifier, + const std::vector<std::string>& labels, + std::vector<ClassificationResult>& results); + + /** + * @brief Should perform post-processing of the result of inference then + * populate classification result data for any later use. 
+ **/ + bool DoPostProcess() override; + }; + +} /* namespace app */ +} /* namespace arm */ + +#endif /* VWW_PROCESSING_HPP */ \ No newline at end of file diff --git a/source/application/api/use_case/vww/src/VisualWakeWordModel.cc b/source/application/api/use_case/vww/src/VisualWakeWordModel.cc new file mode 100644 index 0000000..2d8a125 --- /dev/null +++ b/source/application/api/use_case/vww/src/VisualWakeWordModel.cc @@ -0,0 +1,42 @@ +/* + * Copyright (c) 2021 Arm Limited. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "VisualWakeWordModel.hpp" +#include "log_macros.h" + +const tflite::MicroOpResolver& arm::app::VisualWakeWordModel::GetOpResolver() +{ + return this->m_opResolver; +} + +bool arm::app::VisualWakeWordModel::EnlistOperations() +{ + this->m_opResolver.AddDepthwiseConv2D(); + this->m_opResolver.AddConv2D(); + this->m_opResolver.AddAveragePool2D(); + this->m_opResolver.AddReshape(); + this->m_opResolver.AddPad(); + this->m_opResolver.AddAdd(); + + if (kTfLiteOk == this->m_opResolver.AddEthosU()) { + info("Added %s support to op resolver\n", + tflite::GetString_ETHOSU()); + } else { + printf_err("Failed to add Arm NPU support to op resolver."); + return false; + } + return true; +} diff --git a/source/application/api/use_case/vww/src/VisualWakeWordProcessing.cc b/source/application/api/use_case/vww/src/VisualWakeWordProcessing.cc new file mode 100644 index 0000000..4ae8a54 --- /dev/null +++ b/source/application/api/use_case/vww/src/VisualWakeWordProcessing.cc @@ -0,0 +1,80 @@ +/* + * Copyright (c) 2022 Arm Limited. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include "VisualWakeWordProcessing.hpp" + +#include "ImageUtils.hpp" +#include "VisualWakeWordModel.hpp" +#include "log_macros.h" + +namespace arm { +namespace app { + + VisualWakeWordPreProcess::VisualWakeWordPreProcess(TfLiteTensor* inputTensor, bool rgb2Gray) + :m_inputTensor{inputTensor}, + m_rgb2Gray{rgb2Gray} + {} + + bool VisualWakeWordPreProcess::DoPreProcess(const void* data, size_t inputSize) + { + if (data == nullptr) { + printf_err("Data pointer is null"); + } + + auto input = static_cast(data); + + uint8_t* unsignedDstPtr = this->m_inputTensor->data.uint8; + + if (this->m_rgb2Gray) { + image::RgbToGrayscale(input, unsignedDstPtr, inputSize); + } else { + std::memcpy(unsignedDstPtr, input, inputSize); + } + + /* VWW model pre-processing is image conversion from uint8 to [0,1] float values, + * then quantize them with input quantization info. */ + QuantParams inQuantParams = GetTensorQuantParams(this->m_inputTensor); + + int8_t* signedDstPtr = this->m_inputTensor->data.int8; + for (size_t i = 0; i < this->m_inputTensor->bytes; i++) { + auto i_data_int8 = static_cast( + ((static_cast(unsignedDstPtr[i]) / 255.0f) / inQuantParams.scale) + inQuantParams.offset + ); + signedDstPtr[i] = std::min(INT8_MAX, std::max(i_data_int8, INT8_MIN)); + } + + debug("Input tensor populated \n"); + + return true; + } + + VisualWakeWordPostProcess::VisualWakeWordPostProcess(TfLiteTensor* outputTensor, Classifier& classifier, + const std::vector& labels, std::vector& results) + :m_outputTensor{outputTensor}, + m_vwwClassifier{classifier}, + m_labels{labels}, + m_results{results} + {} + + bool VisualWakeWordPostProcess::DoPostProcess() + { + return this->m_vwwClassifier.GetClassificationResults( + this->m_outputTensor, this->m_results, + this->m_labels, 1, true); + } + +} /* namespace app */ +} /* namespace arm */ \ No newline at end of file -- cgit v1.2.1