From 95b930c940c47e2f9a783cf17c87449cab4633c5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=89anna=20=C3=93=20Cath=C3=A1in?= Date: Wed, 7 Apr 2021 14:35:25 +0100 Subject: MLECO-1252 ASR sample application using the public ArmNN C++ API. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Change-Id: I98cd505b8772a8c8fa88308121bc94135bb45068 Signed-off-by: Éanna Ó Catháin --- samples/ObjectDetection/CMakeLists.txt | 10 +- samples/ObjectDetection/Readme.md | 5 +- .../ObjectDetection/cmake/aarch64-toolchain.cmake | 20 -- .../cmake/arm-linux-gnueabihf-toolchain.cmake | 20 -- samples/ObjectDetection/cmake/find_armnn.cmake | 35 -- samples/ObjectDetection/cmake/find_catch.cmake | 16 - samples/ObjectDetection/cmake/find_opencv.cmake | 203 ----------- samples/ObjectDetection/cmake/unit_tests.cmake | 6 +- .../include/ArmnnNetworkExecutor.hpp | 80 ----- samples/ObjectDetection/include/CmdArgsParser.hpp | 50 --- .../ObjectDetection/include/CvVideoFileWriter.hpp | 61 ---- .../ObjectDetection/include/CvVideoFrameReader.hpp | 108 ------ samples/ObjectDetection/include/CvWindowOutput.hpp | 53 --- .../include/IDetectionResultDecoder.hpp | 6 +- samples/ObjectDetection/include/IFrameOutput.hpp | 48 --- samples/ObjectDetection/include/IFrameReader.hpp | 45 --- samples/ObjectDetection/include/ImageUtils.hpp | 6 +- .../ObjectDetection/include/NetworkPipeline.hpp | 148 -------- .../include/ObjectDetectionPipeline.hpp | 148 ++++++++ .../ObjectDetection/include/SSDResultDecoder.hpp | 6 +- samples/ObjectDetection/include/Types.hpp | 50 --- .../ObjectDetection/include/YoloResultDecoder.hpp | 6 +- .../ObjectDetection/src/ArmnnNetworkExecutor.cpp | 140 -------- samples/ObjectDetection/src/CmdArgsParser.cpp | 70 ---- samples/ObjectDetection/src/CvVideoFileWriter.cpp | 38 -- samples/ObjectDetection/src/CvVideoFrameReader.cpp | 98 ----- samples/ObjectDetection/src/CvWindowOutput.cpp | 33 -- samples/ObjectDetection/src/ImageUtils.cpp | 6 +- 
samples/ObjectDetection/src/Main.cpp | 54 ++- samples/ObjectDetection/src/NetworkPipeline.cpp | 102 ------ .../src/ObjectDetectionPipeline.cpp | 102 ++++++ samples/ObjectDetection/src/SSDResultDecoder.cpp | 6 +- samples/ObjectDetection/src/YoloResultDecoder.cpp | 8 +- samples/ObjectDetection/test/FrameReaderTest.cpp | 4 +- samples/ObjectDetection/test/ImageUtilsTest.cpp | 4 +- samples/ObjectDetection/test/PipelineTest.cpp | 6 +- samples/SpeechRecognition/CMakeLists.txt | 62 ++++ samples/SpeechRecognition/Readme.md | 245 +++++++++++++ samples/SpeechRecognition/cmake/unit_tests.cmake | 34 ++ samples/SpeechRecognition/include/AudioCapture.hpp | 62 ++++ .../SpeechRecognition/include/DataStructures.hpp | 102 ++++++ samples/SpeechRecognition/include/Decoder.hpp | 63 ++++ samples/SpeechRecognition/include/MFCC.hpp | 244 +++++++++++++ samples/SpeechRecognition/include/MathUtils.hpp | 85 +++++ samples/SpeechRecognition/include/Preprocess.hpp | 175 +++++++++ .../SpeechRecognition/include/SlidingWindow.hpp | 161 +++++++++ .../include/SpeechRecognitionPipeline.hpp | 139 ++++++++ samples/SpeechRecognition/src/AudioCapture.cpp | 104 ++++++ samples/SpeechRecognition/src/Decoder.cpp | 37 ++ samples/SpeechRecognition/src/MFCC.cpp | 397 +++++++++++++++++++++ samples/SpeechRecognition/src/Main.cpp | 157 ++++++++ samples/SpeechRecognition/src/MathUtils.cpp | 112 ++++++ samples/SpeechRecognition/src/Preprocess.cpp | 192 ++++++++++ .../src/SpeechRecognitionPipeline.cpp | 26 ++ .../SpeechRecognition/test/AudioCaptureTest.cpp | 61 ++++ samples/SpeechRecognition/test/DecoderTest.cpp | 86 +++++ samples/SpeechRecognition/test/MFCCTest.cpp | 102 ++++++ samples/SpeechRecognition/test/PreprocessTest.cpp | 136 +++++++ samples/common/cmake/aarch64-toolchain.cmake | 20 ++ .../cmake/arm-linux-gnueabihf-toolchain.cmake | 20 ++ samples/common/cmake/find_armnn.cmake | 35 ++ samples/common/cmake/find_catch.cmake | 16 + samples/common/cmake/find_opencv.cmake | 203 +++++++++++ 
.../include/ArmnnUtils/ArmnnNetworkExecutor.hpp | 214 +++++++++++ .../common/include/CVUtils/CvVideoFileWriter.hpp | 61 ++++ .../common/include/CVUtils/CvVideoFrameReader.hpp | 108 ++++++ samples/common/include/CVUtils/CvWindowOutput.hpp | 53 +++ samples/common/include/CVUtils/IFrameOutput.hpp | 48 +++ samples/common/include/CVUtils/IFrameReader.hpp | 45 +++ samples/common/include/Utils/CmdArgsParser.hpp | 25 ++ samples/common/include/Utils/Types.hpp | 54 +++ samples/common/src/CVUtils/CvVideoFileWriter.cpp | 38 ++ samples/common/src/CVUtils/CvVideoFrameReader.cpp | 98 +++++ samples/common/src/CVUtils/CvWindowOutput.cpp | 33 ++ samples/common/src/Utils/CmdArgsParser.cpp | 70 ++++ 75 files changed, 4253 insertions(+), 1471 deletions(-) delete mode 100644 samples/ObjectDetection/cmake/aarch64-toolchain.cmake delete mode 100644 samples/ObjectDetection/cmake/arm-linux-gnueabihf-toolchain.cmake delete mode 100644 samples/ObjectDetection/cmake/find_armnn.cmake delete mode 100644 samples/ObjectDetection/cmake/find_catch.cmake delete mode 100644 samples/ObjectDetection/cmake/find_opencv.cmake delete mode 100644 samples/ObjectDetection/include/ArmnnNetworkExecutor.hpp delete mode 100644 samples/ObjectDetection/include/CmdArgsParser.hpp delete mode 100644 samples/ObjectDetection/include/CvVideoFileWriter.hpp delete mode 100644 samples/ObjectDetection/include/CvVideoFrameReader.hpp delete mode 100644 samples/ObjectDetection/include/CvWindowOutput.hpp delete mode 100644 samples/ObjectDetection/include/IFrameOutput.hpp delete mode 100644 samples/ObjectDetection/include/IFrameReader.hpp delete mode 100644 samples/ObjectDetection/include/NetworkPipeline.hpp create mode 100644 samples/ObjectDetection/include/ObjectDetectionPipeline.hpp delete mode 100644 samples/ObjectDetection/include/Types.hpp delete mode 100644 samples/ObjectDetection/src/ArmnnNetworkExecutor.cpp delete mode 100644 samples/ObjectDetection/src/CmdArgsParser.cpp delete mode 100644 
samples/ObjectDetection/src/CvVideoFileWriter.cpp delete mode 100644 samples/ObjectDetection/src/CvVideoFrameReader.cpp delete mode 100644 samples/ObjectDetection/src/CvWindowOutput.cpp delete mode 100644 samples/ObjectDetection/src/NetworkPipeline.cpp create mode 100644 samples/ObjectDetection/src/ObjectDetectionPipeline.cpp create mode 100644 samples/SpeechRecognition/CMakeLists.txt create mode 100644 samples/SpeechRecognition/Readme.md create mode 100644 samples/SpeechRecognition/cmake/unit_tests.cmake create mode 100644 samples/SpeechRecognition/include/AudioCapture.hpp create mode 100644 samples/SpeechRecognition/include/DataStructures.hpp create mode 100644 samples/SpeechRecognition/include/Decoder.hpp create mode 100644 samples/SpeechRecognition/include/MFCC.hpp create mode 100644 samples/SpeechRecognition/include/MathUtils.hpp create mode 100644 samples/SpeechRecognition/include/Preprocess.hpp create mode 100644 samples/SpeechRecognition/include/SlidingWindow.hpp create mode 100644 samples/SpeechRecognition/include/SpeechRecognitionPipeline.hpp create mode 100644 samples/SpeechRecognition/src/AudioCapture.cpp create mode 100644 samples/SpeechRecognition/src/Decoder.cpp create mode 100644 samples/SpeechRecognition/src/MFCC.cpp create mode 100644 samples/SpeechRecognition/src/Main.cpp create mode 100644 samples/SpeechRecognition/src/MathUtils.cpp create mode 100644 samples/SpeechRecognition/src/Preprocess.cpp create mode 100644 samples/SpeechRecognition/src/SpeechRecognitionPipeline.cpp create mode 100644 samples/SpeechRecognition/test/AudioCaptureTest.cpp create mode 100644 samples/SpeechRecognition/test/DecoderTest.cpp create mode 100644 samples/SpeechRecognition/test/MFCCTest.cpp create mode 100644 samples/SpeechRecognition/test/PreprocessTest.cpp create mode 100644 samples/common/cmake/aarch64-toolchain.cmake create mode 100644 samples/common/cmake/arm-linux-gnueabihf-toolchain.cmake create mode 100644 samples/common/cmake/find_armnn.cmake create mode 
100644 samples/common/cmake/find_catch.cmake create mode 100644 samples/common/cmake/find_opencv.cmake create mode 100644 samples/common/include/ArmnnUtils/ArmnnNetworkExecutor.hpp create mode 100644 samples/common/include/CVUtils/CvVideoFileWriter.hpp create mode 100644 samples/common/include/CVUtils/CvVideoFrameReader.hpp create mode 100644 samples/common/include/CVUtils/CvWindowOutput.hpp create mode 100644 samples/common/include/CVUtils/IFrameOutput.hpp create mode 100644 samples/common/include/CVUtils/IFrameReader.hpp create mode 100644 samples/common/include/Utils/CmdArgsParser.hpp create mode 100644 samples/common/include/Utils/Types.hpp create mode 100644 samples/common/src/CVUtils/CvVideoFileWriter.cpp create mode 100644 samples/common/src/CVUtils/CvVideoFrameReader.cpp create mode 100644 samples/common/src/CVUtils/CvWindowOutput.cpp create mode 100644 samples/common/src/Utils/CmdArgsParser.cpp diff --git a/samples/ObjectDetection/CMakeLists.txt b/samples/ObjectDetection/CMakeLists.txt index 9e85fabe86..7e587f7ad3 100644 --- a/samples/ObjectDetection/CMakeLists.txt +++ b/samples/ObjectDetection/CMakeLists.txt @@ -38,12 +38,16 @@ if (NOT DEFINED DEPENDENCIES_DIR) set(DEPENDENCIES_DIR ${CMAKE_BINARY_DIR}/dependencies) endif() -include(cmake/find_opencv.cmake) -include(cmake/find_armnn.cmake) +include(../common/cmake/find_opencv.cmake) +include(../common/cmake/find_armnn.cmake) include_directories(include) +include_directories(../common/include/ArmnnUtils) +include_directories(../common/include/Utils) +include_directories(../common/include/CVUtils) file(GLOB SOURCES "src/*.cpp") +file(GLOB COMMON_SOURCES "../common/src/**/*.cpp") list(REMOVE_ITEM SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/src/Main.cpp) file(GLOB TEST_SOURCES "test/*.cpp") file(GLOB APP_MAIN "src/Main.cpp") @@ -55,7 +59,7 @@ endif() set(APP_TARGET_NAME "${CMAKE_PROJECT_NAME}") -add_executable("${APP_TARGET_NAME}" ${SOURCES} ${APP_MAIN}) +add_executable("${APP_TARGET_NAME}" ${SOURCES} 
${COMMON_SOURCES} ${APP_MAIN}) if (NOT OPENCV_LIBS_FOUND) message("Building OpenCV libs") diff --git a/samples/ObjectDetection/Readme.md b/samples/ObjectDetection/Readme.md index bceaa4b250..408917eebc 100644 --- a/samples/ObjectDetection/Readme.md +++ b/samples/ObjectDetection/Readme.md @@ -230,7 +230,6 @@ Once the application executable is built, it can be executed with the following * --preferred-backends: Takes the preferred backends in preference order, separated by comma. For example: CpuAcc,GpuAcc,CpuRef. Accepted options: [CpuAcc, CpuRef, GpuAcc]. Defaults to CpuRef **[OPTIONAL]** -* --help: Prints all the available options to screen ### Object Detection on a supplied video file @@ -379,8 +378,8 @@ tensor names and calling `GetNetworkOutputBindingInfo()`. Generic object detection pipeline has 3 steps to perform data pre-processing, run inference and decode inference results in the post-processing step. -See [`ObjDetectionPipeline`](./include/NetworkPipeline.hpp) and implementations for [`MobileNetSSDv1`](./include/NetworkPipeline.hpp) -and [`YoloV3Tiny`](./include/NetworkPipeline.hpp) for more details. +See [`ObjDetectionPipeline`](include/ObjectDetectionPipeline.hpp) and implementations for [`MobileNetSSDv1`](include/ObjectDetectionPipeline.hpp) +and [`YoloV3Tiny`](include/ObjectDetectionPipeline.hpp) for more details. #### Pre-processing the Captured Frame Each frame captured from source is read as an `cv::Mat` in BGR format but channels are swapped to RGB in a frame reader diff --git a/samples/ObjectDetection/cmake/aarch64-toolchain.cmake b/samples/ObjectDetection/cmake/aarch64-toolchain.cmake deleted file mode 100644 index bdd02f88c0..0000000000 --- a/samples/ObjectDetection/cmake/aarch64-toolchain.cmake +++ /dev/null @@ -1,20 +0,0 @@ -# Copyright © 2020 Arm Ltd and Contributors. All rights reserved. 
-# SPDX-License-Identifier: MIT - -# specify the cross compiler -set(GNU_MACHINE "aarch64-linux-gnu") -set(CROSS_PREFIX "aarch64-linux-gnu-") - -set(CMAKE_C_COMPILER ${CROSS_PREFIX}gcc) -set(CMAKE_CXX_COMPILER ${CROSS_PREFIX}g++) -set(CMAKE_AR ${CROSS_PREFIX}ar) -set(CMAKE_STRIP ${CROSS_PREFIX}strip) -set(CMAKE_LINKER ${CROSS_PREFIX}ld) - -set(CMAKE_CROSSCOMPILING true) -set(CMAKE_SYSTEM_NAME Linux) - -set(CMAKE_SYSTEM_PROCESSOR aarch64) - -set(OPENCV_EXTRA_ARGS "-DENABLE_NEON=ON" - "-DCMAKE_TOOLCHAIN_FILE=platforms/linux/aarch64-gnu.toolchain.cmake") \ No newline at end of file diff --git a/samples/ObjectDetection/cmake/arm-linux-gnueabihf-toolchain.cmake b/samples/ObjectDetection/cmake/arm-linux-gnueabihf-toolchain.cmake deleted file mode 100644 index f66b964c35..0000000000 --- a/samples/ObjectDetection/cmake/arm-linux-gnueabihf-toolchain.cmake +++ /dev/null @@ -1,20 +0,0 @@ -# Copyright © 2020 Arm Ltd and Contributors. All rights reserved. -# SPDX-License-Identifier: MIT - -# specify the cross compiler -set(GNU_MACHINE "arm-linux-gnueabihf") -set(CROSS_PREFIX "arm-linux-gnueabihf-") - -set(CMAKE_C_COMPILER ${CROSS_PREFIX}gcc) -set(CMAKE_CXX_COMPILER ${CROSS_PREFIX}g++) -set(CMAKE_AR ${CROSS_PREFIX}ar) -set(CMAKE_STRIP ${CROSS_PREFIX}strip) -set(CMAKE_LINKER ${CROSS_PREFIX}ld) - -set(CMAKE_CROSSCOMPILING true) -set(CMAKE_SYSTEM_NAME Linux) - -set(CMAKE_SYSTEM_PROCESSOR arm) - -set(OPENCV_EXTRA_ARGS "-DENABLE_NEON=ON" - "-DCMAKE_TOOLCHAIN_FILE=platforms/linux/arm.toolchain.cmake") \ No newline at end of file diff --git a/samples/ObjectDetection/cmake/find_armnn.cmake b/samples/ObjectDetection/cmake/find_armnn.cmake deleted file mode 100644 index 289e9127f6..0000000000 --- a/samples/ObjectDetection/cmake/find_armnn.cmake +++ /dev/null @@ -1,35 +0,0 @@ -# Copyright © 2020 Arm Ltd and Contributors. All rights reserved. 
-# SPDX-License-Identifier: MIT -# Search for ArmNN built libraries in user-provided path first, then current repository, then system - -set(ARMNN_LIB_NAMES "libarmnn.so" - "libarmnnTfLiteParser.so") - -set(ARMNN_LIBS "") - -get_filename_component(PARENT_DIR ${PROJECT_SOURCE_DIR} DIRECTORY) -get_filename_component(REPO_DIR ${PARENT_DIR} DIRECTORY) - -foreach(armnn_lib ${ARMNN_LIB_NAMES}) - find_library(ARMNN_${armnn_lib} - NAMES - ${armnn_lib} - HINTS - ${ARMNN_LIB_DIR} ${REPO_DIR} - PATHS - ${ARMNN_LIB_DIR} ${REPO_DIR} - PATH_SUFFIXES - "lib" - "lib64") - if(ARMNN_${armnn_lib}) - message("Found library ${ARMNN_${armnn_lib}}") - list(APPEND ARMNN_LIBS ${ARMNN_${armnn_lib}}) - get_filename_component(LIB_DIR ${ARMNN_${armnn_lib}} DIRECTORY) - get_filename_component(LIB_PARENT_DIR ${LIB_DIR} DIRECTORY) - set(ARMNN_INCLUDE_DIR ${LIB_PARENT_DIR}/include) - endif() -endforeach() - -if(NOT ARMNN_LIBS) - message(FATAL_ERROR "Could not find ArmNN libraries ${ARMNN_LIB_NAMES}") -endif() diff --git a/samples/ObjectDetection/cmake/find_catch.cmake b/samples/ObjectDetection/cmake/find_catch.cmake deleted file mode 100644 index 584b8073bd..0000000000 --- a/samples/ObjectDetection/cmake/find_catch.cmake +++ /dev/null @@ -1,16 +0,0 @@ -# Copyright © 2020 Arm Ltd and Contributors. All rights reserved. 
-# SPDX-License-Identifier: MIT - -#Test TPIP -set(TEST_TPIP ${DEPENDENCIES_DIR}/test) -file(MAKE_DIRECTORY ${TEST_TPIP}) -set(TEST_TPIP_INCLUDE ${TEST_TPIP}/include) -file(MAKE_DIRECTORY ${TEST_TPIP_INCLUDE}) - -ExternalProject_Add(catch2-headers - URL https://github.com/catchorg/Catch2/releases/download/v2.11.1/catch.hpp - DOWNLOAD_NO_EXTRACT 1 - CONFIGURE_COMMAND "" - BUILD_COMMAND ${CMAKE_COMMAND} -E copy /catch.hpp ${TEST_TPIP_INCLUDE} - INSTALL_COMMAND "" - ) \ No newline at end of file diff --git a/samples/ObjectDetection/cmake/find_opencv.cmake b/samples/ObjectDetection/cmake/find_opencv.cmake deleted file mode 100644 index 92086e1316..0000000000 --- a/samples/ObjectDetection/cmake/find_opencv.cmake +++ /dev/null @@ -1,203 +0,0 @@ -# Copyright © 2020 Arm Ltd and Contributors. All rights reserved. -# SPDX-License-Identifier: MIT - -set(OPENCV_VERSION 4.0.0) -set(FFMPEG_VERSION 4.2.1) -set(LIBX264_VERSION stable) - -set(OPENCV_LIB OpenCV${OPENCV_VERSION}) -set(FFMPEG_LIB ffmpeg${FFMPEG_VERSION}) -set(X264_LIB x264${LIBX264_VERSION}) - -set(OPENCV_NAMES - libopencv_core.so.${OPENCV_VERSION} - libopencv_imgproc.so.${OPENCV_VERSION} - libopencv_imgcodecs.so.${OPENCV_VERSION} - libopencv_videoio.so.${OPENCV_VERSION} - libopencv_video.so.${OPENCV_VERSION} - libopencv_highgui.so.${OPENCV_VERSION}) - -set(OPENCV_LIBS) -set(FFMPEG_LIBS) - -foreach(opencv_lib ${OPENCV_NAMES}) - find_library(OPENCV_${opencv_lib} - NAMES - ${opencv_lib} - HINTS - ${OPENCV_LIB_DIR} - PATHS - ${OPENCV_LIB_DIR} - PATH_SUFFIXES - "lib" - "lib64") - if(OPENCV_${opencv_lib}) - message("Found library ${OPENCV_${opencv_lib}}") - list(APPEND OPENCV_LIBS ${OPENCV_${opencv_lib}}) - get_filename_component(OPENCV_LIB_DIR ${OPENCV_${opencv_lib}} DIRECTORY) - get_filename_component(OPENCV_ROOT_DIR ${OPENCV_LIB_DIR} DIRECTORY) - set(OPENCV_INCLUDE_DIR ${OPENCV_ROOT_DIR}/include/opencv4) - endif() -endforeach() - -if(OPENCV_LIBS) - message("OpenCV libraries found") - set(OPENCV_LIBS_FOUND TRUE) -else() 
- set(OPENCV_ROOT_DIR ${DEPENDENCIES_DIR}/opencv) - set(OPENCV_DEPENDENCIES_ARGS) - set(OPENCV_EXTRA_LINKER_ARGS) - set(OPENCV_PKGCONFIG) - - if(CMAKE_CROSSCOMPILING) - set(FFMPEG_ROOT_DIR ${DEPENDENCIES_DIR}/ffmpeg) - set(LIBX264_ROOT_DIR ${DEPENDENCIES_DIR}/x264) - - if (CMAKE_BUILD_TYPE STREQUAL Debug) - set(CONFIGURE_DEBUG --enable-debug) - set(OPENCV_DEBUG "-DBUILD_WITH_DEBUG_INFO=ON") - endif() - - - ExternalProject_Add(${X264_LIB} - URL "https://code.videolan.org/videolan/x264/-/archive/${LIBX264_VERSION}/x264-${LIBX264_VERSION}.tar.gz" - DOWNLOAD_DIR ${LIBX264_ROOT_DIR} - PREFIX ${LIBX264_ROOT_DIR} - CONFIGURE_COMMAND /configure - --host=${GNU_MACHINE} - --enable-static - --enable-shared - --cross-prefix=${CROSS_PREFIX} - --prefix=${CMAKE_BINARY_DIR} - --extra-ldflags=-static-libstdc++ - --extra-cflags=-fPIC - ${CONFIGURE_DEBUG} - INSTALL_DIR ${CMAKE_BINARY_DIR} - BUILD_COMMAND $(MAKE) - INSTALL_COMMAND $(MAKE) install - ) - - set(FFMPEG_Config - --enable-shared - --enable-cross-compile - --cross-prefix=${CROSS_PREFIX} - --arch=${CMAKE_SYSTEM_PROCESSOR} - --target-os=linux - --prefix=${CMAKE_BINARY_DIR} - --enable-gpl - --enable-nonfree - --enable-libx264 - --extra-cflags=-I${CMAKE_BINARY_DIR}/include - --extra-cflags=-fPIC - --extra-ldflags=-L${CMAKE_BINARY_DIR}/lib - --extra-libs=-ldl - --extra-libs=-static-libstdc++ - ) - - ExternalProject_Add(${FFMPEG_LIB} - URL "https://github.com/FFmpeg/FFmpeg/archive/n${FFMPEG_VERSION}.tar.gz" - URL_HASH MD5=05792c611d1e3ebdf2c7003ff4467390 - DOWNLOAD_DIR ${FFMPEG_ROOT_DIR} - PREFIX ${FFMPEG_ROOT_DIR} - CONFIGURE_COMMAND /configure ${FFMPEG_Config} ${CONFIGURE_DEBUG} - INSTALL_DIR ${CMAKE_BINARY_DIR} - BUILD_COMMAND $(MAKE) VERBOSE=1 - INSTALL_COMMAND $(MAKE) install - ) - - set(OPENCV_DEPENDENCIES_ARGS "-static-libstdc++ -Wl,-rpath,${CMAKE_BINARY_DIR}/lib") - set(OPENCV_EXTRA_LINKER_ARGS "-DOPENCV_EXTRA_EXE_LINKER_FLAGS=${OPENCV_DEPENDENCIES_ARGS}") - - set(OPENCV_PKGCONFIG 
"PKG_CONFIG_LIBDIR=${CMAKE_BINARY_DIR}/lib/pkgconfig") - - set(FFMPEG_NAMES - libavcodec.so - libavformat.so - libavutil.so - libswscale.so - ) - - foreach(ffmpeg_lib ${FFMPEG_NAMES}) - add_library(FFMPEG_${ffmpeg_lib} SHARED IMPORTED) - set_target_properties(FFMPEG_${ffmpeg_lib} PROPERTIES IMPORTED_LOCATION ${CMAKE_BINARY_DIR}/lib/${ffmpeg_lib}) - list(APPEND OPENCV_LIBS FFMPEG_${ffmpeg_lib}) - endforeach() - - add_library(X264_lib264.so SHARED IMPORTED) - set_target_properties(X264_lib264.so PROPERTIES IMPORTED_LOCATION ${CMAKE_BINARY_DIR}/lib/libx264.so) - list(APPEND OPENCV_LIBS X264_lib264.so) - endif() - - set(OPENCV_CMAKE_ARGS - -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE} - -DCMAKE_C_FLAGS=-fPIC - -DCMAKE_CXX_FLAGS=-fPIC - -DWITH_GTK=OFF - -DWITH_JPEG=ON - -DWITH_IPP=OFF - -DBUILD_opencv_java_bindings_generator=OFF - -DBUILD_opencv_ml=OFF - -DBUILD_opencv_objdetect=OFF - -DBUILD_opencv_photo=OFF - -DBUILD_opencv_python_bindings_generator=OFF - -DBUILD_opencv_stitching=OFF - -DBUILD_opencv_gapi=OFF - -DBUILD_opencv_features2d=OFF - -DBUILD_opencv_dnn=OFF - -DBUILD_opencv_flann=OFF - -DBUILD_opencv_calib3d=OFF - -DBUILD_opencv_python2=OFF - -DBUILD_opencv_python3=OFF - -DBUILD_opencv_java=OFF - -DBUILD_opencv_js=OFF - -DBUILD_opencv_ts=OFF - -DBUILD_JPEG=ON - -DBUILD_JPEG_TURBO_DISABLE=ON - -DBUILD_PNG=ON - -DBUILD_TIFF=ON - -DZLIB_FOUND=OFF - -DBUILD_ZLIB=ON - -DBUILD_PERF_TESTS=OFF - -DBUILD_TESTS=OFF - -DBUILD_DOCS=OFF - -DBUILD_opencv_apps=OFF - -DBUILD_EXAMPLES=OFF - -DWITH_V4L=ON - -DWITH_LIBV4L=OFF - -DWITH_FFMPEG=ON - -DCMAKE_INSTALL_PREFIX=${CMAKE_BINARY_DIR} - -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} - -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} - -DCMAKE_INSTALL_RPATH=\$ORIGIN:\$ORIGIN/lib:\$ORIGIN/../lib - -DCMAKE_SHARED_LINKER_FLAGS=-static-libstdc++ - ${OPENCV_DEBUG} - ) - - ExternalProject_Add(${OPENCV_LIB} - URL "https://codeload.github.com/opencv/opencv/tar.gz/${OPENCV_VERSION}" - URL_HASH MD5=f051c1ff7b327b60123d71b53801b316 - DOWNLOAD_DIR 
${OPENCV_ROOT_DIR} - PREFIX ${OPENCV_ROOT_DIR} - CONFIGURE_COMMAND ${OPENCV_PKGCONFIG} - ${CMAKE_COMMAND} ${OPENCV_CMAKE_ARGS} ${OPENCV_EXTRA_ARGS} - ${OPENCV_EXTRA_LINKER_ARGS} ${OPENCV_ROOT_DIR}/src/${OPENCV_LIB} - INSTALL_DIR ${CMAKE_BINARY_DIR} - BUILD_COMMAND $(MAKE) - INSTALL_COMMAND $(MAKE) install - ) - - if(CMAKE_CROSSCOMPILING) - ExternalProject_Add_StepDependencies(${FFMPEG_LIB} build ${X264_LIB}) - ExternalProject_Add_StepDependencies(${OPENCV_LIB} build ${FFMPEG_LIB}) - endif() - - set(OPENCV_INCLUDE_DIR ${CMAKE_BINARY_DIR}/include/opencv4) - set(OPENCV_LIB_DIR ${CMAKE_BINARY_DIR}/lib) - - foreach(opencv_lib ${OPENCV_NAMES}) - add_library(OPENCV_${opencv_lib} SHARED IMPORTED) - set_target_properties(OPENCV_${opencv_lib} PROPERTIES IMPORTED_LOCATION ${OPENCV_LIB_DIR}/${opencv_lib}) - list(APPEND OPENCV_LIBS OPENCV_${opencv_lib}) - endforeach() - -endif() \ No newline at end of file diff --git a/samples/ObjectDetection/cmake/unit_tests.cmake b/samples/ObjectDetection/cmake/unit_tests.cmake index dcfa512893..1a8c466d6b 100644 --- a/samples/ObjectDetection/cmake/unit_tests.cmake +++ b/samples/ObjectDetection/cmake/unit_tests.cmake @@ -7,7 +7,7 @@ set(TEST_TARGET_NAME "${CMAKE_PROJECT_NAME}-tests") file(GLOB TEST_SOURCES "test/*") -include(cmake/find_catch.cmake) +include(../common/cmake/find_catch.cmake) file(DOWNLOAD "https://storage.googleapis.com/download.tensorflow.org/models/tflite/coco_ssd_mobilenet_v1_1.0_quant_2018_06_29.zip" ${CMAKE_CURRENT_SOURCE_DIR}/test/resources/models.zip SHOW_PROGRESS) @@ -43,7 +43,7 @@ ExternalProject_Add(vtest INSTALL_COMMAND "" ) -add_executable("${TEST_TARGET_NAME}" ${SOURCES} ${TEST_SOURCES}) +add_executable("${TEST_TARGET_NAME}" ${SOURCES} ${TEST_SOURCES} ${COMMON_SOURCES}) add_dependencies( "${TEST_TARGET_NAME}" @@ -60,6 +60,6 @@ endif() target_include_directories("${TEST_TARGET_NAME}" PUBLIC ${TEST_TPIP_INCLUDE} ${ARMNN_INCLUDE_DIR} - ${OPENCV_INCLUDE_DIR} ${DEPENDENCIES_DIR} ${TEST_RESOURCES_DIR}) + 
${OPENCV_INCLUDE_DIR} ${DEPENDENCIES_DIR} ${TEST_RESOURCES_DIR} ${COMMON_INCLUDE_DIR}) target_link_libraries("${TEST_TARGET_NAME}" PUBLIC ${ARMNN_LIBS} ${OPENCV_LIBS} ${FFMPEG_LIBS}) \ No newline at end of file diff --git a/samples/ObjectDetection/include/ArmnnNetworkExecutor.hpp b/samples/ObjectDetection/include/ArmnnNetworkExecutor.hpp deleted file mode 100644 index c75b68bbe1..0000000000 --- a/samples/ObjectDetection/include/ArmnnNetworkExecutor.hpp +++ /dev/null @@ -1,80 +0,0 @@ -// -// Copyright © 2020 Arm Ltd and Contributors. All rights reserved. -// SPDX-License-Identifier: MIT -// - -#pragma once - -#include "Types.hpp" - -#include "armnn/ArmNN.hpp" -#include "armnnTfLiteParser/ITfLiteParser.hpp" -#include "armnnUtils/DataLayoutIndexed.hpp" -#include - -#include -#include - -namespace od -{ -/** -* @brief Used to load in a network through ArmNN and run inference on it against a given backend. -* -*/ -class ArmnnNetworkExecutor -{ -private: - armnn::IRuntimePtr m_Runtime; - armnn::NetworkId m_NetId{}; - mutable InferenceResults m_OutputBuffer; - armnn::InputTensors m_InputTensors; - armnn::OutputTensors m_OutputTensors; - std::vector m_outputBindingInfo; - - std::vector m_outputLayerNamesList; - - armnnTfLiteParser::BindingPointInfo m_inputBindingInfo; - - void PrepareTensors(const void* inputData, const size_t dataBytes); - - template - auto log_as_int(Enumeration value) - -> typename std::underlying_type::type - { - return static_cast::type>(value); - } - -public: - ArmnnNetworkExecutor() = delete; - - /** - * @brief Initializes the network with the given input data. Parsed through TfLiteParser and optimized for a - * given backend. - * - * Note that the output layers names order in m_outputLayerNamesList affects the order of the feature vectors - * in output of the Run method. 
- * - * * @param[in] modelPath - Relative path to the model file - * * @param[in] backends - The list of preferred backends to run inference on - */ - ArmnnNetworkExecutor(std::string& modelPath, - std::vector& backends); - - /** - * @brief Returns the aspect ratio of the associated model in the order of width, height. - */ - Size GetImageAspectRatio(); - - armnn::DataType GetInputDataType() const; - - /** - * @brief Runs inference on the provided input data, and stores the results in the provided InferenceResults object. - * - * @param[in] inputData - input frame data - * @param[in] dataBytes - input data size in bytes - * @param[out] results - Vector of DetectionResult objects used to store the output result. - */ - bool Run(const void* inputData, const size_t dataBytes, InferenceResults& outResults); - -}; -}// namespace od \ No newline at end of file diff --git a/samples/ObjectDetection/include/CmdArgsParser.hpp b/samples/ObjectDetection/include/CmdArgsParser.hpp deleted file mode 100644 index 6c22e6ff6d..0000000000 --- a/samples/ObjectDetection/include/CmdArgsParser.hpp +++ /dev/null @@ -1,50 +0,0 @@ -// -// Copyright © 2020 Arm Ltd and Contributors. All rights reserved. 
-// SPDX-License-Identifier: MIT -// -#pragma once -#include -#include -#include - -const std::string MODEL_NAME = "--model-name"; -const std::string VIDEO_FILE_PATH = "--video-file-path"; -const std::string MODEL_FILE_PATH = "--model-file-path"; -const std::string OUTPUT_VIDEO_FILE_PATH = "--output-video-file-path"; -const std::string LABEL_PATH = "--label-path"; -const std::string PREFERRED_BACKENDS = "--preferred-backends"; -const std::string HELP = "--help"; - -/* - * The accepted options for this Object detection executable - */ -static std::map CMD_OPTIONS = { - {VIDEO_FILE_PATH, "[REQUIRED] Path to the video file to run object detection on"}, - {MODEL_FILE_PATH, "[REQUIRED] Path to the Object Detection model to use"}, - {LABEL_PATH, "[REQUIRED] Path to the label set for the provided model file. " - "Label file is should just be an ordered list, seperated by new line."}, - {MODEL_NAME, "[REQUIRED] The name of the model being used. Accepted options: YOLO_V3_TINY, SSD_MOBILE"}, - {OUTPUT_VIDEO_FILE_PATH, "[OPTIONAL] Path to the output video file with detections added in. " - "If specified will save file to disk, else displays the output to screen"}, - {PREFERRED_BACKENDS, "[OPTIONAL] Takes the preferred backends in preference order, separated by comma." - " For example: CpuAcc,GpuAcc,CpuRef. Accepted options: [CpuAcc, CpuRef, GpuAcc]." - " Defaults to CpuAcc,CpuRef"} -}; - -/* - * Checks that a particular option was specified by the user - */ -bool CheckOptionSpecified(const std::map& options, const std::string& option); - - -/* - * Retrieves the user provided option - */ -std::string GetSpecifiedOption(const std::map& options, const std::string& option); - - -/* - * Parses all the command line options provided by the user and stores in a map. 
- */ -int ParseOptions(std::map& options, std::map& acceptedOptions, - char *argv[], int argc); \ No newline at end of file diff --git a/samples/ObjectDetection/include/CvVideoFileWriter.hpp b/samples/ObjectDetection/include/CvVideoFileWriter.hpp deleted file mode 100644 index ea1501b68e..0000000000 --- a/samples/ObjectDetection/include/CvVideoFileWriter.hpp +++ /dev/null @@ -1,61 +0,0 @@ -// -// Copyright © 2020 Arm Ltd and Contributors. All rights reserved. -// SPDX-License-Identifier: MIT -// - -#pragma once - -#include "IFrameOutput.hpp" -#include - -namespace od -{ - -class CvVideoFileWriter : public IFrameOutput { -public: - /** - * @brief Default constructor. - * - * Underlying open cv video writer object will be instantiated. - */ - CvVideoFileWriter() = default; - - ~CvVideoFileWriter() override = default; - - /** - * @brief Initialises video file writer. - * - * Opens opencv writer with given params. FFMPEG backend is used. - * - * @param outputVideo path to the video file. - * @param encoding cv::CAP_PROP_FOURCC code. - * @param fps target frame rate. - * @param width target frame width. - * @param height target frame height. - * - */ - void Init(const std::string& outputVideo, int encoding, double fps, int width, int height); - - /** - * Writes frame to the file using opencv writer. - * - * @param frame data to write. - */ - void WriteFrame(std::shared_ptr& frame) override; - - /** - * Releases opencv writer. - */ - void Close() override; - - /** - * Checks if opencv writer was successfully opened. - * @return true is underlying writer is ready to be used, false otherwise. 
- */ - bool IsReady() const override; - -private: - cv::VideoWriter m_cvWriter{}; - bool m_ready = false; -}; -}// namespace od \ No newline at end of file diff --git a/samples/ObjectDetection/include/CvVideoFrameReader.hpp b/samples/ObjectDetection/include/CvVideoFrameReader.hpp deleted file mode 100644 index 081f92620e..0000000000 --- a/samples/ObjectDetection/include/CvVideoFrameReader.hpp +++ /dev/null @@ -1,108 +0,0 @@ -// -// Copyright © 2020 Arm Ltd and Contributors. All rights reserved. -// SPDX-License-Identifier: MIT -// -#pragma once - - -#include "IFrameReader.hpp" -#include - -namespace od -{ - -class CvVideoFrameReader : - public IFrameReader -{ -public: - /** - * @brief Default constructor. - * - * Underlying open cv video capture object will be instantiated. - */ - CvVideoFrameReader() = default; - - ~CvVideoFrameReader() override = default; - - /** - *@brief Initialises reader to capture frames from video file. - * - * @param source path to the video file or image sequence. - * - * @throws std::runtime_error if init failed - */ - void Init(const std::string& source); - - std::shared_ptr ReadFrame() override; - - bool IsExhausted(const std::shared_ptr & frame) const override; - - /** - * Returns effective video frame width supported by the source/set by the user. - * Must be called after Init method. - * @return frame width - */ - int GetSourceWidth() const; - - /** - * Returns effective video frame height supported by the source/set by the user. - * Must be called after Init method. - * @return frame height - */ - int GetSourceHeight() const; - - /** - * Returns effective fps value supported by the source/set by the user. - * @return fps value - */ - double GetSourceFps() const; - - /** - * Will query OpenCV to convert images to RGB - * Copy is actually default behaviour, but the set function needs to be called - * in order to know whether OpenCV supports conversion from our source format. 
- * @return boolean, - * true: OpenCV returns RGB - * false: OpenCV returns the fourcc format from GetSourceEncoding - */ - bool ConvertToRGB(); - - /** - * Returns 4-character code of codec. - * @return codec name - */ - std::string GetSourceEncoding() const; - - /** - * Get the fourcc int from its string name. - * @return codec int - */ - int GetSourceEncodingInt() const; - - int GetFrameCount() const; - -private: - cv::VideoCapture m_capture; - - void CheckIsOpen(const std::string& source); -}; - -class CvVideoFrameReaderRgbWrapper : - public IFrameReader -{ -public: - CvVideoFrameReaderRgbWrapper() = delete; - CvVideoFrameReaderRgbWrapper(const CvVideoFrameReaderRgbWrapper& o) = delete; - CvVideoFrameReaderRgbWrapper(CvVideoFrameReaderRgbWrapper&& o) = delete; - - CvVideoFrameReaderRgbWrapper(std::unique_ptr reader); - - std::shared_ptr ReadFrame() override; - - bool IsExhausted(const std::shared_ptr& frame) const override; - -private: - std::unique_ptr m_reader; -}; - -}// namespace od \ No newline at end of file diff --git a/samples/ObjectDetection/include/CvWindowOutput.hpp b/samples/ObjectDetection/include/CvWindowOutput.hpp deleted file mode 100644 index 317327ba62..0000000000 --- a/samples/ObjectDetection/include/CvWindowOutput.hpp +++ /dev/null @@ -1,53 +0,0 @@ -// -// Copyright © 2020 Arm Ltd and Contributors. All rights reserved. -// SPDX-License-Identifier: MIT -// - -#pragma once - -#include "IFrameOutput.hpp" -#include - -namespace od -{ - -class CvWindowOutput : public IFrameOutput { -public: - - CvWindowOutput() = default; - - ~CvWindowOutput() override = default; - - /** - * @brief Creates a named window. - * - * Uses opencv to create a window with given name. - * - * @param windowName opencv window name. - * - */ - void Init(const std::string& windowName); - - /** - * Writes frame to the window. - * - * @param frame data to write. - */ - void WriteFrame(std::shared_ptr& frame) override; - - /** - * Releases all windows. 
- */ - void Close() override; - - /** - * Always true. - * @return true. - */ - bool IsReady() const override; - -private: - std::string m_windowName; - -}; -}// namespace od \ No newline at end of file diff --git a/samples/ObjectDetection/include/IDetectionResultDecoder.hpp b/samples/ObjectDetection/include/IDetectionResultDecoder.hpp index c0a29df33f..a8a3cbb23a 100644 --- a/samples/ObjectDetection/include/IDetectionResultDecoder.hpp +++ b/samples/ObjectDetection/include/IDetectionResultDecoder.hpp @@ -30,9 +30,9 @@ public: * * @return Vector of decoded detected objects. */ - virtual DetectedObjects Decode(const InferenceResults& results, - const Size& outputFrameSize, - const Size& resizedFrameSize, + virtual DetectedObjects Decode(const common::InferenceResults& results, + const common::Size& outputFrameSize, + const common::Size& resizedFrameSize, const std::vector& labels) = 0; }; diff --git a/samples/ObjectDetection/include/IFrameOutput.hpp b/samples/ObjectDetection/include/IFrameOutput.hpp deleted file mode 100644 index c8b4fe5a47..0000000000 --- a/samples/ObjectDetection/include/IFrameOutput.hpp +++ /dev/null @@ -1,48 +0,0 @@ -// -// Copyright © 2020 Arm Ltd and Contributors. All rights reserved. -// SPDX-License-Identifier: MIT -// - -#pragma once - -#include -#include - -namespace od -{ -/** - * @brief Frames output interface - * - * @tparam FrameDataT frame container data type - */ - template class IFrameOutput - { - - public: - /** - * @brief Writes frame to the selected output - * - * @param frame container - */ - virtual void WriteFrame(std::shared_ptr & frame) = 0; - - /** - * @brief Closes the frame output - */ - virtual void Close() = 0; - - /** - * @brief Checks if the frame sink is ready to write. 
- * - * @return True if frame sink is ready, False otherwise - */ - virtual bool IsReady() const = 0; - - /** - * @brief Default destructor - */ - virtual ~IFrameOutput() = default; - - }; - -}// namespace od diff --git a/samples/ObjectDetection/include/IFrameReader.hpp b/samples/ObjectDetection/include/IFrameReader.hpp deleted file mode 100644 index d371b7d2a5..0000000000 --- a/samples/ObjectDetection/include/IFrameReader.hpp +++ /dev/null @@ -1,45 +0,0 @@ -// -// Copyright © 2020 Arm Ltd and Contributors. All rights reserved. -// SPDX-License-Identifier: MIT -// - -#pragma once - -#include -#include - -namespace od -{ -/** - * @brief Frame source reader interface - * - * @tparam FrameDataT frame container data type - */ -template class IFrameReader -{ - -public: - /** - * @brief Reads the next frame from the source - * - * @return pointer to the frame container - */ - virtual std::shared_ptr ReadFrame() = 0; - - /** - * @brief Checks if the frame source has more frames to read. - * - * @param[in] frame the pointer to the last frame captured with the ReadFrame method could be used in - * implementation specific logic to check frames source state. - * @return True if frame source was exhausted, False otherwise - */ - virtual bool IsExhausted(const std::shared_ptr & frame) const = 0; - - /** - * @brief Default destructor - */ - virtual ~IFrameReader() = default; - -}; - -}// namespace od \ No newline at end of file diff --git a/samples/ObjectDetection/include/ImageUtils.hpp b/samples/ObjectDetection/include/ImageUtils.hpp index 07e2b839f9..9bae568755 100644 --- a/samples/ObjectDetection/include/ImageUtils.hpp +++ b/samples/ObjectDetection/include/ImageUtils.hpp @@ -21,7 +21,7 @@ const cv::InterpolationFlags DefaultResizeFlag = cv::INTER_NEAREST; */ void AddInferenceOutputToFrame(od::DetectedObjects& decodedResults, cv::Mat& inputFrame, - std::vector>& labels); + std::vector>& labels); /** * @brief Function to resize a frame while keeping aspect ratio. 
@@ -30,7 +30,7 @@ void AddInferenceOutputToFrame(od::DetectedObjects& decodedResults, * @param[out] dest the frame we want to resize into. * @param[in] aspectRatio aspect ratio to use when resizing. */ -void ResizeFrame(const cv::Mat& frame, cv::Mat& dest, const od::Size& aspectRatio); +void ResizeFrame(const cv::Mat& frame, cv::Mat& dest, const common::Size& aspectRatio); /** * @brief Function to pad a frame. @@ -49,7 +49,7 @@ void PadFrame(const cv::Mat& src, cv::Mat& dest, int bottom, int right); * @param cache operation requires intermediate data container. * @param destSize size of the destination frame */ -void ResizeWithPad(const cv::Mat& frame, cv::Mat& dest, cv::Mat& cache, const od::Size& destSize); +void ResizeWithPad(const cv::Mat& frame, cv::Mat& dest, cv::Mat& cache, const common::Size& destSize); /** * @brief Function to retrieve the cv::scalar color from a RGB tuple. diff --git a/samples/ObjectDetection/include/NetworkPipeline.hpp b/samples/ObjectDetection/include/NetworkPipeline.hpp deleted file mode 100644 index c3408b494e..0000000000 --- a/samples/ObjectDetection/include/NetworkPipeline.hpp +++ /dev/null @@ -1,148 +0,0 @@ -// -// Copyright © 2020 Arm Ltd and Contributors. All rights reserved. -// SPDX-License-Identifier: MIT -// - -#pragma once - -#include "ArmnnNetworkExecutor.hpp" -#include "YoloResultDecoder.hpp" -#include "SSDResultDecoder.hpp" -# include "ImageUtils.hpp" - -#include - -namespace od -{ -/** - * Generic object detection pipeline with 3 steps: data pre-processing, inference execution and inference - * result post-processing. - * - */ -class ObjDetectionPipeline { -public: - - /** - * Creates object detection pipeline with given network executor and decoder. - * @param executor - unique pointer to inference runner - * @param decoder - unique pointer to inference results decoder - */ - ObjDetectionPipeline(std::unique_ptr executor, - std::unique_ptr decoder); - - /** - * @brief Standard image pre-processing implementation. 
- * - * Re-sizes an image keeping aspect ratio, pads if necessary to fit the network input layer dimensions. - - * @param[in] frame - input image, expected data type is uint8. - * @param[out] processed - output image, data type is preserved. - */ - virtual void PreProcessing(const cv::Mat& frame, cv::Mat& processed); - - /** - * @brief Executes inference - * - * Calls inference runner provided during instance construction. - * - * @param[in] processed - input inference data. Data type should be aligned with input tensor. - * @param[out] result - raw floating point inference results. - */ - virtual void Inference(const cv::Mat& processed, InferenceResults& result); - - /** - * @brief Standard inference results post-processing implementation. - * - * Decodes inference results using decoder provided during construction. - * - * @param[in] inferenceResult - inference results to be decoded. - * @param[in] callback - a function to be called after successful inference results decoding. - */ - virtual void PostProcessing(InferenceResults& inferenceResult, - const std::function& callback); - -protected: - std::unique_ptr m_executor; - std::unique_ptr m_decoder; - Size m_inputImageSize{}; - cv::Mat m_processedFrame; -}; - -/** - * Specific to Yolo v3 tiny object detection pipeline implementation. - */ -class YoloV3Tiny: public ObjDetectionPipeline{ -public: - - /** - * Constructs object detection pipeline for Yolo v3 tiny network. - * - * Network input is expected to be uint8 or fp32. Data range [0, 255]. - * Network output is FP32. 
- * - * @param executor[in] - unique pointer to inference runner - * @param NMSThreshold[in] - non max suppression threshold for decoding step - * @param ClsThreshold[in] - class probability threshold for decoding step - * @param ObjectThreshold[in] - detected object score threshold for decoding step - */ - YoloV3Tiny(std::unique_ptr executor, - float NMSThreshold, float ClsThreshold, float ObjectThreshold); - - /** - * @brief Yolo v3 tiny image pre-processing implementation. - * - * On top of the standard pre-processing, converts input data type according to the network input tensor data type. - * Supported data types: uint8 and float32. - * - * @param[in] original - input image data - * @param[out] processed - image data ready to be used for inference. - */ - void PreProcessing(const cv::Mat& original, cv::Mat& processed); - -}; - -/** - * Specific to MobileNet SSD v1 object detection pipeline implementation. - */ -class MobileNetSSDv1: public ObjDetectionPipeline { - -public: - /** - * Constructs object detection pipeline for MobileNet SSD network. - * - * Network input is expected to be uint8 or fp32. Data range [-1, 1]. - * Network output is FP32. - * - * @param[in] - unique pointer to inference runner - * @paramp[in] objectThreshold - detected object score threshold for decoding step - */ - MobileNetSSDv1(std::unique_ptr executor, - float objectThreshold); - - /** - * @brief MobileNet SSD image pre-processing implementation. - * - * On top of the standard pre-processing, converts input data type according to the network input tensor data type - * and scales input data from [0, 255] to [-1, 1] for FP32 input. - * - * Supported input data types: uint8 and float32. - * - * @param[in] original - input image data - * @param processed[out] - image data ready to be used for inference. 
- */ - void PreProcessing(const cv::Mat& original, cv::Mat& processed); - -}; - -using IPipelinePtr = std::unique_ptr; - -/** - * Constructs object detection pipeline based on configuration provided. - * - * @param[in] config - object detection pipeline configuration. - * - * @return unique pointer to object detection pipeline. - */ -IPipelinePtr CreatePipeline(od::ODPipelineOptions& config); - -}// namespace od \ No newline at end of file diff --git a/samples/ObjectDetection/include/ObjectDetectionPipeline.hpp b/samples/ObjectDetection/include/ObjectDetectionPipeline.hpp new file mode 100644 index 0000000000..38de65b007 --- /dev/null +++ b/samples/ObjectDetection/include/ObjectDetectionPipeline.hpp @@ -0,0 +1,148 @@ +// +// Copyright © 2020 Arm Ltd and Contributors. All rights reserved. +// SPDX-License-Identifier: MIT +// + +#pragma once + +#include "ArmnnNetworkExecutor.hpp" +#include "YoloResultDecoder.hpp" +#include "SSDResultDecoder.hpp" +# include "ImageUtils.hpp" + +#include + +namespace od +{ +/** + * Generic object detection pipeline with 3 steps: data pre-processing, inference execution and inference + * result post-processing. + * + */ +class ObjDetectionPipeline { +public: + + /** + * Creates object detection pipeline with given network executor and decoder. + * @param executor - unique pointer to inference runner + * @param decoder - unique pointer to inference results decoder + */ + ObjDetectionPipeline(std::unique_ptr> executor, + std::unique_ptr decoder); + + /** + * @brief Standard image pre-processing implementation. + * + * Re-sizes an image keeping aspect ratio, pads if necessary to fit the network input layer dimensions. + + * @param[in] frame - input image, expected data type is uint8. + * @param[out] processed - output image, data type is preserved. + */ + virtual void PreProcessing(const cv::Mat& frame, cv::Mat& processed); + + /** + * @brief Executes inference + * + * Calls inference runner provided during instance construction. 
+ * + * @param[in] processed - input inference data. Data type should be aligned with input tensor. + * @param[out] result - raw floating point inference results. + */ + virtual void Inference(const cv::Mat& processed, common::InferenceResults& result); + + /** + * @brief Standard inference results post-processing implementation. + * + * Decodes inference results using decoder provided during construction. + * + * @param[in] inferenceResult - inference results to be decoded. + * @param[in] callback - a function to be called after successful inference results decoding. + */ + virtual void PostProcessing(common::InferenceResults& inferenceResult, + const std::function& callback); + +protected: + std::unique_ptr> m_executor; + std::unique_ptr m_decoder; + common::Size m_inputImageSize{}; + cv::Mat m_processedFrame; +}; + +/** + * Specific to Yolo v3 tiny object detection pipeline implementation. + */ +class YoloV3Tiny: public ObjDetectionPipeline{ +public: + + /** + * Constructs object detection pipeline for Yolo v3 tiny network. + * + * Network input is expected to be uint8 or fp32. Data range [0, 255]. + * Network output is FP32. + * + * @param executor[in] - unique pointer to inference runner + * @param NMSThreshold[in] - non max suppression threshold for decoding step + * @param ClsThreshold[in] - class probability threshold for decoding step + * @param ObjectThreshold[in] - detected object score threshold for decoding step + */ + YoloV3Tiny(std::unique_ptr> executor, + float NMSThreshold, float ClsThreshold, float ObjectThreshold); + + /** + * @brief Yolo v3 tiny image pre-processing implementation. + * + * On top of the standard pre-processing, converts input data type according to the network input tensor data type. + * Supported data types: uint8 and float32. + * + * @param[in] original - input image data + * @param[out] processed - image data ready to be used for inference. 
+ */ + void PreProcessing(const cv::Mat& original, cv::Mat& processed); + +}; + +/** + * Specific to MobileNet SSD v1 object detection pipeline implementation. + */ +class MobileNetSSDv1: public ObjDetectionPipeline { + +public: + /** + * Constructs object detection pipeline for MobileNet SSD network. + * + * Network input is expected to be uint8 or fp32. Data range [-1, 1]. + * Network output is FP32. + * + * @param[in] - unique pointer to inference runner + * @paramp[in] objectThreshold - detected object score threshold for decoding step + */ + MobileNetSSDv1(std::unique_ptr> executor, + float objectThreshold); + + /** + * @brief MobileNet SSD image pre-processing implementation. + * + * On top of the standard pre-processing, converts input data type according to the network input tensor data type + * and scales input data from [0, 255] to [-1, 1] for FP32 input. + * + * Supported input data types: uint8 and float32. + * + * @param[in] original - input image data + * @param processed[out] - image data ready to be used for inference. + */ + void PreProcessing(const cv::Mat& original, cv::Mat& processed); + +}; + +using IPipelinePtr = std::unique_ptr; + +/** + * Constructs object detection pipeline based on configuration provided. + * + * @param[in] config - object detection pipeline configuration. + * + * @return unique pointer to object detection pipeline. 
+ */ +IPipelinePtr CreatePipeline(common::PipelineOptions& config); + +}// namespace od \ No newline at end of file diff --git a/samples/ObjectDetection/include/SSDResultDecoder.hpp b/samples/ObjectDetection/include/SSDResultDecoder.hpp index 65afb8d376..4c703c18fc 100644 --- a/samples/ObjectDetection/include/SSDResultDecoder.hpp +++ b/samples/ObjectDetection/include/SSDResultDecoder.hpp @@ -21,9 +21,9 @@ public: */ SSDResultDecoder(float ObjectThreshold); - DetectedObjects Decode(const InferenceResults& results, - const Size& outputFrameSize, - const Size& resizedFrameSize, + DetectedObjects Decode(const common::InferenceResults& results, + const common::Size& outputFrameSize, + const common::Size& resizedFrameSize, const std::vector& labels) override; private: diff --git a/samples/ObjectDetection/include/Types.hpp b/samples/ObjectDetection/include/Types.hpp deleted file mode 100644 index 801cff392a..0000000000 --- a/samples/ObjectDetection/include/Types.hpp +++ /dev/null @@ -1,50 +0,0 @@ -// -// Copyright © 2020 Arm Ltd and Contributors. All rights reserved. 
-// SPDX-License-Identifier: MIT -// - -#pragma once - -#include -#include -#include -#include -#include - -namespace od -{ - -struct Size -{ - - uint32_t m_Width; - uint32_t m_Height; - - Size() : Size(0, 0) {} - - Size(uint32_t width, uint32_t height) : - m_Width{width}, m_Height{height} {} - - Size(const Size& other) - : Size(other.m_Width, other.m_Height) {} - - ~Size() = default; - - Size &operator=(const Size& other) = default; -}; - -struct BBoxColor -{ - std::tuple colorCode; -}; - -struct ODPipelineOptions -{ - std::string m_ModelName; - std::string m_ModelFilePath; - std::vector m_backends; -}; - -using InferenceResult = std::vector; -using InferenceResults = std::vector; -} \ No newline at end of file diff --git a/samples/ObjectDetection/include/YoloResultDecoder.hpp b/samples/ObjectDetection/include/YoloResultDecoder.hpp index 98435e3cc9..ae6cb5e710 100644 --- a/samples/ObjectDetection/include/YoloResultDecoder.hpp +++ b/samples/ObjectDetection/include/YoloResultDecoder.hpp @@ -26,9 +26,9 @@ public: */ YoloResultDecoder(float NMSThreshold, float ClsThreshold, float ObjectThreshold); - DetectedObjects Decode(const InferenceResults& results, - const Size& outputFrameSize, - const Size& resizedFrameSize, + DetectedObjects Decode(const common::InferenceResults& results, + const common::Size& outputFrameSize, + const common::Size& resizedFrameSize, const std::vector & labels) override; private: float m_NmsThreshold; diff --git a/samples/ObjectDetection/src/ArmnnNetworkExecutor.cpp b/samples/ObjectDetection/src/ArmnnNetworkExecutor.cpp deleted file mode 100644 index cb4c0c9f84..0000000000 --- a/samples/ObjectDetection/src/ArmnnNetworkExecutor.cpp +++ /dev/null @@ -1,140 +0,0 @@ -// -// Copyright © 2020 Arm Ltd and Contributors. All rights reserved. 
-// SPDX-License-Identifier: MIT -// - -#include "ArmnnNetworkExecutor.hpp" -#include "Types.hpp" - -#include -#include - -namespace od -{ - -armnn::DataType ArmnnNetworkExecutor::GetInputDataType() const -{ - return m_inputBindingInfo.second.GetDataType(); -} - -ArmnnNetworkExecutor::ArmnnNetworkExecutor(std::string& modelPath, - std::vector& preferredBackends) -: m_Runtime(armnn::IRuntime::Create(armnn::IRuntime::CreationOptions())) -{ - // Import the TensorFlow lite model. - armnnTfLiteParser::ITfLiteParserPtr parser = armnnTfLiteParser::ITfLiteParser::Create(); - armnn::INetworkPtr network = parser->CreateNetworkFromBinaryFile(modelPath.c_str()); - - std::vector inputNames = parser->GetSubgraphInputTensorNames(0); - - m_inputBindingInfo = parser->GetNetworkInputBindingInfo(0, inputNames[0]); - - m_outputLayerNamesList = parser->GetSubgraphOutputTensorNames(0); - - std::vector outputBindings; - for(const std::string& name : m_outputLayerNamesList) - { - m_outputBindingInfo.push_back(std::move(parser->GetNetworkOutputBindingInfo(0, name))); - } - - std::vector errorMessages; - // optimize the network. 
- armnn::IOptimizedNetworkPtr optNet = Optimize(*network, - preferredBackends, - m_Runtime->GetDeviceSpec(), - armnn::OptimizerOptions(), - armnn::Optional&>(errorMessages)); - - if (!optNet) - { - const std::string errorMessage{"ArmnnNetworkExecutor: Failed to optimize network"}; - ARMNN_LOG(error) << errorMessage; - throw armnn::Exception(errorMessage); - } - - // Load the optimized network onto the m_Runtime device - std::string errorMessage; - if (armnn::Status::Success != m_Runtime->LoadNetwork(m_NetId, std::move(optNet), errorMessage)) - { - ARMNN_LOG(error) << errorMessage; - } - - //pre-allocate memory for output (the size of it never changes) - for (int it = 0; it < m_outputLayerNamesList.size(); ++it) - { - const armnn::DataType dataType = m_outputBindingInfo[it].second.GetDataType(); - const armnn::TensorShape& tensorShape = m_outputBindingInfo[it].second.GetShape(); - - InferenceResult oneLayerOutResult; - switch (dataType) - { - case armnn::DataType::Float32: - { - oneLayerOutResult.resize(tensorShape.GetNumElements(), 0); - break; - } - default: - { - errorMessage = "ArmnnNetworkExecutor: unsupported output tensor data type"; - ARMNN_LOG(error) << errorMessage << " " << log_as_int(dataType); - throw armnn::Exception(errorMessage); - } - } - - m_OutputBuffer.emplace_back(oneLayerOutResult); - - // Make ArmNN output tensors - m_OutputTensors.reserve(m_OutputBuffer.size()); - for (size_t it = 0; it < m_OutputBuffer.size(); ++it) - { - m_OutputTensors.emplace_back(std::make_pair( - m_outputBindingInfo[it].first, - armnn::Tensor(m_outputBindingInfo[it].second, - m_OutputBuffer.at(it).data()) - )); - } - } - -} - -void ArmnnNetworkExecutor::PrepareTensors(const void* inputData, const size_t dataBytes) -{ - assert(m_inputBindingInfo.second.GetNumBytes() >= dataBytes); - m_InputTensors.clear(); - m_InputTensors = {{ m_inputBindingInfo.first, armnn::ConstTensor(m_inputBindingInfo.second, inputData)}}; -} - -bool ArmnnNetworkExecutor::Run(const void* inputData, 
const size_t dataBytes, InferenceResults& outResults) -{ - /* Prepare tensors if they are not ready */ - ARMNN_LOG(debug) << "Preparing tensors..."; - this->PrepareTensors(inputData, dataBytes); - ARMNN_LOG(trace) << "Running inference..."; - - armnn::Status ret = m_Runtime->EnqueueWorkload(m_NetId, m_InputTensors, m_OutputTensors); - - std::stringstream inferenceFinished; - inferenceFinished << "Inference finished with code {" << log_as_int(ret) << "}\n"; - - ARMNN_LOG(trace) << inferenceFinished.str(); - - if (ret == armnn::Status::Failure) - { - ARMNN_LOG(error) << "Failed to perform inference."; - } - - outResults.reserve(m_outputLayerNamesList.size()); - outResults = m_OutputBuffer; - - return (armnn::Status::Success == ret); -} - -Size ArmnnNetworkExecutor::GetImageAspectRatio() -{ - const auto shape = m_inputBindingInfo.second.GetShape(); - assert(shape.GetNumDimensions() == 4); - armnnUtils::DataLayoutIndexed nhwc(armnn::DataLayout::NHWC); - return Size(shape[nhwc.GetWidthIndex()], - shape[nhwc.GetHeightIndex()]); -} -}// namespace od \ No newline at end of file diff --git a/samples/ObjectDetection/src/CmdArgsParser.cpp b/samples/ObjectDetection/src/CmdArgsParser.cpp deleted file mode 100644 index b8c74bc10f..0000000000 --- a/samples/ObjectDetection/src/CmdArgsParser.cpp +++ /dev/null @@ -1,70 +0,0 @@ -// -// Copyright © 2020 Arm Ltd and Contributors. All rights reserved. 
-// SPDX-License-Identifier: MIT -// - -#include "CmdArgsParser.hpp" -#include -/* - * Checks that a particular option was specified by the user - */ -bool CheckOptionSpecified(const std::map& options, const std::string& option) -{ - auto it = options.find(option); - return it!=options.end(); -} - -/* - * Retrieves the user provided option - */ -std::string GetSpecifiedOption(const std::map& options, const std::string& option) -{ - if (CheckOptionSpecified(options, option)){ - return options.at(option); - } - else - { - throw std::invalid_argument("Required option: " + option + " not defined."); - } -} - -/* - * Parses all the command line options provided by the user and stores in a map. - */ -int ParseOptions(std::map& options, std::map& acceptedOptions, - char *argv[], int argc) -{ - for (int i = 1; i < argc; ++i) - { - std::string currentOption = std::string(argv[i]); - auto it = acceptedOptions.find(currentOption); - if (it != acceptedOptions.end()) - { - if (i + 1 < argc && std::string(argv[i + 1]).rfind("--", 0) != 0) - { - std::string value = argv[++i]; - options.insert({it->first, value}); - } - else if (std::string(argv[i]) == HELP) - { - std::cout << "Available options" << std::endl; - for (auto & acceptedOption : acceptedOptions) - { - std::cout << acceptedOption.first << " : " << acceptedOption.second << std::endl; - } - return 2; - } - else - { - std::cerr << std::string(argv[i]) << " option requires one argument." << std::endl; - return 1; - } - } - else - { - std::cerr << "Unrecognised option: " << std::string(argv[i]) << std::endl; - return 1; - } - } - return 0; -} diff --git a/samples/ObjectDetection/src/CvVideoFileWriter.cpp b/samples/ObjectDetection/src/CvVideoFileWriter.cpp deleted file mode 100644 index ab80b95d49..0000000000 --- a/samples/ObjectDetection/src/CvVideoFileWriter.cpp +++ /dev/null @@ -1,38 +0,0 @@ -// -// Copyright © 2020 Arm Ltd and Contributors. All rights reserved. 
-// SPDX-License-Identifier: MIT -// - -#include "CvVideoFileWriter.hpp" - -namespace od -{ - -void CvVideoFileWriter::Init(const std::string& outputVideo, int encoding, double fps, int width, int height) -{ - m_ready = m_cvWriter.open(outputVideo, cv::CAP_FFMPEG, - encoding, - fps, - cv::Size(width, height), true); -} - - -void CvVideoFileWriter::WriteFrame(std::shared_ptr& frame) -{ - if(m_cvWriter.isOpened()) - { - cv::cvtColor(*frame, *frame, cv::COLOR_RGB2BGR); - m_cvWriter.write(*frame); - } -} - -bool CvVideoFileWriter::IsReady() const -{ - return m_ready; -} - -void CvVideoFileWriter::Close() -{ - m_cvWriter.release(); -} -}// namespace od diff --git a/samples/ObjectDetection/src/CvVideoFrameReader.cpp b/samples/ObjectDetection/src/CvVideoFrameReader.cpp deleted file mode 100644 index 09b5050973..0000000000 --- a/samples/ObjectDetection/src/CvVideoFrameReader.cpp +++ /dev/null @@ -1,98 +0,0 @@ -// -// Copyright © 2020 Arm Ltd and Contributors. All rights reserved. -// SPDX-License-Identifier: MIT -// - - -#include "CvVideoFrameReader.hpp" - -namespace od -{ - -std::shared_ptr CvVideoFrameReader::ReadFrame() -{ - // opencv copies data anyway - cv::Mat captureFrame; - m_capture.read(captureFrame); - return std::make_shared(std::move(captureFrame)); -} - -bool CvVideoFrameReader::IsExhausted(const std::shared_ptr& frame) const -{ - assert(frame!=nullptr); - return frame->empty(); -} - -void CvVideoFrameReader::CheckIsOpen(const std::string& source) -{ - if (!m_capture.isOpened()) - { - throw std::runtime_error("Failed to open video capture for the source = " + source); - } -} - -void CvVideoFrameReader::Init(const std::string& source) -{ - m_capture.open(source); - CheckIsOpen(source); -} - -int CvVideoFrameReader::GetSourceWidth() const -{ - return static_cast(lround(m_capture.get(cv::CAP_PROP_FRAME_WIDTH))); -} - -int CvVideoFrameReader::GetSourceHeight() const -{ - return static_cast(lround(m_capture.get(cv::CAP_PROP_FRAME_HEIGHT))); -} - -double 
CvVideoFrameReader::GetSourceFps() const -{ - return m_capture.get(cv::CAP_PROP_FPS); -} - -bool CvVideoFrameReader::ConvertToRGB() -{ - m_capture.set(cv::CAP_PROP_CONVERT_RGB, 1.0); - return static_cast(m_capture.get(cv::CAP_PROP_CONVERT_RGB)); -} - -std::string CvVideoFrameReader::GetSourceEncoding() const -{ - char fourccStr[5]; - auto fourcc = (int)m_capture.get(cv::CAP_PROP_FOURCC); - sprintf(fourccStr,"%c%c%c%c",fourcc & 0xFF, (fourcc >> 8) & 0xFF, (fourcc >> 16) & 0xFF, (fourcc >> 24) & 0xFF); - return fourccStr; -} - -int CvVideoFrameReader::GetSourceEncodingInt() const -{ - return (int)m_capture.get(cv::CAP_PROP_FOURCC); -} - -int CvVideoFrameReader::GetFrameCount() const -{ - return static_cast(lround(m_capture.get(cv::CAP_PROP_FRAME_COUNT))); -}; - -std::shared_ptr CvVideoFrameReaderRgbWrapper::ReadFrame() -{ - auto framePtr = m_reader->ReadFrame(); - if (!IsExhausted(framePtr)) - { - cv::cvtColor(*framePtr, *framePtr, cv::COLOR_BGR2RGB); - } - return framePtr; -} - -bool CvVideoFrameReaderRgbWrapper::IsExhausted(const std::shared_ptr& frame) const -{ - return m_reader->IsExhausted(frame); -} - -CvVideoFrameReaderRgbWrapper::CvVideoFrameReaderRgbWrapper(std::unique_ptr reader): - m_reader(std::move(reader)) -{} - -}// namespace od \ No newline at end of file diff --git a/samples/ObjectDetection/src/CvWindowOutput.cpp b/samples/ObjectDetection/src/CvWindowOutput.cpp deleted file mode 100644 index a32147b19a..0000000000 --- a/samples/ObjectDetection/src/CvWindowOutput.cpp +++ /dev/null @@ -1,33 +0,0 @@ -// -// Copyright © 2020 Arm Ltd and Contributors. All rights reserved. 
-// SPDX-License-Identifier: MIT -// - -#include "CvWindowOutput.hpp" - -namespace od -{ - -void CvWindowOutput::Init(const std::string& windowName) -{ - m_windowName = windowName; - cv::namedWindow(m_windowName, cv::WINDOW_AUTOSIZE); -} - -void CvWindowOutput::WriteFrame(std::shared_ptr& frame) -{ - cv::cvtColor(*frame, *frame, cv::COLOR_RGB2BGR); - cv::imshow( m_windowName, *frame); - cv::waitKey(30); -} - -void CvWindowOutput::Close() -{ - cv::destroyWindow(m_windowName); -} - -bool CvWindowOutput::IsReady() const -{ - return true; -} -}// namespace od \ No newline at end of file diff --git a/samples/ObjectDetection/src/ImageUtils.cpp b/samples/ObjectDetection/src/ImageUtils.cpp index 9a3ed17b63..05b8a66c05 100644 --- a/samples/ObjectDetection/src/ImageUtils.cpp +++ b/samples/ObjectDetection/src/ImageUtils.cpp @@ -15,7 +15,7 @@ static cv::Scalar GetScalarColorCode(std::tuple color) } void AddInferenceOutputToFrame(od::DetectedObjects& decodedResults, cv::Mat& inputFrame, - std::vector>& labels) + std::vector>& labels) { for(const od::DetectedObject& object : decodedResults) { @@ -86,7 +86,7 @@ void AddInferenceOutputToFrame(od::DetectedObjects& decodedResults, cv::Mat& inp } -void ResizeFrame(const cv::Mat& frame, cv::Mat& dest, const od::Size& aspectRatio) +void ResizeFrame(const cv::Mat& frame, cv::Mat& dest, const common::Size& aspectRatio) { if(&dest != &frame) { @@ -119,7 +119,7 @@ void PadFrame(const cv::Mat& src, cv::Mat& dest, const int bottom, const int rig } } -void ResizeWithPad(const cv::Mat& frame, cv::Mat& dest, cv::Mat& cache, const od::Size& destSize) +void ResizeWithPad(const cv::Mat& frame, cv::Mat& dest, cv::Mat& cache, const common::Size& destSize) { ResizeFrame(frame, cache, destSize); PadFrame(cache, dest,destSize.m_Height - cache.rows,destSize.m_Width - cache.cols); diff --git a/samples/ObjectDetection/src/Main.cpp b/samples/ObjectDetection/src/Main.cpp index 10abb65cce..e057981550 100644 --- a/samples/ObjectDetection/src/Main.cpp +++ 
b/samples/ObjectDetection/src/Main.cpp @@ -6,7 +6,7 @@ #include "CvVideoFrameReader.hpp" #include "CvWindowOutput.hpp" #include "CvVideoFileWriter.hpp" -#include "NetworkPipeline.hpp" +#include "ObjectDetectionPipeline.hpp" #include "CmdArgsParser.hpp" #include @@ -14,6 +14,30 @@ #include #include +const std::string MODEL_NAME = "--model-name"; +const std::string VIDEO_FILE_PATH = "--video-file-path"; +const std::string MODEL_FILE_PATH = "--model-file-path"; +const std::string OUTPUT_VIDEO_FILE_PATH = "--output-video-file-path"; +const std::string LABEL_PATH = "--label-path"; +const std::string PREFERRED_BACKENDS = "--preferred-backends"; +const std::string HELP = "--help"; + +/* + * The accepted options for this Object detection executable + */ +static std::map CMD_OPTIONS = { + {VIDEO_FILE_PATH, "[REQUIRED] Path to the video file to run object detection on"}, + {MODEL_FILE_PATH, "[REQUIRED] Path to the Object Detection model to use"}, + {LABEL_PATH, "[REQUIRED] Path to the label set for the provided model file. " + "Label file is should just be an ordered list, seperated by new line."}, + {MODEL_NAME, "[REQUIRED] The name of the model being used. Accepted options: YOLO_V3_TINY, SSD_MOBILE"}, + {OUTPUT_VIDEO_FILE_PATH, "[OPTIONAL] Path to the output video file with detections added in. " + "If specified will save file to disk, else displays the output to screen"}, + {PREFERRED_BACKENDS, "[OPTIONAL] Takes the preferred backends in preference order, separated by comma." + " For example: CpuAcc,GpuAcc,CpuRef. Accepted options: [CpuAcc, CpuRef, GpuAcc]." 
+ " Defaults to CpuAcc,CpuRef"} +}; + /* * Reads the user supplied backend preference, splits it by comma, and returns an ordered vector */ @@ -34,10 +58,10 @@ std::vector GetPreferredBackendList(const std::string& preferr /* * Assigns a color to each label in the label set */ -std::vector> AssignColourToLabel(const std::string& pathToLabelFile) +std::vector> AssignColourToLabel(const std::string& pathToLabelFile) { std::ifstream in(pathToLabelFile); - std::vector> labels; + std::vector> labels; std::string str; std::default_random_engine generator; @@ -47,7 +71,7 @@ std::vector> AssignColourToLabel(const st { if(!str.empty()) { - od::BBoxColor c{ + common::BBoxColor c{ .colorCode = std::make_tuple(distribution(generator), distribution(generator), distribution(generator)) @@ -60,13 +84,13 @@ std::vector> AssignColourToLabel(const st return labels; } -std::tuple>, - std::unique_ptr>> +std::tuple>, + std::unique_ptr>> GetFrameSourceAndSink(const std::map& options) { - std::unique_ptr> readerPtr; + std::unique_ptr> readerPtr; - std::unique_ptr reader = std::make_unique(); + std::unique_ptr reader = std::make_unique(); reader->Init(GetSpecifiedOption(options, VIDEO_FILE_PATH)); auto enc = reader->GetSourceEncodingInt(); @@ -75,7 +99,7 @@ std::tuple>, auto h = reader->GetSourceHeight(); if (!reader->ConvertToRGB()) { - readerPtr = std::move(std::make_unique(std::move(reader))); + readerPtr = std::move(std::make_unique(std::move(reader))); } else { @@ -85,14 +109,14 @@ std::tuple>, if(CheckOptionSpecified(options, OUTPUT_VIDEO_FILE_PATH)) { std::string outputVideo = GetSpecifiedOption(options, OUTPUT_VIDEO_FILE_PATH); - auto writer = std::make_unique(); + auto writer = std::make_unique(); writer->Init(outputVideo, enc, fps, w, h); return std::make_tuple<>(std::move(readerPtr), std::move(writer)); } else { - auto writer = std::make_unique(); + auto writer = std::make_unique(); writer->Init("Processed Video"); return std::make_tuple<>(std::move(readerPtr), 
std::move(writer)); } @@ -109,7 +133,7 @@ int main(int argc, char *argv[]) } // Create the network options - od::ODPipelineOptions pipelineOptions; + common::PipelineOptions pipelineOptions; pipelineOptions.m_ModelFilePath = GetSpecifiedOption(options, MODEL_FILE_PATH); pipelineOptions.m_ModelName = GetSpecifiedOption(options, MODEL_NAME); @@ -127,8 +151,8 @@ int main(int argc, char *argv[]) od::IPipelinePtr objectDetectionPipeline = od::CreatePipeline(pipelineOptions); auto inputAndOutput = GetFrameSourceAndSink(options); - std::unique_ptr> reader = std::move(std::get<0>(inputAndOutput)); - std::unique_ptr> sink = std::move(std::get<1>(inputAndOutput)); + std::unique_ptr> reader = std::move(std::get<0>(inputAndOutput)); + std::unique_ptr> sink = std::move(std::get<1>(inputAndOutput)); if (!sink->IsReady()) { @@ -136,7 +160,7 @@ int main(int argc, char *argv[]) return 1; } - od::InferenceResults results; + common::InferenceResults results; std::shared_ptr frame = reader->ReadFrame(); diff --git a/samples/ObjectDetection/src/NetworkPipeline.cpp b/samples/ObjectDetection/src/NetworkPipeline.cpp deleted file mode 100644 index 7f05882fc4..0000000000 --- a/samples/ObjectDetection/src/NetworkPipeline.cpp +++ /dev/null @@ -1,102 +0,0 @@ -// -// Copyright © 2020 Arm Ltd and Contributors. All rights reserved. 
-// SPDX-License-Identifier: MIT -// - -#include "NetworkPipeline.hpp" -#include "ImageUtils.hpp" - -namespace od -{ - -ObjDetectionPipeline::ObjDetectionPipeline(std::unique_ptr executor, - std::unique_ptr decoder) : - m_executor(std::move(executor)), - m_decoder(std::move(decoder)){} - -void od::ObjDetectionPipeline::Inference(const cv::Mat& processed, InferenceResults& result) -{ - m_executor->Run(processed.data, processed.total() * processed.elemSize(), result); -} - -void ObjDetectionPipeline::PostProcessing(InferenceResults& inferenceResult, - const std::function& callback) -{ - DetectedObjects detections = m_decoder->Decode(inferenceResult, m_inputImageSize, - m_executor->GetImageAspectRatio(), {}); - if (callback) - { - callback(detections); - } -} - -void ObjDetectionPipeline::PreProcessing(const cv::Mat& frame, cv::Mat& processed) -{ - m_inputImageSize.m_Height = frame.rows; - m_inputImageSize.m_Width = frame.cols; - ResizeWithPad(frame, processed, m_processedFrame, m_executor->GetImageAspectRatio()); -} - -MobileNetSSDv1::MobileNetSSDv1(std::unique_ptr executor, - float objectThreshold) : - ObjDetectionPipeline(std::move(executor), - std::make_unique(objectThreshold)) -{} - -void MobileNetSSDv1::PreProcessing(const cv::Mat& frame, cv::Mat& processed) -{ - ObjDetectionPipeline::PreProcessing(frame, processed); - if (m_executor->GetInputDataType() == armnn::DataType::Float32) - { - // [0, 255] => [-1.0, 1.0] - processed.convertTo(processed, CV_32FC3, 1 / 127.5, -1); - } -} - -YoloV3Tiny::YoloV3Tiny(std::unique_ptr executor, - float NMSThreshold, float ClsThreshold, float ObjectThreshold) : - ObjDetectionPipeline(std::move(executor), - std::move(std::make_unique(NMSThreshold, - ClsThreshold, - ObjectThreshold))) -{} - -void YoloV3Tiny::PreProcessing(const cv::Mat& frame, cv::Mat& processed) -{ - ObjDetectionPipeline::PreProcessing(frame, processed); - if (m_executor->GetInputDataType() == armnn::DataType::Float32) - { - processed.convertTo(processed, 
CV_32FC3); - } -} - -IPipelinePtr CreatePipeline(od::ODPipelineOptions& config) -{ - auto executor = std::make_unique(config.m_ModelFilePath, config.m_backends); - - if (config.m_ModelName == "SSD_MOBILE") - { - float detectionThreshold = 0.6; - - return std::make_unique(std::move(executor), - detectionThreshold - ); - } - else if (config.m_ModelName == "YOLO_V3_TINY") - { - float NMSThreshold = 0.6f; - float ClsThreshold = 0.6f; - float ObjectThreshold = 0.6f; - return std::make_unique(std::move(executor), - NMSThreshold, - ClsThreshold, - ObjectThreshold - ); - } - else - { - throw std::invalid_argument("Unknown Model name: " + config.m_ModelName + " supplied by user."); - } - -} -}// namespace od \ No newline at end of file diff --git a/samples/ObjectDetection/src/ObjectDetectionPipeline.cpp b/samples/ObjectDetection/src/ObjectDetectionPipeline.cpp new file mode 100644 index 0000000000..077caa40cb --- /dev/null +++ b/samples/ObjectDetection/src/ObjectDetectionPipeline.cpp @@ -0,0 +1,102 @@ +// +// Copyright © 2020 Arm Ltd and Contributors. All rights reserved. 
+// SPDX-License-Identifier: MIT +// + +#include "ObjectDetectionPipeline.hpp" +#include "ImageUtils.hpp" + +namespace od +{ + +ObjDetectionPipeline::ObjDetectionPipeline(std::unique_ptr> executor, + std::unique_ptr decoder) : + m_executor(std::move(executor)), + m_decoder(std::move(decoder)){} + +void od::ObjDetectionPipeline::Inference(const cv::Mat& processed, common::InferenceResults& result) +{ + m_executor->Run(processed.data, processed.total() * processed.elemSize(), result); +} + +void ObjDetectionPipeline::PostProcessing(common::InferenceResults& inferenceResult, + const std::function& callback) +{ + DetectedObjects detections = m_decoder->Decode(inferenceResult, m_inputImageSize, + m_executor->GetImageAspectRatio(), {}); + if (callback) + { + callback(detections); + } +} + +void ObjDetectionPipeline::PreProcessing(const cv::Mat& frame, cv::Mat& processed) +{ + m_inputImageSize.m_Height = frame.rows; + m_inputImageSize.m_Width = frame.cols; + ResizeWithPad(frame, processed, m_processedFrame, m_executor->GetImageAspectRatio()); +} + +MobileNetSSDv1::MobileNetSSDv1(std::unique_ptr> executor, + float objectThreshold) : + ObjDetectionPipeline(std::move(executor), + std::make_unique(objectThreshold)) +{} + +void MobileNetSSDv1::PreProcessing(const cv::Mat& frame, cv::Mat& processed) +{ + ObjDetectionPipeline::PreProcessing(frame, processed); + if (m_executor->GetInputDataType() == armnn::DataType::Float32) + { + // [0, 255] => [-1.0, 1.0] + processed.convertTo(processed, CV_32FC3, 1 / 127.5, -1); + } +} + +YoloV3Tiny::YoloV3Tiny(std::unique_ptr> executor, + float NMSThreshold, float ClsThreshold, float ObjectThreshold) : + ObjDetectionPipeline(std::move(executor), + std::move(std::make_unique(NMSThreshold, + ClsThreshold, + ObjectThreshold))) +{} + +void YoloV3Tiny::PreProcessing(const cv::Mat& frame, cv::Mat& processed) +{ + ObjDetectionPipeline::PreProcessing(frame, processed); + if (m_executor->GetInputDataType() == armnn::DataType::Float32) + { + 
processed.convertTo(processed, CV_32FC3); + } +} + +IPipelinePtr CreatePipeline(common::PipelineOptions& config) +{ + auto executor = std::make_unique>(config.m_ModelFilePath, config.m_backends); + + if (config.m_ModelName == "SSD_MOBILE") + { + float detectionThreshold = 0.6; + + return std::make_unique(std::move(executor), + detectionThreshold + ); + } + else if (config.m_ModelName == "YOLO_V3_TINY") + { + float NMSThreshold = 0.6f; + float ClsThreshold = 0.6f; + float ObjectThreshold = 0.6f; + return std::make_unique(std::move(executor), + NMSThreshold, + ClsThreshold, + ObjectThreshold + ); + } + else + { + throw std::invalid_argument("Unknown Model name: " + config.m_ModelName + " supplied by user."); + } + +} +}// namespace od \ No newline at end of file diff --git a/samples/ObjectDetection/src/SSDResultDecoder.cpp b/samples/ObjectDetection/src/SSDResultDecoder.cpp index a3319212e5..6dfd1abf84 100644 --- a/samples/ObjectDetection/src/SSDResultDecoder.cpp +++ b/samples/ObjectDetection/src/SSDResultDecoder.cpp @@ -12,9 +12,9 @@ namespace od { -DetectedObjects SSDResultDecoder::Decode(const InferenceResults& networkResults, - const Size& outputFrameSize, - const Size& resizedFrameSize, +DetectedObjects SSDResultDecoder::Decode(const common::InferenceResults& networkResults, + const common::Size& outputFrameSize, + const common::Size& resizedFrameSize, const std::vector& labels) { // SSD network outputs 4 tensors: bounding boxes, labels, probabilities, number of detections. 
diff --git a/samples/ObjectDetection/src/YoloResultDecoder.cpp b/samples/ObjectDetection/src/YoloResultDecoder.cpp index ffbf7cb68d..f177802f8a 100644 --- a/samples/ObjectDetection/src/YoloResultDecoder.cpp +++ b/samples/ObjectDetection/src/YoloResultDecoder.cpp @@ -13,9 +13,9 @@ namespace od { -DetectedObjects YoloResultDecoder::Decode(const InferenceResults& networkResults, - const Size& outputFrameSize, - const Size& resizedFrameSize, +DetectedObjects YoloResultDecoder::Decode(const common::InferenceResults& networkResults, + const common::Size& outputFrameSize, + const common::Size& resizedFrameSize, const std::vector& labels) { @@ -33,7 +33,7 @@ DetectedObjects YoloResultDecoder::Decode(const InferenceResults& networkResults DetectedObjects detectedObjects; DetectedObjects resultsAfterNMS; - for (const InferenceResult& result : networkResults) + for (const common::InferenceResult& result : networkResults) { for (unsigned int i = 0; i < m_numBoxes; ++i) { diff --git a/samples/ObjectDetection/test/FrameReaderTest.cpp b/samples/ObjectDetection/test/FrameReaderTest.cpp index a4bda227b3..a02fa7fd4e 100644 --- a/samples/ObjectDetection/test/FrameReaderTest.cpp +++ b/samples/ObjectDetection/test/FrameReaderTest.cpp @@ -20,7 +20,7 @@ SCENARIO("Read frames from video file using CV frame reader", "[framereader]") { std::string file = testResources + "/" + "Megamind.avi"; WHEN("Frame reader is initialised") { - od::CvVideoFrameReader reader; + common::CvVideoFrameReader reader; THEN("no exception is thrown") { reader.Init(file); @@ -92,7 +92,7 @@ SCENARIO("Read frames from video file using CV frame reader", "[framereader]") { WHEN("Frame reader is initialised") { - od::CvVideoFrameReader reader; + common::CvVideoFrameReader reader; THEN("exception is thrown") { REQUIRE_THROWS(reader.Init(file)); diff --git a/samples/ObjectDetection/test/ImageUtilsTest.cpp b/samples/ObjectDetection/test/ImageUtilsTest.cpp index e486ae192b..4490cffda9 100644 --- 
a/samples/ObjectDetection/test/ImageUtilsTest.cpp +++ b/samples/ObjectDetection/test/ImageUtilsTest.cpp @@ -96,9 +96,9 @@ TEST_CASE("Test Adding Inference output to frame") std::string testResources = TEST_RESOURCE_DIR; REQUIRE(testResources != ""); - std::vector> labels; + std::vector> labels; - od::BBoxColor c + common::BBoxColor c { .colorCode = std::make_tuple (0, 0, 255) }; diff --git a/samples/ObjectDetection/test/PipelineTest.cpp b/samples/ObjectDetection/test/PipelineTest.cpp index 289f44f5e9..bc5824e483 100644 --- a/samples/ObjectDetection/test/PipelineTest.cpp +++ b/samples/ObjectDetection/test/PipelineTest.cpp @@ -4,7 +4,7 @@ // #include #include -#include +#include "ObjectDetectionPipeline.hpp" #include "Types.hpp" static std::string GetResourceFilePath(const std::string& filename) @@ -32,14 +32,14 @@ TEST_CASE("Test Network Execution SSD_MOBILE") std::string testResources = TEST_RESOURCE_DIR; REQUIRE(testResources != ""); // Create the network options - od::ODPipelineOptions options; + common::PipelineOptions options; options.m_ModelFilePath = GetResourceFilePath("detect.tflite"); options.m_ModelName = "SSD_MOBILE"; options.m_backends = {"CpuAcc", "CpuRef"}; od::IPipelinePtr objectDetectionPipeline = od::CreatePipeline(options); - od::InferenceResults results; + common::InferenceResults results; cv::Mat processed; cv::Mat inputFrame = cv::imread(GetResourceFilePath("basketball1.png"), cv::IMREAD_COLOR); cv::cvtColor(inputFrame, inputFrame, cv::COLOR_BGR2RGB); diff --git a/samples/SpeechRecognition/CMakeLists.txt b/samples/SpeechRecognition/CMakeLists.txt new file mode 100644 index 0000000000..6c6b0b6dfc --- /dev/null +++ b/samples/SpeechRecognition/CMakeLists.txt @@ -0,0 +1,62 @@ +# Copyright © 2020 Arm Ltd and Contributors. All rights reserved. 
+# SPDX-License-Identifier: MIT + +cmake_minimum_required(VERSION 3.0.2) + +set(CMAKE_C_STANDARD 99) +set(CMAKE_CXX_STANDARD 14) + +# Make the standard a requirement => prevent fallback to previous +# supported standard +set(CMAKE_C_STANDARD_REQUIRED ON) +set(CMAKE_CXX_STANDARD_REQUIRED ON) + +# We want to pass standard C/C++ flags, without gnu extensions +set(CMAKE_C_EXTENSIONS OFF) +set(CMAKE_CXX_EXTENSIONS OFF) + +project (speech-recognition-example) + +set(CMAKE_C_FLAGS_DEBUG "-DDEBUG -O0 -g -fPIC -pthread") +set(CMAKE_C_FLAGS_RELEASE "-DNDEBUG -O3 -fPIC -pthread") + +set(CMAKE_CXX_FLAGS_DEBUG "-DDEBUG -O0 -g -fPIC -pthread") +set(CMAKE_CXX_FLAGS_RELEASE "-DNDEBUG -O3 -fPIC -pthread") + +include(ExternalProject) + +# Build in release mode by default +if (NOT CMAKE_BUILD_TYPE STREQUAL Debug) + set(CMAKE_BUILD_TYPE Release CACHE INTERNAL "") +endif() + +set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib) +set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib) +set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin) + +if (NOT DEFINED DEPENDENCIES_DIR) + set(DEPENDENCIES_DIR ${CMAKE_BINARY_DIR}/dependencies) +endif() + +include(../common/cmake/find_armnn.cmake) + +include_directories(include) +include_directories(../common/include/ArmnnUtils) +include_directories(../common/include/Utils) + +file(GLOB SOURCES "src/*.cpp") +file(GLOB COMMON_UTILS_SOURCES "../common/src/Utils/*.cpp") +list(REMOVE_ITEM SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/src/Main.cpp) +file(GLOB TEST_SOURCES "test/*.cpp") +file(GLOB APP_MAIN "src/Main.cpp") + +if(BUILD_UNIT_TESTS) + include(cmake/unit_tests.cmake) +endif() + +set(APP_TARGET_NAME "${CMAKE_PROJECT_NAME}") + +add_executable("${APP_TARGET_NAME}" ${COMMON_UTILS_SOURCES} ${SOURCES} ${APP_MAIN}) + +target_link_libraries("${APP_TARGET_NAME}" PUBLIC ${ARMNN_LIBS} -lsndfile -lsamplerate) +target_include_directories("${APP_TARGET_NAME}" PUBLIC ${ARMNN_INCLUDE_DIR} ) diff --git a/samples/SpeechRecognition/Readme.md 
b/samples/SpeechRecognition/Readme.md new file mode 100644 index 0000000000..656ba55a79 --- /dev/null +++ b/samples/SpeechRecognition/Readme.md @@ -0,0 +1,245 @@ +# Speech Recognition Example + +## Introduction +This is sample code showing automatic speech recognition using the Arm NN public C++ API. The compiled application can take + + * an audio file + +as input and produce + * recognised text to the console + +as output + +## Dependencies + +This example utilises `libsndfile`, `libasound` and `libsamplerate` libraries to capture the raw audio data from file, and to re-sample to the expected +sample rate. Top level inference API is provided by Arm NN library. + +### Arm NN + +Speech Recognition example build system does not trigger Arm NN compilation. Thus, before building the application, +please ensure that Arm NN libraries and header files are available on your build platform. +The application executable binary dynamically links with the following Arm NN libraries: +* libarmnn.so +* libarmnnTfLiteParser.so + +The build script searches for available Arm NN libraries in the following order: +1. Inside custom user directory specified by ARMNN_LIB_DIR cmake option. +2. Inside the current Arm NN repository, assuming that Arm NN was built following [these instructions](../../BuildGuideCrossCompilation.md). +3. Inside default locations for system libraries, assuming Arm NN was installed from deb packages. + +Arm NN header files are searched for in the parent directory of the found library files, under the `include` directory, i.e. +libraries found in `/usr/lib` or `/usr/lib64` and header files in `/usr/include` (or `${ARMNN_LIB_DIR}/include`). + +Please see [find_armnn.cmake](../common/cmake/find_armnn.cmake) for implementation details. + +## Building +There is one flow for building this application: +* native build on a host platform + +### Build Options +* ARMNN_LIB_DIR - point to the custom location of the Arm NN libs and headers. +* BUILD_UNIT_TESTS - set to `1` to build tests.
In addition to the main application, the `speech-recognition-example-tests` +unit tests executable will be created. + +### Native Build +To build this application on a host platform, firstly ensure that required dependencies are installed: +For example, for Raspberry Pi: +```commandline +sudo apt-get update +sudo apt-get -yq install libsndfile1-dev +sudo apt-get -yq install libasound2-dev +sudo apt-get -yq install libsamplerate-dev +``` + +To build demo application, create a build directory: +```commandline +mkdir build +cd build +``` +If you have already installed Arm NN and the required libraries: + +Inside build directory, run cmake and make commands: +```commandline +cmake .. +make +``` +This will build the following in bin directory: +* `speech-recognition-example` - application executable + +If you have custom Arm NN location, use the `ARMNN_LIB_DIR` option: +```commandline +cmake -DARMNN_LIB_DIR=/path/to/armnn .. +make +``` +## Executing + +Once the application executable is built, it can be executed with the following options: +* --audio-file-path: Path to the audio file to run speech recognition on **[REQUIRED]** +* --model-file-path: Path to the Speech Recognition model to use **[REQUIRED]** + +* --preferred-backends: Takes the preferred backends in preference order, separated by comma. + For example: `CpuAcc,GpuAcc,CpuRef`. Accepted options: [`CpuAcc`, `CpuRef`, `GpuAcc`]. + Defaults to `CpuRef` **[OPTIONAL]** + +### Speech Recognition on a supplied audio file + +To run speech recognition on a supplied audio file and output the result to console: +```commandline +./speech-recognition-example --audio-file-path /path/to/audio/file --model-file-path /path/to/model/file +``` +--- + +# Application Overview +This section provides a walkthrough of the application, explaining in detail the steps: +1. Initialisation + 1. Reading from Audio Source +2. Creating a Network + 1. Creating Parser and Importing Graph + 2. Optimizing Graph for Compute Device + 3.
Creating Input and Output Binding Information +3. Speech Recognition pipeline + 1. Pre-processing the Captured Audio + 2. Making Input and Output Tensors + 3. Executing Inference + 4. Postprocessing + 5. Decoding and Processing Inference Output + +### Initialisation + +##### Reading from Audio Source +After parsing user arguments, the chosen audio file is loaded into an AudioCapture object. +We use [`AudioCapture`](./include/AudioCapture.hpp) in our main function to capture appropriately sized audio blocks from the source using the +`Next()` function. + +The `AudioCapture` object also re-samples the audio input to a desired sample rate, and sets the number of channels used to one channel (i.e `mono`) + +### Creating a Network + +All operations with Arm NN and networks are encapsulated in [`ArmnnNetworkExecutor`](./include/ArmnnNetworkExecutor.hpp) +class. + +##### Creating Parser and Importing Graph +The first step with Arm NN SDK is to import a graph from file by using the appropriate parser. + +The Arm NN SDK provides parsers for reading graphs from a variety of model formats. In our application we specifically +focus on `.tflite, .pb, .onnx` models. + +Based on the extension of the provided model file, the corresponding parser is created and the network file loaded with +`CreateNetworkFromBinaryFile()` method. The parser will handle the creation of the underlying Arm NN graph. + +Current example accepts tflite format model files, we use `ITfLiteParser`: +```c++ +#include "armnnTfLiteParser/ITfLiteParser.hpp" + +armnnTfLiteParser::ITfLiteParserPtr parser = armnnTfLiteParser::ITfLiteParser::Create(); +armnn::INetworkPtr network = parser->CreateNetworkFromBinaryFile(modelPath.c_str()); +``` + +##### Optimizing Graph for Compute Device +Arm NN supports optimized execution on multiple CPU and GPU devices. Prior to executing a graph, we must select the +appropriate device context. We do this by creating a runtime context with default options with `IRuntime()`. 
+ +For example: +```c++ +#include "armnn/ArmNN.hpp" + +auto runtime = armnn::IRuntime::Create(armnn::IRuntime::CreationOptions()); +``` + +We can optimize the imported graph by specifying a list of backends in order of preference and implement +backend-specific optimizations. The backends are identified by a string unique to the backend, +for example `CpuAcc, GpuAcc, CpuRef`. + +For example: +```c++ +std::vector<armnn::BackendId> backends{"CpuAcc", "GpuAcc", "CpuRef"}; +``` + +Internally and transparently, Arm NN splits the graph into subgraphs based on backends, calls an optimize-subgraphs +function on each of them and, if possible, substitutes the corresponding subgraph in the original graph with +its optimized version. + +Using the `Optimize()` function we optimize the graph for inference and load the optimized network onto the compute +device with `LoadNetwork()`. This function creates the backend-specific workloads +for the layers and a backend-specific workload factory which is called to create the workloads. + +For example: +```c++ +armnn::IOptimizedNetworkPtr optNet = Optimize(*network, + backends, + runtime->GetDeviceSpec(), + armnn::OptimizerOptions()); +std::string errorMessage; +runtime->LoadNetwork(0, std::move(optNet), errorMessage); +std::cerr << errorMessage << std::endl; +``` + +##### Creating Input and Output Binding Information +Parsers can also be used to extract the input information for the network. By calling `GetSubgraphInputTensorNames` +we extract all the input names and, with `GetNetworkInputBindingInfo`, bind the input points of the graph. +For example: +```c++ +std::vector<std::string> inputNames = parser->GetSubgraphInputTensorNames(0); +auto inputBindingInfo = parser->GetNetworkInputBindingInfo(0, inputNames[0]); +``` +The input binding information contains all the essential information about the input.
It is a tuple consisting of +integer identifiers for bindable layers (inputs, outputs) and the tensor info (data type, quantization information, +number of dimensions, total number of elements). + +Similarly, we can get the output binding information for an output layer by using the parser to retrieve output +tensor names and calling `GetNetworkOutputBindingInfo()`. + +### Speech Recognition pipeline + +The speech recognition pipeline has 3 steps to perform: data pre-processing, running inference and decoding the inference results +in the post-processing step. + +See [`SpeechRecognitionPipeline`](include/SpeechRecognitionPipeline.hpp) for more details. + +#### Pre-processing the Audio Input +Each frame captured from source is read and stored by the AudioCapture object. +Its `Next()` function provides us with the correctly positioned window of data, sized appropriately for the given model, to pre-process before inference. + +```c++ +std::vector audioBlock = capture.Next(); +... +std::vector preprocessedData = asrPipeline->PreProcessing(audioBlock, preprocessor); +``` + +The `MFCC` class is then used to extract the Mel-frequency Cepstral Coefficients (MFCCs, [see Wikipedia](https://en.wikipedia.org/wiki/Mel-frequency_cepstrum)) from each stored audio frame in the provided window of audio, to be used as features for the network. MFCCs are the result of computing the dot product of the Discrete Cosine Transform (DCT) Matrix and the log of the Mel energy. + +After all the MFCCs needed for an inference have been extracted from the audio data, we convolve them with 1-dimensional Savitzky-Golay filters to compute the first and second MFCC derivatives with respect to time. The MFCCs and the derivatives are concatenated to make the input tensor for the model. + + +#### Executing Inference +```c++ +common::InferenceResults results; +...
+asrPipeline->Inference(preprocessedData, results); +``` +Inference step will call `ArmnnNetworkExecutor::Run` method that will prepare input tensors and execute inference. +A compute device performs inference for the loaded network using the `EnqueueWorkload()` function of the runtime context. +For example: +```c++ +//const void* inputData = ...; +//outputTensors were pre-allocated before + +armnn::InputTensors inputTensors = {{ inputBindingInfo.first,armnn::ConstTensor(inputBindingInfo.second, inputData)}}; +runtime->EnqueueWorkload(0, inputTensors, outputTensors); +``` +We allocate memory for output data once and map it to output tensor objects. After successful inference, we read data +from the pre-allocated output data buffer. See [`ArmnnNetworkExecutor::ArmnnNetworkExecutor`](./src/ArmnnNetworkExecutor.cpp) +and [`ArmnnNetworkExecutor::Run`](./src/ArmnnNetworkExecutor.cpp) for more details. + +#### Postprocessing + +##### Decoding and Processing Inference Output +The output from the inference must be decoded to obtain the recognised characters from the speech. +A simple greedy decoder classifies the results by taking the highest element of the output as a key for the labels dictionary. +The value returned is a character which is appended to a list, and the list is filtered to remove unwanted characters. + +```c++ +asrPipeline->PostProcessing(results, isFirstWindow, !capture.HasNext(), currentRContext); +``` +The produced string is displayed on the console. \ No newline at end of file diff --git a/samples/SpeechRecognition/cmake/unit_tests.cmake b/samples/SpeechRecognition/cmake/unit_tests.cmake new file mode 100644 index 0000000000..47c4f4b579 --- /dev/null +++ b/samples/SpeechRecognition/cmake/unit_tests.cmake @@ -0,0 +1,34 @@ +# Copyright © 2020 Arm Ltd and Contributors. All rights reserved. 
+# SPDX-License-Identifier: MIT + +set(TEST_RESOURCES_DIR ${CMAKE_SOURCE_DIR}/test/resources) +file(MAKE_DIRECTORY ${TEST_RESOURCES_DIR}) +add_definitions (-DTEST_RESOURCE_DIR="${TEST_RESOURCES_DIR}") +set(TEST_TARGET_NAME "${CMAKE_PROJECT_NAME}-tests") + +file(GLOB TEST_SOURCES "test/*") + +file(MAKE_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/test/resources) +include(../common/cmake/find_catch.cmake) + +add_executable("${TEST_TARGET_NAME}" ${COMMON_UTILS_SOURCES} ${SOURCES} ${TEST_SOURCES} ) + +ExternalProject_Add(passport + URL https://raw.githubusercontent.com/Azure-Samples/cognitive-services-speech-sdk/master/sampledata/audiofiles/myVoiceIsMyPassportVerifyMe04.wav + DOWNLOAD_NO_EXTRACT 1 + CONFIGURE_COMMAND "" + BUILD_COMMAND ${CMAKE_COMMAND} -E copy /myVoiceIsMyPassportVerifyMe04.wav ${CMAKE_CURRENT_SOURCE_DIR}/test/resources + INSTALL_COMMAND "" + ) + +add_dependencies( + "${TEST_TARGET_NAME}" + "passport" + "catch2-headers" +) + +target_include_directories("${TEST_TARGET_NAME}" PUBLIC ${TEST_TPIP_INCLUDE} + ${ARMNN_INCLUDE_DIR} + ${DEPENDENCIES_DIR} ${TEST_RESOURCES_DIR} ${COMMON_INCLUDE_DIR}) + +target_link_libraries("${TEST_TARGET_NAME}" PUBLIC ${ARMNN_LIBS} -lsndfile -lsamplerate) \ No newline at end of file diff --git a/samples/SpeechRecognition/include/AudioCapture.hpp b/samples/SpeechRecognition/include/AudioCapture.hpp new file mode 100644 index 0000000000..90c2eccacf --- /dev/null +++ b/samples/SpeechRecognition/include/AudioCapture.hpp @@ -0,0 +1,62 @@ +// +// Copyright © 2020 Arm Ltd and Contributors. All rights reserved. 
+// SPDX-License-Identifier: MIT +// + +#pragma once + +#include +#include + +#include + +#include + +#include + +#include "SlidingWindow.hpp" + +namespace asr +{ + +/** +* @brief Class used to capture the audio data loaded from file, and to provide a method of + * extracting correctly positioned and appropriately sized audio windows +* +*/ + class AudioCapture + { + public: + + SlidingWindow m_window; + int lastReadIdx= 0; + + /** + * @brief Default constructor + */ + AudioCapture() + {}; + + /** + * @brief Function to load the audio data captured from the + * input file to memory. + */ + std::vector LoadAudioFile(std::string filePath); + + /** + * @brief Function to initialize the sliding window. This will set its position in memory, its + * window size and its stride. + */ + void InitSlidingWindow(float* data, size_t dataSize, int minSamples, size_t stride); + + /** + * Checks whether there is another block of audio in memory to read + */ + bool HasNext(); + + /** + * Retrieves the next block of audio if its available + */ + std::vector Next(); + }; +} // namespace asr \ No newline at end of file diff --git a/samples/SpeechRecognition/include/DataStructures.hpp b/samples/SpeechRecognition/include/DataStructures.hpp new file mode 100644 index 0000000000..9922265299 --- /dev/null +++ b/samples/SpeechRecognition/include/DataStructures.hpp @@ -0,0 +1,102 @@ +// +// Copyright © 2020 Arm Ltd and Contributors. All rights reserved. +// SPDX-License-Identifier: MIT +// +#pragma once + +#include +#include + +/** + * Class Array2d is a data structure that represents a two dimensional array. + * The data is allocated in contiguous memory, arranged row-wise + * and individual elements can be accessed with the () operator. + * For example a two dimensional array D of size (M, N) can be accessed: + * + * _|<------------- col size = N -------->| + * | D(r=0, c=0) D(r=0, c=1)... D(r=0, c=N) + * | D(r=1, c=0) D(r=1, c=1)... D(r=1, c=N) + * | ... + * row size = M ... + * | ... 
+ * _ D(r=M, c=0) D(r=M, c=1)... D(r=M, c=N) + * + */ +template +class Array2d +{ +private: + size_t m_rows; + size_t m_cols; + T* m_data; + +public: + /** + * Creates the array2d with the given sizes. + * + * @param rows number of rows. + * @param cols number of columns. + */ + Array2d(unsigned rows, unsigned cols) + { + if (rows == 0 || cols == 0) { + printf("Array2d constructor has 0 size.\n"); + m_data = nullptr; + return; + } + m_rows = rows; + m_cols = cols; + m_data = new T[rows * cols]; + } + + ~Array2d() + { + delete[] m_data; + } + + T& operator() (unsigned int row, unsigned int col) + { + return m_data[m_cols * row + col]; + } + + T operator() (unsigned int row, unsigned int col) const + { + return m_data[m_cols * row + col]; + } + + /** + * Gets rows number of the current array2d. + * @return number of rows. + */ + size_t size(size_t dim) + { + switch (dim) + { + case 0: + return m_rows; + case 1: + return m_cols; + default: + return 0; + } + } + + /** + * Gets the array2d total size. + */ + size_t totalSize() + { + return m_rows * m_cols; + } + + /** + * array2d iterator. + */ + using iterator=T*; + using const_iterator=T const*; + + iterator begin() { return m_data; } + iterator end() { return m_data + totalSize(); } + const_iterator begin() const { return m_data; } + const_iterator end() const { return m_data + totalSize(); }; +}; diff --git a/samples/SpeechRecognition/include/Decoder.hpp b/samples/SpeechRecognition/include/Decoder.hpp new file mode 100644 index 0000000000..69d97ccf64 --- /dev/null +++ b/samples/SpeechRecognition/include/Decoder.hpp @@ -0,0 +1,63 @@ +// +// Copyright © 2020 Arm Ltd and Contributors. All rights reserved. 
+// SPDX-License-Identifier: MIT +// + +#include +#include +#include +#include +#include + +# pragma once + +namespace asr +{ +/** +* @brief Class used to Decode the output of the ASR inference +* +*/ + class Decoder + { + public: + std::map m_labels; + /** + * @brief Constructor + * @param[in] labels - map of labels to be used for decoding to text. + */ + Decoder(std::map& labels); + + /** + * @brief Decodes the output into a text string: arg-max label per row, then FilterCharacters() + * @param[in] contextToProcess - the output vector to decode, read as consecutive rows of rowLength values. + */ + template + std::string DecodeOutput(std::vector& contextToProcess) + { + int rowLength = 29; // values per output row (hard-coded); the arg-max index within a row is the m_labels key + + std::vector unfilteredText; + + for(int row = 0; row < contextToProcess.size()/rowLength; ++row) + { + std::vector rowVector; + for(int j = 0; j < rowLength; ++j) + { + rowVector.emplace_back(static_cast(contextToProcess[row * rowLength + j])); + } + + int max_index = std::distance(rowVector.begin(),std::max_element(rowVector.begin(), rowVector.end())); + unfilteredText.emplace_back(this->m_labels.at(max_index)[0]); + } + + std::string filteredText = FilterCharacters(unfilteredText); + return filteredText; + } + + /** + * @brief Function to filter out unwanted characters + * @param[in] unfiltered - the unfiltered output to be processed. + */ + std::string FilterCharacters(std::vector& unfiltered); + }; +} // namespace asr diff --git a/samples/SpeechRecognition/include/MFCC.hpp b/samples/SpeechRecognition/include/MFCC.hpp new file mode 100644 index 0000000000..14b6d9fe79 --- /dev/null +++ b/samples/SpeechRecognition/include/MFCC.hpp @@ -0,0 +1,244 @@ +// +// Copyright © 2020 Arm Ltd and Contributors. All rights reserved.
// SPDX-License-Identifier: MIT
//

#pragma once

// NOTE(review): the header names and template argument lists of this file were
// lost when the patch was extracted; they are reconstructed from how the code
// uses them.
#include <algorithm>
#include <cmath>
#include <cstddef>
#include <cstdint>
#include <limits>
#include <string>
#include <vector>

/* MFCC's consolidated parameters */
class MfccParams
{
public:
    float       m_samplingFreq;
    int         m_numFbankBins;
    float       m_melLoFreq;
    float       m_melHiFreq;
    int         m_numMfccFeatures;
    int         m_frameLen;
    int         m_frameLenPadded;
    bool        m_useHtkMethod;
    int         m_numMfccVectors;

    /** @brief  Constructor */
    MfccParams(const float samplingFreq, const int numFbankBins,
               const float melLoFreq, const float melHiFreq,
               const int numMfccFeats, const int frameLen,
               const bool useHtkMethod, const int numMfccVectors);

    /* Delete the default constructor */
    MfccParams()  = delete;

    /* Default destructor */
    ~MfccParams() = default;

    /** @brief   String representation of parameters */
    std::string Str();
};

/**
 * @brief   Class for MFCC feature extraction.
 *          Based on https://github.com/ARM-software/ML-KWS-for-MCU/blob/master/Deployment/Source/MFCC/mfcc.cpp
 *          This class is designed to be generic and self-sufficient but
 *          certain calculation routines can be overridden to accommodate
 *          use-case specific requirements.
 *
 * NOTE(review): several member names start with an underscore followed by an
 * upper-case letter (e.g. _MfccComputePreFeature); such identifiers are
 * reserved for the implementation. They are kept because other files use them.
 */
class MFCC
{

public:

    /**
    * @brief        Extract MFCC features for one single small frame of
    *               audio data e.g. 640 samples.
    * @param[in]    audioData - Vector of audio samples to calculate
    *               features for.
    * @return       Vector of extracted MFCC features.
    **/
    std::vector<float> MfccCompute(const std::vector<float>& audioData);

    MfccParams _m_params;

    /**
     * @brief       Constructor
     * @param[in]   params - MFCC parameters
    */
    MFCC(const MfccParams& params);

    /* Delete the default constructor */
    MFCC() = delete;

    /**
     * @brief   Default destructor. Virtual because the class exposes virtual
     *          routines meant to be overridden, so instances may be destroyed
     *          through a base pointer.
     */
    virtual ~MFCC() = default;

    /** @brief  Initialise */
    void Init();

    /**
     * @brief        Extract MFCC features and quantise for one single small
     *               frame of audio data e.g. 640 samples.
     * @param[in]    audioData - Vector of audio samples to calculate
     *               features for.
     * @param[in]    quantScale - quantisation scale.
     * @param[in]    quantOffset - quantisation offset
     * @return       Vector of extracted quantised MFCC features.
     **/
    template<typename T>
    std::vector<T> MfccComputeQuant(const std::vector<float>& audioData,
                                    const float quantScale,
                                    const int quantOffset)
    {
        this->_MfccComputePreFeature(audioData);

        /* Clamp range of the target type. lowest() rather than min(): for
         * floating-point T, min() is the smallest positive value, not the
         * most negative one. For integer T the two are identical. */
        const float minVal = static_cast<float>(std::numeric_limits<T>::lowest());
        const float maxVal = static_cast<float>(std::numeric_limits<T>::max());

        std::vector<T> mfccOut(this->_m_params.m_numMfccFeatures);
        const size_t numFbankBins = this->_m_params.m_numFbankBins;

        /* Take DCT. Uses matrix mul. */
        for (size_t i = 0, j = 0; i < mfccOut.size(); ++i, j += numFbankBins)
        {
            float sum = 0;
            for (size_t k = 0; k < numFbankBins; ++k)
            {
                sum += this->_m_dctMatrix[j + k] * this->_m_melEnergies[k];
            }
            /* Quantize to T. */
            sum = std::round((sum / quantScale) + quantOffset);
            mfccOut[i] = static_cast<T>(std::min<float>(std::max<float>(sum, minVal), maxVal));
        }

        return mfccOut;
    }

    /* Constants */
    static constexpr float logStep = 1.8562979903656 / 27.0;
    static constexpr float freqStep = 200.0 / 3;
    static constexpr float minLogHz = 1000.0;
    static constexpr float minLogMel = minLogHz / freqStep;

protected:
    /**
     * @brief       Project input frequency to Mel Scale.
     * @param[in]   freq - input frequency in floating point
     * @param[in]   useHTKMethod - bool to signal if HTK method is to be
     *              used for calculation
     * @return      Mel transformed frequency in floating point
     **/
    static float MelScale(const float    freq,
                          const bool     useHTKMethod = true);

    /**
     * @brief       Inverse Mel transform - convert MEL warped frequency
     *              back to normal frequency
     * @param[in]   melFreq - Mel frequency in floating point
     * @param[in]   useHTKMethod - bool to signal if HTK method is to be
     *              used for calculation
     * @return      Real world frequency in floating point
     **/
    static float InverseMelScale(const float melFreq,
                                 const bool  useHTKMethod = true);

    /**
     * @brief       Populates MEL energies after applying the MEL filter
     *              bank weights and adding them up to be placed into
     *              bins, according to the filter bank's first and last
     *              indices (pre-computed for each filter bank element
     *              by _CreateMelFilterBank function).
     * @param[in]   fftVec                  Vector populated with FFT magnitudes
     * @param[in]   melFilterBank           2D Vector with filter bank weights
     * @param[in]   filterBankFilterFirst   Vector containing the first indices of filter bank
     *                                      to be used for each bin.
     * @param[in]   filterBankFilterLast    Vector containing the last indices of filter bank
     *                                      to be used for each bin.
     * @param[out]  melEnergies             Pre-allocated vector of MEL energies to be
     *                                      populated.
     * @return      true if successful, false otherwise
     */
    virtual bool ApplyMelFilterBank(
            std::vector<float>&                 fftVec,
            std::vector<std::vector<float>>&    melFilterBank,
            std::vector<int32_t>&               filterBankFilterFirst,
            std::vector<int32_t>&               filterBankFilterLast,
            std::vector<float>&                 melEnergies);

    /**
     * @brief           Converts the Mel energies for logarithmic scale
     * @param[in/out]   melEnergies - 1D vector of Mel energies
     **/
    virtual void ConvertToLogarithmicScale(std::vector<float>& melEnergies);

    /**
     * @brief       Create a matrix used to calculate Discrete Cosine
     *              Transform.
     * @param[in]   inputLength - input length of the buffer on which
     *              DCT will be performed
     * @param[in]   coefficientCount - Total coefficients per input
     *              length
     * @return      1D vector with inputLength x coefficientCount elements
     *              populated with DCT coefficients.
     */
    virtual std::vector<float> CreateDCTMatrix(
            const int32_t inputLength,
            const int32_t coefficientCount);

    /**
     * @brief       Given the low and high Mel values, get the normaliser
     *              for weights to be applied when populating the filter
     *              bank.
     * @param[in]   leftMel - low Mel frequency value
     * @param[in]   rightMel - high Mel frequency value
     * @param[in]   useHTKMethod - bool to signal if HTK method is to be
     *              used for calculation
     * @return      normaliser value
     */
    virtual float GetMelFilterBankNormaliser(
            const float&   leftMel,
            const float&   rightMel,
            const bool     useHTKMethod);

private:

    std::vector<float>              _m_frame;
    std::vector<float>              _m_buffer;
    std::vector<float>              _m_melEnergies;
    std::vector<float>              _m_windowFunc;
    std::vector<std::vector<float>> _m_melFilterBank;
    std::vector<float>              _m_dctMatrix;
    std::vector<int32_t>            _m_filterBankFilterFirst;
    std::vector<int32_t>            _m_filterBankFilterLast;
    bool                            _m_filterBankInitialised;

    /**
     * @brief       Initialises the filter banks and the DCT matrix **/
    void _InitMelFilterBank();

    /**
     * @brief       Signals whether the instance of MFCC has had its
     *              required buffers initialised
     * @return      True if initialised, false otherwise
     **/
    bool _IsMelFilterBankInited();

    /**
     * @brief       Create mel filter banks for MFCC calculation.
     * @return      2D vector of floats
     **/
    std::vector<std::vector<float>> _CreateMelFilterBank();

    /**
     * @brief       Computes and populates internal member buffers used
     *              in MFCC feature calculation
     * @param[in]   audioData - 1D vector of audio samples
     */
    void _MfccComputePreFeature(const std::vector<float>& audioData);

    /** @brief       Computes the magnitude from an interleaved complex array */
    void _ConvertToPowerSpectrum();

};

// diff --git a/samples/SpeechRecognition/include/MathUtils.hpp b/samples/SpeechRecognition/include/MathUtils.hpp
// new file mode 100644
// index 0000000000..5f81fb6507
// --- /dev/null
// +++ b/samples/SpeechRecognition/include/MathUtils.hpp
// @@ -0,0 +1,85 @@
//
// Copyright © 2020 Arm Ltd and Contributors. All rights reserved.
// SPDX-License-Identifier: MIT
//

/* Include guard: this header defines a class and originally had no guard, so
 * including it twice in one translation unit was a redefinition error. */
#pragma once

#include <cmath>
#include <cstdint>
#include <numeric>
#include <vector>

class MathUtils
{

public:

    /**
     * @brief       Computes the FFT for the input vector
     * @param[in]   input       Floating point vector of input elements
     * @param[out]  fftOutput   Output buffer to be populated by computed
     *                          FFTs
     * @return      none
     */
    static void FftF32(std::vector<float>& input,
                       std::vector<float>& fftOutput);


    /**
     * @brief       Computes the dot product of two 1D floating point
     *              vectors.
     *              result = sum(srcA[0]*srcB[0] + srcA[1]*srcB[1] + ..)
     * @param[in]   srcPtrA     pointer to the first element of first
     *                          array
     * @param[in]   srcPtrB     pointer to the first element of second
     *                          array
     * @param[in]   srcLen      Number of elements in the array/vector
     * @return      dot product
     */
    static float DotProductF32(float* srcPtrA, float* srcPtrB,
                               const int srcLen);

    /**
     * @brief       Computes the squared magnitude of floating point
     *              complex number array.
     * @param[in]   ptrSrc      pointer to the first element of input
     *                          array
     * @param[in]   srcLen      Number of elements in the array/vector
     * @param[out]  ptrDst      Output buffer to be populated
     * @param[in]   dstLen      output buffer len (for sanity check only)
     * @return      true if successful, false otherwise
     */
    static bool ComplexMagnitudeSquaredF32(float* ptrSrc,
                                           const int srcLen,
                                           float* ptrDst,
                                           const int dstLen);

    /**
     * @brief       Computes the natural logarithms of input floating point
     *              vector
     * @param[in]   input   Floating point input vector
     * @param[out]  output  Pre-allocated buffer to be populated with
     *                      natural log values of each input element
     * @return      none
     */
    static void VecLogarithmF32(std::vector <float>& input,
                                std::vector <float>& output);

    /**
     * @brief       Gets the mean of a floating point array of elements
     * @param[in]   ptrSrc  pointer to the first element
     * @param[in]   srcLen  Number of elements in the array/vector
     * @return      average value
     */
    static float MeanF32(float* ptrSrc, const uint32_t srcLen);

    /**
     * @brief       Gets the standard deviation of a floating point array
     *              of elements
     * @param[in]   ptrSrc  pointer to the first element
     * @param[in]   srcLen  Number of elements in the array/vector
     * @param[in]   mean    pre-computed mean value
     * @return      standard deviation value
     */
    static float StdDevF32(float* ptrSrc, const uint32_t srcLen,
                           const float mean);
};
// diff --git a/samples/SpeechRecognition/include/Preprocess.hpp b/samples/SpeechRecognition/include/Preprocess.hpp
// new file mode 100644
// index 0000000000..80c568439b
// --- /dev/null
// +++ b/samples/SpeechRecognition/include/Preprocess.hpp
// @@ -0,0 +1,175 @@
//
// Copyright © 2020 Arm Ltd and Contributors. All rights reserved.
+// SPDX-License-Identifier: MIT +// + +#pragma once + +#include "DataStructures.hpp" +#include "SlidingWindow.hpp" +#include +#include "MFCC.hpp" + +/* Class to facilitate pre-processing calculation for Wav2Letter model + * for ASR */ +using AudioWindow = SlidingWindow ; + +class Preprocess +{ +public: + + MFCC _m_mfcc; /* MFCC instance */ + + /* Actual buffers to be populated */ + Array2d _m_mfccBuf; /* Contiguous buffer 1D: MFCC */ + Array2d _m_delta1Buf; /* Contiguous buffer 1D: Delta 1 */ + Array2d _m_delta2Buf; /* Contiguous buffer 1D: Delta 2 */ + + uint32_t _m_windowLen; /* Window length for MFCC */ + uint32_t _m_windowStride; /* Window stride len for MFCC */ + AudioWindow _m_window; /* Sliding window */ + + /** + * @brief Constructor + * @param[in] numMfccFeatures number of MFCC features per window + * @param[in] windowLen number of elements in a window + * @param[in] windowStride stride (in number of elements) for + * moving the window + * @param[in] numMfccVectors number of MFCC vectors per window + */ + Preprocess( + const uint32_t windowLen, + const uint32_t windowStride, + const MFCC mfccInst); + Preprocess() = delete; + ~Preprocess(); + + /** + * @brief Calculates the features required from audio data. This + * includes MFCC, first and second order deltas, + * normalisation and finally, quantisation. The tensor is + * populated with feature from a given window placed along + * in a single row. + * @param[in] audioData pointer to the first element of audio data + * @param[in] audioDataLen number of elements in the audio data + * @param[in] tensor tensor to be populated + * @return true if successful, false in case of error. + */ + bool Invoke(const float* audioData, + const uint32_t audioDataLen, + std::vector& output, + int quantOffset, + float quantScale); + + +protected: + /** + * @brief Computes the first and second order deltas for the + * MFCC buffers - they are assumed to be populated. 
+ * + * @param[in] mfcc MFCC buffers + * @param[out] delta1 result of the first diff computation + * @param[out] delta2 result of the second diff computation + * + * @return true if successful, false otherwise + */ + static bool _ComputeDeltas(Array2d& mfcc, + Array2d& delta1, + Array2d& delta2); + + /** + * @brief Given a 2D vector of floats, computes the mean + * @param[in] vec vector of vector of floats + * @return mean value + */ + static float _GetMean(Array2d& vec); + + /** + * @brief Given a 2D vector of floats, computes the stddev + * @param[in] vec vector of vector of floats + * @param[in] mean mean value of the vector passed in + * @return stddev value + */ + static float _GetStdDev(Array2d& vec, + const float mean); + + /** + * @brief Given a 2D vector of floats, normalises it using + * the mean and the stddev + * @param[in/out] vec vector of vector of floats + * @return + */ + static void _NormaliseVec(Array2d& vec); + + /** + * @brief Normalises the MFCC and delta buffers + * @return + */ + void _Normalise(); + + /** + * @brief Given the quantisation and data type limits, computes + * the quantised values of a floating point input data. + * @param[in] elem Element to be quantised + * @param[in] quantScale Scale + * @param[in] quantOffset Offset + * @param[in] minVal Numerical limit - minimum + * @param[in] maxVal Numerical limit - maximum + * @return floating point quantised value + */ + static float _GetQuantElem( + const float elem, + const float quantScale, + const int quantOffset, + const float minVal, + const float maxVal); + + /** + * @brief Quantises the MFCC and delta buffers, and places them + * in the output buffer. While doing so, it transposes + * the data. Reason: Buffers in this class are arranged + * for "time" axis to be row major. Primary reason for + * this being the convolution speed up (as we can use + * contiguous memory). The output, however, requires the + * time axis to be in column major arrangement. 
+ * @param[in] outputBuf pointer to the output buffer + * @param[in] outputBufSz output buffer's size + * @param[in] quantScale quantisation scale + * @param[in] quantOffset quantisation offset + */ + template + bool _Quantise(T* outputBuf, int quantOffset, float quantScale) + { + /* Populate */ + T* outputBufMfcc = outputBuf; + T* outputBufD1 = outputBuf + this->_m_mfcc._m_params.m_numMfccFeatures; + T* outputBufD2 = outputBufD1 + this->_m_mfcc._m_params.m_numMfccFeatures; + const uint32_t ptrIncr = this->_m_mfcc._m_params.m_numMfccFeatures * 2; /* (3 vectors - 1 vector) */ + + const float minVal = std::numeric_limits::min(); + const float maxVal = std::numeric_limits::max(); + + /* We need to do a transpose while copying and concatenating + * the tensor*/ + for (uint32_t j = 0; j < this->_m_mfcc._m_params.m_numMfccVectors; ++j) { + for (uint32_t i = 0; i < this->_m_mfcc._m_params.m_numMfccFeatures; ++i) + { + *outputBufMfcc++ = static_cast(this->_GetQuantElem( + this->_m_mfccBuf(i, j), quantScale, + quantOffset, minVal, maxVal)); + *outputBufD1++ = static_cast(this->_GetQuantElem( + this->_m_delta1Buf(i, j), quantScale, + quantOffset, minVal, maxVal)); + *outputBufD2++ = static_cast(this->_GetQuantElem( + this->_m_delta2Buf(i, j), quantScale, + quantOffset, minVal, maxVal)); + } + outputBufMfcc += ptrIncr; + outputBufD1 += ptrIncr; + outputBufD2 += ptrIncr; + } + + return true; + } +}; + diff --git a/samples/SpeechRecognition/include/SlidingWindow.hpp b/samples/SpeechRecognition/include/SlidingWindow.hpp new file mode 100644 index 0000000000..791a0b7fc0 --- /dev/null +++ b/samples/SpeechRecognition/include/SlidingWindow.hpp @@ -0,0 +1,161 @@ +// +// Copyright © 2020 Arm Ltd and Contributors. All rights reserved. 
// SPDX-License-Identifier: MIT
//

#pragma once

#include <cstddef>

/**
 * Non-owning view that steps a fixed-size window over a contiguous buffer.
 * The data pointer is borrowed: the buffer must outlive the window object.
 */
template<class T>
class SlidingWindow
{
protected:
    T* m_start = nullptr;
    size_t m_dataSize = 0;
    size_t m_size = 0;
    size_t m_stride = 0;
    size_t m_count = 0;
public:

    /**
     * Creates the window slider through the given data.
     *
     * @param data          pointer to the data to slide through.
     * @param dataSize      size in T type elements wise.
     * @param windowSize    sliding window size in T type wise elements.
     * @param stride        stride size in T type wise elements.
     */
    SlidingWindow(T* data, size_t dataSize,
                  size_t windowSize, size_t stride)
        : m_start(data),
          m_dataSize(dataSize),
          m_size(windowSize),
          m_stride(stride)
    {}

    SlidingWindow() = default;

    SlidingWindow(const SlidingWindow&) = default;
    SlidingWindow& operator=(const SlidingWindow&) = default;

    /* Virtual: the class has virtual members, so it may be destroyed through a base pointer. */
    virtual ~SlidingWindow() = default;

    /**
     * Get the next data window.
     * @return pointer to the next window, if next window is not available nullptr is returned.
     */
    virtual T* Next()
    {
        if (!HasNext())
        {
            return nullptr;
        }
        m_count++;
        return m_start + Index() * m_stride;
    }

    /**
     * Checks if the next data portion is available.
     * @return true if next data portion is available
     */
    bool HasNext()
    {
        const bool withinStrides =
            static_cast<float>(this->m_count) < 1.0f + this->FractionalTotalStrides();
        return withinStrides && (this->NextWindowStartIndex() < this->m_dataSize);
    }

    /**
     * Resets the slider to the initial position.
     */
    virtual void Reset()
    {
        m_count = 0;
    }

    /**
     * Gets the size of the sliding window.
     * @return window size in number of T elements.
     */
    virtual size_t GetWindowSize()
    {
        return m_size;
    }

    /**
     * Resets the slider to the start of the new data.
     * New data size MUST be the same as the old one.
     * @param newStart pointer to the new data to slide through.
     */
    virtual void Reset(T* newStart)
    {
        m_start = newStart;
        Reset();
    }

    /**
     * Gets current index of the sliding window.
     * @return current position of the sliding window in number of strides
     */
    size_t Index()
    {
        return m_count == 0 ? 0 : m_count - 1;
    }

    /**
     * Gets the index from the start of the data where the next window will begin.
     * While Index() returns the index of sliding window itself this function returns the index of the data
     * element itself.
     * @return Index from the start of the data where the next sliding window will begin.
     */
    virtual size_t NextWindowStartIndex()
    {
        return m_count * m_stride;
    }

    /**
     * Go to given sliding window index.
     * @param index new position of the sliding window. if index is invalid (greater than possible range of strides)
     *              then next call to Next() will return nullptr.
     */
    void FastForward(size_t index)
    {
        m_count = index;
    }

    /**
     * Calculates whole number of times the window can stride through the given data.
     * @return maximum number of strides; 0 if the window is larger than the data or the stride is 0.
     */
    size_t TotalStrides()
    {
        /* A zero stride (e.g. a default-constructed window) would divide by zero. */
        if (m_size > m_dataSize || m_stride == 0)
        {
            return 0;
        }
        return ((m_dataSize - m_size)/m_stride);
    }

    /**
     * Calculates number of times the window can stride through the given data. May not be a whole number.
     * @return Number of strides to cover all data; 0 if the window is larger than the data or the stride is 0.
     */
    float FractionalTotalStrides()
    {
        /* When the window is larger than the data the original integer
         * division m_dataSize / m_size always produced 0; say so directly. */
        if (this->m_size > this->m_dataSize || this->m_stride == 0)
        {
            return 0.0f;
        }
        return ((this->m_dataSize - this->m_size) / static_cast<float>(this->m_stride));
    }

    /**
     * Calculates the remaining data left to be processed
     * @return The remaining unprocessed data
     */
    int RemainingData()
    {
        return static_cast<int>(this->m_dataSize - this->NextWindowStartIndex());
    }
};
// \ No newline at end of file
// diff --git a/samples/SpeechRecognition/include/SpeechRecognitionPipeline.hpp b/samples/SpeechRecognition/include/SpeechRecognitionPipeline.hpp
// new file mode 100644
// index 0000000000..47ce30416f
// --- /dev/null
// +++ b/samples/SpeechRecognition/include/SpeechRecognitionPipeline.hpp
// @@ -0,0 +1,139 @@
//
// Copyright © 2020 Arm Ltd and Contributors. All rights reserved.
+// SPDX-License-Identifier: MIT +// + +#pragma once + +#include "ArmnnNetworkExecutor.hpp" +#include "Decoder.hpp" +#include "MFCC.hpp" +#include "Preprocess.hpp" + +namespace asr +{ +/** + * Generic Speech Recognition pipeline with 3 steps: data pre-processing, inference execution and inference + * result post-processing. + * + */ +class ASRPipeline +{ +public: + + /** + * Creates speech recognition pipeline with given network executor and decoder. + * @param executor - unique pointer to inference runner + * @param decoder - unique pointer to inference results decoder + */ + ASRPipeline(std::unique_ptr> executor, + std::unique_ptr decoder); + + /** + * @brief Standard audio pre-processing implementation. + * + * Preprocesses and prepares the data for inference by + * extracting the MFCC features. + + * @param[in] audio - the raw audio data + * @param[out] preprocessor - the preprocessor object, which handles the data prepreration + */ + template + std::vector PreProcessing(std::vector& audio, Preprocess& preprocessor) + { + int audioDataToPreProcess = preprocessor._m_windowLen + + ((preprocessor._m_mfcc._m_params.m_numMfccVectors -1) *preprocessor._m_windowStride); + int outputBufferSize = preprocessor._m_mfcc._m_params.m_numMfccVectors + * preprocessor._m_mfcc._m_params.m_numMfccFeatures * 3; + std::vector outputBuffer(outputBufferSize); + preprocessor.Invoke(audio.data(), audioDataToPreProcess, outputBuffer, m_executor->GetQuantizationOffset(), + m_executor->GetQuantizationScale()); + return outputBuffer; + } + + /** + * @brief Executes inference + * + * Calls inference runner provided during instance construction. + * + * @param[in] preprocessedData - input inference data. Data type should be aligned with input tensor. + * @param[out] result - raw inference results. 
+ */ + template + void Inference(const std::vector& preprocessedData, common::InferenceResults& result) + { + size_t data_bytes = sizeof(std::vector) + (sizeof(T) * preprocessedData.size()); + m_executor->Run(preprocessedData.data(), data_bytes, result); + } + + /** + * @brief Standard inference results post-processing implementation. + * + * Decodes inference results using decoder provided during construction. + * + * @param[in] inferenceResult - inference results to be decoded. + * @param[in] isFirstWindow - for checking if this is the first window of the sliding window. + * @param[in] isLastWindow - for checking if this is the last window of the sliding window. + * @param[in] currentRContext - the right context of the output text. To be output if it is the last window. + */ + template + void PostProcessing(common::InferenceResults& inferenceResult, + bool& isFirstWindow, + bool isLastWindow, + std::string currentRContext) + { + int rowLength = 29; + int middleContextStart = 49; + int middleContextEnd = 99; + int leftContextStart = 0; + int rightContextStart = 100; + int rightContextEnd = 148; + + std::vector contextToProcess; + + // If isFirstWindow we keep the left context of the output + if(isFirstWindow) + { + std::vector chunk(&inferenceResult[0][leftContextStart], + &inferenceResult[0][middleContextEnd * rowLength]); + contextToProcess = chunk; + } + // Else we only keep the middle context of the output + else + { + std::vector chunk(&inferenceResult[0][middleContextStart * rowLength], + &inferenceResult[0][middleContextEnd * rowLength]); + contextToProcess = chunk; + } + std::string output = this->m_decoder->DecodeOutput(contextToProcess); + isFirstWindow = false; + std::cout << output << std::flush; + + // If this is the last window, we print the right context of the output + if(isLastWindow) + { + std::vector rContext(&inferenceResult[0][rightContextStart*rowLength], + &inferenceResult[0][rightContextEnd * rowLength]); + currentRContext = 
this->m_decoder->DecodeOutput(rContext); + std::cout << currentRContext << std::endl; + } + } + +protected: + std::unique_ptr> m_executor; + std::unique_ptr m_decoder; +}; + +using IPipelinePtr = std::unique_ptr; + +/** + * Constructs speech recognition pipeline based on configuration provided. + * + * @param[in] config - speech recognition pipeline configuration. + * @param[in] labels - asr labels + * + * @return unique pointer to asr pipeline. + */ +IPipelinePtr CreatePipeline(common::PipelineOptions& config, std::map& labels); + +}// namespace asr \ No newline at end of file diff --git a/samples/SpeechRecognition/src/AudioCapture.cpp b/samples/SpeechRecognition/src/AudioCapture.cpp new file mode 100644 index 0000000000..f3b9092218 --- /dev/null +++ b/samples/SpeechRecognition/src/AudioCapture.cpp @@ -0,0 +1,104 @@ +// +// Copyright © 2020 Arm Ltd and Contributors. All rights reserved. +// SPDX-License-Identifier: MIT +// + +#include "AudioCapture.hpp" +#include +#include +#include + +namespace asr +{ + std::vector AudioCapture::LoadAudioFile(std::string filePath) + { + SF_INFO inputSoundFileInfo; + SNDFILE* infile = NULL; + infile = sf_open(filePath.c_str(), SFM_READ, &inputSoundFileInfo); + + float audioIn[inputSoundFileInfo.channels * inputSoundFileInfo.frames]; + sf_read_float(infile, audioIn, inputSoundFileInfo.channels * inputSoundFileInfo.frames); + + float sampleRate = 16000.0f; + float srcRatio = sampleRate / (float)inputSoundFileInfo.samplerate; + int outputFrames = ceil(inputSoundFileInfo.frames * srcRatio); + float dataOut[outputFrames]; + + // Convert to mono + float monoData[inputSoundFileInfo.frames]; + for(int i = 0; i < inputSoundFileInfo.frames; i++) + { + float val = 0.0f; + for(int j = 0; j < inputSoundFileInfo.channels; j++) + monoData[i] += audioIn[i * inputSoundFileInfo.channels + j]; + monoData[i] /= inputSoundFileInfo.channels; + } + + // Resample + SRC_DATA srcData; + srcData.data_in = monoData; + srcData.input_frames = 
inputSoundFileInfo.frames; + srcData.data_out = dataOut; + srcData.output_frames = outputFrames; + srcData.src_ratio = srcRatio; + + src_simple(&srcData, SRC_SINC_BEST_QUALITY, 1); + + // Convert to Vector + std::vector processedInput; + + for(int i = 0; i < srcData.output_frames_gen; ++i) + { + processedInput.push_back(srcData.data_out[i]); + } + + sf_close(infile); + + return processedInput; + } + + void AudioCapture::InitSlidingWindow(float* data, size_t dataSize, int minSamples, size_t stride) + { + this->m_window = SlidingWindow(data, dataSize, minSamples, stride); + } + + bool AudioCapture::HasNext() + { + return m_window.HasNext(); + } + + std::vector AudioCapture::Next() + { + if (this->m_window.HasNext()) + { + int remainingData = this->m_window.RemainingData(); + const float* windowData = this->m_window.Next(); + + size_t windowSize = this->m_window.GetWindowSize(); + + if(remainingData < windowSize) + { + std::vector mfccAudioData(windowSize, 0.0f); + for(int i = 0; i < remainingData; ++i) + { + mfccAudioData[i] = *windowData; + if(i < remainingData - 1) + { + ++windowData; + } + } + return mfccAudioData; + } + else + { + std::vector mfccAudioData(windowData, windowData + windowSize); + return mfccAudioData; + } + } + else + { + throw std::out_of_range("Error, end of audio data reached."); + } + } +} //namespace asr + diff --git a/samples/SpeechRecognition/src/Decoder.cpp b/samples/SpeechRecognition/src/Decoder.cpp new file mode 100644 index 0000000000..663d4db5b5 --- /dev/null +++ b/samples/SpeechRecognition/src/Decoder.cpp @@ -0,0 +1,37 @@ +// +// Copyright © 2020 Arm Ltd and Contributors. All rights reserved. 
+// SPDX-License-Identifier: MIT +// + +#include "Decoder.hpp" + +namespace asr { + + Decoder::Decoder(std::map& labels): + m_labels(labels) + {} + + std::string Decoder::FilterCharacters(std::vector& unfiltered) + { + std::string filtered = ""; + + for(int i = 0; i < unfiltered.size(); ++i) + { + if (unfiltered.at(i) == '$') + { + continue; + } + + else if (i + 1 < unfiltered.size() && unfiltered.at(i) == unfiltered.at(i + 1)) + { + continue; + } + else + { + filtered += unfiltered.at(i); + } + } + return filtered; + } +}// namespace + diff --git a/samples/SpeechRecognition/src/MFCC.cpp b/samples/SpeechRecognition/src/MFCC.cpp new file mode 100644 index 0000000000..234b14d3be --- /dev/null +++ b/samples/SpeechRecognition/src/MFCC.cpp @@ -0,0 +1,397 @@ +// +// Copyright © 2020 Arm Ltd and Contributors. All rights reserved. +// SPDX-License-Identifier: MIT +// + +#include +#include + +#include "MFCC.hpp" +#include "MathUtils.hpp" + + +MfccParams::MfccParams( + const float samplingFreq, + const int numFbankBins, + const float melLoFreq, + const float melHiFreq, + const int numMfccFeats, + const int frameLen, + const bool useHtkMethod, + const int numMfccVectors): + m_samplingFreq(samplingFreq), + m_numFbankBins(numFbankBins), + m_melLoFreq(melLoFreq), + m_melHiFreq(melHiFreq), + m_numMfccFeatures(numMfccFeats), + m_frameLen(frameLen), + m_numMfccVectors(numMfccVectors), + + /* Smallest power of 2 >= frame length. 
*/ + m_frameLenPadded(pow(2, ceil((log(frameLen)/log(2))))), + m_useHtkMethod(useHtkMethod) +{} + +std::string MfccParams::Str() +{ + char strC[1024]; + snprintf(strC, sizeof(strC) - 1, "\n \ + \n\t Sampling frequency: %f\ + \n\t Number of filter banks: %u\ + \n\t Mel frequency limit (low): %f\ + \n\t Mel frequency limit (high): %f\ + \n\t Number of MFCC features: %u\ + \n\t Frame length: %u\ + \n\t Padded frame length: %u\ + \n\t Using HTK for Mel scale: %s\n", + this->m_samplingFreq, this->m_numFbankBins, this->m_melLoFreq, + this->m_melHiFreq, this->m_numMfccFeatures, this->m_frameLen, + this->m_frameLenPadded, this->m_useHtkMethod ? "yes" : "no"); + return std::string{strC}; +} + +MFCC::MFCC(const MfccParams& params): + _m_params(params), + _m_filterBankInitialised(false) +{ + this->_m_buffer = std::vector( + this->_m_params.m_frameLenPadded, 0.0); + this->_m_frame = std::vector( + this->_m_params.m_frameLenPadded, 0.0); + this->_m_melEnergies = std::vector( + this->_m_params.m_numFbankBins, 0.0); + + this->_m_windowFunc = std::vector(this->_m_params.m_frameLen); + const float multiplier = 2 * M_PI / this->_m_params.m_frameLen; + + /* Create window function. */ + for (size_t i = 0; i < this->_m_params.m_frameLen; i++) + { + this->_m_windowFunc[i] = (0.5 - (0.5 * cos(static_cast(i) * multiplier))); + } +} + +void MFCC::Init() +{ + this->_InitMelFilterBank(); +} + +float MFCC::MelScale(const float freq, const bool useHTKMethod) +{ + if (useHTKMethod) + { + return 1127.0f * logf (1.0f + freq / 700.0f); + } + else + { + /* Slaney formula for mel scale. */ + float mel = freq / freqStep; + + if (freq >= minLogHz) + { + mel = minLogMel + logf(freq / minLogHz) / logStep; + } + return mel; + } +} + +float MFCC::InverseMelScale(const float melFreq, const bool useHTKMethod) +{ + if (useHTKMethod) + { + return 700.0f * (expf (melFreq / 1127.0f) - 1.0f); + } + else + { + /* Slaney formula for mel scale. 
*/ + float freq = freqStep * melFreq; + + if (melFreq >= minLogMel) + { + freq = minLogHz * expf(logStep * (melFreq - minLogMel)); + } + return freq; + } +} + + +bool MFCC::ApplyMelFilterBank( + std::vector& fftVec, + std::vector>& melFilterBank, + std::vector& filterBankFilterFirst, + std::vector& filterBankFilterLast, + std::vector& melEnergies) +{ + const size_t numBanks = melEnergies.size(); + + if (numBanks != filterBankFilterFirst.size() || + numBanks != filterBankFilterLast.size()) + { + printf("unexpected filter bank lengths\n"); + return false; + } + + for (size_t bin = 0; bin < numBanks; ++bin) + { + auto filterBankIter = melFilterBank[bin].begin(); + float melEnergy = 1e-10; /* Avoid log of zero at later stages */ + const int32_t firstIndex = filterBankFilterFirst[bin]; + const int32_t lastIndex = filterBankFilterLast[bin]; + + for (int32_t i = firstIndex; i <= lastIndex; ++i) + { + melEnergy += (*filterBankIter++ * fftVec[i]); + } + + melEnergies[bin] = melEnergy; + } + + return true; +} + +void MFCC::ConvertToLogarithmicScale(std::vector& melEnergies) +{ + float maxMelEnergy = -FLT_MAX; + + /* Container for natural logarithms of mel energies */ + std::vector vecLogEnergies(melEnergies.size(), 0.f); + + /* Because we are taking natural logs, we need to multiply by log10(e). + * Also, for wav2letter model, we scale our log10 values by 10 */ + constexpr float multiplier = 10.0 * /* default scalar */ + 0.4342944819032518; /* log10f(std::exp(1.0))*/ + + /* Take log of the whole vector */ + MathUtils::VecLogarithmF32(melEnergies, vecLogEnergies); + + /* Scale the log values and get the max */ + for (auto iterM = melEnergies.begin(), iterL = vecLogEnergies.begin(); + iterM != melEnergies.end(); ++iterM, ++iterL) + { + *iterM = *iterL * multiplier; + + /* Save the max mel energy. 
*/ + if (*iterM > maxMelEnergy) + { + maxMelEnergy = *iterM; + } + } + + /* Clamp the mel energies */ + constexpr float maxDb = 80.0; + const float clampLevelLowdB = maxMelEnergy - maxDb; + for (auto iter = melEnergies.begin(); iter != melEnergies.end(); ++iter) + { + *iter = std::max(*iter, clampLevelLowdB); + } +} + +void MFCC::_ConvertToPowerSpectrum() +{ + const uint32_t halfDim = this->_m_params.m_frameLenPadded / 2; + + /* Handle this special case. */ + float firstEnergy = this->_m_buffer[0] * this->_m_buffer[0]; + float lastEnergy = this->_m_buffer[1] * this->_m_buffer[1]; + + MathUtils::ComplexMagnitudeSquaredF32( + this->_m_buffer.data(), + this->_m_buffer.size(), + this->_m_buffer.data(), + this->_m_buffer.size()/2); + + this->_m_buffer[0] = firstEnergy; + this->_m_buffer[halfDim] = lastEnergy; +} + +std::vector MFCC::CreateDCTMatrix( + const int32_t inputLength, + const int32_t coefficientCount) +{ + std::vector dctMatix(inputLength * coefficientCount); + + /* Orthonormal normalization. */ + const float normalizerK0 = 2 * sqrt(1.0 / static_cast(4*inputLength)); + const float normalizer = 2 * sqrt(1.0 / static_cast(2*inputLength)); + + const float angleIncr = M_PI/inputLength; + float angle = angleIncr; /* we start using it at k = 1 loop */ + + /* First row of DCT will use normalizer K0 */ + for (int32_t n = 0; n < inputLength; ++n) + { + dctMatix[n] = normalizerK0; + } + + /* Second row (index = 1) onwards, we use standard normalizer */ + for (int32_t k = 1, m = inputLength; k < coefficientCount; ++k, m += inputLength) + { + for (int32_t n = 0; n < inputLength; ++n) + { + dctMatix[m+n] = normalizer * + cos((n + 0.5) * angle); + } + angle += angleIncr; + } + return dctMatix; +} + +float MFCC::GetMelFilterBankNormaliser( + const float& leftMel, + const float& rightMel, + const bool useHTKMethod) +{ +/* Slaney normalization for mel weights. 
*/ + return (2.0f / (MFCC::InverseMelScale(rightMel, useHTKMethod) - + MFCC::InverseMelScale(leftMel, useHTKMethod))); +} + +void MFCC::_InitMelFilterBank() +{ + if (!this->_IsMelFilterBankInited()) + { + this->_m_melFilterBank = this->_CreateMelFilterBank(); + this->_m_dctMatrix = this->CreateDCTMatrix( + this->_m_params.m_numFbankBins, + this->_m_params.m_numMfccFeatures); + this->_m_filterBankInitialised = true; + } +} + +bool MFCC::_IsMelFilterBankInited() +{ + return this->_m_filterBankInitialised; +} + +void MFCC::_MfccComputePreFeature(const std::vector& audioData) +{ + this->_InitMelFilterBank(); + + /* TensorFlow way of normalizing .wav data to (-1, 1). */ + constexpr float normaliser = 1.0; + for (size_t i = 0; i < this->_m_params.m_frameLen; i++) + { + this->_m_frame[i] = static_cast(audioData[i]) * normaliser; + } + + /* Apply window function to input frame. */ + for(size_t i = 0; i < this->_m_params.m_frameLen; i++) + { + this->_m_frame[i] *= this->_m_windowFunc[i]; + } + + /* Set remaining frame values to 0. */ + std::fill(this->_m_frame.begin() + this->_m_params.m_frameLen,this->_m_frame.end(), 0); + + /* Compute FFT. */ + MathUtils::FftF32(this->_m_frame, this->_m_buffer); + + /* Convert to power spectrum. */ + this->_ConvertToPowerSpectrum(); + + /* Apply mel filterbanks. */ + if (!this->ApplyMelFilterBank(this->_m_buffer, + this->_m_melFilterBank, + this->_m_filterBankFilterFirst, + this->_m_filterBankFilterLast, + this->_m_melEnergies)) + { + printf("Failed to apply MEL filter banks\n"); + } + + /* Convert to logarithmic scale */ + this->ConvertToLogarithmicScale(this->_m_melEnergies); +} + +std::vector MFCC::MfccCompute(const std::vector& audioData) +{ + this->_MfccComputePreFeature(audioData); + + std::vector mfccOut(this->_m_params.m_numMfccFeatures); + + float * ptrMel = this->_m_melEnergies.data(); + float * ptrDct = this->_m_dctMatrix.data(); + float * ptrMfcc = mfccOut.data(); + + /* Take DCT. Uses matrix mul. 
*/ + for (size_t i = 0, j = 0; i < mfccOut.size(); + ++i, j += this->_m_params.m_numFbankBins) + { + *ptrMfcc++ = MathUtils::DotProductF32( + ptrDct + j, + ptrMel, + this->_m_params.m_numFbankBins); + } + + return mfccOut; +} + +std::vector> MFCC::_CreateMelFilterBank() +{ + size_t numFftBins = this->_m_params.m_frameLenPadded / 2; + float fftBinWidth = static_cast(this->_m_params.m_samplingFreq) / this->_m_params.m_frameLenPadded; + + float melLowFreq = MFCC::MelScale(this->_m_params.m_melLoFreq, + this->_m_params.m_useHtkMethod); + float melHighFreq = MFCC::MelScale(this->_m_params.m_melHiFreq, + this->_m_params.m_useHtkMethod); + float melFreqDelta = (melHighFreq - melLowFreq) / (this->_m_params.m_numFbankBins + 1); + + std::vector thisBin = std::vector(numFftBins); + std::vector> melFilterBank( + this->_m_params.m_numFbankBins); + this->_m_filterBankFilterFirst = + std::vector(this->_m_params.m_numFbankBins); + this->_m_filterBankFilterLast = + std::vector(this->_m_params.m_numFbankBins); + + for (size_t bin = 0; bin < this->_m_params.m_numFbankBins; bin++) + { + float leftMel = melLowFreq + bin * melFreqDelta; + float centerMel = melLowFreq + (bin + 1) * melFreqDelta; + float rightMel = melLowFreq + (bin + 2) * melFreqDelta; + + int32_t firstIndex = -1; + int32_t lastIndex = -1; + const float normaliser = this->GetMelFilterBankNormaliser(leftMel, rightMel, this->_m_params.m_useHtkMethod); + + for (size_t i = 0; i < numFftBins; i++) + { + float freq = (fftBinWidth * i); /* Center freq of this fft bin. 
*/
+            float mel = MFCC::MelScale(freq, this->_m_params.m_useHtkMethod);
+            thisBin[i] = 0.0;
+
+            if (mel > leftMel && mel < rightMel)
+            {
+                float weight;
+                if (mel <= centerMel)
+                {
+                    weight = (mel - leftMel) / (centerMel - leftMel);
+                }
+                else
+                {
+                    weight = (rightMel - mel) / (rightMel - centerMel);
+                }
+
+                thisBin[i] = weight * normaliser;
+                if (firstIndex == -1)
+                {
+                    firstIndex = i;
+                }
+                lastIndex = i;
+            }
+        }
+
+        this->_m_filterBankFilterFirst[bin] = firstIndex;
+        this->_m_filterBankFilterLast[bin] = lastIndex;
+
+        /* Copy the part we care about; nothing is copied when no FFT bin fell inside the band (indices stay -1). */
+        for (int32_t i = firstIndex; i >= 0 && i <= lastIndex; i++)
+        {
+            melFilterBank[bin].push_back(thisBin[i]);
+        }
+    }
+
+    return melFilterBank;
+}
+
diff --git a/samples/SpeechRecognition/src/Main.cpp b/samples/SpeechRecognition/src/Main.cpp
new file mode 100644
index 0000000000..de37e23b40
--- /dev/null
+++ b/samples/SpeechRecognition/src/Main.cpp
@@ -0,0 +1,157 @@
+//
+// Copyright © 2020 Arm Ltd and Contributors. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+#include
+#include
+#include
+#include
+#include
+
+#include "CmdArgsParser.hpp"
+#include "ArmnnNetworkExecutor.hpp"
+#include "AudioCapture.hpp"
+#include "Preprocess.hpp"
+#include "Decoder.hpp"
+#include "SpeechRecognitionPipeline.hpp"
+
+
+using InferenceResult = std::vector;
+using InferenceResults = std::vector;
+
+const std::string AUDIO_FILE_PATH = "--audio-file-path";
+const std::string MODEL_FILE_PATH = "--model-file-path";
+const std::string LABEL_PATH = "--label-path";
+const std::string PREFERRED_BACKENDS = "--preferred-backends";
+const std::string HELP = "--help";
+
+std::map labels = {
+        {0, "a" },
+        {1, "b" },
+        {2, "c" },
+        {3, "d" },
+        {4, "e" },
+        {5, "f" },
+        {6, "g" },
+        {7, "h" },
+        {8, "i" },
+        {9, "j" },
+        {10,"k" },
+        {11,"l" },
+        {12,"m" },
+        {13,"n" },
+        {14,"o" },
+        {15,"p" },
+        {16,"q" },
+        {17,"r" },
+        {18,"s" },
+        {19,"t" },
+        {20,"u" },
+        {21,"v" },
+        {22,"w" },
+        {23,"x" },
+        {24,"y" },
+        {25,"z" },
+        {26, "\'" },
+        {27, " "},
+        {28,"$" }
+};
+
+/*
+ * The accepted options for this Speech Recognition executable
+ */
+static std::map CMD_OPTIONS = {
+        {AUDIO_FILE_PATH, "[REQUIRED] Path to the Audio file to run speech recognition on"},
+        {MODEL_FILE_PATH, "[REQUIRED] Path to the Speech Recognition model to use"},
+        {PREFERRED_BACKENDS, "[OPTIONAL] Takes the preferred backends in preference order, separated by comma."
+                             " For example: CpuAcc,GpuAcc,CpuRef.  Accepted options: [CpuAcc, CpuRef, GpuAcc]."
+                             " Defaults to CpuAcc,CpuRef"}
+};
+
+/*
+ * Reads the user supplied backend preference, splits it by comma, and returns an ordered vector (empty tokens dropped)
+ */
+std::vector GetPreferredBackendList(const std::string& preferredBackends)
+{
+    std::vector backends;
+    std::stringstream ss(preferredBackends);
+    std::string backend;
+
+    /* Loop on getline() itself: testing ss.good() pushed an empty name for "" or a trailing comma. */
+    while (std::getline(ss, backend, ','))
+    {
+        if (!backend.empty()) { backends.emplace_back(backend); }
+    }
+    return backends;
+}
+
+int main(int argc, char *argv[])
+{
+    // Wav2Letter ASR SETTINGS
+    int SAMP_FREQ = 16000;
+    int FRAME_LEN_MS = 32;
+    int FRAME_LEN_SAMPLES = SAMP_FREQ * FRAME_LEN_MS * 0.001;
+    int NUM_MFCC_FEATS = 13;
+    int MFCC_WINDOW_LEN = 512;
+    int MFCC_WINDOW_STRIDE = 160;
+    const int NUM_MFCC_VECTORS = 296;
+    int SAMPLES_PER_INFERENCE = MFCC_WINDOW_LEN + ((NUM_MFCC_VECTORS -1) * MFCC_WINDOW_STRIDE);
+    int MEL_LO_FREQ = 0;
+    int MEL_HI_FREQ = 8000;
+    int NUM_FBANK_BIN = 128;
+    int INPUT_WINDOW_LEFT_CONTEXT = 98;
+    int INPUT_WINDOW_RIGHT_CONTEXT = 98;
+    int INPUT_WINDOW_INNER_CONTEXT = NUM_MFCC_VECTORS -
+            (INPUT_WINDOW_LEFT_CONTEXT + INPUT_WINDOW_RIGHT_CONTEXT);
+    int SLIDING_WINDOW_OFFSET = INPUT_WINDOW_INNER_CONTEXT * MFCC_WINDOW_STRIDE;
+
+
+    MfccParams mfccParams(SAMP_FREQ, NUM_FBANK_BIN,
+                          MEL_LO_FREQ, MEL_HI_FREQ, NUM_MFCC_FEATS, FRAME_LEN_SAMPLES, false, NUM_MFCC_VECTORS);
+
+    MFCC mfccInst = MFCC(mfccParams);
+
+    Preprocess preprocessor(MFCC_WINDOW_LEN, MFCC_WINDOW_STRIDE, mfccInst);
+
+    bool isFirstWindow = true;
+    std::string currentRContext = "";
+
+    std::map 
options; + + int result = ParseOptions(options, CMD_OPTIONS, argv, argc); + if (result != 0) + { + return result; + } + + // Create the network options + common::PipelineOptions pipelineOptions; + pipelineOptions.m_ModelFilePath = GetSpecifiedOption(options, MODEL_FILE_PATH); + + if (CheckOptionSpecified(options, PREFERRED_BACKENDS)) + { + pipelineOptions.m_backends = GetPreferredBackendList((GetSpecifiedOption(options, PREFERRED_BACKENDS))); + } + else + { + pipelineOptions.m_backends = {"CpuAcc", "CpuRef"}; + } + + asr::IPipelinePtr asrPipeline = asr::CreatePipeline(pipelineOptions, labels); + + asr::AudioCapture capture; + std::vector audioData = capture.LoadAudioFile(GetSpecifiedOption(options, AUDIO_FILE_PATH)); + capture.InitSlidingWindow(audioData.data(), audioData.size(), SAMPLES_PER_INFERENCE, SLIDING_WINDOW_OFFSET); + + while (capture.HasNext()) + { + std::vector audioBlock = capture.Next(); + InferenceResults results; + + std::vector preprocessedData = asrPipeline->PreProcessing(audioBlock, preprocessor); + asrPipeline->Inference(preprocessedData, results); + asrPipeline->PostProcessing(results, isFirstWindow, !capture.HasNext(), currentRContext); + } + + return 0; +} \ No newline at end of file diff --git a/samples/SpeechRecognition/src/MathUtils.cpp b/samples/SpeechRecognition/src/MathUtils.cpp new file mode 100644 index 0000000000..bf9908343a --- /dev/null +++ b/samples/SpeechRecognition/src/MathUtils.cpp @@ -0,0 +1,112 @@ +// +// Copyright © 2020 Arm Ltd and Contributors. All rights reserved. 
+// SPDX-License-Identifier: MIT
+//
+
+#include "MathUtils.hpp"
+#include
+#include
+#include
+/* Direct O(N^2) DFT of a real input; the half spectrum is packed as described in the loop below. */
+void MathUtils::FftF32(std::vector& input,
+                       std::vector& fftOutput)
+{
+    const int inputLength = input.size();
+
+    for (int k = 0; k <= inputLength / 2; k++)
+    {
+        float sumReal = 0, sumImag = 0;
+
+        for (int t = 0; t < inputLength; t++)
+        {
+            float angle = 2 * M_PI * t * k / inputLength;
+            sumReal += input[t] * cosf(angle);
+            sumImag += -input[t] * sinf(angle);
+        }
+
+        /* Arrange output to [real0, realN/2, real1, im1, real2, im2, ...] */
+        if (k == 0)
+        {
+            fftOutput[0] = sumReal;
+        }
+        else if (k == inputLength / 2)
+        {
+            fftOutput[1] = sumReal;
+        }
+        else
+        {
+            fftOutput[k*2] = sumReal;
+            fftOutput[k*2 + 1] = sumImag;
+        };
+    }
+}
+
+float MathUtils::DotProductF32(float* srcPtrA, float* srcPtrB,
+                               const int srcLen)
+{
+    float output = 0.f;
+
+    for (int i = 0; i < srcLen; ++i)
+    {
+        output += *srcPtrA++ * *srcPtrB++;
+    }
+    return output;
+}
+/* ptrSrc holds srcLen floats read as interleaved (re, im) pairs; writes srcLen/2 squared magnitudes to ptrDst. */
+bool MathUtils::ComplexMagnitudeSquaredF32(float* ptrSrc,
+                                           const int srcLen,
+                                           float* ptrDst,
+                                           const int dstLen)
+{
+    if (dstLen < srcLen/2)
+    {
+        printf("dstLen must be greater than srcLen/2");
+        return false;
+    }
+
+    for (int j = 0; j < srcLen / 2; ++j) /* one pair per pass: srcLen passes would read 2*srcLen floats */
+    {
+        const float real = *ptrSrc++;
+        const float im = *ptrSrc++;
+        *ptrDst++ = real*real + im*im;
+    }
+    return true;
+}
+
+void MathUtils::VecLogarithmF32(std::vector & input,
+                                std::vector & output)
+{
+    for (auto in = input.begin(), out = output.begin();
+         in != input.end(); ++in, ++out)
+    {
+        *out = logf(*in);
+    }
+}
+
+float MathUtils::MeanF32(float* ptrSrc, const uint32_t srcLen)
+{
+    if (!srcLen)
+    {
+        return 0.f;
+    }
+
+    float acc = std::accumulate(ptrSrc, ptrSrc + srcLen, 0.0);
+    return acc/srcLen;
+}
+
+float MathUtils::StdDevF32(float* ptrSrc, const uint32_t srcLen,
+                           const float mean)
+{
+    if (!srcLen)
+    {
+        return 0.f;
+    }
+    auto VarianceFunction = [=](float acc, const float value) {
+        return acc + (((value - mean) * (value - mean))/ srcLen);
+    };
+
+    float acc = 
std::accumulate(ptrSrc, ptrSrc + srcLen, 0.0,
+                                VarianceFunction);
+    return sqrtf(acc);
+}
+
diff --git a/samples/SpeechRecognition/src/Preprocess.cpp b/samples/SpeechRecognition/src/Preprocess.cpp
new file mode 100644
index 0000000000..86279619d7
--- /dev/null
+++ b/samples/SpeechRecognition/src/Preprocess.cpp
@@ -0,0 +1,192 @@
+//
+// Copyright © 2020 Arm Ltd and Contributors. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#include
+#include
+#include
+#include
+
+#include "MathUtils.hpp"
+#include "Preprocess.hpp"
+
+Preprocess::Preprocess(
+    const uint32_t windowLen,
+    const uint32_t windowStride,
+    const MFCC mfccInst):
+    _m_mfcc(mfccInst),
+    _m_mfccBuf(mfccInst._m_params.m_numMfccFeatures, mfccInst._m_params.m_numMfccVectors),
+    _m_delta1Buf(mfccInst._m_params.m_numMfccFeatures, mfccInst._m_params.m_numMfccVectors),
+    _m_delta2Buf(mfccInst._m_params.m_numMfccFeatures, mfccInst._m_params.m_numMfccVectors),
+    _m_windowLen(windowLen),
+    _m_windowStride(windowStride)
+{
+    if (mfccInst._m_params.m_numMfccFeatures > 0 && windowLen > 0)
+    {
+        this->_m_mfcc.Init();
+    }
+}
+
+Preprocess::~Preprocess()
+{
+}
+/* Slides over audioData, fills the MFCC buffer, derives deltas, normalises, then quantises into output. */
+bool Preprocess::Invoke( const float* audioData, const uint32_t audioDataLen, std::vector& output,
+                         int quantOffset, float quantScale)
+{
+    this->_m_window = SlidingWindow(
+            audioData, audioDataLen,
+            this->_m_windowLen, this->_m_windowStride);
+
+    uint32_t mfccBufIdx = 0;
+
+    // Init buffers with 0
+    std::fill(_m_mfccBuf.begin(), _m_mfccBuf.end(), 0.f);
+    std::fill(_m_delta1Buf.begin(), _m_delta1Buf.end(), 0.f);
+    std::fill(_m_delta2Buf.begin(), _m_delta2Buf.end(), 0.f);
+
+    /* While we can slide over the window and the buffer still has room for another feature vector */
+    while (this->_m_window.HasNext() && mfccBufIdx < this->_m_mfcc._m_params.m_numMfccVectors)
+    {
+        const float* mfccWindow = this->_m_window.Next();
+        auto mfccAudioData = std::vector(
+                mfccWindow,
+                mfccWindow + this->_m_windowLen);
+
+        auto mfcc = this->_m_mfcc.MfccCompute(mfccAudioData);
+        for (size_t i = 0; i < this->_m_mfccBuf.size(0); ++i)
+        {
+            this->_m_mfccBuf(i, mfccBufIdx) = mfcc[i];
+        }
+        ++mfccBufIdx;
+    }
+
+    /* Pad MFCC if needed by repeating last feature vector; skipped when no vector exists (index 0 has no predecessor) */
+    while (mfccBufIdx > 0 && mfccBufIdx < this->_m_mfcc._m_params.m_numMfccVectors)
+    { /* NOTE(review): the memcpy assumes the features of one vector are contiguous in Array2d - confirm */
+        memcpy(&this->_m_mfccBuf(0, mfccBufIdx),
+               &this->_m_mfccBuf(0, mfccBufIdx-1), sizeof(float)*this->_m_mfcc._m_params.m_numMfccFeatures);
+        ++mfccBufIdx;
+    }
+
+    /* Compute first and second order deltas from MFCCs */
+    this->_ComputeDeltas(this->_m_mfccBuf,
+                         this->_m_delta1Buf,
+                         this->_m_delta2Buf);
+
+    /* Normalise */
+    this->_Normalise();
+
+    return this->_Quantise(output.data(), quantOffset, quantScale);
+}
+
+bool Preprocess::_ComputeDeltas(Array2d& mfcc,
+                                Array2d& delta1,
+                                Array2d& delta2)
+{
+    const std::vector delta1Coeffs =
+            {6.66666667e-02,  5.00000000e-02,  3.33333333e-02,
+             1.66666667e-02, -3.46944695e-18, -1.66666667e-02,
+             -3.33333333e-02, -5.00000000e-02, -6.66666667e-02};
+
+    const std::vector delta2Coeffs =
+            {0.06060606,      0.01515152,     -0.01731602,
+             -0.03679654,     -0.04329004,     -0.03679654,
+             -0.01731602,      0.01515152,      0.06060606};
+
+    if (delta1.size(0) == 0 || delta2.size(0) != delta1.size(0) ||
+        mfcc.size(0) == 0 || mfcc.size(1) == 0)
+    {
+        return false;
+    }
+
+    /* Get the middle index; coeff vec len should always be odd */
+    const size_t coeffLen = delta1Coeffs.size();
+    const size_t fMidIdx = (coeffLen - 1)/2;
+    const size_t numFeatures = mfcc.size(0);
+    const size_t numFeatVectors = mfcc.size(1);
+
+    /* iterate through features in MFCC vector*/
+    for (size_t i = 0; i < numFeatures; ++i)
+    {
+        /* for each feature, iterate through time (t) samples representing feature evolution and
+         * calculate d/dt and d^2/dt^2, using 1d convolution with differential kernels.
+         * Convolution padding = valid, result size is `time length - kernel length + 1`.
+         * The result is padded with 0 from both sides to match the size of initial time samples data.
+         *
+         * For the small filter, conv1d implementation as a simple loop is efficient enough.
+         * Filters of a greater size would need CMSIS-DSP functions to be used, like arm_fir_f32.
+         */
+        /* `j + fMidIdx < numFeatVectors` cannot underflow size_t when there are fewer vectors than fMidIdx. */
+        for (size_t j = fMidIdx; j + fMidIdx < numFeatVectors; ++j)
+        {
+            float d1 = 0;
+            float d2 = 0;
+            const size_t mfccStIdx = j - fMidIdx;
+
+            for (size_t k = 0, m = coeffLen - 1; k < coeffLen; ++k, --m)
+            {
+
+                d1 +=  mfcc(i,mfccStIdx + k) * delta1Coeffs[m];
+                d2 +=  mfcc(i,mfccStIdx + k) * delta2Coeffs[m];
+            }
+
+            delta1(i,j) = d1;
+            delta2(i,j) = d2;
+        }
+    }
+
+    return true;
+}
+
+float Preprocess::_GetMean(Array2d& vec)
+{
+    return MathUtils::MeanF32(vec.begin(), vec.totalSize());
+}
+
+float Preprocess::_GetStdDev(Array2d& vec, const float mean)
+{
+    return MathUtils::StdDevF32(vec.begin(), vec.totalSize(), mean);
+}
+
+void Preprocess::_NormaliseVec(Array2d& vec)
+{
+    auto mean = Preprocess::_GetMean(vec);
+    auto stddev = Preprocess::_GetStdDev(vec, mean);
+
+    if (stddev == 0)
+    {
+        std::fill(vec.begin(), vec.end(), 0);
+    }
+    else
+    {
+        const float stddevInv = 1.f/stddev;
+        const float normalisedMean = mean/stddev;
+
+        auto NormalisingFunction = [=](float &value) {
+            value = value * stddevInv - normalisedMean;
+        };
+        std::for_each(vec.begin(), vec.end(), NormalisingFunction);
+    }
+}
+
+void Preprocess::_Normalise()
+{
+    Preprocess::_NormaliseVec(this->_m_mfccBuf);
+    Preprocess::_NormaliseVec(this->_m_delta1Buf);
+    Preprocess::_NormaliseVec(this->_m_delta2Buf);
+}
+/* Quantises one value: round(elem/quantScale + quantOffset), then clamps the result to [minVal, maxVal]. */
+float Preprocess::_GetQuantElem(
+        const float elem,
+        const float quantScale,
+        const int quantOffset,
+        const float minVal,
+        const float maxVal)
+{
+    float val = std::round((elem/quantScale) + quantOffset);
+    float maxim = std::max(val, minVal);
+    float returnVal = std::min(maxim, maxVal);
+    return returnVal;
+}
\ No newline at end of file
diff --git a/samples/SpeechRecognition/src/SpeechRecognitionPipeline.cpp b/samples/SpeechRecognition/src/SpeechRecognitionPipeline.cpp
new file mode 100644
index 0000000000..1b822d6a88
--- /dev/null
+++ 
b/samples/SpeechRecognition/src/SpeechRecognitionPipeline.cpp @@ -0,0 +1,26 @@ +// +// Copyright © 2020 Arm Ltd and Contributors. All rights reserved. +// SPDX-License-Identifier: MIT +// + +#include "SpeechRecognitionPipeline.hpp" +#include "ArmnnNetworkExecutor.hpp" + +namespace asr +{ +ASRPipeline::ASRPipeline(std::unique_ptr> executor, + std::unique_ptr decoder + ) : + m_executor(std::move(executor)), + m_decoder(std::move(decoder)){} + +IPipelinePtr CreatePipeline(common::PipelineOptions& config, std::map& labels) +{ + auto executor = std::make_unique>(config.m_ModelFilePath, config.m_backends); + + auto decoder = std::make_unique(labels); + + return std::make_unique(std::move(executor), std::move(decoder)); +} + +}// namespace asr \ No newline at end of file diff --git a/samples/SpeechRecognition/test/AudioCaptureTest.cpp b/samples/SpeechRecognition/test/AudioCaptureTest.cpp new file mode 100644 index 0000000000..94b4e7cb7a --- /dev/null +++ b/samples/SpeechRecognition/test/AudioCaptureTest.cpp @@ -0,0 +1,61 @@ +// +// Copyright © 2020 Arm Ltd and Contributors. All rights reserved. 
+// SPDX-License-Identifier: MIT +// + +#define CATCH_CONFIG_MAIN +#include +#include + +#include "AudioCapture.hpp" + +TEST_CASE("Test capture of audio file") +{ + std::string testResources = TEST_RESOURCE_DIR; + REQUIRE(testResources != ""); + std::string file = testResources + "/" + "myVoiceIsMyPassportVerifyMe04.wav"; + asr::AudioCapture capture; + std::vector audioData = capture.LoadAudioFile(file); + capture.InitSlidingWindow(audioData.data(), audioData.size(), 47712, 16000); + + std::vector firstAudioBlock = capture.Next(); + float actual1 = firstAudioBlock.at(0); + float actual2 = firstAudioBlock.at(47000); + CHECK(std::to_string(actual1) == "0.000352"); + CHECK(std::to_string(actual2) == "-0.056441"); + CHECK(firstAudioBlock.size() == 47712); + + CHECK(capture.HasNext() == true); + + std::vector secondAudioBlock = capture.Next(); + float actual3 = secondAudioBlock.at(0); + float actual4 = secondAudioBlock.at(47000); + CHECK(std::to_string(actual3) == "0.102077"); + CHECK(std::to_string(actual4) == "0.000194"); + CHECK(capture.HasNext() == true); + + std::vector thirdAudioBlock = capture.Next(); + float actual5 = thirdAudioBlock.at(0); + float actual6 = thirdAudioBlock.at(33500); + float actual7 = thirdAudioBlock.at(33600); + CHECK(std::to_string(actual5) == "-0.076416"); + CHECK(std::to_string(actual6) == "-0.000275"); + CHECK(std::to_string(actual7) == "0.000000"); + CHECK(capture.HasNext() == false); +} + +TEST_CASE("Test sliding window of audio capture") +{ + std::string testResources = TEST_RESOURCE_DIR; + REQUIRE(testResources != ""); + std::string file = testResources + "/" + "myVoiceIsMyPassportVerifyMe04.wav"; + asr::AudioCapture capture; + std::vector audioData = capture.LoadAudioFile(file); + capture.InitSlidingWindow(audioData.data(), audioData.size(), 47712, 16000); + capture.Next(); + capture.Next(); + + CHECK(capture.HasNext() == true); + capture.Next(); + CHECK(capture.HasNext() == false); +} diff --git 
a/samples/SpeechRecognition/test/DecoderTest.cpp b/samples/SpeechRecognition/test/DecoderTest.cpp new file mode 100644 index 0000000000..13a3905b99 --- /dev/null +++ b/samples/SpeechRecognition/test/DecoderTest.cpp @@ -0,0 +1,86 @@ +// +// Copyright © 2020 Arm Ltd and Contributors. All rights reserved. +// SPDX-License-Identifier: MIT +// + +#include +#include +#include "Decoder.hpp" + +std::map labels = { + {0, "a" }, + {1, "b" }, + {2, "c" }, + {3, "d" }, + {4, "e" }, + {5, "f" }, + {6, "g" }, + {7, "h" }, + {8, "i" }, + {9, "j" }, + {10,"k" }, + {11,"l" }, + {12,"m" }, + {13,"n" }, + {14,"o" }, + {15,"p" }, + {16,"q" }, + {17,"r" }, + {18,"s" }, + {19,"t" }, + {20,"u" }, + {21,"v" }, + {22,"w" }, + {23,"x" }, + {24,"y" }, + {25,"z" }, + {26, "\'" }, + {27, " "}, + {28,"$" } +}; + +TEST_CASE("Test Wav2Letter output decoder") +{ + + std::vector outputValues = + { + 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, + + 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, + + 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + + 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, + + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, + + 1, 1, 1, 1, 1, 1, 1, 1, 1, 
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2 + }; + + std::vector convertedValues; + + for(uint16_t outputVal : outputValues) + { + convertedValues.emplace_back(static_cast(outputVal)); + } + + asr::Decoder decoder(labels); + std::string text = decoder.DecodeOutput(convertedValues); + CHECK(text == "hello"); +} + + diff --git a/samples/SpeechRecognition/test/MFCCTest.cpp b/samples/SpeechRecognition/test/MFCCTest.cpp new file mode 100644 index 0000000000..2a552643d5 --- /dev/null +++ b/samples/SpeechRecognition/test/MFCCTest.cpp @@ -0,0 +1,102 @@ +// +// Copyright © 2020 Arm Ltd and Contributors. All rights reserved. +// SPDX-License-Identifier: MIT +// + +#include +#include + +#include "MFCC.hpp" + +const std::vector testWav = std::vector{ + -3.0f, 0.0f, 1.0f, -1.0f, 2.0f, 3.0f, -2.0f, 2.0f, + 1.0f, -2.0f, 0.0f, 3.0f, -1.0f, 8.0f, 3.0f, 2.0f, + -1.0f, -1.0f, 2.0f, 7.0f, 3.0f, 5.0f, 6.0f, 6.0f, + 6.0f, 12.0f, 5.0f, 6.0f, 3.0f, 3.0f, 5.0f, 4.0f, + 4.0f, 6.0f, 7.0f, 7.0f, 7.0f, 3.0f, 7.0f, 2.0f, + 8.0f, 4.0f, 4.0f, 2.0f, -4.0f, -1.0f, -1.0f, -4.0f, + 2.0f, 1.0f, -1.0f, -4.0f, 0.0f, -7.0f, -6.0f, -2.0f, + -5.0f, 1.0f, -5.0f, -1.0f, -7.0f, -3.0f, -3.0f, -7.0f, + 0.0f, -3.0f, 3.0f, -5.0f, 0.0f, 1.0f, -2.0f, -2.0f, + -3.0f, -3.0f, -7.0f, -3.0f, -2.0f, -6.0f, -5.0f, -8.0f, + -2.0f, -8.0f, 4.0f, -9.0f, -4.0f, -9.0f, -5.0f, -5.0f, + -3.0f, -9.0f, -3.0f, -9.0f, -1.0f, -7.0f, -4.0f, 1.0f, + -3.0f, 2.0f, -8.0f, -4.0f, -4.0f, -5.0f, 1.0f, -3.0f, + -1.0f, 0.0f, -1.0f, -2.0f, -3.0f, -2.0f, -4.0f, -1.0f, + 1.0f, -1.0f, 3.0f, 0.0f, 3.0f, 2.0f, 0.0f, 0.0f, + 0.0f, -3.0f, 1.0f, 1.0f, 0.0f, 8.0f, 3.0f, 4.0f, + 1.0f, 5.0f, 6.0f, 4.0f, 7.0f, 3.0f, 3.0f, 0.0f, + 3.0f, 6.0f, 7.0f, 6.0f, 4.0f, 5.0f, 9.0f, 9.0f, + 5.0f, 5.0f, 8.0f, 1.0f, 6.0f, 9.0f, 6.0f, 6.0f, + 7.0f, 1.0f, 8.0f, 1.0f, 5.0f, 0.0f, 5.0f, 5.0f, + 0.0f, 3.0f, 2.0f, 7.0f, 2.0f, -3.0f, 3.0f, 0.0f, + 3.0f, 0.0f, 0.0f, 0.0f, 2.0f, 0.0f, -1.0f, -1.0f, + -2.0f, -3.0f, -8.0f, 0.0f, 1.0f, 0.0f, -3.0f, -3.0f, + -3.0f, 
-2.0f, -3.0f, -3.0f, -4.0f, -6.0f, -2.0f, -8.0f, + -9.0f, -4.0f, -1.0f, -5.0f, -3.0f, -3.0f, -4.0f, -3.0f, + -6.0f, 3.0f, 0.0f, -1.0f, -2.0f, -9.0f, -4.0f, -2.0f, + 2.0f, -1.0f, 3.0f, -5.0f, -5.0f, -2.0f, 0.0f, -2.0f, + 0.0f, -1.0f, -3.0f, 1.0f, -2.0f, 9.0f, 4.0f, 5.0f, + 2.0f, 2.0f, 1.0f, 0.0f, -6.0f, -2.0f, 0.0f, 0.0f, + 0.0f, -1.0f, 4.0f, -4.0f, 3.0f, -7.0f, -1.0f, 5.0f, + -6.0f, -1.0f, -5.0f, 4.0f, 3.0f, 9.0f, -2.0f, 1.0f, + 3.0f, 0.0f, 0.0f, -2.0f, 1.0f, 2.0f, 1.0f, 1.0f, + 0.0f, 3.0f, 2.0f, -1.0f, 3.0f, -3.0f, 7.0f, 0.0f, + 0.0f, 3.0f, 2.0f, 2.0f, -2.0f, 3.0f, -2.0f, 2.0f, + -3.0f, 4.0f, -1.0f, -1.0f, -5.0f, -1.0f, -3.0f, -2.0f, + 1.0f, -1.0f, 3.0f, 2.0f, 4.0f, 1.0f, 2.0f, -2.0f, + 0.0f, 2.0f, 7.0f, 0.0f, 8.0f, -3.0f, 6.0f, -3.0f, + 6.0f, 1.0f, 2.0f, -3.0f, -1.0f, -1.0f, -1.0f, 1.0f, + -2.0f, 2.0f, 1.0f, 2.0f, 0.0f, -2.0f, 3.0f, -2.0f, + 3.0f, -2.0f, 1.0f, 0.0f, -3.0f, -1.0f, -2.0f, -4.0f, + -6.0f, -5.0f, -8.0f, -1.0f, -4.0f, 0.0f, -3.0f, -1.0f, + -1.0f, -1.0f, 0.0f, -2.0f, -3.0f, -7.0f, -1.0f, 0.0f, + 1.0f, 5.0f, 0.0f, 5.0f, 1.0f, 1.0f, -3.0f, 0.0f, + -6.0f, 3.0f, -8.0f, 4.0f, -8.0f, 6.0f, -6.0f, 1.0f, + -6.0f, -2.0f, -5.0f, -6.0f, 0.0f, -5.0f, 4.0f, -1.0f, + 4.0f, -2.0f, 1.0f, 2.0f, 1.0f, 0.0f, -2.0f, 0.0f, + 0.0f, 2.0f, -2.0f, 2.0f, -5.0f, 2.0f, 0.0f, -2.0f, + 1.0f, -2.0f, 0.0f, 5.0f, 1.0f, 0.0f, 1.0f, 5.0f, + 0.0f, 8.0f, 3.0f, 2.0f, 2.0f, 0.0f, 5.0f, -2.0f, + 3.0f, 1.0f, 0.0f, 1.0f, 0.0f, -2.0f, -1.0f, -3.0f, + 1.0f, -1.0f, 3.0f, 0.0f, 3.0f, 0.0f, -2.0f, -1.0f, + -4.0f, -4.0f, -4.0f, -1.0f, -4.0f, -4.0f, -3.0f, -6.0f, + -3.0f, -7.0f, -3.0f, -1.0f, -2.0f, 0.0f, -5.0f, -4.0f, + -7.0f, -3.0f, -2.0f, -2.0f, 1.0f, 2.0f, 2.0f, 8.0f, + 5.0f, 4.0f, 2.0f, 4.0f, 3.0f, 5.0f, 0.0f, 3.0f, + 3.0f, 6.0f, 4.0f, 2.0f, 2.0f, -2.0f, 4.0f, -2.0f, + 3.0f, 3.0f, 2.0f, 1.0f, 1.0f, 4.0f, -5.0f, 2.0f, + -3.0f, 0.0f, -1.0f, 1.0f, -2.0f, 2.0f, 5.0f, 1.0f, + 4.0f, 2.0f, 3.0f, 1.0f, -1.0f, 1.0f, 0.0f, 6.0f, + 0.0f, -2.0f, -1.0f, 1.0f, -1.0f, 2.0f, -5.0f, -1.0f, + -5.0f, -1.0f, -6.0f, 
-3.0f, -3.0f, 2.0f, 4.0f, 0.0f, + -1.0f, -5.0f, 3.0f, -4.0f, -1.0f, -3.0f, -4.0f, 1.0f, + -4.0f, 1.0f, -1.0f, -1.0f, 0.0f, -5.0f, -4.0f, -2.0f, + -1.0f, -1.0f, -3.0f, -7.0f, -3.0f, -3.0f, 4.0f, 4.0f +}; + +TEST_CASE("Test MFCC") +{ + int sampFreq = 16000; + int frameLenMs = 32; + int frameLenSamples = sampFreq * frameLenMs * 0.001; + int numMfccFeats = 13; + + std::vector fullAudioData; + + for (auto f : testWav) + { + fullAudioData.emplace_back( f / (1<<15)); + } + + + MfccParams mfccParams(sampFreq, 128, 0, 8000, numMfccFeats, frameLenSamples, false, 1); + + MFCC mfccInst = MFCC(mfccParams); + auto mfccOutput = mfccInst.MfccCompute(fullAudioData); + + std::vector expected = { -834.96564f, 21.02699f, 18.62856f, 7.3412f, 18.90791f, -5.36034f, 6.52351f, + -11.27064f, 8.37522f, 12.0672f, 8.30833f, -13.50008f, -18.1761f}; + + REQUIRE_THAT(mfccOutput, Catch::Approx(expected).epsilon(1.e-5) ); +} \ No newline at end of file diff --git a/samples/SpeechRecognition/test/PreprocessTest.cpp b/samples/SpeechRecognition/test/PreprocessTest.cpp new file mode 100644 index 0000000000..2b98831fda --- /dev/null +++ b/samples/SpeechRecognition/test/PreprocessTest.cpp @@ -0,0 +1,136 @@ +// +// Copyright © 2020 Arm Ltd and Contributors. All rights reserved. +// SPDX-License-Identifier: MIT +// + +#include +#include + +#include "Preprocess.hpp" +#include "DataStructures.hpp" + +void PopulateTestWavVector(std::vector& vec) +{ + constexpr int int16max = std::numeric_limits::max(); + int val = 0; + for (size_t i = 0; i < vec.size(); ++i, ++val) + { + + /* We want a differential filter response from both - order 1 + * and 2 => Don't have a linear signal here - we use a signal + * using squares for example. Alternate sign flips might work + * just as well and will be computationally less work! 
*/ + int valsq = val * val; + if (valsq > int16max) + { + val = 0; + valsq = 0; + } + vec[i] = valsq; + } +} + +TEST_CASE("Preprocessing calculation INT8") +{ + /*Test Constants: */ + const uint32_t windowLen = 512; + const uint32_t windowStride = 160; + const float quantScale = 0.1410219967365265; + const int quantOffset = -11; + int numMfccVectors = 10; + const int sampFreq = 16000; + const int frameLenMs = 32; + const int frameLenSamples = sampFreq * frameLenMs * 0.001; + const int numMfccFeats = 13; + const int audioDataToPreProcess = 512 + ((numMfccVectors -1) * windowStride); + int outputBufferSize = numMfccVectors * numMfccFeats * 3; + + /* Test wav memory */ + std::vector testWav1((windowStride * numMfccVectors) + + (windowLen - windowStride)); + /* Populate with dummy input */ + PopulateTestWavVector(testWav1); + + MfccParams mfccParams(sampFreq, 128, 0, 8000, numMfccFeats, frameLenSamples, false, numMfccVectors); + + MFCC mfccInst = MFCC(mfccParams); + + std::vector fullAudioData; + + for(int i = 0; i < 4; ++i) + { + for (auto f : testWav1) + { + fullAudioData.emplace_back(static_cast(f) / (1<<15)); + } + } + + Preprocess prep(frameLenSamples, windowStride, mfccInst); + + std::vector outputBuffer(outputBufferSize); + + prep.Invoke(fullAudioData.data(), audioDataToPreProcess, outputBuffer, quantOffset, quantScale); + + int8_t expectedResult[numMfccVectors][numMfccFeats*3] = + { + /* Feature vec 0 */ + -32, 4, -9, -8, -10, -10, -11, -11, -11, -11, -12, -11, -11, /* MFCCs */ + -11, -11, -11, -11, -11, -11, -11, -11, -11, -11, -11, -11, -11, /* Delta 1 */ + -10, -10, -10, -10, -10, -10, -10, -10, -10, -10, -10, -10, -10, /* Delta 2 */ + + /* Feature vec 1 */ + -31, 4, -9, -8, -10, -10, -11, -11, -11, -11, -12, -11, -11, + -11, -11, -11, -11, -11, -11, -11, -11, -11, -11, -11, -11, -11, + -10, -10, -10, -10, -10, -10, -10, -10, -10, -10, -10, -10, -10, + + /* Feature vec 2 */ + -31, 4, -9, -9, -10, -10, -11, -11, -11, -11, -12, -12, -12, + -11, -11, -11, -11, 
-11, -11, -11, -11, -11, -11, -11, -11, -11, + -10, -10, -10, -10, -10, -10, -10, -10, -10, -10, -10, -10, -10, + + /* Feature vec 3 */ + -31, 4, -9, -9, -10, -10, -11, -11, -11, -11, -11, -12, -12, + -11, -11, -11, -11, -11, -11, -11, -11, -11, -11, -11, -11, -11, + -10, -10, -10, -10, -10, -10, -10, -10, -10, -10, -10, -10, -10, + + /* Feature vec 4 : this should have valid delta 1 and delta 2 */ + -31, 4, -9, -9, -10, -10, -11, -11, -11, -11, -11, -12, -12, + -38, -29, -9, 1, -2, -7, -8, -8, -12, -16, -14, -5, 5, + -68, -50, -13, 5, 0, -9, -9, -8, -13, -20, -19, -3, 15, + + /* Feature vec 5 : this should have valid delta 1 and delta 2 */ + -31, 4, -9, -8, -10, -10, -11, -11, -11, -11, -11, -12, -12, + -62, -45, -11, 5, 0, -8, -9, -8, -12, -19, -17, -3, 13, + -27, -22, -13, -9, -11, -12, -12, -11, -11, -13, -13, -10, -6, + + /* Feature vec 6 */ + -31, 4, -9, -8, -10, -10, -11, -11, -11, -11, -12, -11, -11, + -11, -11, -11, -11, -11, -11, -11, -11, -11, -11, -11, -11, -11, + -10, -10, -10, -10, -10, -10, -10, -10, -10, -10, -10, -10, -10, + + /* Feature vec 7 */ + -32, 4, -9, -8, -10, -10, -11, -11, -11, -12, -12, -11, -11, + -11, -11, -11, -11, -11, -11, -11, -11, -11, -11, -11, -11, -11, + -10, -10, -10, -10, -10, -10, -10, -10, -10, -10, -10, -10, -10, + + /* Feature vec 8 */ + -32, 4, -9, -8, -10, -10, -11, -11, -11, -12, -12, -11, -11, + -11, -11, -11, -11, -11, -11, -11, -11, -11, -11, -11, -11, -11, + -10, -10, -10, -10, -10, -10, -10, -10, -10, -10, -10, -10, -10, + + /* Feature vec 9 */ + -31, 4, -9, -8, -10, -10, -11, -11, -11, -11, -12, -11, -11, + -11, -11, -11, -11, -11, -11, -11, -11, -11, -11, -11, -11, -11, + -10, -10, -10, -10, -10, -10, -10, -10, -10, -10, -10, -10, -10 + }; + + /* Check that the elements have been calculated correctly */ + for (uint32_t j = 0; j < numMfccVectors; ++j) + { + for (uint32_t i = 0; i < numMfccFeats * 3; ++i) + { + size_t tensorIdx = (j * numMfccFeats * 3) + i; + CHECK(static_cast(outputBuffer.at(tensorIdx) == 
static_cast(expectedResult[j][i]))); + } + } +} diff --git a/samples/common/cmake/aarch64-toolchain.cmake b/samples/common/cmake/aarch64-toolchain.cmake new file mode 100644 index 0000000000..bdd02f88c0 --- /dev/null +++ b/samples/common/cmake/aarch64-toolchain.cmake @@ -0,0 +1,20 @@ +# Copyright © 2020 Arm Ltd and Contributors. All rights reserved. +# SPDX-License-Identifier: MIT + +# specify the cross compiler +set(GNU_MACHINE "aarch64-linux-gnu") +set(CROSS_PREFIX "aarch64-linux-gnu-") + +set(CMAKE_C_COMPILER ${CROSS_PREFIX}gcc) +set(CMAKE_CXX_COMPILER ${CROSS_PREFIX}g++) +set(CMAKE_AR ${CROSS_PREFIX}ar) +set(CMAKE_STRIP ${CROSS_PREFIX}strip) +set(CMAKE_LINKER ${CROSS_PREFIX}ld) + +set(CMAKE_CROSSCOMPILING true) +set(CMAKE_SYSTEM_NAME Linux) + +set(CMAKE_SYSTEM_PROCESSOR aarch64) + +set(OPENCV_EXTRA_ARGS "-DENABLE_NEON=ON" + "-DCMAKE_TOOLCHAIN_FILE=platforms/linux/aarch64-gnu.toolchain.cmake") \ No newline at end of file diff --git a/samples/common/cmake/arm-linux-gnueabihf-toolchain.cmake b/samples/common/cmake/arm-linux-gnueabihf-toolchain.cmake new file mode 100644 index 0000000000..f66b964c35 --- /dev/null +++ b/samples/common/cmake/arm-linux-gnueabihf-toolchain.cmake @@ -0,0 +1,20 @@ +# Copyright © 2020 Arm Ltd and Contributors. All rights reserved. 
+# SPDX-License-Identifier: MIT + +# specify the cross compiler +set(GNU_MACHINE "arm-linux-gnueabihf") +set(CROSS_PREFIX "arm-linux-gnueabihf-") + +set(CMAKE_C_COMPILER ${CROSS_PREFIX}gcc) +set(CMAKE_CXX_COMPILER ${CROSS_PREFIX}g++) +set(CMAKE_AR ${CROSS_PREFIX}ar) +set(CMAKE_STRIP ${CROSS_PREFIX}strip) +set(CMAKE_LINKER ${CROSS_PREFIX}ld) + +set(CMAKE_CROSSCOMPILING true) +set(CMAKE_SYSTEM_NAME Linux) + +set(CMAKE_SYSTEM_PROCESSOR arm) + +set(OPENCV_EXTRA_ARGS "-DENABLE_NEON=ON" + "-DCMAKE_TOOLCHAIN_FILE=platforms/linux/arm.toolchain.cmake") \ No newline at end of file diff --git a/samples/common/cmake/find_armnn.cmake b/samples/common/cmake/find_armnn.cmake new file mode 100644 index 0000000000..289e9127f6 --- /dev/null +++ b/samples/common/cmake/find_armnn.cmake @@ -0,0 +1,35 @@ +# Copyright © 2020 Arm Ltd and Contributors. All rights reserved. +# SPDX-License-Identifier: MIT +# Search for ArmNN built libraries in user-provided path first, then current repository, then system + +set(ARMNN_LIB_NAMES "libarmnn.so" + "libarmnnTfLiteParser.so") + +set(ARMNN_LIBS "") + +get_filename_component(PARENT_DIR ${PROJECT_SOURCE_DIR} DIRECTORY) +get_filename_component(REPO_DIR ${PARENT_DIR} DIRECTORY) + +foreach(armnn_lib ${ARMNN_LIB_NAMES}) + find_library(ARMNN_${armnn_lib} + NAMES + ${armnn_lib} + HINTS + ${ARMNN_LIB_DIR} ${REPO_DIR} + PATHS + ${ARMNN_LIB_DIR} ${REPO_DIR} + PATH_SUFFIXES + "lib" + "lib64") + if(ARMNN_${armnn_lib}) + message("Found library ${ARMNN_${armnn_lib}}") + list(APPEND ARMNN_LIBS ${ARMNN_${armnn_lib}}) + get_filename_component(LIB_DIR ${ARMNN_${armnn_lib}} DIRECTORY) + get_filename_component(LIB_PARENT_DIR ${LIB_DIR} DIRECTORY) + set(ARMNN_INCLUDE_DIR ${LIB_PARENT_DIR}/include) + endif() +endforeach() + +if(NOT ARMNN_LIBS) + message(FATAL_ERROR "Could not find ArmNN libraries ${ARMNN_LIB_NAMES}") +endif() diff --git a/samples/common/cmake/find_catch.cmake b/samples/common/cmake/find_catch.cmake new file mode 100644 index 0000000000..584b8073bd --- 
/dev/null +++ b/samples/common/cmake/find_catch.cmake @@ -0,0 +1,16 @@ +# Copyright © 2020 Arm Ltd and Contributors. All rights reserved. +# SPDX-License-Identifier: MIT + +#Test TPIP +set(TEST_TPIP ${DEPENDENCIES_DIR}/test) +file(MAKE_DIRECTORY ${TEST_TPIP}) +set(TEST_TPIP_INCLUDE ${TEST_TPIP}/include) +file(MAKE_DIRECTORY ${TEST_TPIP_INCLUDE}) + +ExternalProject_Add(catch2-headers + URL https://github.com/catchorg/Catch2/releases/download/v2.11.1/catch.hpp + DOWNLOAD_NO_EXTRACT 1 + CONFIGURE_COMMAND "" + BUILD_COMMAND ${CMAKE_COMMAND} -E copy /catch.hpp ${TEST_TPIP_INCLUDE} + INSTALL_COMMAND "" + ) \ No newline at end of file diff --git a/samples/common/cmake/find_opencv.cmake b/samples/common/cmake/find_opencv.cmake new file mode 100644 index 0000000000..92086e1316 --- /dev/null +++ b/samples/common/cmake/find_opencv.cmake @@ -0,0 +1,203 @@ +# Copyright © 2020 Arm Ltd and Contributors. All rights reserved. +# SPDX-License-Identifier: MIT + +set(OPENCV_VERSION 4.0.0) +set(FFMPEG_VERSION 4.2.1) +set(LIBX264_VERSION stable) + +set(OPENCV_LIB OpenCV${OPENCV_VERSION}) +set(FFMPEG_LIB ffmpeg${FFMPEG_VERSION}) +set(X264_LIB x264${LIBX264_VERSION}) + +set(OPENCV_NAMES + libopencv_core.so.${OPENCV_VERSION} + libopencv_imgproc.so.${OPENCV_VERSION} + libopencv_imgcodecs.so.${OPENCV_VERSION} + libopencv_videoio.so.${OPENCV_VERSION} + libopencv_video.so.${OPENCV_VERSION} + libopencv_highgui.so.${OPENCV_VERSION}) + +set(OPENCV_LIBS) +set(FFMPEG_LIBS) + +foreach(opencv_lib ${OPENCV_NAMES}) + find_library(OPENCV_${opencv_lib} + NAMES + ${opencv_lib} + HINTS + ${OPENCV_LIB_DIR} + PATHS + ${OPENCV_LIB_DIR} + PATH_SUFFIXES + "lib" + "lib64") + if(OPENCV_${opencv_lib}) + message("Found library ${OPENCV_${opencv_lib}}") + list(APPEND OPENCV_LIBS ${OPENCV_${opencv_lib}}) + get_filename_component(OPENCV_LIB_DIR ${OPENCV_${opencv_lib}} DIRECTORY) + get_filename_component(OPENCV_ROOT_DIR ${OPENCV_LIB_DIR} DIRECTORY) + set(OPENCV_INCLUDE_DIR ${OPENCV_ROOT_DIR}/include/opencv4) + endif() 
+endforeach() + +if(OPENCV_LIBS) + message("OpenCV libraries found") + set(OPENCV_LIBS_FOUND TRUE) +else() + set(OPENCV_ROOT_DIR ${DEPENDENCIES_DIR}/opencv) + set(OPENCV_DEPENDENCIES_ARGS) + set(OPENCV_EXTRA_LINKER_ARGS) + set(OPENCV_PKGCONFIG) + + if(CMAKE_CROSSCOMPILING) + set(FFMPEG_ROOT_DIR ${DEPENDENCIES_DIR}/ffmpeg) + set(LIBX264_ROOT_DIR ${DEPENDENCIES_DIR}/x264) + + if (CMAKE_BUILD_TYPE STREQUAL Debug) + set(CONFIGURE_DEBUG --enable-debug) + set(OPENCV_DEBUG "-DBUILD_WITH_DEBUG_INFO=ON") + endif() + + + ExternalProject_Add(${X264_LIB} + URL "https://code.videolan.org/videolan/x264/-/archive/${LIBX264_VERSION}/x264-${LIBX264_VERSION}.tar.gz" + DOWNLOAD_DIR ${LIBX264_ROOT_DIR} + PREFIX ${LIBX264_ROOT_DIR} + CONFIGURE_COMMAND /configure + --host=${GNU_MACHINE} + --enable-static + --enable-shared + --cross-prefix=${CROSS_PREFIX} + --prefix=${CMAKE_BINARY_DIR} + --extra-ldflags=-static-libstdc++ + --extra-cflags=-fPIC + ${CONFIGURE_DEBUG} + INSTALL_DIR ${CMAKE_BINARY_DIR} + BUILD_COMMAND $(MAKE) + INSTALL_COMMAND $(MAKE) install + ) + + set(FFMPEG_Config + --enable-shared + --enable-cross-compile + --cross-prefix=${CROSS_PREFIX} + --arch=${CMAKE_SYSTEM_PROCESSOR} + --target-os=linux + --prefix=${CMAKE_BINARY_DIR} + --enable-gpl + --enable-nonfree + --enable-libx264 + --extra-cflags=-I${CMAKE_BINARY_DIR}/include + --extra-cflags=-fPIC + --extra-ldflags=-L${CMAKE_BINARY_DIR}/lib + --extra-libs=-ldl + --extra-libs=-static-libstdc++ + ) + + ExternalProject_Add(${FFMPEG_LIB} + URL "https://github.com/FFmpeg/FFmpeg/archive/n${FFMPEG_VERSION}.tar.gz" + URL_HASH MD5=05792c611d1e3ebdf2c7003ff4467390 + DOWNLOAD_DIR ${FFMPEG_ROOT_DIR} + PREFIX ${FFMPEG_ROOT_DIR} + CONFIGURE_COMMAND /configure ${FFMPEG_Config} ${CONFIGURE_DEBUG} + INSTALL_DIR ${CMAKE_BINARY_DIR} + BUILD_COMMAND $(MAKE) VERBOSE=1 + INSTALL_COMMAND $(MAKE) install + ) + + set(OPENCV_DEPENDENCIES_ARGS "-static-libstdc++ -Wl,-rpath,${CMAKE_BINARY_DIR}/lib") + set(OPENCV_EXTRA_LINKER_ARGS 
"-DOPENCV_EXTRA_EXE_LINKER_FLAGS=${OPENCV_DEPENDENCIES_ARGS}") + + set(OPENCV_PKGCONFIG "PKG_CONFIG_LIBDIR=${CMAKE_BINARY_DIR}/lib/pkgconfig") + + set(FFMPEG_NAMES + libavcodec.so + libavformat.so + libavutil.so + libswscale.so + ) + + foreach(ffmpeg_lib ${FFMPEG_NAMES}) + add_library(FFMPEG_${ffmpeg_lib} SHARED IMPORTED) + set_target_properties(FFMPEG_${ffmpeg_lib} PROPERTIES IMPORTED_LOCATION ${CMAKE_BINARY_DIR}/lib/${ffmpeg_lib}) + list(APPEND OPENCV_LIBS FFMPEG_${ffmpeg_lib}) + endforeach() + + add_library(X264_lib264.so SHARED IMPORTED) + set_target_properties(X264_lib264.so PROPERTIES IMPORTED_LOCATION ${CMAKE_BINARY_DIR}/lib/libx264.so) + list(APPEND OPENCV_LIBS X264_lib264.so) + endif() + + set(OPENCV_CMAKE_ARGS + -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE} + -DCMAKE_C_FLAGS=-fPIC + -DCMAKE_CXX_FLAGS=-fPIC + -DWITH_GTK=OFF + -DWITH_JPEG=ON + -DWITH_IPP=OFF + -DBUILD_opencv_java_bindings_generator=OFF + -DBUILD_opencv_ml=OFF + -DBUILD_opencv_objdetect=OFF + -DBUILD_opencv_photo=OFF + -DBUILD_opencv_python_bindings_generator=OFF + -DBUILD_opencv_stitching=OFF + -DBUILD_opencv_gapi=OFF + -DBUILD_opencv_features2d=OFF + -DBUILD_opencv_dnn=OFF + -DBUILD_opencv_flann=OFF + -DBUILD_opencv_calib3d=OFF + -DBUILD_opencv_python2=OFF + -DBUILD_opencv_python3=OFF + -DBUILD_opencv_java=OFF + -DBUILD_opencv_js=OFF + -DBUILD_opencv_ts=OFF + -DBUILD_JPEG=ON + -DBUILD_JPEG_TURBO_DISABLE=ON + -DBUILD_PNG=ON + -DBUILD_TIFF=ON + -DZLIB_FOUND=OFF + -DBUILD_ZLIB=ON + -DBUILD_PERF_TESTS=OFF + -DBUILD_TESTS=OFF + -DBUILD_DOCS=OFF + -DBUILD_opencv_apps=OFF + -DBUILD_EXAMPLES=OFF + -DWITH_V4L=ON + -DWITH_LIBV4L=OFF + -DWITH_FFMPEG=ON + -DCMAKE_INSTALL_PREFIX=${CMAKE_BINARY_DIR} + -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} + -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} + -DCMAKE_INSTALL_RPATH=\$ORIGIN:\$ORIGIN/lib:\$ORIGIN/../lib + -DCMAKE_SHARED_LINKER_FLAGS=-static-libstdc++ + ${OPENCV_DEBUG} + ) + + ExternalProject_Add(${OPENCV_LIB} + URL 
"https://codeload.github.com/opencv/opencv/tar.gz/${OPENCV_VERSION}" + URL_HASH MD5=f051c1ff7b327b60123d71b53801b316 + DOWNLOAD_DIR ${OPENCV_ROOT_DIR} + PREFIX ${OPENCV_ROOT_DIR} + CONFIGURE_COMMAND ${OPENCV_PKGCONFIG} + ${CMAKE_COMMAND} ${OPENCV_CMAKE_ARGS} ${OPENCV_EXTRA_ARGS} + ${OPENCV_EXTRA_LINKER_ARGS} ${OPENCV_ROOT_DIR}/src/${OPENCV_LIB} + INSTALL_DIR ${CMAKE_BINARY_DIR} + BUILD_COMMAND $(MAKE) + INSTALL_COMMAND $(MAKE) install + ) + + if(CMAKE_CROSSCOMPILING) + ExternalProject_Add_StepDependencies(${FFMPEG_LIB} build ${X264_LIB}) + ExternalProject_Add_StepDependencies(${OPENCV_LIB} build ${FFMPEG_LIB}) + endif() + + set(OPENCV_INCLUDE_DIR ${CMAKE_BINARY_DIR}/include/opencv4) + set(OPENCV_LIB_DIR ${CMAKE_BINARY_DIR}/lib) + + foreach(opencv_lib ${OPENCV_NAMES}) + add_library(OPENCV_${opencv_lib} SHARED IMPORTED) + set_target_properties(OPENCV_${opencv_lib} PROPERTIES IMPORTED_LOCATION ${OPENCV_LIB_DIR}/${opencv_lib}) + list(APPEND OPENCV_LIBS OPENCV_${opencv_lib}) + endforeach() + +endif() \ No newline at end of file diff --git a/samples/common/include/ArmnnUtils/ArmnnNetworkExecutor.hpp b/samples/common/include/ArmnnUtils/ArmnnNetworkExecutor.hpp new file mode 100644 index 0000000000..96cc1d0184 --- /dev/null +++ b/samples/common/include/ArmnnUtils/ArmnnNetworkExecutor.hpp @@ -0,0 +1,214 @@ +// +// Copyright © 2020 Arm Ltd and Contributors. All rights reserved. +// SPDX-License-Identifier: MIT +// + +#pragma once + +#include "Types.hpp" + +#include "armnn/ArmNN.hpp" +#include "armnnTfLiteParser/ITfLiteParser.hpp" +#include "armnnUtils/DataLayoutIndexed.hpp" +#include + +#include +#include + +namespace common +{ +/** +* @brief Used to load in a network through ArmNN and run inference on it against a given backend. 
+* +*/ +template +class ArmnnNetworkExecutor +{ +private: + armnn::IRuntimePtr m_Runtime; + armnn::NetworkId m_NetId{}; + mutable InferenceResults m_OutputBuffer; + armnn::InputTensors m_InputTensors; + armnn::OutputTensors m_OutputTensors; + std::vector m_outputBindingInfo; + + std::vector m_outputLayerNamesList; + + armnnTfLiteParser::BindingPointInfo m_inputBindingInfo; + + void PrepareTensors(const void* inputData, const size_t dataBytes); + + template + auto log_as_int(Enumeration value) + -> typename std::underlying_type::type + { + return static_cast::type>(value); + } + +public: + ArmnnNetworkExecutor() = delete; + + /** + * @brief Initializes the network with the given input data. Parsed through TfLiteParser and optimized for a + * given backend. + * + * Note that the output layers names order in m_outputLayerNamesList affects the order of the feature vectors + * in output of the Run method. + * + * * @param[in] modelPath - Relative path to the model file + * * @param[in] backends - The list of preferred backends to run inference on + */ + ArmnnNetworkExecutor(std::string& modelPath, + std::vector& backends); + + /** + * @brief Returns the aspect ratio of the associated model in the order of width, height. + */ + Size GetImageAspectRatio(); + + armnn::DataType GetInputDataType() const; + + float GetQuantizationScale(); + + int GetQuantizationOffset(); + + /** + * @brief Runs inference on the provided input data, and stores the results in the provided InferenceResults object. + * + * @param[in] inputData - input frame data + * @param[in] dataBytes - input data size in bytes + * @param[out] results - Vector of DetectionResult objects used to store the output result. 
+     */
+    bool Run(const void* inputData, const size_t dataBytes, common::InferenceResults<Tout>& outResults);
+
+};
+
+/**
+ * @brief Parses the TfLite model, optimizes it for the given backends, loads it into the runtime
+ *        and pre-allocates one output buffer and one output tensor per output layer.
+ *
+ * @param[in] modelPath         - path to the model file
+ * @param[in] preferredBackends - backends handed to the optimizer
+ *
+ * @throws armnn::Exception if the network cannot be optimized or loaded.
+ */
+template <class Tout>
+ArmnnNetworkExecutor<Tout>::ArmnnNetworkExecutor(std::string& modelPath,
+                                                 std::vector<armnn::BackendId>& preferredBackends)
+        : m_Runtime(armnn::IRuntime::Create(armnn::IRuntime::CreationOptions()))
+{
+    // Import the TensorFlow lite model.
+    armnnTfLiteParser::ITfLiteParserPtr parser = armnnTfLiteParser::ITfLiteParser::Create();
+    armnn::INetworkPtr network = parser->CreateNetworkFromBinaryFile(modelPath.c_str());
+
+    std::vector<std::string> inputNames = parser->GetSubgraphInputTensorNames(0);
+
+    // Only the first input of subgraph 0 is bound.
+    m_inputBindingInfo = parser->GetNetworkInputBindingInfo(0, inputNames[0]);
+
+    m_outputLayerNamesList = parser->GetSubgraphOutputTensorNames(0);
+
+    // One binding per output layer, in the same order as m_outputLayerNamesList.
+    for (const std::string& name : m_outputLayerNamesList)
+    {
+        m_outputBindingInfo.push_back(parser->GetNetworkOutputBindingInfo(0, name));
+    }
+
+    std::vector<std::string> errorMessages;
+    // optimize the network.
+    armnn::IOptimizedNetworkPtr optNet = Optimize(*network,
+                                                  preferredBackends,
+                                                  m_Runtime->GetDeviceSpec(),
+                                                  armnn::OptimizerOptions(),
+                                                  armnn::Optional<std::vector<std::string>&>(errorMessages));
+
+    if (!optNet)
+    {
+        const std::string errorMessage{"ArmnnNetworkExecutor: Failed to optimize network"};
+        ARMNN_LOG(error) << errorMessage;
+        throw armnn::Exception(errorMessage);
+    }
+
+    // Load the optimized network onto the m_Runtime device
+    std::string errorMessage;
+    if (armnn::Status::Success != m_Runtime->LoadNetwork(m_NetId, std::move(optNet), errorMessage))
+    {
+        ARMNN_LOG(error) << errorMessage;
+        throw armnn::Exception(errorMessage);
+    }
+
+    // Pre-allocate one zero-filled buffer per output layer (the size of it never changes).
+    m_OutputBuffer.reserve(m_outputBindingInfo.size());
+    for (const auto& bindingInfo : m_outputBindingInfo)
+    {
+        const armnn::TensorShape& tensorShape = bindingInfo.second.GetShape();
+
+        std::vector<Tout> oneLayerOutResult;
+        oneLayerOutResult.resize(tensorShape.GetNumElements(), 0);
+        m_OutputBuffer.push_back(std::move(oneLayerOutResult));
+    }
+
+    // Make ArmNN output tensors. This runs as its own loop, after every buffer exists, so that
+    // each output layer is bound to its buffer exactly once.
+    m_OutputTensors.reserve(m_OutputBuffer.size());
+    for (size_t idx = 0; idx < m_OutputBuffer.size(); ++idx)
+    {
+        m_OutputTensors.emplace_back(std::make_pair(
+            m_outputBindingInfo[idx].first,
+            armnn::Tensor(m_outputBindingInfo[idx].second,
+                          m_OutputBuffer.at(idx).data())
+        ));
+    }
+
+}
+
+/** @brief Returns the data type recorded in the input binding info. */
+template <class Tout>
+armnn::DataType ArmnnNetworkExecutor<Tout>::GetInputDataType() const
+{
+    return m_inputBindingInfo.second.GetDataType();
+}
+
+/**
+ * @brief Rebuilds the single input tensor so that it wraps the caller's buffer.
+ *
+ * @param[in] inputData - input buffer; it is wrapped, not copied, by this function
+ * @param[in] dataBytes - size of the buffer in bytes; only used by the assert below
+ */
+template <class Tout>
+void ArmnnNetworkExecutor<Tout>::PrepareTensors(const void* inputData, const size_t dataBytes)
+{
+    assert(m_inputBindingInfo.second.GetNumBytes() >= dataBytes);
+    m_InputTensors.clear();
+    m_InputTensors = {{ m_inputBindingInfo.first, armnn::ConstTensor(m_inputBindingInfo.second, inputData)}};
+}
+
+/**
+ * @brief Runs one inference and copies the pre-allocated output buffers into outResults.
+ *
+ * @param[in]  inputData  - input data
+ * @param[in]  dataBytes  - input data size in bytes
+ * @param[out] outResults - receives a copy of the output buffers, one entry per output layer
+ *
+ * @return true if the runtime reported armnn::Status::Success, false otherwise.
+ */
+template <class Tout>
+bool ArmnnNetworkExecutor<Tout>::Run(const void* inputData, const size_t dataBytes, InferenceResults<Tout>& outResults)
+{
+    /* Prepare tensors if they are not ready */
+    ARMNN_LOG(debug) << "Preparing tensors...";
+    this->PrepareTensors(inputData, dataBytes);
+    ARMNN_LOG(trace) << "Running inference...";
+
+    armnn::Status ret = m_Runtime->EnqueueWorkload(m_NetId, m_InputTensors, m_OutputTensors);
+
+    std::stringstream inferenceFinished;
+    inferenceFinished << "Inference finished with code {" << log_as_int(ret) << "}\n";
+
+    ARMNN_LOG(trace) << inferenceFinished.str();
+
+    if (ret == armnn::Status::Failure)
+    {
+        ARMNN_LOG(error) << "Failed to perform inference.";
+    }
+
+    // The buffers are copied out even when inference failed; callers must check the return value.
+    outResults.reserve(m_outputLayerNamesList.size());
+    outResults = m_OutputBuffer;
+
+    return (armnn::Status::Success == ret);
+}
+
+/** @brief Returns the quantization scale recorded in the input binding info. */
+template <class Tout>
+float ArmnnNetworkExecutor<Tout>::GetQuantizationScale()
+{
+    return this->m_inputBindingInfo.second.GetQuantizationScale();
+}
+
+/** @brief Returns the quantization offset recorded in the input binding info. */
+template <class Tout>
+int ArmnnNetworkExecutor<Tout>::GetQuantizationOffset()
+{
+    return this->m_inputBindingInfo.second.GetQuantizationOffset();
+}
+
+template <class Tout>
+Size ArmnnNetworkExecutor<Tout>::GetImageAspectRatio()
+{
+    const auto shape =
m_inputBindingInfo.second.GetShape(); + assert(shape.GetNumDimensions() == 4); + armnnUtils::DataLayoutIndexed nhwc(armnn::DataLayout::NHWC); + return Size(shape[nhwc.GetWidthIndex()], + shape[nhwc.GetHeightIndex()]); +} +}// namespace common \ No newline at end of file diff --git a/samples/common/include/CVUtils/CvVideoFileWriter.hpp b/samples/common/include/CVUtils/CvVideoFileWriter.hpp new file mode 100644 index 0000000000..30348f09cc --- /dev/null +++ b/samples/common/include/CVUtils/CvVideoFileWriter.hpp @@ -0,0 +1,61 @@ +// +// Copyright © 2020 Arm Ltd and Contributors. All rights reserved. +// SPDX-License-Identifier: MIT +// + +#pragma once + +#include "IFrameOutput.hpp" +#include + +namespace common +{ + +class CvVideoFileWriter : public IFrameOutput { +public: + /** + * @brief Default constructor. + * + * Underlying open cv video writer object will be instantiated. + */ + CvVideoFileWriter() = default; + + ~CvVideoFileWriter() override = default; + + /** + * @brief Initialises video file writer. + * + * Opens opencv writer with given params. FFMPEG backend is used. + * + * @param outputVideo path to the video file. + * @param encoding cv::CAP_PROP_FOURCC code. + * @param fps target frame rate. + * @param width target frame width. + * @param height target frame height. + * + */ + void Init(const std::string& outputVideo, int encoding, double fps, int width, int height); + + /** + * Writes frame to the file using opencv writer. + * + * @param frame data to write. + */ + void WriteFrame(std::shared_ptr& frame) override; + + /** + * Releases opencv writer. + */ + void Close() override; + + /** + * Checks if opencv writer was successfully opened. + * @return true is underlying writer is ready to be used, false otherwise. 
+ */ + bool IsReady() const override; + +private: + cv::VideoWriter m_cvWriter{}; + bool m_ready = false; +}; +}// namespace common \ No newline at end of file diff --git a/samples/common/include/CVUtils/CvVideoFrameReader.hpp b/samples/common/include/CVUtils/CvVideoFrameReader.hpp new file mode 100644 index 0000000000..96d94f4079 --- /dev/null +++ b/samples/common/include/CVUtils/CvVideoFrameReader.hpp @@ -0,0 +1,108 @@ +// +// Copyright © 2020 Arm Ltd and Contributors. All rights reserved. +// SPDX-License-Identifier: MIT +// +#pragma once + + +#include "IFrameReader.hpp" +#include + +namespace common +{ + +class CvVideoFrameReader : + public IFrameReader +{ +public: + /** + * @brief Default constructor. + * + * Underlying open cv video capture object will be instantiated. + */ + CvVideoFrameReader() = default; + + ~CvVideoFrameReader() override = default; + + /** + *@brief Initialises reader to capture frames from video file. + * + * @param source path to the video file or image sequence. + * + * @throws std::runtime_error if init failed + */ + void Init(const std::string& source); + + std::shared_ptr ReadFrame() override; + + bool IsExhausted(const std::shared_ptr & frame) const override; + + /** + * Returns effective video frame width supported by the source/set by the user. + * Must be called after Init method. + * @return frame width + */ + int GetSourceWidth() const; + + /** + * Returns effective video frame height supported by the source/set by the user. + * Must be called after Init method. + * @return frame height + */ + int GetSourceHeight() const; + + /** + * Returns effective fps value supported by the source/set by the user. + * @return fps value + */ + double GetSourceFps() const; + + /** + * Will query OpenCV to convert images to RGB + * Copy is actually default behaviour, but the set function needs to be called + * in order to know whether OpenCV supports conversion from our source format. 
+ * @return boolean, + * true: OpenCV returns RGB + * false: OpenCV returns the fourcc format from GetSourceEncoding + */ + bool ConvertToRGB(); + + /** + * Returns 4-character code of codec. + * @return codec name + */ + std::string GetSourceEncoding() const; + + /** + * Get the fourcc int from its string name. + * @return codec int + */ + int GetSourceEncodingInt() const; + + int GetFrameCount() const; + +private: + cv::VideoCapture m_capture; + + void CheckIsOpen(const std::string& source); +}; + +class CvVideoFrameReaderRgbWrapper : + public IFrameReader +{ +public: + CvVideoFrameReaderRgbWrapper() = delete; + CvVideoFrameReaderRgbWrapper(const CvVideoFrameReaderRgbWrapper& o) = delete; + CvVideoFrameReaderRgbWrapper(CvVideoFrameReaderRgbWrapper&& o) = delete; + + CvVideoFrameReaderRgbWrapper(std::unique_ptr reader); + + std::shared_ptr ReadFrame() override; + + bool IsExhausted(const std::shared_ptr& frame) const override; + +private: + std::unique_ptr m_reader; +}; + +}// namespace common \ No newline at end of file diff --git a/samples/common/include/CVUtils/CvWindowOutput.hpp b/samples/common/include/CVUtils/CvWindowOutput.hpp new file mode 100644 index 0000000000..4b9ae3b743 --- /dev/null +++ b/samples/common/include/CVUtils/CvWindowOutput.hpp @@ -0,0 +1,53 @@ +// +// Copyright © 2020 Arm Ltd and Contributors. All rights reserved. +// SPDX-License-Identifier: MIT +// + +#pragma once + +#include "IFrameOutput.hpp" +#include + +namespace common +{ + +class CvWindowOutput : public IFrameOutput { +public: + + CvWindowOutput() = default; + + ~CvWindowOutput() override = default; + + /** + * @brief Creates a named window. + * + * Uses opencv to create a window with given name. + * + * @param windowName opencv window name. + * + */ + void Init(const std::string& windowName); + + /** + * Writes frame to the window. + * + * @param frame data to write. + */ + void WriteFrame(std::shared_ptr& frame) override; + + /** + * Releases all windows. 
+ */ + void Close() override; + + /** + * Always true. + * @return true. + */ + bool IsReady() const override; + +private: + std::string m_windowName; + +}; +}// namespace common \ No newline at end of file diff --git a/samples/common/include/CVUtils/IFrameOutput.hpp b/samples/common/include/CVUtils/IFrameOutput.hpp new file mode 100644 index 0000000000..6f7ca0b574 --- /dev/null +++ b/samples/common/include/CVUtils/IFrameOutput.hpp @@ -0,0 +1,48 @@ +// +// Copyright © 2020 Arm Ltd and Contributors. All rights reserved. +// SPDX-License-Identifier: MIT +// + +#pragma once + +#include +#include + +namespace common +{ +/** + * @brief Frames output interface + * + * @tparam FrameDataT frame container data type + */ + template class IFrameOutput + { + + public: + /** + * @brief Writes frame to the selected output + * + * @param frame container + */ + virtual void WriteFrame(std::shared_ptr & frame) = 0; + + /** + * @brief Closes the frame output + */ + virtual void Close() = 0; + + /** + * @brief Checks if the frame sink is ready to write. + * + * @return True if frame sink is ready, False otherwise + */ + virtual bool IsReady() const = 0; + + /** + * @brief Default destructor + */ + virtual ~IFrameOutput() = default; + + }; + +}// namespace common diff --git a/samples/common/include/CVUtils/IFrameReader.hpp b/samples/common/include/CVUtils/IFrameReader.hpp new file mode 100644 index 0000000000..e171b3bb94 --- /dev/null +++ b/samples/common/include/CVUtils/IFrameReader.hpp @@ -0,0 +1,45 @@ +// +// Copyright © 2020 Arm Ltd and Contributors. All rights reserved. 
+// SPDX-License-Identifier: MIT +// + +#pragma once + +#include +#include + +namespace common +{ +/** + * @brief Frame source reader interface + * + * @tparam FrameDataT frame container data type + */ +template class IFrameReader +{ + +public: + /** + * @brief Reads the next frame from the source + * + * @return pointer to the frame container + */ + virtual std::shared_ptr ReadFrame() = 0; + + /** + * @brief Checks if the frame source has more frames to read. + * + * @param[in] frame the pointer to the last frame captured with the ReadFrame method could be used in + * implementation specific logic to check frames source state. + * @return True if frame source was exhausted, False otherwise + */ + virtual bool IsExhausted(const std::shared_ptr & frame) const = 0; + + /** + * @brief Default destructor + */ + virtual ~IFrameReader() = default; + +}; + +}// namespace common \ No newline at end of file diff --git a/samples/common/include/Utils/CmdArgsParser.hpp b/samples/common/include/Utils/CmdArgsParser.hpp new file mode 100644 index 0000000000..710a33df93 --- /dev/null +++ b/samples/common/include/Utils/CmdArgsParser.hpp @@ -0,0 +1,25 @@ +// +// Copyright © 2020 Arm Ltd and Contributors. All rights reserved. +// SPDX-License-Identifier: MIT +// +#pragma once +#include +#include + +/* + * Checks that a particular option was specified by the user + */ +bool CheckOptionSpecified(const std::map& options, const std::string& option); + + +/* + * Retrieves the user provided option + */ +std::string GetSpecifiedOption(const std::map& options, const std::string& option); + + +/* + * Parses all the command line options provided by the user and stores in a map. 
+ */ +int ParseOptions(std::map& options, std::map& acceptedOptions, + char *argv[], int argc); \ No newline at end of file diff --git a/samples/common/include/Utils/Types.hpp b/samples/common/include/Utils/Types.hpp new file mode 100644 index 0000000000..4d1f708844 --- /dev/null +++ b/samples/common/include/Utils/Types.hpp @@ -0,0 +1,54 @@ +// +// Copyright © 2020 Arm Ltd and Contributors. All rights reserved. +// SPDX-License-Identifier: MIT +// + +#pragma once + +#include +#include +#include +#include + +#include + +namespace common +{ + +struct Size +{ + + uint32_t m_Width; + uint32_t m_Height; + + Size() : Size(0, 0) {} + + Size(uint32_t width, uint32_t height) : + m_Width{width}, m_Height{height} {} + + Size(const Size& other) + : Size(other.m_Width, other.m_Height) {} + + ~Size() = default; + + Size &operator=(const Size& other) = default; +}; + +struct BBoxColor +{ + std::tuple colorCode; +}; + +struct PipelineOptions +{ + std::string m_ModelName; + std::string m_ModelFilePath; + std::vector m_backends; +}; + +template +using InferenceResult = std::vector; + +template +using InferenceResults = std::vector>; +} // namespace common \ No newline at end of file diff --git a/samples/common/src/CVUtils/CvVideoFileWriter.cpp b/samples/common/src/CVUtils/CvVideoFileWriter.cpp new file mode 100644 index 0000000000..b76630049a --- /dev/null +++ b/samples/common/src/CVUtils/CvVideoFileWriter.cpp @@ -0,0 +1,38 @@ +// +// Copyright © 2020 Arm Ltd and Contributors. All rights reserved. 
+// SPDX-License-Identifier: MIT
+//
+
+#include "CvVideoFileWriter.hpp"
+
+namespace common
+{
+
+/* Opens the OpenCV writer with the FFMPEG backend; IsReady() reports whether the open succeeded. */
+void CvVideoFileWriter::Init(const std::string& outputVideo, int encoding, double fps, int width, int height)
+{
+    m_ready = m_cvWriter.open(outputVideo, cv::CAP_FFMPEG,
+                              encoding,
+                              fps,
+                              cv::Size(width, height), true);
+}
+
+
+/* Converts the frame from RGB to BGR in place and writes it; does nothing if the writer is not open. */
+void CvVideoFileWriter::WriteFrame(std::shared_ptr<cv::Mat>& frame)
+{
+    if(m_cvWriter.isOpened())
+    {
+        cv::cvtColor(*frame, *frame, cv::COLOR_RGB2BGR);
+        m_cvWriter.write(*frame);
+    }
+}
+
+bool CvVideoFileWriter::IsReady() const
+{
+    return m_ready;
+}
+
+/* Releases the OpenCV writer. */
+void CvVideoFileWriter::Close()
+{
+    m_cvWriter.release();
+    // A released writer cannot accept frames any more, so stop reporting it as ready.
+    m_ready = false;
+}
+}// namespace common
diff --git a/samples/common/src/CVUtils/CvVideoFrameReader.cpp b/samples/common/src/CVUtils/CvVideoFrameReader.cpp
new file mode 100644
index 0000000000..2bd92d2d81
--- /dev/null
+++ b/samples/common/src/CVUtils/CvVideoFrameReader.cpp
@@ -0,0 +1,98 @@
+//
+// Copyright © 2020 Arm Ltd and Contributors. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+
+#include "CvVideoFrameReader.hpp"
+
+#include <cassert>
+#include <cmath>
+#include <cstdio>
+#include <stdexcept>
+
+namespace common
+{
+
+/* Reads the next frame; the returned Mat is empty when the capture produced nothing. */
+std::shared_ptr<cv::Mat> CvVideoFrameReader::ReadFrame()
+{
+    // opencv copies data anyway
+    cv::Mat captureFrame;
+    m_capture.read(captureFrame);
+    return std::make_shared<cv::Mat>(std::move(captureFrame));
+}
+
+/* The source counts as exhausted once ReadFrame() has handed back an empty frame. */
+bool CvVideoFrameReader::IsExhausted(const std::shared_ptr<cv::Mat>& frame) const
+{
+    assert(frame!=nullptr);
+    return frame->empty();
+}
+
+void CvVideoFrameReader::CheckIsOpen(const std::string& source)
+{
+    if (!m_capture.isOpened())
+    {
+        throw std::runtime_error("Failed to open video capture for the source = " + source);
+    }
+}
+
+void CvVideoFrameReader::Init(const std::string& source)
+{
+    m_capture.open(source);
+    CheckIsOpen(source);
+}
+
+int CvVideoFrameReader::GetSourceWidth() const
+{
+    return static_cast<int>(lround(m_capture.get(cv::CAP_PROP_FRAME_WIDTH)));
+}
+
+int CvVideoFrameReader::GetSourceHeight() const
+{
+    return static_cast<int>(lround(m_capture.get(cv::CAP_PROP_FRAME_HEIGHT)));
+}
+
+double CvVideoFrameReader::GetSourceFps() const
+{
+    return m_capture.get(cv::CAP_PROP_FPS);
+}
+
+bool CvVideoFrameReader::ConvertToRGB()
+{
+    m_capture.set(cv::CAP_PROP_CONVERT_RGB, 1.0);
+    return static_cast<bool>(m_capture.get(cv::CAP_PROP_CONVERT_RGB));
+}
+
+std::string CvVideoFrameReader::GetSourceEncoding() const
+{
+    // Four code characters plus the terminating NUL; the fourcc bytes are unpacked low byte first.
+    char fourccStr[5];
+    const auto fourcc = static_cast<int>(m_capture.get(cv::CAP_PROP_FOURCC));
+    std::snprintf(fourccStr, sizeof(fourccStr), "%c%c%c%c",
+                  fourcc & 0xFF, (fourcc >> 8) & 0xFF, (fourcc >> 16) & 0xFF, (fourcc >> 24) & 0xFF);
+    return fourccStr;
+}
+
+int CvVideoFrameReader::GetSourceEncodingInt() const
+{
+    return static_cast<int>(m_capture.get(cv::CAP_PROP_FOURCC));
+}
+
+int CvVideoFrameReader::GetFrameCount() const
+{
+    return static_cast<int>(lround(m_capture.get(cv::CAP_PROP_FRAME_COUNT)));
+}
+
+/* Reads a frame from the wrapped reader and converts it BGR -> RGB in place unless the source is exhausted. */
+std::shared_ptr<cv::Mat> CvVideoFrameReaderRgbWrapper::ReadFrame()
+{
+    auto framePtr = m_reader->ReadFrame();
+    if (!IsExhausted(framePtr))
+    {
+        cv::cvtColor(*framePtr, *framePtr, cv::COLOR_BGR2RGB);
+    }
+    return framePtr;
+}
+
+bool CvVideoFrameReaderRgbWrapper::IsExhausted(const std::shared_ptr<cv::Mat>& frame) const
+{
+    return m_reader->IsExhausted(frame);
+}
+
+CvVideoFrameReaderRgbWrapper::CvVideoFrameReaderRgbWrapper(std::unique_ptr<common::CvVideoFrameReader> reader):
+    m_reader(std::move(reader))
+{}
+
+}// namespace common
\ No newline at end of file
diff --git a/samples/common/src/CVUtils/CvWindowOutput.cpp b/samples/common/src/CVUtils/CvWindowOutput.cpp
new file mode 100644
index 0000000000..190a7602e2
--- /dev/null
+++ b/samples/common/src/CVUtils/CvWindowOutput.cpp
@@ -0,0 +1,33 @@
+//
+// Copyright © 2020 Arm Ltd and Contributors. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#include "CvWindowOutput.hpp"
+
+namespace common
+{
+
+/* Stores the window name and creates an auto-sized OpenCV window with it. */
+void CvWindowOutput::Init(const std::string& windowName)
+{
+    m_windowName = windowName;
+    cv::namedWindow(m_windowName, cv::WINDOW_AUTOSIZE);
+}
+
+/* Converts the frame from RGB to BGR in place, shows it, then waits up to 30 ms in cv::waitKey. */
+void CvWindowOutput::WriteFrame(std::shared_ptr<cv::Mat>& frame)
+{
+    cv::cvtColor(*frame, *frame, cv::COLOR_RGB2BGR);
+    cv::imshow( m_windowName, *frame);
+    cv::waitKey(30);
+}
+
+void CvWindowOutput::Close()
+{
+    cv::destroyWindow(m_windowName);
+}
+
+bool CvWindowOutput::IsReady() const
+{
+    return true;
+}
+}// namespace common
\ No newline at end of file
diff --git a/samples/common/src/Utils/CmdArgsParser.cpp b/samples/common/src/Utils/CmdArgsParser.cpp
new file mode 100644
index 0000000000..1f09826a8b
--- /dev/null
+++ b/samples/common/src/Utils/CmdArgsParser.cpp
@@ -0,0 +1,70 @@
+//
+// Copyright © 2020 Arm Ltd and Contributors. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#include "CmdArgsParser.hpp"
+#include <iostream>
+#include <stdexcept>
+/*
+ * Checks that a particular option was specified by the user
+ */
+bool CheckOptionSpecified(const std::map<std::string, std::string>& options, const std::string& option)
+{
+    return options.find(option) != options.end();
+}
+
+/*
+ * Retrieves the user provided option
+ *
+ * Throws std::invalid_argument if the option is not present in the map.
+ */
+std::string GetSpecifiedOption(const std::map<std::string, std::string>& options, const std::string& option)
+{
+    // One lookup serves both the presence check and the value retrieval.
+    const auto it = options.find(option);
+    if (it == options.end())
+    {
+        throw std::invalid_argument("Required option: " + option + " not defined.");
+    }
+    return it->second;
+}
+
+/*
+ * Parses all the command line options provided by the user and stores in a map.
+ *
+ * Returns 0 when every argument was consumed, 1 for an unrecognised option or an accepted option
+ * without a value, and 2 after printing the option list for "HELP" (which must itself be listed
+ * in acceptedOptions to be recognised).
+ */
+int ParseOptions(std::map<std::string, std::string>& options, std::map<std::string, std::string>& acceptedOptions,
+                 char *argv[], int argc)
+{
+    int argIdx = 1;
+    while (argIdx < argc)
+    {
+        const std::string token(argv[argIdx]);
+        const auto accepted = acceptedOptions.find(token);
+
+        // Anything that is not a known option name stops the parse.
+        if (accepted == acceptedOptions.end())
+        {
+            std::cerr << "Unrecognised option: " << token << std::endl;
+            return 1;
+        }
+
+        // A value is present when another argument follows and it does not begin with "--".
+        const bool hasValue = (argIdx + 1 < argc) && (std::string(argv[argIdx + 1]).rfind("--", 0) != 0);
+        if (hasValue)
+        {
+            // insert() keeps the first value if the same option is given twice.
+            options.insert({accepted->first, std::string(argv[argIdx + 1])});
+            argIdx += 2;
+            continue;
+        }
+
+        if (token == "HELP")
+        {
+            std::cout << "Available options" << std::endl;
+            for (const auto& entry : acceptedOptions)
+            {
+                std::cout << entry.first << " : " << entry.second << std::endl;
+            }
+            return 2;
+        }
+
+        std::cerr << token << " option requires one argument." << std::endl;
+        return 1;
+    }
+    return 0;
+}
-- 
cgit v1.2.1