From b4b3ac91990eb5deaffca2300319f2ddf7aa0886 Mon Sep 17 00:00:00 2001
From: Kevin May
Date: Fri, 21 May 2021 16:42:21 +0100
Subject: IVGCVSW-6009 Integrate threadpool into ExNet

* Remove concurrent flag from ExecuteNetwork as it is possible to deduce if
  SimultaneousIterations > 1
* Add void RunAsync()
* Refactor some unit tests

Change-Id: I7021d4821b0e460470908294cbd9462850e8b361
Signed-off-by: Keith Davis
Signed-off-by: Kevin May
---
 tests/ExecuteNetwork/ExecuteNetwork.cpp       | 130 +++++++++++++++++++--
 tests/ExecuteNetwork/ExecuteNetworkParams.cpp |   6 +-
 tests/ExecuteNetwork/ExecuteNetworkParams.hpp |   1 +
 .../ExecuteNetworkProgramOptions.cpp          |  21 +++-
 tests/InferenceModel.hpp                      |  46 +++++++-
 5 files changed, 187 insertions(+), 17 deletions(-)
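As an illustration of how the two asynchronous modes in this patch are selected from the ExecuteNetwork command line: only --concurrent, --simultaneous-iterations and --thread-pool-size come from this patch; the model and backend arguments below are assumed placeholders, not part of the change.

    # std::launch::async path: one std::async task per simultaneous iteration
    ExecuteNetwork -m model.tflite -f tflite-binary -c CpuAcc --simultaneous-iterations 4

    # Arm NN thread pool path: the same iterations scheduled onto a two-thread pool
    ExecuteNetwork -m model.tflite -f tflite-binary -c CpuAcc --simultaneous-iterations 4 --thread-pool-size 2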
diff --git a/tests/ExecuteNetwork/ExecuteNetwork.cpp b/tests/ExecuteNetwork/ExecuteNetwork.cpp
index 2bbb51783c..cd760a8199 100644
--- a/tests/ExecuteNetwork/ExecuteNetwork.cpp
+++ b/tests/ExecuteNetwork/ExecuteNetwork.cpp
@@ -5,6 +5,8 @@
 #include "NetworkExecutionUtils/NetworkExecutionUtils.hpp"
 #include "ExecuteNetworkProgramOptions.hpp"
+#include
+#include
 #include
 #include
@@ -276,8 +278,7 @@ template
 int MainImpl(const ExecuteNetworkParams& params,
              const std::shared_ptr& runtime = nullptr)
 {
-    using TContainer =
-        mapbox::util::variant, std::vector, std::vector, std::vector>;
+    using namespace std::chrono;

     std::vector> inputs;
     std::vector> outputs;
@@ -300,6 +301,7 @@ int MainImpl(const ExecuteNetworkParams& params,
     inferenceModelParams.m_NumberOfThreads    = params.m_NumberOfThreads;
     inferenceModelParams.m_MLGOTuningFilePath = params.m_MLGOTuningFilePath;
     inferenceModelParams.m_AsyncEnabled       = params.m_Concurrent;
+    inferenceModelParams.m_ThreadPoolSize     = params.m_ThreadPoolSize;

     for(const std::string& inputName: params.m_InputNames)
     {
@@ -390,9 +392,9 @@ int MainImpl(const ExecuteNetworkParams& params,
         outputs.push_back(outputDataContainers);
     }

+    // Synchronous execution
     if (!params.m_Concurrent)
     {
-        // Synchronous Execution
         for (size_t x = 0; x < params.m_Iterations; x++)
         {
             // model.Run returns the inference time elapsed in EnqueueWorkload (in milliseconds)
@@ -437,13 +439,118 @@ int MainImpl(const ExecuteNetworkParams& params,
             }
         }
     }
+    // Asynchronous execution using the Arm NN thread pool
+    else if (params.m_ThreadPoolSize >= 2)
+    {
+        try
+        {
+            ARMNN_LOG(info) << "Asynchronous execution with Arm NN thread pool... \n";
+            std::vector<armnn::experimental::IAsyncExecutionCallbackPtr> callbacks;
+
+            // Create callbacks that will be checked post scheduling
+            for (size_t i = 0; i < params.m_SimultaneousIterations; ++i)
+            {
+                // Point to ArmNN example implementation of AsyncExecutionCallback
+                callbacks.emplace_back(std::make_shared<armnn::experimental::AsyncExecutionCallback>());
+            }
+
+            // Declare the latest and earliest inference times here to be used when calculating overall time
+            std::chrono::high_resolution_clock::time_point earliestStartTime;
+            std::chrono::high_resolution_clock::time_point latestEndTime =
+                std::chrono::high_resolution_clock::now();
+
+            // For the asynchronous execution, we are adding a pool of working memory handles (1 per thread) in the
+            // LoadedNetwork with each scheduled inference having a specific priority
+            for (size_t i = 0; i < callbacks.size(); ++i)
+            {
+                model.RunAsync(inputs[i], outputs[i], callbacks[i]);
+            }
+
+            // Check the results
+            unsigned int j = 0;
+            for (armnn::experimental::IAsyncExecutionCallbackPtr cb : callbacks)
+            {
+                // Get the results
+                auto endTime = time_point_cast(cb->GetEndTime());
+                auto startTime = time_point_cast(cb->GetStartTime());
+                auto inferenceDuration = endTime - startTime;
+
+                if (latestEndTime < cb->GetEndTime())
+                {
+                    latestEndTime = cb->GetEndTime();
+                }
+
+                if (earliestStartTime.time_since_epoch().count() == 0)
+                {
+                    earliestStartTime = cb->GetStartTime();
+                }
+                else if (earliestStartTime > cb->GetStartTime())
+                {
+                    earliestStartTime = cb->GetStartTime();
+                }
+
+                if (params.m_GenerateTensorData)
+                {
+                    ARMNN_LOG(warning) << "The input data was generated, note that the output will not be useful";
+                }
+
+                // Print output tensors
+                const auto& infosOut = model.GetOutputBindingInfos();
+                for (size_t i = 0; i < numOutputs; i++)
+                {
+                    const armnn::TensorInfo& infoOut = infosOut[i].second;
+                    auto outputTensorFile = params.m_OutputTensorFiles.empty()
+                                            ? ""
+                                            : params.m_OutputTensorFiles[(j * numOutputs) + i];
+
+                    TensorPrinter printer(inferenceModelParams.m_OutputBindings[i],
+                                          infoOut,
+                                          outputTensorFile,
+                                          params.m_DequantizeOutput);
+                    mapbox::util::apply_visitor(printer, outputs[j][i]);
+                }
+
+                ARMNN_LOG(info) << "\nInference time: " << std::setprecision(2)
+                                << std::fixed << inferenceDuration.count() << " ms\n";
+
+                // If thresholdTime == 0.0 (default), then it hasn't been supplied at command line
+                if (params.m_ThresholdTime != 0.0)
+                {
+                    ARMNN_LOG(info) << "Threshold time: " << std::setprecision(2)
+                                    << std::fixed << params.m_ThresholdTime << " ms";
+                    auto thresholdMinusInference =
+                        params.m_ThresholdTime - duration(inferenceDuration).count();
+                    ARMNN_LOG(info) << "Threshold time - Inference time: " << std::setprecision(2)
+                                    << std::fixed << thresholdMinusInference << " ms" << "\n";
+
+                    if (thresholdMinusInference < 0)
+                    {
+                        ARMNN_LOG(fatal) << "Elapsed inference time is greater than provided threshold time. \n";
+                    }
+                }
+                ++j;
+            }
+            // Print duration difference between overallStartTime and overallEndTime
+            auto overallEndTime = time_point_cast(latestEndTime);
+            auto overallStartTime = time_point_cast(earliestStartTime);
+            auto totalInferenceDuration = overallEndTime - overallStartTime;
+            ARMNN_LOG(info) << "\nOverall Inference time: " << std::setprecision(2)
+                            << std::fixed << totalInferenceDuration.count() << " ms\n";
+        }
+        catch (const armnn::Exception& e)
+        {
+            ARMNN_LOG(fatal) << "Armnn Error: " << e.what();
+            return EXIT_FAILURE;
+        }
+    }
+    // Asynchronous execution using std::launch::async
     else
     {
         try
         {
-            ARMNN_LOG(info) << "Asynchronous Execution... \n";
+            ARMNN_LOG(info) << "Asynchronous Execution with std::launch::async... \n";
             std::vector>>> inferenceResults;
+                std::chrono::duration>>> inferenceResults;
             inferenceResults.reserve(params.m_SimultaneousIterations);

             // Create WorkingMemHandles for each inference
@@ -455,6 +562,8 @@ int MainImpl(const ExecuteNetworkParams& params,
             }

             // Run each inference in its own thread
+            // start a timer
+            const auto start_time = armnn::GetTimeNow();
             for (unsigned int i = 0; i < params.m_SimultaneousIterations; ++i)
             {
                 armnn::experimental::IWorkingMemHandle& workingMemHandleRef = *workingMemHandles[i].get();
@@ -470,7 +579,7 @@
             {
                 // Get the results
                 auto inferenceResult = inferenceResults[j].get();
-                auto inference_duration = std::get<1>(inferenceResult);
+                auto inferenceDuration = std::get<1>(inferenceResult);
                 auto inferenceID = std::get<0>(inferenceResult);

                 if (params.m_GenerateTensorData)
@@ -495,14 +604,14 @@
                 }

                 ARMNN_LOG(info) << "\nInference time: " << std::setprecision(2)
-                                << std::fixed << inference_duration.count() << " ms\n";
+                                << std::fixed << inferenceDuration.count() << " ms\n";

                 // If thresholdTime == 0.0 (default), then it hasn't been supplied at command line
                 if (params.m_ThresholdTime != 0.0)
                 {
                     ARMNN_LOG(info) << "Threshold time: " << std::setprecision(2)
                                     << std::fixed << params.m_ThresholdTime << " ms";
-                    auto thresholdMinusInference = params.m_ThresholdTime - inference_duration.count();
+                    auto thresholdMinusInference = params.m_ThresholdTime - inferenceDuration.count();
                     ARMNN_LOG(info) << "Threshold time - Inference time: " << std::setprecision(2)
                                     << std::fixed << thresholdMinusInference << " ms" << "\n";

@@ -514,13 +623,16 @@
                 ARMNN_LOG(info) << "Asynchronous Execution is finished for Inference ID: " << inferenceID << " \n";
             }
+            // finish timer
+            const auto duration = armnn::GetTimeDuration(start_time);
+            ARMNN_LOG(info) << "\nOverall Inference time: " << std::setprecision(2)
+                            << std::fixed << duration.count() << " ms\n";
         }
         catch (const armnn::Exception& e)
         {
             ARMNN_LOG(fatal) << "Armnn Error: " << e.what();
             return EXIT_FAILURE;
         }
-        }
     }
     }

     catch (const armnn::Exception& e)
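A note on the thread pool timing above: the per-inference time comes from each callback's own start/end stamps, while the overall figure is the wall-clock span from the earliest recorded start to the latest recorded end, not the sum of the individual times. The following self-contained sketch isolates that aggregation; MockCallback is a hypothetical stand-in for armnn::experimental::IAsyncExecutionCallback and only models the two accessors the diff relies on (GetStartTime/GetEndTime).

    // Stand-alone sketch (uses nothing from Arm NN): per-inference and overall timing
    // computed the same way as the thread pool branch in ExecuteNetwork.cpp above.
    #include <chrono>
    #include <iostream>
    #include <vector>

    using Clock = std::chrono::high_resolution_clock;

    struct MockCallback                       // hypothetical stand-in, not an Arm NN type
    {
        Clock::time_point start;
        Clock::time_point end;
        Clock::time_point GetStartTime() const { return start; }
        Clock::time_point GetEndTime() const   { return end; }
    };

    int main()
    {
        std::vector<MockCallback> callbacks(4);
        for (auto& cb : callbacks)            // pretend each inference took ~1 ms
        {
            cb.start = Clock::now();
            cb.end   = cb.start + std::chrono::milliseconds(1);
        }

        Clock::time_point earliestStart{};    // an epoch count of 0 means "not set yet"
        Clock::time_point latestEnd = Clock::now();

        for (const auto& cb : callbacks)
        {
            const double inferenceMs =
                std::chrono::duration<double, std::milli>(cb.GetEndTime() - cb.GetStartTime()).count();
            std::cout << "Inference time: " << inferenceMs << " ms\n";

            if (latestEnd < cb.GetEndTime())
            {
                latestEnd = cb.GetEndTime();
            }
            if (earliestStart.time_since_epoch().count() == 0 || earliestStart > cb.GetStartTime())
            {
                earliestStart = cb.GetStartTime();
            }
        }

        std::cout << "Overall inference time: "
                  << std::chrono::duration<double, std::milli>(latestEnd - earliestStart).count()
                  << " ms\n";
    }

As in the diff, initialising latestEnd to now() before anything has finished is harmless: every end time observed afterwards is later, so the first comparison overwrites it.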
diff --git a/tests/ExecuteNetwork/ExecuteNetworkParams.cpp b/tests/ExecuteNetwork/ExecuteNetworkParams.cpp
index 944efec17f..189ece25a7 100644
--- a/tests/ExecuteNetwork/ExecuteNetworkParams.cpp
+++ b/tests/ExecuteNetwork/ExecuteNetworkParams.cpp
@@ -138,9 +138,11 @@ void ExecuteNetworkParams::ValidateParams()
     CheckModelFormat(m_ModelFormat);

     // Check number of simultaneous iterations
-    if ((m_SimultaneousIterations < 1))
+    // Testing std::launch::async with a single iteration is possible if concurrent is manually set
+    if ((m_SimultaneousIterations <= 1 && m_ThreadPoolSize > 1) ||
+        (m_SimultaneousIterations <= 1 && !m_Concurrent))
     {
-        ARMNN_LOG(fatal) << "simultaneous-iterations cannot be less than 1.";
+        ARMNN_LOG(fatal) << "simultaneous-iterations cannot be less than 2.";
     }

     // Check input tensor shapes
diff --git a/tests/ExecuteNetwork/ExecuteNetworkParams.hpp b/tests/ExecuteNetwork/ExecuteNetworkParams.hpp
index c325df110f..fe0c446087 100644
--- a/tests/ExecuteNetwork/ExecuteNetworkParams.hpp
+++ b/tests/ExecuteNetwork/ExecuteNetworkParams.hpp
@@ -57,6 +57,7 @@ struct ExecuteNetworkParams
     std::string    m_TuningPath;
     std::string    m_MLGOTuningFilePath;
     TfLiteExecutor m_TfLiteExecutor;
+    size_t         m_ThreadPoolSize;

     // Ensures that the parameters for ExecuteNetwork fit together
     void ValidateParams();
diff --git a/tests/ExecuteNetwork/ExecuteNetworkProgramOptions.cpp b/tests/ExecuteNetwork/ExecuteNetworkProgramOptions.cpp
index 042087e4f4..1f57f85252 100644
--- a/tests/ExecuteNetwork/ExecuteNetworkProgramOptions.cpp
+++ b/tests/ExecuteNetwork/ExecuteNetworkProgramOptions.cpp
@@ -195,7 +195,8 @@ ProgramOptions::ProgramOptions() : m_CxxOptions{"ExecuteNetwork",
                 cxxopts::value(m_RuntimeOptions.m_DynamicBackendsPath))

                ("n,concurrent",
-                "If this option is enabled inferences will be executed in parallel asynchronously.",
+                "This option is for Arm NN internal asynchronous testing purposes. "
+                "By default it is set to true if thread-pool-size or simultaneous-iterations are greater than 1",
                 cxxopts::value(m_ExNetParams.m_Concurrent)->default_value("false")->implicit_value("true"))

                ("d,input-tensor-data",
@@ -284,9 +285,15 @@ ProgramOptions::ProgramOptions() : m_CxxOptions{"ExecuteNetwork",
                 "This option is depreciated please use tflite-executor instead",
                 cxxopts::value(m_ExNetParams.m_EnableDelegate)->default_value("false")->implicit_value("true"))

-               ("simultaneous-iterations",
-                "Number of simultaneous iterations to async-run the network for, default is set to 1",
-                cxxopts::value(m_ExNetParams.m_SimultaneousIterations)->default_value("1"));
+               ("simultaneous-iterations",
+                "Number of simultaneous iterations to async-run the network for, default is set to 1 (disabled). "
+                "When thread-pool-size is set the Arm NN thread pool is used. Otherwise std::launch::async is used.",
+                cxxopts::value(m_ExNetParams.m_SimultaneousIterations)->default_value("1"))
+
+               ("thread-pool-size",
+                "Number of Arm NN threads to use when running the network asynchronously via the Arm NN thread pool. "
+                "The default is set to 1",
+                cxxopts::value(m_ExNetParams.m_ThreadPoolSize)->default_value("1"));

         m_CxxOptions.add_options("c) Optimization")
                 ("bf16-turbo-mode",
@@ -453,7 +460,11 @@ void ProgramOptions::ParseOptions(int ac, const char* av[])
                                  "please use tflite-executor instead.");
     }

-
+    // Set concurrent to true if the user expects to run inferences asynchronously
+    if (m_ExNetParams.m_SimultaneousIterations > 1)
+    {
+        m_ExNetParams.m_Concurrent = true;
+    }

     // Parse input tensor shape from the string we got from the command-line.
     std::vector inputTensorShapesVector =
diff --git a/tests/InferenceModel.hpp b/tests/InferenceModel.hpp
index 3429598249..7c51011a22 100644
--- a/tests/InferenceModel.hpp
+++ b/tests/InferenceModel.hpp
@@ -102,6 +102,7 @@ struct Params
     unsigned int  m_NumberOfThreads;
     std::string   m_MLGOTuningFilePath;
     bool          m_AsyncEnabled;
+    size_t        m_ThreadPoolSize;


     Params()
@@ -120,6 +121,7 @@ struct Params
         , m_NumberOfThreads(0)
         , m_MLGOTuningFilePath("")
         , m_AsyncEnabled(false)
+        , m_ThreadPoolSize(1)
     {}
 };

@@ -481,7 +483,8 @@ public:
         const auto loading_start_time = armnn::GetTimeNow();
         armnn::INetworkProperties networkProperties(params.m_AsyncEnabled,
                                                     armnn::MemorySource::Undefined,
-                                                    armnn::MemorySource::Undefined);
+                                                    armnn::MemorySource::Undefined,
+                                                    params.m_ThreadPoolSize);
         std::string errorMessage;

         ret = m_Runtime->LoadNetwork(m_NetworkIdentifier, std::move(optNet), errorMessage, networkProperties);
@@ -632,6 +635,47 @@ public:
         }
     }

+    void RunAsync(const std::vector<TContainer>& inputContainers,
+                  std::vector<TContainer>& outputContainers,
+                  armnn::experimental::IAsyncExecutionCallbackPtr cb)
+    {
+        for (unsigned int i = 0; i < outputContainers.size(); ++i)
+        {
+            const unsigned int expectedOutputDataSize = GetOutputSize(i);
+
+            mapbox::util::apply_visitor([expectedOutputDataSize, i](auto&& value)
+            {
+                const unsigned int actualOutputDataSize = armnn::numeric_cast<unsigned int>(value.size());
+                if (actualOutputDataSize < expectedOutputDataSize)
+                {
+                    unsigned int outputIndex = i;
+                    throw armnn::Exception(
+                        fmt::format("Not enough data for output #{0}: expected "
+                                    "{1} elements, got {2}", outputIndex, expectedOutputDataSize, actualOutputDataSize));
+                }
+            },
+            outputContainers[i]);
+        }
+
+        std::shared_ptr<armnn::IProfiler> profiler = m_Runtime->GetProfiler(m_NetworkIdentifier);
+        if (profiler)
+        {
+            profiler->EnableProfiling(m_EnableProfiling);
+        }
+
+        m_Runtime->Schedule(m_NetworkIdentifier,
+                            MakeInputTensors(inputContainers),
+                            MakeOutputTensors(outputContainers),
+                            armnn::QosExecPriority::Medium,
+                            cb);
+
+        // if profiling is enabled print out the results
+        if (profiler && profiler->IsProfilingEnabled())
+        {
+            profiler->Print(std::cout);
+        }
+    }
+
     const armnn::BindingPointInfo& GetInputBindingInfo(unsigned int inputIndex = 0u) const
     {
         CheckInputIndexIsValid(inputIndex);
--
cgit v1.2.1
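For comparison with the thread pool path, the fallback branch keeps the pre-existing pattern: one std::launch::async task per iteration, each returning an (inference id, duration) tuple that the main thread collects with future::get(), while the overall time is a plain start/stop timer around the whole batch. A compilable, Arm NN-free sketch of that shape, where DummyInference is a placeholder for the call into InferenceModel:

    #include <chrono>
    #include <future>
    #include <iostream>
    #include <thread>
    #include <tuple>
    #include <vector>

    using InferenceResult = std::tuple<unsigned int, std::chrono::duration<double, std::milli>>;

    // Placeholder for InferenceModel::RunAsync(workingMemHandle, inputs, outputs) in the diff.
    InferenceResult DummyInference(unsigned int id)
    {
        const auto start = std::chrono::high_resolution_clock::now();
        std::this_thread::sleep_for(std::chrono::milliseconds(5));   // pretend to run the network
        return { id, std::chrono::high_resolution_clock::now() - start };
    }

    int main()
    {
        const unsigned int simultaneousIterations = 4;
        std::vector<std::future<InferenceResult>> inferenceResults;
        inferenceResults.reserve(simultaneousIterations);

        const auto overallStart = std::chrono::high_resolution_clock::now();
        for (unsigned int i = 0; i < simultaneousIterations; ++i)
        {
            // One thread per iteration, exactly like the std::launch::async branch above.
            inferenceResults.push_back(std::async(std::launch::async, DummyInference, i));
        }

        for (auto& result : inferenceResults)
        {
            const auto [id, duration] = result.get();   // blocks until that inference finishes
            std::cout << "Inference " << id << " took " << duration.count() << " ms\n";
        }

        const std::chrono::duration<double, std::milli> overall =
            std::chrono::high_resolution_clock::now() - overallStart;
        std::cout << "Overall inference time: " << overall.count() << " ms\n";
        return 0;
    }

The trade-off between the two branches is the thread count: this pattern spawns one thread per simultaneous iteration, whereas the new thread pool path schedules the same number of inferences onto a fixed pool of thread-pool-size threads (each with its own working memory handle, as the diff's comment notes) and reads the timings back through callbacks.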