From a04a9d7c11f28c7e932435535e80223782f369f2 Mon Sep 17 00:00:00 2001
From: Sadik Armagan
Date: Tue, 27 Apr 2021 10:02:10 +0100
Subject: IVGCVSW-5775 'Add Async Support to ExecuteNetwork'

* Enabled async mode with the '-n, concurrent' and 'simultaneous-iterations' options in ExecuteNetwork
* The number of input files provided should equal the number of network inputs multiplied by the number
  of simultaneous iterations, with the file paths separated by commas

!armnn:5443

Signed-off-by: Sadik Armagan
Change-Id: Ibeb318010430bf4ae61a02b18b1bf88f3657774c
---
 tests/ExecuteNetwork/ExecuteNetwork.cpp       | 253 ++++++++++++++-------
 tests/ExecuteNetwork/ExecuteNetworkParams.cpp |  23 +-
 tests/ExecuteNetwork/ExecuteNetworkParams.hpp |   2 +
 .../ExecuteNetworkProgramOptions.cpp          |  10 +-
 4 files changed, 208 insertions(+), 80 deletions(-)

diff --git a/tests/ExecuteNetwork/ExecuteNetwork.cpp b/tests/ExecuteNetwork/ExecuteNetwork.cpp
index 60e4ec3401..2bbb51783c 100644
--- a/tests/ExecuteNetwork/ExecuteNetwork.cpp
+++ b/tests/ExecuteNetwork/ExecuteNetwork.cpp
@@ -279,7 +279,8 @@ int MainImpl(const ExecuteNetworkParams& params,
     using TContainer =
            mapbox::util::variant<std::vector<float>, std::vector<int>, std::vector<unsigned char>, std::vector<int8_t>>;
 
-    std::vector<TContainer> inputDataContainers;
+    std::vector<std::vector<TContainer>> inputs;
+    std::vector<std::vector<TContainer>> outputs;
 
     try
     {
@@ -298,6 +299,7 @@ int MainImpl(const ExecuteNetworkParams& params,
         inferenceModelParams.m_CachedNetworkFilePath  = params.m_CachedNetworkFilePath;
         inferenceModelParams.m_NumberOfThreads        = params.m_NumberOfThreads;
         inferenceModelParams.m_MLGOTuningFilePath     = params.m_MLGOTuningFilePath;
+        inferenceModelParams.m_AsyncEnabled           = params.m_Concurrent;
 
         for(const std::string& inputName: params.m_InputNames)
         {
@@ -324,106 +326,201 @@ int MainImpl(const ExecuteNetworkParams& params,
                                      runtime);
 
         const size_t numInputs = inferenceModelParams.m_InputBindings.size();
-        for(unsigned int i = 0; i < numInputs; ++i)
+
+        armnn::Optional<QuantizationParams> qParams = params.m_QuantizeInput ?
+                                                      armnn::MakeOptional<QuantizationParams>(
+                                                          model.GetInputQuantizationParams()) :
+                                                      armnn::EmptyOptional();
+
+        for(unsigned int j = 0; j < params.m_SimultaneousIterations ; ++j)
         {
-            armnn::Optional<QuantizationParams> qParams = params.m_QuantizeInput ?
-                                                          armnn::MakeOptional<QuantizationParams>(
-                                                              model.GetInputQuantizationParams()) :
-                                                          armnn::EmptyOptional();
-
-            armnn::Optional<std::string> dataFile = params.m_GenerateTensorData ?
-                                                    armnn::EmptyOptional() :
-                                                    armnn::MakeOptional<std::string>(
-                                                        params.m_InputTensorDataFilePaths[i]);
-
-            unsigned int numElements = model.GetInputSize(i);
-            if (params.m_InputTensorShapes.size() > i && params.m_InputTensorShapes[i])
+            std::vector<TContainer> inputDataContainers;
+            for(unsigned int i = 0; i < numInputs; ++i)
             {
-                // If the user has provided a tensor shape for the current input,
-                // override numElements
-                numElements = params.m_InputTensorShapes[i]->GetNumElements();
-            }
+                armnn::Optional<std::string> dataFile = params.m_GenerateTensorData ?
+                                                        armnn::EmptyOptional() :
+                                                        armnn::MakeOptional<std::string>(
+                                                            params.m_InputTensorDataFilePaths[(j * numInputs) + i]);
 
-            TContainer tensorData;
-            PopulateTensorWithData(tensorData,
-                                   numElements,
-                                   params.m_InputTypes[i],
-                                   qParams,
-                                   dataFile);
+                unsigned int numElements = model.GetInputSize(i);
+                if (params.m_InputTensorShapes.size() > i && params.m_InputTensorShapes[i])
+                {
+                    // If the user has provided a tensor shape for the current input,
+                    // override numElements
+                    numElements = params.m_InputTensorShapes[i]->GetNumElements();
+                }
+
+                TContainer tensorData;
+                PopulateTensorWithData(tensorData,
+                                       numElements,
+                                       params.m_InputTypes[i],
+                                       qParams,
+                                       dataFile);
 
-            inputDataContainers.push_back(tensorData);
+                inputDataContainers.push_back(tensorData);
+            }
+            inputs.push_back(inputDataContainers);
         }
 
         const size_t numOutputs = inferenceModelParams.m_OutputBindings.size();
-        std::vector<TContainer> outputDataContainers;
 
-        for (unsigned int i = 0; i < numOutputs; ++i)
+        for (unsigned int j = 0; j < params.m_SimultaneousIterations; ++j)
         {
-            if (params.m_OutputTypes[i].compare("float") == 0)
-            {
-                outputDataContainers.push_back(std::vector<float>(model.GetOutputSize(i)));
-            }
-            else if (params.m_OutputTypes[i].compare("int") == 0)
-            {
-                outputDataContainers.push_back(std::vector<int>(model.GetOutputSize(i)));
-            }
-            else if (params.m_OutputTypes[i].compare("qasymm8") == 0)
-            {
-                outputDataContainers.push_back(std::vector<uint8_t>(model.GetOutputSize(i)));
-            }
-            else if (params.m_OutputTypes[i].compare("qsymms8") == 0)
+            std::vector<TContainer> outputDataContainers;
+            for (unsigned int i = 0; i < numOutputs; ++i)
             {
-                outputDataContainers.push_back(std::vector<int8_t>(model.GetOutputSize(i)));
-            }
-            else
-            {
-                ARMNN_LOG(fatal) << "Unsupported tensor data type \"" << params.m_OutputTypes[i] << "\". ";
-                return EXIT_FAILURE;
+                if (params.m_OutputTypes[i].compare("float") == 0)
+                {
+                    outputDataContainers.push_back(std::vector<float>(model.GetOutputSize(i)));
+                } else if (params.m_OutputTypes[i].compare("int") == 0)
+                {
+                    outputDataContainers.push_back(std::vector<int>(model.GetOutputSize(i)));
+                } else if (params.m_OutputTypes[i].compare("qasymm8") == 0)
+                {
+                    outputDataContainers.push_back(std::vector<uint8_t>(model.GetOutputSize(i)));
+                } else if (params.m_OutputTypes[i].compare("qsymms8") == 0)
+                {
+                    outputDataContainers.push_back(std::vector<int8_t>(model.GetOutputSize(i)));
+                } else
+                {
+                    ARMNN_LOG(fatal) << "Unsupported tensor data type \"" << params.m_OutputTypes[i] << "\". ";
+                    return EXIT_FAILURE;
+                }
             }
+            outputs.push_back(outputDataContainers);
         }
 
-        for (size_t x = 0; x < params.m_Iterations; x++)
-        {
-            // model.Run returns the inference time elapsed in EnqueueWorkload (in milliseconds)
-            auto inference_duration = model.Run(inputDataContainers, outputDataContainers);
-
-            if (params.m_GenerateTensorData)
-            {
-                ARMNN_LOG(warning) << "The input data was generated, note that the output will not be useful";
-            }
-
-            // Print output tensors
-            const auto& infosOut = model.GetOutputBindingInfos();
-            for (size_t i = 0; i < numOutputs; i++)
-            {
-                const armnn::TensorInfo& infoOut = infosOut[i].second;
-                auto outputTensorFile = params.m_OutputTensorFiles.empty() ? "" : params.m_OutputTensorFiles[i];
-
-                TensorPrinter printer(inferenceModelParams.m_OutputBindings[i],
-                                      infoOut,
-                                      outputTensorFile,
-                                      params.m_DequantizeOutput);
-                mapbox::util::apply_visitor(printer, outputDataContainers[i]);
-            }
-
-            ARMNN_LOG(info) << "\nInference time: " << std::setprecision(2)
-                            << std::fixed << inference_duration.count() << " ms\n";
-
-            // If thresholdTime == 0.0 (default), then it hasn't been supplied at command line
-            if (params.m_ThresholdTime != 0.0)
-            {
-                ARMNN_LOG(info) << "Threshold time: " << std::setprecision(2)
-                                << std::fixed << params.m_ThresholdTime << " ms";
-                auto thresholdMinusInference = params.m_ThresholdTime - inference_duration.count();
-                ARMNN_LOG(info) << "Threshold time - Inference time: " << std::setprecision(2)
-                                << std::fixed << thresholdMinusInference << " ms" << "\n";
-
-                if (thresholdMinusInference < 0)
-                {
-                    std::string errorMessage = "Elapsed inference time is greater than provided threshold time.";
-                    ARMNN_LOG(fatal) << errorMessage;
-                }
-            }
-        }
"" : params.m_OutputTensorFiles[i]; - - TensorPrinter printer(inferenceModelParams.m_OutputBindings[i], - infoOut, - outputTensorFile, - params.m_DequantizeOutput); - mapbox::util::apply_visitor(printer, outputDataContainers[i]); - } + if (params.m_GenerateTensorData) + { + ARMNN_LOG(warning) << "The input data was generated, note that the output will not be useful"; + } + + // Print output tensors + const auto& infosOut = model.GetOutputBindingInfos(); + for (size_t i = 0; i < numOutputs; i++) + { + const armnn::TensorInfo& infoOut = infosOut[i].second; + auto outputTensorFile = params.m_OutputTensorFiles.empty() ? "" : params.m_OutputTensorFiles[i]; + + TensorPrinter printer(inferenceModelParams.m_OutputBindings[i], + infoOut, + outputTensorFile, + params.m_DequantizeOutput); + mapbox::util::apply_visitor(printer, outputs[0][i]); + } - ARMNN_LOG(info) << "\nInference time: " << std::setprecision(2) - << std::fixed << inference_duration.count() << " ms\n"; + ARMNN_LOG(info) << "\nInference time: " << std::setprecision(2) + << std::fixed << inference_duration.count() << " ms\n"; - // If thresholdTime == 0.0 (default), then it hasn't been supplied at command line - if (params.m_ThresholdTime != 0.0) + // If thresholdTime == 0.0 (default), then it hasn't been supplied at command line + if (params.m_ThresholdTime != 0.0) + { + ARMNN_LOG(info) << "Threshold time: " << std::setprecision(2) + << std::fixed << params.m_ThresholdTime << " ms"; + auto thresholdMinusInference = params.m_ThresholdTime - inference_duration.count(); + ARMNN_LOG(info) << "Threshold time - Inference time: " << std::setprecision(2) + << std::fixed << thresholdMinusInference << " ms" << "\n"; + + if (thresholdMinusInference < 0) + { + std::string errorMessage = "Elapsed inference time is greater than provided threshold time."; + ARMNN_LOG(fatal) << errorMessage; + } + } + } + } + else + { + try { - ARMNN_LOG(info) << "Threshold time: " << std::setprecision(2) - << std::fixed << params.m_ThresholdTime << " ms"; - auto thresholdMinusInference = params.m_ThresholdTime - inference_duration.count(); - ARMNN_LOG(info) << "Threshold time - Inference time: " << std::setprecision(2) - << std::fixed << thresholdMinusInference << " ms" << "\n"; + ARMNN_LOG(info) << "Asynchronous Execution... 
\n"; + std::vector>>> inferenceResults; + inferenceResults.reserve(params.m_SimultaneousIterations); + + // Create WorkingMemHandles for each inference + std::vector> workingMemHandles; + workingMemHandles.reserve(params.m_SimultaneousIterations); + for (unsigned int i = 0; i < params.m_SimultaneousIterations; ++i) + { + workingMemHandles.push_back(model.CreateWorkingMemHandle()); + } + + // Run each inference in its own thread + for (unsigned int i = 0; i < params.m_SimultaneousIterations; ++i) + { + armnn::experimental::IWorkingMemHandle& workingMemHandleRef = *workingMemHandles[i].get(); + inferenceResults.push_back(std::async( + std::launch::async, [&model, &workingMemHandleRef, &inputs, &outputs, i]() { + return model.RunAsync(workingMemHandleRef, inputs[i], outputs[i]); + } + )); + } - if (thresholdMinusInference < 0) + // Check the results + for (unsigned int j = 0; j < inferenceResults.size(); ++j) { - std::string errorMessage = "Elapsed inference time is greater than provided threshold time."; - ARMNN_LOG(fatal) << errorMessage; + // Get the results + auto inferenceResult = inferenceResults[j].get(); + auto inference_duration = std::get<1>(inferenceResult); + auto inferenceID = std::get<0>(inferenceResult); + + if (params.m_GenerateTensorData) + { + ARMNN_LOG(warning) << "The input data was generated, note that the output will not be useful"; + } + + // Print output tensors + const auto& infosOut = model.GetOutputBindingInfos(); + for (size_t i = 0; i < numOutputs; i++) + { + const armnn::TensorInfo& infoOut = infosOut[i].second; + auto outputTensorFile = params.m_OutputTensorFiles.empty() + ? "" + : params.m_OutputTensorFiles[(j * numOutputs) + i]; + + TensorPrinter printer(inferenceModelParams.m_OutputBindings[i], + infoOut, + outputTensorFile, + params.m_DequantizeOutput); + mapbox::util::apply_visitor(printer, outputs[j][i]); + } + + ARMNN_LOG(info) << "\nInference time: " << std::setprecision(2) + << std::fixed << inference_duration.count() << " ms\n"; + + // If thresholdTime == 0.0 (default), then it hasn't been supplied at command line + if (params.m_ThresholdTime != 0.0) + { + ARMNN_LOG(info) << "Threshold time: " << std::setprecision(2) + << std::fixed << params.m_ThresholdTime << " ms"; + auto thresholdMinusInference = params.m_ThresholdTime - inference_duration.count(); + ARMNN_LOG(info) << "Threshold time - Inference time: " << std::setprecision(2) + << std::fixed << thresholdMinusInference << " ms" << "\n"; + + if (thresholdMinusInference < 0) + { + ARMNN_LOG(fatal) << "Elapsed inference time is greater than provided threshold time. \n"; + } + } + ARMNN_LOG(info) << "Asynchronous Execution is finished for Inference ID: " << inferenceID << " \n"; + } } + catch (const armnn::Exception& e) + { + ARMNN_LOG(fatal) << "Armnn Error: " << e.what(); + return EXIT_FAILURE; + } + } } catch (const armnn::Exception& e) diff --git a/tests/ExecuteNetwork/ExecuteNetworkParams.cpp b/tests/ExecuteNetwork/ExecuteNetworkParams.cpp index 4e3b5e313d..8f1cb0b599 100644 --- a/tests/ExecuteNetwork/ExecuteNetworkParams.cpp +++ b/tests/ExecuteNetwork/ExecuteNetworkParams.cpp @@ -145,6 +145,12 @@ void ExecuteNetworkParams::ValidateParams() CheckModelFormat(m_ModelFormat); + // Check number of simultaneous iterations + if ((m_SimultaneousIterations < 1)) + { + ARMNN_LOG(fatal) << "simultaneous-iterations cannot be less than 1. 
"; + } + // Check input tensor shapes if ((m_InputTensorShapes.size() != 0) && (m_InputTensorShapes.size() != m_InputNames.size())) @@ -159,10 +165,19 @@ void ExecuteNetworkParams::ValidateParams() ARMNN_LOG(fatal) << "One or more input data file paths are not valid. "; } - if (m_InputTensorDataFilePaths.size() != m_InputNames.size()) + if (!m_Concurrent && m_InputTensorDataFilePaths.size() != m_InputNames.size()) { ARMNN_LOG(fatal) << "input-name and input-tensor-data must have the same amount of elements. "; } + + if (m_InputTensorDataFilePaths.size() < m_SimultaneousIterations * m_InputNames.size()) + { + ARMNN_LOG(fatal) << "There is not enough input data for " << m_SimultaneousIterations << " execution."; + } + if (m_InputTensorDataFilePaths.size() > m_SimultaneousIterations * m_InputNames.size()) + { + ARMNN_LOG(fatal) << "There is more input data for " << m_SimultaneousIterations << " execution."; + } } if ((m_OutputTensorFiles.size() != 0) && @@ -171,6 +186,12 @@ void ExecuteNetworkParams::ValidateParams() ARMNN_LOG(fatal) << "output-name and write-outputs-to-file must have the same amount of elements. "; } + if ((m_OutputTensorFiles.size() != 0) + && m_OutputTensorFiles.size() != m_SimultaneousIterations * m_OutputNames.size()) + { + ARMNN_LOG(fatal) << "There is not enough output data for " << m_SimultaneousIterations << " execution."; + } + if (m_InputTypes.size() == 0) { //Defaults the value of all inputs to "float" diff --git a/tests/ExecuteNetwork/ExecuteNetworkParams.hpp b/tests/ExecuteNetwork/ExecuteNetworkParams.hpp index a19eaa9346..c325df110f 100644 --- a/tests/ExecuteNetwork/ExecuteNetworkParams.hpp +++ b/tests/ExecuteNetwork/ExecuteNetworkParams.hpp @@ -23,6 +23,7 @@ struct ExecuteNetworkParams std::string m_CachedNetworkFilePath; std::vector m_ComputeDevices; + bool m_Concurrent; bool m_DequantizeOutput; std::string m_DynamicBackendsPath; bool m_EnableBf16TurboMode; @@ -49,6 +50,7 @@ struct ExecuteNetworkParams bool m_PrintIntermediate; bool m_QuantizeInput; bool m_SaveCachedNetwork; + size_t m_SimultaneousIterations; size_t m_SubgraphId; double m_ThresholdTime; int m_TuningLevel; diff --git a/tests/ExecuteNetwork/ExecuteNetworkProgramOptions.cpp b/tests/ExecuteNetwork/ExecuteNetworkProgramOptions.cpp index 286c970d72..042087e4f4 100644 --- a/tests/ExecuteNetwork/ExecuteNetworkProgramOptions.cpp +++ b/tests/ExecuteNetwork/ExecuteNetworkProgramOptions.cpp @@ -194,6 +194,10 @@ ProgramOptions::ProgramOptions() : m_CxxOptions{"ExecuteNetwork", "If left empty (the default), dynamic backends will not be used.", cxxopts::value(m_RuntimeOptions.m_DynamicBackendsPath)) + ("n,concurrent", + "If this option is enabled inferences will be executed in parallel asynchronously.", + cxxopts::value(m_ExNetParams.m_Concurrent)->default_value("false")->implicit_value("true")) + ("d,input-tensor-data", "Path to files containing the input data as a flat array separated by whitespace. " "Several paths can be passed by separating them with a comma. If not specified, the network will be " @@ -278,7 +282,11 @@ ProgramOptions::ProgramOptions() : m_CxxOptions{"ExecuteNetwork", ("D,armnn-tflite-delegate", "Enable Arm NN TfLite delegate. 
" "This option is depreciated please use tflite-executor instead", - cxxopts::value(m_ExNetParams.m_EnableDelegate)->default_value("false")->implicit_value("true")); + cxxopts::value(m_ExNetParams.m_EnableDelegate)->default_value("false")->implicit_value("true")) + + ("simultaneous-iterations", + "Number of simultaneous iterations to async-run the network for, default is set to 1", + cxxopts::value(m_ExNetParams.m_SimultaneousIterations)->default_value("1")); m_CxxOptions.add_options("c) Optimization") ("bf16-turbo-mode", -- cgit v1.2.1