author    Kevin May <kevin.may@arm.com>  2021-05-21 16:42:21 +0100
committer Kevin May <kevin.may@arm.com>  2021-05-26 11:56:54 +0000
commit    b4b3ac91990eb5deaffca2300319f2ddf7aa0886 (patch)
tree      e480826fe604d652877459ce4bbf4314a461e4b2
parent    401c1c3f973da1a2e2cef7f88a5aac2cf295fac7 (diff)
download  armnn-b4b3ac91990eb5deaffca2300319f2ddf7aa0886.tar.gz
IVGCVSW-6009 Integrate threadpool into ExNet
* Remove concurrent flag from ExecuteNetwork as it can be deduced when SimultaneousIterations > 1
* Add void RunAsync()
* Refactor some unit tests

Change-Id: I7021d4821b0e460470908294cbd9462850e8b361
Signed-off-by: Keith Davis <keith.davis@arm.com>
Signed-off-by: Kevin May <kevin.may@arm.com>
-rw-r--r--  include/armnn/IRuntime.hpp                                          |   4
-rw-r--r--  src/armnn/LoadedNetwork.cpp                                         |   8
-rw-r--r--  src/backends/backendsCommon/test/StridedSliceAsyncEndToEndTest.hpp  | 109
-rw-r--r--  src/backends/reference/test/RefEndToEndTests.cpp                    |  14
-rw-r--r--  tests/ExecuteNetwork/ExecuteNetwork.cpp                             | 130
-rw-r--r--  tests/ExecuteNetwork/ExecuteNetworkParams.cpp                       |   6
-rw-r--r--  tests/ExecuteNetwork/ExecuteNetworkParams.hpp                       |   1
-rw-r--r--  tests/ExecuteNetwork/ExecuteNetworkProgramOptions.cpp               |  21
-rw-r--r--  tests/InferenceModel.hpp                                            |  46
9 files changed, 206 insertions(+), 133 deletions(-)
diff --git a/include/armnn/IRuntime.hpp b/include/armnn/IRuntime.hpp
index 870e027f33..bfc13c9c01 100644
--- a/include/armnn/IRuntime.hpp
+++ b/include/armnn/IRuntime.hpp
@@ -33,7 +33,7 @@ struct INetworkProperties
INetworkProperties(bool importEnabled = false,
bool exportEnabled = false,
bool asyncEnabled = false,
- size_t numThreads = 0)
+ size_t numThreads = 1)
: m_ImportEnabled(importEnabled)
, m_ExportEnabled(exportEnabled)
, m_AsyncEnabled(asyncEnabled)
@@ -45,7 +45,7 @@ struct INetworkProperties
INetworkProperties(bool asyncEnabled,
MemorySource m_InputSource,
MemorySource m_OutputSource,
- size_t numThreads = 0)
+ size_t numThreads = 1)
: m_ImportEnabled(m_InputSource != MemorySource::Undefined)
, m_ExportEnabled(m_OutputSource != MemorySource::Undefined)
, m_AsyncEnabled(asyncEnabled)
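For context, a minimal sketch of how a caller might opt in to the thread pool through the updated constructor, assuming an already-built INetwork named "net" and CpuRef chosen purely for illustration:

// Sketch only, not part of this patch. asyncEnabled = true together with
// numThreads > 1 makes LoadedNetwork create its internal thread pool;
// numThreads == 1 (the new default) keeps async execution without a pool.
armnn::IRuntime::CreationOptions options;
armnn::IRuntimePtr runtime = armnn::IRuntime::Create(options);

armnn::IOptimizedNetworkPtr optNet =
    armnn::Optimize(*net, {armnn::Compute::CpuRef}, runtime->GetDeviceSpec());

armnn::INetworkProperties networkProperties(true,                           // asyncEnabled
                                            armnn::MemorySource::Undefined,
                                            armnn::MemorySource::Undefined,
                                            3);                             // numThreads

armnn::NetworkId networkId;
std::string errorMessage;
runtime->LoadNetwork(networkId, std::move(optNet), errorMessage, networkProperties);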
diff --git a/src/armnn/LoadedNetwork.cpp b/src/armnn/LoadedNetwork.cpp
index 53a9e18863..25f8ddf6e5 100644
--- a/src/armnn/LoadedNetwork.cpp
+++ b/src/armnn/LoadedNetwork.cpp
@@ -169,17 +169,17 @@ LoadedNetwork::LoadedNetwork(std::unique_ptr<IOptimizedNetwork> net,
}
// Create the thread pool which will have working memory handles assigned to each thread
- // Should occur after factories are registered so thet the WorkingMemHandles can be created
- if (m_NetworkProperties.m_NumThreads > 0 && networkProperties.m_AsyncEnabled)
+ // Should occur after factories are registered so that the WorkingMemHandles can be created
+ if (m_NetworkProperties.m_NumThreads > 1 && networkProperties.m_AsyncEnabled)
{
CreateThreadPool(m_NetworkProperties.m_NumThreads);
}
if (!networkProperties.m_AsyncEnabled)
{
- for (auto &&layer : order)
+ for (auto&& layer : order)
{
- auto &workloadFactory = GetWorkloadFactory(*layer);
+ auto& workloadFactory = GetWorkloadFactory(*layer);
switch (layer->GetType())
{
diff --git a/src/backends/backendsCommon/test/StridedSliceAsyncEndToEndTest.hpp b/src/backends/backendsCommon/test/StridedSliceAsyncEndToEndTest.hpp
index 87cade7dca..0ad6bc4964 100644
--- a/src/backends/backendsCommon/test/StridedSliceAsyncEndToEndTest.hpp
+++ b/src/backends/backendsCommon/test/StridedSliceAsyncEndToEndTest.hpp
@@ -123,7 +123,7 @@ void AsyncEndToEndTestImpl(INetworkPtr network,
const std::map<int, std::vector<TOutput>>& expectedOutputData,
std::vector<BackendId> backends,
float tolerance = 0.000001f,
- size_t numThreads = 0)
+ size_t numThreads = 1)
{
// Create Runtime in which test will run
IRuntime::CreationOptions options;
@@ -161,7 +161,7 @@ void AsyncEndToEndTestImpl(INetworkPtr network,
outputStorage.at(it.first).data())});
}
- if (numThreads == 0)
+ if (numThreads <= 1)
{
// Create WorkingMemHandle for this async network
std::unique_ptr<IWorkingMemHandle> workingMemHandle = runtime->CreateWorkingMemHandle(networkId);
@@ -254,7 +254,7 @@ INetworkPtr CreateStridedSliceNetwork(const TensorShape& inputShape,
}
template<armnn::DataType ArmnnType>
-void StridedSlicedEndToEndTest(const std::vector<BackendId>& backends)
+void StridedSlicedEndToEndTest(const std::vector<BackendId>& backends, size_t numThreads)
{
using namespace armnn;
using T = ResolveType<ArmnnType>;
@@ -300,103 +300,12 @@ void StridedSlicedEndToEndTest(const std::vector<BackendId>& backends)
std::map<int, std::vector<T>> inputTensorData = {{0, inputData}};
std::map<int, std::vector<T>> expectedOutputData = {{0, outputExpected}};
- AsyncEndToEndTestImpl<ArmnnType, ArmnnType>(move(net), inputTensorData, expectedOutputData, backends, 0.000001f);
-}
-
-template<armnn::DataType ArmnnType>
-void AsyncScheduledStridedSlicedEndToEndTest(const std::vector<BackendId>& backends)
-{
- using namespace armnn;
- using T = ResolveType<ArmnnType>;
-
- const TensorShape& inputShape = {3, 2, 3, 1};
- const TensorShape& outputShape = {1, 2, 3, 1};
- const std::vector<int>& beginData = {1, 0, 0, 0};
- const std::vector<int>& endData = {2, 2, 3, 1};
- const std::vector<int>& stridesData = {1, 1, 1, 1};
- int beginMask = 0;
- int endMask = 0;
- int shrinkAxisMask = 0;
- int ellipsisMask = 0;
- int newAxisMask = 0;
-
- // Builds up the structure of the network
- INetworkPtr net = CreateStridedSliceNetwork<ArmnnType>(inputShape,
- outputShape,
- beginData,
- endData,
- stridesData,
- beginMask,
- endMask,
- shrinkAxisMask,
- ellipsisMask,
- newAxisMask);
-
- // Creates structures for input & output.
- std::vector<T> inputData{
- 1.0f, 1.0f, 1.0f, 2.0f, 2.0f, 2.0f,
-
- 3.0f, 3.0f, 3.0f, 4.0f, 4.0f, 4.0f,
-
- 5.0f, 5.0f, 5.0f, 6.0f, 6.0f, 6.0f
- };
-
- std::vector<T> outputExpected{
- 3.0f, 3.0f, 3.0f, 4.0f, 4.0f, 4.0f
- };
-
- std::map<int, std::vector<T>> inputTensorData = {{0, inputData}};
- std::map<int, std::vector<T>> expectedOutputData = {{0, outputExpected}};
-
- AsyncEndToEndTestImpl<ArmnnType, ArmnnType>(move(net), inputTensorData, expectedOutputData, backends, 0.000001f, 1);
-}
-
-template<armnn::DataType ArmnnType>
-void AsyncScheduledStridedSlicedMultiThreadedEndToEndTest(const std::vector<BackendId>& backends)
-{
- using namespace armnn;
- using T = ResolveType<ArmnnType>;
-
- const TensorShape& inputShape = {3, 2, 3, 1};
- const TensorShape& outputShape = {1, 2, 3, 1};
- const std::vector<int>& beginData = {1, 0, 0, 0};
- const std::vector<int>& endData = {2, 2, 3, 1};
- const std::vector<int>& stridesData = {1, 1, 1, 1};
- int beginMask = 0;
- int endMask = 0;
- int shrinkAxisMask = 0;
- int ellipsisMask = 0;
- int newAxisMask = 0;
-
- // Builds up the structure of the network
- INetworkPtr net = CreateStridedSliceNetwork<ArmnnType>(inputShape,
- outputShape,
- beginData,
- endData,
- stridesData,
- beginMask,
- endMask,
- shrinkAxisMask,
- ellipsisMask,
- newAxisMask);
-
- // Creates structures for input & output.
- std::vector<T> inputData{
- 1.0f, 1.0f, 1.0f, 2.0f, 2.0f, 2.0f,
-
- 3.0f, 3.0f, 3.0f, 4.0f, 4.0f, 4.0f,
-
- 5.0f, 5.0f, 5.0f, 6.0f, 6.0f, 6.0f
- };
-
- std::vector<T> outputExpected{
- 3.0f, 3.0f, 3.0f, 4.0f, 4.0f, 4.0f
- };
-
- std::map<int, std::vector<T>> inputTensorData = {{0, inputData}};
- std::map<int, std::vector<T>> expectedOutputData = {{0, outputExpected}};
-
- AsyncEndToEndTestImpl<ArmnnType, ArmnnType>(move(net), inputTensorData, expectedOutputData, backends, 0.000001f, 3);
+ AsyncEndToEndTestImpl<ArmnnType, ArmnnType>(move(net),
+ inputTensorData,
+ expectedOutputData,
+ backends,
+ 0.000001f,
+ numThreads);
}
template<armnn::DataType ArmnnType>
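The numThreads parameter threaded through above is what selects between the two async paths inside AsyncEndToEndTestImpl. A hedged sketch of that dispatch, assuming inputTensors and outputTensors are already populated; Execute and Schedule are the experimental IRuntime entry points this patch relies on, so treat this as illustrative:

// numThreads <= 1: execute directly on an explicit working memory handle.
// numThreads > 1:  the LoadedNetwork owns a thread pool, so schedule instead.
if (numThreads <= 1)
{
    std::unique_ptr<IWorkingMemHandle> workingMemHandle =
        runtime->CreateWorkingMemHandle(networkId);
    runtime->Execute(*workingMemHandle, inputTensors, outputTensors);
}
else
{
    IAsyncExecutionCallbackPtr cb = std::make_shared<AsyncExecutionCallback>();
    runtime->Schedule(networkId, inputTensors, outputTensors,
                      QosExecPriority::Medium, cb);
}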
diff --git a/src/backends/reference/test/RefEndToEndTests.cpp b/src/backends/reference/test/RefEndToEndTests.cpp
index 54b16dac38..910df292a9 100644
--- a/src/backends/reference/test/RefEndToEndTests.cpp
+++ b/src/backends/reference/test/RefEndToEndTests.cpp
@@ -1336,9 +1336,9 @@ BOOST_AUTO_TEST_CASE(RefStridedSliceInvalidSliceEndToEndTest)
StridedSliceInvalidSliceEndToEndTest(defaultBackends);
}
-BOOST_AUTO_TEST_CASE(RefAsyncFP32StridedSlicedEndToEndTest)
+BOOST_AUTO_TEST_CASE(RefThreadSafeFP32StridedSlicedEndToEndTest)
{
- armnn::experimental::StridedSlicedEndToEndTest<armnn::DataType::Float32>(defaultBackends);
+ armnn::experimental::StridedSlicedEndToEndTest<armnn::DataType::Float32>(defaultBackends, 1);
}
BOOST_AUTO_TEST_CASE(RefAsyncFP32StridedSlicedMultiThreadedEndToEndTest)
@@ -1346,15 +1346,9 @@ BOOST_AUTO_TEST_CASE(RefAsyncFP32StridedSlicedMultiThreadedEndToEndTest)
armnn::experimental::StridedSlicedMultiThreadedEndToEndTest<armnn::DataType::Float32>(defaultBackends);
}
-BOOST_AUTO_TEST_CASE(RefAsyncScheduledFP32StridedSlicedEndToEndTest)
+BOOST_AUTO_TEST_CASE(RefAsyncFP32StridedSlicedScheduledMultiThreadedEndToEndTest)
{
- armnn::experimental::AsyncScheduledStridedSlicedEndToEndTest<armnn::DataType::Float32>(defaultBackends);
-}
-
-BOOST_AUTO_TEST_CASE(RefAsyncScheduledStridedSlicedMultiThreadedEndToEndTest)
-{
- using namespace armnn::experimental;
- AsyncScheduledStridedSlicedMultiThreadedEndToEndTest<armnn::DataType::Float32>(defaultBackends);
+ armnn::experimental::StridedSlicedEndToEndTest<armnn::DataType::Float32>(defaultBackends, 3);
}
#endif
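With the test now parameterised on the thread count, additional pool widths become one-line variants; a hypothetical extra case (not part of this patch) would look like:

BOOST_AUTO_TEST_CASE(RefAsyncFP32StridedSlicedFiveThreadedEndToEndTest)
{
    armnn::experimental::StridedSlicedEndToEndTest<armnn::DataType::Float32>(defaultBackends, 5);
}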
diff --git a/tests/ExecuteNetwork/ExecuteNetwork.cpp b/tests/ExecuteNetwork/ExecuteNetwork.cpp
index 2bbb51783c..cd760a8199 100644
--- a/tests/ExecuteNetwork/ExecuteNetwork.cpp
+++ b/tests/ExecuteNetwork/ExecuteNetwork.cpp
@@ -5,6 +5,8 @@
#include "NetworkExecutionUtils/NetworkExecutionUtils.hpp"
#include "ExecuteNetworkProgramOptions.hpp"
+#include <armnn/IAsyncExecutionCallback.hpp>
+#include <AsyncExecutionCallback.hpp>
#include <armnn/Logging.hpp>
#include <Filesystem.hpp>
@@ -276,8 +278,7 @@ template<typename TParser, typename TDataType>
int MainImpl(const ExecuteNetworkParams& params,
const std::shared_ptr<armnn::IRuntime>& runtime = nullptr)
{
- using TContainer =
- mapbox::util::variant<std::vector<float>, std::vector<int>, std::vector<unsigned char>, std::vector<int8_t>>;
+ using namespace std::chrono;
std::vector<std::vector<TContainer>> inputs;
std::vector<std::vector<TContainer>> outputs;
@@ -300,6 +301,7 @@ int MainImpl(const ExecuteNetworkParams& params,
inferenceModelParams.m_NumberOfThreads = params.m_NumberOfThreads;
inferenceModelParams.m_MLGOTuningFilePath = params.m_MLGOTuningFilePath;
inferenceModelParams.m_AsyncEnabled = params.m_Concurrent;
+ inferenceModelParams.m_ThreadPoolSize = params.m_ThreadPoolSize;
for(const std::string& inputName: params.m_InputNames)
{
@@ -390,9 +392,9 @@ int MainImpl(const ExecuteNetworkParams& params,
outputs.push_back(outputDataContainers);
}
+ // Synchronous execution
if (!params.m_Concurrent)
{
- // Synchronous Execution
for (size_t x = 0; x < params.m_Iterations; x++)
{
// model.Run returns the inference time elapsed in EnqueueWorkload (in milliseconds)
@@ -437,13 +439,118 @@ int MainImpl(const ExecuteNetworkParams& params,
}
}
}
+ // Asynchronous execution using the Arm NN thread pool
+ else if (params.m_ThreadPoolSize >= 2)
+ {
+ try
+ {
+ ARMNN_LOG(info) << "Asynchronous execution with Arm NN thread pool... \n";
+ std::vector<armnn::experimental::IAsyncExecutionCallbackPtr> callbacks;
+
+ // Create callbacks that will be checked post scheduling
+ for (size_t i = 0; i < params.m_SimultaneousIterations; ++i)
+ {
+ // Point to ArmNN example implementation of AsyncExecutionCallback
+ callbacks.emplace_back(std::make_shared<armnn::experimental::AsyncExecutionCallback>());
+ }
+
+ // Declare the latest and earliest inference times here to be used when calculating overall time
+ std::chrono::high_resolution_clock::time_point earliestStartTime;
+ std::chrono::high_resolution_clock::time_point latestEndTime =
+ std::chrono::high_resolution_clock::now();
+
+ // For asynchronous execution, a pool of working memory handles (one per thread) is added to the
+ // LoadedNetwork, and each scheduled inference is given a specific priority
+ for (size_t i = 0; i < callbacks.size(); ++i)
+ {
+ model.RunAsync(inputs[i], outputs[i], callbacks[i]);
+ }
+
+ // Check the results
+ unsigned int j = 0;
+ for (armnn::experimental::IAsyncExecutionCallbackPtr cb : callbacks)
+ {
+ // Get the results
+ auto endTime = time_point_cast<std::chrono::milliseconds>(cb->GetEndTime());
+ auto startTime = time_point_cast<std::chrono::milliseconds>(cb->GetStartTime());
+ auto inferenceDuration = endTime - startTime;
+
+ if (latestEndTime < cb->GetEndTime())
+ {
+ latestEndTime = cb->GetEndTime();
+ }
+
+ if (earliestStartTime.time_since_epoch().count() == 0)
+ {
+ earliestStartTime = cb->GetStartTime();
+ }
+ else if (earliestStartTime > cb->GetStartTime())
+ {
+ earliestStartTime = cb->GetStartTime();
+ }
+
+ if (params.m_GenerateTensorData)
+ {
+ ARMNN_LOG(warning) << "The input data was generated; note that the output will not be useful";
+ }
+
+ // Print output tensors
+ const auto& infosOut = model.GetOutputBindingInfos();
+ for (size_t i = 0; i < numOutputs; i++)
+ {
+ const armnn::TensorInfo& infoOut = infosOut[i].second;
+ auto outputTensorFile = params.m_OutputTensorFiles.empty()
+ ? ""
+ : params.m_OutputTensorFiles[(j * numOutputs) + i];
+
+ TensorPrinter printer(inferenceModelParams.m_OutputBindings[i],
+ infoOut,
+ outputTensorFile,
+ params.m_DequantizeOutput);
+ mapbox::util::apply_visitor(printer, outputs[j][i]);
+ }
+
+ ARMNN_LOG(info) << "\nInference time: " << std::setprecision(2)
+ << std::fixed << inferenceDuration.count() << " ms\n";
+
+ // If thresholdTime == 0.0 (default), then it hasn't been supplied at command line
+ if (params.m_ThresholdTime != 0.0)
+ {
+ ARMNN_LOG(info) << "Threshold time: " << std::setprecision(2)
+ << std::fixed << params.m_ThresholdTime << " ms";
+ auto thresholdMinusInference =
+ params.m_ThresholdTime - duration<double, std::milli>(inferenceDuration).count();
+ ARMNN_LOG(info) << "Threshold time - Inference time: " << std::setprecision(2)
+ << std::fixed << thresholdMinusInference << " ms" << "\n";
+
+ if (thresholdMinusInference < 0)
+ {
+ ARMNN_LOG(fatal) << "Elapsed inference time is greater than provided threshold time. \n";
+ }
+ }
+ ++j;
+ }
+ // Print the overall duration between the earliest start time and the latest end time
+ auto overallEndTime = time_point_cast<std::chrono::milliseconds>(latestEndTime);
+ auto overallStartTime = time_point_cast<std::chrono::milliseconds>(earliestStartTime);
+ auto totalInferenceDuration = overallEndTime - overallStartTime;
+ ARMNN_LOG(info) << "\nOverall Inference time: " << std::setprecision(2)
+ << std::fixed << totalInferenceDuration.count() << " ms\n";
+ }
+ catch (const armnn::Exception& e)
+ {
+ ARMNN_LOG(fatal) << "Armnn Error: " << e.what();
+ return EXIT_FAILURE;
+ }
+ }
+ // Asynchronous execution using std::launch::async
else
{
try
{
- ARMNN_LOG(info) << "Asynchronous Execution... \n";
+ ARMNN_LOG(info) << "Asynchronous Execution with std::launch::async... \n";
std::vector<std::future<std::tuple<armnn::profiling::ProfilingGuid,
- std::chrono::duration<double, std::milli>>>> inferenceResults;
+ std::chrono::duration<double, std::milli>>>> inferenceResults;
inferenceResults.reserve(params.m_SimultaneousIterations);
// Create WorkingMemHandles for each inference
@@ -455,6 +562,8 @@ int MainImpl(const ExecuteNetworkParams& params,
}
// Run each inference in its own thread
+ // start a timer
+ const auto start_time = armnn::GetTimeNow();
for (unsigned int i = 0; i < params.m_SimultaneousIterations; ++i)
{
armnn::experimental::IWorkingMemHandle& workingMemHandleRef = *workingMemHandles[i].get();
@@ -470,7 +579,7 @@ int MainImpl(const ExecuteNetworkParams& params,
{
// Get the results
auto inferenceResult = inferenceResults[j].get();
- auto inference_duration = std::get<1>(inferenceResult);
+ auto inferenceDuration = std::get<1>(inferenceResult);
auto inferenceID = std::get<0>(inferenceResult);
if (params.m_GenerateTensorData)
@@ -495,14 +604,14 @@ int MainImpl(const ExecuteNetworkParams& params,
}
ARMNN_LOG(info) << "\nInference time: " << std::setprecision(2)
- << std::fixed << inference_duration.count() << " ms\n";
+ << std::fixed << inferenceDuration.count() << " ms\n";
// If thresholdTime == 0.0 (default), then it hasn't been supplied at command line
if (params.m_ThresholdTime != 0.0)
{
ARMNN_LOG(info) << "Threshold time: " << std::setprecision(2)
<< std::fixed << params.m_ThresholdTime << " ms";
- auto thresholdMinusInference = params.m_ThresholdTime - inference_duration.count();
+ auto thresholdMinusInference = params.m_ThresholdTime - inferenceDuration.count();
ARMNN_LOG(info) << "Threshold time - Inference time: " << std::setprecision(2)
<< std::fixed << thresholdMinusInference << " ms" << "\n";
@@ -514,13 +623,16 @@ int MainImpl(const ExecuteNetworkParams& params,
ARMNN_LOG(info) << "Asynchronous Execution is finished for Inference ID: " << inferenceID << " \n";
}
+ // finish timer
+ const auto duration = armnn::GetTimeDuration(start_time);
+ ARMNN_LOG(info) << "\nOverall Inference time: " << std::setprecision(2)
+ << std::fixed << duration.count() << " ms\n";
}
catch (const armnn::Exception& e)
{
ARMNN_LOG(fatal) << "Armnn Error: " << e.what();
return EXIT_FAILURE;
}
-
}
}
catch (const armnn::Exception& e)
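The overall-time bookkeeping in the thread pool branch (earliest callback start, latest callback end) can be expressed as a small standalone helper. This sketch reimplements the same logic, assuming callbacks that expose GetStartTime()/GetEndTime() as in this patch:

#include <algorithm>
#include <chrono>
#include <vector>

using Clock = std::chrono::high_resolution_clock;

// Wall-clock span covered by a batch of finished inferences: from the
// earliest recorded start to the latest recorded end.
template <typename CallbackPtr>
std::chrono::milliseconds OverallDuration(const std::vector<CallbackPtr>& callbacks)
{
    Clock::time_point earliestStart = Clock::time_point::max();
    Clock::time_point latestEnd     = Clock::time_point::min();
    for (const auto& cb : callbacks)
    {
        earliestStart = std::min(earliestStart, cb->GetStartTime());
        latestEnd     = std::max(latestEnd,     cb->GetEndTime());
    }
    return std::chrono::duration_cast<std::chrono::milliseconds>(latestEnd - earliestStart);
}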
diff --git a/tests/ExecuteNetwork/ExecuteNetworkParams.cpp b/tests/ExecuteNetwork/ExecuteNetworkParams.cpp
index 944efec17f..189ece25a7 100644
--- a/tests/ExecuteNetwork/ExecuteNetworkParams.cpp
+++ b/tests/ExecuteNetwork/ExecuteNetworkParams.cpp
@@ -138,9 +138,11 @@ void ExecuteNetworkParams::ValidateParams()
CheckModelFormat(m_ModelFormat);
// Check number of simultaneous iterations
- if ((m_SimultaneousIterations < 1))
+ // Testing std::launch::async with a single iteration is possible if concurrent is manually set
+ if ((m_SimultaneousIterations <= 1 && m_ThreadPoolSize > 1) ||
+ (m_SimultaneousIterations <= 1 && !m_Concurrent))
{
- ARMNN_LOG(fatal) << "simultaneous-iterations cannot be less than 1. ";
+ ARMNN_LOG(fatal) << "simultaneous-iterations cannot be less than 2.";
}
// Check input tensor shapes
diff --git a/tests/ExecuteNetwork/ExecuteNetworkParams.hpp b/tests/ExecuteNetwork/ExecuteNetworkParams.hpp
index c325df110f..fe0c446087 100644
--- a/tests/ExecuteNetwork/ExecuteNetworkParams.hpp
+++ b/tests/ExecuteNetwork/ExecuteNetworkParams.hpp
@@ -57,6 +57,7 @@ struct ExecuteNetworkParams
std::string m_TuningPath;
std::string m_MLGOTuningFilePath;
TfLiteExecutor m_TfLiteExecutor;
+ size_t m_ThreadPoolSize;
// Ensures that the parameters for ExecuteNetwork fit together
void ValidateParams();
diff --git a/tests/ExecuteNetwork/ExecuteNetworkProgramOptions.cpp b/tests/ExecuteNetwork/ExecuteNetworkProgramOptions.cpp
index 042087e4f4..1f57f85252 100644
--- a/tests/ExecuteNetwork/ExecuteNetworkProgramOptions.cpp
+++ b/tests/ExecuteNetwork/ExecuteNetworkProgramOptions.cpp
@@ -195,7 +195,8 @@ ProgramOptions::ProgramOptions() : m_CxxOptions{"ExecuteNetwork",
cxxopts::value<std::string>(m_RuntimeOptions.m_DynamicBackendsPath))
("n,concurrent",
- "If this option is enabled inferences will be executed in parallel asynchronously.",
+ "This option is for Arm NN internal asynchronous testing purposes. "
+ "By default it is set to true if thread-pool-size or simultaneous-iterations are greater than 1",
cxxopts::value<bool>(m_ExNetParams.m_Concurrent)->default_value("false")->implicit_value("true"))
("d,input-tensor-data",
@@ -284,9 +285,15 @@ ProgramOptions::ProgramOptions() : m_CxxOptions{"ExecuteNetwork",
"This option is depreciated please use tflite-executor instead",
cxxopts::value<bool>(m_ExNetParams.m_EnableDelegate)->default_value("false")->implicit_value("true"))
- ("simultaneous-iterations",
- "Number of simultaneous iterations to async-run the network for, default is set to 1",
- cxxopts::value<size_t>(m_ExNetParams.m_SimultaneousIterations)->default_value("1"));
+ ("simultaneous-iterations",
+ "Number of simultaneous iterations to async-run the network for, default is set to 1 (disabled). "
+ "When thread-pool-size is set the Arm NN thread pool is used. Otherwise std::launch::async is used.",
+ cxxopts::value<size_t>(m_ExNetParams.m_SimultaneousIterations)->default_value("1"))
+
+ ("thread-pool-size",
+ "Number of Arm NN threads to use when running the network asynchronously via the Arm NN thread pool. "
+ "The default is set to 1",
+ cxxopts::value<size_t>(m_ExNetParams.m_ThreadPoolSize)->default_value("1"));
m_CxxOptions.add_options("c) Optimization")
("bf16-turbo-mode",
@@ -453,7 +460,11 @@ void ProgramOptions::ParseOptions(int ac, const char* av[])
"please use tflite-executor instead.");
}
-
+ // Set concurrent to true if the user expects to run inferences asynchronously
+ if (m_ExNetParams.m_SimultaneousIterations > 1)
+ {
+ m_ExNetParams.m_Concurrent = true;
+ }
// Parse input tensor shape from the string we got from the command-line.
std::vector<std::string> inputTensorShapesVector =
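Taken together, the new options give two ways to drive asynchronous runs from the command line. Illustrative invocations (the model path is a placeholder):

# Arm NN thread pool: three inferences scheduled onto three pool threads
# (simultaneous-iterations > 1 now implies --concurrent)
ExecuteNetwork -m model.tflite -f tflite-binary -c CpuRef \
               --simultaneous-iterations 3 --thread-pool-size 3

# std::launch::async path: thread-pool-size left at its default of 1
ExecuteNetwork -m model.tflite -f tflite-binary -c CpuRef \
               --simultaneous-iterations 3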
diff --git a/tests/InferenceModel.hpp b/tests/InferenceModel.hpp
index 3429598249..7c51011a22 100644
--- a/tests/InferenceModel.hpp
+++ b/tests/InferenceModel.hpp
@@ -102,6 +102,7 @@ struct Params
unsigned int m_NumberOfThreads;
std::string m_MLGOTuningFilePath;
bool m_AsyncEnabled;
+ size_t m_ThreadPoolSize;
Params()
@@ -120,6 +121,7 @@ struct Params
, m_NumberOfThreads(0)
, m_MLGOTuningFilePath("")
, m_AsyncEnabled(false)
+ , m_ThreadPoolSize(1)
{}
};
@@ -481,7 +483,8 @@ public:
const auto loading_start_time = armnn::GetTimeNow();
armnn::INetworkProperties networkProperties(params.m_AsyncEnabled,
armnn::MemorySource::Undefined,
- armnn::MemorySource::Undefined);
+ armnn::MemorySource::Undefined,
+ params.m_ThreadPoolSize);
std::string errorMessage;
ret = m_Runtime->LoadNetwork(m_NetworkIdentifier, std::move(optNet), errorMessage, networkProperties);
@@ -632,6 +635,47 @@ public:
}
}
+ void RunAsync(const std::vector<TContainer>& inputContainers,
+ std::vector<TContainer>& outputContainers,
+ armnn::experimental::IAsyncExecutionCallbackPtr cb)
+ {
+ for (unsigned int i = 0; i < outputContainers.size(); ++i)
+ {
+ const unsigned int expectedOutputDataSize = GetOutputSize(i);
+
+ mapbox::util::apply_visitor([expectedOutputDataSize, i](auto&& value)
+ {
+ const unsigned int actualOutputDataSize = armnn::numeric_cast<unsigned int>(value.size());
+ if (actualOutputDataSize < expectedOutputDataSize)
+ {
+ unsigned int outputIndex = i;
+ throw armnn::Exception(
+ fmt::format("Not enough data for output #{0}: expected "
+ "{1} elements, got {2}", outputIndex, expectedOutputDataSize, actualOutputDataSize));
+ }
+ },
+ outputContainers[i]);
+ }
+
+ std::shared_ptr<armnn::IProfiler> profiler = m_Runtime->GetProfiler(m_NetworkIdentifier);
+ if (profiler)
+ {
+ profiler->EnableProfiling(m_EnableProfiling);
+ }
+
+ m_Runtime->Schedule(m_NetworkIdentifier,
+ MakeInputTensors(inputContainers),
+ MakeOutputTensors(outputContainers),
+ armnn::QosExecPriority::Medium,
+ cb);
+
+ // if profiling is enabled print out the results
+ if (profiler && profiler->IsProfilingEnabled())
+ {
+ profiler->Print(std::cout);
+ }
+ }
+
const armnn::BindingPointInfo& GetInputBindingInfo(unsigned int inputIndex = 0u) const
{
CheckInputIndexIsValid(inputIndex);
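A hedged usage sketch of the new member function, mirroring the ExecuteNetwork flow above (model, inputs, outputs and simultaneousIterations are assumed to be set up as in MainImpl):

// Schedule several inferences with medium priority, then read the
// per-inference timings back from the callbacks. The patch reads the
// timings straight after scheduling; a real caller may need to
// synchronise before trusting them.
std::vector<armnn::experimental::IAsyncExecutionCallbackPtr> callbacks;
for (size_t i = 0; i < simultaneousIterations; ++i)
{
    callbacks.emplace_back(std::make_shared<armnn::experimental::AsyncExecutionCallback>());
    model.RunAsync(inputs[i], outputs[i], callbacks[i]);
}

for (const auto& cb : callbacks)
{
    auto inferenceDuration = std::chrono::duration_cast<std::chrono::milliseconds>(
        cb->GetEndTime() - cb->GetStartTime());
    std::cout << "Inference time: " << inferenceDuration.count() << " ms\n";
}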