author    Kevin May <kevin.may@arm.com>  2021-05-21 16:42:21 +0100
committer Kevin May <kevin.may@arm.com>  2021-05-26 11:56:54 +0000
commit    b4b3ac91990eb5deaffca2300319f2ddf7aa0886 (patch)
tree      e480826fe604d652877459ce4bbf4314a461e4b2
parent    401c1c3f973da1a2e2cef7f88a5aac2cf295fac7 (diff)
download  armnn-b4b3ac91990eb5deaffca2300319f2ddf7aa0886.tar.gz
IVGCVSW-6009 Integrate threadpool into ExNet
* Remove concurrent flag from ExecuteNetwork as it can be deduced when SimultaneousIterations > 1
* Add void RunAsync()
* Refactor some unit tests

Change-Id: I7021d4821b0e460470908294cbd9462850e8b361
Signed-off-by: Keith Davis <keith.davis@arm.com>
Signed-off-by: Kevin May <kevin.may@arm.com>
-rw-r--r--  include/armnn/IRuntime.hpp                                          |   4
-rw-r--r--  src/armnn/LoadedNetwork.cpp                                         |   8
-rw-r--r--  src/backends/backendsCommon/test/StridedSliceAsyncEndToEndTest.hpp  | 109
-rw-r--r--  src/backends/reference/test/RefEndToEndTests.cpp                    |  14
-rw-r--r--  tests/ExecuteNetwork/ExecuteNetwork.cpp                             | 130
-rw-r--r--  tests/ExecuteNetwork/ExecuteNetworkParams.cpp                       |   6
-rw-r--r--  tests/ExecuteNetwork/ExecuteNetworkParams.hpp                       |   1
-rw-r--r--  tests/ExecuteNetwork/ExecuteNetworkProgramOptions.cpp               |  21
-rw-r--r--  tests/InferenceModel.hpp                                            |  46
9 files changed, 206 insertions(+), 133 deletions(-)
diff --git a/include/armnn/IRuntime.hpp b/include/armnn/IRuntime.hpp
index 870e027f33..bfc13c9c01 100644
--- a/include/armnn/IRuntime.hpp
+++ b/include/armnn/IRuntime.hpp
@@ -33,7 +33,7 @@ struct INetworkProperties
INetworkProperties(bool importEnabled = false,
bool exportEnabled = false,
bool asyncEnabled = false,
- size_t numThreads = 0)
+ size_t numThreads = 1)
: m_ImportEnabled(importEnabled)
, m_ExportEnabled(exportEnabled)
, m_AsyncEnabled(asyncEnabled)
@@ -45,7 +45,7 @@ struct INetworkProperties
INetworkProperties(bool asyncEnabled,
MemorySource m_InputSource,
MemorySource m_OutputSource,
- size_t numThreads = 0)
+ size_t numThreads = 1)
: m_ImportEnabled(m_InputSource != MemorySource::Undefined)
, m_ExportEnabled(m_OutputSource != MemorySource::Undefined)
, m_AsyncEnabled(asyncEnabled)
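For context, a minimal sketch of how a caller might opt in to the thread pool through the updated constructor, assuming an already-built INetwork named "net" and CpuRef chosen purely for illustration:

// Sketch only, not part of this patch. asyncEnabled = true together with
// numThreads > 1 makes LoadedNetwork create its internal thread pool;
// numThreads == 1 (the new default) keeps async execution without a pool.
armnn::IRuntime::CreationOptions options;
armnn::IRuntimePtr runtime = armnn::IRuntime::Create(options);

armnn::IOptimizedNetworkPtr optNet =
    armnn::Optimize(*net, {armnn::Compute::CpuRef}, runtime->GetDeviceSpec());

armnn::INetworkProperties networkProperties(true,                           // asyncEnabled
                                            armnn::MemorySource::Undefined,
                                            armnn::MemorySource::Undefined,
                                            3);                             // numThreads

armnn::NetworkId networkId;
std::string errorMessage;
runtime->LoadNetwork(networkId, std::move(optNet), errorMessage, networkProperties);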
diff --git a/src/armnn/LoadedNetwork.cpp b/src/armnn/LoadedNetwork.cpp
index 53a9e18863..25f8ddf6e5 100644
--- a/src/armnn/LoadedNetwork.cpp
+++ b/src/armnn/LoadedNetwork.cpp
@@ -169,17 +169,17 @@ LoadedNetwork::LoadedNetwork(std::unique_ptr<IOptimizedNetwork> net,
}
// Create the thread pool which will have working memory handles assigned to each thread
- // Should occur after factories are registered so thet the WorkingMemHandles can be created
- if (m_NetworkProperties.m_NumThreads > 0 && networkProperties.m_AsyncEnabled)
+ // Should occur after factories are registered so that the WorkingMemHandles can be created
+ if (m_NetworkProperties.m_NumThreads > 1 && networkProperties.m_AsyncEnabled)
{
CreateThreadPool(m_NetworkProperties.m_NumThreads);
}
if (!networkProperties.m_AsyncEnabled)
{
- for (auto &&layer : order)
+ for (auto&& layer : order)
{
- auto &workloadFactory = GetWorkloadFactory(*layer);
+ auto& workloadFactory = GetWorkloadFactory(*layer);
switch (layer->GetType())
{
diff --git a/src/backends/backendsCommon/test/StridedSliceAsyncEndToEndTest.hpp b/src/backends/backendsCommon/test/StridedSliceAsyncEndToEndTest.hpp
index 87cade7dca..0ad6bc4964 100644
--- a/src/backends/backendsCommon/test/StridedSliceAsyncEndToEndTest.hpp
+++ b/src/backends/backendsCommon/test/StridedSliceAsyncEndToEndTest.hpp
@@ -123,7 +123,7 @@ void AsyncEndToEndTestImpl(INetworkPtr network,
const std::map<int, std::vector<TOutput>>& expectedOutputData,
std::vector<BackendId> backends,
float tolerance = 0.000001f,
- size_t numThreads = 0)
+ size_t numThreads = 1)
{
// Create Runtime in which test will run
IRuntime::CreationOptions options;
@@ -161,7 +161,7 @@ void AsyncEndToEndTestImpl(INetworkPtr network,
outputStorage.at(it.first).data())});
}
- if (numThreads == 0)
+ if (numThreads <= 1)
{
// Create WorkingMemHandle for this async network
std::unique_ptr<IWorkingMemHandle> workingMemHandle = runtime->CreateWorkingMemHandle(networkId);
@@ -254,7 +254,7 @@ INetworkPtr CreateStridedSliceNetwork(const TensorShape& inputShape,
}
template<armnn::DataType ArmnnType>
-void StridedSlicedEndToEndTest(const std::vector<BackendId>& backends)
+void StridedSlicedEndToEndTest(const std::vector<BackendId>& backends, size_t numThreads)
{
using namespace armnn;
using T = ResolveType<ArmnnType>;
@@ -300,103 +300,12 @@ void StridedSlicedEndToEndTest(const std::vector<BackendId>& backends)
std::map<int, std::vector<T>> inputTensorData = {{0, inputData}};
std::map<int, std::vector<T>> expectedOutputData = {{0, outputExpected}};
- AsyncEndToEndTestImpl<ArmnnType, ArmnnType>(move(net), inputTensorData, expectedOutputData, backends, 0.000001f);
-}
-
-template<armnn::DataType ArmnnType>
-void AsyncScheduledStridedSlicedEndToEndTest(const std::vector<BackendId>& backends)
-{
- using namespace armnn;
- using T = ResolveType<ArmnnType>;
-
- const TensorShape& inputShape = {3, 2, 3, 1};
- const TensorShape& outputShape = {1, 2, 3, 1};
- const std::vector<int>& beginData = {1, 0, 0, 0};
- const std::vector<int>& endData = {2, 2, 3, 1};
- const std::vector<int>& stridesData = {1, 1, 1, 1};
- int beginMask = 0;
- int endMask = 0;
- int shrinkAxisMask = 0;
- int ellipsisMask = 0;
- int newAxisMask = 0;
-
- // Builds up the structure of the network
- INetworkPtr net = CreateStridedSliceNetwork<ArmnnType>(inputShape,
- outputShape,
- beginData,
- endData,
- stridesData,
- beginMask,
- endMask,
- shrinkAxisMask,
- ellipsisMask,
- newAxisMask);
-
- // Creates structures for input & output.
- std::vector<T> inputData{
- 1.0f, 1.0f, 1.0f, 2.0f, 2.0f, 2.0f,
-
- 3.0f, 3.0f, 3.0f, 4.0f, 4.0f, 4.0f,
-
- 5.0f, 5.0f, 5.0f, 6.0f, 6.0f, 6.0f
- };
-
- std::vector<T> outputExpected{
- 3.0f, 3.0f, 3.0f, 4.0f, 4.0f, 4.0f
- };
-
- std::map<int, std::vector<T>> inputTensorData = {{0, inputData}};
- std::map<int, std::vector<T>> expectedOutputData = {{0, outputExpected}};
-
- AsyncEndToEndTestImpl<ArmnnType, ArmnnType>(move(net), inputTensorData, expectedOutputData, backends, 0.000001f, 1);
-}
-
-template<armnn::DataType ArmnnType>
-void AsyncScheduledStridedSlicedMultiThreadedEndToEndTest(const std::vector<BackendId>& backends)
-{
- using namespace armnn;
- using T = ResolveType<ArmnnType>;
-
- const TensorShape& inputShape = {3, 2, 3, 1};
- const TensorShape& outputShape = {1, 2, 3, 1};
- const std::vector<int>& beginData = {1, 0, 0, 0};
- const std::vector<int>& endData = {2, 2, 3, 1};
- const std::vector<int>& stridesData = {1, 1, 1, 1};
- int beginMask = 0;
- int endMask = 0;
- int shrinkAxisMask = 0;
- int ellipsisMask = 0;
- int newAxisMask = 0;
-
- // Builds up the structure of the network
- INetworkPtr net = CreateStridedSliceNetwork<ArmnnType>(inputShape,
- outputShape,
- beginData,
- endData,
- stridesData,
- beginMask,
- endMask,
- shrinkAxisMask,
- ellipsisMask,
- newAxisMask);
-
- // Creates structures for input & output.
- std::vector<T> inputData{
- 1.0f, 1.0f, 1.0f, 2.0f, 2.0f, 2.0f,
-
- 3.0f, 3.0f, 3.0f, 4.0f, 4.0f, 4.0f,
-
- 5.0f, 5.0f, 5.0f, 6.0f, 6.0f, 6.0f
- };
-
- std::vector<T> outputExpected{
- 3.0f, 3.0f, 3.0f, 4.0f, 4.0f, 4.0f
- };
-
- std::map<int, std::vector<T>> inputTensorData = {{0, inputData}};
- std::map<int, std::vector<T>> expectedOutputData = {{0, outputExpected}};
-
- AsyncEndToEndTestImpl<ArmnnType, ArmnnType>(move(net), inputTensorData, expectedOutputData, backends, 0.000001f, 3);
+ AsyncEndToEndTestImpl<ArmnnType, ArmnnType>(move(net),
+ inputTensorData,
+ expectedOutputData,
+ backends,
+ 0.000001f,
+ numThreads);
}
template<armnn::DataType ArmnnType>
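The numThreads parameter threaded through above is what selects between the two async paths inside AsyncEndToEndTestImpl. A hedged sketch of that dispatch, assuming inputTensors and outputTensors are already populated; Execute and Schedule are the experimental IRuntime entry points this patch relies on, so treat this as illustrative:

// numThreads <= 1: execute directly on an explicit working memory handle.
// numThreads > 1:  the LoadedNetwork owns a thread pool, so schedule instead.
if (numThreads <= 1)
{
    std::unique_ptr<IWorkingMemHandle> workingMemHandle =
        runtime->CreateWorkingMemHandle(networkId);
    runtime->Execute(*workingMemHandle, inputTensors, outputTensors);
}
else
{
    IAsyncExecutionCallbackPtr cb = std::make_shared<AsyncExecutionCallback>();
    runtime->Schedule(networkId, inputTensors, outputTensors,
                      QosExecPriority::Medium, cb);
}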
diff --git a/src/backends/reference/test/RefEndToEndTests.cpp b/src/backends/reference/test/RefEndToEndTests.cpp
index 54b16dac38..910df292a9 100644
--- a/src/backends/reference/test/RefEndToEndTests.cpp
+++ b/src/backends/reference/test/RefEndToEndTests.cpp
@@ -1336,9 +1336,9 @@ BOOST_AUTO_TEST_CASE(RefStridedSliceInvalidSliceEndToEndTest)
StridedSliceInvalidSliceEndToEndTest(defaultBackends);
}
-BOOST_AUTO_TEST_CASE(RefAsyncFP32StridedSlicedEndToEndTest)
+BOOST_AUTO_TEST_CASE(RefThreadSafeFP32StridedSlicedEndToEndTest)
{
- armnn::experimental::StridedSlicedEndToEndTest<armnn::DataType::Float32>(defaultBackends);
+ armnn::experimental::StridedSlicedEndToEndTest<armnn::DataType::Float32>(defaultBackends, 1);
}
BOOST_AUTO_TEST_CASE(RefAsyncFP32StridedSlicedMultiThreadedEndToEndTest)
@@ -1346,15 +1346,9 @@ BOOST_AUTO_TEST_CASE(RefAsyncFP32StridedSlicedMultiThreadedEndToEndTest)
armnn::experimental::StridedSlicedMultiThreadedEndToEndTest<armnn::DataType::Float32>(defaultBackends);
}
-BOOST_AUTO_TEST_CASE(RefAsyncScheduledFP32StridedSlicedEndToEndTest)
+BOOST_AUTO_TEST_CASE(RefAsyncFP32StridedSlicedScheduledMultiThreadedEndToEndTest)
{
- armnn::experimental::AsyncScheduledStridedSlicedEndToEndTest<armnn::DataType::Float32>(defaultBackends);
-}
-
-BOOST_AUTO_TEST_CASE(RefAsyncScheduledStridedSlicedMultiThreadedEndToEndTest)
-{
- using namespace armnn::experimental;
- AsyncScheduledStridedSlicedMultiThreadedEndToEndTest<armnn::DataType::Float32>(defaultBackends);
+ armnn::experimental::StridedSlicedEndToEndTest<armnn::DataType::Float32>(defaultBackends, 3);
}
#endif
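With the test now parameterised on the thread count, additional pool widths become one-line variants; a hypothetical extra case (not part of this patch) would look like:

BOOST_AUTO_TEST_CASE(RefAsyncFP32StridedSlicedFiveThreadedEndToEndTest)
{
    armnn::experimental::StridedSlicedEndToEndTest<armnn::DataType::Float32>(defaultBackends, 5);
}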
diff --git a/tests/ExecuteNetwork/ExecuteNetwork.cpp b/tests/ExecuteNetwork/ExecuteNetwork.cpp
index 2bbb51783c..cd760a8199 100644
--- a/tests/ExecuteNetwork/ExecuteNetwork.cpp
+++ b/tests/ExecuteNetwork/ExecuteNetwork.cpp
@@ -5,6 +5,8 @@
#include "NetworkExecutionUtils/NetworkExecutionUtils.hpp"
#include "ExecuteNetworkProgramOptions.hpp"
+#include <armnn/IAsyncExecutionCallback.hpp>
+#include <AsyncExecutionCallback.hpp>
#include <armnn/Logging.hpp>
#include <Filesystem.hpp>
@@ -276,8 +278,7 @@ template<typename TParser, typename TDataType>
int MainImpl(const ExecuteNetworkParams& params,
const std::shared_ptr<armnn::IRuntime>& runtime = nullptr)
{
- using TContainer =
- mapbox::util::variant<std::vector<float>, std::vector<int>, std::vector<unsigned char>, std::vector<int8_t>>;
+ using namespace std::chrono;
std::vector<std::vector<TContainer>> inputs;
std::vector<std::vector<TContainer>> outputs;
@@ -300,6 +301,7 @@ int MainImpl(const ExecuteNetworkParams& params,
inferenceModelParams.m_NumberOfThreads = params.m_NumberOfThreads;
inferenceModelParams.m_MLGOTuningFilePath = params.m_MLGOTuningFilePath;
inferenceModelParams.m_AsyncEnabled = params.m_Concurrent;
+ inferenceModelParams.m_ThreadPoolSize = params.m_ThreadPoolSize;
for(const std::string& inputName: params.m_InputNames)
{
@@ -390,9 +392,9 @@ int MainImpl(const ExecuteNetworkParams& params,
outputs.push_back(outputDataContainers);
}
+ // Synchronous execution
if (!params.m_Concurrent)
{
- // Synchronous Execution
for (size_t x = 0; x < params.m_Iterations; x++)
{
// model.Run returns the inference time elapsed in EnqueueWorkload (in milliseconds)
@@ -437,13 +439,118 @@ int MainImpl(const ExecuteNetworkParams& params,
}
}
}
+ // Asynchronous execution using the Arm NN thread pool
+ else if (params.m_ThreadPoolSize >= 2)
+ {
+ try
+ {
+ ARMNN_LOG(info) << "Asynchronous execution with Arm NN thread pool... \n";
+ std::vector<armnn::experimental::IAsyncExecutionCallbackPtr> callbacks;
+
+ // Create callbacks that will be checked post scheduling
+ for (size_t i = 0; i < params.m_SimultaneousIterations; ++i)
+ {
+ // Point to ArmNN example implementation of AsyncExecutionCallback
+ callbacks.emplace_back(std::make_shared<armnn::experimental::AsyncExecutionCallback>());
+ }
+
+ // Declare the latest and earliest inference times here to be used when calculating overall time
+ std::chrono::high_resolution_clock::time_point earliestStartTime;
+ std::chrono::high_resolution_clock::time_point latestEndTime =
+ std::chrono::high_resolution_clock::now();
+
+ // For asynchronous execution, a pool of working memory handles (one per thread) is added to the
+ // LoadedNetwork, and each scheduled inference is given a specific priority
+ for (size_t i = 0; i < callbacks.size(); ++i)
+ {
+ model.RunAsync(inputs[i], outputs[i], callbacks[i]);
+ }
+
+ // Check the results
+ unsigned int j = 0;
+ for (armnn::experimental::IAsyncExecutionCallbackPtr cb : callbacks)
+ {
+ // Get the results
+ auto endTime = time_point_cast<std::chrono::milliseconds>(cb->GetEndTime());
+ auto startTime = time_point_cast<std::chrono::milliseconds>(cb->GetStartTime());
+ auto inferenceDuration = endTime - startTime;
+
+ if (latestEndTime < cb->GetEndTime())
+ {
+ latestEndTime = cb->GetEndTime();
+ }
+
+ if (earliestStartTime.time_since_epoch().count() == 0)
+ {
+ earliestStartTime = cb->GetStartTime();
+ }
+ else if (earliestStartTime > cb->GetStartTime())
+ {
+ earliestStartTime = cb->GetStartTime();
+ }
+
+ if (params.m_GenerateTensorData)
+ {
+ ARMNN_LOG(warning) << "The input data was generated; note that the output will not be useful";
+ }
+
+ // Print output tensors
+ const auto& infosOut = model.GetOutputBindingInfos();
+ for (size_t i = 0; i < numOutputs; i++)
+ {
+ const armnn::TensorInfo& infoOut = infosOut[i].second;
+ auto outputTensorFile = params.m_OutputTensorFiles.empty()
+ ? ""
+ : params.m_OutputTensorFiles[(j * numOutputs) + i];
+
+ TensorPrinter printer(inferenceModelParams.m_OutputBindings[i],
+ infoOut,
+ outputTensorFile,
+ params.m_DequantizeOutput);
+ mapbox::util::apply_visitor(printer, outputs[j][i]);
+ }
+
+ ARMNN_LOG(info) << "\nInference time: " << std::setprecision(2)
+ << std::fixed << inferenceDuration.count() << " ms\n";
+
+ // If thresholdTime == 0.0 (default), then it hasn't been supplied at command line
+ if (params.m_ThresholdTime != 0.0)
+ {
+ ARMNN_LOG(info) << "Threshold time: " << std::setprecision(2)
+ << std::fixed << params.m_ThresholdTime << " ms";
+ auto thresholdMinusInference =
+ params.m_ThresholdTime - duration<double, std::milli>(inferenceDuration).count();
+ ARMNN_LOG(info) << "Threshold time - Inference time: " << std::setprecision(2)
+ << std::fixed << thresholdMinusInference << " ms" << "\n";
+
+ if (thresholdMinusInference < 0)
+ {
+ ARMNN_LOG(fatal) << "Elapsed inference time is greater than provided threshold time. \n";
+ }
+ }
+ ++j;
+ }
+ // Print the overall duration between the earliest start time and the latest end time
+ auto overallEndTime = time_point_cast<std::chrono::milliseconds>(latestEndTime);
+ auto overallStartTime = time_point_cast<std::chrono::milliseconds>(earliestStartTime);
+ auto totalInferenceDuration = overallEndTime - overallStartTime;
+ ARMNN_LOG(info) << "\nOverall Inference time: " << std::setprecision(2)
+ << std::fixed << totalInferenceDuration.count() << " ms\n";
+ }
+ catch (const armnn::Exception& e)
+ {
+ ARMNN_LOG(fatal) << "Armnn Error: " << e.what();
+ return EXIT_FAILURE;
+ }
+ }
+ // Asynchronous execution using std::launch::async
else
{
try
{
- ARMNN_LOG(info) << "Asynchronous Execution... \n";
+ ARMNN_LOG(info) << "Asynchronous Execution with std::launch::async... \n";
std::vector<std::future<std::tuple<armnn::profiling::ProfilingGuid,
- std::chrono::duration<double, std::milli>>>> inferenceResults;
+ std::chrono::duration<double, std::milli>>>> inferenceResults;
inferenceResults.reserve(params.m_SimultaneousIterations);
// Create WorkingMemHandles for each inference
@@ -455,6 +562,8 @@ int MainImpl(const ExecuteNetworkParams& params,
}
// Run each inference in its own thread
+ // start a timer
+ const auto start_time = armnn::GetTimeNow();
for (unsigned int i = 0; i < params.m_SimultaneousIterations; ++i)
{
armnn::experimental::IWorkingMemHandle& workingMemHandleRef = *workingMemHandles[i].get();
@@ -470,7 +579,7 @@ int MainImpl(const ExecuteNetworkParams& params,
{
// Get the results
auto inferenceResult = inferenceResults[j].get();
- auto inference_duration = std::get<1>(inferenceResult);
+ auto inferenceDuration = std::get<1>(inferenceResult);
auto inferenceID = std::get<0>(inferenceResult);
if (params.m_GenerateTensorData)
@@ -495,14 +604,14 @@ int MainImpl(const ExecuteNetworkParams& params,
}
ARMNN_LOG(info) << "\nInference time: " << std::setprecision(2)
- << std::fixed << inference_duration.count() << " ms\n";
+ << std::fixed << inferenceDuration.count() << " ms\n";
// If thresholdTime == 0.0 (default), then it hasn't been supplied at command line
if (params.m_ThresholdTime != 0.0)
{
ARMNN_LOG(info) << "Threshold time: " << std::setprecision(2)
<< std::fixed << params.m_ThresholdTime << " ms";
- auto thresholdMinusInference = params.m_ThresholdTime - inference_duration.count();
+ auto thresholdMinusInference = params.m_ThresholdTime - inferenceDuration.count();
ARMNN_LOG(info) << "Threshold time - Inference time: " << std::setprecision(2)
<< std::fixed << thresholdMinusInference << " ms" << "\n";
@@ -514,13 +623,16 @@ int MainImpl(const ExecuteNetworkParams& params,
ARMNN_LOG(info) << "Asynchronous Execution is finished for Inference ID: " << inferenceID << " \n";
}
+ // finish timer
+ const auto duration = armnn::GetTimeDuration(start_time);
+ ARMNN_LOG(info) << "\nOverall Inference time: " << std::setprecision(2)
+ << std::fixed << duration.count() << " ms\n";
}
catch (const armnn::Exception& e)
{
ARMNN_LOG(fatal) << "Armnn Error: " << e.what();
return EXIT_FAILURE;
}
-
}
}
catch (const armnn::Exception& e)
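The overall-time bookkeeping in the thread pool branch (earliest callback start, latest callback end) can be expressed as a small standalone helper. This sketch reimplements the same logic, assuming callbacks that expose GetStartTime()/GetEndTime() as in this patch:

#include <algorithm>
#include <chrono>
#include <vector>

using Clock = std::chrono::high_resolution_clock;

// Wall-clock span covered by a batch of finished inferences: from the
// earliest recorded start to the latest recorded end.
template <typename CallbackPtr>
std::chrono::milliseconds OverallDuration(const std::vector<CallbackPtr>& callbacks)
{
    Clock::time_point earliestStart = Clock::time_point::max();
    Clock::time_point latestEnd     = Clock::time_point::min();
    for (const auto& cb : callbacks)
    {
        earliestStart = std::min(earliestStart, cb->GetStartTime());
        latestEnd     = std::max(latestEnd,     cb->GetEndTime());
    }
    return std::chrono::duration_cast<std::chrono::milliseconds>(latestEnd - earliestStart);
}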
diff --git a/tests/ExecuteNetwork/ExecuteNetworkParams.cpp b/tests/ExecuteNetwork/ExecuteNetworkParams.cpp
index 944efec17f..189ece25a7 100644
--- a/tests/ExecuteNetwork/ExecuteNetworkParams.cpp
+++ b/tests/ExecuteNetwork/ExecuteNetworkParams.cpp
@@ -138,9 +138,11 @@ void ExecuteNetworkParams::ValidateParams()
CheckModelFormat(m_ModelFormat);
// Check number of simultaneous iterations
- if ((m_SimultaneousIterations < 1))
+ // Testing std::launch::async with a single iteration is possible if concurrent is manually set
+ if ((m_SimultaneousIterations <= 1 && m_ThreadPoolSize > 1) ||
+ (m_SimultaneousIterations <= 1 && !m_Concurrent))
{
- ARMNN_LOG(fatal) << "simultaneous-iterations cannot be less than 1. ";
+ ARMNN_LOG(fatal) << "simultaneous-iterations cannot be less than 2.";
}
// Check input tensor shapes
diff --git a/tests/ExecuteNetwork/ExecuteNetworkParams.hpp b/tests/ExecuteNetwork/ExecuteNetworkParams.hpp
index c325df110f..fe0c446087 100644
--- a/tests/ExecuteNetwork/ExecuteNetworkParams.hpp
+++ b/tests/ExecuteNetwork/ExecuteNetworkParams.hpp
@@ -57,6 +57,7 @@ struct ExecuteNetworkParams
std::string m_TuningPath;
std::string m_MLGOTuningFilePath;
TfLiteExecutor m_TfLiteExecutor;
+ size_t m_ThreadPoolSize;
// Ensures that the parameters for ExecuteNetwork fit together
void ValidateParams();
diff --git a/tests/ExecuteNetwork/ExecuteNetworkProgramOptions.cpp b/tests/ExecuteNetwork/ExecuteNetworkProgramOptions.cpp
index 042087e4f4..1f57f85252 100644
--- a/tests/ExecuteNetwork/ExecuteNetworkProgramOptions.cpp
+++ b/tests/ExecuteNetwork/ExecuteNetworkProgramOptions.cpp
@@ -195,7 +195,8 @@ ProgramOptions::ProgramOptions() : m_CxxOptions{"ExecuteNetwork",
cxxopts::value<std::string>(m_RuntimeOptions.m_DynamicBackendsPath))
("n,concurrent",
- "If this option is enabled inferences will be executed in parallel asynchronously.",
+ "This option is for Arm NN internal asynchronous testing purposes. "
+ "By default it is set to true if thread-pool-size or simultaneous-iterations are greater than 1",
cxxopts::value<bool>(m_ExNetParams.m_Concurrent)->default_value("false")->implicit_value("true"))
("d,input-tensor-data",
@@ -284,9 +285,15 @@ ProgramOptions::ProgramOptions() : m_CxxOptions{"ExecuteNetwork",
"This option is depreciated please use tflite-executor instead",
cxxopts::value<bool>(m_ExNetParams.m_EnableDelegate)->default_value("false")->implicit_value("true"))
- ("simultaneous-iterations",
- "Number of simultaneous iterations to async-run the network for, default is set to 1",
- cxxopts::value<size_t>(m_ExNetParams.m_SimultaneousIterations)->default_value("1"));
+ ("simultaneous-iterations",
+ "Number of simultaneous iterations to async-run the network for, default is set to 1 (disabled). "
+ "When thread-pool-size is set the Arm NN thread pool is used. Otherwise std::launch::async is used.",
+ cxxopts::value<size_t>(m_ExNetParams.m_SimultaneousIterations)->default_value("1"))
+
+ ("thread-pool-size",
+ "Number of Arm NN threads to use when running the network asynchronously via the Arm NN thread pool. "
+ "The default is set to 1",
+ cxxopts::value<size_t>(m_ExNetParams.m_ThreadPoolSize)->default_value("1"));
m_CxxOptions.add_options("c) Optimization")
("bf16-turbo-mode",
@@ -453,7 +460,11 @@ void ProgramOptions::ParseOptions(int ac, const char* av[])
"please use tflite-executor instead.");
}
-
+ // Set concurrent to true if the user expects to run inferences asynchronously
+ if (m_ExNetParams.m_SimultaneousIterations > 1)
+ {
+ m_ExNetParams.m_Concurrent = true;
+ }
// Parse input tensor shape from the string we got from the command-line.
std::vector<std::string> inputTensorShapesVector =
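Taken together, the new options give two ways to drive asynchronous runs from the command line. Illustrative invocations (the model path is a placeholder):

# Arm NN thread pool: three inferences scheduled onto three pool threads
# (simultaneous-iterations > 1 now implies --concurrent)
ExecuteNetwork -m model.tflite -f tflite-binary -c CpuRef \
               --simultaneous-iterations 3 --thread-pool-size 3

# std::launch::async path: thread-pool-size left at its default of 1
ExecuteNetwork -m model.tflite -f tflite-binary -c CpuRef \
               --simultaneous-iterations 3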
diff --git a/tests/InferenceModel.hpp b/tests/InferenceModel.hpp
index 3429598249..7c51011a22 100644
--- a/tests/InferenceModel.hpp
+++ b/tests/InferenceModel.hpp
@@ -102,6 +102,7 @@ struct Params
unsigned int m_NumberOfThreads;
std::string m_MLGOTuningFilePath;
bool m_AsyncEnabled;
+ size_t m_ThreadPoolSize;
Params()
@@ -120,6 +121,7 @@ struct Params
, m_NumberOfThreads(0)
, m_MLGOTuningFilePath("")
, m_AsyncEnabled(false)
+ , m_ThreadPoolSize(1)
{}
};
@@ -481,7 +483,8 @@ public:
const auto loading_start_time = armnn::GetTimeNow();
armnn::INetworkProperties networkProperties(params.m_AsyncEnabled,
armnn::MemorySource::Undefined,
- armnn::MemorySource::Undefined);
+ armnn::MemorySource::Undefined,
+ params.m_ThreadPoolSize);
std::string errorMessage;
ret = m_Runtime->LoadNetwork(m_NetworkIdentifier, std::move(optNet), errorMessage, networkProperties);
@@ -632,6 +635,47 @@ public:
}
}
+ void RunAsync(const std::vector<TContainer>& inputContainers,
+ std::vector<TContainer>& outputContainers,
+ armnn::experimental::IAsyncExecutionCallbackPtr cb)
+ {
+ for (unsigned int i = 0; i < outputContainers.size(); ++i)
+ {
+ const unsigned int expectedOutputDataSize = GetOutputSize(i);
+
+ mapbox::util::apply_visitor([expectedOutputDataSize, i](auto&& value)
+ {
+ const unsigned int actualOutputDataSize = armnn::numeric_cast<unsigned int>(value.size());
+ if (actualOutputDataSize < expectedOutputDataSize)
+ {
+ unsigned int outputIndex = i;
+ throw armnn::Exception(
+ fmt::format("Not enough data for output #{0}: expected "
+ "{1} elements, got {2}", outputIndex, expectedOutputDataSize, actualOutputDataSize));
+ }
+ },
+ outputContainers[i]);
+ }
+
+ std::shared_ptr<armnn::IProfiler> profiler = m_Runtime->GetProfiler(m_NetworkIdentifier);
+ if (profiler)
+ {
+ profiler->EnableProfiling(m_EnableProfiling);
+ }
+
+ m_Runtime->Schedule(m_NetworkIdentifier,
+ MakeInputTensors(inputContainers),
+ MakeOutputTensors(outputContainers),
+ armnn::QosExecPriority::Medium,
+ cb);
+
+ // if profiling is enabled print out the results
+ if (profiler && profiler->IsProfilingEnabled())
+ {
+ profiler->Print(std::cout);
+ }
+ }
+
const armnn::BindingPointInfo& GetInputBindingInfo(unsigned int inputIndex = 0u) const
{
CheckInputIndexIsValid(inputIndex);
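A hedged usage sketch of the new member function, mirroring the ExecuteNetwork flow above (model, inputs, outputs and simultaneousIterations are assumed to be set up as in MainImpl):

// Schedule several inferences with medium priority, then read the
// per-inference timings back from the callbacks. The patch reads the
// timings straight after scheduling; a real caller may need to
// synchronise before trusting them.
std::vector<armnn::experimental::IAsyncExecutionCallbackPtr> callbacks;
for (size_t i = 0; i < simultaneousIterations; ++i)
{
    callbacks.emplace_back(std::make_shared<armnn::experimental::AsyncExecutionCallback>());
    model.RunAsync(inputs[i], outputs[i], callbacks[i]);
}

for (const auto& cb : callbacks)
{
    auto inferenceDuration = std::chrono::duration_cast<std::chrono::milliseconds>(
        cb->GetEndTime() - cb->GetStartTime());
    std::cout << "Inference time: " << inferenceDuration.count() << " ms\n";
}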