diff options
-rw-r--r-- | include/armnn/IRuntime.hpp | 4 | ||||
-rw-r--r-- | src/armnn/LoadedNetwork.cpp | 8 | ||||
-rw-r--r-- | src/backends/backendsCommon/test/StridedSliceAsyncEndToEndTest.hpp | 109 | ||||
-rw-r--r-- | src/backends/reference/test/RefEndToEndTests.cpp | 14 | ||||
-rw-r--r-- | tests/ExecuteNetwork/ExecuteNetwork.cpp | 130 | ||||
-rw-r--r-- | tests/ExecuteNetwork/ExecuteNetworkParams.cpp | 6 | ||||
-rw-r--r-- | tests/ExecuteNetwork/ExecuteNetworkParams.hpp | 1 | ||||
-rw-r--r-- | tests/ExecuteNetwork/ExecuteNetworkProgramOptions.cpp | 21 | ||||
-rw-r--r-- | tests/InferenceModel.hpp | 46 |
9 files changed, 206 insertions, 133 deletions
diff --git a/include/armnn/IRuntime.hpp b/include/armnn/IRuntime.hpp index 870e027f33..bfc13c9c01 100644 --- a/include/armnn/IRuntime.hpp +++ b/include/armnn/IRuntime.hpp @@ -33,7 +33,7 @@ struct INetworkProperties INetworkProperties(bool importEnabled = false, bool exportEnabled = false, bool asyncEnabled = false, - size_t numThreads = 0) + size_t numThreads = 1) : m_ImportEnabled(importEnabled) , m_ExportEnabled(exportEnabled) , m_AsyncEnabled(asyncEnabled) @@ -45,7 +45,7 @@ struct INetworkProperties INetworkProperties(bool asyncEnabled, MemorySource m_InputSource, MemorySource m_OutputSource, - size_t numThreads = 0) + size_t numThreads = 1) : m_ImportEnabled(m_InputSource != MemorySource::Undefined) , m_ExportEnabled(m_OutputSource != MemorySource::Undefined) , m_AsyncEnabled(asyncEnabled) diff --git a/src/armnn/LoadedNetwork.cpp b/src/armnn/LoadedNetwork.cpp index 53a9e18863..25f8ddf6e5 100644 --- a/src/armnn/LoadedNetwork.cpp +++ b/src/armnn/LoadedNetwork.cpp @@ -169,17 +169,17 @@ LoadedNetwork::LoadedNetwork(std::unique_ptr<IOptimizedNetwork> net, } // Create the thread pool which will have working memory handles assigned to each thread - // Should occur after factories are registered so thet the WorkingMemHandles can be created - if (m_NetworkProperties.m_NumThreads > 0 && networkProperties.m_AsyncEnabled) + // Should occur after factories are registered so that the WorkingMemHandles can be created + if (m_NetworkProperties.m_NumThreads > 1 && networkProperties.m_AsyncEnabled) { CreateThreadPool(m_NetworkProperties.m_NumThreads); } if (!networkProperties.m_AsyncEnabled) { - for (auto &&layer : order) + for (auto&& layer : order) { - auto &workloadFactory = GetWorkloadFactory(*layer); + auto& workloadFactory = GetWorkloadFactory(*layer); switch (layer->GetType()) { diff --git a/src/backends/backendsCommon/test/StridedSliceAsyncEndToEndTest.hpp b/src/backends/backendsCommon/test/StridedSliceAsyncEndToEndTest.hpp index 87cade7dca..0ad6bc4964 100644 --- 
a/src/backends/backendsCommon/test/StridedSliceAsyncEndToEndTest.hpp +++ b/src/backends/backendsCommon/test/StridedSliceAsyncEndToEndTest.hpp @@ -123,7 +123,7 @@ void AsyncEndToEndTestImpl(INetworkPtr network, const std::map<int, std::vector<TOutput>>& expectedOutputData, std::vector<BackendId> backends, float tolerance = 0.000001f, - size_t numThreads = 0) + size_t numThreads = 1) { // Create Runtime in which test will run IRuntime::CreationOptions options; @@ -161,7 +161,7 @@ void AsyncEndToEndTestImpl(INetworkPtr network, outputStorage.at(it.first).data())}); } - if (numThreads == 0) + if (numThreads <= 1) { // Create WorkingMemHandle for this async network std::unique_ptr<IWorkingMemHandle> workingMemHandle = runtime->CreateWorkingMemHandle(networkId); @@ -254,7 +254,7 @@ INetworkPtr CreateStridedSliceNetwork(const TensorShape& inputShape, } template<armnn::DataType ArmnnType> -void StridedSlicedEndToEndTest(const std::vector<BackendId>& backends) +void StridedSlicedEndToEndTest(const std::vector<BackendId>& backends, size_t numThreads) { using namespace armnn; using T = ResolveType<ArmnnType>; @@ -300,103 +300,12 @@ void StridedSlicedEndToEndTest(const std::vector<BackendId>& backends) std::map<int, std::vector<T>> inputTensorData = {{0, inputData}}; std::map<int, std::vector<T>> expectedOutputData = {{0, outputExpected}}; - AsyncEndToEndTestImpl<ArmnnType, ArmnnType>(move(net), inputTensorData, expectedOutputData, backends, 0.000001f); -} - -template<armnn::DataType ArmnnType> -void AsyncScheduledStridedSlicedEndToEndTest(const std::vector<BackendId>& backends) -{ - using namespace armnn; - using T = ResolveType<ArmnnType>; - - const TensorShape& inputShape = {3, 2, 3, 1}; - const TensorShape& outputShape = {1, 2, 3, 1}; - const std::vector<int>& beginData = {1, 0, 0, 0}; - const std::vector<int>& endData = {2, 2, 3, 1}; - const std::vector<int>& stridesData = {1, 1, 1, 1}; - int beginMask = 0; - int endMask = 0; - int shrinkAxisMask = 0; - int ellipsisMask = 
0; - int newAxisMask = 0; - - // Builds up the structure of the network - INetworkPtr net = CreateStridedSliceNetwork<ArmnnType>(inputShape, - outputShape, - beginData, - endData, - stridesData, - beginMask, - endMask, - shrinkAxisMask, - ellipsisMask, - newAxisMask); - - // Creates structures for input & output. - std::vector<T> inputData{ - 1.0f, 1.0f, 1.0f, 2.0f, 2.0f, 2.0f, - - 3.0f, 3.0f, 3.0f, 4.0f, 4.0f, 4.0f, - - 5.0f, 5.0f, 5.0f, 6.0f, 6.0f, 6.0f - }; - - std::vector<T> outputExpected{ - 3.0f, 3.0f, 3.0f, 4.0f, 4.0f, 4.0f - }; - - std::map<int, std::vector<T>> inputTensorData = {{0, inputData}}; - std::map<int, std::vector<T>> expectedOutputData = {{0, outputExpected}}; - - AsyncEndToEndTestImpl<ArmnnType, ArmnnType>(move(net), inputTensorData, expectedOutputData, backends, 0.000001f, 1); -} - -template<armnn::DataType ArmnnType> -void AsyncScheduledStridedSlicedMultiThreadedEndToEndTest(const std::vector<BackendId>& backends) -{ - using namespace armnn; - using T = ResolveType<ArmnnType>; - - const TensorShape& inputShape = {3, 2, 3, 1}; - const TensorShape& outputShape = {1, 2, 3, 1}; - const std::vector<int>& beginData = {1, 0, 0, 0}; - const std::vector<int>& endData = {2, 2, 3, 1}; - const std::vector<int>& stridesData = {1, 1, 1, 1}; - int beginMask = 0; - int endMask = 0; - int shrinkAxisMask = 0; - int ellipsisMask = 0; - int newAxisMask = 0; - - // Builds up the structure of the network - INetworkPtr net = CreateStridedSliceNetwork<ArmnnType>(inputShape, - outputShape, - beginData, - endData, - stridesData, - beginMask, - endMask, - shrinkAxisMask, - ellipsisMask, - newAxisMask); - - // Creates structures for input & output. 
- std::vector<T> inputData{ - 1.0f, 1.0f, 1.0f, 2.0f, 2.0f, 2.0f, - - 3.0f, 3.0f, 3.0f, 4.0f, 4.0f, 4.0f, - - 5.0f, 5.0f, 5.0f, 6.0f, 6.0f, 6.0f - }; - - std::vector<T> outputExpected{ - 3.0f, 3.0f, 3.0f, 4.0f, 4.0f, 4.0f - }; - - std::map<int, std::vector<T>> inputTensorData = {{0, inputData}}; - std::map<int, std::vector<T>> expectedOutputData = {{0, outputExpected}}; - - AsyncEndToEndTestImpl<ArmnnType, ArmnnType>(move(net), inputTensorData, expectedOutputData, backends, 0.000001f, 3); + AsyncEndToEndTestImpl<ArmnnType, ArmnnType>(move(net), + inputTensorData, + expectedOutputData, + backends, + 0.000001f, + numThreads); } template<armnn::DataType ArmnnType> diff --git a/src/backends/reference/test/RefEndToEndTests.cpp b/src/backends/reference/test/RefEndToEndTests.cpp index 54b16dac38..910df292a9 100644 --- a/src/backends/reference/test/RefEndToEndTests.cpp +++ b/src/backends/reference/test/RefEndToEndTests.cpp @@ -1336,9 +1336,9 @@ BOOST_AUTO_TEST_CASE(RefStridedSliceInvalidSliceEndToEndTest) StridedSliceInvalidSliceEndToEndTest(defaultBackends); } -BOOST_AUTO_TEST_CASE(RefAsyncFP32StridedSlicedEndToEndTest) +BOOST_AUTO_TEST_CASE(RefThreadSafeFP32StridedSlicedEndToEndTest) { - armnn::experimental::StridedSlicedEndToEndTest<armnn::DataType::Float32>(defaultBackends); + armnn::experimental::StridedSlicedEndToEndTest<armnn::DataType::Float32>(defaultBackends, 1); } BOOST_AUTO_TEST_CASE(RefAsyncFP32StridedSlicedMultiThreadedEndToEndTest) @@ -1346,15 +1346,9 @@ BOOST_AUTO_TEST_CASE(RefAsyncFP32StridedSlicedMultiThreadedEndToEndTest) armnn::experimental::StridedSlicedMultiThreadedEndToEndTest<armnn::DataType::Float32>(defaultBackends); } -BOOST_AUTO_TEST_CASE(RefAsyncScheduledFP32StridedSlicedEndToEndTest) +BOOST_AUTO_TEST_CASE(RefAsyncFP32StridedSlicedScheduledMultiThreadedEndToEndTest) { - armnn::experimental::AsyncScheduledStridedSlicedEndToEndTest<armnn::DataType::Float32>(defaultBackends); -} - 
-BOOST_AUTO_TEST_CASE(RefAsyncScheduledStridedSlicedMultiThreadedEndToEndTest) -{ - using namespace armnn::experimental; - AsyncScheduledStridedSlicedMultiThreadedEndToEndTest<armnn::DataType::Float32>(defaultBackends); + armnn::experimental::StridedSlicedEndToEndTest<armnn::DataType::Float32>(defaultBackends, 3); } #endif diff --git a/tests/ExecuteNetwork/ExecuteNetwork.cpp b/tests/ExecuteNetwork/ExecuteNetwork.cpp index 2bbb51783c..cd760a8199 100644 --- a/tests/ExecuteNetwork/ExecuteNetwork.cpp +++ b/tests/ExecuteNetwork/ExecuteNetwork.cpp @@ -5,6 +5,8 @@ #include "NetworkExecutionUtils/NetworkExecutionUtils.hpp" #include "ExecuteNetworkProgramOptions.hpp" +#include <armnn/IAsyncExecutionCallback.hpp> +#include <AsyncExecutionCallback.hpp> #include <armnn/Logging.hpp> #include <Filesystem.hpp> @@ -276,8 +278,7 @@ template<typename TParser, typename TDataType> int MainImpl(const ExecuteNetworkParams& params, const std::shared_ptr<armnn::IRuntime>& runtime = nullptr) { - using TContainer = - mapbox::util::variant<std::vector<float>, std::vector<int>, std::vector<unsigned char>, std::vector<int8_t>>; + using namespace std::chrono; std::vector<std::vector<TContainer>> inputs; std::vector<std::vector<TContainer>> outputs; @@ -300,6 +301,7 @@ int MainImpl(const ExecuteNetworkParams& params, inferenceModelParams.m_NumberOfThreads = params.m_NumberOfThreads; inferenceModelParams.m_MLGOTuningFilePath = params.m_MLGOTuningFilePath; inferenceModelParams.m_AsyncEnabled = params.m_Concurrent; + inferenceModelParams.m_ThreadPoolSize = params.m_ThreadPoolSize; for(const std::string& inputName: params.m_InputNames) { @@ -390,9 +392,9 @@ int MainImpl(const ExecuteNetworkParams& params, outputs.push_back(outputDataContainers); } + // Synchronous execution if (!params.m_Concurrent) { - // Synchronous Execution for (size_t x = 0; x < params.m_Iterations; x++) { // model.Run returns the inference time elapsed in EnqueueWorkload (in milliseconds) @@ -437,13 +439,118 @@ int 
MainImpl(const ExecuteNetworkParams& params, } } } + // Asynchronous execution using the Arm NN thread pool + else if (params.m_ThreadPoolSize >= 2) + { + try + { + ARMNN_LOG(info) << "Asynchronous execution with Arm NN thread pool... \n"; + std::vector<armnn::experimental::IAsyncExecutionCallbackPtr> callbacks; + + // Create callbacks that will be checked post scheduling + for (size_t i = 0; i < params.m_SimultaneousIterations; ++i) + { + // Point to ArmNN example implementation of AsyncExecutionCallback + callbacks.emplace_back(std::make_shared<armnn::experimental::AsyncExecutionCallback>()); + } + + // Declare the latest and earliest inference times here to be used when calculating overall time + std::chrono::high_resolution_clock::time_point earliestStartTime; + std::chrono::high_resolution_clock::time_point latestEndTime = + std::chrono::high_resolution_clock::now(); + + // For the asynchronous execution, we are adding a pool of working memory handles (1 per thread) in the + // LoadedNetwork with each scheduled inference having a specific priority + for (size_t i = 0; i < callbacks.size(); ++i) + { + model.RunAsync(inputs[i], outputs[i], callbacks[i]); + } + + // Check the results + unsigned int j = 0; + for (armnn::experimental::IAsyncExecutionCallbackPtr cb : callbacks) + { + // Get the results + auto endTime = time_point_cast<std::chrono::milliseconds>(cb->GetEndTime()); + auto startTime = time_point_cast<std::chrono::milliseconds>(cb->GetStartTime()); + auto inferenceDuration = endTime - startTime; + + if (latestEndTime < cb->GetEndTime()) + { + latestEndTime = cb->GetEndTime(); + } + + if (earliestStartTime.time_since_epoch().count() == 0) + { + earliestStartTime = cb->GetStartTime(); + } + else if (earliestStartTime > cb->GetStartTime()) + { + earliestStartTime = cb->GetStartTime(); + } + + if (params.m_GenerateTensorData) + { + ARMNN_LOG(warning) << "The input data was generated, note that the output will not be useful"; + } + + // Print output tensors 
+ const auto& infosOut = model.GetOutputBindingInfos(); + for (size_t i = 0; i < numOutputs; i++) + { + const armnn::TensorInfo& infoOut = infosOut[i].second; + auto outputTensorFile = params.m_OutputTensorFiles.empty() + ? "" + : params.m_OutputTensorFiles[(j * numOutputs) + i]; + + TensorPrinter printer(inferenceModelParams.m_OutputBindings[i], + infoOut, + outputTensorFile, + params.m_DequantizeOutput); + mapbox::util::apply_visitor(printer, outputs[j][i]); + } + + ARMNN_LOG(info) << "\nInference time: " << std::setprecision(2) + << std::fixed << inferenceDuration.count() << " ms\n"; + + // If thresholdTime == 0.0 (default), then it hasn't been supplied at command line + if (params.m_ThresholdTime != 0.0) + { + ARMNN_LOG(info) << "Threshold time: " << std::setprecision(2) + << std::fixed << params.m_ThresholdTime << " ms"; + auto thresholdMinusInference = + params.m_ThresholdTime - duration<double, std::milli>(inferenceDuration).count(); + ARMNN_LOG(info) << "Threshold time - Inference time: " << std::setprecision(2) + << std::fixed << thresholdMinusInference << " ms" << "\n"; + + if (thresholdMinusInference < 0) + { + ARMNN_LOG(fatal) << "Elapsed inference time is greater than provided threshold time. \n"; + } + } + ++j; + } + //print duration difference between overallStartTime and overallEndTime + auto overallEndTime = time_point_cast<std::chrono::milliseconds>(latestEndTime); + auto overallStartTime = time_point_cast<std::chrono::milliseconds>(earliestStartTime); + auto totalInferenceDuration = overallEndTime - overallStartTime; + ARMNN_LOG(info) << "\nOverall Inference time: " << std::setprecision(2) + << std::fixed << totalInferenceDuration.count() << " ms\n"; + } + catch (const armnn::Exception& e) + { + ARMNN_LOG(fatal) << "Armnn Error: " << e.what(); + return EXIT_FAILURE; + } + } + // Asynchronous execution using std::launch::async else { try { - ARMNN_LOG(info) << "Asynchronous Execution... 
\n"; + ARMNN_LOG(info) << "Asynchronous Execution with std::launch::async... \n"; std::vector<std::future<std::tuple<armnn::profiling::ProfilingGuid, - std::chrono::duration<double, std::milli>>>> inferenceResults; + std::chrono::duration<double, std::milli>>>> inferenceResults; inferenceResults.reserve(params.m_SimultaneousIterations); // Create WorkingMemHandles for each inference @@ -455,6 +562,8 @@ int MainImpl(const ExecuteNetworkParams& params, } // Run each inference in its own thread + // start a timer + const auto start_time = armnn::GetTimeNow(); for (unsigned int i = 0; i < params.m_SimultaneousIterations; ++i) { armnn::experimental::IWorkingMemHandle& workingMemHandleRef = *workingMemHandles[i].get(); @@ -470,7 +579,7 @@ int MainImpl(const ExecuteNetworkParams& params, { // Get the results auto inferenceResult = inferenceResults[j].get(); - auto inference_duration = std::get<1>(inferenceResult); + auto inferenceDuration = std::get<1>(inferenceResult); auto inferenceID = std::get<0>(inferenceResult); if (params.m_GenerateTensorData) @@ -495,14 +604,14 @@ int MainImpl(const ExecuteNetworkParams& params, } ARMNN_LOG(info) << "\nInference time: " << std::setprecision(2) - << std::fixed << inference_duration.count() << " ms\n"; + << std::fixed << inferenceDuration.count() << " ms\n"; // If thresholdTime == 0.0 (default), then it hasn't been supplied at command line if (params.m_ThresholdTime != 0.0) { ARMNN_LOG(info) << "Threshold time: " << std::setprecision(2) << std::fixed << params.m_ThresholdTime << " ms"; - auto thresholdMinusInference = params.m_ThresholdTime - inference_duration.count(); + auto thresholdMinusInference = params.m_ThresholdTime - inferenceDuration.count(); ARMNN_LOG(info) << "Threshold time - Inference time: " << std::setprecision(2) << std::fixed << thresholdMinusInference << " ms" << "\n"; @@ -514,13 +623,16 @@ int MainImpl(const ExecuteNetworkParams& params, ARMNN_LOG(info) << "Asynchronous Execution is finished for Inference ID: " 
<< inferenceID << " \n"; } + // finish timer + const auto duration = armnn::GetTimeDuration(start_time); + ARMNN_LOG(info) << "\nOverall Inference time: " << std::setprecision(2) + << std::fixed << duration.count() << " ms\n"; } catch (const armnn::Exception& e) { ARMNN_LOG(fatal) << "Armnn Error: " << e.what(); return EXIT_FAILURE; } - } } catch (const armnn::Exception& e) diff --git a/tests/ExecuteNetwork/ExecuteNetworkParams.cpp b/tests/ExecuteNetwork/ExecuteNetworkParams.cpp index 944efec17f..189ece25a7 100644 --- a/tests/ExecuteNetwork/ExecuteNetworkParams.cpp +++ b/tests/ExecuteNetwork/ExecuteNetworkParams.cpp @@ -138,9 +138,11 @@ void ExecuteNetworkParams::ValidateParams() CheckModelFormat(m_ModelFormat); // Check number of simultaneous iterations - if ((m_SimultaneousIterations < 1)) + // Testing std::launch::async with a single iteration is possible if concurrent is manually set + if ((m_SimultaneousIterations <= 1 && m_ThreadPoolSize > 1) || + (m_SimultaneousIterations <= 1 && !m_Concurrent)) { - ARMNN_LOG(fatal) << "simultaneous-iterations cannot be less than 1. 
"; + ARMNN_LOG(fatal) << "simultaneous-iterations cannot be less than 2."; } // Check input tensor shapes diff --git a/tests/ExecuteNetwork/ExecuteNetworkParams.hpp b/tests/ExecuteNetwork/ExecuteNetworkParams.hpp index c325df110f..fe0c446087 100644 --- a/tests/ExecuteNetwork/ExecuteNetworkParams.hpp +++ b/tests/ExecuteNetwork/ExecuteNetworkParams.hpp @@ -57,6 +57,7 @@ struct ExecuteNetworkParams std::string m_TuningPath; std::string m_MLGOTuningFilePath; TfLiteExecutor m_TfLiteExecutor; + size_t m_ThreadPoolSize; // Ensures that the parameters for ExecuteNetwork fit together void ValidateParams(); diff --git a/tests/ExecuteNetwork/ExecuteNetworkProgramOptions.cpp b/tests/ExecuteNetwork/ExecuteNetworkProgramOptions.cpp index 042087e4f4..1f57f85252 100644 --- a/tests/ExecuteNetwork/ExecuteNetworkProgramOptions.cpp +++ b/tests/ExecuteNetwork/ExecuteNetworkProgramOptions.cpp @@ -195,7 +195,8 @@ ProgramOptions::ProgramOptions() : m_CxxOptions{"ExecuteNetwork", cxxopts::value<std::string>(m_RuntimeOptions.m_DynamicBackendsPath)) ("n,concurrent", - "If this option is enabled inferences will be executed in parallel asynchronously.", + "This option is for Arm NN internal asynchronous testing purposes. 
" + "By default it is set to true if thread-pool-size or simultaneous-iterations are greater than 1", cxxopts::value<bool>(m_ExNetParams.m_Concurrent)->default_value("false")->implicit_value("true")) ("d,input-tensor-data", @@ -284,9 +285,15 @@ ProgramOptions::ProgramOptions() : m_CxxOptions{"ExecuteNetwork", "This option is depreciated please use tflite-executor instead", cxxopts::value<bool>(m_ExNetParams.m_EnableDelegate)->default_value("false")->implicit_value("true")) - ("simultaneous-iterations", - "Number of simultaneous iterations to async-run the network for, default is set to 1", - cxxopts::value<size_t>(m_ExNetParams.m_SimultaneousIterations)->default_value("1")); + ("simultaneous-iterations", + "Number of simultaneous iterations to async-run the network for, default is set to 1 (disabled). " + "When thread-pool-size is set the Arm NN thread pool is used. Otherwise std::launch::async is used.", + cxxopts::value<size_t>(m_ExNetParams.m_SimultaneousIterations)->default_value("1")) + + ("thread-pool-size", + "Number of Arm NN threads to use when running the network asynchronously via the Arm NN thread pool. " + "The default is set to 1", + cxxopts::value<size_t>(m_ExNetParams.m_ThreadPoolSize)->default_value("1")); m_CxxOptions.add_options("c) Optimization") ("bf16-turbo-mode", @@ -453,7 +460,11 @@ void ProgramOptions::ParseOptions(int ac, const char* av[]) "please use tflite-executor instead."); } - + // Set concurrent to true if the user expects to run inferences asynchronously + if (m_ExNetParams.m_SimultaneousIterations > 1) + { + m_ExNetParams.m_Concurrent = true; + } // Parse input tensor shape from the string we got from the command-line. 
std::vector<std::string> inputTensorShapesVector = diff --git a/tests/InferenceModel.hpp b/tests/InferenceModel.hpp index 3429598249..7c51011a22 100644 --- a/tests/InferenceModel.hpp +++ b/tests/InferenceModel.hpp @@ -102,6 +102,7 @@ struct Params unsigned int m_NumberOfThreads; std::string m_MLGOTuningFilePath; bool m_AsyncEnabled; + size_t m_ThreadPoolSize; Params() @@ -120,6 +121,7 @@ struct Params , m_NumberOfThreads(0) , m_MLGOTuningFilePath("") , m_AsyncEnabled(false) + , m_ThreadPoolSize(1) {} }; @@ -481,7 +483,8 @@ public: const auto loading_start_time = armnn::GetTimeNow(); armnn::INetworkProperties networkProperties(params.m_AsyncEnabled, armnn::MemorySource::Undefined, - armnn::MemorySource::Undefined); + armnn::MemorySource::Undefined, + params.m_ThreadPoolSize); std::string errorMessage; ret = m_Runtime->LoadNetwork(m_NetworkIdentifier, std::move(optNet), errorMessage, networkProperties); @@ -632,6 +635,47 @@ public: } } + void RunAsync(const std::vector<TContainer>& inputContainers, + std::vector<TContainer>& outputContainers, + armnn::experimental::IAsyncExecutionCallbackPtr cb) + { + for (unsigned int i = 0; i < outputContainers.size(); ++i) + { + const unsigned int expectedOutputDataSize = GetOutputSize(i); + + mapbox::util::apply_visitor([expectedOutputDataSize, i](auto&& value) + { + const unsigned int actualOutputDataSize = armnn::numeric_cast<unsigned int>(value.size()); + if (actualOutputDataSize < expectedOutputDataSize) + { + unsigned int outputIndex = i; + throw armnn::Exception( + fmt::format("Not enough data for output #{0}: expected " + "{1} elements, got {2}", outputIndex, expectedOutputDataSize, actualOutputDataSize)); + } + }, + outputContainers[i]); + } + + std::shared_ptr<armnn::IProfiler> profiler = m_Runtime->GetProfiler(m_NetworkIdentifier); + if (profiler) + { + profiler->EnableProfiling(m_EnableProfiling); + } + + m_Runtime->Schedule(m_NetworkIdentifier, + MakeInputTensors(inputContainers), + 
MakeOutputTensors(outputContainers), + armnn::QosExecPriority::Medium, + cb); + + // if profiling is enabled print out the results + if (profiler && profiler->IsProfilingEnabled()) + { + profiler->Print(std::cout); + } + } + const armnn::BindingPointInfo& GetInputBindingInfo(unsigned int inputIndex = 0u) const { CheckInputIndexIsValid(inputIndex); |