From b4b3ac91990eb5deaffca2300319f2ddf7aa0886 Mon Sep 17 00:00:00 2001
From: Kevin May
Date: Fri, 21 May 2021 16:42:21 +0100
Subject: IVGCVSW-6009 Integrate threadpool into ExNet

* Remove concurrent flag from ExecuteNetwork as it is possible to deduce if
  SimultaneousIterations > 1
* Add void RunAsync()
* Refactor some unit tests

Change-Id: I7021d4821b0e460470908294cbd9462850e8b361
Signed-off-by: Keith Davis
Signed-off-by: Kevin May
---
 tests/ExecuteNetwork/ExecuteNetwork.cpp       | 130 +++++++++++++++++++--
 tests/ExecuteNetwork/ExecuteNetworkParams.cpp |   6 +-
 tests/ExecuteNetwork/ExecuteNetworkParams.hpp |   1 +
 .../ExecuteNetworkProgramOptions.cpp          |  21 +++-
 tests/InferenceModel.hpp                      |  46 +++++++-
 5 files changed, 187 insertions(+), 17 deletions(-)
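As an illustration of how the two asynchronous modes in this patch are selected from the ExecuteNetwork command line: only --concurrent, --simultaneous-iterations and --thread-pool-size come from this patch; the model and backend arguments below are assumed placeholders, not part of the change.

    # std::launch::async path: one std::async task per simultaneous iteration
    ExecuteNetwork -m model.tflite -f tflite-binary -c CpuAcc --simultaneous-iterations 4

    # Arm NN thread pool path: the same iterations scheduled onto a two-thread pool
    ExecuteNetwork -m model.tflite -f tflite-binary -c CpuAcc --simultaneous-iterations 4 --thread-pool-size 2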
diff --git a/tests/ExecuteNetwork/ExecuteNetwork.cpp b/tests/ExecuteNetwork/ExecuteNetwork.cpp
index 2bbb51783c..cd760a8199 100644
--- a/tests/ExecuteNetwork/ExecuteNetwork.cpp
+++ b/tests/ExecuteNetwork/ExecuteNetwork.cpp
@@ -5,6 +5,8 @@
 #include "NetworkExecutionUtils/NetworkExecutionUtils.hpp"
 #include "ExecuteNetworkProgramOptions.hpp"
+#include
+#include
 #include
 #include
@@ -276,8 +278,7 @@ template
 int MainImpl(const ExecuteNetworkParams& params,
              const std::shared_ptr& runtime = nullptr)
 {
-    using TContainer =
-        mapbox::util::variant, std::vector, std::vector, std::vector>;
+    using namespace std::chrono;

     std::vector> inputs;
     std::vector> outputs;
@@ -300,6 +301,7 @@ int MainImpl(const ExecuteNetworkParams& params,
     inferenceModelParams.m_NumberOfThreads    = params.m_NumberOfThreads;
     inferenceModelParams.m_MLGOTuningFilePath = params.m_MLGOTuningFilePath;
     inferenceModelParams.m_AsyncEnabled       = params.m_Concurrent;
+    inferenceModelParams.m_ThreadPoolSize     = params.m_ThreadPoolSize;

     for(const std::string& inputName: params.m_InputNames)
     {
@@ -390,9 +392,9 @@ int MainImpl(const ExecuteNetworkParams& params,
         outputs.push_back(outputDataContainers);
     }

+    // Synchronous execution
     if (!params.m_Concurrent)
     {
-        // Synchronous Execution
         for (size_t x = 0; x < params.m_Iterations; x++)
         {
             // model.Run returns the inference time elapsed in EnqueueWorkload (in milliseconds)
@@ -437,13 +439,118 @@ int MainImpl(const ExecuteNetworkParams& params,
             }
         }
     }
+    // Asynchronous execution using the Arm NN thread pool
+    else if (params.m_ThreadPoolSize >= 2)
+    {
+        try
+        {
+            ARMNN_LOG(info) << "Asynchronous execution with Arm NN thread pool... \n";
+            std::vector<armnn::experimental::IAsyncExecutionCallbackPtr> callbacks;
+
+            // Create callbacks that will be checked post scheduling
+            for (size_t i = 0; i < params.m_SimultaneousIterations; ++i)
+            {
+                // Point to ArmNN example implementation of AsyncExecutionCallback
+                callbacks.emplace_back(std::make_shared<armnn::experimental::AsyncExecutionCallback>());
+            }
+
+            // Declare the latest and earliest inference times here to be used when calculating overall time
+            std::chrono::high_resolution_clock::time_point earliestStartTime;
+            std::chrono::high_resolution_clock::time_point latestEndTime =
+                std::chrono::high_resolution_clock::now();
+
+            // For the asynchronous execution, we are adding a pool of working memory handles (1 per thread) in the
+            // LoadedNetwork with each scheduled inference having a specific priority
+            for (size_t i = 0; i < callbacks.size(); ++i)
+            {
+                model.RunAsync(inputs[i], outputs[i], callbacks[i]);
+            }
+
+            // Check the results
+            unsigned int j = 0;
+            for (armnn::experimental::IAsyncExecutionCallbackPtr cb : callbacks)
+            {
+                // Get the results
+                auto endTime = time_point_cast(cb->GetEndTime());
+                auto startTime = time_point_cast(cb->GetStartTime());
+                auto inferenceDuration = endTime - startTime;
+
+                if (latestEndTime < cb->GetEndTime())
+                {
+                    latestEndTime = cb->GetEndTime();
+                }
+
+                if (earliestStartTime.time_since_epoch().count() == 0)
+                {
+                    earliestStartTime = cb->GetStartTime();
+                }
+                else if (earliestStartTime > cb->GetStartTime())
+                {
+                    earliestStartTime = cb->GetStartTime();
+                }
+
+                if (params.m_GenerateTensorData)
+                {
+                    ARMNN_LOG(warning) << "The input data was generated, note that the output will not be useful";
+                }
+
+                // Print output tensors
+                const auto& infosOut = model.GetOutputBindingInfos();
+                for (size_t i = 0; i < numOutputs; i++)
+                {
+                    const armnn::TensorInfo& infoOut = infosOut[i].second;
+                    auto outputTensorFile = params.m_OutputTensorFiles.empty()
+                                            ? ""
+                                            : params.m_OutputTensorFiles[(j * numOutputs) + i];
+
+                    TensorPrinter printer(inferenceModelParams.m_OutputBindings[i],
+                                          infoOut,
+                                          outputTensorFile,
+                                          params.m_DequantizeOutput);
+                    mapbox::util::apply_visitor(printer, outputs[j][i]);
+                }
+
+                ARMNN_LOG(info) << "\nInference time: " << std::setprecision(2)
+                                << std::fixed << inferenceDuration.count() << " ms\n";
+
+                // If thresholdTime == 0.0 (default), then it hasn't been supplied at command line
+                if (params.m_ThresholdTime != 0.0)
+                {
+                    ARMNN_LOG(info) << "Threshold time: " << std::setprecision(2)
+                                    << std::fixed << params.m_ThresholdTime << " ms";
+                    auto thresholdMinusInference =
+                        params.m_ThresholdTime - duration(inferenceDuration).count();
+                    ARMNN_LOG(info) << "Threshold time - Inference time: " << std::setprecision(2)
+                                    << std::fixed << thresholdMinusInference << " ms" << "\n";
+
+                    if (thresholdMinusInference < 0)
+                    {
+                        ARMNN_LOG(fatal) << "Elapsed inference time is greater than provided threshold time. \n";
+                    }
+                }
+                ++j;
+            }
+            // Print duration difference between overallStartTime and overallEndTime
+            auto overallEndTime = time_point_cast(latestEndTime);
+            auto overallStartTime = time_point_cast(earliestStartTime);
+            auto totalInferenceDuration = overallEndTime - overallStartTime;
+            ARMNN_LOG(info) << "\nOverall Inference time: " << std::setprecision(2)
+                            << std::fixed << totalInferenceDuration.count() << " ms\n";
+        }
+        catch (const armnn::Exception& e)
+        {
+            ARMNN_LOG(fatal) << "Armnn Error: " << e.what();
+            return EXIT_FAILURE;
+        }
+    }
+    // Asynchronous execution using std::launch::async
     else
     {
         try
         {
-            ARMNN_LOG(info) << "Asynchronous Execution... \n";
+            ARMNN_LOG(info) << "Asynchronous Execution with std::launch::async... \n";
             std::vector>>> inferenceResults;
+                std::chrono::duration>>> inferenceResults;
             inferenceResults.reserve(params.m_SimultaneousIterations);

             // Create WorkingMemHandles for each inference
@@ -455,6 +562,8 @@ int MainImpl(const ExecuteNetworkParams& params,
             }

             // Run each inference in its own thread
+            // start a timer
+            const auto start_time = armnn::GetTimeNow();
             for (unsigned int i = 0; i < params.m_SimultaneousIterations; ++i)
             {
                 armnn::experimental::IWorkingMemHandle& workingMemHandleRef = *workingMemHandles[i].get();
@@ -470,7 +579,7 @@
             {
                 // Get the results
                 auto inferenceResult = inferenceResults[j].get();
-                auto inference_duration = std::get<1>(inferenceResult);
+                auto inferenceDuration = std::get<1>(inferenceResult);
                 auto inferenceID = std::get<0>(inferenceResult);

                 if (params.m_GenerateTensorData)
@@ -495,14 +604,14 @@
                 }

                 ARMNN_LOG(info) << "\nInference time: " << std::setprecision(2)
-                                << std::fixed << inference_duration.count() << " ms\n";
+                                << std::fixed << inferenceDuration.count() << " ms\n";

                 // If thresholdTime == 0.0 (default), then it hasn't been supplied at command line
                 if (params.m_ThresholdTime != 0.0)
                 {
                     ARMNN_LOG(info) << "Threshold time: " << std::setprecision(2)
                                     << std::fixed << params.m_ThresholdTime << " ms";
-                    auto thresholdMinusInference = params.m_ThresholdTime - inference_duration.count();
+                    auto thresholdMinusInference = params.m_ThresholdTime - inferenceDuration.count();
                     ARMNN_LOG(info) << "Threshold time - Inference time: " << std::setprecision(2)
                                     << std::fixed << thresholdMinusInference << " ms" << "\n";

@@ -514,13 +623,16 @@
                 ARMNN_LOG(info) << "Asynchronous Execution is finished for Inference ID: " << inferenceID << " \n";
             }
+            // finish timer
+            const auto duration = armnn::GetTimeDuration(start_time);
+            ARMNN_LOG(info) << "\nOverall Inference time: " << std::setprecision(2)
+                            << std::fixed << duration.count() << " ms\n";
         }
         catch (const armnn::Exception& e)
         {
             ARMNN_LOG(fatal) << "Armnn Error: " << e.what();
             return EXIT_FAILURE;
         }
-        }
     }
     }

     catch (const armnn::Exception& e)
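A note on the thread pool timing above: the per-inference time comes from each callback's own start/end stamps, while the overall figure is the wall-clock span from the earliest recorded start to the latest recorded end, not the sum of the individual times. The following self-contained sketch isolates that aggregation; MockCallback is a hypothetical stand-in for armnn::experimental::IAsyncExecutionCallback and only models the two accessors the diff relies on (GetStartTime/GetEndTime).

    // Stand-alone sketch (uses nothing from Arm NN): per-inference and overall timing
    // computed the same way as the thread pool branch in ExecuteNetwork.cpp above.
    #include <chrono>
    #include <iostream>
    #include <vector>

    using Clock = std::chrono::high_resolution_clock;

    struct MockCallback                       // hypothetical stand-in, not an Arm NN type
    {
        Clock::time_point start;
        Clock::time_point end;
        Clock::time_point GetStartTime() const { return start; }
        Clock::time_point GetEndTime() const   { return end; }
    };

    int main()
    {
        std::vector<MockCallback> callbacks(4);
        for (auto& cb : callbacks)            // pretend each inference took ~1 ms
        {
            cb.start = Clock::now();
            cb.end   = cb.start + std::chrono::milliseconds(1);
        }

        Clock::time_point earliestStart{};    // an epoch count of 0 means "not set yet"
        Clock::time_point latestEnd = Clock::now();

        for (const auto& cb : callbacks)
        {
            const double inferenceMs =
                std::chrono::duration<double, std::milli>(cb.GetEndTime() - cb.GetStartTime()).count();
            std::cout << "Inference time: " << inferenceMs << " ms\n";

            if (latestEnd < cb.GetEndTime())
            {
                latestEnd = cb.GetEndTime();
            }
            if (earliestStart.time_since_epoch().count() == 0 || earliestStart > cb.GetStartTime())
            {
                earliestStart = cb.GetStartTime();
            }
        }

        std::cout << "Overall inference time: "
                  << std::chrono::duration<double, std::milli>(latestEnd - earliestStart).count()
                  << " ms\n";
    }

As in the diff, initialising latestEnd to now() before anything has finished is harmless: every end time observed afterwards is later, so the first comparison overwrites it.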
diff --git a/tests/ExecuteNetwork/ExecuteNetworkParams.cpp b/tests/ExecuteNetwork/ExecuteNetworkParams.cpp
index 944efec17f..189ece25a7 100644
--- a/tests/ExecuteNetwork/ExecuteNetworkParams.cpp
+++ b/tests/ExecuteNetwork/ExecuteNetworkParams.cpp
@@ -138,9 +138,11 @@ void ExecuteNetworkParams::ValidateParams()
     CheckModelFormat(m_ModelFormat);

     // Check number of simultaneous iterations
-    if ((m_SimultaneousIterations < 1))
+    // Testing std::launch::async with a single iteration is possible if concurrent is manually set
+    if ((m_SimultaneousIterations <= 1 && m_ThreadPoolSize > 1) ||
+        (m_SimultaneousIterations <= 1 && !m_Concurrent))
     {
-        ARMNN_LOG(fatal) << "simultaneous-iterations cannot be less than 1.";
+        ARMNN_LOG(fatal) << "simultaneous-iterations cannot be less than 2.";
     }

     // Check input tensor shapes
diff --git a/tests/ExecuteNetwork/ExecuteNetworkParams.hpp b/tests/ExecuteNetwork/ExecuteNetworkParams.hpp
index c325df110f..fe0c446087 100644
--- a/tests/ExecuteNetwork/ExecuteNetworkParams.hpp
+++ b/tests/ExecuteNetwork/ExecuteNetworkParams.hpp
@@ -57,6 +57,7 @@ struct ExecuteNetworkParams
     std::string    m_TuningPath;
     std::string    m_MLGOTuningFilePath;
     TfLiteExecutor m_TfLiteExecutor;
+    size_t         m_ThreadPoolSize;

     // Ensures that the parameters for ExecuteNetwork fit together
     void ValidateParams();
diff --git a/tests/ExecuteNetwork/ExecuteNetworkProgramOptions.cpp b/tests/ExecuteNetwork/ExecuteNetworkProgramOptions.cpp
index 042087e4f4..1f57f85252 100644
--- a/tests/ExecuteNetwork/ExecuteNetworkProgramOptions.cpp
+++ b/tests/ExecuteNetwork/ExecuteNetworkProgramOptions.cpp
@@ -195,7 +195,8 @@ ProgramOptions::ProgramOptions() : m_CxxOptions{"ExecuteNetwork",
                 cxxopts::value(m_RuntimeOptions.m_DynamicBackendsPath))

                ("n,concurrent",
-                "If this option is enabled inferences will be executed in parallel asynchronously.",
+                "This option is for Arm NN internal asynchronous testing purposes. "
+                "By default it is set to true if thread-pool-size or simultaneous-iterations are greater than 1",
                 cxxopts::value(m_ExNetParams.m_Concurrent)->default_value("false")->implicit_value("true"))

                ("d,input-tensor-data",
@@ -284,9 +285,15 @@ ProgramOptions::ProgramOptions() : m_CxxOptions{"ExecuteNetwork",
                 "This option is depreciated please use tflite-executor instead",
                 cxxopts::value(m_ExNetParams.m_EnableDelegate)->default_value("false")->implicit_value("true"))

-               ("simultaneous-iterations",
-                "Number of simultaneous iterations to async-run the network for, default is set to 1",
-                cxxopts::value(m_ExNetParams.m_SimultaneousIterations)->default_value("1"));
+               ("simultaneous-iterations",
+                "Number of simultaneous iterations to async-run the network for, default is set to 1 (disabled). "
+                "When thread-pool-size is set the Arm NN thread pool is used. Otherwise std::launch::async is used.",
+                cxxopts::value(m_ExNetParams.m_SimultaneousIterations)->default_value("1"))
+
+               ("thread-pool-size",
+                "Number of Arm NN threads to use when running the network asynchronously via the Arm NN thread pool. "
+                "The default is set to 1",
+                cxxopts::value(m_ExNetParams.m_ThreadPoolSize)->default_value("1"));

         m_CxxOptions.add_options("c) Optimization")
                 ("bf16-turbo-mode",
@@ -453,7 +460,11 @@ void ProgramOptions::ParseOptions(int ac, const char* av[])
                                  "please use tflite-executor instead.");
     }

-
+    // Set concurrent to true if the user expects to run inferences asynchronously
+    if (m_ExNetParams.m_SimultaneousIterations > 1)
+    {
+        m_ExNetParams.m_Concurrent = true;
+    }

     // Parse input tensor shape from the string we got from the command-line.
     std::vector inputTensorShapesVector =
diff --git a/tests/InferenceModel.hpp b/tests/InferenceModel.hpp
index 3429598249..7c51011a22 100644
--- a/tests/InferenceModel.hpp
+++ b/tests/InferenceModel.hpp
@@ -102,6 +102,7 @@ struct Params
     unsigned int  m_NumberOfThreads;
     std::string   m_MLGOTuningFilePath;
     bool          m_AsyncEnabled;
+    size_t        m_ThreadPoolSize;


     Params()
@@ -120,6 +121,7 @@ struct Params
         , m_NumberOfThreads(0)
         , m_MLGOTuningFilePath("")
         , m_AsyncEnabled(false)
+        , m_ThreadPoolSize(1)
     {}
 };

@@ -481,7 +483,8 @@ public:
         const auto loading_start_time = armnn::GetTimeNow();
         armnn::INetworkProperties networkProperties(params.m_AsyncEnabled,
                                                     armnn::MemorySource::Undefined,
-                                                    armnn::MemorySource::Undefined);
+                                                    armnn::MemorySource::Undefined,
+                                                    params.m_ThreadPoolSize);
         std::string errorMessage;

         ret = m_Runtime->LoadNetwork(m_NetworkIdentifier, std::move(optNet), errorMessage, networkProperties);
@@ -632,6 +635,47 @@ public:
         }
     }

+    void RunAsync(const std::vector<TContainer>& inputContainers,
+                  std::vector<TContainer>& outputContainers,
+                  armnn::experimental::IAsyncExecutionCallbackPtr cb)
+    {
+        for (unsigned int i = 0; i < outputContainers.size(); ++i)
+        {
+            const unsigned int expectedOutputDataSize = GetOutputSize(i);
+
+            mapbox::util::apply_visitor([expectedOutputDataSize, i](auto&& value)
+            {
+                const unsigned int actualOutputDataSize = armnn::numeric_cast<unsigned int>(value.size());
+                if (actualOutputDataSize < expectedOutputDataSize)
+                {
+                    unsigned int outputIndex = i;
+                    throw armnn::Exception(
+                        fmt::format("Not enough data for output #{0}: expected "
+                                    "{1} elements, got {2}", outputIndex, expectedOutputDataSize, actualOutputDataSize));
+                }
+            },
+            outputContainers[i]);
+        }
+
+        std::shared_ptr<armnn::IProfiler> profiler = m_Runtime->GetProfiler(m_NetworkIdentifier);
+        if (profiler)
+        {
+            profiler->EnableProfiling(m_EnableProfiling);
+        }
+
+        m_Runtime->Schedule(m_NetworkIdentifier,
+                            MakeInputTensors(inputContainers),
+                            MakeOutputTensors(outputContainers),
+                            armnn::QosExecPriority::Medium,
+                            cb);
+
+        // if profiling is enabled print out the results
+        if (profiler && profiler->IsProfilingEnabled())
+        {
+            profiler->Print(std::cout);
+        }
+    }
+
     const armnn::BindingPointInfo& GetInputBindingInfo(unsigned int inputIndex = 0u) const
     {
         CheckInputIndexIsValid(inputIndex);
--
cgit v1.2.1
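For comparison with the thread pool path, the fallback branch keeps the pre-existing pattern: one std::launch::async task per iteration, each returning an (inference id, duration) tuple that the main thread collects with future::get(), while the overall time is a plain start/stop timer around the whole batch. A compilable, Arm NN-free sketch of that shape, where DummyInference is a placeholder for the call into InferenceModel:

    #include <chrono>
    #include <future>
    #include <iostream>
    #include <thread>
    #include <tuple>
    #include <vector>

    using InferenceResult = std::tuple<unsigned int, std::chrono::duration<double, std::milli>>;

    // Placeholder for InferenceModel::RunAsync(workingMemHandle, inputs, outputs) in the diff.
    InferenceResult DummyInference(unsigned int id)
    {
        const auto start = std::chrono::high_resolution_clock::now();
        std::this_thread::sleep_for(std::chrono::milliseconds(5));   // pretend to run the network
        return { id, std::chrono::high_resolution_clock::now() - start };
    }

    int main()
    {
        const unsigned int simultaneousIterations = 4;
        std::vector<std::future<InferenceResult>> inferenceResults;
        inferenceResults.reserve(simultaneousIterations);

        const auto overallStart = std::chrono::high_resolution_clock::now();
        for (unsigned int i = 0; i < simultaneousIterations; ++i)
        {
            // One thread per iteration, exactly like the std::launch::async branch above.
            inferenceResults.push_back(std::async(std::launch::async, DummyInference, i));
        }

        for (auto& result : inferenceResults)
        {
            const auto [id, duration] = result.get();   // blocks until that inference finishes
            std::cout << "Inference " << id << " took " << duration.count() << " ms\n";
        }

        const std::chrono::duration<double, std::milli> overall =
            std::chrono::high_resolution_clock::now() - overallStart;
        std::cout << "Overall inference time: " << overall.count() << " ms\n";
        return 0;
    }

The trade-off between the two branches is the thread count: this pattern spawns one thread per simultaneous iteration, whereas the new thread pool path schedules the same number of inferences onto a fixed pool of thread-pool-size threads (each with its own working memory handle, as the diff's comment notes) and reads the timings back through callbacks.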