From ca3a3e0fd86a07de9e073ba31dc2b42d6ca84536 Mon Sep 17 00:00:00 2001
From: Finn Williams
Date: Fri, 11 Jun 2021 15:04:02 +0100
Subject: IVGCVSW-6062 Rework the async threadpool

!armnn:5801

Signed-off-by: Finn Williams
Change-Id: I9964d0899ce752441f380edddbd974010257b2dd
---
 1.2/ArmnnDriverImpl.cpp    |  6 +++---
 1.3/ArmnnDriverImpl.cpp    |  6 +++---
 ArmnnDriverImpl.cpp        |  6 +++---
 ArmnnPreparedModel.cpp     | 22 +++++++++++++++-------
 ArmnnPreparedModel.hpp     | 35 ++++++++---------------------------
 ArmnnPreparedModel_1_2.cpp | 22 +++++++++++++++-------
 ArmnnPreparedModel_1_2.hpp | 46 +++++++++++++---------------------------------
 ArmnnPreparedModel_1_3.cpp | 22 +++++++++++++++-------
 ArmnnPreparedModel_1_3.hpp | 30 ++++++------------------------
 9 files changed, 81 insertions(+), 114 deletions(-)

diff --git a/1.2/ArmnnDriverImpl.cpp b/1.2/ArmnnDriverImpl.cpp
index 3eae526f..afe3060d 100644
--- a/1.2/ArmnnDriverImpl.cpp
+++ b/1.2/ArmnnDriverImpl.cpp
@@ -191,8 +191,7 @@ Return<V1_0::ErrorStatus> ArmnnDriverImpl::prepareArmnnModel_1_2(
     std::string msg;
     armnn::INetworkProperties networkProperties(options.isAsyncModelExecutionEnabled(),
                                                 MemorySource::Undefined,
-                                                MemorySource::Undefined,
-                                                options.getNoOfArmnnThreads());
+                                                MemorySource::Undefined);
     try
     {
         if (runtime->LoadNetwork(netId, move(optNet), msg, networkProperties) != armnn::Status::Success)
@@ -222,7 +221,8 @@ Return<V1_0::ErrorStatus> ArmnnDriverImpl::prepareArmnnModel_1_2(
             model,
             options.GetRequestInputsAndOutputsDumpDir(),
             options.IsGpuProfilingEnabled(),
-            options.isAsyncModelExecutionEnabled()));
+            options.isAsyncModelExecutionEnabled(),
+            options.getNoOfArmnnThreads()));
 
     // Run a single 'dummy' inference of the model. This means that CL kernels will get compiled (and tuned if
     // this is enabled) before the first 'real' inference which removes the overhead of the first inference.
diff --git a/1.3/ArmnnDriverImpl.cpp b/1.3/ArmnnDriverImpl.cpp
index 5c5e6071..6f1b5098 100644
--- a/1.3/ArmnnDriverImpl.cpp
+++ b/1.3/ArmnnDriverImpl.cpp
@@ -202,8 +202,7 @@ Return<V1_3::ErrorStatus> ArmnnDriverImpl::prepareArmnnModel_1_3(
     std::string msg;
     armnn::INetworkProperties networkProperties(options.isAsyncModelExecutionEnabled(),
                                                 MemorySource::Undefined,
-                                                MemorySource::Undefined,
-                                                options.getNoOfArmnnThreads());
+                                                MemorySource::Undefined);
     try
     {
         if (runtime->LoadNetwork(netId, move(optNet), msg, networkProperties) != armnn::Status::Success)
@@ -234,7 +233,8 @@ Return<V1_3::ErrorStatus> ArmnnDriverImpl::prepareArmnnModel_1_3(
             options.GetRequestInputsAndOutputsDumpDir(),
             options.IsGpuProfilingEnabled(),
             priority,
-            options.isAsyncModelExecutionEnabled()));
+            options.isAsyncModelExecutionEnabled(),
+            options.getNoOfArmnnThreads()));
 
     // Run a single 'dummy' inference of the model. This means that CL kernels will get compiled (and tuned if
     // this is enabled) before the first 'real' inference which removes the overhead of the first inference.
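The two hunks above are the caller-side half of the rework: the worker-thread count no longer rides along in armnn::INetworkProperties at LoadNetwork() time, it is handed to the prepared model, which now builds its own pool. A minimal sketch of the resulting load path, assuming the post-patch three-argument INetworkProperties constructor (function and parameter names here are illustrative, not driver API):

    #include <armnn/ArmNN.hpp>

    #include <string>
    #include <utility>

    // Sketch: async execution is still requested at load time, but the runtime
    // no longer needs to know how many threads will service the network.
    armnn::Status LoadAsync(armnn::IRuntime* runtime,
                            armnn::IOptimizedNetworkPtr optNet,
                            armnn::NetworkId& netId,
                            bool asyncEnabled)
    {
        armnn::INetworkProperties networkProperties(asyncEnabled,
                                                    armnn::MemorySource::Undefined,  // input source
                                                    armnn::MemorySource::Undefined); // output source
        std::string msg; // filled with the failure reason on error
        return runtime->LoadNetwork(netId, std::move(optNet), msg, networkProperties);
    }

The thread count travels instead as a plain constructor argument (options.getNoOfArmnnThreads() above), so each prepared model can size its pool independently.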
diff --git a/ArmnnDriverImpl.cpp b/ArmnnDriverImpl.cpp
index 0e6e8b18..78ef12fe 100644
--- a/ArmnnDriverImpl.cpp
+++ b/ArmnnDriverImpl.cpp
@@ -166,8 +166,7 @@ Return<V1_0::ErrorStatus> ArmnnDriverImpl<HalPolicy>::prepareModel(
     std::string msg;
     armnn::INetworkProperties networkProperties(options.isAsyncModelExecutionEnabled(),
                                                 armnn::MemorySource::Undefined,
-                                                armnn::MemorySource::Undefined,
-                                                options.getNoOfArmnnThreads());
+                                                armnn::MemorySource::Undefined);
 
     try
     {
@@ -198,7 +197,8 @@ Return<V1_0::ErrorStatus> ArmnnDriverImpl<HalPolicy>::prepareModel(
             model,
             options.GetRequestInputsAndOutputsDumpDir(),
             options.IsGpuProfilingEnabled(),
-            options.isAsyncModelExecutionEnabled()));
+            options.isAsyncModelExecutionEnabled(),
+            options.getNoOfArmnnThreads()));
 
     // Run a single 'dummy' inference of the model. This means that CL kernels will get compiled (and tuned if
     // this is enabled) before the first 'real' inference which removes the overhead of the first inference.
diff --git a/ArmnnPreparedModel.cpp b/ArmnnPreparedModel.cpp
index 259d1b8c..77709b0e 100644
--- a/ArmnnPreparedModel.cpp
+++ b/ArmnnPreparedModel.cpp
@@ -113,7 +113,8 @@ ArmnnPreparedModel<HalVersion>::ArmnnPreparedModel(armnn::NetworkId networkId,
                                                    const HalModel& model,
                                                    const std::string& requestInputsAndOutputsDumpDir,
                                                    const bool gpuProfilingEnabled,
-                                                   const bool asyncModelExecutionEnabled)
+                                                   const bool asyncModelExecutionEnabled,
+                                                   const unsigned int numberOfThreads)
     : m_NetworkId(networkId)
     , m_Runtime(runtime)
     , m_Model(model)
@@ -127,7 +128,14 @@ ArmnnPreparedModel<HalVersion>::ArmnnPreparedModel(armnn::NetworkId networkId,
 
     if (asyncModelExecutionEnabled)
     {
-        m_WorkingMemHandle = m_Runtime->CreateWorkingMemHandle(networkId);
+        std::vector<std::shared_ptr<armnn::IWorkingMemHandle>> memHandles;
+        for (unsigned int i = 0; i < numberOfThreads; ++i)
+        {
+            memHandles.emplace_back(m_Runtime->CreateWorkingMemHandle(networkId));
+        }
+
+        m_WorkingMemHandle = memHandles.back();
+        m_Threadpool = std::make_unique<armnn::Threadpool>(numberOfThreads, runtime, memHandles);
     }
 }
 
@@ -397,11 +405,11 @@ void ArmnnPreparedModel<HalVersion>::ScheduleGraphForExecution(
                                 outputTensors,
                                 callbackContext);
 
-    m_Runtime->Schedule(m_NetworkId,
-                        *tpCb->m_InputTensors,
-                        *tpCb->m_OutputTensors,
-                        armnn::QosExecPriority::High,
-                        tpCb);
+    m_Threadpool->Schedule(m_NetworkId,
+                           *tpCb->m_InputTensors,
+                           *tpCb->m_OutputTensors,
+                           armnn::QosExecPriority::Medium,
+                           tpCb);
 
     ALOGV("ArmnnPreparedModel::ScheduleGraphForExecution end");
 }
diff --git a/ArmnnPreparedModel.hpp b/ArmnnPreparedModel.hpp
index d1c830d4..58177d92 100644
--- a/ArmnnPreparedModel.hpp
+++ b/ArmnnPreparedModel.hpp
@@ -11,6 +11,7 @@
 
 #include <NeuralNetworks.h>
 #include <armnn/ArmNN.hpp>
+#include <armnn/Threadpool.hpp>
 
 #include <string>
 #include <vector>
@@ -39,7 +40,8 @@ public:
                        const HalModel& model,
                        const std::string& requestInputsAndOutputsDumpDir,
                        const bool gpuProfilingEnabled,
-                       const bool asyncModelExecutionEnabled = false);
+                       const bool asyncModelExecutionEnabled = false,
+                       const unsigned int numberOfThreads = 1);
 
     virtual ~ArmnnPreparedModel();
 
@@ -76,28 +78,6 @@ private:
 
         void Notify(armnn::Status status, armnn::InferenceTimingPair timeTaken) override;
 
-        // Retrieve the ArmNN Status from the AsyncExecutionCallback that has been notified
-        virtual armnn::Status GetStatus() const override
-        {
-            return armnn::Status::Success;
-        }
-
-        // Block the calling thread until the AsyncExecutionCallback object allows it to proceed
-        virtual void Wait() const override
-        {}
-
-        // Retrieve the start time before executing the inference
-        virtual armnn::HighResolutionClock GetStartTime() const override
-        {
-            return std::chrono::high_resolution_clock::now();
-        }
-
-        // Retrieve the time after executing the inference
-        virtual armnn::HighResolutionClock GetEndTime() const override
-        {
-            return std::chrono::high_resolution_clock::now();
-        }
-
         ArmnnPreparedModel<HalVersion>* m_Model;
         std::shared_ptr<std::vector<::android::nn::RunTimePoolInfo>> m_MemPools;
         std::shared_ptr<armnn::InputTensors> m_InputTensors;
@@ -116,9 +96,10 @@ private:
             std::shared_ptr<armnn::OutputTensors>& outputTensors,
             CallbackContext m_CallbackContext);
 
-    armnn::NetworkId m_NetworkId;
-    armnn::IRuntime* m_Runtime;
-    HalModel m_Model;
+    armnn::NetworkId                          m_NetworkId;
+    armnn::IRuntime*                          m_Runtime;
+    std::unique_ptr<armnn::Threadpool>        m_Threadpool;
+    HalModel                                  m_Model;
     // There must be a single RequestThread for all ArmnnPreparedModel objects to ensure serial execution of workloads
     // It is specific to this class, so it is declared as static here
     static RequestThread<ArmnnPreparedModel, HalVersion, CallbackContext_1_0> m_RequestThread;
@@ -126,7 +107,7 @@ private:
    uint32_t m_RequestCount;
     const std::string& m_RequestInputsAndOutputsDumpDir;
     const bool m_GpuProfilingEnabled;
 
-    std::unique_ptr<armnn::IWorkingMemHandle> m_WorkingMemHandle;
+    std::shared_ptr<armnn::IWorkingMemHandle> m_WorkingMemHandle;
     const bool m_AsyncModelExecutionEnabled;
 };
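The constructor hunk above is the core of the rework, and the same pattern repeats in the 1.2 and 1.3 models below: one working-memory handle is created per pool thread, ownership of the handles is shared between the pool and the model (hence m_WorkingMemHandle becoming a std::shared_ptr, with the model keeping the last handle for the synchronous execution path), and the pool replaces the runtime-internal scheduler. Shown standalone as a sketch, using the Threadpool constructor and CreateWorkingMemHandle calls visible in the hunks (in some ArmNN releases these async types sit in the armnn::experimental namespace; check the headers in use; MakeThreadpool is a name invented for this sketch):

    #include <armnn/IRuntime.hpp>
    #include <armnn/Threadpool.hpp>

    #include <memory>
    #include <vector>

    // One IWorkingMemHandle per worker thread: intermediate tensor memory is
    // pre-allocated per thread, so concurrent executions never contend for it.
    std::unique_ptr<armnn::Threadpool> MakeThreadpool(armnn::IRuntime* runtime,
                                                      armnn::NetworkId netId,
                                                      unsigned int numberOfThreads)
    {
        std::vector<std::shared_ptr<armnn::IWorkingMemHandle>> memHandles;
        for (unsigned int i = 0; i < numberOfThreads; ++i)
        {
            memHandles.emplace_back(runtime->CreateWorkingMemHandle(netId));
        }
        return std::make_unique<armnn::Threadpool>(numberOfThreads, runtime, memHandles);
    }

The patch inlines the equivalent code in each prepared-model constructor rather than using a helper; note the loop counter there is unsigned to match numberOfThreads.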
diff --git a/ArmnnPreparedModel_1_2.cpp b/ArmnnPreparedModel_1_2.cpp
index 83e4aac5..07fd5a6d 100644
--- a/ArmnnPreparedModel_1_2.cpp
+++ b/ArmnnPreparedModel_1_2.cpp
@@ -148,7 +148,8 @@ ArmnnPreparedModel_1_2<HalVersion>::ArmnnPreparedModel_1_2(armnn::NetworkId netw
                                                                const V1_2::Model& model,
                                                                const std::string& requestInputsAndOutputsDumpDir,
                                                                const bool gpuProfilingEnabled,
-                                                               const bool asyncModelExecutionEnabled)
+                                                               const bool asyncModelExecutionEnabled,
+                                                               const unsigned int numberOfThreads)
     : m_NetworkId(networkId)
     , m_Runtime(runtime)
     , m_Model(model)
@@ -162,7 +163,14 @@ ArmnnPreparedModel_1_2<HalVersion>::ArmnnPreparedModel_1_2(armnn::NetworkId netw
 
     if (asyncModelExecutionEnabled)
     {
-        m_WorkingMemHandle = m_Runtime->CreateWorkingMemHandle(networkId);
+        std::vector<std::shared_ptr<armnn::IWorkingMemHandle>> memHandles;
+        for (unsigned int i = 0; i < numberOfThreads; ++i)
+        {
+            memHandles.emplace_back(m_Runtime->CreateWorkingMemHandle(networkId));
+        }
+
+        m_WorkingMemHandle = memHandles.back();
+        m_Threadpool = std::make_unique<armnn::Threadpool>(numberOfThreads, runtime, memHandles);
     }
 }
 
@@ -661,11 +669,11 @@ void ArmnnPreparedModel_1_2<HalVersion>::ScheduleGraphForExecution(
                                 outputTensors,
                                 callbackContext);
 
-    m_Runtime->Schedule(m_NetworkId,
-                        *tpCb->m_InputTensors,
-                        *tpCb->m_OutputTensors,
-                        armnn::QosExecPriority::High,
-                        tpCb);
+    m_Threadpool->Schedule(m_NetworkId,
+                           *tpCb->m_InputTensors,
+                           *tpCb->m_OutputTensors,
+                           armnn::QosExecPriority::Medium,
+                           tpCb);
 
     ALOGV("ArmnnPreparedModel_1_2::ScheduleGraphForExecution end");
 }
diff --git a/ArmnnPreparedModel_1_2.hpp b/ArmnnPreparedModel_1_2.hpp
index 6c630c56..4ee2b817 100644
--- a/ArmnnPreparedModel_1_2.hpp
+++ b/ArmnnPreparedModel_1_2.hpp
@@ -12,6 +12,7 @@
 
 #include <NeuralNetworks.h>
 #include <armnn/ArmNN.hpp>
+#include <armnn/Threadpool.hpp>
 
 #include <string>
 #include <vector>
@@ -45,7 +46,8 @@ public:
                            const HalModel& model,
                            const std::string& requestInputsAndOutputsDumpDir,
                            const bool gpuProfilingEnabled,
-                           const bool asyncModelExecutionEnabled = false);
+                           const bool asyncModelExecutionEnabled = false,
+                           const unsigned int numberOfThreads = 1);
 
     virtual ~ArmnnPreparedModel_1_2();
 
@@ -98,28 +100,6 @@ private:
 
         void Notify(armnn::Status status, armnn::InferenceTimingPair timeTaken) override;
 
-        // Retrieve the Arm NN Status from the AsyncExecutionCallback that has been notified
-        virtual armnn::Status GetStatus() const override
-        {
-            return armnn::Status::Success;
-        }
-
-        // Block the calling thread until the AsyncExecutionCallback object allows it to proceed
-        virtual void Wait() const override
-        {}
-
-        // Retrieve the start time before executing the inference
-        virtual armnn::HighResolutionClock GetStartTime() const override
-        {
-            return std::chrono::high_resolution_clock::now();
-        }
-
-        // Retrieve the time after executing the inference
-        virtual armnn::HighResolutionClock GetEndTime() const override
-        {
-            return std::chrono::high_resolution_clock::now();
-        }
-
         ArmnnPreparedModel_1_2<HalVersion>* m_Model;
         std::shared_ptr<std::vector<::android::nn::RunTimePoolInfo>> m_MemPools;
         std::vector<V1_2::OutputShape> m_OutputShapes;
@@ -161,20 +141,20 @@ private:
             std::shared_ptr<armnn::OutputTensors>& outputTensors,
             CallbackContext m_CallbackContext);
 
-    armnn::NetworkId m_NetworkId;
-    armnn::IRuntime* m_Runtime;
-    V1_2::Model m_Model;
+    armnn::NetworkId                          m_NetworkId;
+    armnn::IRuntime*                          m_Runtime;
+    std::unique_ptr<armnn::Threadpool>        m_Threadpool;
+    V1_2::Model                               m_Model;
     // There must be a single RequestThread for all ArmnnPreparedModel objects to ensure serial execution of workloads
     // It is specific to this class, so it is declared as static here
     static RequestThread<ArmnnPreparedModel_1_2,
                          HalVersion,
-                         CallbackContext_1_2> m_RequestThread;
-    uint32_t m_RequestCount;
-    const std::string& m_RequestInputsAndOutputsDumpDir;
-    const bool m_GpuProfilingEnabled;
-
-    std::unique_ptr<armnn::IWorkingMemHandle> m_WorkingMemHandle;
-    const bool m_AsyncModelExecutionEnabled;
+                         CallbackContext_1_2> m_RequestThread;
+    uint32_t                                  m_RequestCount;
+    const std::string&                        m_RequestInputsAndOutputsDumpDir;
+    const bool                                m_GpuProfilingEnabled;
+    std::shared_ptr<armnn::IWorkingMemHandle> m_WorkingMemHandle;
+    const bool                                m_AsyncModelExecutionEnabled;
 };
 
 }
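The large deletion repeated in each header reflects an interface change in ArmNN itself: judging by what survives, the callback handed to Threadpool::Schedule() now only has to implement Notify(), while the waiting, status and timing plumbing the driver used to stub out lives inside ArmNN. A hedged sketch of such a callback, assuming the armnn::experimental::IAsyncExecutionCallback interface of the matching ArmNN release (the class name and members here are illustrative, not the driver's):

    #include <armnn/IAsyncExecutionCallback.hpp>
    #include <armnn/Types.hpp>

    #include <atomic>
    #include <chrono>

    // Notify() receives the execution status plus a <start, end> time_point
    // pair; everything the deleted overrides used to provide is gone.
    class NotifyOnlyCallback : public armnn::experimental::IAsyncExecutionCallback
    {
    public:
        void Notify(armnn::Status status, armnn::InferenceTimingPair timeTaken) override
        {
            m_Succeeded  = (status == armnn::Status::Success);
            m_DurationUs = std::chrono::duration_cast<std::chrono::microseconds>(
                               timeTaken.second - timeTaken.first).count();
        }

    private:
        std::atomic<bool>      m_Succeeded { false };
        std::atomic<long long> m_DurationUs { 0 };
    };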
diff --git a/ArmnnPreparedModel_1_3.cpp b/ArmnnPreparedModel_1_3.cpp
index 63918725..cd3df516 100644
--- a/ArmnnPreparedModel_1_3.cpp
+++ b/ArmnnPreparedModel_1_3.cpp
@@ -169,7 +169,8 @@ ArmnnPreparedModel_1_3<HalVersion>::ArmnnPreparedModel_1_3(armnn::NetworkId netw
                                                                const std::string& requestInputsAndOutputsDumpDir,
                                                                const bool gpuProfilingEnabled,
                                                                V1_3::Priority priority,
-                                                               const bool asyncModelExecutionEnabled)
+                                                               const bool asyncModelExecutionEnabled,
+                                                               const unsigned int numberOfThreads)
     : m_NetworkId(networkId)
     , m_Runtime(runtime)
     , m_Model(model)
@@ -184,7 +185,14 @@ ArmnnPreparedModel_1_3<HalVersion>::ArmnnPreparedModel_1_3(armnn::NetworkId netw
 
     if (asyncModelExecutionEnabled)
     {
-        m_WorkingMemHandle = m_Runtime->CreateWorkingMemHandle(networkId);
+        std::vector<std::shared_ptr<armnn::IWorkingMemHandle>> memHandles;
+        for (unsigned int i = 0; i < numberOfThreads; ++i)
+        {
+            memHandles.emplace_back(m_Runtime->CreateWorkingMemHandle(networkId));
+        }
+
+        m_WorkingMemHandle = memHandles.back();
+        m_Threadpool = std::make_unique<armnn::Threadpool>(numberOfThreads, runtime, memHandles);
     }
 }
 
@@ -821,11 +829,11 @@ void ArmnnPreparedModel_1_3<HalVersion>::ScheduleGraphForExecution(
                                 outputTensors,
                                 callbackContext);
 
-    m_Runtime->Schedule(m_NetworkId,
-                        *tpCb->m_InputTensors,
-                        *tpCb->m_OutputTensors,
-                        priority,
-                        tpCb);
+    m_Threadpool->Schedule(m_NetworkId,
+                           *tpCb->m_InputTensors,
+                           *tpCb->m_OutputTensors,
+                           priority,
+                           tpCb);
 
     ALOGV("ArmnnPreparedModel_1_3::ScheduleGraphForExecution end");
 }
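Unlike the 1.0 and 1.2 paths, which pin asynchronous requests to QosExecPriority::Medium, the 1.3 path forwards a per-request priority into Threadpool::Schedule(). A conversion in the spirit of that handling (this helper is hypothetical; the real mapping from the NNAPI priority sits elsewhere in the driver and should be checked there):

    #include <armnn/Types.hpp>

    // Hypothetical mapping from the NNAPI 1.3 model priority to ArmNN's three
    // QoS levels; V1_3 = ::android::hardware::neuralnetworks::V1_3, as aliased
    // in the driver headers.
    armnn::QosExecPriority ToQosPriority(V1_3::Priority priority)
    {
        switch (priority)
        {
            case V1_3::Priority::LOW:    return armnn::QosExecPriority::Low;
            case V1_3::Priority::HIGH:   return armnn::QosExecPriority::High;
            case V1_3::Priority::MEDIUM:
            default:                     return armnn::QosExecPriority::Medium;
        }
    }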
diff --git a/ArmnnPreparedModel_1_3.hpp b/ArmnnPreparedModel_1_3.hpp
index 11299cc4..46798cde 100644
--- a/ArmnnPreparedModel_1_3.hpp
+++ b/ArmnnPreparedModel_1_3.hpp
@@ -12,6 +12,8 @@
 
 #include <NeuralNetworks.h>
 #include <armnn/ArmNN.hpp>
+#include <armnn/Threadpool.hpp>
+
 #include <string>
 #include <vector>
@@ -52,7 +54,8 @@ public:
                            const std::string& requestInputsAndOutputsDumpDir,
                            const bool gpuProfilingEnabled,
                            V1_3::Priority priority = V1_3::Priority::MEDIUM,
-                           const bool asyncModelExecutionEnabled = false);
+                           const bool asyncModelExecutionEnabled = false,
+                           const unsigned int numberOfThreads = 1);
 
     virtual ~ArmnnPreparedModel_1_3();
 
@@ -131,28 +134,6 @@ private:
 
         void Notify(armnn::Status status, armnn::InferenceTimingPair timeTaken) override;
 
-        // Retrieve the Arm NN Status from the AsyncExecutionCallback that has been notified
-        virtual armnn::Status GetStatus() const override
-        {
-            return armnn::Status::Success;
-        }
-
-        // Block the calling thread until the AsyncExecutionCallback object allows it to proceed
-        virtual void Wait() const override
-        {}
-
-        // Retrieve the start time before executing the inference
-        virtual armnn::HighResolutionClock GetStartTime() const override
-        {
-            return std::chrono::high_resolution_clock::now();
-        }
-
-        // Retrieve the time after executing the inference
-        virtual armnn::HighResolutionClock GetEndTime() const override
-        {
-            return std::chrono::high_resolution_clock::now();
-        }
-
        ArmnnPreparedModel_1_3<HalVersion>* m_Model;
         std::shared_ptr<std::vector<::android::nn::RunTimePoolInfo>> m_MemPools;
         std::vector<V1_2::OutputShape> m_OutputShapes;
@@ -196,6 +177,7 @@ private:
 
     armnn::NetworkId                          m_NetworkId;
     armnn::IRuntime*                          m_Runtime;
+    std::unique_ptr<armnn::Threadpool>        m_Threadpool;
     V1_3::Model                               m_Model;
     // There must be a single RequestThread for all ArmnnPreparedModel objects to ensure serial execution of workloads
     // It is specific to this class, so it is declared as static here
@@ -205,7 +187,7 @@ private:
     const std::string&                        m_RequestInputsAndOutputsDumpDir;
     const bool                                m_GpuProfilingEnabled;
     V1_3::Priority                            m_ModelPriority;
 
-    std::unique_ptr<armnn::IWorkingMemHandle> m_WorkingMemHandle;
+    std::shared_ptr<armnn::IWorkingMemHandle> m_WorkingMemHandle;
     const bool                                m_AsyncModelExecutionEnabled;
 };
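Taken together, the driver-side flow after this patch reduces to: build the pool once per prepared model, then push each asynchronous request into it with a QoS level and a Notify-only callback. An end-to-end sketch reusing the hypothetical helpers from the earlier notes (MakeThreadpool, NotifyOnlyCallback and ToQosPriority are illustrative names, not driver API):

    #include <memory>

    // Illustrative glue only; error handling elided.
    void RunAsyncOnce(armnn::IRuntime* runtime,
                      armnn::NetworkId netId,
                      const armnn::InputTensors& inputTensors,
                      const armnn::OutputTensors& outputTensors,
                      V1_3::Priority priority,
                      unsigned int numberOfThreads)
    {
        auto pool = MakeThreadpool(runtime, netId, numberOfThreads); // sketch above
        auto cb   = std::make_shared<NotifyOnlyCallback>();          // sketch above
        pool->Schedule(netId, inputTensors, outputTensors, ToQosPriority(priority), cb);
        // In the driver the pool outlives many requests: it is built once in
        // the prepared-model constructor and Schedule() is called per request.
    }

Because the pool and the per-thread working-memory handles are now owned by the prepared model rather than the runtime, tearing the model down tears its worker threads down with it, and, as the changed call sites show, scheduling moves off IRuntime entirely.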