From d8fb540568b29fd1d81a1cca667a1ad3e33ef5a1 Mon Sep 17 00:00:00 2001
From: Finn Williams
Date: Wed, 19 May 2021 20:52:00 +0100
Subject: IVGCVSW-5781 Add Async Support to Android-NN-Driver

Signed-off-by: Finn Williams
Change-Id: I1f13d04100fdb119495b9e3054425bf3babc59f1
---
 1.2/ArmnnDriverImpl.cpp    |  10 ++-
 1.3/ArmnnDriverImpl.cpp    |  10 ++-
 ArmnnDriverImpl.cpp        |  11 +++-
 ArmnnPreparedModel.cpp     |  96 ++++++++++++++++++++++++++--
 ArmnnPreparedModel.hpp     |  62 +++++++++++++++++-
 ArmnnPreparedModel_1_2.cpp | 117 +++++++++++++++++++++++++++++++++-
 ArmnnPreparedModel_1_2.hpp |  65 ++++++++++++++++++-
 ArmnnPreparedModel_1_3.cpp | 153 ++++++++++++++++++++++++++++++++++++++++++---
 ArmnnPreparedModel_1_3.hpp |  66 ++++++++++++++++++-
 DriverOptions.cpp          |  17 ++++-
 DriverOptions.hpp          |   4 ++
 11 files changed, 586 insertions(+), 25 deletions(-)

diff --git a/1.2/ArmnnDriverImpl.cpp b/1.2/ArmnnDriverImpl.cpp
index ccf82d0e..3eae526f 100644
--- a/1.2/ArmnnDriverImpl.cpp
+++ b/1.2/ArmnnDriverImpl.cpp
@@ -188,9 +188,14 @@ Return<V1_0::ErrorStatus> ArmnnDriverImpl::prepareArmnnModel_1_2(
 
     // Load it into the runtime.
     armnn::NetworkId netId = 0;
+    std::string msg;
+    armnn::INetworkProperties networkProperties(options.isAsyncModelExecutionEnabled(),
+                                                MemorySource::Undefined,
+                                                MemorySource::Undefined,
+                                                options.getNoOfArmnnThreads());
     try
     {
-        if (runtime->LoadNetwork(netId, move(optNet)) != armnn::Status::Success)
+        if (runtime->LoadNetwork(netId, move(optNet), msg, networkProperties) != armnn::Status::Success)
         {
             return FailPrepareModel(V1_0::ErrorStatus::GENERAL_FAILURE, "Network could not be loaded", cb);
         }
@@ -216,7 +221,8 @@ Return<V1_0::ErrorStatus> ArmnnDriverImpl::prepareArmnnModel_1_2(
             runtime.get(),
             model,
             options.GetRequestInputsAndOutputsDumpDir(),
-            options.IsGpuProfilingEnabled()));
+            options.IsGpuProfilingEnabled(),
+            options.isAsyncModelExecutionEnabled()));
 
     // Run a single 'dummy' inference of the model. This means that CL kernels will get compiled (and tuned if
     // this is enabled) before the first 'real' inference which removes the overhead of the first inference.
diff --git a/1.3/ArmnnDriverImpl.cpp b/1.3/ArmnnDriverImpl.cpp
index 6d8fbe64..5c5e6071 100644
--- a/1.3/ArmnnDriverImpl.cpp
+++ b/1.3/ArmnnDriverImpl.cpp
@@ -199,9 +199,14 @@ Return<V1_3::ErrorStatus> ArmnnDriverImpl::prepareArmnnModel_1_3(
 
     // Load it into the runtime.
     armnn::NetworkId netId = 0;
+    std::string msg;
+    armnn::INetworkProperties networkProperties(options.isAsyncModelExecutionEnabled(),
+                                                MemorySource::Undefined,
+                                                MemorySource::Undefined,
+                                                options.getNoOfArmnnThreads());
     try
     {
-        if (runtime->LoadNetwork(netId, move(optNet)) != armnn::Status::Success)
+        if (runtime->LoadNetwork(netId, move(optNet), msg, networkProperties) != armnn::Status::Success)
         {
             return FailPrepareModel(V1_3::ErrorStatus::GENERAL_FAILURE, "Network could not be loaded", cb);
         }
@@ -228,7 +233,8 @@ Return<V1_3::ErrorStatus> ArmnnDriverImpl::prepareArmnnModel_1_3(
             model,
             options.GetRequestInputsAndOutputsDumpDir(),
             options.IsGpuProfilingEnabled(),
-            priority));
+            priority,
+            options.isAsyncModelExecutionEnabled()));
 
     // Run a single 'dummy' inference of the model. This means that CL kernels will get compiled (and tuned if
     // this is enabled) before the first 'real' inference which removes the overhead of the first inference.
diff --git a/ArmnnDriverImpl.cpp b/ArmnnDriverImpl.cpp
index 3e4aab3c..0e6e8b18 100644
--- a/ArmnnDriverImpl.cpp
+++ b/ArmnnDriverImpl.cpp
@@ -163,9 +163,15 @@ Return<V1_0::ErrorStatus> ArmnnDriverImpl<HalPolicy>::prepareModel(
 
     // Load it into the runtime.
     armnn::NetworkId netId = 0;
+    std::string msg;
+    armnn::INetworkProperties networkProperties(options.isAsyncModelExecutionEnabled(),
+                                                armnn::MemorySource::Undefined,
+                                                armnn::MemorySource::Undefined,
+                                                options.getNoOfArmnnThreads());
+
     try
     {
-        if (runtime->LoadNetwork(netId, move(optNet)) != armnn::Status::Success)
+        if (runtime->LoadNetwork(netId, move(optNet), msg, networkProperties) != armnn::Status::Success)
         {
             return FailPrepareModel(V1_0::ErrorStatus::GENERAL_FAILURE, "Network could not be loaded", cb);
         }
@@ -191,7 +197,8 @@ Return<V1_0::ErrorStatus> ArmnnDriverImpl<HalPolicy>::prepareModel(
             runtime.get(),
             model,
             options.GetRequestInputsAndOutputsDumpDir(),
-            options.IsGpuProfilingEnabled()));
+            options.IsGpuProfilingEnabled(),
+            options.isAsyncModelExecutionEnabled()));
 
     // Run a single 'dummy' inference of the model. This means that CL kernels will get compiled (and tuned if
     // this is enabled) before the first 'real' inference which removes the overhead of the first inference.
diff --git a/ArmnnPreparedModel.cpp b/ArmnnPreparedModel.cpp
index 60beac4f..978f3787 100644
--- a/ArmnnPreparedModel.cpp
+++ b/ArmnnPreparedModel.cpp
@@ -112,16 +112,23 @@ ArmnnPreparedModel<HalVersion>::ArmnnPreparedModel(armnn::NetworkId networkId,
                                                    armnn::IRuntime* runtime,
                                                    const HalModel& model,
                                                    const std::string& requestInputsAndOutputsDumpDir,
-                                                   const bool gpuProfilingEnabled)
+                                                   const bool gpuProfilingEnabled,
+                                                   const bool asyncModelExecutionEnabled)
     : m_NetworkId(networkId)
     , m_Runtime(runtime)
     , m_Model(model)
     , m_RequestCount(0)
     , m_RequestInputsAndOutputsDumpDir(requestInputsAndOutputsDumpDir)
     , m_GpuProfilingEnabled(gpuProfilingEnabled)
+    , m_AsyncModelExecutionEnabled(asyncModelExecutionEnabled)
 {
     // Enable profiling if required.
     m_Runtime->GetProfiler(m_NetworkId)->EnableProfiling(m_GpuProfilingEnabled);
+
+    if (asyncModelExecutionEnabled)
+    {
+        m_WorkingMemHandle = m_Runtime->CreateWorkingMemHandle(networkId);
+    }
 }
 
 template<typename HalVersion>
@@ -225,8 +232,6 @@ Return<V1_0::ErrorStatus> ArmnnPreparedModel<HalVersion>::execute(
         return V1_0::ErrorStatus::GENERAL_FAILURE;
     }
 
-    ALOGV("ArmnnPreparedModel::execute(...) before PostMsg");
-
     auto cb = [callback](V1_0::ErrorStatus errorStatus, std::string callingFunction)
     {
         NotifyCallbackAndCheck(callback, errorStatus, callingFunction);
     };
 
     CallbackContext_1_0 armnnCb;
     armnnCb.callback = cb;
+
+    if (m_AsyncModelExecutionEnabled)
+    {
+        ALOGV("ArmnnPreparedModel::execute(...) before ScheduleGraphForExecution");
+        ScheduleGraphForExecution(pMemPools, pInputTensors, pOutputTensors, armnnCb);
+        ALOGV("ArmnnPreparedModel::execute(...) after ScheduleGraphForExecution");
+        return V1_0::ErrorStatus::NONE;
+    }
+
     // post the request for asynchronous execution
+    ALOGV("ArmnnPreparedModel::execute(...) before PostMsg");
     m_RequestThread.PostMsg(this, pMemPools, pInputTensors, pOutputTensors, armnnCb);
     ALOGV("ArmnnPreparedModel::execute(...) after PostMsg");
after PostMsg"); return V1_0::ErrorStatus::NONE; // successfully queued @@ -254,7 +269,18 @@ void ArmnnPreparedModel::ExecuteGraph( // run it try { - armnn::Status status = m_Runtime->EnqueueWorkload(m_NetworkId, inputTensors, outputTensors); + armnn::Status status; + if (m_AsyncModelExecutionEnabled) + { + ALOGW("ArmnnPreparedModel::ExecuteGraph m_AsyncModelExecutionEnabled true"); + status = m_Runtime->Execute(*m_WorkingMemHandle, inputTensors, outputTensors); + } + else + { + ALOGW("ArmnnPreparedModel::ExecuteGraph m_AsyncModelExecutionEnabled false"); + status = m_Runtime->EnqueueWorkload(m_NetworkId, inputTensors, outputTensors); + } + if (status != armnn::Status::Success) { ALOGW("EnqueueWorkload failed"); @@ -340,11 +366,73 @@ bool ArmnnPreparedModel::ExecuteWithDummyInputs() return true; } +/// Schedule the graph prepared from the request for execution +template +template +void ArmnnPreparedModel::ScheduleGraphForExecution( + std::shared_ptr>& pMemPools, + std::shared_ptr& inputTensors, + std::shared_ptr& outputTensors, + CallbackContext callbackContext) +{ + ALOGV("ArmnnPreparedModel::ScheduleGraphForExecution(...)"); + + DumpTensorsIfRequired("Input", *inputTensors); + + + auto tpCb = std::make_shared< + ArmnnThreadPoolCallback>(this, + pMemPools, + inputTensors, + outputTensors, + callbackContext); + + m_Runtime->Schedule(m_NetworkId, + *tpCb->m_InputTensors, + *tpCb->m_OutputTensors, + armnn::QosExecPriority::High, + tpCb); + ALOGV("ArmnnPreparedModel::ScheduleGraphForExecution end"); +} + +template +template +void ArmnnPreparedModel::ArmnnThreadPoolCallback::Notify( + armnn::Status status, armnn::InferenceTimingPair timeTaken) +{ + armnn::IgnoreUnused(status, timeTaken); + ALOGV("ArmnnPreparedModel::ArmnnThreadPoolCallback_1_2 Notify"); + + m_Model->DumpTensorsIfRequired("Output", *m_OutputTensors); + + // Commit output buffers. + // Note that we update *all* pools, even if they aren't actually used as outputs - + // this is simpler and is what the CpuExecutor does. + for (android::nn::RunTimePoolInfo& pool : *m_MemPools) + { + // Type android::nn::RunTimePoolInfo has changed between Android P & Q and Android R, where + // update() has been removed and flush() added. + #if defined(ARMNN_ANDROID_R) || defined(ARMNN_ANDROID_S) // Use the new Android implementation. 
+        pool.flush();
+        #else
+        pool.update();
+        #endif
+    }
+
+    m_CallbackContext.callback(V1_0::ErrorStatus::NONE, "ArmnnPreparedModel::ArmnnThreadPoolCallback_1_2 Notify");
+    return;
+}
+
 ///
 /// Class template specializations
 ///
 template class ArmnnPreparedModel<hal_1_0::HalPolicy>;
+template void ArmnnPreparedModel<hal_1_0::HalPolicy>::ScheduleGraphForExecution<CallbackContext_1_0>(
+        std::shared_ptr<std::vector<android::nn::RunTimePoolInfo>>& pMemPools,
+        std::shared_ptr<armnn::InputTensors>& inputTensors,
+        std::shared_ptr<armnn::OutputTensors>& outputTensors,
+        CallbackContext_1_0 callbackContext);
 
 #ifdef ARMNN_ANDROID_NN_V1_1
 template class ArmnnPreparedModel<hal_1_1::HalPolicy>;
diff --git a/ArmnnPreparedModel.hpp b/ArmnnPreparedModel.hpp
index 89f6226f..d1c830d4 100644
--- a/ArmnnPreparedModel.hpp
+++ b/ArmnnPreparedModel.hpp
@@ -38,7 +38,8 @@ public:
                        armnn::IRuntime* runtime,
                        const HalModel& model,
                        const std::string& requestInputsAndOutputsDumpDir,
-                       const bool gpuProfilingEnabled);
+                       const bool gpuProfilingEnabled,
+                       const bool asyncModelExecutionEnabled = false);
 
     virtual ~ArmnnPreparedModel();
 
@@ -56,9 +57,65 @@ public:
     bool ExecuteWithDummyInputs();
 
 private:
+
+    template<typename CallbackContext>
+    class ArmnnThreadPoolCallback : public armnn::IAsyncExecutionCallback
+    {
+    public:
+        ArmnnThreadPoolCallback(ArmnnPreparedModel<HalVersion>* model,
+                                std::shared_ptr<std::vector<android::nn::RunTimePoolInfo>>& pMemPools,
+                                std::shared_ptr<armnn::InputTensors>& inputTensors,
+                                std::shared_ptr<armnn::OutputTensors>& outputTensors,
+                                CallbackContext callbackContext) :
+            m_Model(model),
+            m_MemPools(pMemPools),
+            m_InputTensors(inputTensors),
+            m_OutputTensors(outputTensors),
+            m_CallbackContext(callbackContext)
+        {}
+
+        void Notify(armnn::Status status, armnn::InferenceTimingPair timeTaken) override;
+
+        // Retrieve the ArmNN Status from the AsyncExecutionCallback that has been notified
+        virtual armnn::Status GetStatus() const override
+        {
+            return armnn::Status::Success;
+        }
+
+        // Block the calling thread until the AsyncExecutionCallback object allows it to proceed
+        virtual void Wait() const override
+        {}
+
+        // Retrieve the start time before executing the inference
+        virtual armnn::HighResolutionClock GetStartTime() const override
+        {
+            return std::chrono::high_resolution_clock::now();
+        }
+
+        // Retrieve the time after executing the inference
+        virtual armnn::HighResolutionClock GetEndTime() const override
+        {
+            return std::chrono::high_resolution_clock::now();
+        }
+
+        ArmnnPreparedModel<HalVersion>* m_Model;
+        std::shared_ptr<std::vector<android::nn::RunTimePoolInfo>> m_MemPools;
+        std::shared_ptr<armnn::InputTensors> m_InputTensors;
+        std::shared_ptr<armnn::OutputTensors> m_OutputTensors;
+        CallbackContext m_CallbackContext;
+    };
+
     template<typename TensorBindingCollection>
     void DumpTensorsIfRequired(char const* tensorNamePrefix, const TensorBindingCollection& tensorBindings);
 
+    /// schedule the graph prepared from the request for execution
+    template<typename CallbackContext>
+    void ScheduleGraphForExecution(
+            std::shared_ptr<std::vector<android::nn::RunTimePoolInfo>>& pMemPools,
+            std::shared_ptr<armnn::InputTensors>& inputTensors,
+            std::shared_ptr<armnn::OutputTensors>& outputTensors,
+            CallbackContext m_CallbackContext);
+
     armnn::NetworkId m_NetworkId;
     armnn::IRuntime* m_Runtime;
     HalModel m_Model;
@@ -68,6 +125,9 @@ private:
     uint32_t m_RequestCount;
     const std::string& m_RequestInputsAndOutputsDumpDir;
     const bool m_GpuProfilingEnabled;
+
+    std::unique_ptr<armnn::IWorkingMemHandle> m_WorkingMemHandle;
+    const bool m_AsyncModelExecutionEnabled;
 };
 
 }
diff --git a/ArmnnPreparedModel_1_2.cpp b/ArmnnPreparedModel_1_2.cpp
index a2148c29..c129fd69 100644
--- a/ArmnnPreparedModel_1_2.cpp
+++ b/ArmnnPreparedModel_1_2.cpp
@@ -6,6 +6,7 @@
 #define LOG_TAG "ArmnnDriver"
 
 #include "ArmnnPreparedModel_1_2.hpp"
+
 #include "Utils.hpp"
 
 #include
@@ -146,16 +147,23 @@ ArmnnPreparedModel_1_2<HalVersion>::ArmnnPreparedModel_1_2(armnn::NetworkId networkId,
                                                            armnn::IRuntime* runtime,
                                                            const V1_2::Model& model,
                                                            const std::string& requestInputsAndOutputsDumpDir,
-                                                           const bool gpuProfilingEnabled)
+                                                           const bool gpuProfilingEnabled,
+                                                           const bool asyncModelExecutionEnabled)
     : m_NetworkId(networkId)
     , m_Runtime(runtime)
     , m_Model(model)
     , m_RequestCount(0)
     , m_RequestInputsAndOutputsDumpDir(requestInputsAndOutputsDumpDir)
     , m_GpuProfilingEnabled(gpuProfilingEnabled)
+    , m_AsyncModelExecutionEnabled(asyncModelExecutionEnabled)
 {
     // Enable profiling if required.
     m_Runtime->GetProfiler(m_NetworkId)->EnableProfiling(m_GpuProfilingEnabled);
+
+    if (asyncModelExecutionEnabled)
+    {
+        m_WorkingMemHandle = m_Runtime->CreateWorkingMemHandle(networkId);
+    }
 }
 
 template<typename HalVersion>
@@ -440,7 +448,17 @@ bool ArmnnPreparedModel_1_2<HalVersion>::ExecuteGraph(
             deviceStart = Now();
         }
 
-        armnn::Status status = m_Runtime->EnqueueWorkload(m_NetworkId, inputTensors, outputTensors);
+        armnn::Status status;
+        if (m_AsyncModelExecutionEnabled)
+        {
+            ALOGW("ArmnnPreparedModel_1_2::ExecuteGraph m_AsyncModelExecutionEnabled true");
+            status = m_Runtime->Execute(*m_WorkingMemHandle, inputTensors, outputTensors);
+        }
+        else
+        {
+            ALOGW("ArmnnPreparedModel_1_2::ExecuteGraph m_AsyncModelExecutionEnabled false");
+            status = m_Runtime->EnqueueWorkload(m_NetworkId, inputTensors, outputTensors);
+        }
 
         if (cb.ctx.measureTimings == V1_2::MeasureTiming::YES)
         {
@@ -567,12 +585,21 @@ Return<V1_0::ErrorStatus> ArmnnPreparedModel_1_2<HalVersion>::Execute(const V1_0::Request& request,
            {}
     }
 
-    ALOGV("ArmnnPreparedModel_1_2::execute(...) before PostMsg");
    // post the request for asynchronous execution
    CallbackContext_1_2 cb;
    cb.callback = callback;
    cb.ctx = ctx;
+
+    if (m_AsyncModelExecutionEnabled)
+    {
+        ALOGV("ArmnnPreparedModel_1_2::execute(...) before ScheduleGraphForExecution");
+        ScheduleGraphForExecution(memPools, inputTensors, outputTensors, cb);
+        ALOGV("ArmnnPreparedModel_1_2::execute(...) after ScheduleGraphForExecution");
+        return V1_0::ErrorStatus::NONE;
+    }
+
+    ALOGV("ArmnnPreparedModel_1_2::execute(...) before PostMsg");
    m_RequestThread.PostMsg(this, memPools, inputTensors, outputTensors, cb);
    ALOGV("ArmnnPreparedModel_1_2::execute(...) after PostMsg");
after PostMsg"); return V1_0::ErrorStatus::NONE; @@ -602,6 +629,84 @@ Return ArmnnPreparedModel_1_2::configureExecutionBurst( return Void(); } +/// Schedule the graph prepared from the request for execution +template +template +void ArmnnPreparedModel_1_2::ScheduleGraphForExecution( + std::shared_ptr>& pMemPools, + std::shared_ptr& inputTensors, + std::shared_ptr& outputTensors, + CallbackContext callbackContext) +{ + ALOGV("ArmnnPreparedModel_1_2::ScheduleGraphForExecution(...)"); + + DumpTensorsIfRequired("Input", *inputTensors); + + unsigned int outputTensorSize = outputTensors.get()->size(); + std::vector outputShapes(outputTensorSize); + for (unsigned int i = 0; i < outputTensorSize; i++) + { + std::pair outputTensorPair = outputTensors.get()->at(i); + const armnn::Tensor outputTensor = outputTensorPair.second; + const armnn::TensorInfo outputTensorInfo = outputTensor.GetInfo(); + + outputShapes[i] = ComputeShape(outputTensorInfo); + } + + auto tpCb = std::make_shared< + ArmnnThreadPoolCallback_1_2>(this, + pMemPools, + outputShapes, + inputTensors, + outputTensors, + callbackContext); + + m_Runtime->Schedule(m_NetworkId, + *tpCb->m_InputTensors, + *tpCb->m_OutputTensors, + armnn::QosExecPriority::High, + tpCb); + ALOGV("ArmnnPreparedModel_1_2::ScheduleGraphForExecution end"); +} + +template +template +void ArmnnPreparedModel_1_2::ArmnnThreadPoolCallback_1_2::Notify( + armnn::Status status, armnn::InferenceTimingPair timeTaken) +{ + ALOGV("ArmnnPreparedModel_1_2::ArmnnThreadPoolCallback_1_2 Notify"); + + TimePoint driverEnd; + + CommitPools(*m_MemPools); + + m_Model->DumpTensorsIfRequired("Output", *m_OutputTensors); + + if (status != armnn::Status::Success) + { + ALOGW("ArmnnThreadPoolCallback::Notify EnqueueWorkload failed"); + m_CallbackContext.callback( + V1_0::ErrorStatus::GENERAL_FAILURE, {}, g_NoTiming, "ArmnnPreparedModel::ExecuteGraph"); + return; + } + + if (m_CallbackContext.ctx.measureTimings == V1_2::MeasureTiming::YES) + { + driverEnd = std::chrono::steady_clock::now(); + V1_2::Timing timing; + timing.timeOnDevice = MicrosecondsDuration(timeTaken.second, timeTaken.first); + timing.timeInDriver = MicrosecondsDuration(driverEnd, m_CallbackContext.ctx.driverStart); + ALOGV("ArmnnPreparedModel_1_2::execute timing - Device = %lu Driver = %lu", timing.timeOnDevice, + timing.timeInDriver); + m_CallbackContext.callback( + V1_0::ErrorStatus::NONE, m_OutputShapes, timing, "ArmnnPreparedModel_1_2::ExecuteGraph"); + } else { + m_CallbackContext.callback( + V1_0::ErrorStatus::NONE, m_OutputShapes, g_NoTiming, "ArmnnPreparedModel_1_2::ExecuteGraph"); + } + return; +} + #if defined(ARMNN_ANDROID_NN_V1_2) || defined(ARMNN_ANDROID_NN_V1_3) template class ArmnnPreparedModel_1_2; template bool ArmnnPreparedModel_1_2::ExecuteGraph( @@ -609,6 +714,12 @@ template bool ArmnnPreparedModel_1_2::ExecuteGraph::ScheduleGraphForExecution( + std::shared_ptr>& pMemPools, + std::shared_ptr& inputTensors, + std::shared_ptr& outputTensors, + CallbackContext_1_2 callbackContext); #endif } // namespace armnn_driver diff --git a/ArmnnPreparedModel_1_2.hpp b/ArmnnPreparedModel_1_2.hpp index 13d7494e..6c630c56 100644 --- a/ArmnnPreparedModel_1_2.hpp +++ b/ArmnnPreparedModel_1_2.hpp @@ -44,7 +44,8 @@ public: armnn::IRuntime* runtime, const HalModel& model, const std::string& requestInputsAndOutputsDumpDir, - const bool gpuProfilingEnabled); + const bool gpuProfilingEnabled, + const bool asyncModelExecutionEnabled = false); virtual ~ArmnnPreparedModel_1_2(); @@ -76,6 +77,57 @@ public: bool 
     bool ExecuteWithDummyInputs();
 
 private:
+
+    template<typename CallbackContext>
+    class ArmnnThreadPoolCallback_1_2 : public armnn::IAsyncExecutionCallback
+    {
+    public:
+        ArmnnThreadPoolCallback_1_2(ArmnnPreparedModel_1_2<HalVersion>* model,
+                                    std::shared_ptr<std::vector<android::nn::RunTimePoolInfo>>& pMemPools,
+                                    std::vector<V1_2::OutputShape> outputShapes,
+                                    std::shared_ptr<armnn::InputTensors>& inputTensors,
+                                    std::shared_ptr<armnn::OutputTensors>& outputTensors,
+                                    CallbackContext callbackContext) :
+            m_Model(model),
+            m_MemPools(pMemPools),
+            m_OutputShapes(outputShapes),
+            m_InputTensors(inputTensors),
+            m_OutputTensors(outputTensors),
+            m_CallbackContext(callbackContext)
+        {}
+
+        void Notify(armnn::Status status, armnn::InferenceTimingPair timeTaken) override;
+
+        // Retrieve the Arm NN Status from the AsyncExecutionCallback that has been notified
+        virtual armnn::Status GetStatus() const override
+        {
+            return armnn::Status::Success;
+        }
+
+        // Block the calling thread until the AsyncExecutionCallback object allows it to proceed
+        virtual void Wait() const override
+        {}
+
+        // Retrieve the start time before executing the inference
+        virtual armnn::HighResolutionClock GetStartTime() const override
+        {
+            return std::chrono::high_resolution_clock::now();
+        }
+
+        // Retrieve the time after executing the inference
+        virtual armnn::HighResolutionClock GetEndTime() const override
+        {
+            return std::chrono::high_resolution_clock::now();
+        }
+
+        ArmnnPreparedModel_1_2<HalVersion>* m_Model;
+        std::shared_ptr<std::vector<android::nn::RunTimePoolInfo>> m_MemPools;
+        std::vector<V1_2::OutputShape> m_OutputShapes;
+        std::shared_ptr<armnn::InputTensors> m_InputTensors;
+        std::shared_ptr<armnn::OutputTensors> m_OutputTensors;
+        CallbackContext m_CallbackContext;
+    };
+
     Return<V1_0::ErrorStatus> Execute(const V1_0::Request& request,
                                       V1_2::MeasureTiming measureTiming,
                                       CallbackAsync_1_2 callback);
@@ -101,6 +153,14 @@ private:
     template<typename TensorBindingCollection>
     void DumpTensorsIfRequired(char const* tensorNamePrefix, const TensorBindingCollection& tensorBindings);
 
+    /// schedule the graph prepared from the request for execution
+    template<typename CallbackContext>
+    void ScheduleGraphForExecution(
+            std::shared_ptr<std::vector<android::nn::RunTimePoolInfo>>& pMemPools,
+            std::shared_ptr<armnn::InputTensors>& inputTensors,
+            std::shared_ptr<armnn::OutputTensors>& outputTensors,
+            CallbackContext m_CallbackContext);
+
     armnn::NetworkId m_NetworkId;
     armnn::IRuntime* m_Runtime;
     V1_2::Model m_Model;
@@ -112,6 +172,9 @@ private:
     uint32_t m_RequestCount;
     const std::string& m_RequestInputsAndOutputsDumpDir;
     const bool m_GpuProfilingEnabled;
+
+    std::unique_ptr<armnn::IWorkingMemHandle> m_WorkingMemHandle;
+    const bool m_AsyncModelExecutionEnabled;
 };
 
 }
diff --git a/ArmnnPreparedModel_1_3.cpp b/ArmnnPreparedModel_1_3.cpp
index 3d93b99d..5a370321 100644
--- a/ArmnnPreparedModel_1_3.cpp
+++ b/ArmnnPreparedModel_1_3.cpp
@@ -168,7 +168,8 @@ ArmnnPreparedModel_1_3<HalVersion>::ArmnnPreparedModel_1_3(armnn::NetworkId networkId,
                                                            armnn::IRuntime* runtime,
                                                            const V1_3::Model& model,
                                                            const std::string& requestInputsAndOutputsDumpDir,
                                                            const bool gpuProfilingEnabled,
-                                                           V1_3::Priority priority)
+                                                           V1_3::Priority priority,
+                                                           const bool asyncModelExecutionEnabled)
     : m_NetworkId(networkId)
     , m_Runtime(runtime)
     , m_Model(model)
@@ -176,9 +177,15 @@ ArmnnPreparedModel_1_3<HalVersion>::ArmnnPreparedModel_1_3(armnn::NetworkId networkId,
     , m_RequestInputsAndOutputsDumpDir(requestInputsAndOutputsDumpDir)
     , m_GpuProfilingEnabled(gpuProfilingEnabled)
     , m_ModelPriority(priority)
+    , m_AsyncModelExecutionEnabled(asyncModelExecutionEnabled)
 {
     // Enable profiling if required.
     m_Runtime->GetProfiler(m_NetworkId)->EnableProfiling(m_GpuProfilingEnabled);
+
+    if (asyncModelExecutionEnabled)
+    {
+        m_WorkingMemHandle = m_Runtime->CreateWorkingMemHandle(networkId);
+    }
 }
 
 template<typename HalVersion>
@@ -726,8 +733,17 @@ Return<V1_3::ErrorStatus> ArmnnPreparedModel_1_3<HalVersion>::ExecuteGraph(
         {
             cb.ctx.deviceStart = Now();
         }
-
-        armnn::Status status = m_Runtime->EnqueueWorkload(m_NetworkId, inputTensors, outputTensors);
+        armnn::Status status;
+        if (m_AsyncModelExecutionEnabled)
+        {
+            ALOGW("ArmnnPreparedModel_1_3::ExecuteGraph m_AsyncModelExecutionEnabled true");
+            status = m_Runtime->Execute(*m_WorkingMemHandle, inputTensors, outputTensors);
+        }
+        else
+        {
+            ALOGW("ArmnnPreparedModel_1_3::ExecuteGraph m_AsyncModelExecutionEnabled false");
+            status = m_Runtime->EnqueueWorkload(m_NetworkId, inputTensors, outputTensors);
+        }
 
         if (cb.ctx.measureTimings == V1_2::MeasureTiming::YES)
         {
@@ -735,7 +751,7 @@ Return<V1_3::ErrorStatus> ArmnnPreparedModel_1_3<HalVersion>::ExecuteGraph(
         }
         if (status != armnn::Status::Success)
         {
-            ALOGW("EnqueueWorkload failed");
+            ALOGW("ArmnnPreparedModel_1_3::ExecuteGraph EnqueueWorkload failed");
             cb.callback(V1_3::ErrorStatus::GENERAL_FAILURE, {}, g_NoTiming, "ArmnnPreparedModel_1_3::ExecuteGraph");
             return V1_3::ErrorStatus::GENERAL_FAILURE;
         }
@@ -773,6 +789,47 @@ Return<V1_3::ErrorStatus> ArmnnPreparedModel_1_3<HalVersion>::ExecuteGraph(
     return V1_3::ErrorStatus::NONE;
 }
 
+/// Schedule the graph prepared from the request for execution
+template<typename HalVersion>
+template<typename CallbackContext>
+void ArmnnPreparedModel_1_3<HalVersion>::ScheduleGraphForExecution(
+        std::shared_ptr<std::vector<android::nn::RunTimePoolInfo>>& pMemPools,
+        std::shared_ptr<armnn::InputTensors>& inputTensors,
+        std::shared_ptr<armnn::OutputTensors>& outputTensors,
+        CallbackContext callbackContext,
+        armnn::QosExecPriority priority)
+{
+    ALOGV("ArmnnPreparedModel_1_3::ScheduleGraphForExecution(...)");
+
+    DumpTensorsIfRequired("Input", *inputTensors);
+
+    unsigned int outputTensorSize = outputTensors.get()->size();
+    std::vector<V1_2::OutputShape> outputShapes(outputTensorSize);
+    for (unsigned int i = 0; i < outputTensorSize; i++)
+    {
+        std::pair<int, armnn::Tensor> outputTensorPair = outputTensors.get()->at(i);
+        const armnn::Tensor outputTensor = outputTensorPair.second;
+        const armnn::TensorInfo outputTensorInfo = outputTensor.GetInfo();
+
+        outputShapes[i] = ComputeShape(outputTensorInfo);
+    }
+
+    auto tpCb = std::make_shared<
+        ArmnnThreadPoolCallback_1_3<CallbackContext>>(this,
+                                                      pMemPools,
+                                                      outputShapes,
+                                                      inputTensors,
+                                                      outputTensors,
+                                                      callbackContext);
+
+    m_Runtime->Schedule(m_NetworkId,
+                        *tpCb->m_InputTensors,
+                        *tpCb->m_OutputTensors,
+                        priority,
+                        tpCb);
+    ALOGV("ArmnnPreparedModel_1_3::ScheduleGraphForExecution end");
+}
+
 template<typename HalVersion>
 bool ArmnnPreparedModel_1_3<HalVersion>::ExecuteWithDummyInputs()
 {
@@ -862,13 +919,46 @@ Return<V1_3::ErrorStatus> ArmnnPreparedModel_1_3<HalVersion>::Execute(const V1_3::Request& request,
        default:
            {}
    }
-
-    ALOGV("ArmnnPreparedModel_1_3::execute(...) before PostMsg");
-
-    // post the request for asynchronous execution
    CallbackContext_1_3 cb;
    cb.callback = callback;
    cb.ctx = ctx;
+
+
+    enum class QosExecPriority
+    {
+        Low    = 0,
+        Medium = 1,
+        High   = 2
+    };
+
+
+    if (m_AsyncModelExecutionEnabled)
+    {
+        armnn::QosExecPriority priority;
+
+        switch (GetModelPriority()) {
+            case V1_3::Priority::LOW:
+                priority = armnn::QosExecPriority::Low;
+                break;
+            case V1_3::Priority::MEDIUM:
+                priority = armnn::QosExecPriority::Medium;
+                break;
+            case V1_3::Priority::HIGH:
+                priority = armnn::QosExecPriority::High;
+                break;
+            default:
+                priority = armnn::QosExecPriority::Medium;
+
+        }
+
+        ALOGV("ArmnnPreparedModel_1_3::execute(...) before ScheduleGraphForExecution");
+        ScheduleGraphForExecution(memPools, inputTensors, outputTensors, cb, priority);
+        ALOGV("ArmnnPreparedModel_1_3::execute(...) after ScheduleGraphForExecution");
after ScheduleGraphForExecution"); + return V1_3::ErrorStatus::NONE; + } + + ALOGV("ArmnnPreparedModel_1_3::execute(...) before PostMsg"); + // post the request for asynchronous execution m_RequestThread.PostMsg(this, memPools, inputTensors, outputTensors, cb); ALOGV("ArmnnPreparedModel_1_3::execute(...) after PostMsg"); return V1_3::ErrorStatus::NONE; @@ -880,6 +970,46 @@ V1_3::Priority ArmnnPreparedModel_1_3::GetModelPriority() return m_ModelPriority; } +template +template +void ArmnnPreparedModel_1_3::ArmnnThreadPoolCallback_1_3::Notify( + armnn::Status status, armnn::InferenceTimingPair timeTaken) +{ + ALOGV("ArmnnPreparedModel_1_3::ArmnnThreadPoolCallback_1_3::Notify"); + CommitPools(*m_MemPools); + + m_Model->DumpTensorsIfRequired("Output", *m_OutputTensors); + + if (status != armnn::Status::Success) + { + ALOGW("ArmnnThreadPoolCallback_1_3::Notify EnqueueWorkload failed"); + m_CallbackContext.callback(V1_3::ErrorStatus::GENERAL_FAILURE, + {}, + g_NoTiming, + "ArmnnPreparedModel_1_3::ArmnnThreadPoolCallback_1_3"); + return; + } + + if (m_CallbackContext.ctx.measureTimings == V1_2::MeasureTiming::YES) + { + m_CallbackContext.ctx.deviceStart = timeTaken.first; + m_CallbackContext.ctx.deviceEnd = timeTaken.second; + m_CallbackContext.ctx.driverEnd = std::chrono::steady_clock::now(); + V1_2::Timing timing; + timing.timeOnDevice = MicrosecondsDuration(m_CallbackContext.ctx.deviceEnd, m_CallbackContext.ctx.deviceStart); + timing.timeInDriver = MicrosecondsDuration(m_CallbackContext.ctx.driverEnd, m_CallbackContext.ctx.driverStart); + ALOGV("ArmnnPreparedModel_1_3::execute timing - Device = %lu Driver = %lu", timing.timeOnDevice, + timing.timeInDriver); + m_CallbackContext.callback( + V1_3::ErrorStatus::NONE, m_OutputShapes, timing, "ArmnnPreparedModel_1_3::ExecuteGraph"); + } else + { + m_CallbackContext.callback( + V1_3::ErrorStatus::NONE, m_OutputShapes, g_NoTiming, "ArmnnPreparedModel_1_3::ExecuteGraph"); + } + return; +} + #ifdef ARMNN_ANDROID_NN_V1_3 template class ArmnnPreparedModel_1_3; template Return ArmnnPreparedModel_1_3::ExecuteGraph( @@ -887,6 +1017,13 @@ template Return ArmnnPreparedModel_1_3:: armnn::InputTensors& pInputTensors, armnn::OutputTensors& pOutputTensors, CallbackContext_1_3 cb); + +template void ArmnnPreparedModel_1_3::ScheduleGraphForExecution( + std::shared_ptr>& pMemPools, + std::shared_ptr& inputTensors, + std::shared_ptr& outputTensors, + CallbackContext_1_3 callbackContext, + armnn::QosExecPriority priority); #endif } // namespace armnn_driver diff --git a/ArmnnPreparedModel_1_3.hpp b/ArmnnPreparedModel_1_3.hpp index c6cdcdc7..11299cc4 100644 --- a/ArmnnPreparedModel_1_3.hpp +++ b/ArmnnPreparedModel_1_3.hpp @@ -51,7 +51,8 @@ public: const HalModel& model, const std::string& requestInputsAndOutputsDumpDir, const bool gpuProfilingEnabled, - V1_3::Priority priority = V1_3::Priority::MEDIUM); + V1_3::Priority priority = V1_3::Priority::MEDIUM, + const bool asyncModelExecutionEnabled = false); virtual ~ArmnnPreparedModel_1_3(); @@ -109,6 +110,57 @@ public: V1_3::Priority GetModelPriority(); private: + + template + class ArmnnThreadPoolCallback_1_3 : public armnn::IAsyncExecutionCallback + { + public: + ArmnnThreadPoolCallback_1_3(ArmnnPreparedModel_1_3* model, + std::shared_ptr>& pMemPools, + std::vector outputShapes, + std::shared_ptr& inputTensors, + std::shared_ptr& outputTensors, + CallbackContext callbackContext) : + m_Model(model), + m_MemPools(pMemPools), + m_OutputShapes(outputShapes), + m_InputTensors(inputTensors), + m_OutputTensors(outputTensors), + 
+            m_CallbackContext(callbackContext)
+        {}
+
+        void Notify(armnn::Status status, armnn::InferenceTimingPair timeTaken) override;
+
+        // Retrieve the Arm NN Status from the AsyncExecutionCallback that has been notified
+        virtual armnn::Status GetStatus() const override
+        {
+            return armnn::Status::Success;
+        }
+
+        // Block the calling thread until the AsyncExecutionCallback object allows it to proceed
+        virtual void Wait() const override
+        {}
+
+        // Retrieve the start time before executing the inference
+        virtual armnn::HighResolutionClock GetStartTime() const override
+        {
+            return std::chrono::high_resolution_clock::now();
+        }
+
+        // Retrieve the time after executing the inference
+        virtual armnn::HighResolutionClock GetEndTime() const override
+        {
+            return std::chrono::high_resolution_clock::now();
+        }
+
+        ArmnnPreparedModel_1_3<HalVersion>* m_Model;
+        std::shared_ptr<std::vector<android::nn::RunTimePoolInfo>> m_MemPools;
+        std::vector<V1_2::OutputShape> m_OutputShapes;
+        std::shared_ptr<armnn::InputTensors> m_InputTensors;
+        std::shared_ptr<armnn::OutputTensors> m_OutputTensors;
+        CallbackContext m_CallbackContext;
+    };
+
     Return<V1_3::ErrorStatus> Execute(const V1_3::Request& request,
                                       V1_2::MeasureTiming measureTiming,
                                       CallbackAsync_1_3 callback);
@@ -133,6 +185,15 @@ private:
     template<typename TensorBindingCollection>
     void DumpTensorsIfRequired(char const* tensorNamePrefix, const TensorBindingCollection& tensorBindings);
 
+    /// schedule the graph prepared from the request for execution
+    template<typename CallbackContext>
+    void ScheduleGraphForExecution(
+            std::shared_ptr<std::vector<android::nn::RunTimePoolInfo>>& pMemPools,
+            std::shared_ptr<armnn::InputTensors>& inputTensors,
+            std::shared_ptr<armnn::OutputTensors>& outputTensors,
+            CallbackContext m_CallbackContext,
+            armnn::QosExecPriority priority);
+
     armnn::NetworkId m_NetworkId;
     armnn::IRuntime* m_Runtime;
     V1_3::Model m_Model;
@@ -143,6 +204,9 @@ private:
     const std::string& m_RequestInputsAndOutputsDumpDir;
     const bool m_GpuProfilingEnabled;
     V1_3::Priority m_ModelPriority;
+
+    std::unique_ptr<armnn::IWorkingMemHandle> m_WorkingMemHandle;
+    const bool m_AsyncModelExecutionEnabled;
 };
 
 }
diff --git a/DriverOptions.cpp b/DriverOptions.cpp
index 42f7ea9e..5b67aa36 100644
--- a/DriverOptions.cpp
+++ b/DriverOptions.cpp
@@ -39,6 +39,8 @@ DriverOptions::DriverOptions(armnn::Compute computeDevice, bool fp16Enabled)
     , m_ShouldExit(false)
     , m_SaveCachedNetwork(false)
     , m_NumberOfThreads(0)
+    , m_EnableAsyncModelExecution(false)
+    , m_ArmnnNumberOfThreads(1)
 {
 }
 
@@ -53,6 +55,8 @@ DriverOptions::DriverOptions(const std::vector<armnn::BackendId>& backends, bool fp16Enabled)
     , m_ShouldExit(false)
     , m_SaveCachedNetwork(false)
     , m_NumberOfThreads(0)
+    , m_EnableAsyncModelExecution(false)
+    , m_ArmnnNumberOfThreads(1)
 {
 }
 
@@ -66,6 +70,8 @@ DriverOptions::DriverOptions(int argc, char** argv)
     , m_ShouldExit(false)
     , m_SaveCachedNetwork(false)
     , m_NumberOfThreads(0)
+    , m_EnableAsyncModelExecution(false)
+    , m_ArmnnNumberOfThreads(1)
 {
     std::string unsupportedOperationsAsString;
     std::string clTunedParametersModeAsString;
@@ -154,7 +160,16 @@ DriverOptions::DriverOptions(int argc, char** argv)
         cxxopts::value<bool>(m_VerboseLogging)->default_value("false"))
 
         ("V,version", "Show version information",
-        cxxopts::value<bool>(showVersion)->default_value("false"));
+        cxxopts::value<bool>(showVersion)->default_value("false"))
+
+        ("A,asyncModelExecution", "Enable AsynModel Execution",
+        cxxopts::value<bool>(m_EnableAsyncModelExecution)->default_value("false"))
+
+        ("T,armnn-threads",
+         "Assign the number of threads used by ArmNN. "
+         "Input value must be at least 1. "
" + "Default is set to 1.", + cxxopts::value(m_ArmnnNumberOfThreads)->default_value("1")); } catch (const std::exception& e) { diff --git a/DriverOptions.hpp b/DriverOptions.hpp index 8b3f5743..e1d25c48 100644 --- a/DriverOptions.hpp +++ b/DriverOptions.hpp @@ -40,6 +40,8 @@ public: const std::string& GetCachedNetworkFilePath() const { return m_CachedNetworkFilePath; } bool SaveCachedNetwork() const { return m_SaveCachedNetwork; } unsigned int GetNumberOfThreads() const { return m_NumberOfThreads; } + bool isAsyncModelExecutionEnabled() const { return m_EnableAsyncModelExecution; }; + unsigned int getNoOfArmnnThreads() const { return m_ArmnnNumberOfThreads; }; private: std::vector m_Backends; @@ -59,6 +61,8 @@ private: std::string m_CachedNetworkFilePath; bool m_SaveCachedNetwork; unsigned int m_NumberOfThreads; + bool m_EnableAsyncModelExecution; + unsigned int m_ArmnnNumberOfThreads; }; } // namespace armnn_driver -- cgit v1.2.1