diff options
Diffstat (limited to 'ArmnnPreparedModel.cpp')
-rw-r--r-- | ArmnnPreparedModel.cpp | 212 |
1 file changed, 200 insertions, 12 deletions
diff --git a/ArmnnPreparedModel.cpp b/ArmnnPreparedModel.cpp index 60beac4f..d87f9f82 100644 --- a/ArmnnPreparedModel.cpp +++ b/ArmnnPreparedModel.cpp @@ -1,5 +1,5 @@ // -// Copyright © 2017 Arm Ltd. All rights reserved. +// Copyright © 2017-2023 Arm Ltd and Contributors. All rights reserved. // SPDX-License-Identifier: MIT // @@ -8,11 +8,13 @@ #include "ArmnnPreparedModel.hpp" #include "Utils.hpp" +#include <armnn/Types.hpp> + #include <log/log.h> #include <OperationsUtils.h> #include <ValidateHal.h> -#include <cassert> +#include <chrono> #include <cinttypes> #ifdef ARMNN_ANDROID_S @@ -90,6 +92,9 @@ RequestThread<ArmnnPreparedModel, HalVersion, CallbackContext_1_0> ArmnnPreparedModel<HalVersion>::m_RequestThread; template<typename HalVersion> +std::unique_ptr<armnn::Threadpool> ArmnnPreparedModel<HalVersion>::m_Threadpool(nullptr); + +template<typename HalVersion> template <typename TensorBindingCollection> void ArmnnPreparedModel<HalVersion>::DumpTensorsIfRequired(char const* tensorNamePrefix, const TensorBindingCollection& tensorBindings) @@ -112,16 +117,43 @@ ArmnnPreparedModel<HalVersion>::ArmnnPreparedModel(armnn::NetworkId networkId, armnn::IRuntime* runtime, const HalModel& model, const std::string& requestInputsAndOutputsDumpDir, - const bool gpuProfilingEnabled) + const bool gpuProfilingEnabled, + const bool asyncModelExecutionEnabled, + const unsigned int numberOfThreads, + const bool importEnabled, + const bool exportEnabled) : m_NetworkId(networkId) , m_Runtime(runtime) , m_Model(model) , m_RequestCount(0) , m_RequestInputsAndOutputsDumpDir(requestInputsAndOutputsDumpDir) , m_GpuProfilingEnabled(gpuProfilingEnabled) + , m_AsyncModelExecutionEnabled(asyncModelExecutionEnabled) + , m_EnableImport(importEnabled) + , m_EnableExport(exportEnabled) { // Enable profiling if required. 
m_Runtime->GetProfiler(m_NetworkId)->EnableProfiling(m_GpuProfilingEnabled); + + if (m_AsyncModelExecutionEnabled) + { + std::vector<std::shared_ptr<armnn::IWorkingMemHandle>> memHandles; + for (unsigned int i=0; i < numberOfThreads; ++i) + { + memHandles.emplace_back(m_Runtime->CreateWorkingMemHandle(networkId)); + } + + if (!m_Threadpool) + { + m_Threadpool = std::make_unique<armnn::Threadpool>(numberOfThreads, runtime, memHandles); + } + else + { + m_Threadpool->LoadMemHandles(memHandles); + } + + m_WorkingMemHandle = memHandles.back(); + } } template<typename HalVersion> @@ -129,12 +161,21 @@ ArmnnPreparedModel<HalVersion>::~ArmnnPreparedModel() { // Get a hold of the profiler used by this model. std::shared_ptr<armnn::IProfiler> profiler = m_Runtime->GetProfiler(m_NetworkId); + if (profiler && m_GpuProfilingEnabled) + { + // Dump the profiling info to a file if required. + DumpJsonProfilingIfRequired(m_GpuProfilingEnabled, m_RequestInputsAndOutputsDumpDir, m_NetworkId, + profiler.get()); + } // Unload the network associated with this model. m_Runtime->UnloadNetwork(m_NetworkId); - // Dump the profiling info to a file if required. 
- DumpJsonProfilingIfRequired(m_GpuProfilingEnabled, m_RequestInputsAndOutputsDumpDir, m_NetworkId, profiler.get()); + // Unload the network memhandles from the threadpool + if (m_AsyncModelExecutionEnabled) + { + m_Threadpool->UnloadMemHandles(m_NetworkId); + } } template<typename HalVersion> @@ -177,6 +218,7 @@ Return<V1_0::ErrorStatus> ArmnnPreparedModel<HalVersion>::execute( NotifyCallbackAndCheck(callback, V1_0::ErrorStatus::GENERAL_FAILURE, "ArmnnPreparedModel::execute"); return V1_0::ErrorStatus::GENERAL_FAILURE; } + // add the inputs and outputs with their data try { @@ -184,8 +226,19 @@ Return<V1_0::ErrorStatus> ArmnnPreparedModel<HalVersion>::execute( for (unsigned int i = 0; i < request.inputs.size(); i++) { const auto& inputArg = request.inputs[i]; + armnn::TensorInfo inputTensorInfo = m_Runtime->GetInputTensorInfo(m_NetworkId, i); + // pInputTensors (of type InputTensors) is composed of a vector of ConstTensors. + // Therefore, set all TensorInfo isConstant parameters of input Tensors to true. 
+ inputTensorInfo.SetConstant(); + auto result = ValidateRequestArgument<V1_0::ErrorStatus, V1_0::Request>(request, + inputTensorInfo, + inputArg, + "input"); + if (result != V1_0::ErrorStatus::NONE) + { + return result; + } - const armnn::TensorInfo inputTensorInfo = m_Runtime->GetInputTensorInfo(m_NetworkId, i); const armnn::Tensor inputTensor = GetTensorForRequestArgument(inputArg, inputTensorInfo, *pMemPools); if (inputTensor.GetMemoryArea() == nullptr) { @@ -200,8 +253,17 @@ Return<V1_0::ErrorStatus> ArmnnPreparedModel<HalVersion>::execute( for (unsigned int i = 0; i < request.outputs.size(); i++) { const auto& outputArg = request.outputs[i]; - const armnn::TensorInfo outputTensorInfo = m_Runtime->GetOutputTensorInfo(m_NetworkId, i); + armnn::TensorInfo outputTensorInfo = m_Runtime->GetOutputTensorInfo(m_NetworkId, i); + auto result = ValidateRequestArgument<V1_0::ErrorStatus, V1_0::Request>(request, + outputTensorInfo, + outputArg, + "output"); + + if (result != V1_0::ErrorStatus::NONE) + { + return result; + } + const armnn::Tensor outputTensor = GetTensorForRequestArgument(outputArg, outputTensorInfo, *pMemPools); if (outputTensor.GetMemoryArea() == nullptr) { @@ -225,8 +287,6 @@ Return<V1_0::ErrorStatus> ArmnnPreparedModel<HalVersion>::execute( return V1_0::ErrorStatus::GENERAL_FAILURE; } - ALOGV("ArmnnPreparedModel::execute(...) before PostMsg"); - auto cb = [callback](V1_0::ErrorStatus errorStatus, std::string callingFunction) { NotifyCallbackAndCheck(callback, errorStatus, callingFunction); @@ -234,7 +294,17 @@ Return<V1_0::ErrorStatus> ArmnnPreparedModel<HalVersion>::execute( CallbackContext_1_0 armnnCb; armnnCb.callback = cb; + + if (m_AsyncModelExecutionEnabled) + { + ALOGV("ArmnnPreparedModel::execute(...) before ScheduleGraphForExecution"); + ScheduleGraphForExecution(pMemPools, pInputTensors, pOutputTensors, armnnCb); + ALOGV("ArmnnPreparedModel::execute(...) after ScheduleGraphForExecution"); + return V1_0::ErrorStatus::NONE; + } + // post the request for asynchronous execution + ALOGV("ArmnnPreparedModel::execute(...) 
before PostMsg"); m_RequestThread.PostMsg(this, pMemPools, pInputTensors, pOutputTensors, armnnCb); ALOGV("ArmnnPreparedModel::execute(...) after PostMsg"); return V1_0::ErrorStatus::NONE; // successfully queued @@ -248,13 +318,37 @@ void ArmnnPreparedModel<HalVersion>::ExecuteGraph( CallbackContext_1_0 cb) { ALOGV("ArmnnPreparedModel::ExecuteGraph(...)"); + // Capture the graph execution start time. + std::chrono::time_point<std::chrono::system_clock> graphExecutionStart = std::chrono::system_clock::now(); DumpTensorsIfRequired("Input", inputTensors); // run it try { - armnn::Status status = m_Runtime->EnqueueWorkload(m_NetworkId, inputTensors, outputTensors); + armnn::Status status; + if (m_AsyncModelExecutionEnabled) + { + ALOGW("ArmnnPreparedModel::ExecuteGraph m_AsyncModelExecutionEnabled true"); + status = m_Runtime->Execute(*m_WorkingMemHandle, inputTensors, outputTensors); + } + else + { + ALOGW("ArmnnPreparedModel::ExecuteGraph m_AsyncModelExecutionEnabled false"); + // Create a vector of Input and Output Ids which can be imported. An empty vector means all will be copied. + std::vector<armnn::ImportedInputId> importedInputIds; + if (m_EnableImport) + { + importedInputIds = m_Runtime->ImportInputs(m_NetworkId, inputTensors, armnn::MemorySource::Malloc); + } + std::vector<armnn::ImportedOutputId> importedOutputIds; + if (m_EnableExport) + { + importedOutputIds = m_Runtime->ImportOutputs(m_NetworkId, outputTensors, armnn::MemorySource::Malloc); + } + status = m_Runtime->EnqueueWorkload(m_NetworkId, inputTensors, outputTensors, + importedInputIds, importedOutputIds); + } if (status != armnn::Status::Success) { ALOGW("EnqueueWorkload failed"); @@ -291,6 +385,12 @@ void ArmnnPreparedModel<HalVersion>::ExecuteGraph( #endif } + // Log the total time in this call. This is a good number to compare to that printed out by + // RuntimeImpl::EnqueueWorkload. The difference should be the execution overhead of the driver. 
+ ALOGI("ArmnnPreparedModel::ExecuteGraph Execution time = %lld µs", + std::chrono::duration_cast<std::chrono::microseconds> + (std::chrono::system_clock::now() - graphExecutionStart).count()); + cb.callback(V1_0::ErrorStatus::NONE, "ExecuteGraph"); } @@ -301,7 +401,11 @@ bool ArmnnPreparedModel<HalVersion>::ExecuteWithDummyInputs() armnn::InputTensors inputTensors; for (unsigned int i = 0; i < getMainModel(m_Model).inputIndexes.size(); i++) { - const armnn::TensorInfo inputTensorInfo = m_Runtime->GetInputTensorInfo(m_NetworkId, i); + armnn::TensorInfo inputTensorInfo = m_Runtime->GetInputTensorInfo(m_NetworkId, i); + // pInputTensors (of type InputTensors) is composed of a vector of ConstTensors. + // Therefore, set all TensorInfo isConstant parameters of input Tensors to true. + inputTensorInfo.SetConstant(); + storage.emplace_back(inputTensorInfo.GetNumBytes()); const armnn::ConstTensor inputTensor(inputTensorInfo, storage.back().data()); @@ -320,7 +424,29 @@ bool ArmnnPreparedModel<HalVersion>::ExecuteWithDummyInputs() try { - armnn::Status status = m_Runtime->EnqueueWorkload(m_NetworkId, inputTensors, outputTensors); + armnn::Status status; + if (m_AsyncModelExecutionEnabled) + { + ALOGW("ArmnnPreparedModel::ExecuteGraph m_AsyncModelExecutionEnabled true"); + status = m_Runtime->Execute(*m_WorkingMemHandle, inputTensors, outputTensors); + } + else + { + ALOGW("ArmnnPreparedModel::ExecuteGraph m_AsyncModelExecutionEnabled false"); + // Create a vector of Input and Output Ids which can be imported. An empty vector means all will be copied. 
+ std::vector<armnn::ImportedInputId> importedInputIds; + if (m_EnableImport) + { + importedInputIds = m_Runtime->ImportInputs(m_NetworkId, inputTensors, armnn::MemorySource::Malloc); + } + std::vector<armnn::ImportedOutputId> importedOutputIds; + if (m_EnableExport) + { + importedOutputIds = m_Runtime->ImportOutputs(m_NetworkId, outputTensors, armnn::MemorySource::Malloc); + } + status = m_Runtime->EnqueueWorkload(m_NetworkId, inputTensors, outputTensors, + importedInputIds, importedOutputIds); + } if (status != armnn::Status::Success) { ALOGW("ExecuteWithDummyInputs: EnqueueWorkload failed"); @@ -340,11 +466,73 @@ bool ArmnnPreparedModel<HalVersion>::ExecuteWithDummyInputs() return true; } +/// Schedule the graph prepared from the request for execution +template<typename HalVersion> +template<typename CallbackContext> +void ArmnnPreparedModel<HalVersion>::ScheduleGraphForExecution( + std::shared_ptr<std::vector<::android::nn::RunTimePoolInfo>>& pMemPools, + std::shared_ptr<armnn::InputTensors>& inputTensors, + std::shared_ptr<armnn::OutputTensors>& outputTensors, + CallbackContext callbackContext) +{ + ALOGV("ArmnnPreparedModel::ScheduleGraphForExecution(...)"); + + DumpTensorsIfRequired("Input", *inputTensors); + + + auto tpCb = std::make_shared< + ArmnnThreadPoolCallback<CallbackContext_1_0>>(this, + pMemPools, + inputTensors, + outputTensors, + callbackContext); + + m_Threadpool->Schedule(m_NetworkId, + *tpCb->m_InputTensors, + *tpCb->m_OutputTensors, + armnn::QosExecPriority::Medium, + tpCb); + ALOGV("ArmnnPreparedModel::ScheduleGraphForExecution end"); +} + +template<typename HalVersion> +template <typename CallbackContext> +void ArmnnPreparedModel<HalVersion>::ArmnnThreadPoolCallback<CallbackContext>::Notify( + armnn::Status status, armnn::InferenceTimingPair timeTaken) +{ + armnn::IgnoreUnused(status, timeTaken); + ALOGV("ArmnnPreparedModel::ArmnnThreadPoolCallback_1_2 Notify"); + + m_Model->DumpTensorsIfRequired("Output", *m_OutputTensors); + + // Commit 
output buffers. + // Note that we update *all* pools, even if they aren't actually used as outputs - + // this is simpler and is what the CpuExecutor does. + for (android::nn::RunTimePoolInfo& pool : *m_MemPools) + { + // Type android::nn::RunTimePoolInfo has changed between Android P & Q and Android R, where + // update() has been removed and flush() added. + #if defined(ARMNN_ANDROID_R) || defined(ARMNN_ANDROID_S) // Use the new Android implementation. + pool.flush(); + #else + pool.update(); + #endif + } + + m_CallbackContext.callback(V1_0::ErrorStatus::NONE, "ArmnnPreparedModel::ArmnnThreadPoolCallback_1_2 Notify"); + return; +} + /// /// Class template specializations /// template class ArmnnPreparedModel<hal_1_0::HalPolicy>; +template void ArmnnPreparedModel<hal_1_0::HalPolicy>::ScheduleGraphForExecution<CallbackContext_1_0>( + std::shared_ptr<std::vector<::android::nn::RunTimePoolInfo>>& pMemPools, + std::shared_ptr<armnn::InputTensors>& inputTensors, + std::shared_ptr<armnn::OutputTensors>& outputTensors, + CallbackContext_1_0 callbackContext); #ifdef ARMNN_ANDROID_NN_V1_1 template class ArmnnPreparedModel<hal_1_1::HalPolicy>; |