diff options
Diffstat (limited to 'ArmnnPreparedModel_1_2.cpp')
-rw-r--r-- | ArmnnPreparedModel_1_2.cpp | 281 |
1 files changed, 261 insertions, 20 deletions
diff --git a/ArmnnPreparedModel_1_2.cpp b/ArmnnPreparedModel_1_2.cpp index a2148c29..a401b30e 100644 --- a/ArmnnPreparedModel_1_2.cpp +++ b/ArmnnPreparedModel_1_2.cpp @@ -1,19 +1,22 @@ // -// Copyright © 2017 Arm Ltd. All rights reserved. +// Copyright © 2017-2023 Arm Ltd and Contributors. All rights reserved. // SPDX-License-Identifier: MIT // #define LOG_TAG "ArmnnDriver" #include "ArmnnPreparedModel_1_2.hpp" + #include "Utils.hpp" +#include <armnn/Types.hpp> + #include <log/log.h> #include <OperationsUtils.h> #include <ExecutionBurstServer.h> #include <ValidateHal.h> -#include <cassert> +#include <chrono> #include <cinttypes> #ifdef ARMNN_ANDROID_S @@ -124,6 +127,9 @@ RequestThread<ArmnnPreparedModel_1_2, HalVersion, CallbackContext_1_2> ArmnnPreparedModel_1_2<HalVersion>::m_RequestThread; template<typename HalVersion> +std::unique_ptr<armnn::Threadpool> ArmnnPreparedModel_1_2<HalVersion>::m_Threadpool(nullptr); + +template<typename HalVersion> template<typename TensorBindingCollection> void ArmnnPreparedModel_1_2<HalVersion>::DumpTensorsIfRequired(char const* tensorNamePrefix, const TensorBindingCollection& tensorBindings) @@ -146,16 +152,88 @@ ArmnnPreparedModel_1_2<HalVersion>::ArmnnPreparedModel_1_2(armnn::NetworkId netw armnn::IRuntime* runtime, const V1_2::Model& model, const std::string& requestInputsAndOutputsDumpDir, - const bool gpuProfilingEnabled) + const bool gpuProfilingEnabled, + const bool asyncModelExecutionEnabled, + const unsigned int numberOfThreads, + const bool importEnabled, + const bool exportEnabled) : m_NetworkId(networkId) , m_Runtime(runtime) , m_Model(model) , m_RequestCount(0) , m_RequestInputsAndOutputsDumpDir(requestInputsAndOutputsDumpDir) , m_GpuProfilingEnabled(gpuProfilingEnabled) + , m_AsyncModelExecutionEnabled(asyncModelExecutionEnabled) + , m_EnableImport(importEnabled) + , m_EnableExport(exportEnabled) + , m_PreparedFromCache(false) { // Enable profiling if required. m_Runtime->GetProfiler(m_NetworkId)->EnableProfiling(m_GpuProfilingEnabled); + + if (m_AsyncModelExecutionEnabled) + { + std::vector<std::shared_ptr<armnn::IWorkingMemHandle>> memHandles; + for (unsigned int i=0; i < numberOfThreads; ++i) + { + memHandles.emplace_back(m_Runtime->CreateWorkingMemHandle(networkId)); + } + + if (!m_Threadpool) + { + m_Threadpool = std::make_unique<armnn::Threadpool>(numberOfThreads, runtime, memHandles); + } + else + { + m_Threadpool->LoadMemHandles(memHandles); + } + + m_WorkingMemHandle = memHandles.back(); + } +} + +template<typename HalVersion> +ArmnnPreparedModel_1_2<HalVersion>::ArmnnPreparedModel_1_2(armnn::NetworkId networkId, + armnn::IRuntime* runtime, + const std::string& requestInputsAndOutputsDumpDir, + const bool gpuProfilingEnabled, + const bool asyncModelExecutionEnabled, + const unsigned int numberOfThreads, + const bool importEnabled, + const bool exportEnabled, + const bool preparedFromCache) + : m_NetworkId(networkId) + , m_Runtime(runtime) + , m_RequestCount(0) + , m_RequestInputsAndOutputsDumpDir(requestInputsAndOutputsDumpDir) + , m_GpuProfilingEnabled(gpuProfilingEnabled) + , m_AsyncModelExecutionEnabled(asyncModelExecutionEnabled) + , m_EnableImport(importEnabled) + , m_EnableExport(exportEnabled) + , m_PreparedFromCache(preparedFromCache) +{ + // Enable profiling if required. + m_Runtime->GetProfiler(m_NetworkId)->EnableProfiling(m_GpuProfilingEnabled); + + if (m_AsyncModelExecutionEnabled) + { + std::vector<std::shared_ptr<armnn::IWorkingMemHandle>> memHandles; + for (unsigned int i=0; i < numberOfThreads; ++i) + { + memHandles.emplace_back(m_Runtime->CreateWorkingMemHandle(networkId)); + } + + if (!m_Threadpool) + { + m_Threadpool = std::make_unique<armnn::Threadpool>(numberOfThreads, runtime, memHandles); + } + else + { + m_Threadpool->LoadMemHandles(memHandles); + } + + m_WorkingMemHandle = memHandles.back(); + } } template<typename HalVersion> @@ -163,12 +241,21 @@ ArmnnPreparedModel_1_2<HalVersion>::~ArmnnPreparedModel_1_2() { // Get a hold of the profiler used by this model. std::shared_ptr<armnn::IProfiler> profiler = m_Runtime->GetProfiler(m_NetworkId); + if (profiler && m_GpuProfilingEnabled) + { + // Dump the profiling info to a file if required. + DumpJsonProfilingIfRequired(m_GpuProfilingEnabled, m_RequestInputsAndOutputsDumpDir, m_NetworkId, + profiler.get()); + } // Unload the network associated with this model. m_Runtime->UnloadNetwork(m_NetworkId); - // Dump the profiling info to a file if required. - DumpJsonProfilingIfRequired(m_GpuProfilingEnabled, m_RequestInputsAndOutputsDumpDir, m_NetworkId, profiler.get()); + // Unload the network memhandles from the threadpool + if (m_AsyncModelExecutionEnabled) + { + m_Threadpool->UnloadMemHandles(m_NetworkId); + } } template<typename HalVersion> @@ -225,8 +312,20 @@ Return<V1_0::ErrorStatus> ArmnnPreparedModel_1_2<HalVersion>::PrepareMemoryForIn for (unsigned int i = 0; i < request.inputs.size(); i++) { const auto& inputArg = request.inputs[i]; + armnn::TensorInfo inputTensorInfo = m_Runtime->GetInputTensorInfo(m_NetworkId, i); + // inputs (of type InputTensors) is composed of a vector of ConstTensors. + // Therefore, set all TensorInfo isConstant parameters of input Tensors to true. + inputTensorInfo.SetConstant(); + auto result = ValidateRequestArgument<V1_0::ErrorStatus, V1_0::Request>(request, + inputTensorInfo, + inputArg, + "input"); + + if (result != V1_0::ErrorStatus::NONE) + { + return result; + } - const armnn::TensorInfo inputTensorInfo = m_Runtime->GetInputTensorInfo(m_NetworkId, i); const armnn::Tensor inputTensor = GetTensorForRequestArgument(inputArg, inputTensorInfo, memPools); if (inputTensor.GetMemoryArea() == nullptr) @@ -252,8 +351,17 @@ Return<V1_0::ErrorStatus> ArmnnPreparedModel_1_2<HalVersion>::PrepareMemoryForOu for (unsigned int i = 0; i < request.outputs.size(); i++) { const auto& outputArg = request.outputs[i]; + armnn::TensorInfo outputTensorInfo = m_Runtime->GetOutputTensorInfo(m_NetworkId, i); + auto result = ValidateRequestArgument<V1_0::ErrorStatus, V1_0::Request>(request, + outputTensorInfo, + outputArg, + "output"); + + if (result != V1_0::ErrorStatus::NONE) + { + return result; + } - const armnn::TensorInfo outputTensorInfo = m_Runtime->GetOutputTensorInfo(m_NetworkId, i); const armnn::Tensor outputTensor = GetTensorForRequestArgument(outputArg, outputTensorInfo, memPools); if (outputTensor.GetMemoryArea() == nullptr) { @@ -277,7 +385,7 @@ Return<V1_0::ErrorStatus> ArmnnPreparedModel_1_2<HalVersion>::PrepareMemoryForOu return V1_0::ErrorStatus::OUTPUT_INSUFFICIENT_SIZE; } #else - const size_t bufferSize = memPools.at(outputArg.location.poolIndex).getMemory().size; + const size_t bufferSize = memPools.at(outputArg.location.poolIndex).getSize(); if (bufferSize < outputSize) { ALOGW("ArmnnPreparedModel_1_2::Execute failed bufferSize (%s) < outputSize (%s)", @@ -352,7 +460,10 @@ Return<void> ArmnnPreparedModel_1_2<HalVersion>::executeSynchronously(const V1_0 V1_2::MeasureTiming measureTiming, executeSynchronously_cb cb) { - ALOGV("ArmnnPreparedModel_1_2::executeSynchronously(): %s", GetModelSummary(m_Model).c_str()); + if (!m_PreparedFromCache) + { + ALOGV("ArmnnPreparedModel_1_2::executeSynchronously(): %s", GetModelSummary(m_Model).c_str()); + } m_RequestCount++; if (cb == nullptr) @@ -368,7 +479,7 @@ Return<void> ArmnnPreparedModel_1_2<HalVersion>::executeSynchronously(const V1_0 driverStart = Now(); } - if (!android::nn::validateRequest(request, m_Model)) + if (!m_PreparedFromCache && !android::nn::validateRequest(request, m_Model)) { ALOGE("ArmnnPreparedModel_1_2::executeSynchronously invalid request model"); cb(V1_0::ErrorStatus::INVALID_ARGUMENT, {}, g_NoTiming); @@ -419,6 +530,8 @@ bool ArmnnPreparedModel_1_2<HalVersion>::ExecuteGraph( ALOGV("ArmnnPreparedModel_1_2::ExecuteGraph(...)"); TimePoint driverEnd, deviceStart, deviceEnd; + // Capture the graph execution start time. + std::chrono::time_point<std::chrono::system_clock> graphExecutionStart = std::chrono::system_clock::now(); DumpTensorsIfRequired("Input", inputTensors); @@ -440,7 +553,30 @@ bool ArmnnPreparedModel_1_2<HalVersion>::ExecuteGraph( deviceStart = Now(); } - armnn::Status status = m_Runtime->EnqueueWorkload(m_NetworkId, inputTensors, outputTensors); + armnn::Status status; + if (m_AsyncModelExecutionEnabled) + { + ALOGW("ArmnnPreparedModel_1_2::ExecuteGraph m_AsyncModelExecutionEnabled true"); + status = m_Runtime->Execute(*m_WorkingMemHandle, inputTensors, outputTensors); + } + else + { + ALOGW("ArmnnPreparedModel_1_2::ExecuteGraph m_AsyncModelExecutionEnabled false"); + + // Create a vector of Input and Output Ids which can be imported. An empty vector means all will be copied. + std::vector<armnn::ImportedInputId> importedInputIds; + if (m_EnableImport) + { + importedInputIds = m_Runtime->ImportInputs(m_NetworkId, inputTensors, armnn::MemorySource::Malloc); + } + std::vector<armnn::ImportedOutputId> importedOutputIds; + if (m_EnableExport) + { + importedOutputIds = m_Runtime->ImportOutputs(m_NetworkId, outputTensors, armnn::MemorySource::Malloc); + } + status = m_Runtime->EnqueueWorkload(m_NetworkId, inputTensors, outputTensors, + importedInputIds, importedOutputIds); + } if (cb.ctx.measureTimings == V1_2::MeasureTiming::YES) { @@ -477,24 +613,33 @@ bool ArmnnPreparedModel_1_2<HalVersion>::ExecuteGraph( V1_2::Timing timing; timing.timeOnDevice = MicrosecondsDuration(deviceEnd, deviceStart); timing.timeInDriver = MicrosecondsDuration(driverEnd, cb.ctx.driverStart); - ALOGV("ArmnnPreparedModel_1_2::execute timing - Device = %lu Driver = %lu", timing.timeOnDevice, - timing.timeInDriver); + ALOGV("ArmnnPreparedModel_1_2::execute timing - Device = %lu Driver = %lu", + static_cast<unsigned long>(timing.timeOnDevice), static_cast<unsigned long>(timing.timeInDriver)); cb.callback(V1_0::ErrorStatus::NONE, outputShapes, timing, "ArmnnPreparedModel_1_2::ExecuteGraph"); } else { cb.callback(V1_0::ErrorStatus::NONE, outputShapes, g_NoTiming, "ArmnnPreparedModel_1_2::ExecuteGraph"); } + // Log the total time in this call. This is a good number to compare to that printed out by + // RuntimeImpl::EnqueueWorkload. The difference should be the execution overhead of the driver. + ALOGI("ArmnnPreparedModel_1_2::ExecuteGraph Execution time = %lld µs", + std::chrono::duration_cast<std::chrono::microseconds> + (std::chrono::system_clock::now() - graphExecutionStart).count()); return true; } template<typename HalVersion> -bool ArmnnPreparedModel_1_2<HalVersion>::ExecuteWithDummyInputs() +bool ArmnnPreparedModel_1_2<HalVersion>::ExecuteWithDummyInputs(unsigned int numInputs, unsigned int numOutputs) { std::vector<std::vector<char>> storage; armnn::InputTensors inputTensors; - for (unsigned int i = 0; i < getMainModel(m_Model).inputIndexes.size(); i++) + for (unsigned int i = 0; i < numInputs; i++) { - const armnn::TensorInfo inputTensorInfo = m_Runtime->GetInputTensorInfo(m_NetworkId, i); + armnn::TensorInfo inputTensorInfo = m_Runtime->GetInputTensorInfo(m_NetworkId, i); + // pInputTensors (of type InputTensors) is composed of a vector of ConstTensors. + // Therefore, set all TensorInfo isConstant parameters of input Tensors to true. + inputTensorInfo.SetConstant(); + storage.emplace_back(inputTensorInfo.GetNumBytes()); const armnn::ConstTensor inputTensor(inputTensorInfo, storage.back().data()); @@ -502,7 +647,7 @@ bool ArmnnPreparedModel_1_2<HalVersion>::ExecuteWithDummyInputs() } armnn::OutputTensors outputTensors; - for (unsigned int i = 0; i < getMainModel(m_Model).outputIndexes.size(); i++) + for (unsigned int i = 0; i < numOutputs; i++) { const armnn::TensorInfo outputTensorInfo = m_Runtime->GetOutputTensorInfo(m_NetworkId, i); storage.emplace_back(outputTensorInfo.GetNumBytes()); @@ -534,10 +679,13 @@ Return <V1_0::ErrorStatus> ArmnnPreparedModel_1_2<HalVersion>::Execute(const V1_ ctx.driverStart = Now(); } - ALOGV("ArmnnPreparedModel_1_2::execute(): %s", GetModelSummary(m_Model).c_str()); + if (!m_PreparedFromCache) + { + ALOGV("ArmnnPreparedModel_1_2::execute(): %s", GetModelSummary(m_Model).c_str()); + } m_RequestCount++; - if (!android::nn::validateRequest(request, m_Model)) + if (!m_PreparedFromCache && !android::nn::validateRequest(request, m_Model)) { callback(V1_0::ErrorStatus::INVALID_ARGUMENT, {}, g_NoTiming, "ArmnnPreparedModel_1_2::execute"); return V1_0::ErrorStatus::INVALID_ARGUMENT; @@ -567,12 +715,21 @@ Return <V1_0::ErrorStatus> ArmnnPreparedModel_1_2<HalVersion>::Execute(const V1_ {} } - ALOGV("ArmnnPreparedModel_1_2::execute(...) before PostMsg"); // post the request for asynchronous execution CallbackContext_1_2 cb; cb.callback = callback; cb.ctx = ctx; + + if (m_AsyncModelExecutionEnabled) + { + ALOGV("ArmnnPreparedModel_1_2::execute(...) before ScheduleGraphForExecution"); + ScheduleGraphForExecution(memPools, inputTensors, outputTensors, cb); + ALOGV("ArmnnPreparedModel_1_2::execute(...) after ScheduleGraphForExecution"); + return V1_0::ErrorStatus::NONE; + } + + ALOGV("ArmnnPreparedModel_1_2::execute(...) before PostMsg"); m_RequestThread.PostMsg(this, memPools, inputTensors, outputTensors, cb); ALOGV("ArmnnPreparedModel_1_2::execute(...) after PostMsg"); return V1_0::ErrorStatus::NONE; @@ -602,6 +759,84 @@ Return<void> ArmnnPreparedModel_1_2<HalVersion>::configureExecutionBurst( return Void(); } +/// Schedule the graph prepared from the request for execution +template<typename HalVersion> +template<typename CallbackContext> +void ArmnnPreparedModel_1_2<HalVersion>::ScheduleGraphForExecution( + std::shared_ptr<std::vector<::android::nn::RunTimePoolInfo>>& pMemPools, + std::shared_ptr<armnn::InputTensors>& inputTensors, + std::shared_ptr<armnn::OutputTensors>& outputTensors, + CallbackContext callbackContext) +{ + ALOGV("ArmnnPreparedModel_1_2::ScheduleGraphForExecution(...)"); + + DumpTensorsIfRequired("Input", *inputTensors); + + unsigned int outputTensorSize = outputTensors.get()->size(); + std::vector<V1_2::OutputShape> outputShapes(outputTensorSize); + for (unsigned int i = 0; i < outputTensorSize; i++) + { + std::pair<int, armnn::Tensor> outputTensorPair = outputTensors.get()->at(i); + const armnn::Tensor outputTensor = outputTensorPair.second; + const armnn::TensorInfo outputTensorInfo = outputTensor.GetInfo(); + + outputShapes[i] = ComputeShape(outputTensorInfo); + } + + auto tpCb = std::make_shared< + ArmnnThreadPoolCallback_1_2<CallbackContext_1_2>>(this, + pMemPools, + outputShapes, + inputTensors, + outputTensors, + callbackContext); + + m_Threadpool->Schedule(m_NetworkId, + *tpCb->m_InputTensors, + *tpCb->m_OutputTensors, + armnn::QosExecPriority::Medium, + tpCb); + ALOGV("ArmnnPreparedModel_1_2::ScheduleGraphForExecution end"); +} + +template<typename HalVersion> +template <typename CallbackContext> +void ArmnnPreparedModel_1_2<HalVersion>::ArmnnThreadPoolCallback_1_2<CallbackContext>::Notify( + armnn::Status status, armnn::InferenceTimingPair timeTaken) +{ + ALOGV("ArmnnPreparedModel_1_2::ArmnnThreadPoolCallback_1_2 Notify"); + + TimePoint driverEnd; + + CommitPools(*m_MemPools); + + m_Model->DumpTensorsIfRequired("Output", *m_OutputTensors); + + if (status != armnn::Status::Success) + { + ALOGW("ArmnnThreadPoolCallback::Notify EnqueueWorkload failed"); + m_CallbackContext.callback( + V1_0::ErrorStatus::GENERAL_FAILURE, {}, g_NoTiming, "ArmnnPreparedModel::ExecuteGraph"); + return; + } + + if (m_CallbackContext.ctx.measureTimings == V1_2::MeasureTiming::YES) + { + driverEnd = std::chrono::steady_clock::now(); + V1_2::Timing timing; + timing.timeOnDevice = MicrosecondsDuration(timeTaken.second, timeTaken.first); + timing.timeInDriver = MicrosecondsDuration(driverEnd, m_CallbackContext.ctx.driverStart); + ALOGV("ArmnnPreparedModel_1_2::execute timing - Device = %lu Driver = %lu", + static_cast<unsigned long>(timing.timeOnDevice), static_cast<unsigned long>(timing.timeInDriver)); + m_CallbackContext.callback( + V1_0::ErrorStatus::NONE, m_OutputShapes, timing, "ArmnnPreparedModel_1_2::ExecuteGraph"); + } else { + m_CallbackContext.callback( + V1_0::ErrorStatus::NONE, m_OutputShapes, g_NoTiming, "ArmnnPreparedModel_1_2::ExecuteGraph"); + } + return; +} + #if defined(ARMNN_ANDROID_NN_V1_2) || defined(ARMNN_ANDROID_NN_V1_3) template class ArmnnPreparedModel_1_2<hal_1_2::HalPolicy>; template bool ArmnnPreparedModel_1_2<hal_1_2::HalPolicy>::ExecuteGraph<CallbackContext_1_2>( @@ -609,6 +844,12 @@ template bool ArmnnPreparedModel_1_2<hal_1_2::HalPolicy>::ExecuteGraph<CallbackC armnn::InputTensors& pInputTensors, armnn::OutputTensors& pOutputTensors, CallbackContext_1_2 cb); + +template void ArmnnPreparedModel_1_2<hal_1_2::HalPolicy>::ScheduleGraphForExecution<CallbackContext_1_2>( + std::shared_ptr<std::vector<::android::nn::RunTimePoolInfo>>& pMemPools, + std::shared_ptr<armnn::InputTensors>& inputTensors, + std::shared_ptr<armnn::OutputTensors>& outputTensors, + CallbackContext_1_2 callbackContext); #endif } // namespace armnn_driver |