//
// Copyright © 2022 Arm Ltd and Contributors. All rights reserved.
// SPDX-License-Identifier: MIT
//

#define LOG_TAG "arm-armnn-sl"

#include "ArmnnPreparedModel.hpp"

#include "CanonicalUtils.hpp"

#include <DefaultExecution.h>
#include <LegacyUtils.h>
#include <nnapi/IExecution.h>
#include <nnapi/IPreparedModel.h>
#include <nnapi/Result.h>
#include <nnapi/SharedMemory.h>
#include <nnapi/TypeUtils.h>
#include <nnapi/Types.h>
#include <nnapi/Validation.h>

#include <memory>
#include <tuple>
#include <utility>
#include <vector>

using namespace android;
using namespace android::nn;

static const Timing g_NoTiming = {};

namespace {

using namespace armnn_driver;

unsigned long MicrosecondsDuration(android::nn::TimePoint endPoint, android::nn::TimePoint startPoint)
{
    return static_cast<unsigned long>(std::chrono::duration_cast<std::chrono::microseconds>(
                                      endPoint - startPoint).count());
}

bool ValidateRequestArgument(const Request::Argument& requestArg, const armnn::TensorInfo& tensorInfo)
{
    if (requestArg.dimensions.size() != 0)
    {
        if (requestArg.dimensions.size() != tensorInfo.GetNumDimensions())
        {
            VLOG(DRIVER) << "Mismatched dimensions (request argument: "
                         << requestArg.dimensions.size() << " expected: " << tensorInfo.GetNumDimensions();
            return false;
        }

        for (unsigned int d = 0; d < tensorInfo.GetNumDimensions(); ++d)
        {
            if (requestArg.dimensions[d] != 0 && requestArg.dimensions[d] != tensorInfo.GetShape()[d])
            {
                VLOG(DRIVER) << "Mismatched dimensions " << d
                             << " (request argument: " << requestArg.dimensions[d]
                             << " expected: " << tensorInfo.GetShape()[d];
                return false;
            }
        }
    }

    return true;
}

armnn::Tensor GetTensorForRequestArgument(const Request::Argument& requestArg,
                                          const armnn::TensorInfo& tensorInfo,
                                          const std::vector<::android::nn::RunTimePoolInfo>& requestPools)
{
    if (!ValidateRequestArgument(requestArg, tensorInfo))
    {
        return armnn::Tensor();
    }

    if (requestArg.lifetime == Request::Argument::LifeTime::POINTER)
    {
        return armnn::Tensor(tensorInfo, GetMemoryFromPointer(requestArg));
    }
    else if (requestArg.lifetime == Request::Argument::LifeTime::POOL)
    {
        return armnn::Tensor(tensorInfo, GetMemoryFromPool(requestArg.location, requestPools));
    }
    return armnn::Tensor();
}

inline std::string BuildTensorName(const char* tensorNamePrefix, std::size_t index)
{
    return tensorNamePrefix + std::to_string(index);
}

bool IsPointerTypeMemory(const Request& request)
{
    for (auto& input : request.inputs)
    {
        if (input.lifetime == Request::Argument::LifeTime::POINTER)
        {
            return true;
        }
    }

    for (auto& output: request.outputs)
    {
        if (output.lifetime == Request::Argument::LifeTime::POINTER)
        {
            return true;
        }
    }

    return false;
}

} // anonymous namespace

using namespace android::nn;

namespace armnn_driver
{

void ArmnnPreparedModel::Init()
{
    // Enable profiling if required.
    m_Runtime->GetProfiler(m_NetworkId)->EnableProfiling(m_GpuProfilingEnabled);
}

ArmnnPreparedModel::ArmnnPreparedModel(armnn::NetworkId networkId,
                                       armnn::IRuntime* runtime,
                                       const Model& model,
                                       const std::string& requestInputsAndOutputsDumpDir,
                                       const bool gpuProfilingEnabled,
                                       Priority priority)
    : m_NetworkId(networkId)
    , m_Runtime(runtime)
    , m_Model(model)
    , m_RequestInputsAndOutputsDumpDir(requestInputsAndOutputsDumpDir)
    , m_GpuProfilingEnabled(gpuProfilingEnabled)
    , m_ModelPriority(priority)
    , m_PrepareFromCache(false)
{
    Init();
}

ArmnnPreparedModel::ArmnnPreparedModel(armnn::NetworkId networkId,
                                       armnn::IRuntime* runtime,
                                       const std::string& requestInputsAndOutputsDumpDir,
                                       const bool gpuProfilingEnabled,
                                       Priority priority,
                                       const bool prepareModelFromCache)
    : m_NetworkId(networkId)
    , m_Runtime(runtime)
    , m_RequestInputsAndOutputsDumpDir(requestInputsAndOutputsDumpDir)
    , m_GpuProfilingEnabled(gpuProfilingEnabled)
    , m_ModelPriority(priority)
    , m_PrepareFromCache(prepareModelFromCache)
{
    Init();
}

ErrorStatus ArmnnPreparedModel::PrepareMemoryForInputs(
    armnn::InputTensors& inputs,
    const Request& request,
    const std::vector<android::nn::RunTimePoolInfo>& memPools) const
{
    inputs.reserve(request.inputs.size());
    for (unsigned int i = 0; i < request.inputs.size(); i++)
    {
        const auto& inputArg = request.inputs[i];

        armnn::TensorInfo inputTensorInfo = m_Runtime->GetInputTensorInfo(m_NetworkId, i);
        // inputs (of type InputTensors) is composed of a vector of ConstTensors.
        // Therefore, set all TensorInfo isConstant parameters of input Tensors to true.
        inputTensorInfo.SetConstant();
        const armnn::Tensor inputTensor = GetTensorForRequestArgument(inputArg, inputTensorInfo, memPools);

        if (inputTensor.GetMemoryArea() == nullptr)
        {
            VLOG(DRIVER) << "Cannot execute request. Error converting request input " << i << " to tensor.";
            return ErrorStatus::GENERAL_FAILURE;
        }
        inputs.emplace_back(i, inputTensor);
    }

    return ErrorStatus::NONE;
}

ErrorStatus ArmnnPreparedModel::PrepareMemoryForOutputs(
    armnn::OutputTensors& outputs,
    std::vector<OutputShape>& outputShapes,
    const Request& request,
    const std::vector<android::nn::RunTimePoolInfo>& memPools) const
{
    outputs.reserve(request.outputs.size());
    for (unsigned int i = 0; i < request.outputs.size(); i++)
    {
        auto& outputArg = request.outputs[i];

        armnn::TensorInfo outputTensorInfo = m_Runtime->GetOutputTensorInfo(m_NetworkId, i);
        armnn::Tensor outputTensor = GetTensorForRequestArgument(outputArg, outputTensorInfo, memPools);
        if (outputTensor.GetMemoryArea() == nullptr)
        {
            VLOG(DRIVER) << "Cannot execute request. Error converting request output " << i << " to tensor.";
            return ErrorStatus::GENERAL_FAILURE;
        }

        const size_t outputSize = outputTensorInfo.GetNumBytes();

        unsigned int count = 0;
        std::for_each(outputArg.dimensions.begin(), outputArg.dimensions.end(), [&](auto dim)
        {
            if (dim != 0)
            {
                outputTensorInfo.GetShape()[count] = dim;
            }
            else
            {
                outputTensorInfo.GetShape()[count] = outputArg.dimensions.size();
            }

            count++;
        });

        outputs.emplace_back(i, outputTensor);
        outputShapes[i] = ComputeShape(outputTensorInfo);

        if (outputArg.location.length < outputSize)
        {
            VLOG(DRIVER) << "ArmnnPreparedModel::Execute failed outputArg.location.length "
                         << std::to_string(outputArg.location.length).c_str()
                         << " < outputSize " << std::to_string(outputSize).c_str();
            outputShapes[i].isSufficient = false;
            return ErrorStatus::OUTPUT_INSUFFICIENT_SIZE;
        }

        //TODO: Need to check for Request::Argument::LifeTime::POINTER
        if (outputArg.lifetime == Request::Argument::LifeTime::POOL)
        {
            size_t bufferSize = memPools.at(outputArg.location.poolIndex).getSize();
            if (bufferSize < outputSize)
            {
                VLOG(DRIVER) << "ArmnnPreparedModel::Execute failed bufferSize "
                             << std::to_string(outputArg.location.length).c_str()
                             << " < outputSize " << std::to_string(outputSize).c_str();
                outputShapes[i].isSufficient = false;
                return ErrorStatus::OUTPUT_INSUFFICIENT_SIZE;
            }
        }
    }

    return ErrorStatus::NONE;
}

ErrorStatus ArmnnPreparedModel::PrepareMemoryForIO(armnn::InputTensors& inputs,
                                                   armnn::OutputTensors& outputs,
                                                   std::vector<android::nn::RunTimePoolInfo>& memPools,
                                                   const Request& request) const
{
    // Check memory pools are not empty
    // add the inputs and outputs with their data
    try
    {
        if (!setRunTimePoolInfosFromMemoryPools(&memPools, request.pools))
        {
            return ErrorStatus::INVALID_ARGUMENT;
        }

        if (PrepareMemoryForInputs(inputs, request, memPools) != ErrorStatus::NONE)
        {
            VLOG(DRIVER) << "Failed when preparing memory for Inputs";
            return ErrorStatus::GENERAL_FAILURE;
        }

        std::vector<OutputShape> outputShapes(request.outputs.size());

        auto errorStatus = PrepareMemoryForOutputs(outputs, outputShapes, request, memPools);
        if (errorStatus != ErrorStatus::NONE)
        {
            return errorStatus;
        }
    }
    catch (armnn::Exception& e)
    {
        VLOG(DRIVER) << "armnn::Exception caught while preparing for EnqueueWorkload: " << e.what();
        return ErrorStatus::GENERAL_FAILURE;
    }
    catch (std::exception& e)
    {
        VLOG(DRIVER) << "std::exception caught while preparing for EnqueueWorkload: " << e.what();
        return ErrorStatus::GENERAL_FAILURE;
    }

    return ErrorStatus::NONE;
}

ExecutionResult<std::pair<std::vector<OutputShape>, Timing>> ArmnnPreparedModel::execute(
    const Request& request,
    MeasureTiming measureTiming,
    const OptionalTimePoint& deadline,
    const OptionalDuration&,
    const std::vector<TokenValuePair>& hints,
    const std::vector<ExtensionNameAndPrefix>& extensionNameToPrefix) const
{
    VLOG(DRIVER) << "CanonicalDriver::PreparedModel::execute()";

    CanonicalExecutionContext ctx;
    if (measureTiming == MeasureTiming::YES)
    {
        ctx.measureTimings = measureTiming;
        ctx.driverStart = Clock::now();
    }

    if (!m_PrepareFromCache)
    {
        const auto modelRequest = validateRequestForModel(request, m_Model);
        if (!modelRequest.ok())
        {
            return NN_ERROR(ErrorStatus::INVALID_ARGUMENT) << modelRequest.error();
        }
        VLOG(DRIVER) << "ArmnnPreparedModel::execute(): " << GetModelSummary(m_Model).c_str();
    }
    if (hasDeadlinePassed(deadline))
    {
        return NN_ERROR(ErrorStatus::MISSED_DEADLINE_PERSISTENT);
    }

    // map the memory pool into shared pointers
    // use a shared memory pools vector on the heap, as it is passed to the request thread
    auto memPools = std::make_shared<std::vector<android::nn::RunTimePoolInfo>>();

    // allocate the tensors on the heap, as they are passed to the request thread
    auto inputTensors = std::make_shared<armnn::InputTensors>();
    auto outputTensors = std::make_shared<armnn::OutputTensors>();

    ErrorStatus theErrorStatus = ErrorStatus::NONE;

    auto isPointerTypeMemory = IsPointerTypeMemory(request);
    nn::RequestRelocation relocation;
    if (isPointerTypeMemory)
    {
        std::optional<nn::Request> maybeRequestInShared;
        auto executionResult =
            nn::convertRequestFromPointerToShared(
                &request, nn::kDefaultRequestMemoryAlignment, nn::kMinMemoryPadding,
                &maybeRequestInShared, &relocation);
        if (!executionResult.has_value())
        {
            VLOG(DRIVER) << "ArmnnPreparedModel::PrepareMemoryForIO::Failed to convertRequestFromPointerToShared.";
            return NN_ERROR(ErrorStatus::GENERAL_FAILURE)
                       << "ArmnnPreparedModel convertRequestFromPointerToShared failed";
        }
        const nn::Request& requestInShared = std::move(executionResult).value();
        if (relocation.input)
        {
            relocation.input->flush();
        }

        theErrorStatus = PrepareMemoryForIO(*inputTensors, *outputTensors, *memPools, requestInShared);
    }
    else
    {
        theErrorStatus = PrepareMemoryForIO(*inputTensors, *outputTensors, *memPools, request);
    }

    switch(theErrorStatus)
    {
        case ErrorStatus::OUTPUT_INSUFFICIENT_SIZE:
            return NN_ERROR(ErrorStatus::OUTPUT_INSUFFICIENT_SIZE);
        case ErrorStatus::GENERAL_FAILURE:
            return NN_ERROR(ErrorStatus::GENERAL_FAILURE);
        case ErrorStatus::INVALID_ARGUMENT:
            return NN_ERROR(ErrorStatus::INVALID_ARGUMENT);
        default:
        {}
    }

    std::vector<OutputShape> outputShapes(outputTensors->size());
    for (unsigned int i = 0; i < outputTensors->size(); i++)
    {
        std::pair<int, armnn::Tensor> outputTensorPair = (*outputTensors)[i];
        const armnn::Tensor outputTensor = outputTensorPair.second;
        const armnn::TensorInfo outputTensorInfo = outputTensor.GetInfo();

        outputShapes[i] = ComputeShape(outputTensorInfo);
    }
    Timing theTiming;

    VLOG(DRIVER) << "ArmnnPreparedModel::execute(...) before ExecuteGraph";
    auto errorStatus = ExecuteGraph(memPools, *inputTensors, *outputTensors, ctx);
    if (errorStatus != ErrorStatus::NONE)
    {
        return NN_ERROR(errorStatus) << "execute() failed";
    }
    VLOG(DRIVER) << "ArmnnPreparedModel::execute(...) after ExecuteGraph";
after ExecuteGraph"; if (isPointerTypeMemory && relocation.output) { relocation.output->flush(); } return std::make_pair(outputShapes, theTiming); } ErrorStatus ArmnnPreparedModel::ExecuteGraph( std::shared_ptr>& pMemPools, armnn::InputTensors& inputTensors, armnn::OutputTensors& outputTensors, CanonicalExecutionContext ctx) const { VLOG(DRIVER) << "ArmnnPreparedModel::ExecuteGraph(...)"; DumpTensorsIfRequired("Input", inputTensors); try { if (ctx.measureTimings == MeasureTiming::YES) { ctx.deviceStart = Clock::now(); } armnn::Status status; VLOG(DRIVER) << "ArmnnPreparedModel::ExecuteGraph m_AsyncModelExecutionEnabled false"; status = m_Runtime->EnqueueWorkload(m_NetworkId, inputTensors, outputTensors); if (ctx.measureTimings == MeasureTiming::YES) { ctx.deviceEnd = Clock::now(); } if (status != armnn::Status::Success) { VLOG(DRIVER) << "ArmnnPreparedModel:ExecuteGraph EnqueueWorkload failed"; return ErrorStatus::GENERAL_FAILURE; } } catch (armnn::Exception& e) { VLOG(DRIVER) << "armnn:Exception caught from EnqueueWorkload: " << e.what(); return ErrorStatus::GENERAL_FAILURE; } catch (std::exception& e) { VLOG(DRIVER) << "std::exception caught from EnqueueWorkload: " << e.what(); return ErrorStatus::GENERAL_FAILURE; } CommitPools(*pMemPools); DumpTensorsIfRequired("Output", outputTensors); if (ctx.measureTimings == MeasureTiming::YES) { ctx.driverEnd = Clock::now(); Timing timing; timing.timeOnDevice = ctx.deviceEnd - ctx.deviceStart; timing.timeInDriver = ctx.driverEnd - ctx.driverStart; VLOG(DRIVER) << "ArmnnPreparedModel::execute timing - Device = " << timing.timeOnDevice << "Driver = " << timing.timeInDriver; } return ErrorStatus::NONE; } Priority ArmnnPreparedModel::GetModelPriority() const { return m_ModelPriority; } GeneralResult> ArmnnPreparedModel::executeFenced( const Request& request, const std::vector& waitFor, MeasureTiming measureTiming, const OptionalTimePoint& deadline, const OptionalDuration&, const OptionalDuration&, const std::vector& hints, const std::vector& extensionNameToPrefix) const { VLOG(DRIVER) << "ArmnnPreparedModel::executeFenced()"; if (!m_PrepareFromCache) { const auto modelRequest = validateRequestForModel(request, m_Model); if (!modelRequest.ok()) { return NN_ERROR(ErrorStatus::INVALID_ARGUMENT) << modelRequest.error(); } VLOG(DRIVER) << "ArmnnPreparedModel::executeFenced(): " << GetModelSummary(m_Model).c_str(); } if (hasDeadlinePassed(deadline)) { return NN_ERROR(ErrorStatus::MISSED_DEADLINE_PERSISTENT); } CanonicalExecutionContext ctx; if (measureTiming == MeasureTiming::YES) { ctx.measureTimings = measureTiming; ctx.driverStart = Clock::now(); } // Wait for the dependent events to signal for (const auto& syncFence : waitFor) { if (!syncFence.getSharedHandle()) { return NN_ERROR(ErrorStatus::INVALID_ARGUMENT); } if (syncFence.syncWait({}) != SyncFence::FenceState::SIGNALED) { return NN_ERROR(ErrorStatus::GENERAL_FAILURE) << "syncWait failed"; } } android::nn::TimePoint fenceExecutionStart; if (measureTiming == MeasureTiming::YES) { fenceExecutionStart = Clock::now(); } // map the memory pool into shared pointers // use a shared memory pools vector on the heap, as it is passed to the request thread auto memPools = std::make_shared>(); // allocate the tensors on the heap, as they are passed to the request thread auto inputTensors = std::make_shared(); auto outputTensors = std::make_shared(); ErrorStatus theErrorStatus = ErrorStatus::NONE; auto isPointerTypeMemory = IsPointerTypeMemory(request); nn::RequestRelocation relocation; if (isPointerTypeMemory) { 
        std::optional<nn::Request> maybeRequestInShared;
        auto executionResult =
            nn::convertRequestFromPointerToShared(
                &request, nn::kDefaultRequestMemoryAlignment, nn::kMinMemoryPadding,
                &maybeRequestInShared, &relocation);
        if (!executionResult.has_value())
        {
            VLOG(DRIVER) << "ArmnnPreparedModel::PrepareMemoryForIO::Failed to convertRequestFromPointerToShared.";
            return NN_ERROR(ErrorStatus::GENERAL_FAILURE)
                       << "ArmnnPreparedModel convertRequestFromPointerToShared failed";
        }
        const nn::Request& requestInShared = std::move(executionResult).value();
        if (relocation.input)
        {
            relocation.input->flush();
        }

        theErrorStatus = PrepareMemoryForIO(*inputTensors, *outputTensors, *memPools, requestInShared);
    }
    else
    {
        theErrorStatus = PrepareMemoryForIO(*inputTensors, *outputTensors, *memPools, request);
    }

    if (theErrorStatus != ErrorStatus::NONE)
    {
        return NN_ERROR(ErrorStatus::INVALID_ARGUMENT) << "executeFenced() failed";
    }

    Timing timingSinceLaunch = {};
    Timing timingAfterFence  = {};
    if (measureTiming == MeasureTiming::YES)
    {
        timingAfterFence.timeOnDevice = ctx.deviceEnd - ctx.deviceStart;
        timingAfterFence.timeInDriver = ctx.driverEnd - fenceExecutionStart;
        VLOG(DRIVER) << "executeFenced timingSinceLaunch = " << timingAfterFence.timeOnDevice;
        VLOG(DRIVER) << "executeFenced timingAfterFence = " << timingAfterFence.timeInDriver;
    }

    VLOG(DRIVER) << "ArmnnCanonicalPreparedModel::executeFenced(...) before ExecuteGraph";
    auto errorStatus = ExecuteGraph(memPools, *inputTensors, *outputTensors, ctx);
    VLOG(DRIVER) << "ArmnnCanonicalPreparedModel::executeFenced(...) after ExecuteGraph";
    if (isPointerTypeMemory && relocation.output)
    {
        relocation.output->flush();
    }

    ExecuteFencedInfoCallback armnnFencedExecutionCallback =
        [timingSinceLaunch, timingAfterFence, errorStatus]() {

            GeneralResult<std::pair<Timing, Timing>> result;

            switch(errorStatus)
            {
                case ErrorStatus::OUTPUT_INSUFFICIENT_SIZE:
                    result.error().code = (ErrorStatus::OUTPUT_INSUFFICIENT_SIZE);
                case ErrorStatus::GENERAL_FAILURE:
                    result.error().code = (ErrorStatus::GENERAL_FAILURE);
                case ErrorStatus::INVALID_ARGUMENT:
                    result.error().code = (ErrorStatus::INVALID_ARGUMENT);
                default:
                {
                    result.value() = std::make_pair(timingSinceLaunch, timingAfterFence);
                }
            }
            return result;
        };
    return std::make_pair(SyncFence::createAsSignaled(), std::move(armnnFencedExecutionCallback));
}

GeneralResult<SharedExecution> ArmnnPreparedModel::createReusableExecution(
    const Request& request,
    MeasureTiming measureTiming,
    const OptionalDuration& loopTimeoutDuration,
    const std::vector<TokenValuePair>& hints,
    const std::vector<ExtensionNameAndPrefix>& extensionNameToPrefix) const
{
    VLOG(DRIVER) << "ArmnnPreparedModel::createReusableExecution()";
    return std::make_shared<DefaultExecution>(shared_from_this(),
                                              request,
                                              measureTiming,
                                              loopTimeoutDuration);
}

GeneralResult<SharedBurst> ArmnnPreparedModel::configureExecutionBurst() const
{
    // TODO: Implement BURST
    return nullptr;
}

std::any ArmnnPreparedModel::getUnderlyingResource() const
{
    return &m_Model;
}

template<typename TensorBindingCollection>
void ArmnnPreparedModel::DumpTensorsIfRequired(char const* tensorNamePrefix,
                                               const TensorBindingCollection& tensorBindings) const
{
    if (!m_RequestInputsAndOutputsDumpDir.empty())
    {
        const std::string requestName = std::to_string(m_NetworkId) + ".dump";
        for (std::size_t i = 0u; i < tensorBindings.size(); ++i)
        {
            DumpTensor(m_RequestInputsAndOutputsDumpDir,
                       requestName,
                       BuildTensorName(tensorNamePrefix, i),
                       tensorBindings[i].second);
        }
    }
}

ArmnnPreparedModel::~ArmnnPreparedModel()
{
    VLOG(DRIVER) << "ArmnnPreparedModel::~ArmnnPreparedModel()";
    // Get a hold of the profiler used by this model.
    if (m_GpuProfilingEnabled)
    {
        auto profiler = m_Runtime->GetProfiler(m_NetworkId);
        if (profiler)
        {
            // Dump the profiling info to a file if required.
            DumpJsonProfilingIfRequired(m_GpuProfilingEnabled,
                                        m_RequestInputsAndOutputsDumpDir,
                                        m_NetworkId,
                                        profiler.get());
        }
    }
    // Unload the network associated with this model
    m_Runtime->UnloadNetwork(m_NetworkId);
}

bool ArmnnPreparedModel::ExecuteWithDummyInputs(unsigned int numInputs, unsigned int numOutputs) const
{
    std::vector<std::vector<char>> storage;
    armnn::InputTensors inputTensors;
    for (unsigned int i = 0; i < numInputs; i++)
    {
        armnn::TensorInfo inputTensorInfo = m_Runtime->GetInputTensorInfo(m_NetworkId, i);
        // pInputTensors (of type InputTensors) is composed of a vector of ConstTensors.
        // Therefore, set all TensorInfo isConstant parameters of input Tensors to true.
        inputTensorInfo.SetConstant();

        storage.emplace_back(inputTensorInfo.GetNumBytes());
        const armnn::ConstTensor inputTensor(inputTensorInfo, storage.back().data());

        inputTensors.emplace_back(i, inputTensor);
    }

    armnn::OutputTensors outputTensors;
    for (unsigned int i = 0; i < numOutputs; i++)
    {
        const armnn::TensorInfo outputTensorInfo = m_Runtime->GetOutputTensorInfo(m_NetworkId, i);
        storage.emplace_back(outputTensorInfo.GetNumBytes());
        const armnn::Tensor outputTensor(outputTensorInfo, storage.back().data());

        outputTensors.emplace_back(i, outputTensor);
    }
    CanonicalExecutionContext ctx;
    ctx.measureTimings = MeasureTiming::NO;
    auto memPools = std::make_shared<std::vector<::android::nn::RunTimePoolInfo>>();

    auto errorStatus = ExecuteGraph(memPools,
                                    inputTensors,
                                    outputTensors,
                                    ctx);

    return errorStatus == ErrorStatus::NONE;
}

} // namespace armnn_driver