From 55a8ffda24fff5515803df10fb4863d46a1effdf Mon Sep 17 00:00:00 2001
From: Mike Kelly
Date: Wed, 7 Apr 2021 20:10:49 +0100
Subject: IVGCVSW-5823 Refactor Async Network API

* Moved IAsyncNetwork into IRuntime.
* All LoadedNetworks can be executed Asynchronously.

Signed-off-by: Mike Kelly
Change-Id: Ibbc901ab9110dc2f881425b75489bccf9ad54169
---
 src/armnn/LoadedNetwork.cpp | 362 +++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 356 insertions(+), 6 deletions(-)

diff --git a/src/armnn/LoadedNetwork.cpp b/src/armnn/LoadedNetwork.cpp
index ea09231c3c..d75a2021b2 100644
--- a/src/armnn/LoadedNetwork.cpp
+++ b/src/armnn/LoadedNetwork.cpp
@@ -10,6 +10,7 @@
 #include
 #include "Profiling.hpp"
 #include "HeapProfiling.hpp"
+#include "WorkingMemHandle.hpp"
 #include
 #include
@@ -119,8 +120,7 @@ LoadedNetwork::LoadedNetwork(std::unique_ptr<IOptimizedNetwork> net,
                              const INetworkProperties& networkProperties,
                              profiling::ProfilingService& profilingService) :
                              m_OptimizedNetwork(std::move(net)),
-                             m_IsImportEnabled(networkProperties.m_ImportEnabled),
-                             m_IsExportEnabled(networkProperties.m_ExportEnabled),
+                             m_NetworkProperties(networkProperties),
                              m_TensorHandleFactoryRegistry(),
                              m_ProfilingService(profilingService)
 {
@@ -172,7 +172,8 @@ LoadedNetwork::LoadedNetwork(std::unique_ptr<IOptimizedNetwork> net,
             case LayerType::MemImport:
             {
                 // If IsImportEnabled is true then we need to set IsMemoryManaged to false when creating TensorHandles
-                layer->CreateTensorHandles(m_TensorHandleFactoryRegistry, workloadFactory, !m_IsImportEnabled);
+                layer->CreateTensorHandles(m_TensorHandleFactoryRegistry, workloadFactory,
+                                           !m_NetworkProperties.m_ImportEnabled);
                 break;
             }
             default:
@@ -183,7 +184,8 @@ LoadedNetwork::LoadedNetwork(std::unique_ptr<IOptimizedNetwork> net,
                     (layer->GetOutputSlots()[0].GetNumConnections() == 1) &&
                     (layer->GetOutputSlots()[0].GetConnection(0)->GetOwningLayer().GetType() == LayerType::Output))
                 {
-                    layer->CreateTensorHandles(m_TensorHandleFactoryRegistry, workloadFactory, !m_IsExportEnabled);
+                    layer->CreateTensorHandles(m_TensorHandleFactoryRegistry, workloadFactory,
+                                               !m_NetworkProperties.m_ExportEnabled);
                 }
                 else
                 {
@@ -576,7 +578,7 @@ void LoadedNetwork::EnqueueInput(const BindableLayer& layer, ITensorHandle* tens
     MemorySourceFlags importFlags = outputTensorHandle->GetImportFlags();
     bool needMemCopy = true;
-    if (m_IsImportEnabled)  // Try import the input tensor
+    if (m_NetworkProperties.m_ImportEnabled)  // Try import the input tensor
     {
         if(CheckFlag(importFlags, MemorySource::Malloc) )
         {
@@ -647,7 +649,8 @@ void LoadedNetwork::EnqueueOutput(const BindableLayer& layer, ITensorHandle* ten
     //     d) The output pointer is allocated via malloc. (Other types will be supported in a later release)
     //     e) m_IsExportEnabled must be set to true
     bool needMemCopy = true;
-    if (m_IsExportEnabled && (layer.GetInputSlots()[0].GetConnectedOutputSlot()->GetNumConnections() == 1))
+    if (m_NetworkProperties.m_ExportEnabled &&
+        (layer.GetInputSlots()[0].GetConnectedOutputSlot()->GetNumConnections() == 1))
     {
         if(layer.GetInputSlots()[0].GetConnectedOutputSlot()->GetOwningLayer().GetType() != LayerType::Input)
         {
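The hunks above fold the separate m_IsImportEnabled and m_IsExportEnabled members into the single INetworkProperties struct held in m_NetworkProperties. As a hedged illustration that is not part of this patch, this is roughly how a caller supplies those flags at load time; the two-bool INetworkProperties constructor is an assumption inferred from the fields read here.

    // Illustrative sketch only, not from this commit. Assumes an
    // INetworkProperties(importEnabled, exportEnabled) constructor matching
    // the m_ImportEnabled/m_ExportEnabled fields used above.
    #include <armnn/ArmNN.hpp>

    armnn::Status LoadWithZeroCopy(armnn::IRuntime& runtime,
                                   armnn::IOptimizedNetworkPtr optNet,
                                   armnn::NetworkId& networkIdOut)
    {
        std::string errorMessage;
        armnn::INetworkProperties properties(/*importEnabled=*/true,
                                             /*exportEnabled=*/true);
        // With both flags set, LoadedNetwork tries to map user buffers
        // directly instead of copying through managed tensor handles.
        return runtime.LoadNetwork(networkIdOut, std::move(optNet),
                                   errorMessage, properties);
    }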
@@ -792,6 +795,353 @@ bool LoadedNetwork::Execute(std::unique_ptr<profiling::TimelineUtilityMethods>& timelineUti
     return success;
 }
 
+void LoadedNetwork::EnqueueInput(const BindableLayer& layer,
+                                 const ConstTensor& inputTensor,
+                                 WorkingMemHandle& context)
+{
+    if (layer.GetType() != LayerType::Input)
+    {
+        throw InvalidArgumentException("EnqueueInput: given layer not an InputLayer");
+    }
+    LayerGuid id = layer.GetOutputSlot(0).GetConnection(0)->GetOwningLayer().GetGuid();
+    WorkingMemDescriptor descriptor = context.GetWorkingMemDescriptor(id);
+    ARMNN_ASSERT_MSG(descriptor.m_Outputs.size() == 1, "Can only handle Input Layer with one output");
+
+    MemorySourceFlags importFlags = descriptor.m_Outputs[0]->GetImportFlags();
+    if (m_NetworkProperties.m_ImportEnabled)  // Try import the input tensor
+    {
+        if (CheckFlag(importFlags, MemorySource::Malloc) )
+        {
+            // This assumes a CPU Tensor handle
+            std::unique_ptr<ITensorHandle> tensorHandle =
+                std::make_unique<ConstPassthroughCpuTensorHandle>(inputTensor.GetInfo(),
+                                                                  inputTensor.GetMemoryArea());
+
+            void* mem = tensorHandle->Map(false);
+            if (descriptor.m_Outputs[0]->Import(mem, MemorySource::Malloc))
+            {
+                tensorHandle->Unmap();
+                return;
+            }
+            tensorHandle->Unmap();
+            throw MemoryImportException("EnqueueInput: Memory Import failed");
+        }
+        else
+        {
+            throw MemoryImportException("EnqueueInput: Memory Import failed, backend does not support Import");
+        }
+    }
+    else
+    {
+        std::unique_ptr<ITensorHandle> tensorHandle =
+            std::make_unique<ConstPassthroughCpuTensorHandle>(inputTensor.GetInfo(), inputTensor.GetMemoryArea());
+
+        auto copyFunc = [](void* dst, const void* src, size_t size)
+        {
+            memcpy(dst, src, size);
+        };
+
+        for (const auto& input : descriptor.m_Inputs)
+        {
+            CopyTensorContentsGeneric(tensorHandle.get(), input, copyFunc);
+        }
+    }
+}
+
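The import branch above only succeeds for user buffers the backend can map as MemorySource::Malloc; anything else falls through to the memcpy path. A small hedged sketch of preparing such an input follows; the shape, data type and ownership handling are placeholders, not part of the patch.

    // Illustration only. A malloc-backed buffer is the single MemorySource
    // this patch can import; the caller keeps ownership of the memory.
    #include <armnn/Tensor.hpp>
    #include <cstdlib>

    armnn::ConstTensor MakeImportableInput(void*& bufferOut)
    {
        armnn::TensorInfo info({ 1, 3, 224, 224 }, armnn::DataType::Float32);
        bufferOut = std::malloc(info.GetNumBytes());   // MemorySource::Malloc
        return armnn::ConstTensor(info, bufferOut);
    }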
+void LoadedNetwork::EnqueueOutput(const BindableLayer& layer, const Tensor& outputTensor, WorkingMemHandle& handle)
+{
+    if (layer.GetType() != LayerType::Output)
+    {
+        throw InvalidArgumentException("EnqueueOutput: given layer not an OutputLayer");
+    }
+    ARMNN_ASSERT_MSG(layer.GetNumInputSlots() == 1, "Output Layer should have exactly one input.");
+
+    LayerGuid id = layer.GetInputSlot(0).GetConnectedOutputSlot()->GetOwningLayerGuid();
+    WorkingMemDescriptor descriptor = handle.GetWorkingMemDescriptor(id);
+
+    ITensorHandle* inputTensorHandle = descriptor.m_Inputs[0];
+    ARMNN_ASSERT_MSG(inputTensorHandle != nullptr, "Data should have been allocated.");
+
+    // Try import the output tensor.
+    // Note: We can only import the output pointer if all of the following hold true:
+    //     a) The imported pointer is aligned sufficiently
+    //     b) The tensor has zero padding
+    //     c) There is only one connection to the OutputSlot and it is to an OutputLayer.
+    //     d) The output pointer is allocated via malloc. (Other types will be supported in a later release)
+    //     e) m_IsExportEnabled must be set to true
+    if (m_NetworkProperties.m_ExportEnabled &&
+        (layer.GetInputSlots()[0].GetConnectedOutputSlot()->GetNumConnections() == 1))
+    {
+        if (layer.GetInputSlots()[0].GetConnectedOutputSlot()->GetOwningLayer().GetType() != LayerType::Input)
+        {
+            MemorySourceFlags importFlags = inputTensorHandle->GetImportFlags();
+            if (CheckFlag(importFlags, MemorySource::Malloc))
+            {
+                std::unique_ptr<ITensorHandle> tensorHandle =
+                    std::make_unique<PassthroughCpuTensorHandle>(outputTensor.GetInfo(),
+                                                                 outputTensor.GetMemoryArea());
+
+                void* mem = tensorHandle->Map(false);
+                bool importOk = inputTensorHandle->Import(mem, MemorySource::Malloc);
+                tensorHandle->Unmap();
+
+                if (importOk)
+                {
+                    ARMNN_SCOPED_PROFILING_EVENT(Compute::Undefined, "SyncMemGeneric_Execute");
+                    descriptor.m_Inputs[0]->Map(true);
+                    descriptor.m_Inputs[0]->Unmap();
+                }
+                else
+                {
+                    throw MemoryExportException("EnqueueOutput: Memory Export failed");
+                }
+            }
+            else
+            {
+                throw MemoryExportException("EnqueueOutput: Memory Export failed, backend does not support Export");
+            }
+        }
+        else
+        {
+            throw MemoryExportException("EnqueueOutput: Memory Export failed, attempting to export Input Layer");
+        }
+    }
+    else
+    {
+        auto copyFunc = [](void* dst, const void* src, size_t size)
+        {
+            memcpy(dst, src, size);
+        };
+
+        std::unique_ptr<ITensorHandle> tensorHandle =
+            std::make_unique<PassthroughCpuTensorHandle>(outputTensor.GetInfo(), outputTensor.GetMemoryArea());
+
+        CopyTensorContentsGeneric(descriptor.m_Outputs[0], tensorHandle.get(), copyFunc);
+    }
+}
+
+Status LoadedNetwork::Execute(const InputTensors& inputTensors,
+                              const OutputTensors& outputTensors,
+                              IWorkingMemHandle& iWorkingMemHandle)
+{
+    const Graph& graph = m_OptimizedNetwork->pOptimizedNetworkImpl->GetGraph();
+
+    // Walk graph to determine the order of execution.
+    if (graph.GetNumLayers() < 2)
+    {
+        ARMNN_LOG(warning) << "IRuntime::EnqueueWorkload()::Less than two nodes in graph";
+        return Status::Failure;
+    }
+
+    if (graph.GetNumInputs() != inputTensors.size())
+    {
+        throw InvalidArgumentException("Number of inputs provided does not match network.");
+    }
+
+    std::unique_ptr<profiling::TimelineUtilityMethods> timelineUtils =
+        profiling::TimelineUtilityMethods::GetTimelineUtils(m_ProfilingService);
+    profiling::ProfilingGuid inferenceGuid = m_ProfilingService.GetNextGuid();
+    if (timelineUtils)
+    {
+        // Add inference timeline trace if profiling is enabled.
+        profiling::ProfilingGuid networkGuid = m_OptimizedNetwork->GetGuid();
+        timelineUtils->CreateTypedEntity(inferenceGuid, profiling::LabelsAndEventClasses::INFERENCE_GUID);
+        timelineUtils->CreateRelationship(profiling::ProfilingRelationshipType::RetentionLink,
+                                          networkGuid,
+                                          inferenceGuid,
+                                          profiling::LabelsAndEventClasses::EXECUTION_OF_GUID);
+        timelineUtils->RecordEvent(inferenceGuid, profiling::LabelsAndEventClasses::ARMNN_PROFILING_SOL_EVENT_CLASS);
+    }
+
+    bool executionSucceeded = true;
+
+    if (timelineUtils)
+    {
+        // Add end of life of the inference timeline if profiling is enabled.
+        timelineUtils->RecordEvent(inferenceGuid, profiling::LabelsAndEventClasses::ARMNN_PROFILING_EOL_EVENT_CLASS);
+        timelineUtils->Commit();
+    }
+    WorkingMemHandle& workingMemHandle = dynamic_cast<WorkingMemHandle&>(iWorkingMemHandle);
+    std::lock_guard<std::mutex> lockGuard(workingMemHandle.GetMutex());
+
+    if (!workingMemHandle.IsAllocated())
+    {
+        workingMemHandle.Allocate();
+    }
+
+    {
+        ARMNN_SCOPED_PROFILING_EVENT(Compute::Undefined, "PrepareInputs");
+        unsigned int i = 0;
+
+        for (const BindableLayer* inputLayer : graph.GetInputLayers())
+        {
+            EnqueueInput(*inputLayer, inputTensors[i].second, workingMemHandle);
+            ++i;
+        }
+    }
+
+    auto Fail = [&](const std::exception& error)
+    {
+        ARMNN_LOG(error) << "An error occurred attempting to execute a workload: " << error.what();
+        executionSucceeded = false;
+    };
+    profiling::ProfilingDynamicGuid workloadInferenceID(0);
+
+    try
+    {
+        for (unsigned int i = 0; i < m_WorkloadQueue.size(); ++i)
+        {
+            auto& workload = m_WorkloadQueue[i];
+            if (timelineUtils)
+            {
+                workloadInferenceID = timelineUtils->RecordWorkloadInferenceAndStartOfLifeEvent(workload->GetGuid(),
+                                                                                                inferenceGuid);
+            }
+            workload->ExecuteAsync(workingMemHandle.GetWorkingMemDescriptorAt(i));
+
+            if (timelineUtils)
+            {
+                timelineUtils->RecordEndOfLifeEvent(workloadInferenceID);
+            }
+        }
+    }
+    catch (const RuntimeException& error)
+    {
+        Fail(error);
+    }
+    catch (const std::runtime_error& error)
+    {
+        Fail(error);
+    }
+    // For each output to the network, call EnqueueOutput with the data passed by the user.
+    {
+        ARMNN_SCOPED_PROFILING_EVENT(Compute::Undefined, "PrepareOutputs");
+        unsigned int i = static_cast<unsigned int>(m_WorkloadQueue.size() - graph.GetNumOutputs());
+
+        for (const BindableLayer* outputLayer : graph.GetOutputLayers())
+        {
+            EnqueueOutput(*outputLayer, outputTensors[i].second, workingMemHandle);
+            ++i;
+        }
+    }
+    return executionSucceeded ? Status::Success : Status::Failure;
+}
+
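Execute above takes its scratch memory from the caller-supplied IWorkingMemHandle and serialises access to it with the handle's mutex, so concurrency comes from using one handle per thread. Below is a hedged sketch of that pattern through the runtime; per the commit message these entry points now live on IRuntime, but their exact public signatures are assumed from the LoadedNetwork methods in this file.

    // Sketch under assumptions: IRuntime::CreateWorkingMemHandle(NetworkId)
    // and IRuntime::Execute(IWorkingMemHandle&, InputTensors, OutputTensors)
    // mirror the LoadedNetwork methods added in this patch.
    #include <armnn/IRuntime.hpp>
    #include <thread>

    void RunTwoInferencesConcurrently(armnn::IRuntime& runtime,
                                      armnn::NetworkId networkId,
                                      const armnn::InputTensors& inputsA,
                                      const armnn::OutputTensors& outputsA,
                                      const armnn::InputTensors& inputsB,
                                      const armnn::OutputTensors& outputsB)
    {
        // One working-memory handle per thread: each handle owns its own
        // intermediate tensors, so the two executions do not contend.
        auto handleA = runtime.CreateWorkingMemHandle(networkId);
        auto handleB = runtime.CreateWorkingMemHandle(networkId);

        std::thread threadA([&] { runtime.Execute(*handleA, inputsA, outputsA); });
        std::thread threadB([&] { runtime.Execute(*handleB, inputsB, outputsB); });
        threadA.join();
        threadB.join();
    }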
+// Need something like the collectors to get the correct tensors for the inputs
+void LoadedNetwork::CollectInputTensorHandles(
+        std::unordered_map<LayerGuid, std::vector<ITensorHandle*> >& tensorHandles,
+        std::vector<ITensorHandle*>& inputs,
+        const armnn::Layer* layer,
+        const TensorHandleFactoryRegistry& registry,
+        const bool isMemoryManaged)
+{
+    for (auto&& inputSlot : layer->GetInputSlots())
+    {
+        // The graph must be well-formed at this point.
+        ARMNN_ASSERT(inputSlot.GetConnection());
+        auto outputSlot = inputSlot.GetConnectedOutputSlot();
+        auto key = outputSlot->GetOwningLayer().GetGuid();
+        auto search = tensorHandles.find(key);
+
+        if (search == tensorHandles.end())
+        {
+            ITensorHandleFactory::FactoryId factoryId = outputSlot->GetTensorHandleFactoryId();
+            const TensorInfo& tensorInfo = outputSlot->GetTensorInfo();
+
+            ARMNN_ASSERT(factoryId != ITensorHandleFactory::LegacyFactoryId);
+            ITensorHandleFactory* handleFactory = registry.GetFactory(factoryId);
+            ARMNN_ASSERT(handleFactory);
+            std::unique_ptr<ITensorHandle> tensor = handleFactory->CreateTensorHandle(tensorInfo, isMemoryManaged);
+            ITensorHandle* tensorPtr = tensor.release();
+            inputs.push_back(tensorPtr);
+        }
+        else
+        {
+            unsigned int index = outputSlot->CalculateIndexOnOwner();
+            inputs.push_back(search->second[index]);
+        }
+    }
+}
+
+void LoadedNetwork::CreateOutputTensorHandles(
+        std::unordered_map<LayerGuid, std::vector<ITensorHandle*> >& tensorHandles,
+        std::vector<ITensorHandle*>& outputs,
+        const armnn::Layer* layer,
+        const TensorHandleFactoryRegistry& registry,
+        const bool isMemoryManaged)
+{
+    auto guid = layer->GetGuid();
+    std::vector<ITensorHandle*> tensorHandleVectors;
+    tensorHandleVectors.reserve(layer->GetNumOutputSlots());
+
+    for (unsigned int idx=0; idx < layer->GetNumOutputSlots(); idx++)
+    {
+        const OutputSlot& slot = layer->GetOutputSlot(idx);
+        ITensorHandleFactory::FactoryId factoryId = slot.GetTensorHandleFactoryId();
+        const TensorInfo& tensorInfo = slot.GetTensorInfo();
+
+        ARMNN_ASSERT(factoryId != ITensorHandleFactory::LegacyFactoryId);
+        ITensorHandleFactory* handleFactory = registry.GetFactory(factoryId);
+        ARMNN_ASSERT(handleFactory);
+        std::unique_ptr<ITensorHandle> tensor = handleFactory->CreateTensorHandle(tensorInfo, isMemoryManaged);
+        ITensorHandle* tensorPtr = tensor.release();
+        outputs.push_back(tensorPtr);
+        tensorHandleVectors.push_back(tensorPtr);
+    }
+    tensorHandles.insert({guid, tensorHandleVectors});
+}
+
+/// Create a new unique WorkingMemHandle object. Create multiple handles if you wish to have
+/// overlapped Execution by calling this function from different threads.
+std::unique_ptr<IWorkingMemHandle> LoadedNetwork::CreateWorkingMemHandle(NetworkId networkId)
+{
+    Graph& order = m_OptimizedNetwork->pOptimizedNetworkImpl->GetGraph();
+    std::unordered_map<LayerGuid, std::vector<ITensorHandle*> > tensorHandles;
+    std::vector<WorkingMemDescriptor> workingMemDescriptors;
+    std::unordered_map<LayerGuid, WorkingMemDescriptor> workingMemDescriptorMap;
+
+    for (auto&& layer : order)
+    {
+        if (layer->GetType() == LayerType::Input || layer->GetType() == LayerType::Output)
+        {
+            continue;
+        }
+        WorkingMemDescriptor workingMemDescriptor;
+        // Look for the layer with 1 OutputSlot which has 1 connection and that connection is an Output Layer
+        // If Export is enabled disable memory management so we can export, otherwise we do a copy
+        if((layer->GetNumOutputSlots() == 1) &&
+           (layer->GetOutputSlots()[0].GetNumConnections() == 1) &&
+           (layer->GetOutputSlots()[0].GetConnection(0)->GetOwningLayer().GetType() == LayerType::Output))
+        {
+            CollectInputTensorHandles(tensorHandles,
+                                      workingMemDescriptor.m_Inputs,
+                                      layer,
+                                      m_TensorHandleFactoryRegistry,
+                                      !m_NetworkProperties.m_ExportEnabled);
+            CreateOutputTensorHandles(tensorHandles,
+                                      workingMemDescriptor.m_Outputs,
+                                      layer,
+                                      m_TensorHandleFactoryRegistry,
+                                      !m_NetworkProperties.m_ExportEnabled);
+        }
+        else
+        {
+            CollectInputTensorHandles(tensorHandles,
+                                      workingMemDescriptor.m_Inputs,
+                                      layer,
+                                      m_TensorHandleFactoryRegistry);
+            CreateOutputTensorHandles(tensorHandles,
+                                      workingMemDescriptor.m_Outputs,
+                                      layer,
+                                      m_TensorHandleFactoryRegistry);
+        }
+        workingMemDescriptorMap.insert({layer->GetGuid(), workingMemDescriptor});
+        workingMemDescriptors.push_back(workingMemDescriptor);
+    }
+    return std::make_unique<WorkingMemHandle>(networkId,
+                                              workingMemDescriptors,
+                                              workingMemDescriptorMap);
+}
+
 void LoadedNetwork::RegisterDebugCallback(const DebugCallbackFunction& func)
 {
     for (auto&& workloadPtr: m_WorkloadQueue)
-- 
cgit v1.2.1
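The final hunk only shows trailing context from LoadedNetwork::RegisterDebugCallback, which this patch leaves unchanged. For completeness, a hedged sketch of registering such a callback through the runtime follows; the DebugCallbackFunction parameter order follows armnn/Types.hpp and should be treated as an assumption.

    // Sketch: inspecting intermediate outputs via the debug callback.
    #include <armnn/IRuntime.hpp>
    #include <iostream>

    void AttachDebugCallback(armnn::IRuntime& runtime, armnn::NetworkId networkId)
    {
        runtime.RegisterDebugCallback(networkId,
            [](armnn::LayerGuid /*guid*/, unsigned int slotIndex, armnn::ITensorHandle* tensorHandle)
            {
                // Called by Debug workloads as each layer's output is produced.
                std::cout << "slot " << slotIndex << " elements "
                          << tensorHandle->GetShape().GetNumElements() << "\n";
            });
    }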