From 55a8ffda24fff5515803df10fb4863d46a1effdf Mon Sep 17 00:00:00 2001
From: Mike Kelly
Date: Wed, 7 Apr 2021 20:10:49 +0100
Subject: IVGCVSW-5823 Refactor Async Network API

* Moved IAsyncNetwork into IRuntime.
* All LoadedNetworks can be executed Asynchronously.

Signed-off-by: Mike Kelly
Change-Id: Ibbc901ab9110dc2f881425b75489bccf9ad54169
---
 Android.mk                                  |   1 -
 CMakeLists.txt                              |   3 -
 include/armnn/ArmNN.hpp                     |   2 +-
 include/armnn/IAsyncNetwork.hpp             |  64 --
 include/armnn/IRuntime.hpp                  |  34 +-
 include/armnn/IWorkingMemHandle.hpp         |   5 +
 include/armnn/NetworkFwd.hpp                |   7 -
 src/armnn/AsyncNetwork.cpp                  | 665 ---------------------
 src/armnn/AsyncNetwork.hpp                  | 106 ----
 src/armnn/LoadedNetwork.cpp                 | 362 ++++++++++-
 src/armnn/LoadedNetwork.hpp                 |  33 +-
 src/armnn/Runtime.cpp                       | 135 +++--
 src/armnn/Runtime.hpp                       |  20 +-
 src/armnn/WorkingMemHandle.cpp              |   4 +-
 src/armnn/WorkingMemHandle.hpp              |  12 +-
 .../test/StridedSliceAsyncEndToEndTest.hpp  |  12 +-
 16 files changed, 528 insertions(+), 937 deletions(-)
 delete mode 100644 include/armnn/IAsyncNetwork.hpp
 delete mode 100644 src/armnn/AsyncNetwork.cpp
 delete mode 100644 src/armnn/AsyncNetwork.hpp

diff --git a/Android.mk b/Android.mk
index 806d81bcd5..416c00238c 100644
--- a/Android.mk
+++ b/Android.mk
@@ -108,7 +108,6 @@ LOCAL_SRC_FILES := \
         profiling/server/src/timelineDecoder/TimelineCaptureCommandHandler.cpp \
         profiling/server/src/timelineDecoder/TimelineDecoder.cpp \
         profiling/server/src/timelineDecoder/TimelineDirectoryCaptureCommandHandler.cpp \
-        src/armnn/AsyncNetwork.cpp \
         src/armnn/BackendHelper.cpp \
         src/armnn/BackendRegistry.cpp \
         src/armnn/Descriptors.cpp \
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 62417bebb3..049a4f1e1b 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -242,7 +242,6 @@ list(APPEND armnn_sources
     include/armnn/Descriptors.hpp
     include/armnn/DescriptorsFwd.hpp
     include/armnn/Exceptions.hpp
-    include/armnn/IAsyncNetwork.hpp
     include/armnn/ILayerSupport.hpp
     include/armnn/ILayerVisitor.hpp
     include/armnn/INetwork.hpp
@@ -408,8 +407,6 @@ list(APPEND armnn_sources
     src/armnn/layers/TransposeLayer.cpp
     src/armnn/layers/UnmapLayer.cpp
     src/armnn/layers/UnmapLayer.hpp
-    src/armnn/AsyncNetwork.cpp
-    src/armnn/AsyncNetwork.hpp
     src/armnn/BackendRegistry.cpp
     src/armnn/BackendSettings.hpp
     src/armnn/BackendHelper.cpp
diff --git a/include/armnn/ArmNN.hpp b/include/armnn/ArmNN.hpp
index ac4d33f737..e4d5ce1fa1 100644
--- a/include/armnn/ArmNN.hpp
+++ b/include/armnn/ArmNN.hpp
@@ -7,9 +7,9 @@
 #include "BackendId.hpp"
 #include "Descriptors.hpp"
 #include "Exceptions.hpp"
-#include "IAsyncNetwork.hpp"
 #include "INetwork.hpp"
 #include "IRuntime.hpp"
+#include "IWorkingMemHandle.hpp"
 #include "LstmParams.hpp"
 #include "Optional.hpp"
 #include "QuantizedLstmParams.hpp"
diff --git a/include/armnn/IAsyncNetwork.hpp b/include/armnn/IAsyncNetwork.hpp
deleted file mode 100644
index c234ae55ac..0000000000
--- a/include/armnn/IAsyncNetwork.hpp
+++ /dev/null
@@ -1,64 +0,0 @@
-//
-// Copyright © 2021 Arm Ltd and Contributors. All rights reserved.
-// SPDX-License-Identifier: MIT
-//
-
-#pragma once
-
-#include
-
-#include "INetwork.hpp"
-#include "IProfiler.hpp"
-#include "IWorkingMemHandle.hpp"
-#include "Tensor.hpp"
-#include "Types.hpp"
-
-#include
-
-namespace armnn
-{
-struct INetworkProperties;
-
-namespace profiling
-{
-class ProfilingService;
-}
-
-namespace experimental
-{
-class AsyncNetworkImpl;
-
-class IAsyncNetwork
-{
-public:
-    IAsyncNetwork(std::unique_ptr net,
-                  const INetworkProperties& networkProperties,
-                  profiling::ProfilingService& profilingService);
-    ~IAsyncNetwork();
-
-    TensorInfo GetInputTensorInfo(LayerBindingId layerId) const;
-    TensorInfo GetOutputTensorInfo(LayerBindingId layerId) const;
-
-    /// Thread safe execution of the network. Returns once execution is complete.
-    /// Will block until this and any other thread using the same workingMem object completes.
-    Status Execute(const InputTensors& inputTensors,
-                   const OutputTensors& outputTensors,
-                   IWorkingMemHandle& workingMemHandle);
-
-    /// Create a new unique WorkingMemHandle object. Create multiple handles if you wish to have
-    /// overlapped Execution by calling this function from different threads.
-    std::unique_ptr CreateWorkingMemHandle();
-
-    /// Get the profiler used for this network
-    std::shared_ptr GetProfiler() const;
-
-    /// Register a debug callback function to be used with this network
-    void RegisterDebugCallback(const DebugCallbackFunction& func);
-
-private:
-    std::unique_ptr pAsyncNetworkImpl;
-};
-
-} // end experimental namespace
-
-} // end armnn namespace
diff --git a/include/armnn/IRuntime.hpp b/include/armnn/IRuntime.hpp
index 9f7032914f..fc203e67e4 100644
--- a/include/armnn/IRuntime.hpp
+++ b/include/armnn/IRuntime.hpp
@@ -5,9 +5,9 @@
 #pragma once
 
 #include "BackendOptions.hpp"
-#include "IAsyncNetwork.hpp"
 #include "INetwork.hpp"
 #include "IProfiler.hpp"
+#include "IWorkingMemHandle.hpp"
 #include "Tensor.hpp"
 #include "Types.hpp"
 #include "TypesUtils.hpp"
@@ -28,12 +28,14 @@ using IRuntimePtr = std::unique_ptr;
 
 struct INetworkProperties
 {
-    INetworkProperties(bool importEnabled = false, bool exportEnabled = false)
+    INetworkProperties(bool importEnabled = false, bool exportEnabled = false, bool asyncEnabled = false)
         : m_ImportEnabled(importEnabled),
-          m_ExportEnabled(exportEnabled) {}
+          m_ExportEnabled(exportEnabled),
+          m_AsyncEnabled(asyncEnabled) {}
 
     const bool m_ImportEnabled;
     const bool m_ExportEnabled;
+    const bool m_AsyncEnabled;
 
     virtual ~INetworkProperties() {}
 };
@@ -145,20 +147,6 @@ public:
                        std::string& errorMessage,
                        const INetworkProperties& networkProperties);
 
-    /// This is an experimental function.
-    /// Creates an executable network. This network is thread safe allowing for multiple networks to be
-    /// loaded simultaneously via different threads.
-    /// Note that the network is never registered with the runtime so does not need to be 'Unloaded'.
-    /// @param [out] networkIdOut Unique identifier for the network is returned in this reference.
-    /// @param [in] network Complete network to load into the IRuntime.
-    /// @param [out] errorMessage Error message if there were any errors.
-    /// @param [out] networkProperties the INetworkProperties that govern how the network should operate.
-    /// @return The IAsyncNetwork
-    std::unique_ptr CreateAsyncNetwork(NetworkId& networkIdOut,
-                                       IOptimizedNetworkPtr network,
-                                       std::string& errorMessage,
-                                       const INetworkProperties& networkProperties);
-
     TensorInfo GetInputTensorInfo(NetworkId networkId, LayerBindingId layerId) const;
     TensorInfo GetOutputTensorInfo(NetworkId networkId, LayerBindingId layerId) const;
 
@@ -167,6 +155,14 @@ public:
                            const InputTensors& inputTensors,
                            const OutputTensors& outputTensors);
 
+    /// This is an experimental function.
+    /// Evaluates a network using input in inputTensors and outputs filled into outputTensors.
+    /// This function performs a thread safe execution of the network. Returns once execution is complete.
+    /// Will block until this and any other thread using the same workingMem object completes.
+    Status Execute(IWorkingMemHandle& workingMemHandle,
+                   const InputTensors& inputTensors,
+                   const OutputTensors& outputTensors);
+
     /// Unloads a network from the IRuntime.
     /// At the moment this only removes the network from the m_Impl->m_Network.
     /// This might need more work in the future to be AndroidNN compliant.
@@ -176,6 +172,10 @@ public:
 
     const IDeviceSpec& GetDeviceSpec() const;
 
+    /// Create a new unique WorkingMemHandle object. Create multiple handles if you wish to have
+    /// overlapped Execution by calling this function from different threads.
+    std::unique_ptr CreateWorkingMemHandle(NetworkId networkId);
+
     /// Gets the profiler corresponding to the given network id.
     /// @param networkId The id of the network for which to get the profile.
     /// @return A pointer to the requested profiler, or nullptr if not found.
diff --git a/include/armnn/IWorkingMemHandle.hpp b/include/armnn/IWorkingMemHandle.hpp
index 921b7e1f40..171fa3d81c 100644
--- a/include/armnn/IWorkingMemHandle.hpp
+++ b/include/armnn/IWorkingMemHandle.hpp
@@ -10,6 +10,8 @@
 namespace armnn
 {
 
+using NetworkId = int;
+
 namespace experimental
 {
 
@@ -20,6 +22,9 @@ class IWorkingMemHandle
 public:
     virtual ~IWorkingMemHandle() {};
 
+    /// Returns the NetworkId of the Network that this IWorkingMemHandle works with.
+    virtual NetworkId GetNetworkId() = 0;
+
     /// Allocate the backing memory required for execution. If this is not called, then allocation will be
     /// deferred to execution time. The mutex must be locked.
     virtual void Allocate() = 0;
diff --git a/include/armnn/NetworkFwd.hpp b/include/armnn/NetworkFwd.hpp
index 6c2970f28b..5db9ec4ebe 100644
--- a/include/armnn/NetworkFwd.hpp
+++ b/include/armnn/NetworkFwd.hpp
@@ -10,13 +10,6 @@ namespace armnn
 struct LstmInputParams;
 struct QuantizedLstmInputParams;
 
-namespace experimental
-{
-
-class IAsyncNetwork;
-
-} // end experimental namespace
-
 class INetwork;
 class IOptimizedNetwork;
 class Graph;
diff --git a/src/armnn/AsyncNetwork.cpp b/src/armnn/AsyncNetwork.cpp
deleted file mode 100644
index 230346a0c3..0000000000
--- a/src/armnn/AsyncNetwork.cpp
+++ /dev/null
@@ -1,665 +0,0 @@
-//
-// Copyright © 2021 Arm Ltd and Contributors. All rights reserved.
-// SPDX-License-Identifier: MIT -// - -#include "AsyncNetwork.hpp" -#include "Graph.hpp" -#include "Layer.hpp" -#include "Profiling.hpp" - -#include -#include -#include -#include - -#include -#include -#include -#include -#include - -#include - -namespace armnn -{ - -namespace experimental -{ - -IAsyncNetwork::IAsyncNetwork(std::unique_ptr net, - const INetworkProperties& networkProperties, - profiling::ProfilingService& profilingService) - : pAsyncNetworkImpl( new AsyncNetworkImpl(std::move(net), networkProperties, profilingService)) {}; - -IAsyncNetwork::~IAsyncNetwork() = default; - -TensorInfo IAsyncNetwork::GetInputTensorInfo(LayerBindingId layerId) const -{ - return pAsyncNetworkImpl->GetInputTensorInfo(layerId); -} - -TensorInfo IAsyncNetwork::GetOutputTensorInfo(LayerBindingId layerId) const -{ - return pAsyncNetworkImpl->GetOutputTensorInfo(layerId); -} - -Status IAsyncNetwork::Execute(const InputTensors& inputTensors, - const OutputTensors& outputTensors, - IWorkingMemHandle& workingMemHandle) -{ - return pAsyncNetworkImpl->Execute(inputTensors, outputTensors, workingMemHandle); -} - -std::unique_ptr IAsyncNetwork::CreateWorkingMemHandle() -{ - return pAsyncNetworkImpl->CreateWorkingMemHandle(); -} - -std::shared_ptr IAsyncNetwork::GetProfiler() const -{ - return pAsyncNetworkImpl->GetProfiler(); -} - -void IAsyncNetwork::RegisterDebugCallback(const DebugCallbackFunction& func) -{ - pAsyncNetworkImpl->RegisterDebugCallback(func); -} - -void AddLayerStructure(std::unique_ptr& timelineUtils, - const Layer& layer, - profiling::ProfilingGuid networkGuid) -{ - // Add layer to the post-optimisation network structure - std::string layerName = layer.GetNameStr().empty() ? "" : layer.GetNameStr(); - timelineUtils->CreateNamedTypedChildEntity(layer.GetGuid(), - networkGuid, - layerName, - profiling::LabelsAndEventClasses::LAYER_GUID); - for (auto&& input : layer.GetInputSlots()) - { - const IOutputSlot* source = input.GetConnectedOutputSlot(); - ARMNN_ASSERT(source != NULL); - timelineUtils->CreateConnectionRelationship(profiling::ProfilingRelationshipType::RetentionLink, - source->GetOwningLayerGuid(), - layer.GetGuid()); - } -} - -void AddWorkloadStructure(std::unique_ptr& timelineUtils, - std::unique_ptr& workload, - const Layer& layer) -{ - // Add workload to the post-optimisation network structure - timelineUtils->CreateTypedEntity(workload->GetGuid(), profiling::LabelsAndEventClasses::WORKLOAD_GUID); - timelineUtils->MarkEntityWithLabel(workload->GetGuid(), - layer.GetBackendId().Get(), - profiling::LabelsAndEventClasses::BACKENDID_GUID); - - // Link the workload to the layer - timelineUtils->CreateRelationship(profiling::ProfilingRelationshipType::RetentionLink, - layer.GetGuid(), - workload->GetGuid(), - profiling::LabelsAndEventClasses::CHILD_GUID); -} - -TensorInfo AsyncNetworkImpl::GetInputTensorInfo(LayerBindingId layerId) const -{ - for (auto&& inputLayer : m_OptimizedNetwork->pOptimizedNetworkImpl->GetGraph().GetInputLayers()) - { - ARMNN_ASSERT_MSG(inputLayer->GetNumOutputSlots() == 1, "Input layer should have exactly 1 output slot"); - if (inputLayer->GetBindingId() == layerId) - { - return inputLayer->GetOutputSlot(0).GetTensorInfo(); - } - } - - throw InvalidArgumentException(fmt::format("No input layer is associated with id {0}}", layerId)); -} - -TensorInfo AsyncNetworkImpl::GetOutputTensorInfo(LayerBindingId layerId) const -{ - for (auto&& outputLayer : m_OptimizedNetwork->pOptimizedNetworkImpl->GetGraph().GetOutputLayers()) - { - 
ARMNN_ASSERT_MSG(outputLayer->GetNumInputSlots() == 1, "Output layer should have exactly 1 input slot"); - ARMNN_ASSERT_MSG(outputLayer->GetInputSlot(0).GetConnection(), "Input slot on Output layer must be connected"); - if (outputLayer->GetBindingId() == layerId) - { - return outputLayer->GetInputSlot(0).GetConnection()->GetTensorInfo(); - } - } - - throw InvalidArgumentException(fmt::format("No output layer is associated with id {0}}", layerId)); -} - -// Need something like the collectors to get the correct tensors for the inputs -void AsyncNetworkImpl::CollectInputTensorHandles( - std::unordered_map >& tensorHandles, - std::vector& inputs, - const armnn::Layer* layer, - const TensorHandleFactoryRegistry& registry, - const bool isMemoryManaged) -{ - for (auto&& inputSlot : layer->GetInputSlots()) - { - // The graph must be well-formed at this point. - ARMNN_ASSERT(inputSlot.GetConnection()); - auto outputSlot = inputSlot.GetConnectedOutputSlot(); - auto key = outputSlot->GetOwningLayer().GetGuid(); - auto search = tensorHandles.find(key); - - if (search == tensorHandles.end()) - { - ITensorHandleFactory::FactoryId factoryId = outputSlot->GetTensorHandleFactoryId(); - const TensorInfo& tensorInfo = outputSlot->GetTensorInfo(); - - ARMNN_ASSERT(factoryId != ITensorHandleFactory::LegacyFactoryId); - ITensorHandleFactory* handleFactory = registry.GetFactory(factoryId); - ARMNN_ASSERT(handleFactory); - std::unique_ptr tensor = handleFactory->CreateTensorHandle(tensorInfo, isMemoryManaged); - ITensorHandle* tensorPtr = tensor.release(); - inputs.push_back(tensorPtr); - } - else - { - unsigned int index = outputSlot->CalculateIndexOnOwner(); - inputs.push_back(search->second[index]); - } - } -} - -void AsyncNetworkImpl::CreateOutputTensorHandles( - std::unordered_map >& tensorHandles, - std::vector& outputs, - const armnn::Layer* layer, - const TensorHandleFactoryRegistry& registry, - const bool isMemoryManaged) -{ - auto guid = layer->GetGuid(); - std::vector tensorHandleVectors; - tensorHandleVectors.reserve(layer->GetNumOutputSlots()); - - for (unsigned int idx=0; idx < layer->GetNumOutputSlots(); idx++) - { - const OutputSlot& slot = layer->GetOutputSlot(idx); - ITensorHandleFactory::FactoryId factoryId = slot.GetTensorHandleFactoryId(); - const TensorInfo& tensorInfo = slot.GetTensorInfo(); - - ARMNN_ASSERT(factoryId != ITensorHandleFactory::LegacyFactoryId); - ITensorHandleFactory* handleFactory = registry.GetFactory(factoryId); - ARMNN_ASSERT(handleFactory); - std::unique_ptr tensor = handleFactory->CreateTensorHandle(tensorInfo, isMemoryManaged); - ITensorHandle* tensorPtr = tensor.release(); - outputs.push_back(tensorPtr); - tensorHandleVectors.push_back(tensorPtr); - } - tensorHandles.insert({guid, tensorHandleVectors}); -} - -const IWorkloadFactory& AsyncNetworkImpl::GetWorkloadFactory(const Layer& layer) const -{ - const IWorkloadFactory* workloadFactory = nullptr; - - auto it = m_WorkloadFactories.find(layer.GetBackendId()); - if (it == m_WorkloadFactories.end()) - { - throw RuntimeException( - fmt::format("No workload factory for {0} to be used for layer: {1}}", - layer.GetBackendId().Get(), - layer.GetNameStr()), - CHECK_LOCATION()); - } - - workloadFactory = it->second.first.get(); - - ARMNN_ASSERT_MSG(workloadFactory, "No workload factory"); - - std::string reasonIfUnsupported; - ARMNN_ASSERT_MSG(IWorkloadFactory::IsLayerSupported(layer, {}, reasonIfUnsupported), - "Factory does not support layer"); - IgnoreUnused(reasonIfUnsupported); - return *workloadFactory; -} - -void 
AsyncNetworkImpl::EnqueueInput(const BindableLayer& layer, - const ConstTensor& inputTensor, - WorkingMemHandle& context) -{ - if (layer.GetType() != LayerType::Input) - { - throw InvalidArgumentException("EnqueueInput: given layer not an InputLayer"); - } - LayerGuid id = layer.GetOutputSlot(0).GetConnection(0)->GetOwningLayer().GetGuid(); - WorkingMemDescriptor descriptor = context.GetWorkingMemDescriptor(id); - ARMNN_ASSERT_MSG(descriptor.m_Outputs.size() == 1, "Can only handle Input Layer with one output"); - - MemorySourceFlags importFlags = descriptor.m_Outputs[0]->GetImportFlags(); - if (m_NetworkProperties.m_ImportEnabled) // Try import the input tensor - { - if (CheckFlag(importFlags, MemorySource::Malloc) ) - { - // This assumes a CPU Tensor handle - std::unique_ptr tensorHandle = - std::make_unique(inputTensor.GetInfo(), - inputTensor.GetMemoryArea()); - - void* mem = tensorHandle->Map(false); - if (descriptor.m_Outputs[0]->Import(mem, MemorySource::Malloc)) - { - tensorHandle->Unmap(); - return; - } - tensorHandle->Unmap(); - throw MemoryImportException("EnqueueInput: Memory Import failed"); - } - else - { - throw MemoryImportException("EnqueueInput: Memory Import failed, backend does not support Import"); - } - } - else - { - std::unique_ptr tensorHandle = - std::make_unique(inputTensor.GetInfo(), inputTensor.GetMemoryArea()); - - auto copyFunc = [](void* dst, const void* src, size_t size) - { - memcpy(dst, src, size); - }; - - for (const auto& input : descriptor.m_Inputs) - { - CopyTensorContentsGeneric(tensorHandle.get(), input, copyFunc); - } - } -} - -void AsyncNetworkImpl::EnqueueOutput(const BindableLayer& layer, const Tensor& outputTensor, WorkingMemHandle& handle) -{ - if (layer.GetType() != LayerType::Output) - { - throw InvalidArgumentException("EnqueueOutput: given layer not an OutputLayer"); - } - ARMNN_ASSERT_MSG(layer.GetNumInputSlots() == 1, "Output Layer should have exactly one input."); - - LayerGuid id = layer.GetInputSlot(0).GetConnectedOutputSlot()->GetOwningLayerGuid(); - WorkingMemDescriptor descriptor = handle.GetWorkingMemDescriptor(id); - - ITensorHandle* inputTensorHandle = descriptor.m_Inputs[0]; - ARMNN_ASSERT_MSG(inputTensorHandle != nullptr, "Data should have been allocated."); - - // Try import the output tensor. - // Note: We can only import the output pointer if all of the following hold true: - // a) The imported pointer is aligned sufficiently - // b) The tensor has zero padding - // c) There is only one connection to the OutputSlot and it is to an OutputLayer. - // d) The output pointer is allocated via malloc. 
(Other types will be supported in a later release) - // e) m_IsExportEnabled must be set to true - if (m_NetworkProperties.m_ExportEnabled && - (layer.GetInputSlots()[0].GetConnectedOutputSlot()->GetNumConnections() == 1)) - { - if (layer.GetInputSlots()[0].GetConnectedOutputSlot()->GetOwningLayer().GetType() != LayerType::Input) - { - MemorySourceFlags importFlags = inputTensorHandle->GetImportFlags(); - if (CheckFlag(importFlags, MemorySource::Malloc)) - { - std::unique_ptr tensorHandle = - std::make_unique(outputTensor.GetInfo(), - outputTensor.GetMemoryArea()); - - void* mem = tensorHandle->Map(false); - bool importOk = inputTensorHandle->Import(mem, MemorySource::Malloc); - tensorHandle->Unmap(); - - if (importOk) - { - ARMNN_SCOPED_PROFILING_EVENT(Compute::Undefined, "SyncMemGeneric_Execute"); - descriptor.m_Inputs[0]->Map(true); - descriptor.m_Inputs[0]->Unmap(); - } - else - { - throw MemoryExportException("EnqueueOutput: Memory Export failed"); - } - } - else - { - throw MemoryExportException("EnqueueOutput: Memory Export failed, backend does not support Export"); - } - } - else - { - throw MemoryExportException("EnqueueOutput: Memory Export failed, attempting to export Input Layer"); - } - } - else - { - auto copyFunc = [](void* dst, const void* src, size_t size) - { - memcpy(dst, src, size); - }; - - std::unique_ptr tensorHandle = - std::make_unique(outputTensor.GetInfo(), outputTensor.GetMemoryArea()); - - CopyTensorContentsGeneric(descriptor.m_Outputs[0], tensorHandle.get(), copyFunc); - } -} - -AsyncNetworkImpl::AsyncNetworkImpl(std::unique_ptr net, - const INetworkProperties& networkProperties, - profiling::ProfilingService& profilingService) : - m_OptimizedNetwork(std::move(net)), - m_NetworkProperties(networkProperties), - m_ProfilingService(profilingService) -{ - // Create a profiler and register it for the current thread. - m_Profiler = std::make_shared(); - ProfilerManager::GetInstance().RegisterProfiler(m_Profiler.get()); - - Graph &order = m_OptimizedNetwork->pOptimizedNetworkImpl->GetGraph().TopologicalSort(); - - //First create tensor handlers, backends and workload factories. - //Handlers are created before workloads are. - //Because workload creation can modify some of the handlers, - //(for example the splitter and concat layers). 
- for (auto &&layer : order) - { - auto const &backendId = layer->GetBackendId(); - if (m_Backends.count(backendId) == 0) - { - auto createBackend = BackendRegistryInstance().GetFactory(backendId); - auto it = m_Backends.emplace(std::make_pair(backendId, createBackend())); - - IBackendInternal* backend = it.first->second.get(); - - if (backend->SupportsTensorAllocatorAPI()) - { - backend->RegisterTensorHandleFactories(m_TensorHandleFactoryRegistry); - - auto workloadFactory = backend->CreateWorkloadFactory(m_TensorHandleFactoryRegistry); - m_WorkloadFactories.emplace( - std::make_pair(backendId, std::make_pair(std::move(workloadFactory), nullptr))); - } - else - { - IBackendInternal::IMemoryManagerSharedPtr memoryManager = backend->CreateMemoryManager(); - auto workloadFactory = backend->CreateWorkloadFactory(memoryManager); - - m_WorkloadFactories.emplace( - std::make_pair(backendId, std::make_pair(std::move(workloadFactory), memoryManager))); - } - } - } - - // Check backends support BackendCapability::AsyncExecution - for (auto const& backend : m_Backends) - { - if (!IsCapabilitySupported(backend.first, BackendCapability::AsyncExecution)) - { - ARMNN_LOG(warning) << fmt::format("AsyncNetworkImpl() Backend: '{0}' does not support Async Execution. " - "Will fall back to default implementation.", - backend.first.Get()); - } - - } - - profiling::ProfilingGuid networkGuid = m_OptimizedNetwork->GetGuid(); - std::unique_ptr timelineUtils = - profiling::TimelineUtilityMethods::GetTimelineUtils(m_ProfilingService); - if (timelineUtils) - { - timelineUtils->CreateTypedEntity(networkGuid, profiling::LabelsAndEventClasses::NETWORK_GUID); - } - - //Then create workloads. - for (auto &&layer : order) - { - if (timelineUtils) - { - // Add layer to the post-optimisation network structure - AddLayerStructure(timelineUtils, *layer, networkGuid); - } - - const IWorkloadFactory &workloadFactory = GetWorkloadFactory(*layer); - - switch (layer->GetType()) - { - case LayerType::Input: - case LayerType::Output: - { - // Inputs and outputs are treated in a special way - see EnqueueInput() and EnqueueOutput(). - break; - } - default: - { - auto workload = layer->CreateWorkload(workloadFactory); - - if (!workload) - { - const char* const layerName = - layer->GetNameStr().length() != 0 ? layer->GetName() : ""; - throw InvalidArgumentException( - fmt::format("No workload created for layer (name: '{0}' type: '{1}') (compute '{2}')", - layerName, - static_cast(layer->GetType()), - layer->GetBackendId().Get() - )); - } - - if (timelineUtils) - { - // Add workload to the post-optimisation network structure - AddWorkloadStructure(timelineUtils, workload, *layer); - } - - m_WorkloadQueue.push_back(move(workload)); - // release the constant data in the layer.. - layer->ReleaseConstantData(); - break; - } - } - } - - if (timelineUtils) - { - // Commit to send the post-optimisation network structure - timelineUtils->Commit(); - } - - // Now that the intermediate tensor memory has been set-up, do any post allocation configuration for each workload. - // PostAllocationConfiguure will now need to be handled in the ExecuteOn(WorkingMemDescriptor) - for (auto &workload : m_WorkloadQueue) - { - workload->PostAllocationConfigure(); - } -} - -Status AsyncNetworkImpl::Execute(const InputTensors& inputTensors, - const OutputTensors& outputTensors, - IWorkingMemHandle& iWorkingMemHandle) -{ - const Graph& graph = m_OptimizedNetwork->pOptimizedNetworkImpl->GetGraph(); - - // Walk graph to determine the order of execution. 
- if (graph.GetNumLayers() < 2) - { - ARMNN_LOG(warning) << "IRuntime::EnqueueWorkload()::Less than two nodes in graph"; - return Status::Failure; - } - - if (graph.GetNumInputs() != inputTensors.size()) - { - throw InvalidArgumentException("Number of inputs provided does not match network."); - } - - std::unique_ptr timelineUtils = - profiling::TimelineUtilityMethods::GetTimelineUtils(m_ProfilingService); - profiling::ProfilingGuid inferenceGuid = m_ProfilingService.GetNextGuid(); - if (timelineUtils) - { - // Add inference timeline trace if profiling is enabled. - profiling::ProfilingGuid networkGuid = m_OptimizedNetwork->GetGuid(); - timelineUtils->CreateTypedEntity(inferenceGuid, profiling::LabelsAndEventClasses::INFERENCE_GUID); - timelineUtils->CreateRelationship(profiling::ProfilingRelationshipType::RetentionLink, - networkGuid, - inferenceGuid, - profiling::LabelsAndEventClasses::EXECUTION_OF_GUID); - timelineUtils->RecordEvent(inferenceGuid, profiling::LabelsAndEventClasses::ARMNN_PROFILING_SOL_EVENT_CLASS); - } - - bool executionSucceeded = true; - - if (timelineUtils) - { - // Add end of life of the inference timeline if profiling is enabled. - timelineUtils->RecordEvent(inferenceGuid, profiling::LabelsAndEventClasses::ARMNN_PROFILING_EOL_EVENT_CLASS); - timelineUtils->Commit(); - } - WorkingMemHandle& workingMemHandle = dynamic_cast(iWorkingMemHandle); - std::lock_guard lockGuard(workingMemHandle.GetMutex()); - - if (!workingMemHandle.IsAllocated()) - { - workingMemHandle.Allocate(); - } - - { - ARMNN_SCOPED_PROFILING_EVENT(Compute::Undefined, "PrepareInputs"); - unsigned int i = 0; - - for (const BindableLayer* inputLayer : graph.GetInputLayers()) - { - EnqueueInput(*inputLayer, inputTensors[i].second, workingMemHandle); - ++i; - } - } - - auto Fail = [&](const std::exception& error) - { - ARMNN_LOG(error) << "An error occurred attempting to execute a workload: " << error.what(); - executionSucceeded = false; - }; - profiling::ProfilingDynamicGuid workloadInferenceID(0); - - try - { - for (unsigned int i = 0; i < m_WorkloadQueue.size(); ++i) - { - auto& workload = m_WorkloadQueue[i]; - if (timelineUtils) - { - workloadInferenceID = timelineUtils->RecordWorkloadInferenceAndStartOfLifeEvent(workload->GetGuid(), - inferenceGuid); - } - workload->ExecuteAsync(workingMemHandle.GetWorkingMemDescriptorAt(i)); - - if (timelineUtils) - { - timelineUtils->RecordEndOfLifeEvent(workloadInferenceID); - } - } - } - catch (const RuntimeException& error) - { - Fail(error); - } - catch (const std::runtime_error& error) - { - Fail(error); - } - // For each output to the network, call EnqueueOutput with the data passed by the user. - { - ARMNN_SCOPED_PROFILING_EVENT(Compute::Undefined, "PrepareOutputs"); - unsigned int i = static_cast(m_WorkloadQueue.size() - graph.GetNumOutputs()); - - for (const BindableLayer* outputLayer : graph.GetOutputLayers()) - { - EnqueueOutput(*outputLayer, outputTensors[i].second, workingMemHandle); - ++i; - } - } - return executionSucceeded ? Status::Success : Status::Failure; -} - -/// Get the profiler used for this network -std::shared_ptr AsyncNetworkImpl::GetProfiler() const -{ - return m_Profiler; -} - -void AsyncNetworkImpl::RegisterDebugCallback(const DebugCallbackFunction& func) -{ - for (auto&& workloadPtr: m_WorkloadQueue) - { - workloadPtr.get()->RegisterDebugCallback(func); - } -} - -/// Create a new unique WorkingMemHandle object. Create multiple handles if you wish to have -/// overlapped Execution by calling this function from different threads. 
-std::unique_ptr AsyncNetworkImpl::CreateWorkingMemHandle() -{ - Graph& order = m_OptimizedNetwork->pOptimizedNetworkImpl->GetGraph(); - std::unordered_map > tensorHandles; - std::vector workingMemDescriptors; - std::unordered_map workingMemDescriptorMap; - - for (auto&& layer : order) - { - if (layer->GetType() == LayerType::Input || layer->GetType() == LayerType::Output) - { - continue; - } - WorkingMemDescriptor workingMemDescriptor; - // Look for the layer with 1 OutputSlot which has 1 connection and that connection is an Output Layer - // If Export is enabled disable memory management so we can export, otherwise we do a copy - if((layer->GetNumOutputSlots() == 1) && - (layer->GetOutputSlots()[0].GetNumConnections() == 1) && - (layer->GetOutputSlots()[0].GetConnection(0)->GetOwningLayer().GetType() == LayerType::Output)) - { - CollectInputTensorHandles(tensorHandles, - workingMemDescriptor.m_Inputs, - layer, - m_TensorHandleFactoryRegistry, - !m_NetworkProperties.m_ExportEnabled); - CreateOutputTensorHandles(tensorHandles, - workingMemDescriptor.m_Outputs, - layer, - m_TensorHandleFactoryRegistry, - !m_NetworkProperties.m_ExportEnabled); - } - else - { - CollectInputTensorHandles(tensorHandles, - workingMemDescriptor.m_Inputs, - layer, - m_TensorHandleFactoryRegistry); - CreateOutputTensorHandles(tensorHandles, - workingMemDescriptor.m_Outputs, - layer, - m_TensorHandleFactoryRegistry); - } - workingMemDescriptorMap.insert({layer->GetGuid(), workingMemDescriptor}); - workingMemDescriptors.push_back(workingMemDescriptor); - } - return std::make_unique(workingMemDescriptors, workingMemDescriptorMap); -} - -void AsyncNetworkImpl::FreeWorkingMemory() -{ - // Informs the memory managers to release memory in it's respective memory group - for (auto&& workloadFactory : m_WorkloadFactories) - { - IBackendInternal::IMemoryManagerSharedPtr memoryManager = workloadFactory.second.second; - if (memoryManager) - { - memoryManager->Release(); - } - } - m_TensorHandleFactoryRegistry.ReleaseMemory(); -} - -} // end experimental namespace - -} // end armnn namespace diff --git a/src/armnn/AsyncNetwork.hpp b/src/armnn/AsyncNetwork.hpp deleted file mode 100644 index 9bdc7eebd7..0000000000 --- a/src/armnn/AsyncNetwork.hpp +++ /dev/null @@ -1,106 +0,0 @@ -// -// Copyright © 2021 Arm Ltd and Contributors. All rights reserved. -// SPDX-License-Identifier: MIT -// - -#pragma once - -#include -#include -#include - -#include "LayerFwd.hpp" -#include "Network.hpp" -#include "Profiling.hpp" -#include "WorkingMemHandle.hpp" - -#include -#include -#include -#include -#include -#include - -#include - -namespace armnn -{ - -namespace experimental -{ - -class AsyncNetworkImpl final -{ -public: - using WorkloadQueue = std::vector>; - - AsyncNetworkImpl(std::unique_ptr net, - const INetworkProperties &networkProperties, - profiling::ProfilingService &profilingService); - - ~AsyncNetworkImpl() { FreeWorkingMemory(); } - - TensorInfo GetInputTensorInfo(LayerBindingId layerId) const; - TensorInfo GetOutputTensorInfo(LayerBindingId layerId) const; - - /// Thread safe execution of the network. Returns once execution is complete. - /// Will block until this and any other thread using the same workingMem object completes. - virtual Status Execute(const InputTensors& inputTensors, - const OutputTensors& outputTensors, - IWorkingMemHandle& workingMemHandle); - - /// Create a new unique WorkingMemHandle object. Create multiple handles if you wish to have - /// overlapped Execution by calling this function from different threads. 
- std::unique_ptr CreateWorkingMemHandle(); - - /// Get the profiler used for this network - std::shared_ptr GetProfiler() const; - - /// Register a debug callback function to be used with this network - void RegisterDebugCallback(const DebugCallbackFunction& func); - -private: - void FreeWorkingMemory(); - - void CollectInputTensorHandles(std::unordered_map >& tensorHandles, - std::vector& inputs, - const armnn::Layer* layer, - const TensorHandleFactoryRegistry& registry, - const bool isMemoryManaged = false); - - void CreateOutputTensorHandles(std::unordered_map >& tensorHandles, - std::vector& outputs, - const armnn::Layer* layer, - const TensorHandleFactoryRegistry& registry, - const bool isMemoryManaged = false); - - void EnqueueInput(const BindableLayer& layer, const ConstTensor& inputTensor, WorkingMemHandle& handle); - - void EnqueueOutput(const BindableLayer& layer, const Tensor& outputTensor, WorkingMemHandle& handle); - - using BackendPtrMap = std::unordered_map; - - using WorkloadFactoryWithMemoryManager = - std::pair; - - using WorkloadFactoryMap = std::unordered_map; - - const IWorkloadFactory& GetWorkloadFactory(const Layer& layer) const; - - BackendPtrMap m_Backends; - WorkloadFactoryMap m_WorkloadFactories; - - std::unique_ptr m_OptimizedNetwork; - INetworkProperties m_NetworkProperties; - WorkloadQueue m_WorkloadQueue; - std::shared_ptr m_Profiler; - - TensorHandleFactoryRegistry m_TensorHandleFactoryRegistry; - - /// Profiling Service Instance - profiling::ProfilingService& m_ProfilingService; -}; - -} // end experimental namespace - -} // end armnn namespace diff --git a/src/armnn/LoadedNetwork.cpp b/src/armnn/LoadedNetwork.cpp index ea09231c3c..d75a2021b2 100644 --- a/src/armnn/LoadedNetwork.cpp +++ b/src/armnn/LoadedNetwork.cpp @@ -10,6 +10,7 @@ #include #include "Profiling.hpp" #include "HeapProfiling.hpp" +#include "WorkingMemHandle.hpp" #include #include @@ -119,8 +120,7 @@ LoadedNetwork::LoadedNetwork(std::unique_ptr net, const INetworkProperties& networkProperties, profiling::ProfilingService& profilingService) : m_OptimizedNetwork(std::move(net)), - m_IsImportEnabled(networkProperties.m_ImportEnabled), - m_IsExportEnabled(networkProperties.m_ExportEnabled), + m_NetworkProperties(networkProperties), m_TensorHandleFactoryRegistry(), m_ProfilingService(profilingService) { @@ -172,7 +172,8 @@ LoadedNetwork::LoadedNetwork(std::unique_ptr net, case LayerType::MemImport: { // If IsImportEnabled is true then we need to set IsMemoryManaged to false when creating TensorHandles - layer->CreateTensorHandles(m_TensorHandleFactoryRegistry, workloadFactory, !m_IsImportEnabled); + layer->CreateTensorHandles(m_TensorHandleFactoryRegistry, workloadFactory, + !m_NetworkProperties.m_ImportEnabled); break; } default: @@ -183,7 +184,8 @@ LoadedNetwork::LoadedNetwork(std::unique_ptr net, (layer->GetOutputSlots()[0].GetNumConnections() == 1) && (layer->GetOutputSlots()[0].GetConnection(0)->GetOwningLayer().GetType() == LayerType::Output)) { - layer->CreateTensorHandles(m_TensorHandleFactoryRegistry, workloadFactory, !m_IsExportEnabled); + layer->CreateTensorHandles(m_TensorHandleFactoryRegistry, workloadFactory, + !m_NetworkProperties.m_ExportEnabled); } else { @@ -576,7 +578,7 @@ void LoadedNetwork::EnqueueInput(const BindableLayer& layer, ITensorHandle* tens MemorySourceFlags importFlags = outputTensorHandle->GetImportFlags(); bool needMemCopy = true; - if (m_IsImportEnabled) // Try import the input tensor + if (m_NetworkProperties.m_ImportEnabled) // Try import the input tensor { 
if(CheckFlag(importFlags, MemorySource::Malloc) ) { @@ -647,7 +649,8 @@ void LoadedNetwork::EnqueueOutput(const BindableLayer& layer, ITensorHandle* ten // d) The output pointer is allocated via malloc. (Other types will be supported in a later release) // e) m_IsExportEnabled must be set to true bool needMemCopy = true; - if (m_IsExportEnabled && (layer.GetInputSlots()[0].GetConnectedOutputSlot()->GetNumConnections() == 1)) + if (m_NetworkProperties.m_ExportEnabled && + (layer.GetInputSlots()[0].GetConnectedOutputSlot()->GetNumConnections() == 1)) { if(layer.GetInputSlots()[0].GetConnectedOutputSlot()->GetOwningLayer().GetType() != LayerType::Input) { @@ -792,6 +795,353 @@ bool LoadedNetwork::Execute(std::unique_ptr& timelineUti return success; } +void LoadedNetwork::EnqueueInput(const BindableLayer& layer, + const ConstTensor& inputTensor, + WorkingMemHandle& context) +{ + if (layer.GetType() != LayerType::Input) + { + throw InvalidArgumentException("EnqueueInput: given layer not an InputLayer"); + } + LayerGuid id = layer.GetOutputSlot(0).GetConnection(0)->GetOwningLayer().GetGuid(); + WorkingMemDescriptor descriptor = context.GetWorkingMemDescriptor(id); + ARMNN_ASSERT_MSG(descriptor.m_Outputs.size() == 1, "Can only handle Input Layer with one output"); + + MemorySourceFlags importFlags = descriptor.m_Outputs[0]->GetImportFlags(); + if (m_NetworkProperties.m_ImportEnabled) // Try import the input tensor + { + if (CheckFlag(importFlags, MemorySource::Malloc) ) + { + // This assumes a CPU Tensor handle + std::unique_ptr tensorHandle = + std::make_unique(inputTensor.GetInfo(), + inputTensor.GetMemoryArea()); + + void* mem = tensorHandle->Map(false); + if (descriptor.m_Outputs[0]->Import(mem, MemorySource::Malloc)) + { + tensorHandle->Unmap(); + return; + } + tensorHandle->Unmap(); + throw MemoryImportException("EnqueueInput: Memory Import failed"); + } + else + { + throw MemoryImportException("EnqueueInput: Memory Import failed, backend does not support Import"); + } + } + else + { + std::unique_ptr tensorHandle = + std::make_unique(inputTensor.GetInfo(), inputTensor.GetMemoryArea()); + + auto copyFunc = [](void* dst, const void* src, size_t size) + { + memcpy(dst, src, size); + }; + + for (const auto& input : descriptor.m_Inputs) + { + CopyTensorContentsGeneric(tensorHandle.get(), input, copyFunc); + } + } +} + +void LoadedNetwork::EnqueueOutput(const BindableLayer& layer, const Tensor& outputTensor, WorkingMemHandle& handle) +{ + if (layer.GetType() != LayerType::Output) + { + throw InvalidArgumentException("EnqueueOutput: given layer not an OutputLayer"); + } + ARMNN_ASSERT_MSG(layer.GetNumInputSlots() == 1, "Output Layer should have exactly one input."); + + LayerGuid id = layer.GetInputSlot(0).GetConnectedOutputSlot()->GetOwningLayerGuid(); + WorkingMemDescriptor descriptor = handle.GetWorkingMemDescriptor(id); + + ITensorHandle* inputTensorHandle = descriptor.m_Inputs[0]; + ARMNN_ASSERT_MSG(inputTensorHandle != nullptr, "Data should have been allocated."); + + // Try import the output tensor. + // Note: We can only import the output pointer if all of the following hold true: + // a) The imported pointer is aligned sufficiently + // b) The tensor has zero padding + // c) There is only one connection to the OutputSlot and it is to an OutputLayer. + // d) The output pointer is allocated via malloc. 
(Other types will be supported in a later release) + // e) m_IsExportEnabled must be set to true + if (m_NetworkProperties.m_ExportEnabled && + (layer.GetInputSlots()[0].GetConnectedOutputSlot()->GetNumConnections() == 1)) + { + if (layer.GetInputSlots()[0].GetConnectedOutputSlot()->GetOwningLayer().GetType() != LayerType::Input) + { + MemorySourceFlags importFlags = inputTensorHandle->GetImportFlags(); + if (CheckFlag(importFlags, MemorySource::Malloc)) + { + std::unique_ptr tensorHandle = + std::make_unique(outputTensor.GetInfo(), + outputTensor.GetMemoryArea()); + + void* mem = tensorHandle->Map(false); + bool importOk = inputTensorHandle->Import(mem, MemorySource::Malloc); + tensorHandle->Unmap(); + + if (importOk) + { + ARMNN_SCOPED_PROFILING_EVENT(Compute::Undefined, "SyncMemGeneric_Execute"); + descriptor.m_Inputs[0]->Map(true); + descriptor.m_Inputs[0]->Unmap(); + } + else + { + throw MemoryExportException("EnqueueOutput: Memory Export failed"); + } + } + else + { + throw MemoryExportException("EnqueueOutput: Memory Export failed, backend does not support Export"); + } + } + else + { + throw MemoryExportException("EnqueueOutput: Memory Export failed, attempting to export Input Layer"); + } + } + else + { + auto copyFunc = [](void* dst, const void* src, size_t size) + { + memcpy(dst, src, size); + }; + + std::unique_ptr tensorHandle = + std::make_unique(outputTensor.GetInfo(), outputTensor.GetMemoryArea()); + + CopyTensorContentsGeneric(descriptor.m_Outputs[0], tensorHandle.get(), copyFunc); + } +} + +Status LoadedNetwork::Execute(const InputTensors& inputTensors, + const OutputTensors& outputTensors, + IWorkingMemHandle& iWorkingMemHandle) +{ + const Graph& graph = m_OptimizedNetwork->pOptimizedNetworkImpl->GetGraph(); + + // Walk graph to determine the order of execution. + if (graph.GetNumLayers() < 2) + { + ARMNN_LOG(warning) << "IRuntime::EnqueueWorkload()::Less than two nodes in graph"; + return Status::Failure; + } + + if (graph.GetNumInputs() != inputTensors.size()) + { + throw InvalidArgumentException("Number of inputs provided does not match network."); + } + + std::unique_ptr timelineUtils = + profiling::TimelineUtilityMethods::GetTimelineUtils(m_ProfilingService); + profiling::ProfilingGuid inferenceGuid = m_ProfilingService.GetNextGuid(); + if (timelineUtils) + { + // Add inference timeline trace if profiling is enabled. + profiling::ProfilingGuid networkGuid = m_OptimizedNetwork->GetGuid(); + timelineUtils->CreateTypedEntity(inferenceGuid, profiling::LabelsAndEventClasses::INFERENCE_GUID); + timelineUtils->CreateRelationship(profiling::ProfilingRelationshipType::RetentionLink, + networkGuid, + inferenceGuid, + profiling::LabelsAndEventClasses::EXECUTION_OF_GUID); + timelineUtils->RecordEvent(inferenceGuid, profiling::LabelsAndEventClasses::ARMNN_PROFILING_SOL_EVENT_CLASS); + } + + bool executionSucceeded = true; + + if (timelineUtils) + { + // Add end of life of the inference timeline if profiling is enabled. 
+ timelineUtils->RecordEvent(inferenceGuid, profiling::LabelsAndEventClasses::ARMNN_PROFILING_EOL_EVENT_CLASS); + timelineUtils->Commit(); + } + WorkingMemHandle& workingMemHandle = dynamic_cast(iWorkingMemHandle); + std::lock_guard lockGuard(workingMemHandle.GetMutex()); + + if (!workingMemHandle.IsAllocated()) + { + workingMemHandle.Allocate(); + } + + { + ARMNN_SCOPED_PROFILING_EVENT(Compute::Undefined, "PrepareInputs"); + unsigned int i = 0; + + for (const BindableLayer* inputLayer : graph.GetInputLayers()) + { + EnqueueInput(*inputLayer, inputTensors[i].second, workingMemHandle); + ++i; + } + } + + auto Fail = [&](const std::exception& error) + { + ARMNN_LOG(error) << "An error occurred attempting to execute a workload: " << error.what(); + executionSucceeded = false; + }; + profiling::ProfilingDynamicGuid workloadInferenceID(0); + + try + { + for (unsigned int i = 0; i < m_WorkloadQueue.size(); ++i) + { + auto& workload = m_WorkloadQueue[i]; + if (timelineUtils) + { + workloadInferenceID = timelineUtils->RecordWorkloadInferenceAndStartOfLifeEvent(workload->GetGuid(), + inferenceGuid); + } + workload->ExecuteAsync(workingMemHandle.GetWorkingMemDescriptorAt(i)); + + if (timelineUtils) + { + timelineUtils->RecordEndOfLifeEvent(workloadInferenceID); + } + } + } + catch (const RuntimeException& error) + { + Fail(error); + } + catch (const std::runtime_error& error) + { + Fail(error); + } + // For each output to the network, call EnqueueOutput with the data passed by the user. + { + ARMNN_SCOPED_PROFILING_EVENT(Compute::Undefined, "PrepareOutputs"); + unsigned int i = static_cast(m_WorkloadQueue.size() - graph.GetNumOutputs()); + + for (const BindableLayer* outputLayer : graph.GetOutputLayers()) + { + EnqueueOutput(*outputLayer, outputTensors[i].second, workingMemHandle); + ++i; + } + } + return executionSucceeded ? Status::Success : Status::Failure; +} +// Need something like the collectors to get the correct tensors for the inputs +void LoadedNetwork::CollectInputTensorHandles( + std::unordered_map >& tensorHandles, + std::vector& inputs, + const armnn::Layer* layer, + const TensorHandleFactoryRegistry& registry, + const bool isMemoryManaged) +{ + for (auto&& inputSlot : layer->GetInputSlots()) + { + // The graph must be well-formed at this point. 
+ ARMNN_ASSERT(inputSlot.GetConnection()); + auto outputSlot = inputSlot.GetConnectedOutputSlot(); + auto key = outputSlot->GetOwningLayer().GetGuid(); + auto search = tensorHandles.find(key); + + if (search == tensorHandles.end()) + { + ITensorHandleFactory::FactoryId factoryId = outputSlot->GetTensorHandleFactoryId(); + const TensorInfo& tensorInfo = outputSlot->GetTensorInfo(); + + ARMNN_ASSERT(factoryId != ITensorHandleFactory::LegacyFactoryId); + ITensorHandleFactory* handleFactory = registry.GetFactory(factoryId); + ARMNN_ASSERT(handleFactory); + std::unique_ptr tensor = handleFactory->CreateTensorHandle(tensorInfo, isMemoryManaged); + ITensorHandle* tensorPtr = tensor.release(); + inputs.push_back(tensorPtr); + } + else + { + unsigned int index = outputSlot->CalculateIndexOnOwner(); + inputs.push_back(search->second[index]); + } + } +} + +void LoadedNetwork::CreateOutputTensorHandles( + std::unordered_map >& tensorHandles, + std::vector& outputs, + const armnn::Layer* layer, + const TensorHandleFactoryRegistry& registry, + const bool isMemoryManaged) +{ + auto guid = layer->GetGuid(); + std::vector tensorHandleVectors; + tensorHandleVectors.reserve(layer->GetNumOutputSlots()); + + for (unsigned int idx=0; idx < layer->GetNumOutputSlots(); idx++) + { + const OutputSlot& slot = layer->GetOutputSlot(idx); + ITensorHandleFactory::FactoryId factoryId = slot.GetTensorHandleFactoryId(); + const TensorInfo& tensorInfo = slot.GetTensorInfo(); + + ARMNN_ASSERT(factoryId != ITensorHandleFactory::LegacyFactoryId); + ITensorHandleFactory* handleFactory = registry.GetFactory(factoryId); + ARMNN_ASSERT(handleFactory); + std::unique_ptr tensor = handleFactory->CreateTensorHandle(tensorInfo, isMemoryManaged); + ITensorHandle* tensorPtr = tensor.release(); + outputs.push_back(tensorPtr); + tensorHandleVectors.push_back(tensorPtr); + } + tensorHandles.insert({guid, tensorHandleVectors}); +} + +/// Create a new unique WorkingMemHandle object. Create multiple handles if you wish to have +/// overlapped Execution by calling this function from different threads. 
+std::unique_ptr LoadedNetwork::CreateWorkingMemHandle(NetworkId networkId) +{ + Graph& order = m_OptimizedNetwork->pOptimizedNetworkImpl->GetGraph(); + std::unordered_map > tensorHandles; + std::vector workingMemDescriptors; + std::unordered_map workingMemDescriptorMap; + + for (auto&& layer : order) + { + if (layer->GetType() == LayerType::Input || layer->GetType() == LayerType::Output) + { + continue; + } + WorkingMemDescriptor workingMemDescriptor; + // Look for the layer with 1 OutputSlot which has 1 connection and that connection is an Output Layer + // If Export is enabled disable memory management so we can export, otherwise we do a copy + if((layer->GetNumOutputSlots() == 1) && + (layer->GetOutputSlots()[0].GetNumConnections() == 1) && + (layer->GetOutputSlots()[0].GetConnection(0)->GetOwningLayer().GetType() == LayerType::Output)) + { + CollectInputTensorHandles(tensorHandles, + workingMemDescriptor.m_Inputs, + layer, + m_TensorHandleFactoryRegistry, + !m_NetworkProperties.m_ExportEnabled); + CreateOutputTensorHandles(tensorHandles, + workingMemDescriptor.m_Outputs, + layer, + m_TensorHandleFactoryRegistry, + !m_NetworkProperties.m_ExportEnabled); + } + else + { + CollectInputTensorHandles(tensorHandles, + workingMemDescriptor.m_Inputs, + layer, + m_TensorHandleFactoryRegistry); + CreateOutputTensorHandles(tensorHandles, + workingMemDescriptor.m_Outputs, + layer, + m_TensorHandleFactoryRegistry); + } + workingMemDescriptorMap.insert({layer->GetGuid(), workingMemDescriptor}); + workingMemDescriptors.push_back(workingMemDescriptor); + } + return std::make_unique(networkId, + workingMemDescriptors, + workingMemDescriptorMap); +} + void LoadedNetwork::RegisterDebugCallback(const DebugCallbackFunction& func) { for (auto&& workloadPtr: m_WorkloadQueue) diff --git a/src/armnn/LoadedNetwork.hpp b/src/armnn/LoadedNetwork.hpp index c7dd37fdea..2bcf5c8c08 100644 --- a/src/armnn/LoadedNetwork.hpp +++ b/src/armnn/LoadedNetwork.hpp @@ -37,11 +37,19 @@ public: using WorkloadQueue = std::vector< std::unique_ptr >; ~LoadedNetwork(){ FreeWorkingMemory(); } + /// Create a new unique WorkingMemHandle object. Create multiple handles if you wish to have + /// overlapped Execution by calling this function from different threads. 
+ std::unique_ptr CreateWorkingMemHandle(NetworkId networkId); + TensorInfo GetInputTensorInfo(LayerBindingId layerId) const; TensorInfo GetOutputTensorInfo(LayerBindingId layerId) const; Status EnqueueWorkload(const InputTensors& inputTensors, const OutputTensors& outputTensors); + Status Execute(const InputTensors& inputTensors, + const OutputTensors& outputTensors, + IWorkingMemHandle& workingMemHandle); + static std::unique_ptr MakeLoadedNetwork(std::unique_ptr net, std::string & errorMessage, const INetworkProperties& networkProperties, @@ -58,6 +66,11 @@ public: void SendNetworkStructure(); + bool IsAsyncEnabled() + { + return m_NetworkProperties.m_AsyncEnabled; + } + profiling::ProfilingGuid GetNetworkGuid(); private: @@ -67,14 +80,29 @@ private: const INetworkProperties& networkProperties, profiling::ProfilingService& profilingService); + void CollectInputTensorHandles(std::unordered_map >& tensorHandles, + std::vector& inputs, + const armnn::Layer* layer, + const TensorHandleFactoryRegistry& registry, + const bool isMemoryManaged = false); + + void CreateOutputTensorHandles(std::unordered_map >& tensorHandles, + std::vector& outputs, + const armnn::Layer* layer, + const TensorHandleFactoryRegistry& registry, + const bool isMemoryManaged = false); + void EnqueueInput(const BindableLayer& layer, ITensorHandle* tensorHandle, const TensorInfo& tensorInfo); void EnqueueOutput(const BindableLayer& layer, ITensorHandle* tensorHandle, const TensorInfo& tensorInfo); + void EnqueueInput(const BindableLayer& layer, const ConstTensor& inputTensor, WorkingMemHandle& handle); + + void EnqueueOutput(const BindableLayer& layer, const Tensor& outputTensor, WorkingMemHandle& handle); + bool Execute(std::unique_ptr& timelineUtils, profiling::ProfilingGuid inferenceGuid); - const IWorkloadFactory& GetWorkloadFactory(const Layer& layer) const; using BackendPtrMap = std::unordered_map; @@ -96,8 +124,7 @@ private: mutable std::mutex m_WorkingMemMutex; bool m_IsWorkingMemAllocated=false; - bool m_IsImportEnabled=false; - bool m_IsExportEnabled=false; + INetworkProperties m_NetworkProperties; TensorHandleFactoryRegistry m_TensorHandleFactoryRegistry; diff --git a/src/armnn/Runtime.cpp b/src/armnn/Runtime.cpp index 57aaabd277..91a21d4b53 100644 --- a/src/armnn/Runtime.cpp +++ b/src/armnn/Runtime.cpp @@ -64,14 +64,6 @@ Status IRuntime::LoadNetwork(NetworkId& networkIdOut, return pRuntimeImpl->LoadNetwork(networkIdOut, std::move(network), errorMessage, networkProperties); } -std::unique_ptr IRuntime::CreateAsyncNetwork(NetworkId& networkIdOut, - IOptimizedNetworkPtr network, - std::string& errorMessage, - const INetworkProperties& networkProperties) -{ - return pRuntimeImpl->CreateAsyncNetwork(networkIdOut, std::move(network), errorMessage, networkProperties); -} - TensorInfo IRuntime::GetInputTensorInfo(NetworkId networkId, LayerBindingId layerId) const { return pRuntimeImpl->GetInputTensorInfo(networkId, layerId); @@ -89,6 +81,13 @@ Status IRuntime::EnqueueWorkload(NetworkId networkId, return pRuntimeImpl->EnqueueWorkload(networkId, inputTensors, outputTensors); } +Status IRuntime::Execute(IWorkingMemHandle& workingMemHandle, + const InputTensors& inputTensors, + const OutputTensors& outputTensors) +{ + return pRuntimeImpl->Execute(workingMemHandle, inputTensors, outputTensors); +} + Status IRuntime::UnloadNetwork(NetworkId networkId) { return pRuntimeImpl->UnloadNetwork(networkId); @@ -99,6 +98,11 @@ const IDeviceSpec& IRuntime::GetDeviceSpec() const return pRuntimeImpl->GetDeviceSpec(); } 
+std::unique_ptr IRuntime::CreateWorkingMemHandle(NetworkId networkId) +{ + return pRuntimeImpl->CreateWorkingMemHandle(networkId); +} + const std::shared_ptr IRuntime::GetProfiler(NetworkId networkId) const { return pRuntimeImpl->GetProfiler(networkId); @@ -173,43 +177,6 @@ Status RuntimeImpl::LoadNetwork(NetworkId& networkIdOut, return Status::Success; } -std::unique_ptr RuntimeImpl::CreateAsyncNetwork(NetworkId& networkIdOut, - IOptimizedNetworkPtr network, - std::string&, - const INetworkProperties& networkProperties) -{ - IOptimizedNetwork* rawNetwork = network.release(); - - networkIdOut = GenerateNetworkId(); - - for (auto&& context : m_BackendContexts) - { - context.second->BeforeLoadNetwork(networkIdOut); - } - - unique_ptr asyncNetwork = std::make_unique( - std::unique_ptr(rawNetwork), - networkProperties, - m_ProfilingService); - - if (!asyncNetwork) - { - return nullptr; - } - - for (auto&& context : m_BackendContexts) - { - context.second->AfterLoadNetwork(networkIdOut); - } - - if (m_ProfilingService.IsProfilingEnabled()) - { - m_ProfilingService.IncrementCounterValue(armnn::profiling::NETWORK_LOADS); - } - - return asyncNetwork; -} - Status RuntimeImpl::UnloadNetwork(NetworkId networkId) { bool unloadOk = true; @@ -430,6 +397,17 @@ Status RuntimeImpl::EnqueueWorkload(NetworkId networkId, const OutputTensors& outputTensors) { LoadedNetwork* loadedNetwork = GetLoadedNetworkPtr(networkId); + + if (!loadedNetwork) + { + ARMNN_LOG(error) << "A Network with an id of " << networkId << " does not exist.\n"; + return Status::Failure; + } + if (loadedNetwork->IsAsyncEnabled()) + { + ARMNN_LOG(error) << "Network " << networkId << " is async enabled.\n"; + return Status::Failure; + } ProfilerManager::GetInstance().RegisterProfiler(loadedNetwork->GetProfiler().get()); ARMNN_SCOPED_PROFILING_EVENT(Compute::Undefined, "EnqueueWorkload"); @@ -447,6 +425,73 @@ Status RuntimeImpl::EnqueueWorkload(NetworkId networkId, return loadedNetwork->EnqueueWorkload(inputTensors, outputTensors); } +Status RuntimeImpl::Execute(IWorkingMemHandle& iWorkingMemHandle, + const InputTensors& inputTensors, + const OutputTensors& outputTensors) +{ + NetworkId networkId = iWorkingMemHandle.GetNetworkId(); + LoadedNetwork* loadedNetwork = GetLoadedNetworkPtr(networkId); + + if (!loadedNetwork) + { + ARMNN_LOG(error) << "A Network with an id of " << networkId << " does not exist.\n"; + return Status::Failure; + } + if (!loadedNetwork->IsAsyncEnabled()) + { + ARMNN_LOG(error) << "Network " << networkId << " is not async enabled.\n"; + return Status::Failure; + } + ProfilerManager::GetInstance().RegisterProfiler(loadedNetwork->GetProfiler().get()); + + ARMNN_SCOPED_PROFILING_EVENT(Compute::Undefined, "Execute"); + + static thread_local NetworkId lastId = networkId; + if (lastId != networkId) + { + LoadedNetworkFuncSafe(lastId, [](LoadedNetwork* network) + { + network->FreeWorkingMemory(); + }); + } + lastId=networkId; + + return loadedNetwork->Execute(inputTensors, outputTensors, iWorkingMemHandle); +} + +/// Create a new unique WorkingMemHandle object. Create multiple handles if you wish to have +/// overlapped Execution by calling this function from different threads. 
+std::unique_ptr RuntimeImpl::CreateWorkingMemHandle(NetworkId networkId) +{ + LoadedNetwork* loadedNetwork = GetLoadedNetworkPtr(networkId); + + if (!loadedNetwork) + { + ARMNN_LOG(error) << "A Network with an id of " << networkId << " does not exist.\n"; + return nullptr; + } + if (!loadedNetwork->IsAsyncEnabled()) + { + ARMNN_LOG(error) << "Network " << networkId << " is not async enabled.\n"; + return nullptr; + } + ProfilerManager::GetInstance().RegisterProfiler(loadedNetwork->GetProfiler().get()); + + ARMNN_SCOPED_PROFILING_EVENT(Compute::Undefined, "CreateWorkingMemHandle"); + + static thread_local NetworkId lastId = networkId; + if (lastId != networkId) + { + LoadedNetworkFuncSafe(lastId, [](LoadedNetwork* network) + { + network->FreeWorkingMemory(); + }); + } + lastId=networkId; + + return loadedNetwork->CreateWorkingMemHandle(networkId); +} + void RuntimeImpl::RegisterDebugCallback(NetworkId networkId, const DebugCallbackFunction& func) { LoadedNetwork* loadedNetwork = GetLoadedNetworkPtr(networkId); diff --git a/src/armnn/Runtime.hpp b/src/armnn/Runtime.hpp index 150012eb61..da5445383f 100644 --- a/src/armnn/Runtime.hpp +++ b/src/armnn/Runtime.hpp @@ -4,7 +4,6 @@ // #pragma once -#include "AsyncNetwork.hpp" #include "LoadedNetwork.hpp" #include "DeviceSpec.hpp" @@ -56,17 +55,14 @@ public: TensorInfo GetInputTensorInfo(NetworkId networkId, LayerBindingId layerId) const; TensorInfo GetOutputTensorInfo(NetworkId networkId, LayerBindingId layerId) const; - // Create Aysnchronous Network from the IOptimizedNetowrkPtr - std::unique_ptr CreateAsyncNetwork(NetworkId& networkIdOut, - IOptimizedNetworkPtr network, - std::string& errorMessage, - const INetworkProperties& networkProperties); - - // Evaluates network using input in inputTensors, outputs filled into outputTensors. Status EnqueueWorkload(NetworkId networkId, - const InputTensors& inputTensors, - const OutputTensors& outputTensors); + const InputTensors& inputTensors, + const OutputTensors& outputTensors); + + Status Execute(IWorkingMemHandle& workingMemHandle, + const InputTensors& inputTensors, + const OutputTensors& outputTensors); /// Unloads a network from the Runtime. /// At the moment this only removes the network from the m_Impl->m_Network. @@ -82,6 +78,10 @@ public: /// @return A pointer to the requested profiler, or nullptr if not found. const std::shared_ptr GetProfiler(NetworkId networkId) const; + /// Create a new unique WorkingMemHandle object. Create multiple handles if you wish to have + /// overlapped Execution by calling this function from different threads. + std::unique_ptr CreateWorkingMemHandle(NetworkId networkId); + /// Registers a callback function to debug layers performing custom computations on intermediate tensors. /// @param networkId The id of the network to register the callback. /// @param func callback function to pass to the debug layer. 
diff --git a/src/armnn/WorkingMemHandle.cpp b/src/armnn/WorkingMemHandle.cpp
index 7a901b296b..c1a48d482f 100644
--- a/src/armnn/WorkingMemHandle.cpp
+++ b/src/armnn/WorkingMemHandle.cpp
@@ -13,8 +13,10 @@ namespace armnn
 namespace experimental
 {
 
-WorkingMemHandle::WorkingMemHandle(std::vector workingMemDescriptors,
+WorkingMemHandle::WorkingMemHandle(NetworkId networkId,
+                                   std::vector workingMemDescriptors,
                                    std::unordered_map workingMemDescriptorMap) :
+    m_NetworkId(networkId),
     m_WorkingMemDescriptors(workingMemDescriptors),
     m_WorkingMemDescriptorMap(workingMemDescriptorMap),
     m_IsAllocated(false),
diff --git a/src/armnn/WorkingMemHandle.hpp b/src/armnn/WorkingMemHandle.hpp
index 090f180206..cef6fb6fd3 100644
--- a/src/armnn/WorkingMemHandle.hpp
+++ b/src/armnn/WorkingMemHandle.hpp
@@ -24,10 +24,17 @@ class WorkingMemHandle final : public IWorkingMemHandle
 {
 public:
 
-    WorkingMemHandle(std::vector workingMemDescriptors,
+    WorkingMemHandle(NetworkId networkId,
+                     std::vector workingMemDescriptors,
                      std::unordered_map workingMemDescriptorMap);
 
-    ~WorkingMemHandle() { FreeWorkingMemory(); }
+    ~WorkingMemHandle()
+    { FreeWorkingMemory(); }
+
+    NetworkId GetNetworkId() override
+    {
+        return m_NetworkId;
+    }
 
     /// Allocate the backing memory required for execution. If this is not called, then allocation will be
     /// deferred to execution time. The mutex must be locked.
@@ -106,6 +113,7 @@ public:
 private:
     void FreeWorkingMemory();
 
+    NetworkId m_NetworkId;
     std::shared_ptr m_Profiler;
 
     std::vector m_WorkingMemDescriptors;
diff --git a/src/backends/backendsCommon/test/StridedSliceAsyncEndToEndTest.hpp b/src/backends/backendsCommon/test/StridedSliceAsyncEndToEndTest.hpp
index 2ccd2b13af..66ccdbf1d9 100644
--- a/src/backends/backendsCommon/test/StridedSliceAsyncEndToEndTest.hpp
+++ b/src/backends/backendsCommon/test/StridedSliceAsyncEndToEndTest.hpp
@@ -40,15 +40,15 @@ void AsyncEndToEndTestImpl(INetworkPtr network,
     // Creates AsyncNetwork
     NetworkId networkId = 0;
     std::string errorMessage;
-    const INetworkProperties networkProperties;
-    auto asyncNetwork = runtime->CreateAsyncNetwork(networkId, std::move(optNet), errorMessage, networkProperties);
+    const INetworkProperties networkProperties(false, false, true);
+    runtime->LoadNetwork(networkId, std::move(optNet), errorMessage, networkProperties);
 
     InputTensors inputTensors;
     inputTensors.reserve(inputTensorData.size());
     for (auto&& it : inputTensorData)
     {
         inputTensors.push_back({it.first,
-                                ConstTensor(asyncNetwork->GetInputTensorInfo(it.first), it.second.data())});
+                                ConstTensor(runtime->GetInputTensorInfo(networkId, it.first), it.second.data())});
     }
 
     OutputTensors outputTensors;
@@ -59,16 +59,16 @@ void AsyncEndToEndTestImpl(INetworkPtr network,
         std::vector out(it.second.size());
         outputStorage.emplace(it.first, out);
         outputTensors.push_back({it.first,
-                                 Tensor(asyncNetwork->GetOutputTensorInfo(it.first),
+                                 Tensor(runtime->GetOutputTensorInfo(networkId, it.first),
                                         outputStorage.at(it.first).data())});
     }
 
     // Create WorkingMemHandle for this async network
-    std::unique_ptr workingMemHandle = asyncNetwork->CreateWorkingMemHandle();
+    std::unique_ptr workingMemHandle = runtime->CreateWorkingMemHandle(networkId);
     IWorkingMemHandle& workingMemHandleRef = *workingMemHandle.get();
 
     // Run the async network
-    asyncNetwork->Execute(inputTensors, outputTensors, workingMemHandleRef);
+    runtime->Execute(workingMemHandleRef, inputTensors, outputTensors);
 
     // Checks the results.
     for (auto&& it : expectedOutputData)
-- cgit v1.2.1
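
For reference, the application-level flow this refactor enables is the one exercised by the test diff above: load an optimized network with the new asyncEnabled flag, create one IWorkingMemHandle per thread, and call IRuntime::Execute(). The sketch below is illustrative and not part of the patch: the empty network, the CpuRef backend choice, and the binding id 0 are assumptions; only the INetworkProperties(false, false, true) flag order and the LoadNetwork / CreateWorkingMemHandle / Execute sequence come from the diff.

// Minimal usage sketch of the refactored async API (assumptions noted above).
#include <armnn/ArmNN.hpp>

#include <memory>
#include <string>
#include <vector>

int main()
{
    using namespace armnn;

    IRuntimePtr runtime = IRuntime::Create(IRuntime::CreationOptions());

    // Placeholder network; a real caller would add input, operator and output
    // layers here, with input/output binding ids (assumed to be 0 below).
    INetworkPtr network = INetwork::Create();
    IOptimizedNetworkPtr optNet = Optimize(*network, {Compute::CpuRef}, runtime->GetDeviceSpec());

    // asyncEnabled == true routes this network to the Execute() path;
    // per this patch, EnqueueWorkload() on an async-enabled network fails,
    // and Execute() on a non-async network fails.
    NetworkId networkId = 0;
    std::string errorMessage;
    INetworkProperties networkProperties(false, false, true);
    runtime->LoadNetwork(networkId, std::move(optNet), errorMessage, networkProperties);

    std::vector<float> inputData(4, 1.0f);
    std::vector<float> outputData(4, 0.0f);
    InputTensors inputs{{0, ConstTensor(runtime->GetInputTensorInfo(networkId, 0), inputData.data())}};
    OutputTensors outputs{{0, Tensor(runtime->GetOutputTensorInfo(networkId, 0), outputData.data())}};

    // One working-memory handle per thread; create several handles to run
    // overlapped executions of the same loaded network.
    std::unique_ptr<experimental::IWorkingMemHandle> memHandle =
        runtime->CreateWorkingMemHandle(networkId);

    Status status = runtime->Execute(*memHandle, inputs, outputs);
    return status == Status::Success ? 0 : 1;
}

Because each working-memory handle now records its NetworkId (see the IWorkingMemHandle.hpp and WorkingMemHandle.hpp hunks), Execute() can recover the owning LoadedNetwork from the handle alone, which is why it takes the handle as its first argument instead of a NetworkId the way EnqueueWorkload() does.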