Diffstat (limited to 'src/armnn/LoadedNetwork.cpp')
-rw-r--r--  src/armnn/LoadedNetwork.cpp | 610
 1 file changed, 511 insertions(+), 99 deletions(-)
diff --git a/src/armnn/LoadedNetwork.cpp b/src/armnn/LoadedNetwork.cpp
index 7fb14d0f32..03e5ad5bfe 100644
--- a/src/armnn/LoadedNetwork.cpp
+++ b/src/armnn/LoadedNetwork.cpp
@@ -131,11 +131,14 @@ LoadedNetwork::LoadedNetwork(std::unique_ptr<IOptimizedNetwork> net,
profiler->EnableNetworkDetailsToStdOut(networkProperties.m_OutputNetworkDetailsMethod);
- Graph& order = m_OptimizedNetwork->pOptimizedNetworkImpl->GetGraph().TopologicalSort();
//First create tensor handlers, backends and workload factories.
//Handlers are created before workloads because workload creation can modify
//some of the handlers (for example the splitter and concat layers).
+
+ bool useExternalMemoryManager = false;
+ bool useInternalMemoryManager = false;
+ Graph& order = m_OptimizedNetwork->pOptimizedNetworkImpl->GetGraph().TopologicalSort();
for (auto&& layer : order)
{
auto const& backendId = layer->GetBackendId();
@@ -154,25 +157,44 @@ LoadedNetwork::LoadedNetwork(std::unique_ptr<IOptimizedNetwork> net,
throw BackendCapabilityException(er);
}
+ if (networkProperties.m_AsyncEnabled &&
+ !HasCapability(BackendOptions::BackendOption{"ExternallyManagedMemory", true},
+ backend->GetCapabilities()))
+ {
+ std::string er = backend->GetId();
+ er += " does not support ExternallyManagedMemory\n";
+ er += "AsyncEnabled networks require all backends to support ExternallyManagedMemory";
+ throw BackendCapabilityException(er);
+ }
+
+ if (HasCapability(BackendOptions::BackendOption{"ExternallyManagedMemory", true}, backend->GetCapabilities())
+ && (m_NetworkProperties.m_ExternalMemoryManagementEnabled || m_NetworkProperties.m_AsyncEnabled))
+ {
+ m_SupportsExternallyManagedMemory[backend->GetId()] = true;
+ useExternalMemoryManager = true;
+ }
+ else
+ {
+ m_SupportsExternallyManagedMemory[backend->GetId()] = false;
+ useInternalMemoryManager = true;
+ }
+
+ IBackendInternal::IWorkloadFactoryPtr workloadFactory;
if (backend->SupportsTensorAllocatorAPI())
{
- auto workloadFactory = backend->CreateWorkloadFactory(
+ workloadFactory = backend->CreateWorkloadFactory(
m_TensorHandleFactoryRegistry,
m_OptimizedNetwork->pOptimizedNetworkImpl->GetModelOptions(),
static_cast<MemorySourceFlags>(m_NetworkProperties.m_InputSource),
static_cast<MemorySourceFlags>(m_NetworkProperties.m_OutputSource));
- m_WorkloadFactories.emplace(
- std::make_pair(backendId, std::make_pair(std::move(workloadFactory), nullptr)));
}
else
{
- IBackendInternal::IMemoryManagerSharedPtr memoryManager = backend->CreateMemoryManager();
- auto workloadFactory = backend->CreateWorkloadFactory(
- memoryManager, m_OptimizedNetwork->pOptimizedNetworkImpl->GetModelOptions());
-
- m_WorkloadFactories.emplace(
- std::make_pair(backendId, std::make_pair(std::move(workloadFactory), memoryManager)));
+ m_BackendMemoryMangers.emplace_back(backend->CreateMemoryManager());
+ workloadFactory = backend->CreateWorkloadFactory(
+ m_BackendMemoryMangers.back(), m_OptimizedNetwork->pOptimizedNetworkImpl->GetModelOptions());
}
+ m_WorkloadFactories[backendId] = std::move(workloadFactory);
}
}
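
The capability routing added above reduces to a small predicate: a backend is handed to the external memory manager only when it reports ExternallyManagedMemory and the caller opted in. A minimal standalone sketch (hypothetical helper, not part of the patch):

    #include <iostream>

    // Condenses the routing above: external memory management is used only
    // when the backend is capable AND the caller requested it via
    // m_ExternalMemoryManagementEnabled or m_AsyncEnabled.
    bool UsesExternalMemory(bool hasCapability, bool externalEnabled, bool asyncEnabled)
    {
        return hasCapability && (externalEnabled || asyncEnabled);
    }

    int main()
    {
        // Note: async networks additionally require the capability; the
        // constructor throws BackendCapabilityException when async is
        // enabled on a backend without it.
        std::cout << UsesExternalMemory(true,  false, true)   // 1: async and capable
                  << UsesExternalMemory(true,  false, false)  // 0: capable, not requested
                  << UsesExternalMemory(false, true,  false)  // 0: falls back to internal manager
                  << '\n';
    }
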
@@ -181,6 +203,7 @@ LoadedNetwork::LoadedNetwork(std::unique_ptr<IOptimizedNetwork> net,
for (auto&& layer : order)
{
auto& workloadFactory = GetWorkloadFactory(*layer);
+ bool supportsExternalManager = m_SupportsExternallyManagedMemory[layer->GetBackendId()];
switch (layer->GetType())
{
@@ -191,7 +214,12 @@ LoadedNetwork::LoadedNetwork(std::unique_ptr<IOptimizedNetwork> net,
// to false when creating TensorHandles
layer->CreateTensorHandles(m_TensorHandleFactoryRegistry,
workloadFactory,
- !m_NetworkProperties.m_ImportEnabled);
+ !supportsExternalManager && !m_NetworkProperties.m_ImportEnabled);
+ break;
+ }
+ case LayerType::Constant:
+ {
+ layer->CreateTensorHandles(m_TensorHandleFactoryRegistry, workloadFactory, true);
break;
}
default:
@@ -199,16 +227,18 @@ LoadedNetwork::LoadedNetwork(std::unique_ptr<IOptimizedNetwork> net,
// Look for a layer with 1 OutputSlot which has 1 connection and that connection is an Output Layer
// If Export is enabled disable memory management so we can export, otherwise we do a copy
if ((layer->GetNumOutputSlots() == 1) &&
- (layer->GetOutputSlots()[0].GetNumConnections() == 1) &&
- (layer->GetOutputSlots()[0].GetConnection(0)->GetOwningLayer().GetType() == LayerType::Output))
+ (layer->GetOutputSlots()[0].GetNumConnections() == 1) &&
+ (layer->GetOutputSlots()[0].GetConnection(0)->GetOwningLayer().GetType() == LayerType::Output))
{
layer->CreateTensorHandles(m_TensorHandleFactoryRegistry,
workloadFactory,
- !m_NetworkProperties.m_ExportEnabled);
+ !supportsExternalManager && !m_NetworkProperties.m_ExportEnabled);
}
else
{
- layer->CreateTensorHandles(m_TensorHandleFactoryRegistry, workloadFactory);
+ layer->CreateTensorHandles(m_TensorHandleFactoryRegistry,
+ workloadFactory,
+ !supportsExternalManager);
}
}
}
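
The third argument to CreateTensorHandles is an "is memory managed" flag, which the patch now clears whenever the backend can manage memory externally. A condensed sketch of the decision (hypothetical helper, not in the patch):

    // A handle stays internally memory-managed only when the backend cannot
    // manage it externally and no import/export path claims the tensor.
    // Constant layers bypass this and always pass true, since their storage
    // is handled separately and must persist across executions.
    bool IsMemoryManaged(bool supportsExternalManager, bool importOrExportEnabled)
    {
        return !supportsExternalManager && !importOrExportEnabled;
    }
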
@@ -251,7 +281,8 @@ LoadedNetwork::LoadedNetwork(std::unique_ptr<IOptimizedNetwork> net,
// Inputs and outputs are treated in a special way - see EnqueueInput() and EnqueueOutput().
break;
}
- default: {
+ default:
+ {
auto workload = layer->CreateWorkload(workloadFactory);
if (!workload)
@@ -272,11 +303,16 @@ LoadedNetwork::LoadedNetwork(std::unique_ptr<IOptimizedNetwork> net,
// For async networks ConstantWorkloads are managed exclusively by LoadedNetwork
// and are separated out from the other workloads
- if (networkProperties.m_AsyncEnabled && layer->GetType() == LayerType::Constant)
+ if ((networkProperties.m_AsyncEnabled || useExternalMemoryManager) &&
+ layer->GetType() == LayerType::Constant)
{
+ m_ConstantTensorHandles[layer->GetGuid()] =
+ layer->GetOutputSlot(0).GetOutputHandler().GetData();
m_ConstantWorkloads[layer->GetGuid()] = std::move(workload);
- } else {
- m_WorkloadQueue.push_back(move(workload));
+ }
+ else
+ {
+ m_WorkloadQueue.push_back(std::move(workload));
}
// release the constant data in the layer..
@@ -289,7 +325,7 @@ LoadedNetwork::LoadedNetwork(std::unique_ptr<IOptimizedNetwork> net,
for (auto&& workloadFactory : m_WorkloadFactories)
{
- workloadFactory.second.first->AfterWorkloadsCreated();
+ workloadFactory.second->AfterWorkloadsCreated();
}
if (timelineUtils)
@@ -298,28 +334,90 @@ LoadedNetwork::LoadedNetwork(std::unique_ptr<IOptimizedNetwork> net,
timelineUtils->Commit();
}
+ if (useExternalMemoryManager)
+ {
+ if (networkProperties.m_AsyncEnabled)
+ {
+ CreateMemoryProfileAsync();
+ }
+ else
+ {
+ CreateMemoryProfile();
+ }
+
+ auto backendStrategyMap = BackendRegistryInstance().GetMemoryOptimizerStrategies();
+ for (auto& backendMemoryProfile : m_MemBlockMap)
+ {
+ const BackendId& backendId = backendMemoryProfile.first;
+ if (backendStrategyMap.find(backendId) != backendStrategyMap.end())
+ {
+ m_MemBinMap[backendId] = backendStrategyMap[backendId]->Optimize(backendMemoryProfile.second);
+ }
+ else
+ {
+ m_MemBinMap[backendId] = m_ConstantStrategy->Optimize(backendMemoryProfile.second);
+ }
+ }
+
+ if (!networkProperties.m_AsyncEnabled)
+ {
+ m_ExternalMemoryManager = CreateExternalMemoryManger(m_TensorMemory);
+
+ // Sort m_TensorMemory so its order matches m_Tensorhandles
+ std::sort(m_TensorMemory.begin(), m_TensorMemory.end(),
+ [](const std::pair<std::shared_ptr<TensorMemory>, MemorySource>& lhs,
+ const std::pair<std::shared_ptr<TensorMemory>, MemorySource>& rhs)
+ {
+ return lhs.first->m_OutputSlotId < rhs.first->m_OutputSlotId;
+ });
+ }
+ }
+
+ // Now that the intermediate tensor memory has been set up,
+ // do any post-allocation configuration for each workload.
if (!networkProperties.m_AsyncEnabled)
{
- // Set up memory.
- m_OptimizedNetwork->pOptimizedNetworkImpl->GetGraph().AllocateDynamicBuffers();
+ if (useInternalMemoryManager)
+ {
+ // Set up memory.
+ m_OptimizedNetwork->pOptimizedNetworkImpl->GetGraph().AllocateDynamicBuffers();
+ }
- // Now that the intermediate tensor memory has been set-up,
- // do any post allocation configuration for each workload.
- ARMNN_SCOPED_PROFILING_EVENT(Compute::Undefined, "LoadNetwork_PostAllocationConfigure");
for (auto &workload : m_WorkloadQueue)
{
workload->PostAllocationConfigure();
}
}
- else
+
+ if (useExternalMemoryManager)
{
- AllocateAndExecuteConstantWorkloads();
+ if (!networkProperties.m_AsyncEnabled)
+ {
+ AllocateAndExecuteConstantWorkloads();
+ }
+ else
+ {
+ AllocateAndExecuteConstantWorkloadsAsync();
+ }
}
}
void LoadedNetwork::AllocateAndExecuteConstantWorkloads()
{
ARMNN_SCOPED_PROFILING_EVENT(Compute::Undefined, "LoadNetwork_AllocateAndExecuteConstants");
+ for (auto& pair : m_ConstantWorkloads)
+ {
+ auto tensorHandle = m_ConstantTensorHandles[pair.first];
+ tensorHandle->Allocate();
+ pair.second->Execute();
+ }
+}
+
+void LoadedNetwork::AllocateAndExecuteConstantWorkloadsAsync()
+{
+ ARMNN_SCOPED_PROFILING_EVENT(Compute::Undefined, "LoadNetwork_AllocateAndExecuteConstants");
Graph& order = m_OptimizedNetwork->pOptimizedNetworkImpl->GetGraph();
for (auto&& layer : order)
{
@@ -343,7 +441,6 @@ void LoadedNetwork::AllocateAndExecuteConstantWorkloads()
}
}
-
void LoadedNetwork::SendNetworkStructure()
{
ARMNN_SCOPED_PROFILING_EVENT(Compute::Undefined, "LoadNetwork_SendNetworkStructure");
@@ -429,7 +526,7 @@ const IWorkloadFactory& LoadedNetwork::GetWorkloadFactory(const Layer& layer) co
CHECK_LOCATION());
}
- workloadFactory = it->second.first.get();
+ workloadFactory = it->second.get();
ARMNN_ASSERT_MSG(workloadFactory, "No workload factory");
@@ -780,9 +877,19 @@ void LoadedNetwork::AllocateWorkingMemory(std::lock_guard<std::mutex>& lock)
{
return;
}
- for (auto&& workloadFactory : m_WorkloadFactories)
+
+ if (m_ExternalMemoryManager)
+ {
+ m_ExternalMemoryManager->Allocate();
+
+ for (unsigned int i = 0; i < m_TensorMemory.size(); ++i)
+ {
+ m_Tensorhandles[i]->Import(m_TensorMemory[i].first->m_Data, m_TensorMemory[i].second);
+ }
+ }
+
+ for (auto&& memoryManager : m_BackendMemoryMangers)
{
- IBackendInternal::IMemoryManagerSharedPtr memoryManager = workloadFactory.second.second;
if (memoryManager)
{
memoryManager->Acquire();
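
The Import loop above depends on an ordering invariant: m_Tensorhandles is filled in output-slot order during CreateMemoryProfile, while CreateExternalMemoryManger emits TensorMemory objects in bin/block order, each tagged with its slot id, so the constructor's sort of m_TensorMemory by m_OutputSlotId restores the one-to-one pairing. A toy standalone illustration (not Arm NN code):

    #include <algorithm>
    #include <cassert>
    #include <cstddef>
    #include <utility>
    #include <vector>

    int main()
    {
        // Handles are recorded in slot order 0..N-1...
        std::vector<unsigned> handleSlotIds = {0, 1, 2, 3};
        // ...but memory arrives in bin/block order, tagged with its slot id.
        std::vector<std::pair<unsigned, const char*>> memory =
            {{2, "c"}, {0, "a"}, {3, "d"}, {1, "b"}};

        std::sort(memory.begin(), memory.end(),
                  [](const auto& lhs, const auto& rhs) { return lhs.first < rhs.first; });

        // memory[i] now corresponds to handleSlotIds[i], as the Import loop assumes.
        for (std::size_t i = 0; i < handleSlotIds.size(); ++i)
        {
            assert(memory[i].first == handleSlotIds[i]);
        }
    }
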
@@ -795,14 +902,20 @@ void LoadedNetwork::AllocateWorkingMemory(std::lock_guard<std::mutex>& lock)
void LoadedNetwork::FreeWorkingMemory()
{
std::lock_guard<std::mutex> lockGuard(m_WorkingMemMutex);
+
if (!m_IsWorkingMemAllocated)
{
return;
}
- // Informs the memory managers to release memory in it's respective memory group
- for (auto&& workloadFactory : m_WorkloadFactories)
+
+ if (m_ExternalMemoryManager)
+ {
+ m_ExternalMemoryManager->Deallocate();
+ }
+
+ // Inform each memory manager to release memory in its respective memory group
+ for (auto&& memoryManager : m_BackendMemoryMangers)
{
- IBackendInternal::IMemoryManagerSharedPtr memoryManager = workloadFactory.second.second;
if (memoryManager)
{
memoryManager->Release();
@@ -1392,37 +1505,16 @@ Status LoadedNetwork::Execute(const InputTensors& inputTensors,
std::unique_ptr<IWorkingMemHandle> LoadedNetwork::CreateWorkingMemHandle(NetworkId networkId)
{
Graph& order = m_OptimizedNetwork->pOptimizedNetworkImpl->GetGraph();
- std::unordered_map<LayerGuid, std::vector<std::unique_ptr<ITensorHandle> > > tensorHandleMap;
- std::vector<WorkingMemDescriptor> workingMemDescriptors;
- std::unordered_map<LayerGuid, WorkingMemDescriptor> workingMemDescriptorMap;
- TensorHandleFactoryRegistry tensorHandleFactoryRegistry;
- WorkloadFactoryMap workloadFactoryMap;
- std::vector<std::shared_ptr<IMemoryManager>> memoryManagers;
+ // Tensors that will need to be allocated internally within armnn
+ std::vector<std::unique_ptr<ITensorHandle>> managedTensorHandles;
+ // Tensors that will be allocated externally by the user
+ std::vector<std::unique_ptr<ITensorHandle>> unmanagedTensorHandles;
- for (auto const& backend : m_Backends)
- {
- if (backend.second->SupportsTensorAllocatorAPI())
- {
- backend.second->RegisterTensorHandleFactories(
- tensorHandleFactoryRegistry,
- static_cast<MemorySourceFlags>(m_NetworkProperties.m_InputSource),
- static_cast<MemorySourceFlags>(m_NetworkProperties.m_OutputSource));
- memoryManagers.emplace_back(tensorHandleFactoryRegistry.GetMemoryManagers().back());
- }
- else
- {
- std::shared_ptr<IMemoryManager> memoryManager = backend.second->CreateMemoryManager();
- auto workloadFactory = backend.second->CreateWorkloadFactory(
- memoryManager, m_OptimizedNetwork->pOptimizedNetworkImpl->GetModelOptions());
-
- workloadFactoryMap.emplace(
- std::make_pair(backend.first, std::make_pair(std::move(workloadFactory), memoryManager)));
- memoryManagers.emplace_back(memoryManager);
- }
- }
+ std::vector<WorkingMemDescriptor> workingMemDescriptors;
+ std::unordered_map<LayerGuid, WorkingMemDescriptor> workingMemDescriptorMap;
- auto GetTensorHandle = [&](Layer* layer, const OutputSlot& outputSlot, bool isMemoryManaged)
+ auto GetTensorHandle = [&](Layer* layer, const OutputSlot& outputSlot)
{
ITensorHandleFactory::FactoryId factoryId = outputSlot.GetTensorHandleFactoryId();
const TensorInfo& tensorInfo = outputSlot.GetTensorInfo();
@@ -1431,28 +1523,30 @@ std::unique_ptr<IWorkingMemHandle> LoadedNetwork::CreateWorkingMemHandle(Network
{
BackendId id = layer->GetBackendId();
ARMNN_NO_DEPRECATE_WARN_BEGIN
- return workloadFactoryMap.at(id).first->CreateTensorHandle(tensorInfo, isMemoryManaged);
+ return m_WorkloadFactories.at(id)->CreateTensorHandle(tensorInfo, false);
ARMNN_NO_DEPRECATE_WARN_END
}
else
{
- ITensorHandleFactory* handleFactory = tensorHandleFactoryRegistry.GetFactory(factoryId);
+ ITensorHandleFactory* handleFactory = m_TensorHandleFactoryRegistry.GetFactory(factoryId);
ARMNN_ASSERT(handleFactory);
- return handleFactory->CreateTensorHandle(tensorInfo, isMemoryManaged);
+ return handleFactory->CreateTensorHandle(tensorInfo, false);
}
};
struct HandleInfo
{
- unsigned int m_ReferenceCount = 0;
- bool isInputLayerHandle = false;
- bool isOutputLayerHandle = false;
+ ITensorHandle* m_TensorHandle;
+
+ bool m_IsInputLayerHandle = false;
+ bool m_IsOutputLayerHandle = false;
WorkingMemHandle::InputMemDescriptorCoords m_InputMemDescriptorCoords;
WorkingMemHandle::OutputMemDescriptorCoords m_OutputMemDescriptorCoords;
};
- std::unordered_map<const ITensorHandle*, HandleInfo> handleReferenceCounts;
+ std::unordered_map<const OutputSlot*, HandleInfo> outputToHandleInfoMap;
+
unsigned int layerIndex = 0;
for (auto&& layer : order)
{
@@ -1508,27 +1602,33 @@ std::unique_ptr<IWorkingMemHandle> LoadedNetwork::CreateWorkingMemHandle(Network
}
}
- tensorHandleMap[layer->GetGuid()].emplace_back(GetTensorHandle(layer, slot, isMemoryManaged));
- ITensorHandle* tensorHandle = tensorHandleMap[layer->GetGuid()].back().get();
+ ITensorHandle* tensorHandle;
+ if (isMemoryManaged)
+ {
+ managedTensorHandles.emplace_back(GetTensorHandle(layer, slot));
+ tensorHandle = managedTensorHandles.back().get();
+ }
+ else
+ {
+ unmanagedTensorHandles.emplace_back(GetTensorHandle(layer, slot));
+ tensorHandle = unmanagedTensorHandles.back().get();
+ }
workingMemDescriptor.m_Outputs.push_back(tensorHandle);
- tensorHandle->Manage();
- unsigned int numConnections = slot.GetNumConnections();
- ARMNN_ASSERT(numConnections != 0);
- HandleInfo& handleInfo = handleReferenceCounts[tensorHandle];
- handleInfo.m_ReferenceCount = numConnections;
+ HandleInfo& handleInfo = outputToHandleInfoMap[&slot];
+ handleInfo.m_TensorHandle = tensorHandle;
// Store the coordinates of the current layer's OutputSlot that is connected to the OutputLayer
if (isConnectedToOutputLayer)
{
- handleInfo.isOutputLayerHandle = true;
+ handleInfo.m_IsOutputLayerHandle = true;
handleInfo.m_OutputMemDescriptorCoords.m_OutputSlotCoords = {layerIndex, slotIndex};
}
// Store the LayerBindingId of the InputLayer
if (isInputLayer)
{
- handleInfo.isInputLayerHandle = true;
+ handleInfo.m_IsInputLayerHandle = true;
LayerBindingId bindingId = static_cast<BindableLayer*>(layer)->GetBindingId();
handleInfo.m_InputMemDescriptorCoords.m_LayerBindingId = bindingId;
}
@@ -1557,20 +1657,19 @@ std::unique_ptr<IWorkingMemHandle> LoadedNetwork::CreateWorkingMemHandle(Network
{
LayerBindingId bindingId = static_cast<BindableLayer*>(layer)->GetBindingId();
- HandleInfo& handleInfo = handleReferenceCounts[tensorHandle];
- handleInfo.isOutputLayerHandle = true;
+ HandleInfo& handleInfo = outputToHandleInfoMap[outputSlot];
+ handleInfo.m_TensorHandle = tensorHandle;
+ handleInfo.m_IsOutputLayerHandle = true;
handleInfo.m_OutputMemDescriptorCoords.m_LayerBindingIds.push_back(bindingId);
handleInfo.m_OutputMemDescriptorCoords.m_InputSlotCoords.push_back({layerIndex, 0});
}
continue;
}
- auto search = tensorHandleMap.find(key);
- unsigned int index = outputSlot->CalculateIndexOnOwner();
- ITensorHandle* inputTensorHandle = search->second[index].get();
- workingMemDescriptor.m_Inputs.push_back(inputTensorHandle);
+ HandleInfo& handleInfo = outputToHandleInfoMap.at(outputSlot);
- HandleInfo& handleInfo = handleReferenceCounts.at(inputTensorHandle);
+ ITensorHandle* inputTensorHandle = handleInfo.m_TensorHandle;
+ workingMemDescriptor.m_Inputs.push_back(inputTensorHandle);
// Store the LayerBindingId of the OutputLayer
if (isOutputLayer)
@@ -1581,25 +1680,18 @@ std::unique_ptr<IWorkingMemHandle> LoadedNetwork::CreateWorkingMemHandle(Network
}
// In this case the layer is not an Output Layer but shares its input tensorhandle with an OutputLayer
// It will need to be updated as well, if we swap out the tensorhandle
- else if (handleInfo.isOutputLayerHandle)
+ else if (handleInfo.m_IsOutputLayerHandle)
{
handleInfo.m_OutputMemDescriptorCoords.m_InputSlotCoords.push_back({layerIndex, slot.GetSlotIndex()});
}
// Store the coordinates of the InputSlots connected to the InputLayer
// There can be more than one InputSlot connected to an InputLayer, so we use a vector
- if (handleInfo.isInputLayerHandle)
+ if (handleInfo.m_IsInputLayerHandle)
{
std::pair<LayerGuid, unsigned int> connectionLocation{layerIndex, slot.GetSlotIndex()};
handleInfo.m_InputMemDescriptorCoords.m_InputSlotCoords.emplace_back(connectionLocation);
}
-
- --handleInfo.m_ReferenceCount;
- if (handleInfo.m_ReferenceCount == 0u)
- {
- // Stop managing lifetime of tensor handle
- inputTensorHandle->Allocate();
- }
}
workingMemDescriptorMap.insert({layer->GetGuid(), workingMemDescriptor});
@@ -1612,17 +1704,29 @@ std::unique_ptr<IWorkingMemHandle> LoadedNetwork::CreateWorkingMemHandle(Network
}
}
+ std::vector<std::pair<std::shared_ptr<TensorMemory>, MemorySource>> tensorMemory;
+
+ auto externalMemoryManager = CreateExternalMemoryManger(tensorMemory);
+
+ // Sort tensorMemory so its order matches the outputSlot order
+ std::sort(tensorMemory.begin(), tensorMemory.end(),
+ [](const std::pair<std::shared_ptr<TensorMemory>, MemorySource>& lhs,
+ const std::pair<std::shared_ptr<TensorMemory>, MemorySource>& rhs)
+ {
+ return lhs.first->m_OutputSlotId < rhs.first->m_OutputSlotId;
+ });
+
std::vector<WorkingMemHandle::InputMemDescriptorCoords> inputConnectionsInfo;
std::vector<WorkingMemHandle::OutputMemDescriptorCoords> outputConnectionsInfo;
- for (const auto& handleInfo: handleReferenceCounts)
+ for (const auto& handleInfo: outputToHandleInfoMap)
{
- if (handleInfo.second.isOutputLayerHandle)
+ if (handleInfo.second.m_IsOutputLayerHandle)
{
outputConnectionsInfo.emplace_back(handleInfo.second.m_OutputMemDescriptorCoords);
}
- if (handleInfo.second.isInputLayerHandle)
+ if (handleInfo.second.m_IsInputLayerHandle)
{
inputConnectionsInfo.emplace_back(handleInfo.second.m_InputMemDescriptorCoords);
}
@@ -1633,8 +1737,10 @@ std::unique_ptr<IWorkingMemHandle> LoadedNetwork::CreateWorkingMemHandle(Network
outputConnectionsInfo,
workingMemDescriptors,
workingMemDescriptorMap,
- memoryManagers,
- std::move(tensorHandleMap));
+ std::move(externalMemoryManager),
+ std::move(tensorMemory),
+ std::move(managedTensorHandles),
+ std::move(unmanagedTensorHandles));
}
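
For context, the typical call pattern this function serves, as a sketch assuming the experimental async API in armnn/IRuntime.hpp and an already-loaded network with async-enabled network properties (error handling omitted):

    #include <armnn/IRuntime.hpp>

    void RunAsync(armnn::IRuntime* runtime, armnn::NetworkId netId,
                  const armnn::InputTensors& inputs, const armnn::OutputTensors& outputs)
    {
        // One working-memory handle per thread: each handle carries a private
        // set of the managed/unmanaged intermediate tensors built above.
        auto handle = runtime->CreateWorkingMemHandle(netId);
        runtime->Execute(*handle, inputs, outputs);
    }
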
void LoadedNetwork::RegisterDebugCallback(const DebugCallbackFunction& func)
@@ -1645,6 +1751,312 @@ void LoadedNetwork::RegisterDebugCallback(const DebugCallbackFunction& func)
}
}
+
+void LoadedNetwork::CreateMemoryProfileAsync()
+{
+ struct PartialBlock
+ {
+ unsigned int m_StartOfLife;
+ unsigned int m_Lifetime;
+
+ size_t m_MemSize;
+ unsigned int m_Index;
+
+ BackendId m_BackendId;
+ };
+
+ auto align = [](size_t numToAlign)
+ {
+ const size_t alignment = sizeof(float);
+ return ((numToAlign + alignment - 1) / alignment) * alignment;
+ };
+
+ std::unordered_map<const OutputSlot*, PartialBlock> memBlockTrackerMap;
+
+ const bool inputImportingEnabled = m_NetworkProperties.m_InputSource != MemorySource::Undefined;
+ const bool outputImportingEnabled = m_NetworkProperties.m_OutputSource != MemorySource::Undefined;
+
+ unsigned int timestep = 0;
+ unsigned int outputIndex = 0;
+ Graph& order = m_OptimizedNetwork->pOptimizedNetworkImpl->GetGraph().TopologicalSort();
+
+ for (auto&& layer : order)
+ {
+ const LayerType& layerType = layer->GetType();
+ // Don't manage memory if importing.
+ if (layerType == LayerType::Input && inputImportingEnabled)
+ {
+ continue;
+ }
+ // Don't manage memory if importing.
+ if (layerType == LayerType::Output && outputImportingEnabled
+ && layer->GetInputSlot(0).GetConnectedOutputSlot()->GetNumConnections() == 1)
+ {
+ continue;
+ }
+ // Because constant layer memory cannot be shared, it must persist for the lifetime of execution;
+ // its management is handled separately.
+ if (layerType == LayerType::Constant)
+ {
+ continue;
+ }
+
+ BackendId backendId = layer->GetBackendId();
+ for (auto& outputSlot : layer->GetOutputSlots())
+ {
+ if (!m_SupportsExternallyManagedMemory[backendId])
+ {
+ continue;
+ }
+
+ PartialBlock partialBlock;
+
+ partialBlock.m_StartOfLife = timestep;
+
+ size_t alignedSize = align(outputSlot.GetOutputHandler().GetTensorInfo().GetNumBytes());
+ partialBlock.m_MemSize = alignedSize;
+ partialBlock.m_Index = outputIndex++;
+ partialBlock.m_Lifetime = outputSlot.GetNumConnections();
+ partialBlock.m_BackendId = backendId;
+
+ if (partialBlock.m_Lifetime == 0)
+ {
+ m_MemBlockMap[partialBlock.m_BackendId].emplace_back(partialBlock.m_StartOfLife,
+ partialBlock.m_StartOfLife,
+ partialBlock.m_MemSize,
+ 0,
+ partialBlock.m_Index);
+ }
+ else
+ {
+ memBlockTrackerMap[&outputSlot] = partialBlock;
+ }
+ }
+
+ for (auto& inputSlot : layer->GetInputSlots())
+ {
+ const Layer& connectedInputLayer = inputSlot.GetConnectedOutputSlot()->GetOwningLayer();
+ const LayerType& owningLayerType = connectedInputLayer.GetType();
+
+ if (owningLayerType == LayerType::Constant)
+ {
+ continue;
+ }
+ if (inputImportingEnabled && owningLayerType == LayerType::Input)
+ {
+ continue;
+ }
+
+ auto outputSlot = inputSlot.GetConnectedOutputSlot();
+
+ PartialBlock& partialBlock = memBlockTrackerMap.at(outputSlot);
+
+ auto& lifetime = partialBlock.m_Lifetime;
+ --lifetime;
+
+ if (lifetime == 0)
+ {
+ m_MemBlockMap[partialBlock.m_BackendId].emplace_back(partialBlock.m_StartOfLife,
+ timestep,
+ partialBlock.m_MemSize,
+ 0,
+ partialBlock.m_Index);
+ }
+ }
+ ++timestep;
+ }
+}
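
The profile builder is a single-pass liveness analysis over the topological order: each output slot opens a block whose lifetime equals its connection count, each consuming input slot decrements it, and the block's interval closes at the timestep of its last consumer (zero-connection outputs close immediately). A toy standalone trace (not Arm NN code):

    #include <iostream>
    #include <map>
    #include <utility>
    #include <vector>

    int main()
    {
        struct Block { unsigned start; unsigned lifetime; };

        // Graph A -> B and A -> C, visited in topological order A(0), B(1), C(2).
        std::map<char, Block> live;
        std::vector<std::pair<unsigned, unsigned>> intervals; // (startOfLife, endOfLife)

        live['A'] = {0, 2};             // A's output slot has two connections
        for (unsigned t : {1u, 2u})     // B, then C, consume A's output
        {
            if (--live['A'].lifetime == 0)
            {
                intervals.emplace_back(live['A'].start, t);
            }
        }
        std::cout << intervals[0].first << '-' << intervals[0].second << '\n'; // prints "0-2"
    }
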
+
+void LoadedNetwork::CreateMemoryProfile()
+{
+ // Finds the first TensorHandle ancestor of a SubTensorHandle. If the ITensorHandle provided
+ // is already a TensorHandle, the function returns it unchanged.
+ auto TraceSubTensorHandleAncestry = [](ITensorHandle* const subTensorHandle)
+ {
+ ITensorHandle* ancestor = subTensorHandle;
+ while (ancestor && ancestor->GetParent())
+ {
+ ancestor = ancestor->GetParent();
+ }
+ return ancestor;
+ };
+
+ struct PartialBlock
+ {
+ unsigned int m_StartOfLife;
+ unsigned int m_Lifetime;
+
+ size_t m_MemSize;
+ unsigned int m_Index;
+
+ BackendId m_BackendId;
+ };
+
+ auto align = [](size_t numToAlign)
+ {
+ const size_t alignment = sizeof(float);
+ return ((numToAlign + alignment - 1) / alignment) * alignment;
+ };
+
+ std::unordered_map<ITensorHandle*, PartialBlock> memBlockTrackerMap;
+
+ const bool inputImportingEnabled = m_NetworkProperties.m_InputSource != MemorySource::Undefined;
+ const bool outputImportingEnabled = m_NetworkProperties.m_OutputSource != MemorySource::Undefined;
+
+ unsigned int timestep = 0;
+ unsigned int outputIndex = 0;
+ Graph& order = m_OptimizedNetwork->pOptimizedNetworkImpl->GetGraph().TopologicalSort();
+
+ for (auto&& layer : order)
+ {
+ const LayerType& layerType = layer->GetType();
+ // Don't manage memory if importing.
+ if (layerType == LayerType::Input && inputImportingEnabled)
+ {
+ continue;
+ }
+ // Don't manage memory if importing.
+ if (layerType == LayerType::Output && outputImportingEnabled
+ && layer->GetInputSlot(0).GetConnectedOutputSlot()->GetNumConnections() == 1)
+ {
+ continue;
+ }
+ // Because constant layer memory cannot be shared, it must persist for the lifetime of execution;
+ // its management is handled separately.
+ if (layerType == LayerType::Constant)
+ {
+ continue;
+ }
+
+ BackendId backendId = layer->GetBackendId();
+ for (auto& outputSlot : layer->GetOutputSlots())
+ {
+ if (!m_SupportsExternallyManagedMemory[backendId])
+ {
+ continue;
+ }
+
+ ITensorHandle* tensorHandle = outputSlot.GetOutputHandler().GetData();
+ tensorHandle = TraceSubTensorHandleAncestry(tensorHandle);
+
+ if (memBlockTrackerMap.find(tensorHandle) == memBlockTrackerMap.end())
+ {
+ PartialBlock partialBlock;
+
+ partialBlock.m_StartOfLife = timestep;
+
+ size_t alignedSize = align(outputSlot.GetOutputHandler().GetTensorInfo().GetNumBytes());
+ partialBlock.m_MemSize = alignedSize;
+ partialBlock.m_Index = outputIndex++;
+ partialBlock.m_Lifetime = outputSlot.GetNumConnections();
+ partialBlock.m_BackendId = backendId;
+
+ if (partialBlock.m_Lifetime == 0)
+ {
+ m_MemBlockMap[partialBlock.m_BackendId].emplace_back(partialBlock.m_StartOfLife,
+ partialBlock.m_StartOfLife,
+ partialBlock.m_MemSize,
+ 0,
+ partialBlock.m_Index);
+ }
+ else
+ {
+ memBlockTrackerMap[tensorHandle] = partialBlock;
+ }
+ m_Tensorhandles.push_back(tensorHandle);
+
+ }
+ else
+ {
+ memBlockTrackerMap.at(tensorHandle).m_Lifetime += outputSlot.GetNumConnections();
+ }
+ }
+
+ for (auto& inputSlot : layer->GetInputSlots())
+ {
+ const Layer& connectedInputLayer = inputSlot.GetConnectedOutputSlot()->GetOwningLayer();
+ const LayerType& owningLayerType = connectedInputLayer.GetType();
+
+ if (owningLayerType == LayerType::Constant)
+ {
+ continue;
+ }
+ if (inputImportingEnabled && owningLayerType == LayerType::Input)
+ {
+ continue;
+ }
+ if (!m_SupportsExternallyManagedMemory[connectedInputLayer.GetBackendId()])
+ {
+ continue;
+ }
+
+ auto outputSlot = inputSlot.GetConnectedOutputSlot();
+
+ ITensorHandle* tensorHandle = outputSlot->GetOutputHandler().GetData();
+ tensorHandle = TraceSubTensorHandleAncestry(tensorHandle);
+
+ PartialBlock& partialBlock = memBlockTrackerMap.at(tensorHandle);
+
+ auto& lifetime = partialBlock.m_Lifetime;
+ --lifetime;
+
+ if (lifetime == 0)
+ {
+ m_MemBlockMap[partialBlock.m_BackendId].emplace_back(partialBlock.m_StartOfLife,
+ timestep,
+ partialBlock.m_MemSize,
+ 0,
+ partialBlock.m_Index);
+ }
+ }
+ ++timestep;
+ }
+}
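
Both profile builders round every tensor to a sizeof(float) boundary before recording its block, and this synchronous variant additionally keys blocks by the root TensorHandle so sub-tensors sharing storage accumulate into a single lifetime. The rounding is plain round-up-to-a-multiple arithmetic, reproduced standalone:

    #include <cstddef>
    #include <iostream>

    // Copy of the align lambda used by both builders: rounds a byte count
    // up to the next multiple of sizeof(float).
    std::size_t Align(std::size_t numToAlign)
    {
        const std::size_t alignment = sizeof(float); // 4 on common platforms
        return ((numToAlign + alignment - 1) / alignment) * alignment;
    }

    int main()
    {
        std::cout << Align(10) << ' ' << Align(12) << '\n'; // prints "12 12"
    }
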
+
+std::unique_ptr<MemoryManager> LoadedNetwork::CreateExternalMemoryManger(
+ std::vector<std::pair<std::shared_ptr<TensorMemory>, MemorySource>>& tensorMemoryVec)
+{
+ std::unique_ptr<MemoryManager> memoryManager = std::make_unique<MemoryManager>();
+ auto allocatorMap = BackendRegistryInstance().GetAllocators();
+
+ for (auto& backend : m_MemBinMap)
+ {
+ std::vector<BufferStorage> bufferStorageVec;
+
+ std::shared_ptr<ICustomAllocator> backendAllocator;
+ if (allocatorMap.find(backend.first) != allocatorMap.end())
+ {
+ backendAllocator = allocatorMap[backend.first];
+ }
+ else
+ {
+ backendAllocator = m_Backends[backend.first]->GetDefaultAllocator();
+ }
+
+ for (auto& memBin : backend.second)
+ {
+ BufferStorage bufferStorage;
+ bufferStorage.m_BufferSize = memBin.m_MemSize;
+ bufferStorage.m_TensorMemoryVector.reserve(memBin.m_MemBlocks.size());
+
+ for (auto& memBlock : memBin.m_MemBlocks)
+ {
+ auto tensorMemory = std::make_shared<TensorMemory>(TensorMemory{memBlock.m_Offset, memBlock.m_Index});
+
+ tensorMemoryVec.emplace_back(tensorMemory, backendAllocator->GetMemorySourceType());
+ bufferStorage.m_TensorMemoryVector.emplace_back(tensorMemory);
+ }
+
+ bufferStorageVec.emplace_back(std::move(bufferStorage));
+ }
+
+ memoryManager->StoreMemToAllocate(bufferStorageVec, backendAllocator, 4);
+ }
+
+ return memoryManager;
+}
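
Each MemBin becomes one BufferStorage (a single backing allocation of m_MemSize bytes) and each MemBlock inside it becomes a TensorMemory carrying an offset into that buffer; the trailing 4 passed to StoreMemToAllocate appears to be the type alignment for the allocation. Once AllocateWorkingMemory() calls m_ExternalMemoryManager->Allocate(), each TensorMemory::m_Data resolves to buffer base plus offset and is Import()ed into its matching handle. A toy model of the layout (not Arm NN code; field names mirror the patch):

    #include <cstddef>
    #include <cstdint>
    #include <iostream>
    #include <memory>
    #include <vector>

    struct TensorMemory
    {
        std::size_t m_Offset;
        unsigned    m_OutputSlotId;
        void*       m_Data;
    };

    int main()
    {
        const std::size_t binSize = 96;  // MemBin::m_MemSize
        std::vector<TensorMemory> blocks = {{0, 3, nullptr}, {32, 7, nullptr}};

        // The "Allocate()" step: one allocation backs the whole bin...
        auto buffer = std::make_unique<std::uint8_t[]>(binSize);
        // ...and each block resolves to an offset into it (the Import() target).
        for (auto& tm : blocks)
        {
            tm.m_Data = buffer.get() + tm.m_Offset;
        }
        std::cout << static_cast<std::uint8_t*>(blocks[1].m_Data) - buffer.get()
                  << '\n'; // prints "32"
    }
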
+
LayerBindingId LoadedNetwork::ValidateImportedInputID(ImportedInputId id)
{
try