author     Finn Williams <finn.williams@arm.com>   2021-10-28 19:07:32 +0100
committer  Finn Williams <finn.williams@arm.com>   2021-11-08 14:33:17 +0000
commit     b1aad4270fa8ad5c4aa62e27d564baf723b2cee5 (patch)
tree       98b19ba85b50e2c730d5d2e3822cd2b1438bd149
parent     3f22d27f51c493e37b9da0692b6bf776f4430dcf (diff)
download   armnn-b1aad4270fa8ad5c4aa62e27d564baf723b2cee5.tar.gz
IVGCVSW-6527 Support the new memory API in loaded network
* Enable external memory management for the Neon and Ref backends
* Change m_TensorMemoryVector to hold shared pointers
* Change the input layer's backend Id to match the backend Id of the connected layer

Signed-off-by: Finn Williams <finn.williams@arm.com>
Change-Id: I2216a724028312eb101b290df3f224177826b1a0
-rw-r--r--  include/armnn/IRuntime.hpp                                                                        14
-rw-r--r--  include/armnn/backends/IMemoryOptimizerStrategy.hpp                                                4
-rw-r--r--  src/armnn/LoadedNetwork.cpp                                                                       610
-rw-r--r--  src/armnn/LoadedNetwork.hpp                                                                        38
-rw-r--r--  src/armnn/Network.cpp                                                                              16
-rw-r--r--  src/armnn/WorkingMemHandle.cpp                                                                     28
-rw-r--r--  src/armnn/WorkingMemHandle.hpp                                                                     25
-rw-r--r--  src/armnn/test/OptimizerTests.cpp                                                                   3
-rw-r--r--  src/backends/backendsCommon/DefaultAllocator.hpp                                                    4
-rw-r--r--  src/backends/backendsCommon/MemoryManager.cpp                                                       2
-rw-r--r--  src/backends/backendsCommon/MemoryManager.hpp                                                      11
-rw-r--r--  src/backends/backendsCommon/common.mk                                                               7
-rw-r--r--  src/backends/backendsCommon/memoryOptimizerStrategyLibrary/strategies/SingleAxisPriorityList.cpp    4
-rw-r--r--  src/backends/backendsCommon/test/CompatibilityTests.cpp                                             6
-rw-r--r--  src/backends/backendsCommon/test/MemoryManagerTests.cpp                                            40
-rw-r--r--  src/backends/backendsCommon/test/OptimizedNetworkTests.cpp                                          4
-rw-r--r--  src/backends/cl/ClBackend.hpp                                                                       2
-rw-r--r--  src/backends/neon/NeonBackend.hpp                                                                   4
-rw-r--r--  src/backends/neon/NeonTensorHandle.hpp                                                             12
-rw-r--r--  src/backends/reference/RefBackend.hpp                                                               2
-rw-r--r--  src/backends/reference/RefTensorHandle.cpp                                                          2
-rw-r--r--  src/backends/reference/RefWorkloadFactory.cpp                                                      22
22 files changed, 677 insertions, 183 deletions
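
The headline API change is the new m_ExternalMemoryManagementEnabled flag on INetworkProperties. Below is a minimal sketch of how a caller might opt in when loading a network; runtime creation and network optimisation are elided and assumed to have happened already, so 'runtime' and 'optimizedNetwork' are placeholders:

    #include <armnn/IRuntime.hpp>

    // Sketch only: enable the externally managed intermediate memory added by this patch.
    armnn::INetworkProperties networkProperties(
        /*asyncEnabled=*/false,
        armnn::MemorySource::Undefined,            // no input import
        armnn::MemorySource::Undefined,            // no output export
        /*profilingEnabled=*/false,
        armnn::ProfilingDetailsMethod::Undefined,
        /*externalMemoryManagementEnabled=*/true); // new parameter introduced below

    armnn::NetworkId networkId;
    std::string errorMessage;
    runtime->LoadNetwork(networkId, std::move(optimizedNetwork), errorMessage, networkProperties);

As the LoadedNetwork changes below show, asynchronous networks now require every backend to advertise the ExternallyManagedMemory capability, while for synchronous networks the flag above is what switches the new path on.
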
diff --git a/include/armnn/IRuntime.hpp b/include/armnn/IRuntime.hpp
index 93f8b0fd5b..bdfd9b224b 100644
--- a/include/armnn/IRuntime.hpp
+++ b/include/armnn/IRuntime.hpp
@@ -43,7 +43,8 @@ struct INetworkProperties
m_ProfilingEnabled(profilingEnabled),
m_OutputNetworkDetailsMethod(ProfilingDetailsMethod::Undefined),
m_InputSource(m_ImportEnabled ? MemorySource::Malloc : MemorySource::Undefined),
- m_OutputSource(m_ExportEnabled ? MemorySource::Malloc : MemorySource::Undefined)
+ m_OutputSource(m_ExportEnabled ? MemorySource::Malloc : MemorySource::Undefined),
+ m_ExternalMemoryManagementEnabled(false)
{}
ARMNN_DEPRECATED_MSG_REMOVAL_DATE("Please use INetworkProperties constructor without numThreads argument", "22.02")
@@ -58,7 +59,8 @@ struct INetworkProperties
m_ProfilingEnabled(profilingEnabled),
m_OutputNetworkDetailsMethod(ProfilingDetailsMethod::Undefined),
m_InputSource(inputSource),
- m_OutputSource(outputSource)
+ m_OutputSource(outputSource),
+ m_ExternalMemoryManagementEnabled(false)
{
armnn::IgnoreUnused(numThreads);
}
@@ -67,14 +69,16 @@ struct INetworkProperties
MemorySource inputSource,
MemorySource outputSource,
bool profilingEnabled = false,
- ProfilingDetailsMethod detailsMethod = ProfilingDetailsMethod::Undefined)
+ ProfilingDetailsMethod detailsMethod = ProfilingDetailsMethod::Undefined,
+ bool externalMemoryManagementEnabled = false)
: m_ImportEnabled(inputSource != MemorySource::Undefined),
m_ExportEnabled(outputSource != MemorySource::Undefined),
m_AsyncEnabled(asyncEnabled),
m_ProfilingEnabled(profilingEnabled),
m_OutputNetworkDetailsMethod(detailsMethod),
m_InputSource(inputSource),
- m_OutputSource(outputSource)
+ m_OutputSource(outputSource),
+ m_ExternalMemoryManagementEnabled(externalMemoryManagementEnabled)
{}
/// Deprecated and will be removed in future release.
@@ -91,6 +95,8 @@ struct INetworkProperties
const MemorySource m_InputSource;
const MemorySource m_OutputSource;
+ const bool m_ExternalMemoryManagementEnabled;
+
virtual ~INetworkProperties() {}
};
diff --git a/include/armnn/backends/IMemoryOptimizerStrategy.hpp b/include/armnn/backends/IMemoryOptimizerStrategy.hpp
index ad5513f8a3..bdb2f5bd30 100644
--- a/include/armnn/backends/IMemoryOptimizerStrategy.hpp
+++ b/include/armnn/backends/IMemoryOptimizerStrategy.hpp
@@ -19,8 +19,8 @@ struct MemBlock
const unsigned int index)
: m_StartOfLife(startOfLife), m_EndOfLife(endOfLife), m_MemSize(memSize), m_Offset(offset), m_Index(index) {}
- const unsigned int m_StartOfLife; // Y start
- const unsigned int m_EndOfLife; // Y end
+ const unsigned int m_StartOfLife; // Y start inclusive
+ const unsigned int m_EndOfLife; // Y end inclusive
const size_t m_MemSize; // Offset + Memsize = X end
size_t m_Offset; // X start
diff --git a/src/armnn/LoadedNetwork.cpp b/src/armnn/LoadedNetwork.cpp
index 7fb14d0f32..03e5ad5bfe 100644
--- a/src/armnn/LoadedNetwork.cpp
+++ b/src/armnn/LoadedNetwork.cpp
@@ -131,11 +131,14 @@ LoadedNetwork::LoadedNetwork(std::unique_ptr<IOptimizedNetwork> net,
profiler->EnableNetworkDetailsToStdOut(networkProperties.m_OutputNetworkDetailsMethod);
- Graph& order = m_OptimizedNetwork->pOptimizedNetworkImpl->GetGraph().TopologicalSort();
//First create tensor handlers, backends and workload factories.
//Handlers are created before workloads are.
//Because workload creation can modify some of the handlers,
//(for example the splitter and concat layers).
+
+ bool useExternalMemoryManager = false;
+ bool useInternalMemoryManager = false;
+ Graph& order = m_OptimizedNetwork->pOptimizedNetworkImpl->GetGraph().TopologicalSort();
for (auto&& layer : order)
{
auto const& backendId = layer->GetBackendId();
@@ -154,25 +157,44 @@ LoadedNetwork::LoadedNetwork(std::unique_ptr<IOptimizedNetwork> net,
throw BackendCapabilityException(er);
}
+ if (networkProperties.m_AsyncEnabled &&
+ !HasCapability(BackendOptions::BackendOption{"ExternallyManagedMemory", true},
+ backend->GetCapabilities()))
+ {
+ std::string er = backend->GetId();
+ er += " does not support ExternallyManagedMemory\n";
+ er += "AsyncEnabled networks require all backends to support ExternallyManagedMemory";
+ throw BackendCapabilityException(er);
+ }
+
+ if (HasCapability(BackendOptions::BackendOption{"ExternallyManagedMemory", true},backend->GetCapabilities())
+ && (m_NetworkProperties.m_ExternalMemoryManagementEnabled || m_NetworkProperties.m_AsyncEnabled))
+ {
+ m_SupportsExternallyManagedMemory[backend->GetId()] = true;
+ useExternalMemoryManager = true;
+ }
+ else
+ {
+ m_SupportsExternallyManagedMemory[backend->GetId()] = false;
+ useInternalMemoryManager = true;
+ }
+
+ IBackendInternal::IWorkloadFactoryPtr workloadFactory;
if (backend->SupportsTensorAllocatorAPI())
{
- auto workloadFactory = backend->CreateWorkloadFactory(
+ workloadFactory = backend->CreateWorkloadFactory(
m_TensorHandleFactoryRegistry,
m_OptimizedNetwork->pOptimizedNetworkImpl->GetModelOptions(),
static_cast<MemorySourceFlags>(m_NetworkProperties.m_InputSource),
static_cast<MemorySourceFlags>(m_NetworkProperties.m_OutputSource));
- m_WorkloadFactories.emplace(
- std::make_pair(backendId, std::make_pair(std::move(workloadFactory), nullptr)));
}
else
{
- IBackendInternal::IMemoryManagerSharedPtr memoryManager = backend->CreateMemoryManager();
- auto workloadFactory = backend->CreateWorkloadFactory(
- memoryManager, m_OptimizedNetwork->pOptimizedNetworkImpl->GetModelOptions());
-
- m_WorkloadFactories.emplace(
- std::make_pair(backendId, std::make_pair(std::move(workloadFactory), memoryManager)));
+ m_BackendMemoryMangers.emplace_back(backend->CreateMemoryManager());
+ workloadFactory = backend->CreateWorkloadFactory(
+ m_BackendMemoryMangers.back(), m_OptimizedNetwork->pOptimizedNetworkImpl->GetModelOptions());
}
+ m_WorkloadFactories[backendId ] = std::move(workloadFactory);
}
}
@@ -181,6 +203,7 @@ LoadedNetwork::LoadedNetwork(std::unique_ptr<IOptimizedNetwork> net,
for (auto&& layer : order)
{
auto& workloadFactory = GetWorkloadFactory(*layer);
+ bool supportsExternalManager = m_SupportsExternallyManagedMemory[layer->GetBackendId()];
switch (layer->GetType())
{
@@ -191,7 +214,12 @@ LoadedNetwork::LoadedNetwork(std::unique_ptr<IOptimizedNetwork> net,
// to false when creating TensorHandles
layer->CreateTensorHandles(m_TensorHandleFactoryRegistry,
workloadFactory,
- !m_NetworkProperties.m_ImportEnabled);
+ !supportsExternalManager && !m_NetworkProperties.m_ImportEnabled);
+ break;
+ }
+ case LayerType::Constant:
+ {
+ layer->CreateTensorHandles(m_TensorHandleFactoryRegistry, workloadFactory, true);
break;
}
default:
@@ -199,16 +227,18 @@ LoadedNetwork::LoadedNetwork(std::unique_ptr<IOptimizedNetwork> net,
// Look for a layer with 1 OutputSlot which has 1 connection and that connection is an Output Layer
// If Export is enabled disable memory management so we can export, otherwise we do a copy
if ((layer->GetNumOutputSlots() == 1) &&
- (layer->GetOutputSlots()[0].GetNumConnections() == 1) &&
- (layer->GetOutputSlots()[0].GetConnection(0)->GetOwningLayer().GetType() == LayerType::Output))
+ (layer->GetOutputSlots()[0].GetNumConnections() == 1) &&
+ (layer->GetOutputSlots()[0].GetConnection(0)->GetOwningLayer().GetType() == LayerType::Output))
{
layer->CreateTensorHandles(m_TensorHandleFactoryRegistry,
workloadFactory,
- !m_NetworkProperties.m_ExportEnabled);
+ !supportsExternalManager && !m_NetworkProperties.m_ExportEnabled);
}
else
{
- layer->CreateTensorHandles(m_TensorHandleFactoryRegistry, workloadFactory);
+ layer->CreateTensorHandles(m_TensorHandleFactoryRegistry,
+ workloadFactory,
+ !supportsExternalManager);
}
}
}
@@ -251,7 +281,8 @@ LoadedNetwork::LoadedNetwork(std::unique_ptr<IOptimizedNetwork> net,
// Inputs and outputs are treated in a special way - see EnqueueInput() and EnqueueOutput().
break;
}
- default: {
+ default:
+ {
auto workload = layer->CreateWorkload(workloadFactory);
if (!workload)
@@ -272,11 +303,16 @@ LoadedNetwork::LoadedNetwork(std::unique_ptr<IOptimizedNetwork> net,
// For async networks ConstantWorkloads are managed exclusively by LoadedNetwork
// and are separated out from the other workloads
- if (networkProperties.m_AsyncEnabled && layer->GetType() == LayerType::Constant)
+ if((networkProperties.m_AsyncEnabled || useExternalMemoryManager) &&
+ layer->GetType() == LayerType::Constant)
{
+ m_ConstantTensorHandles[layer->GetGuid()] =
+ layer->GetOutputSlot(0).GetOutputHandler().GetData();
m_ConstantWorkloads[layer->GetGuid()] = std::move(workload);
- } else {
- m_WorkloadQueue.push_back(move(workload));
+ }
+ else
+ {
+ m_WorkloadQueue.push_back(std::move(workload));
}
// release the constant data in the layer..
@@ -289,7 +325,7 @@ LoadedNetwork::LoadedNetwork(std::unique_ptr<IOptimizedNetwork> net,
for (auto&& workloadFactory : m_WorkloadFactories)
{
- workloadFactory.second.first->AfterWorkloadsCreated();
+ workloadFactory.second->AfterWorkloadsCreated();
}
if (timelineUtils)
@@ -298,28 +334,90 @@ LoadedNetwork::LoadedNetwork(std::unique_ptr<IOptimizedNetwork> net,
timelineUtils->Commit();
}
+ if (useExternalMemoryManager)
+ {
+ if (networkProperties.m_AsyncEnabled)
+ {
+ CreateMemoryProfileAsync();
+ }
+ else
+ {
+ CreateMemoryProfile();
+ }
+
+ auto backendStrategyMap = BackendRegistryInstance().GetMemoryOptimizerStrategies();
+ for (auto& backendMemoryProfile : m_MemBlockMap)
+ {
+ const BackendId& backendId = backendMemoryProfile.first;
+ if (backendStrategyMap.find(backendId) != backendStrategyMap.end())
+ {
+ m_MemBinMap[backendId] = backendStrategyMap[backendId]->Optimize(backendMemoryProfile.second);
+ }
+ else
+ {
+ m_MemBinMap[backendId] = m_ConstantStrategy->Optimize(backendMemoryProfile.second);
+ }
+ }
+
+ if (!networkProperties.m_AsyncEnabled)
+ {
+ m_ExternalMemoryManager = CreateExternalMemoryManger(m_TensorMemory);
+
+ // Sort m_TensorMemory, so it's order matches m_Tensorhandles
+ std::sort(m_TensorMemory.begin(), m_TensorMemory.end(),
+ [](const std::pair<std::shared_ptr<TensorMemory>, MemorySource>& lhs,
+ const std::pair<std::shared_ptr<TensorMemory>, MemorySource>& rhs)
+ {
+ return lhs.first->m_OutputSlotId < rhs.first->m_OutputSlotId;
+ });
+ }
+ }
+
+ // Now that the intermediate tensor memory has been set-up,
+ // do any post allocation configuration for each workload.
if (!networkProperties.m_AsyncEnabled)
{
- // Set up memory.
- m_OptimizedNetwork->pOptimizedNetworkImpl->GetGraph().AllocateDynamicBuffers();
+ if (useInternalMemoryManager)
+ {
+ // Set up memory.
+ m_OptimizedNetwork->pOptimizedNetworkImpl->GetGraph().AllocateDynamicBuffers();
+ }
- // Now that the intermediate tensor memory has been set-up,
- // do any post allocation configuration for each workload.
- ARMNN_SCOPED_PROFILING_EVENT(Compute::Undefined, "LoadNetwork_PostAllocationConfigure");
for (auto &workload : m_WorkloadQueue)
{
workload->PostAllocationConfigure();
}
}
- else
+
+ if (useExternalMemoryManager)
{
- AllocateAndExecuteConstantWorkloads();
+ if (!networkProperties.m_AsyncEnabled)
+ {
+ AllocateAndExecuteConstantWorkloads();
+ }
+ else
+ {
+ AllocateAndExecuteConstantWorkloadsAsync();
+ }
}
}
void LoadedNetwork::AllocateAndExecuteConstantWorkloads()
{
ARMNN_SCOPED_PROFILING_EVENT(Compute::Undefined, "LoadNetwork_AllocateAndExecuteConstants");
+ for (auto& pair : m_ConstantWorkloads)
+ {
+ auto tensorHandle = m_ConstantTensorHandles[pair.first];
+ tensorHandle->Allocate();
+ pair.second->Execute();
+ }
+}
+
+
+
+void LoadedNetwork::AllocateAndExecuteConstantWorkloadsAsync()
+{
+ ARMNN_SCOPED_PROFILING_EVENT(Compute::Undefined, "LoadNetwork_AllocateAndExecuteConstants");
Graph& order = m_OptimizedNetwork->pOptimizedNetworkImpl->GetGraph();
for (auto&& layer : order)
{
@@ -343,7 +441,6 @@ void LoadedNetwork::AllocateAndExecuteConstantWorkloads()
}
}
-
void LoadedNetwork::SendNetworkStructure()
{
ARMNN_SCOPED_PROFILING_EVENT(Compute::Undefined, "LoadNetwork_SendNetworkStructure");
@@ -429,7 +526,7 @@ const IWorkloadFactory& LoadedNetwork::GetWorkloadFactory(const Layer& layer) co
CHECK_LOCATION());
}
- workloadFactory = it->second.first.get();
+ workloadFactory = it->second.get();
ARMNN_ASSERT_MSG(workloadFactory, "No workload factory");
@@ -780,9 +877,19 @@ void LoadedNetwork::AllocateWorkingMemory(std::lock_guard<std::mutex>& lock)
{
return;
}
- for (auto&& workloadFactory : m_WorkloadFactories)
+
+ if (m_ExternalMemoryManager)
+ {
+ m_ExternalMemoryManager->Allocate();
+
+ for (unsigned int i = 0; i < m_TensorMemory.size(); ++i)
+ {
+ m_Tensorhandles[i]->Import(m_TensorMemory[i].first->m_Data, m_TensorMemory[i].second);
+ }
+ }
+
+ for (auto&& memoryManager : m_BackendMemoryMangers)
{
- IBackendInternal::IMemoryManagerSharedPtr memoryManager = workloadFactory.second.second;
if (memoryManager)
{
memoryManager->Acquire();
@@ -795,14 +902,20 @@ void LoadedNetwork::AllocateWorkingMemory(std::lock_guard<std::mutex>& lock)
void LoadedNetwork::FreeWorkingMemory()
{
std::lock_guard<std::mutex> lockGuard(m_WorkingMemMutex);
+
if (!m_IsWorkingMemAllocated)
{
return;
}
- // Informs the memory managers to release memory in it's respective memory group
- for (auto&& workloadFactory : m_WorkloadFactories)
+
+ if (m_ExternalMemoryManager)
+ {
+ m_ExternalMemoryManager->Deallocate();
+ }
+
+ // Informs the memory managers to release memory in its respective memory group
+ for (auto&& memoryManager : m_BackendMemoryMangers)
{
- IBackendInternal::IMemoryManagerSharedPtr memoryManager = workloadFactory.second.second;
if (memoryManager)
{
memoryManager->Release();
@@ -1392,37 +1505,16 @@ Status LoadedNetwork::Execute(const InputTensors& inputTensors,
std::unique_ptr<IWorkingMemHandle> LoadedNetwork::CreateWorkingMemHandle(NetworkId networkId)
{
Graph& order = m_OptimizedNetwork->pOptimizedNetworkImpl->GetGraph();
- std::unordered_map<LayerGuid, std::vector<std::unique_ptr<ITensorHandle> > > tensorHandleMap;
- std::vector<WorkingMemDescriptor> workingMemDescriptors;
- std::unordered_map<LayerGuid, WorkingMemDescriptor> workingMemDescriptorMap;
- TensorHandleFactoryRegistry tensorHandleFactoryRegistry;
- WorkloadFactoryMap workloadFactoryMap;
- std::vector<std::shared_ptr<IMemoryManager>> memoryManagers;
+ // Tensors that will need to be allocated internally within armnn
+ std::vector<std::unique_ptr<ITensorHandle>> managedTensorHandles;
+ // Tensors that will be allocated externally by the user
+ std::vector<std::unique_ptr<ITensorHandle>> unmanagedTensorHandles;
- for (auto const& backend : m_Backends)
- {
- if (backend.second->SupportsTensorAllocatorAPI())
- {
- backend.second->RegisterTensorHandleFactories(
- tensorHandleFactoryRegistry,
- static_cast<MemorySourceFlags>(m_NetworkProperties.m_InputSource),
- static_cast<MemorySourceFlags>(m_NetworkProperties.m_OutputSource));
- memoryManagers.emplace_back(tensorHandleFactoryRegistry.GetMemoryManagers().back());
- }
- else
- {
- std::shared_ptr<IMemoryManager> memoryManager = backend.second->CreateMemoryManager();
- auto workloadFactory = backend.second->CreateWorkloadFactory(
- memoryManager, m_OptimizedNetwork->pOptimizedNetworkImpl->GetModelOptions());
-
- workloadFactoryMap.emplace(
- std::make_pair(backend.first, std::make_pair(std::move(workloadFactory), memoryManager)));
- memoryManagers.emplace_back(memoryManager);
- }
- }
+ std::vector<WorkingMemDescriptor> workingMemDescriptors;
+ std::unordered_map<LayerGuid, WorkingMemDescriptor> workingMemDescriptorMap;
- auto GetTensorHandle = [&](Layer* layer, const OutputSlot& outputSlot, bool isMemoryManaged)
+ auto GetTensorHandle = [&](Layer* layer, const OutputSlot& outputSlot)
{
ITensorHandleFactory::FactoryId factoryId = outputSlot.GetTensorHandleFactoryId();
const TensorInfo& tensorInfo = outputSlot.GetTensorInfo();
@@ -1431,28 +1523,30 @@ std::unique_ptr<IWorkingMemHandle> LoadedNetwork::CreateWorkingMemHandle(Network
{
BackendId id = layer->GetBackendId();
ARMNN_NO_DEPRECATE_WARN_BEGIN
- return workloadFactoryMap.at(id).first->CreateTensorHandle(tensorInfo, isMemoryManaged);
+ return m_WorkloadFactories.at(id)->CreateTensorHandle(tensorInfo, false);
ARMNN_NO_DEPRECATE_WARN_END
}
else
{
- ITensorHandleFactory* handleFactory = tensorHandleFactoryRegistry.GetFactory(factoryId);
+ ITensorHandleFactory* handleFactory = m_TensorHandleFactoryRegistry.GetFactory(factoryId);
ARMNN_ASSERT(handleFactory);
- return handleFactory->CreateTensorHandle(tensorInfo, isMemoryManaged);
+ return handleFactory->CreateTensorHandle(tensorInfo, false);
}
};
struct HandleInfo
{
- unsigned int m_ReferenceCount = 0;
- bool isInputLayerHandle = false;
- bool isOutputLayerHandle = false;
+ ITensorHandle* m_TensorHandle;
+
+ bool m_IsInputLayerHandle = false;
+ bool m_IsOutputLayerHandle = false;
WorkingMemHandle::InputMemDescriptorCoords m_InputMemDescriptorCoords;
WorkingMemHandle::OutputMemDescriptorCoords m_OutputMemDescriptorCoords;
};
- std::unordered_map<const ITensorHandle*, HandleInfo> handleReferenceCounts;
+ std::unordered_map<const OutputSlot*, HandleInfo> outputToHandleInfoMap;
+
unsigned int layerIndex = 0;
for (auto&& layer : order)
{
@@ -1508,27 +1602,33 @@ std::unique_ptr<IWorkingMemHandle> LoadedNetwork::CreateWorkingMemHandle(Network
}
}
- tensorHandleMap[layer->GetGuid()].emplace_back(GetTensorHandle(layer, slot, isMemoryManaged));
- ITensorHandle* tensorHandle = tensorHandleMap[layer->GetGuid()].back().get();
+ ITensorHandle* tensorHandle;
+ if (isMemoryManaged)
+ {
+ managedTensorHandles.emplace_back(GetTensorHandle(layer, slot));
+ tensorHandle = managedTensorHandles.back().get();
+ }
+ else
+ {
+ unmanagedTensorHandles.emplace_back(GetTensorHandle(layer, slot));
+ tensorHandle = unmanagedTensorHandles.back().get();
+ }
workingMemDescriptor.m_Outputs.push_back(tensorHandle);
- tensorHandle->Manage();
- unsigned int numConnections = slot.GetNumConnections();
- ARMNN_ASSERT(numConnections != 0);
- HandleInfo& handleInfo = handleReferenceCounts[tensorHandle];
- handleInfo.m_ReferenceCount = numConnections;
+ HandleInfo& handleInfo = outputToHandleInfoMap[&slot];
+ handleInfo.m_TensorHandle = tensorHandle;
// Store the coordinates of the current layer's OutputSlot that is connected to the OutputLayer
if (isConnectedToOutputLayer)
{
- handleInfo.isOutputLayerHandle = true;
+ handleInfo.m_IsOutputLayerHandle = true;
handleInfo.m_OutputMemDescriptorCoords.m_OutputSlotCoords = {layerIndex, slotIndex};
}
// Store the LayerBindingId of the InputLayer
if (isInputLayer)
{
- handleInfo.isInputLayerHandle = true;
+ handleInfo.m_IsInputLayerHandle = true;
LayerBindingId bindingId = static_cast<BindableLayer*>(layer)->GetBindingId();
handleInfo.m_InputMemDescriptorCoords.m_LayerBindingId = bindingId;
}
@@ -1557,20 +1657,19 @@ std::unique_ptr<IWorkingMemHandle> LoadedNetwork::CreateWorkingMemHandle(Network
{
LayerBindingId bindingId = static_cast<BindableLayer*>(layer)->GetBindingId();
- HandleInfo& handleInfo = handleReferenceCounts[tensorHandle];
- handleInfo.isOutputLayerHandle = true;
+ HandleInfo& handleInfo = outputToHandleInfoMap[outputSlot];
+ handleInfo.m_TensorHandle = tensorHandle;
+ handleInfo.m_IsOutputLayerHandle = true;
handleInfo.m_OutputMemDescriptorCoords.m_LayerBindingIds.push_back(bindingId);
handleInfo.m_OutputMemDescriptorCoords.m_InputSlotCoords.push_back({layerIndex, 0});
}
continue;
}
- auto search = tensorHandleMap.find(key);
- unsigned int index = outputSlot->CalculateIndexOnOwner();
- ITensorHandle* inputTensorHandle = search->second[index].get();
- workingMemDescriptor.m_Inputs.push_back(inputTensorHandle);
+ HandleInfo& handleInfo = outputToHandleInfoMap.at(outputSlot);
- HandleInfo& handleInfo = handleReferenceCounts.at(inputTensorHandle);
+ ITensorHandle* inputTensorHandle = handleInfo.m_TensorHandle;
+ workingMemDescriptor.m_Inputs.push_back(inputTensorHandle);
// Store the LayerBindingId of the OutputLayer
if (isOutputLayer)
@@ -1581,25 +1680,18 @@ std::unique_ptr<IWorkingMemHandle> LoadedNetwork::CreateWorkingMemHandle(Network
}
// In this case the layer is not an Output Layer but shares its input tensorhandle with an OutputLayer
// It will need to be updated as well, if we swap out the tensorhandle
- else if (handleInfo.isOutputLayerHandle)
+ else if (handleInfo.m_IsOutputLayerHandle)
{
handleInfo.m_OutputMemDescriptorCoords.m_InputSlotCoords.push_back({layerIndex, slot.GetSlotIndex()});
}
// Store the coordinates of the InputSlots connected to the InputLayer
// There can be more than one InputSlot connected to an InputLayer, so we use a vector
- if (handleInfo.isInputLayerHandle)
+ if (handleInfo.m_IsInputLayerHandle)
{
std::pair<LayerGuid, unsigned int> connectionLocation{layerIndex, slot.GetSlotIndex()};
handleInfo.m_InputMemDescriptorCoords.m_InputSlotCoords.emplace_back(connectionLocation);
}
-
- --handleInfo.m_ReferenceCount;
- if (handleInfo.m_ReferenceCount == 0u)
- {
- // Stop managing lifetime of tensor handle
- inputTensorHandle->Allocate();
- }
}
workingMemDescriptorMap.insert({layer->GetGuid(), workingMemDescriptor});
@@ -1612,17 +1704,29 @@ std::unique_ptr<IWorkingMemHandle> LoadedNetwork::CreateWorkingMemHandle(Network
}
}
+ std::vector<std::pair<std::shared_ptr<TensorMemory>, MemorySource>> tensorMemory;
+
+ auto externalMemoryManager = CreateExternalMemoryManger(tensorMemory);
+
+ // Sort m_TensorMemory, so it's order matches the outputSlot order
+ std::sort(tensorMemory.begin(), tensorMemory.end(),
+ [](const std::pair<std::shared_ptr<TensorMemory>, MemorySource>& lhs,
+ const std::pair<std::shared_ptr<TensorMemory>, MemorySource>& rhs)
+ {
+ return lhs.first->m_OutputSlotId < rhs.first->m_OutputSlotId;
+ });
+
std::vector<WorkingMemHandle::InputMemDescriptorCoords> inputConnectionsInfo;
std::vector<WorkingMemHandle::OutputMemDescriptorCoords> outputConnectionsInfo;
- for (const auto& handleInfo: handleReferenceCounts)
+ for (const auto& handleInfo: outputToHandleInfoMap)
{
- if (handleInfo.second.isOutputLayerHandle)
+ if (handleInfo.second.m_IsOutputLayerHandle)
{
outputConnectionsInfo.emplace_back(handleInfo.second.m_OutputMemDescriptorCoords);
}
- if (handleInfo.second.isInputLayerHandle)
+ if (handleInfo.second.m_IsInputLayerHandle)
{
inputConnectionsInfo.emplace_back(handleInfo.second.m_InputMemDescriptorCoords);
}
@@ -1633,8 +1737,10 @@ std::unique_ptr<IWorkingMemHandle> LoadedNetwork::CreateWorkingMemHandle(Network
outputConnectionsInfo,
workingMemDescriptors,
workingMemDescriptorMap,
- memoryManagers,
- std::move(tensorHandleMap));
+ std::move(externalMemoryManager),
+ std::move(tensorMemory),
+ std::move(managedTensorHandles),
+ std::move(unmanagedTensorHandles));
}
void LoadedNetwork::RegisterDebugCallback(const DebugCallbackFunction& func)
@@ -1645,6 +1751,312 @@ void LoadedNetwork::RegisterDebugCallback(const DebugCallbackFunction& func)
}
}
+
+void LoadedNetwork::CreateMemoryProfileAsync()
+{
+ struct PartialBlock
+ {
+ unsigned int m_StartOfLife;
+ unsigned int m_Lifetime;
+
+ size_t m_MemSize;
+ unsigned int m_Index;
+
+ BackendId m_BackendId;
+ };
+
+ auto align = [](size_t numToAlign)
+ {
+ const size_t alignment = sizeof(float);
+ return ((numToAlign + alignment - 1) / alignment) * alignment;
+ };
+
+ std::unordered_map<const OutputSlot*, PartialBlock> memBlockTrackerMap;
+
+ const bool inputImportingEnabled = m_NetworkProperties.m_InputSource != MemorySource::Undefined;
+ const bool outputImportingEnabled = m_NetworkProperties.m_OutputSource != MemorySource::Undefined;
+
+ unsigned int timestep = 0;
+ unsigned int outputIndex = 0;
+ Graph& order = m_OptimizedNetwork->pOptimizedNetworkImpl->GetGraph().TopologicalSort();
+
+ for (auto&& layer : order)
+ {
+ const LayerType& layerType = layer->GetType();
+ // Don't manage memory if importing.
+ if (layerType == LayerType::Input && inputImportingEnabled)
+ {
+ continue;
+ }
+ // Don't manage memory if importing.
+ if (layerType == LayerType::Output && outputImportingEnabled
+ && layer->GetInputSlot(0).GetConnectedOutputSlot()->GetNumConnections() == 1)
+ {
+ continue;
+ }
+ // Because Constant Layer memory can not be shared, the memory must persist for the lifetime of execution,
+ // management is done separately.
+ if (layerType == LayerType::Constant)
+ {
+ continue;
+ }
+
+ BackendId backendId = layer->GetBackendId();
+ for (auto& outputSlot : layer->GetOutputSlots())
+ {
+ if (!m_SupportsExternallyManagedMemory[backendId])
+ {
+ continue;
+ }
+
+ PartialBlock partialBlock;
+
+ partialBlock.m_StartOfLife = timestep;
+
+ size_t alignedSize = align(outputSlot.GetOutputHandler().GetTensorInfo().GetNumBytes());
+ partialBlock.m_MemSize = alignedSize;
+ partialBlock.m_Index = outputIndex++;
+ partialBlock.m_Lifetime = outputSlot.GetNumConnections();
+ partialBlock.m_BackendId = backendId;
+
+ if (partialBlock.m_Lifetime == 0)
+ {
+ m_MemBlockMap[partialBlock.m_BackendId].emplace_back(partialBlock.m_StartOfLife,
+ partialBlock.m_StartOfLife,
+ partialBlock.m_MemSize,
+ 0,
+ partialBlock.m_Index);
+ }
+ else
+ {
+ memBlockTrackerMap[&outputSlot] = partialBlock;
+ }
+ }
+
+ for (auto& inputSlot : layer->GetInputSlots())
+ {
+ const Layer& connectedInputLayer = inputSlot.GetConnectedOutputSlot()->GetOwningLayer();
+ const LayerType& owningLayerType = connectedInputLayer.GetType();
+
+ if (owningLayerType == LayerType::Constant)
+ {
+ continue;
+ }
+ if (inputImportingEnabled && owningLayerType == LayerType::Input)
+ {
+ continue;
+ }
+
+ auto outputSlot = inputSlot.GetConnectedOutputSlot();
+
+ PartialBlock& partialBlock = memBlockTrackerMap.at(outputSlot);
+
+ auto& lifetime = partialBlock.m_Lifetime;
+ --lifetime;
+
+ if (lifetime == 0)
+ {
+ m_MemBlockMap[partialBlock.m_BackendId].emplace_back(partialBlock.m_StartOfLife,
+ timestep,
+ partialBlock.m_MemSize,
+ 0,
+ partialBlock.m_Index);
+ }
+ }
+ ++timestep;
+ }
+}
+
+void LoadedNetwork::CreateMemoryProfile()
+{
+ // Finds the first TensorHandle ancestor of a SubTensorHandle. If the ITensorHandle provided
+ // is a TensorHandle, the function just returns it
+ auto TraceSubTensorHandleAncestry = [](ITensorHandle* const subTensorHandle)
+ {
+ ITensorHandle* ancestor = subTensorHandle;
+ while (ancestor && ancestor->GetParent())
+ {
+ ancestor = ancestor->GetParent();
+ }
+ return ancestor;
+ };
+
+ struct PartialBlock
+ {
+ unsigned int m_StartOfLife;
+ unsigned int m_Lifetime;
+
+ size_t m_MemSize;
+ unsigned int m_Index;
+
+ BackendId m_BackendId;
+ };
+
+ auto align = [](size_t numToAlign)
+ {
+ const size_t alignment = sizeof(float);
+ return ((numToAlign + alignment - 1) / alignment) * alignment;
+ };
+
+ std::unordered_map<ITensorHandle*, PartialBlock> memBlockTrackerMap;
+
+ const bool inputImportingEnabled = m_NetworkProperties.m_InputSource != MemorySource::Undefined;
+ const bool outputImportingEnabled = m_NetworkProperties.m_OutputSource != MemorySource::Undefined;
+
+ unsigned int timestep = 0;
+ unsigned int outputIndex = 0;
+ Graph& order = m_OptimizedNetwork->pOptimizedNetworkImpl->GetGraph().TopologicalSort();
+
+ for (auto&& layer : order)
+ {
+ const LayerType& layerType = layer->GetType();
+ // Don't manage memory if importing.
+ if (layerType == LayerType::Input && inputImportingEnabled)
+ {
+ continue;
+ }
+ // Don't manage memory if importing.
+ if (layerType == LayerType::Output && outputImportingEnabled
+ && layer->GetInputSlot(0).GetConnectedOutputSlot()->GetNumConnections() == 1)
+ {
+ continue;
+ }
+ // Because Constant Layer memory can not be shared, the memory must persist for the lifetime of execution,
+ // management is done separately.
+ if (layerType == LayerType::Constant)
+ {
+ continue;
+ }
+
+ BackendId backendId = layer->GetBackendId();
+ for (auto& outputSlot : layer->GetOutputSlots())
+ {
+ if (!m_SupportsExternallyManagedMemory[backendId])
+ {
+ continue;
+ }
+
+ ITensorHandle* tensorHandle = outputSlot.GetOutputHandler().GetData();
+ tensorHandle = TraceSubTensorHandleAncestry(tensorHandle);
+
+ if (memBlockTrackerMap.find(tensorHandle) == memBlockTrackerMap.end())
+ {
+ PartialBlock partialBlock;
+
+ partialBlock.m_StartOfLife = timestep;
+
+ size_t alignedSize = align(outputSlot.GetOutputHandler().GetTensorInfo().GetNumBytes());
+ partialBlock.m_MemSize = alignedSize;
+ partialBlock.m_Index = outputIndex++;
+ partialBlock.m_Lifetime = outputSlot.GetNumConnections();
+ partialBlock.m_BackendId = backendId;
+
+ if (partialBlock.m_Lifetime == 0)
+ {
+ m_MemBlockMap[partialBlock.m_BackendId].emplace_back(partialBlock.m_StartOfLife,
+ partialBlock.m_StartOfLife,
+ partialBlock.m_MemSize,
+ 0,
+ partialBlock.m_Index);
+ }
+ else
+ {
+ memBlockTrackerMap[tensorHandle] = partialBlock;
+ }
+ m_Tensorhandles.push_back(tensorHandle);
+
+ }
+ else
+ {
+ memBlockTrackerMap.at(tensorHandle).m_Lifetime += outputSlot.GetNumConnections();
+ }
+ }
+
+ for (auto& inputSlot : layer->GetInputSlots())
+ {
+ const Layer& connectedInputLayer = inputSlot.GetConnectedOutputSlot()->GetOwningLayer();
+ const LayerType& owningLayerType = connectedInputLayer.GetType();
+
+ if (owningLayerType == LayerType::Constant)
+ {
+ continue;
+ }
+ if (inputImportingEnabled && owningLayerType == LayerType::Input)
+ {
+ continue;
+ }
+ if (!m_SupportsExternallyManagedMemory[connectedInputLayer.GetBackendId()])
+ {
+ continue;
+ }
+
+ auto outputSlot = inputSlot.GetConnectedOutputSlot();
+
+ ITensorHandle* tensorHandle = outputSlot->GetOutputHandler().GetData();
+ tensorHandle = TraceSubTensorHandleAncestry(tensorHandle);
+
+ PartialBlock& partialBlock = memBlockTrackerMap.at(tensorHandle);
+
+ auto& lifetime = partialBlock.m_Lifetime;
+ --lifetime;
+
+ if (lifetime == 0)
+ {
+ m_MemBlockMap[partialBlock.m_BackendId].emplace_back(partialBlock.m_StartOfLife,
+ timestep,
+ partialBlock.m_MemSize,
+ 0,
+ partialBlock.m_Index);
+ }
+ }
+ ++timestep;
+ }
+
+}
+
+std::unique_ptr<MemoryManager> LoadedNetwork::CreateExternalMemoryManger(
+ std::vector<std::pair<std::shared_ptr<TensorMemory>, MemorySource>>& tensorMemoryVec)
+{
+ std::unique_ptr<MemoryManager> memoryManager = std::make_unique<MemoryManager>();
+ auto allocatorMap = BackendRegistryInstance().GetAllocators();
+
+ for (auto& backend : m_MemBinMap)
+ {
+ std::vector<BufferStorage> bufferStorageVec;
+
+ std::shared_ptr<ICustomAllocator> backendAllocator;
+ if (allocatorMap.find(backend.first) != allocatorMap.end())
+ {
+ backendAllocator = allocatorMap[backend.first];
+ }
+ else
+ {
+ backendAllocator = m_Backends[backend.first]->GetDefaultAllocator();
+ }
+
+ for (auto& memBin : backend.second)
+ {
+ BufferStorage bufferStorage;
+ bufferStorage.m_BufferSize = memBin.m_MemSize;
+ bufferStorage.m_TensorMemoryVector.reserve(memBin.m_MemBlocks.size());
+
+ for (auto& memBlock : memBin.m_MemBlocks)
+ {
+ auto tensorMemory = std::make_shared<TensorMemory>(TensorMemory{memBlock.m_Offset, memBlock.m_Index});
+
+ tensorMemoryVec.emplace_back(tensorMemory, backendAllocator->GetMemorySourceType());
+ bufferStorage.m_TensorMemoryVector.emplace_back(tensorMemory);
+ }
+
+ bufferStorageVec.emplace_back(std::move(bufferStorage));
+ }
+
+ memoryManager->StoreMemToAllocate(bufferStorageVec, backendAllocator, 4);
+ }
+
+ return memoryManager;
+}
+
LayerBindingId LoadedNetwork::ValidateImportedInputID(ImportedInputId id)
{
try
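
For readers tracing the new code path in LoadedNetwork.cpp above, the pieces added by this patch fit together roughly as follows for a non-async load with external memory management enabled. This is a paraphrase of the diff, not a literal excerpt:

    // CreateMemoryProfile();                                   // record a MemBlock per intermediate tensor, per backend
    // m_MemBinMap[backendId] = strategy->Optimize(memBlocks);  // backend-registered strategy, else SingleAxisPriorityList
    // m_ExternalMemoryManager = CreateExternalMemoryManger(m_TensorMemory);
    // ...
    // later, in AllocateWorkingMemory():
    // m_ExternalMemoryManager->Allocate();
    // m_Tensorhandles[i]->Import(m_TensorMemory[i].first->m_Data, m_TensorMemory[i].second);
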
diff --git a/src/armnn/LoadedNetwork.hpp b/src/armnn/LoadedNetwork.hpp
index 71ceaa3938..35c482cbc7 100644
--- a/src/armnn/LoadedNetwork.hpp
+++ b/src/armnn/LoadedNetwork.hpp
@@ -10,9 +10,15 @@
#include <armnn/Tensor.hpp>
#include <armnn/backends/IBackendInternal.hpp>
+#include <armnn/backends/IMemoryOptimizerStrategy.hpp>
#include <backendsCommon/TensorHandleFactoryRegistry.hpp>
#include <backendsCommon/Workload.hpp>
#include <backendsCommon/WorkloadFactory.hpp>
+#include <backendsCommon/DefaultAllocator.hpp>
+#include <backendsCommon/MemoryManager.hpp>
+#include <backendsCommon/memoryOptimizerStrategyLibrary/strategies/SingleAxisPriorityList.hpp>
+
+
#include <ProfilingService.hpp>
#include <TimelineUtilityMethods.hpp>
@@ -89,16 +95,16 @@ public:
profiling::ProfilingGuid GetNetworkGuid();
private:
- using WorkloadFactoryWithMemoryManager =
- std::pair<IBackendInternal::IWorkloadFactoryPtr, IBackendInternal::IMemoryManagerSharedPtr>;
- using WorkloadFactoryMap = std::unordered_map<BackendId, WorkloadFactoryWithMemoryManager>;
void AllocateWorkingMemory(std::lock_guard<std::mutex>& lock);
void AllocateAndExecuteConstantWorkloads();
+ void AllocateAndExecuteConstantWorkloadsAsync();
+
+ std::unordered_map<LayerGuid, std::unique_ptr<IWorkload>> m_ConstantWorkloads;
+ std::unordered_map<LayerGuid, ITensorHandle*> m_ConstantTensorHandles;
- std::unordered_map<LayerGuid, ITensorHandle* > m_ConstantTensorHandles;
- std::unordered_map<LayerGuid, std::unique_ptr<IWorkload> > m_ConstantWorkloads;
+ std::unique_ptr<IMemoryOptimizerStrategy> m_ConstantStrategy = std::make_unique<SingleAxisPriorityList>();
LoadedNetwork(std::unique_ptr<IOptimizedNetwork> net,
const INetworkProperties& networkProperties,
@@ -120,9 +126,18 @@ private:
inline LayerBindingId ValidateImportedInputID(ImportedInputId id);
inline LayerBindingId ValidateImportedOutputID(ImportedOutputId id);
+ void CreateMemoryProfile();
+ void CreateMemoryProfileAsync();
+
+ std::unique_ptr<MemoryManager> CreateExternalMemoryManger(
+ std::vector<std::pair<std::shared_ptr<TensorMemory>, MemorySource>>& tensorMemory);
+
using BackendPtrMap = std::unordered_map<BackendId, IBackendInternalUniquePtr>;
- BackendPtrMap m_Backends;
+ BackendPtrMap m_Backends;
+ std::vector<IBackendInternal::IMemoryManagerSharedPtr> m_BackendMemoryMangers;
+
+ using WorkloadFactoryMap = std::unordered_map<BackendId, IBackendInternal::IWorkloadFactoryPtr>;
WorkloadFactoryMap m_WorkloadFactories;
std::unique_ptr<IOptimizedNetwork> m_OptimizedNetwork;
@@ -171,6 +186,17 @@ private:
ImportedInputId m_CurImportedInputId = 0;
ImportedInputId m_CurImportedOutputId = 0;
+
+ std::unordered_map<BackendId, std::vector<MemBlock>> m_MemBlockMap;
+ std::unordered_map<BackendId, std::vector<MemBin>> m_MemBinMap;
+
+ std::vector<ITensorHandle*> m_Tensorhandles;
+
+ std::vector<std::pair<std::shared_ptr<TensorMemory>, MemorySource>> m_TensorMemory;
+
+ std::unique_ptr<MemoryManager> m_ExternalMemoryManager;
+
+ std::unordered_map<BackendId, bool> m_SupportsExternallyManagedMemory;
};
}
diff --git a/src/armnn/Network.cpp b/src/armnn/Network.cpp
index e00dbfc0fc..17a1da1f6c 100644
--- a/src/armnn/Network.cpp
+++ b/src/armnn/Network.cpp
@@ -934,6 +934,11 @@ OptimizationResult AssignBackends(OptimizedNetworkImpl* optNetObjPtr,
{
auto layer = *it;
+ if (layer->GetType() == LayerType::Input)
+ {
+ continue;
+ }
+
DataType dataTypeIn = layer->GetNumInputSlots() == 0 ? DataType::Float32 :
layer->GetInputSlot(0).GetConnectedOutputSlot()->GetTensorInfo().GetDataType();
DataType dataTypeOut = layer->GetNumOutputSlots() == 0 ? DataType::Float32 :
@@ -1027,6 +1032,17 @@ OptimizationResult AssignBackends(OptimizedNetworkImpl* optNetObjPtr,
}
}
+ for (auto it = firstLayer; it != lastLayer; ++it)
+ {
+ auto layer = *it;
+
+ if(layer->GetType() == LayerType::Input)
+ {
+ BackendId connectedBackendId = layer->GetOutputSlot(0).GetConnection(0)->GetOwningLayer().GetBackendId();
+ layer->SetBackendId(connectedBackendId);
+ }
+ }
+
return result;
}
diff --git a/src/armnn/WorkingMemHandle.cpp b/src/armnn/WorkingMemHandle.cpp
index e2ad52a772..2cb47fbfc7 100644
--- a/src/armnn/WorkingMemHandle.cpp
+++ b/src/armnn/WorkingMemHandle.cpp
@@ -17,16 +17,20 @@ namespace experimental
WorkingMemHandle::WorkingMemHandle(NetworkId networkId,
std::vector<InputMemDescriptorCoords> inputLayerInfo,
- std::vector<OutputMemDescriptorCoords> ouputLayerInfo,
+ std::vector<OutputMemDescriptorCoords> outputLayerInfo,
std::vector<WorkingMemDescriptor> workingMemDescriptors,
std::unordered_map<LayerGuid, WorkingMemDescriptor> workingMemDescriptorMap,
- std::vector<std::shared_ptr<IMemoryManager>> memoryManagers,
- std::unordered_map<LayerGuid, std::vector<std::unique_ptr<ITensorHandle> > > ownedTensorHandles)
+ std::unique_ptr<MemoryManager> memoryManager,
+ std::vector<std::pair<std::shared_ptr<TensorMemory>, MemorySource>> tensorMemory,
+ std::vector<std::unique_ptr<ITensorHandle>> managedTensorHandles,
+ std::vector<std::unique_ptr<ITensorHandle>> unmanagedTensorHandles)
: m_NetworkId(networkId)
, m_WorkingMemDescriptors(workingMemDescriptors)
, m_WorkingMemDescriptorMap(workingMemDescriptorMap)
- , m_MemoryManagers(memoryManagers)
- , m_OwnedTensorHandles(std::move(ownedTensorHandles))
+ , m_MemoryManager(std::move(memoryManager))
+ , m_TensorMemory(std::move(tensorMemory))
+ , m_ManagedTensorHandles(std::move(managedTensorHandles))
+ , m_UnmanagedTensorHandles(std::move(unmanagedTensorHandles))
, m_InputSize(numeric_cast<DifferenceType>(inputLayerInfo.size()))
, m_IsAllocated(false)
{
@@ -54,7 +58,7 @@ WorkingMemHandle::WorkingMemHandle(NetworkId networkId,
}
}
size_t bindingIdCount = inputLayerInfo.size();
- for (const auto& outputInfo : ouputLayerInfo)
+ for (const auto& outputInfo : outputLayerInfo)
{
for (auto bindingId : outputInfo.m_LayerBindingIds)
{
@@ -88,6 +92,7 @@ WorkingMemHandle::WorkingMemHandle(NetworkId networkId,
}
}
m_BindingIdVec = std::vector<LayerBindingId>(bindingIdCount);
+ IgnoreUnused(m_UnmanagedTensorHandles);
}
void WorkingMemHandle::Allocate()
@@ -98,9 +103,11 @@ void WorkingMemHandle::Allocate()
}
m_IsAllocated = true;
- for (auto& mgr : m_MemoryManagers)
+ m_MemoryManager->Allocate();
+
+ for (unsigned int i = 0; i < m_TensorMemory.size(); ++i)
{
- mgr->Acquire();
+ m_ManagedTensorHandles[i]->Import(m_TensorMemory[i].first->m_Data, m_TensorMemory[i].second);
}
}
@@ -112,10 +119,7 @@ void WorkingMemHandle::Free()
}
m_IsAllocated = false;
- for (auto& mgr : m_MemoryManagers)
- {
- mgr->Release();
- }
+ m_MemoryManager->Deallocate();
}
void WorkingMemHandle::MemSyncOutputs()
diff --git a/src/armnn/WorkingMemHandle.hpp b/src/armnn/WorkingMemHandle.hpp
index 9078a8d54c..bca1d2d80c 100644
--- a/src/armnn/WorkingMemHandle.hpp
+++ b/src/armnn/WorkingMemHandle.hpp
@@ -14,6 +14,7 @@
#include <unordered_map>
#include <mutex>
+#include <backendsCommon/MemoryManager.hpp>
namespace armnn
{
@@ -45,11 +46,13 @@ public:
WorkingMemHandle(NetworkId networkId,
std::vector<InputMemDescriptorCoords> inputLayerInfo,
- std::vector<OutputMemDescriptorCoords> ouputLayerInfo,
+ std::vector<OutputMemDescriptorCoords> outputLayerInfo,
std::vector<WorkingMemDescriptor> workingMemDescriptors,
std::unordered_map<LayerGuid, WorkingMemDescriptor> workingMemDescriptorMap,
- std::vector<std::shared_ptr<IMemoryManager>> memoryManagers,
- std::unordered_map<LayerGuid, std::vector<std::unique_ptr<ITensorHandle> > > ownedTensorHandles);
+ std::unique_ptr<MemoryManager> memoryManager,
+ std::vector<std::pair<std::shared_ptr<TensorMemory>, MemorySource>> tensorMemory,
+ std::vector<std::unique_ptr<ITensorHandle>> managedTensorHandles,
+ std::vector<std::unique_ptr<ITensorHandle>> unmanagedTensorHandles);
~WorkingMemHandle()
{ Free(); }
@@ -128,11 +131,17 @@ private:
std::vector<WorkingMemDescriptor> m_WorkingMemDescriptors;
std::unordered_map<LayerGuid, WorkingMemDescriptor> m_WorkingMemDescriptorMap;
- // Vector of IMemoryManagers that manage the WorkingMemHandle's memory
- std::vector<std::shared_ptr<IMemoryManager>> m_MemoryManagers;
- // TensorHandles owned by this WorkingMemHandle
- // constant tensor's can be shared by multiple WorkingMemHandles and so will not be stored here
- std::unordered_map<LayerGuid, std::vector<std::unique_ptr<ITensorHandle> > > m_OwnedTensorHandles;
+ std::unique_ptr<MemoryManager> m_MemoryManager;
+
+ // Memory to be imported into the tensorHandles after allocation
+ std::vector<std::pair<std::shared_ptr<TensorMemory>, MemorySource>> m_TensorMemory;
+
+
+ // Tensors that will need to be allocated internally within armnn
+ std::vector<std::unique_ptr<ITensorHandle>> m_ManagedTensorHandles;
+
+ // Tensors that will be allocated externally by the user
+ std::vector<std::unique_ptr<ITensorHandle>> m_UnmanagedTensorHandles;
std::unordered_map<LayerBindingId, bool> m_InputValidationMap;
std::unordered_map<LayerBindingId, bool> m_OutputValidationMap;
diff --git a/src/armnn/test/OptimizerTests.cpp b/src/armnn/test/OptimizerTests.cpp
index 8416a8dd0d..3cea1b540e 100644
--- a/src/armnn/test/OptimizerTests.cpp
+++ b/src/armnn/test/OptimizerTests.cpp
@@ -714,7 +714,8 @@ TEST_CASE("BackendHintTest")
case armnn::LayerType::Input:
{
auto inputLayer = PolymorphicDowncast<const InputLayer*>(layer);
- CHECK((inputLayer->GetBackendId() == "MockBackend"));
+ const auto connectedLayerBackendId = inputLayer->GetOutputSlot(0).GetOwningLayer().GetBackendId();
+ CHECK((inputLayer->GetBackendId() == connectedLayerBackendId));
break;
}
case armnn::LayerType::Output:
diff --git a/src/backends/backendsCommon/DefaultAllocator.hpp b/src/backends/backendsCommon/DefaultAllocator.hpp
index 2451db3ab8..cf0f1774f0 100644
--- a/src/backends/backendsCommon/DefaultAllocator.hpp
+++ b/src/backends/backendsCommon/DefaultAllocator.hpp
@@ -22,12 +22,12 @@ public:
void* allocate(size_t size, size_t alignment = 0) override
{
IgnoreUnused(alignment);
- return ::operator new(size);
+ return ::operator new(size_t(size));
}
void free(void* ptr) override
{
- std::free(ptr);
+ ::operator delete(ptr);
}
armnn::MemorySource GetMemorySourceType() override
diff --git a/src/backends/backendsCommon/MemoryManager.cpp b/src/backends/backendsCommon/MemoryManager.cpp
index 1c109c3c91..77cab27789 100644
--- a/src/backends/backendsCommon/MemoryManager.cpp
+++ b/src/backends/backendsCommon/MemoryManager.cpp
@@ -11,7 +11,7 @@ namespace armnn
{
void MemoryManager::StoreMemToAllocate(std::vector<BufferStorage> bufferStorageVector,
- ICustomAllocator* customAllocator,
+ std::shared_ptr<ICustomAllocator> customAllocator,
const size_t typeAlignment)
{
IgnoreUnused(typeAlignment);
diff --git a/src/backends/backendsCommon/MemoryManager.hpp b/src/backends/backendsCommon/MemoryManager.hpp
index cbd6fcf9bc..5113b231d3 100644
--- a/src/backends/backendsCommon/MemoryManager.hpp
+++ b/src/backends/backendsCommon/MemoryManager.hpp
@@ -2,6 +2,7 @@
// Copyright © 2021 Arm Ltd and Contributors. All rights reserved.
// SPDX-License-Identifier: MIT
//
+#pragma once
#include <armnn/backends/ICustomAllocator.hpp>
@@ -10,7 +11,7 @@ namespace armnn
struct Allocator
{
/// Pointer to @ICustomAllocator.
- ICustomAllocator* m_CustomAllocator{};
+ std::shared_ptr<ICustomAllocator> m_CustomAllocator{};
/// Value which the size of each buffer (actual data size + padding) has to be a multiple of.
size_t m_Alignment = 0 ;
};
@@ -19,16 +20,16 @@ struct TensorMemory
{
/// Number of bytes the value is away from the @BufferStorage.m_Buffer.
size_t m_Offset{};
- /// Pointer to the tensor value.
- void* m_Data = nullptr;
/// Identifier to be used by the @LoadedNetwork to order the tensors.
unsigned int m_OutputSlotId{};
+ /// Pointer to the tensor value.
+ void* m_Data = nullptr;
};
struct BufferStorage
{
/// Vector of pointer to @TensorMemory.
- std::vector<TensorMemory*> m_TensorMemoryVector;
+ std::vector<std::shared_ptr<TensorMemory>> m_TensorMemoryVector;
/// Total size of the buffer.
size_t m_BufferSize;
/// Pointer to the first element of the buffer.
@@ -43,7 +44,7 @@ public:
/// @param[in] customAllocator - Pointer to @ICustomAllocator.
/// @param[in] typeAlignment - Optional parameter. Value of which the size of each value has to be multiple of.
void StoreMemToAllocate(std::vector<BufferStorage> bufferStorageVector,
- ICustomAllocator* customAllocator,
+ std::shared_ptr<ICustomAllocator> customAllocator,
size_t typeAlignment = 0);
/// Allocate the amount of memory indicated by @m_BufferSize, and
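
With TensorMemory and the allocator now passed around as shared pointers, the MemoryManager is exercised as in the reworked MemoryManagerTests.cpp further down. A small sketch of the updated API follows; using DefaultAllocator here is an assumption for illustration, any ICustomAllocator would do:

    #include <backendsCommon/DefaultAllocator.hpp>
    #include <backendsCommon/MemoryManager.hpp>

    // Assumed allocator choice; the tests below use a SampleCustomAllocator instead.
    auto allocator = std::make_shared<armnn::DefaultAllocator>();

    // TensorMemory members after this patch: m_Offset, m_OutputSlotId, m_Data.
    auto tensorMemory = std::make_shared<armnn::TensorMemory>(
        armnn::TensorMemory{/*offset*/ 0, /*outputSlotId*/ 0, /*data*/ nullptr});

    armnn::BufferStorage bufferStorage;
    bufferStorage.m_TensorMemoryVector = { tensorMemory };   // now shared_ptr, not raw pointers
    bufferStorage.m_BufferSize = 64;                         // illustrative size

    armnn::MemoryManager memoryManager;
    memoryManager.StoreMemToAllocate({ bufferStorage }, allocator);  // allocator passed as shared_ptr
    memoryManager.Allocate();     // tensorMemory->m_Data now points into the backing buffer
    memoryManager.Deallocate();
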
diff --git a/src/backends/backendsCommon/common.mk b/src/backends/backendsCommon/common.mk
index a77ec06035..56c9d6545a 100644
--- a/src/backends/backendsCommon/common.mk
+++ b/src/backends/backendsCommon/common.mk
@@ -17,6 +17,7 @@ COMMON_SOURCES := \
MapWorkload.cpp \
MemCopyWorkload.cpp \
MemImportWorkload.cpp \
+ MemoryManager.cpp \
MemSyncWorkload.cpp \
OptimizationViews.cpp \
TensorHandleFactoryRegistry.cpp \
@@ -25,7 +26,8 @@ COMMON_SOURCES := \
WorkloadFactory.cpp \
WorkloadUtils.cpp \
memoryOptimizerStrategyLibrary/strategies/ConstantMemoryStrategy.cpp \
- memoryOptimizerStrategyLibrary/strategies/StrategyValidator.cpp \
+ memoryOptimizerStrategyLibrary/strategies/SingleAxisPriorityList.cpp \
+ memoryOptimizerStrategyLibrary/strategies/StrategyValidator.cpp
# COMMON_TEST_SOURCES contains the list of files to be included
@@ -104,7 +106,8 @@ COMMON_TEST_SOURCES := \
test/layerTests/TransposeConvolution2dTestImpl.cpp \
test/layerTests/UnidirectionalSequenceLstmTestImpl.cpp \
memoryOptimizerStrategyLibrary/test/ConstMemoryStrategyTests.cpp \
- memoryOptimizerStrategyLibrary/test/ValidatorStrategyTests.cpp
+ memoryOptimizerStrategyLibrary/test/ValidatorStrategyTests.cpp \
+ memoryOptimizerStrategyLibrary/test/SingleAxisPriorityListTests.cpp
ifeq ($(ARMNN_REF_ENABLED),1)
COMMON_TEST_SOURCES += \
diff --git a/src/backends/backendsCommon/memoryOptimizerStrategyLibrary/strategies/SingleAxisPriorityList.cpp b/src/backends/backendsCommon/memoryOptimizerStrategyLibrary/strategies/SingleAxisPriorityList.cpp
index 3afa061681..738b7137a7 100644
--- a/src/backends/backendsCommon/memoryOptimizerStrategyLibrary/strategies/SingleAxisPriorityList.cpp
+++ b/src/backends/backendsCommon/memoryOptimizerStrategyLibrary/strategies/SingleAxisPriorityList.cpp
@@ -155,9 +155,9 @@ void SingleAxisPriorityList::PlaceBlocks(const std::list<MemBlock*>& priorityLis
// The indexes don't match we need at least two words
// Zero the bits to the right of curBlock->m_EndOfLife
- remainder = (curBlock->m_EndOfLife +1 - lastWordIndex * wordSize);
+ remainder = (curBlock->m_EndOfLife - lastWordIndex * wordSize);
- size_t lastWord = (1u << remainder) - 1;
+ size_t lastWord = (1ul << remainder) - 1;
lastWord = lastWord << (wordSize - remainder);
if(firstWordIndex + 1 == lastWordIndex)
diff --git a/src/backends/backendsCommon/test/CompatibilityTests.cpp b/src/backends/backendsCommon/test/CompatibilityTests.cpp
index d18a8fbb6c..3685f75986 100644
--- a/src/backends/backendsCommon/test/CompatibilityTests.cpp
+++ b/src/backends/backendsCommon/test/CompatibilityTests.cpp
@@ -181,7 +181,7 @@ TEST_CASE ("Ref_Backends_Capability_Test")
{"ProtectedContentAllocation", false},
{"ConstantTensorsAsInputs", true},
{"PreImportIOTensors", true},
- {"ExternallyManagedMemory", false},
+ {"ExternallyManagedMemory", true},
{"MultiAxisPacking", false}});
}
@@ -200,7 +200,7 @@ TEST_CASE ("Neon_Backends_Capability_Test")
{"ProtectedContentAllocation", false},
{"ConstantTensorsAsInputs", false},
{"PreImportIOTensors", false},
- {"ExternallyManagedMemory", false},
+ {"ExternallyManagedMemory", true},
{"MultiAxisPacking", false}});
}
@@ -219,7 +219,7 @@ TEST_CASE ("Cl_Backends_Capability_Test")
{"ProtectedContentAllocation", true},
{"ConstantTensorsAsInputs", false},
{"PreImportIOTensors", false},
- {"ExternallyManagedMemory", false},
+ {"ExternallyManagedMemory", true},
{"MultiAxisPacking", false}});
}
diff --git a/src/backends/backendsCommon/test/MemoryManagerTests.cpp b/src/backends/backendsCommon/test/MemoryManagerTests.cpp
index c873499ef3..662a5c2423 100644
--- a/src/backends/backendsCommon/test/MemoryManagerTests.cpp
+++ b/src/backends/backendsCommon/test/MemoryManagerTests.cpp
@@ -59,17 +59,18 @@ TEST_CASE("MemoryManagerTest")
// Create mock up bufferStorageVector with 2 BufferStorage with the same TensorMemory
size_t numTensors = 5;
- std::vector<TensorMemory*> tensorMemoryPointerVector(numTensors);
- std::vector<TensorMemory> tensorMemoryVector;
+ std::vector<std::shared_ptr<TensorMemory>> tensorMemoryPointerVector(numTensors);
+ std::vector<std::shared_ptr<TensorMemory>> tensorMemoryVector;
tensorMemoryVector.reserve(numTensors);
std::vector<size_t> offsets(numTensors);
std::iota(std::begin(offsets), std::end(offsets), 0);
- for (uint32_t idx = 0; idx < tensorMemoryPointerVector.size(); ++idx)
+ for (uint idx = 0; idx < tensorMemoryPointerVector.size(); ++idx)
{
- tensorMemoryVector.emplace_back(TensorMemory{offsets[idx], nullptr, 0});
- tensorMemoryPointerVector[idx] = &tensorMemoryVector[idx];
+ tensorMemoryVector.emplace_back(std::make_shared<TensorMemory>(TensorMemory{offsets[idx], 0, nullptr}));
+
+ tensorMemoryPointerVector[idx] = tensorMemoryVector[idx];
}
std::vector<BufferStorage> bufferStorageVector;
@@ -77,30 +78,31 @@ TEST_CASE("MemoryManagerTest")
bufferStorageVector.emplace_back(BufferStorage{tensorMemoryPointerVector, numTensors});
// Create an instance of the SampleCustomAllocator
- SampleCustomAllocator customAllocator = SampleCustomAllocator();
- customAllocator.m_Values = {10, 11, 12, 13, 14};
+ std::shared_ptr<SampleCustomAllocator> customAllocator =
+ std::make_unique<SampleCustomAllocator>(SampleCustomAllocator());
+
+ customAllocator->m_Values = {10, 11, 12, 13, 14};
// Check that the test was set up correctly
- CHECK(customAllocator.m_Values.size() == numTensors);
+ CHECK(customAllocator->m_Values.size() == numTensors);
+ size_t bufferVecSize = bufferStorageVector.size();
// Utilise 3 functions in the MemoryManager. Check the counters and the pointer to the values are correct.
MemoryManager memoryManager;
- memoryManager.StoreMemToAllocate(bufferStorageVector, &customAllocator);
+ memoryManager.StoreMemToAllocate(bufferStorageVector, customAllocator);
memoryManager.Allocate();
- CHECK(customAllocator.m_CounterAllocate == bufferStorageVector.size());
- for (const auto& bufferStorage : bufferStorageVector)
+ CHECK(customAllocator->m_CounterAllocate == bufferVecSize);
+
+ uint idx = 0;
+ for (auto tensorMemory : tensorMemoryVector)
{
- uint32_t idx = 0;
- for (auto tensorMemory : bufferStorage.m_TensorMemoryVector)
- {
- auto value = reinterpret_cast<uint8_t *>(tensorMemory->m_Data);
- CHECK(customAllocator.m_Values[idx] == *value);
- idx += 1;
- }
+ auto value = reinterpret_cast<uint8_t *>(tensorMemory->m_Data);
+ CHECK(customAllocator->m_Values[idx] == *value);
+ idx += 1;
}
memoryManager.Deallocate();
- CHECK(customAllocator.m_CounterFree == bufferStorageVector.size());
+ CHECK(customAllocator->m_CounterFree == bufferStorageVector.size());
}
}
diff --git a/src/backends/backendsCommon/test/OptimizedNetworkTests.cpp b/src/backends/backendsCommon/test/OptimizedNetworkTests.cpp
index 012737e1d7..b0ee9bee32 100644
--- a/src/backends/backendsCommon/test/OptimizedNetworkTests.cpp
+++ b/src/backends/backendsCommon/test/OptimizedNetworkTests.cpp
@@ -138,7 +138,7 @@ TEST_CASE("OptimizeValidateDeviceNonSupportLayerWithFallback")
// the other layers are supported by CpuRef.
// If NEON is not enabled, all layers are supported by CpuRef.
#if defined(ARMCOMPUTENEON_ENABLED)
- if (layer->GetType() == armnn::LayerType::Input || layer->GetType() == armnn::LayerType::Output)
+ if (layer->GetType() == armnn::LayerType::Output)
{
CHECK(layer->GetBackendId() == armnn::Compute::CpuAcc);
}
@@ -337,7 +337,7 @@ TEST_CASE("OptimizeValidateWorkloadsDuplicateComputeDeviceWithFallback")
// the other layers are supported by CpuRef.
// If neither NEON, nor CL is enabled, all layers are supported by CpuRef.
#if defined(ARMCOMPUTENEON_ENABLED)
- if (layer->GetType() == armnn::LayerType::Input || layer->GetType() == armnn::LayerType::Output)
+ if (layer->GetType() == armnn::LayerType::Output)
{
CHECK(layer->GetBackendId() == armnn::Compute::CpuAcc);
}
diff --git a/src/backends/cl/ClBackend.hpp b/src/backends/cl/ClBackend.hpp
index 7597d093be..99fe9069ff 100644
--- a/src/backends/cl/ClBackend.hpp
+++ b/src/backends/cl/ClBackend.hpp
@@ -29,7 +29,7 @@ const BackendCapabilities gpuAccCapabilities("GpuAcc",
{"ProtectedContentAllocation", true},
{"ConstantTensorsAsInputs", false},
{"PreImportIOTensors", false},
- {"ExternallyManagedMemory", false},
+ {"ExternallyManagedMemory", true},
{"MultiAxisPacking", false},
{"SingleAxisPacking", true}
});
diff --git a/src/backends/neon/NeonBackend.hpp b/src/backends/neon/NeonBackend.hpp
index 68d60a4c04..e53bacb84a 100644
--- a/src/backends/neon/NeonBackend.hpp
+++ b/src/backends/neon/NeonBackend.hpp
@@ -10,14 +10,14 @@ namespace armnn
{
// add new capabilities here..
-const BackendCapabilities cpuAccCapabilities("GpuAcc",
+const BackendCapabilities cpuAccCapabilities("CpuAcc",
{
{"NonConstWeights", false},
{"AsyncExecution", false},
{"ProtectedContentAllocation", false},
{"ConstantTensorsAsInputs", false},
{"PreImportIOTensors", false},
- {"ExternallyManagedMemory", false},
+ {"ExternallyManagedMemory", true},
{"MultiAxisPacking", false},
{"SingleAxisPacking", true}
});
diff --git a/src/backends/neon/NeonTensorHandle.hpp b/src/backends/neon/NeonTensorHandle.hpp
index ae8aa5d8c7..dd4c2572f9 100644
--- a/src/backends/neon/NeonTensorHandle.hpp
+++ b/src/backends/neon/NeonTensorHandle.hpp
@@ -29,7 +29,8 @@ public:
NeonTensorHandle(const TensorInfo& tensorInfo)
: m_ImportFlags(static_cast<MemorySourceFlags>(MemorySource::Malloc)),
m_Imported(false),
- m_IsImportEnabled(false)
+ m_IsImportEnabled(false),
+ m_TypeAlignment(GetDataTypeSize(tensorInfo.GetDataType()))
{
armnn::armcomputetensorutils::BuildArmComputeTensor(m_Tensor, tensorInfo);
}
@@ -39,7 +40,9 @@ public:
MemorySourceFlags importFlags = static_cast<MemorySourceFlags>(MemorySource::Malloc))
: m_ImportFlags(importFlags),
m_Imported(false),
- m_IsImportEnabled(false)
+ m_IsImportEnabled(false),
+ m_TypeAlignment(GetDataTypeSize(tensorInfo.GetDataType()))
+
{
armnn::armcomputetensorutils::BuildArmComputeTensor(m_Tensor, tensorInfo, dataLayout);
@@ -117,9 +120,7 @@ public:
{
if (source == MemorySource::Malloc && m_IsImportEnabled)
{
- // Checks the 16 byte memory alignment
- constexpr uintptr_t alignment = sizeof(size_t);
- if (reinterpret_cast<uintptr_t>(memory) % alignment)
+ if (reinterpret_cast<uintptr_t>(memory) % m_TypeAlignment)
{
throw MemoryImportException("NeonTensorHandle::Import Attempting to import unaligned memory");
}
@@ -263,6 +264,7 @@ private:
MemorySourceFlags m_ImportFlags;
bool m_Imported;
bool m_IsImportEnabled;
+ const uintptr_t m_TypeAlignment;
};
class NeonSubTensorHandle : public IAclTensorHandle
diff --git a/src/backends/reference/RefBackend.hpp b/src/backends/reference/RefBackend.hpp
index 6114ce6218..da04f22d93 100644
--- a/src/backends/reference/RefBackend.hpp
+++ b/src/backends/reference/RefBackend.hpp
@@ -16,7 +16,7 @@ const BackendCapabilities cpuRefCapabilities("CpuRef",
{"ProtectedContentAllocation", false},
{"ConstantTensorsAsInputs", true},
{"PreImportIOTensors", true},
- {"ExternallyManagedMemory", false},
+ {"ExternallyManagedMemory", true},
{"MultiAxisPacking", false},
{"SingleAxisPacking", true}
});
diff --git a/src/backends/reference/RefTensorHandle.cpp b/src/backends/reference/RefTensorHandle.cpp
index b9e566eace..5229e9d62b 100644
--- a/src/backends/reference/RefTensorHandle.cpp
+++ b/src/backends/reference/RefTensorHandle.cpp
@@ -122,7 +122,7 @@ bool RefTensorHandle::Import(void* memory, MemorySource source)
if (m_IsImportEnabled && source == MemorySource::Malloc)
{
// Check memory alignment
- constexpr uintptr_t alignment = sizeof(size_t);
+ uintptr_t alignment = GetDataTypeSize(m_TensorInfo.GetDataType());
if (reinterpret_cast<uintptr_t>(memory) % alignment)
{
if (m_Imported)
diff --git a/src/backends/reference/RefWorkloadFactory.cpp b/src/backends/reference/RefWorkloadFactory.cpp
index 75008bc866..36dcd21d32 100644
--- a/src/backends/reference/RefWorkloadFactory.cpp
+++ b/src/backends/reference/RefWorkloadFactory.cpp
@@ -113,10 +113,14 @@ bool RefWorkloadFactory::IsLayerSupported(const IConnectableLayer& layer,
std::unique_ptr<ITensorHandle> RefWorkloadFactory::CreateTensorHandle(const TensorInfo& tensorInfo,
const bool isMemoryManaged) const
{
- // For Ref it is okay to make the TensorHandle memory managed as it can also store a pointer
- // to unmanaged memory. This also ensures memory alignment.
- IgnoreUnused(isMemoryManaged);
- return std::make_unique<RefTensorHandle>(tensorInfo, m_MemoryManager);
+ if (isMemoryManaged)
+ {
+ return std::make_unique<RefTensorHandle>(tensorInfo, m_MemoryManager);
+ }
+ else
+ {
+ return std::make_unique<RefTensorHandle>(tensorInfo, static_cast<unsigned int>(MemorySource::Malloc));
+ }
}
std::unique_ptr<ITensorHandle> RefWorkloadFactory::CreateTensorHandle(const TensorInfo& tensorInfo,
@@ -126,7 +130,15 @@ std::unique_ptr<ITensorHandle> RefWorkloadFactory::CreateTensorHandle(const Tens
// For Ref it is okay to make the TensorHandle memory managed as it can also store a pointer
// to unmanaged memory. This also ensures memory alignment.
IgnoreUnused(isMemoryManaged, dataLayout);
- return std::make_unique<RefTensorHandle>(tensorInfo, m_MemoryManager);
+
+ if (isMemoryManaged)
+ {
+ return std::make_unique<RefTensorHandle>(tensorInfo, m_MemoryManager);
+ }
+ else
+ {
+ return std::make_unique<RefTensorHandle>(tensorInfo, static_cast<unsigned int>(MemorySource::Malloc));
+ }
}
std::unique_ptr<IWorkload> RefWorkloadFactory::CreateActivation(const ActivationQueueDescriptor& descriptor,