author     Finn Williams <Finn.Williams@arm.com>  2021-04-26 12:06:34 +0100
committer  finn.williams <finn.williams@arm.com>  2021-04-28 11:39:10 +0000
commit     01097941ef85073c56cbd1d5f00d7e8ffeb9876d (patch)
tree       818686d467b142084e0e49bbd4084670d1d0d50b
parent     c2b99a8783388ec3bd90dfed2e1b6d4f4d4bd1c8 (diff)
IVGCVSW-5843 Separate memory managers for WorkingMemHandles
* Add inter-layer memory management to WorkingMemHandle
* Change Constant layers to be executed once during LoadedNetwork construction and share their tensorHandle between all WorkingMemHandles
* Fix various reference workloads pointing to memory in the queueDescriptor

Signed-off-by: Finn Williams <Finn.Williams@arm.com>
Change-Id: I69d4b3c5c84d2f5abe4540c3e624ab4f00d88226
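The first bullet is the core of this change: each tensor handle created for a WorkingMemHandle is reference counted against its consumers, so tensors with non-overlapping lifetimes can share backing memory. The sketch below illustrates that bookkeeping using hypothetical single-output Handle and Layer stand-ins rather than the real armnn::ITensorHandle, armnn::Layer and factory APIs; the actual implementation is in LoadedNetwork::CreateWorkingMemHandle in the diff below.

#include <cassert>
#include <memory>
#include <unordered_map>
#include <vector>

// Hypothetical stand-ins for armnn::ITensorHandle and armnn::Layer; real
// handles come from a tensor handle factory and register with a backend
// memory manager.
struct Handle
{
    void Manage()   {}  // begin lifetime tracking with the memory manager
    void Allocate() {}  // lifetime fixed; the memory can now be planned and shared
};

struct Layer
{
    unsigned int numOutputConnections = 0;  // consumers of this layer's single output
    std::vector<Layer*> inputProducers;     // layers whose outputs feed this layer
};

// Walk the graph in topological order, reference counting each produced tensor
// handle and ending its lifetime once its last consumer has been visited.
void PlanWorkingMemory(const std::vector<Layer*>& topologicalOrder)
{
    std::unordered_map<Layer*, std::unique_ptr<Handle>> handleForLayer;
    std::unordered_map<Handle*, unsigned int> refCounts;

    for (Layer* layer : topologicalOrder)
    {
        // Create and start managing the handle backing this layer's output.
        auto handle = std::make_unique<Handle>();
        handle->Manage();
        assert(layer->numOutputConnections != 0);
        refCounts[handle.get()] = layer->numOutputConnections;
        handleForLayer[layer] = std::move(handle);

        // Each consumed input decrements its producer's reference count; when it
        // reaches zero that tensor's lifetime ends and its memory becomes
        // available for later, non-overlapping tensors.
        for (Layer* producer : layer->inputProducers)
        {
            Handle* input = handleForLayer.at(producer).get();
            if (--refCounts.at(input) == 0)
            {
                input->Allocate();
                refCounts.erase(input);
            }
        }
    }
}

A second sketch after the file summary below illustrates the constant-layer sharing described by the second bullet.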
-rw-r--r--  src/armnn/LoadedNetwork.cpp | 358
-rw-r--r--  src/armnn/LoadedNetwork.hpp | 26
-rw-r--r--  src/armnn/WorkingMemHandle.cpp | 56
-rw-r--r--  src/armnn/WorkingMemHandle.hpp | 59
-rw-r--r--  src/backends/backendsCommon/TensorHandleFactoryRegistry.hpp | 5
-rw-r--r--  src/backends/reference/workloads/InstanceNorm.cpp | 2
-rw-r--r--  src/backends/reference/workloads/InstanceNorm.hpp | 1
-rw-r--r--  src/backends/reference/workloads/Pad.cpp | 8
-rw-r--r--  src/backends/reference/workloads/Pad.hpp | 2
-rw-r--r--  src/backends/reference/workloads/PreluImpl.cpp | 8
-rw-r--r--  src/backends/reference/workloads/PreluImpl.hpp | 4
-rw-r--r--  src/backends/reference/workloads/RefArgMinMaxWorkload.cpp | 4
-rw-r--r--  src/backends/reference/workloads/RefGatherWorkload.cpp | 2
-rw-r--r--  src/backends/reference/workloads/RefInstanceNormalizationWorkload.cpp | 3
-rw-r--r--  src/backends/reference/workloads/RefPadWorkload.cpp | 2
-rw-r--r--  src/backends/reference/workloads/RefPreluWorkload.cpp | 6
-rw-r--r--  src/backends/reference/workloads/RefRankWorkload.hpp | 2
-rw-r--r--  src/backends/reference/workloads/RefStackWorkload.cpp | 20
-rw-r--r--  src/backends/reference/workloads/RefWorkloadUtils.hpp | 6
-rw-r--r--  src/backends/reference/workloads/Stack.cpp | 18
20 files changed, 339 insertions, 253 deletions
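Picking up the forward reference above: the second bullet — Constant layers executed once at load time and their tensor handles shared by every WorkingMemHandle — amounts to running each constant workload a single time during LoadedNetwork construction and caching its output handle by layer GUID. A minimal sketch, again with hypothetical stand-ins (LayerGuid, Handle, ConstWorkload) rather than the real IWorkload/ITensorHandle types:

#include <cstddef>
#include <unordered_map>
#include <vector>

using LayerGuid = unsigned int;

struct Handle { void Allocate() {} };

struct ConstWorkload
{
    // In the real implementation this is IWorkload::ExecuteAsync with a
    // WorkingMemDescriptor whose only output is the constant's handle.
    void RunOnce(Handle* out) { (void)out; /* write the constant data */ }
};

// Executed once while the network is loaded; every WorkingMemHandle created
// later looks up these inputs in the returned map instead of owning a copy.
std::unordered_map<LayerGuid, Handle*> RunConstantsOnce(
    std::unordered_map<LayerGuid, ConstWorkload>& constWorkloads,
    std::vector<Handle>& storage)
{
    std::unordered_map<LayerGuid, Handle*> shared;
    std::size_t next = 0;
    for (auto& entry : constWorkloads)
    {
        Handle* handle = &storage.at(next++);  // assumes storage was sized up front
        handle->Allocate();
        entry.second.RunOnce(handle);          // constant computed exactly once
        shared[entry.first] = handle;          // shared across all WorkingMemHandles
    }
    return shared;
}

In the diff itself this corresponds to AllocateAndExecuteConstantWorkloads() and the m_ConstantTensorHandles lookup inside CreateWorkingMemHandle.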
diff --git a/src/armnn/LoadedNetwork.cpp b/src/armnn/LoadedNetwork.cpp
index d75a2021b2..85451cb0d8 100644
--- a/src/armnn/LoadedNetwork.cpp
+++ b/src/armnn/LoadedNetwork.cpp
@@ -161,35 +161,38 @@ LoadedNetwork::LoadedNetwork(std::unique_ptr<IOptimizedNetwork> net,
}
}
}
-
- for (auto&& layer : order)
+ if (!networkProperties.m_AsyncEnabled)
{
- auto& workloadFactory = GetWorkloadFactory(*layer);
-
- switch (layer->GetType())
+ for (auto &&layer : order)
{
- case LayerType::Input:
- case LayerType::MemImport:
- {
- // If IsImportEnabled is true then we need to set IsMemoryManaged to false when creating TensorHandles
- layer->CreateTensorHandles(m_TensorHandleFactoryRegistry, workloadFactory,
- !m_NetworkProperties.m_ImportEnabled);
- break;
- }
- default:
+ auto &workloadFactory = GetWorkloadFactory(*layer);
+
+ switch (layer->GetType())
{
- // Look for the layer with 1 OutputSlot which has 1 connection and that connection is an Output Layer
- // If Export is enabled disable memory management so we can export, otherwise we do a copy
- if((layer->GetNumOutputSlots() == 1) &&
- (layer->GetOutputSlots()[0].GetNumConnections() == 1) &&
- (layer->GetOutputSlots()[0].GetConnection(0)->GetOwningLayer().GetType() == LayerType::Output))
+ case LayerType::Input:
+ case LayerType::MemImport:
{
+ // If IsImportEnabled is true then we need to set IsMemoryManaged
+ // to false when creating TensorHandles
layer->CreateTensorHandles(m_TensorHandleFactoryRegistry, workloadFactory,
- !m_NetworkProperties.m_ExportEnabled);
+ !m_NetworkProperties.m_ImportEnabled);
+ break;
}
- else
+ default:
{
- layer->CreateTensorHandles(m_TensorHandleFactoryRegistry, workloadFactory);
+ // Look for a layer with 1 OutputSlot which has 1 connection and that connection is an Output Layer
+ // If Export is enabled disable memory management so we can export, otherwise we do a copy
+ if ((layer->GetNumOutputSlots() == 1) &&
+ (layer->GetOutputSlots()[0].GetNumConnections() == 1) &&
+ (layer->GetOutputSlots()[0].GetConnection(0)->GetOwningLayer().GetType() == LayerType::Output))
+ {
+ layer->CreateTensorHandles(m_TensorHandleFactoryRegistry, workloadFactory,
+ !m_NetworkProperties.m_ExportEnabled);
+ }
+ else
+ {
+ layer->CreateTensorHandles(m_TensorHandleFactoryRegistry, workloadFactory);
+ }
}
}
}
@@ -249,7 +252,17 @@ LoadedNetwork::LoadedNetwork(std::unique_ptr<IOptimizedNetwork> net,
AddWorkloadStructure(timelineUtils, workload, *layer);
}
- m_WorkloadQueue.push_back(move(workload));
+ // For async networks ConstantWorkloads are managed exclusively by LoadedNetwork
+ // and are separated out from the other workloads
+ if (networkProperties.m_AsyncEnabled && layer->GetType() == LayerType::Constant)
+ {
+ m_ConstantWorkloads[layer->GetGuid()] = std::move(workload);
+ }
+ else
+ {
+ m_WorkloadQueue.push_back(move(workload));
+ }
+
// release the constant data in the layer..
layer->ReleaseConstantData();
break;
@@ -268,16 +281,50 @@ LoadedNetwork::LoadedNetwork(std::unique_ptr<IOptimizedNetwork> net,
timelineUtils->Commit();
}
- // Set up memory.
- m_OptimizedNetwork->pOptimizedNetworkImpl->GetGraph().AllocateDynamicBuffers();
+ if (!networkProperties.m_AsyncEnabled)
+ {
+ // Set up memory.
+ m_OptimizedNetwork->pOptimizedNetworkImpl->GetGraph().AllocateDynamicBuffers();
- // Now that the intermediate tensor memory has been set-up, do any post allocation configuration for each workload.
- for (auto& workload : m_WorkloadQueue)
+ // Now that the intermediate tensor memory has been set-up,
+ // do any post allocation configuration for each workload.
+ for (auto &workload : m_WorkloadQueue)
+ {
+ workload->PostAllocationConfigure();
+ }
+ }
+ else
{
- workload->PostAllocationConfigure();
+ AllocateAndExecuteConstantWorkloads();
}
}
+void LoadedNetwork::AllocateAndExecuteConstantWorkloads()
+{
+ Graph& order = m_OptimizedNetwork->pOptimizedNetworkImpl->GetGraph();
+ for (auto&& layer : order)
+ {
+ if (layer->GetType() == LayerType::Constant)
+ {
+ const auto& outSlot = layer->GetOutputSlots()[0];
+ const auto factoryId = outSlot.GetTensorHandleFactoryId();
+ ARMNN_ASSERT(factoryId != ITensorHandleFactory::LegacyFactoryId);
+ auto& workloadFactory = GetWorkloadFactory(*layer);
+
+ layer->CreateTensorHandles(m_TensorHandleFactoryRegistry, workloadFactory);
+ ITensorHandle* tensorHandle = outSlot.GetOutputHandler().GetData();
+
+ m_ConstantTensorHandles[layer->GetGuid()] = tensorHandle;
+ tensorHandle->Allocate();
+
+ WorkingMemDescriptor memDesc;
+ memDesc.m_Outputs.push_back(tensorHandle);
+ m_ConstantWorkloads[layer->GetGuid()]->ExecuteAsync(memDesc);
+ }
+ }
+}
+
+
void LoadedNetwork::SendNetworkStructure()
{
Graph& order = m_OptimizedNetwork->pOptimizedNetworkImpl->GetGraph().TopologicalSort();
@@ -803,9 +850,8 @@ void LoadedNetwork::EnqueueInput(const BindableLayer& layer,
{
throw InvalidArgumentException("EnqueueInput: given layer not an InputLayer");
}
- LayerGuid id = layer.GetOutputSlot(0).GetConnection(0)->GetOwningLayer().GetGuid();
+ LayerGuid id = layer.GetGuid();
WorkingMemDescriptor descriptor = context.GetWorkingMemDescriptor(id);
- ARMNN_ASSERT_MSG(descriptor.m_Outputs.size() == 1, "Can only handle Input Layer with one output");
MemorySourceFlags importFlags = descriptor.m_Outputs[0]->GetImportFlags();
if (m_NetworkProperties.m_ImportEnabled) // Try import the input tensor
@@ -841,7 +887,7 @@ void LoadedNetwork::EnqueueInput(const BindableLayer& layer,
memcpy(dst, src, size);
};
- for (const auto& input : descriptor.m_Inputs)
+ for (const auto& input : descriptor.m_Outputs)
{
CopyTensorContentsGeneric(tensorHandle.get(), input, copyFunc);
}
@@ -856,7 +902,7 @@ void LoadedNetwork::EnqueueOutput(const BindableLayer& layer, const Tensor& outp
}
ARMNN_ASSERT_MSG(layer.GetNumInputSlots() == 1, "Output Layer should have exactly one input.");
- LayerGuid id = layer.GetInputSlot(0).GetConnectedOutputSlot()->GetOwningLayerGuid();
+ LayerGuid id = layer.GetGuid();
WorkingMemDescriptor descriptor = handle.GetWorkingMemDescriptor(id);
ITensorHandle* inputTensorHandle = descriptor.m_Inputs[0];
@@ -888,8 +934,8 @@ void LoadedNetwork::EnqueueOutput(const BindableLayer& layer, const Tensor& outp
if (importOk)
{
ARMNN_SCOPED_PROFILING_EVENT(Compute::Undefined, "SyncMemGeneric_Execute");
- descriptor.m_Inputs[0]->Map(true);
- descriptor.m_Inputs[0]->Unmap();
+ inputTensorHandle->Map(true);
+ inputTensorHandle->Unmap();
}
else
{
@@ -914,10 +960,38 @@ void LoadedNetwork::EnqueueOutput(const BindableLayer& layer, const Tensor& outp
};
std::unique_ptr<ITensorHandle> tensorHandle =
- std::make_unique<PassthroughCpuTensorHandle>(outputTensor.GetInfo(), outputTensor.GetMemoryArea());
+ std::make_unique<PassthroughCpuTensorHandle>(outputTensor.GetInfo(),
+ outputTensor.GetMemoryArea());
+
+ CopyTensorContentsGeneric(inputTensorHandle, tensorHandle.get(), copyFunc);
+ }
+}
+
+
+const armnn::ConstTensor GetInputTensor(const LayerBindingId layerId, const InputTensors& inputTensors)
+{
+ for (auto inputTensorPair : inputTensors)
+ {
+ LayerBindingId id = inputTensorPair.first;
+ if (id == layerId)
+ {
+ return inputTensorPair.second;
+ }
+ }
+ throw InvalidArgumentException("Input does not exist.");
+}
- CopyTensorContentsGeneric(descriptor.m_Outputs[0], tensorHandle.get(), copyFunc);
+const armnn::Tensor GetOutputTensor(const LayerBindingId layerId, const OutputTensors& outputTensors)
+{
+ for (auto outputTensorPair : outputTensors)
+ {
+ LayerBindingId id = outputTensorPair.first;
+ if (id == layerId)
+ {
+ return outputTensorPair.second;
+ }
}
+ throw InvalidArgumentException("Output does not exist.");
}
Status LoadedNetwork::Execute(const InputTensors& inputTensors,
@@ -971,12 +1045,9 @@ Status LoadedNetwork::Execute(const InputTensors& inputTensors,
{
ARMNN_SCOPED_PROFILING_EVENT(Compute::Undefined, "PrepareInputs");
- unsigned int i = 0;
-
for (const BindableLayer* inputLayer : graph.GetInputLayers())
{
- EnqueueInput(*inputLayer, inputTensors[i].second, workingMemHandle);
- ++i;
+ EnqueueInput(*inputLayer, GetInputTensor(inputLayer->GetBindingId(), inputTensors), workingMemHandle);
}
}
@@ -1016,130 +1087,153 @@ Status LoadedNetwork::Execute(const InputTensors& inputTensors,
// For each output to the network, call EnqueueOutput with the data passed by the user.
{
ARMNN_SCOPED_PROFILING_EVENT(Compute::Undefined, "PrepareOutputs");
- unsigned int i = static_cast<unsigned int>(m_WorkloadQueue.size() - graph.GetNumOutputs());
-
- for (const BindableLayer* outputLayer : graph.GetOutputLayers())
+ for (const BindableLayer *outputLayer : graph.GetOutputLayers())
{
- EnqueueOutput(*outputLayer, outputTensors[i].second, workingMemHandle);
- ++i;
+ EnqueueOutput(*outputLayer, GetOutputTensor(outputLayer->GetBindingId(), outputTensors), workingMemHandle);
}
}
return executionSucceeded ? Status::Success : Status::Failure;
}
-// Need something like the collectors to get the correct tensors for the inputs
-void LoadedNetwork::CollectInputTensorHandles(
- std::unordered_map<LayerGuid, std::vector<ITensorHandle*> >& tensorHandles,
- std::vector<ITensorHandle*>& inputs,
- const armnn::Layer* layer,
- const TensorHandleFactoryRegistry& registry,
- const bool isMemoryManaged)
+
+/// Create a new unique WorkingMemHandle object. Create multiple handles if you wish to have
+/// overlapped Execution by calling this function from different threads.
+std::unique_ptr<IWorkingMemHandle> LoadedNetwork::CreateWorkingMemHandle(NetworkId networkId)
{
- for (auto&& inputSlot : layer->GetInputSlots())
- {
- // The graph must be well-formed at this point.
- ARMNN_ASSERT(inputSlot.GetConnection());
- auto outputSlot = inputSlot.GetConnectedOutputSlot();
- auto key = outputSlot->GetOwningLayer().GetGuid();
- auto search = tensorHandles.find(key);
+ Graph& order = m_OptimizedNetwork->pOptimizedNetworkImpl->GetGraph();
+ std::unordered_map<LayerGuid, std::vector<std::unique_ptr<ITensorHandle> > > tensorHandleMap;
+ std::vector<WorkingMemDescriptor> workingMemDescriptors;
+ std::unordered_map<LayerGuid, WorkingMemDescriptor> workingMemDescriptorMap;
+ TensorHandleFactoryRegistry tensorHandleFactoryRegistry;
+ WorkloadFactoryMap workloadFactoryMap;
- if (search == tensorHandles.end())
- {
- ITensorHandleFactory::FactoryId factoryId = outputSlot->GetTensorHandleFactoryId();
- const TensorInfo& tensorInfo = outputSlot->GetTensorInfo();
+ std::vector<std::shared_ptr<IMemoryManager>> memoryManagers;
- ARMNN_ASSERT(factoryId != ITensorHandleFactory::LegacyFactoryId);
- ITensorHandleFactory* handleFactory = registry.GetFactory(factoryId);
- ARMNN_ASSERT(handleFactory);
- std::unique_ptr<ITensorHandle> tensor = handleFactory->CreateTensorHandle(tensorInfo, isMemoryManaged);
- ITensorHandle* tensorPtr = tensor.release();
- inputs.push_back(tensorPtr);
+ for (auto const& backend : m_Backends)
+ {
+ if (backend.second->SupportsTensorAllocatorAPI())
+ {
+ backend.second->RegisterTensorHandleFactories(tensorHandleFactoryRegistry);
+ memoryManagers.emplace_back(tensorHandleFactoryRegistry.GetMemoryManagers().back());
}
else
{
- unsigned int index = outputSlot->CalculateIndexOnOwner();
- inputs.push_back(search->second[index]);
+ std::shared_ptr<IMemoryManager> memoryManager = backend.second->CreateMemoryManager();
+ auto workloadFactory = backend.second->CreateWorkloadFactory(
+ memoryManager, m_OptimizedNetwork->pOptimizedNetworkImpl->GetModelOptions());
+
+ workloadFactoryMap.emplace(
+ std::make_pair(backend.first, std::make_pair(std::move(workloadFactory), memoryManager)));
+ memoryManagers.emplace_back(memoryManager);
}
}
-}
-
-void LoadedNetwork::CreateOutputTensorHandles(
- std::unordered_map<LayerGuid, std::vector<ITensorHandle*> >& tensorHandles,
- std::vector<ITensorHandle*>& outputs,
- const armnn::Layer* layer,
- const TensorHandleFactoryRegistry& registry,
- const bool isMemoryManaged)
-{
- auto guid = layer->GetGuid();
- std::vector<ITensorHandle*> tensorHandleVectors;
- tensorHandleVectors.reserve(layer->GetNumOutputSlots());
- for (unsigned int idx=0; idx < layer->GetNumOutputSlots(); idx++)
+ auto GetTensorHandle = [&](Layer* layer, const OutputSlot& outputSlot, bool isMemoryManaged)
{
- const OutputSlot& slot = layer->GetOutputSlot(idx);
- ITensorHandleFactory::FactoryId factoryId = slot.GetTensorHandleFactoryId();
- const TensorInfo& tensorInfo = slot.GetTensorInfo();
+ ITensorHandleFactory::FactoryId factoryId = outputSlot.GetTensorHandleFactoryId();
+ const TensorInfo& tensorInfo = outputSlot.GetTensorInfo();
- ARMNN_ASSERT(factoryId != ITensorHandleFactory::LegacyFactoryId);
- ITensorHandleFactory* handleFactory = registry.GetFactory(factoryId);
- ARMNN_ASSERT(handleFactory);
- std::unique_ptr<ITensorHandle> tensor = handleFactory->CreateTensorHandle(tensorInfo, isMemoryManaged);
- ITensorHandle* tensorPtr = tensor.release();
- outputs.push_back(tensorPtr);
- tensorHandleVectors.push_back(tensorPtr);
- }
- tensorHandles.insert({guid, tensorHandleVectors});
-}
-
-/// Create a new unique WorkingMemHandle object. Create multiple handles if you wish to have
-/// overlapped Execution by calling this function from different threads.
-std::unique_ptr<IWorkingMemHandle> LoadedNetwork::CreateWorkingMemHandle(NetworkId networkId)
-{
- Graph& order = m_OptimizedNetwork->pOptimizedNetworkImpl->GetGraph();
- std::unordered_map<LayerGuid, std::vector<ITensorHandle*> > tensorHandles;
- std::vector<WorkingMemDescriptor> workingMemDescriptors;
- std::unordered_map<LayerGuid, WorkingMemDescriptor> workingMemDescriptorMap;
+ if (factoryId == ITensorHandleFactory::LegacyFactoryId)
+ {
+ BackendId id = layer->GetBackendId();
+ ARMNN_NO_DEPRECATE_WARN_BEGIN
+ return workloadFactoryMap.at(id).first->CreateTensorHandle(tensorInfo, isMemoryManaged);
+ ARMNN_NO_DEPRECATE_WARN_END
+ }
+ else
+ {
+ ITensorHandleFactory* handleFactory = tensorHandleFactoryRegistry.GetFactory(factoryId);
+ ARMNN_ASSERT(handleFactory);
+ return handleFactory->CreateTensorHandle(tensorInfo, isMemoryManaged);
+ }
+ };
+ std::unordered_map<const ITensorHandle*, unsigned int> handleReferenceCounts;
for (auto&& layer : order)
{
- if (layer->GetType() == LayerType::Input || layer->GetType() == LayerType::Output)
+ WorkingMemDescriptor workingMemDescriptor;
+
+ // Constant layer execution and management is handled during LoadedNetwork construction
+ if (layer->GetType() == LayerType::Constant)
{
continue;
}
- WorkingMemDescriptor workingMemDescriptor;
+ bool isMemoryManaged = true;
+ bool isInputLayer = true;
// Look for the layer with 1 OutputSlot which has 1 connection and that connection is an Output Layer
// If Export is enabled disable memory management so we can export, otherwise we do a copy
- if((layer->GetNumOutputSlots() == 1) &&
- (layer->GetOutputSlots()[0].GetNumConnections() == 1) &&
- (layer->GetOutputSlots()[0].GetConnection(0)->GetOwningLayer().GetType() == LayerType::Output))
+ if ((layer->GetNumOutputSlots() == 1) &&
+ (layer->GetOutputSlots()[0].GetNumConnections() == 1) &&
+ (layer->GetOutputSlots()[0].GetConnection(0)->GetOwningLayer().GetType() == LayerType::Output))
{
- CollectInputTensorHandles(tensorHandles,
- workingMemDescriptor.m_Inputs,
- layer,
- m_TensorHandleFactoryRegistry,
- !m_NetworkProperties.m_ExportEnabled);
- CreateOutputTensorHandles(tensorHandles,
- workingMemDescriptor.m_Outputs,
- layer,
- m_TensorHandleFactoryRegistry,
- !m_NetworkProperties.m_ExportEnabled);
+ isMemoryManaged = !m_NetworkProperties.m_ExportEnabled;
}
- else
+ else if (layer->GetType() == LayerType::Input || layer->GetType() == LayerType::MemImport)
+ {
+ // Input layers/workloads will not be executed, so the descriptor is not added to workingMemDescriptors
+ // However, we will still need to manage the tensorHandle
+ isInputLayer = false;
+ isMemoryManaged = !m_NetworkProperties.m_ExportEnabled;
+ }
+
+ // Create a tensor handle for each output slot of a layer
+ // Once we create it, we start managing its lifetime
+ for (auto& slot : layer->GetOutputSlots())
+ {
+ tensorHandleMap[layer->GetGuid()].emplace_back(GetTensorHandle(layer, slot, isMemoryManaged));
+ ITensorHandle* tensorHandle = tensorHandleMap[layer->GetGuid()].back().get();
+
+ workingMemDescriptor.m_Outputs.push_back(tensorHandle);
+ tensorHandle->Manage();
+ unsigned int numConnections = slot.GetNumConnections();
+ ARMNN_ASSERT(numConnections != 0);
+
+ handleReferenceCounts[tensorHandle] = numConnections;
+ }
+ // Loop through the input slots in the same layer and decrement the reference counter associated
+ // with each tensor handle we encounter.
+ // Once it reaches zero, the lifetime of the tensor handle has ended, and we mark its memory as available
+ // so that the next tensor handle with a non-overlapping lifetime can share its memory.
+ for (auto& slot : layer->GetInputSlots())
{
- CollectInputTensorHandles(tensorHandles,
- workingMemDescriptor.m_Inputs,
- layer,
- m_TensorHandleFactoryRegistry);
- CreateOutputTensorHandles(tensorHandles,
- workingMemDescriptor.m_Outputs,
- layer,
- m_TensorHandleFactoryRegistry);
+ ARMNN_ASSERT(slot.GetConnection());
+ auto outputSlot = slot.GetConnectedOutputSlot();
+ auto key = outputSlot->GetOwningLayer().GetGuid();
+
+ // Constant layer execution and management is handled during LoadedNetwork construction
+ auto found = m_ConstantTensorHandles.find(key);
+ if (found != m_ConstantTensorHandles.end())
+ {
+ workingMemDescriptor.m_Inputs.push_back(found->second);
+ continue;
+ }
+
+ auto search = tensorHandleMap.find(key);
+ unsigned int index = outputSlot->CalculateIndexOnOwner();
+ ITensorHandle* inputTensorHandle = search->second[index].get();
+ workingMemDescriptor.m_Inputs.push_back(inputTensorHandle);
+ --handleReferenceCounts.at(inputTensorHandle);
+ if (handleReferenceCounts.at(inputTensorHandle) == 0u)
+ {
+ // Stop managing lifetime of tensor handle
+ inputTensorHandle->Allocate();
+ handleReferenceCounts.erase(inputTensorHandle);
+ }
}
workingMemDescriptorMap.insert({layer->GetGuid(), workingMemDescriptor});
- workingMemDescriptors.push_back(workingMemDescriptor);
+
+ // Input layers/workloads will not be executed, so the descriptor is not added to workingMemDescriptors
+ // However, we will still need to manage the tensorHandle
+ if (isInputLayer)
+ {
+ workingMemDescriptors.push_back(workingMemDescriptor);
+ }
}
+
return std::make_unique<WorkingMemHandle>(networkId,
workingMemDescriptors,
- workingMemDescriptorMap);
+ workingMemDescriptorMap,
+ memoryManagers,
+ std::move(tensorHandleMap));
}
void LoadedNetwork::RegisterDebugCallback(const DebugCallbackFunction& func)
diff --git a/src/armnn/LoadedNetwork.hpp b/src/armnn/LoadedNetwork.hpp
index 2bcf5c8c08..51092c744e 100644
--- a/src/armnn/LoadedNetwork.hpp
+++ b/src/armnn/LoadedNetwork.hpp
@@ -74,24 +74,21 @@ public:
profiling::ProfilingGuid GetNetworkGuid();
private:
+ using WorkloadFactoryWithMemoryManager =
+ std::pair<IBackendInternal::IWorkloadFactoryPtr, IBackendInternal::IMemoryManagerSharedPtr>;
+
+ using WorkloadFactoryMap = std::unordered_map<BackendId, WorkloadFactoryWithMemoryManager>;
+
void AllocateWorkingMemory(std::lock_guard<std::mutex>& lock);
+ void AllocateAndExecuteConstantWorkloads();
+
+ std::unordered_map<LayerGuid, ITensorHandle* > m_ConstantTensorHandles;
+ std::unordered_map<LayerGuid, std::unique_ptr<IWorkload> > m_ConstantWorkloads;
LoadedNetwork(std::unique_ptr<IOptimizedNetwork> net,
const INetworkProperties& networkProperties,
profiling::ProfilingService& profilingService);
- void CollectInputTensorHandles(std::unordered_map<LayerGuid, std::vector<ITensorHandle*> >& tensorHandles,
- std::vector<ITensorHandle*>& inputs,
- const armnn::Layer* layer,
- const TensorHandleFactoryRegistry& registry,
- const bool isMemoryManaged = false);
-
- void CreateOutputTensorHandles(std::unordered_map<LayerGuid, std::vector<ITensorHandle*> >& tensorHandles,
- std::vector<ITensorHandle*>& outputs,
- const armnn::Layer* layer,
- const TensorHandleFactoryRegistry& registry,
- const bool isMemoryManaged = false);
-
void EnqueueInput(const BindableLayer& layer, ITensorHandle* tensorHandle, const TensorInfo& tensorInfo);
void EnqueueOutput(const BindableLayer& layer, ITensorHandle* tensorHandle, const TensorInfo& tensorInfo);
@@ -107,11 +104,6 @@ private:
using BackendPtrMap = std::unordered_map<BackendId, IBackendInternalUniquePtr>;
- using WorkloadFactoryWithMemoryManager =
- std::pair<IBackendInternal::IWorkloadFactoryPtr, IBackendInternal::IMemoryManagerSharedPtr>;
-
- using WorkloadFactoryMap = std::unordered_map<BackendId, WorkloadFactoryWithMemoryManager>;
-
BackendPtrMap m_Backends;
WorkloadFactoryMap m_WorkloadFactories;
diff --git a/src/armnn/WorkingMemHandle.cpp b/src/armnn/WorkingMemHandle.cpp
index c1a48d482f..0cbef82e83 100644
--- a/src/armnn/WorkingMemHandle.cpp
+++ b/src/armnn/WorkingMemHandle.cpp
@@ -6,6 +6,7 @@
#include "backendsCommon/CpuTensorHandle.hpp"
#include "WorkingMemHandle.hpp"
#include "Network.hpp"
+#include <armnn/backends/IMemoryManager.hpp>
namespace armnn
{
@@ -13,36 +14,47 @@ namespace armnn
namespace experimental
{
-WorkingMemHandle::WorkingMemHandle(NetworkId networkId,
- std::vector<WorkingMemDescriptor> workingMemDescriptors,
- std::unordered_map<LayerGuid, WorkingMemDescriptor> workingMemDescriptorMap) :
+WorkingMemHandle::WorkingMemHandle(
+ NetworkId networkId,
+ std::vector<WorkingMemDescriptor> workingMemDescriptors,
+ std::unordered_map<LayerGuid, WorkingMemDescriptor> workingMemDescriptorMap,
+ std::vector<std::shared_ptr<IMemoryManager>> memoryManagers,
+ std::unordered_map<LayerGuid, std::vector<std::unique_ptr<ITensorHandle> > > ownedTensorHandles) :
m_NetworkId(networkId),
m_WorkingMemDescriptors(workingMemDescriptors),
m_WorkingMemDescriptorMap(workingMemDescriptorMap),
+ m_MemoryManagers(memoryManagers),
+ m_OwnedTensorHandles(std::move(ownedTensorHandles)),
m_IsAllocated(false),
m_Mutex()
-{}
+{
+}
-void WorkingMemHandle::FreeWorkingMemory()
+void WorkingMemHandle::Allocate()
{
- for (auto workingMemDescriptor : m_WorkingMemDescriptors)
+ if (m_IsAllocated)
+ {
+ return;
+ }
+ m_IsAllocated = true;
+
+ for (auto& mgr : m_MemoryManagers)
+ {
+ mgr->Acquire();
+ }
+}
+
+void WorkingMemHandle::Free()
+{
+ if (!m_IsAllocated)
+ {
+ return;
+ }
+ m_IsAllocated = false;
+
+ for (auto& mgr : m_MemoryManagers)
{
- for (auto input : workingMemDescriptor.m_Inputs)
- {
- if (input)
- {
- delete input;
- input = nullptr;
- }
- }
- for (auto output : workingMemDescriptor.m_Outputs)
- {
- if (output)
- {
- delete output;
- output = nullptr;
- }
- }
+ mgr->Release();
}
}
diff --git a/src/armnn/WorkingMemHandle.hpp b/src/armnn/WorkingMemHandle.hpp
index cef6fb6fd3..92b0acaec3 100644
--- a/src/armnn/WorkingMemHandle.hpp
+++ b/src/armnn/WorkingMemHandle.hpp
@@ -26,10 +26,12 @@ class WorkingMemHandle final : public IWorkingMemHandle
public:
WorkingMemHandle(NetworkId networkId,
std::vector<WorkingMemDescriptor> workingMemDescriptors,
- std::unordered_map<LayerGuid, WorkingMemDescriptor> workingMemDescriptorMap);
+ std::unordered_map<LayerGuid, WorkingMemDescriptor> workingMemDescriptorMap,
+ std::vector<std::shared_ptr<IMemoryManager>> memoryManagers,
+ std::unordered_map<LayerGuid, std::vector<std::unique_ptr<ITensorHandle> > > ownedTensorHandles);
~WorkingMemHandle()
- { FreeWorkingMemory(); }
+ { Free(); }
NetworkId GetNetworkId() override
{
@@ -38,50 +40,10 @@ public:
/// Allocate the backing memory required for execution. If this is not called, then allocation will be
/// deferred to execution time. The mutex must be locked.
- void Allocate() override
- {
- if (m_IsAllocated)
- {
- return;
- }
- m_IsAllocated = true;
-
- // Iterate through all WorkingMemDescriptors calling allocate() on each input and output in turn
- for (auto workingMemDescriptor : m_WorkingMemDescriptors)
- {
- for (auto& input : workingMemDescriptor.m_Inputs)
- {
- input->Allocate();
- }
- for (auto& output : workingMemDescriptor.m_Outputs)
- {
- output->Allocate();
- }
- }
- }
+ void Allocate() override;
/// Free the backing memory required for execution. The mutex must be locked.
- void Free() override
- {
- if (!m_IsAllocated)
- {
- return;
- }
- m_IsAllocated = false;
-
- // Iterate through all WorkingMemDescriptors calling free() on each input and output in turn
- for (auto workingMemDescriptor : m_WorkingMemDescriptors)
- {
- for (auto& input : workingMemDescriptor.m_Inputs)
- {
- input->Unmap();
- }
- for (auto& output : workingMemDescriptor.m_Outputs)
- {
- output->Unmap();
- }
- }
- }
+ void Free() override;
/// IsAllocated returns true if the backing memory is currently allocated. The mutex must be locked.
bool IsAllocated() override
@@ -111,13 +73,18 @@ public:
}
private:
- void FreeWorkingMemory();
-
NetworkId m_NetworkId;
std::shared_ptr<ProfilerImpl> m_Profiler;
std::vector<WorkingMemDescriptor> m_WorkingMemDescriptors;
std::unordered_map<LayerGuid, WorkingMemDescriptor> m_WorkingMemDescriptorMap;
+
+ // Vector of IMemoryManagers that manage the WorkingMemHandle's memory
+ std::vector<std::shared_ptr<IMemoryManager>> m_MemoryManagers;
+ // TensorHandles owned by this WorkingMemHandle
+ // Constant tensors can be shared by multiple WorkingMemHandles and so are not stored here
+ std::unordered_map<LayerGuid, std::vector<std::unique_ptr<ITensorHandle> > > m_OwnedTensorHandles;
+
bool m_IsAllocated;
std::mutex m_Mutex;
};
diff --git a/src/backends/backendsCommon/TensorHandleFactoryRegistry.hpp b/src/backends/backendsCommon/TensorHandleFactoryRegistry.hpp
index f926478432..e9e76e73a6 100644
--- a/src/backends/backendsCommon/TensorHandleFactoryRegistry.hpp
+++ b/src/backends/backendsCommon/TensorHandleFactoryRegistry.hpp
@@ -41,6 +41,11 @@ public:
/// Release memory required for inference
void ReleaseMemory();
+ std::vector<std::shared_ptr<IMemoryManager>>& GetMemoryManagers()
+ {
+ return m_MemoryManagers;
+ }
+
private:
std::vector<std::unique_ptr<ITensorHandleFactory>> m_Factories;
std::vector<std::shared_ptr<IMemoryManager>> m_MemoryManagers;
diff --git a/src/backends/reference/workloads/InstanceNorm.cpp b/src/backends/reference/workloads/InstanceNorm.cpp
index d628c03e5f..b6e616ad49 100644
--- a/src/backends/reference/workloads/InstanceNorm.cpp
+++ b/src/backends/reference/workloads/InstanceNorm.cpp
@@ -16,10 +16,10 @@ namespace armnn
{
void InstanceNorm(const InstanceNormalizationQueueDescriptor& data,
+ const TensorInfo& inputInfo,
Decoder<float>& inputDecoder,
Encoder<float>& outputEncoder)
{
- const TensorInfo& inputInfo = GetTensorInfo(data.m_Inputs[0]);
const TensorShape inputShape = inputInfo.GetShape();
armnnUtils::DataLayoutIndexed dataLayout(data.m_Parameters.m_DataLayout);
diff --git a/src/backends/reference/workloads/InstanceNorm.hpp b/src/backends/reference/workloads/InstanceNorm.hpp
index 2e3a18fc4b..6a783732b3 100644
--- a/src/backends/reference/workloads/InstanceNorm.hpp
+++ b/src/backends/reference/workloads/InstanceNorm.hpp
@@ -14,6 +14,7 @@ namespace armnn
{
void InstanceNorm(const InstanceNormalizationQueueDescriptor& data,
+ const TensorInfo& inputInfo,
Decoder<float>& inputData,
Encoder<float>& outputData);
diff --git a/src/backends/reference/workloads/Pad.cpp b/src/backends/reference/workloads/Pad.cpp
index 1f8b674c3a..f58dbaea61 100644
--- a/src/backends/reference/workloads/Pad.cpp
+++ b/src/backends/reference/workloads/Pad.cpp
@@ -38,6 +38,8 @@ namespace armnn
void Pad(const TensorInfo& inputInfo,
const TensorInfo& outputInfo,
+ const ITensorHandle* inputHandle,
+ ITensorHandle* outputHandle,
const PadQueueDescriptor& data)
{
auto padList = data.m_Parameters.m_PadList;
@@ -66,15 +68,15 @@ void Pad(const TensorInfo& inputInfo,
unsigned int outputHeight = 0;
unsigned int outputWidth = 0;
- auto inputData = MakeDecoder<float>(inputInfo, data.m_Inputs[0]->Map());
- auto outData = MakeEncoder<float>(outputInfo, data.m_Outputs[0]->Map());
+ auto inputData = MakeDecoder<float>(inputInfo, inputHandle->Map());
+ auto outData = MakeEncoder<float>(outputInfo, outputHandle->Map());
// Fill the output tensor with Pad value first
if (outputInfo.IsQuantized())
{
// For Quantized types Pad Value should not be quantized with scale and offset of the tensor info
auto temporaryInfo = TensorInfo(outputInfo.GetShape(), outputInfo.GetDataType(), 1.0f, 0);
- auto outputData = MakeEncoder<float>(temporaryInfo, data.m_Outputs[0]->Map());
+ auto outputData = MakeEncoder<float>(temporaryInfo, outputHandle->Map());
FillOutputWithPadValue(*outputData, padValue, numOutputElements);
}
else
diff --git a/src/backends/reference/workloads/Pad.hpp b/src/backends/reference/workloads/Pad.hpp
index e7be44e88c..65f64dffed 100644
--- a/src/backends/reference/workloads/Pad.hpp
+++ b/src/backends/reference/workloads/Pad.hpp
@@ -15,6 +15,8 @@ namespace armnn
void Pad(const TensorInfo& inputInfo,
const TensorInfo& outputInfo,
+ const ITensorHandle* inputHandle,
+ ITensorHandle* outputHandle,
const PadQueueDescriptor& data);
} //namespace armnn
diff --git a/src/backends/reference/workloads/PreluImpl.cpp b/src/backends/reference/workloads/PreluImpl.cpp
index 458025bb0a..6df259fa4d 100644
--- a/src/backends/reference/workloads/PreluImpl.cpp
+++ b/src/backends/reference/workloads/PreluImpl.cpp
@@ -10,15 +10,13 @@
namespace armnn
{
-void PreluImpl(const PreluQueueDescriptor& data,
+void PreluImpl(const TensorInfo& inputInfo,
+ const TensorInfo& alphaInfo,
+ const TensorInfo& outputInfo,
Decoder<float>& inputData,
Decoder<float>& alphaData,
Encoder<float>& outputData)
{
- const TensorInfo& inputInfo = GetTensorInfo(data.m_Inputs[0]);
- const TensorInfo& alphaInfo = GetTensorInfo(data.m_Inputs[1]);
- const TensorInfo& outputInfo = GetTensorInfo(data.m_Outputs[0]);
-
const TensorShape& inputShape = inputInfo.GetShape();
const TensorShape& alphaShape = alphaInfo.GetShape();
const TensorShape& outputShape = outputInfo.GetShape();
diff --git a/src/backends/reference/workloads/PreluImpl.hpp b/src/backends/reference/workloads/PreluImpl.hpp
index 9299b1c7f7..0b3d3b08e5 100644
--- a/src/backends/reference/workloads/PreluImpl.hpp
+++ b/src/backends/reference/workloads/PreluImpl.hpp
@@ -13,7 +13,9 @@
namespace armnn
{
-void PreluImpl(const PreluQueueDescriptor& data,
+void PreluImpl(const TensorInfo& inputInfo,
+ const TensorInfo& alphaInfo,
+ const TensorInfo& outputInfo,
Decoder<float>& inputData,
Decoder<float>& alphaData,
Encoder<float>& outputData);
diff --git a/src/backends/reference/workloads/RefArgMinMaxWorkload.cpp b/src/backends/reference/workloads/RefArgMinMaxWorkload.cpp
index 77167a866b..2d635bf6c2 100644
--- a/src/backends/reference/workloads/RefArgMinMaxWorkload.cpp
+++ b/src/backends/reference/workloads/RefArgMinMaxWorkload.cpp
@@ -41,11 +41,11 @@ void RefArgMinMaxWorkload::Execute(std::vector<ITensorHandle*> inputs, std::vect
const TensorInfo &outputTensorInfo = GetTensorInfo(outputs[0]);
if (outputTensorInfo.GetDataType() == armnn::DataType::Signed32) {
- int32_t *output = GetOutputTensorData<int32_t>(0, m_Data);
+ int32_t *output = GetOutputTensorData<int32_t>(outputs[0]);
ArgMinMax(decoder, output, inputTensorInfo, outputTensorInfo, m_Data.m_Parameters.m_Function,
m_Data.m_Parameters.m_Axis);
} else {
- int64_t *output = GetOutputTensorData<int64_t>(0, m_Data);
+ int64_t *output = GetOutputTensorData<int64_t>(outputs[0]);
ArgMinMax(decoder, output, inputTensorInfo, outputTensorInfo, m_Data.m_Parameters.m_Function,
m_Data.m_Parameters.m_Axis);
}
diff --git a/src/backends/reference/workloads/RefGatherWorkload.cpp b/src/backends/reference/workloads/RefGatherWorkload.cpp
index 020c067cfb..be3274f00a 100644
--- a/src/backends/reference/workloads/RefGatherWorkload.cpp
+++ b/src/backends/reference/workloads/RefGatherWorkload.cpp
@@ -34,7 +34,7 @@ void RefGatherWorkload::Execute(std::vector<ITensorHandle*> inputs, std::vector<
std::unique_ptr<Decoder<float>> decoderPtr = MakeDecoder<float>(inputInfo0, inputs[0]->Map());
Decoder<float>& decoder = *decoderPtr;
- const int32_t* indicesData = GetInputTensorData<int32_t>(1, m_Data);
+ const int32_t* indicesData = reinterpret_cast<int32_t*>(inputs[1]->Map());
std::unique_ptr<Encoder<float>> encoderPtr = MakeEncoder<float>(outputInfo, outputs[0]->Map());
Encoder<float>& encoder = *encoderPtr;
diff --git a/src/backends/reference/workloads/RefInstanceNormalizationWorkload.cpp b/src/backends/reference/workloads/RefInstanceNormalizationWorkload.cpp
index daee97ae3e..e642dc9b9a 100644
--- a/src/backends/reference/workloads/RefInstanceNormalizationWorkload.cpp
+++ b/src/backends/reference/workloads/RefInstanceNormalizationWorkload.cpp
@@ -37,8 +37,9 @@ void RefInstanceNormalizationWorkload::Execute(std::vector<ITensorHandle*> input
inputs[0]->Map());
std::unique_ptr<Encoder<float>> outputEncoder = MakeEncoder<float>(GetTensorInfo(outputs[0]),
outputs[0]->Map());
+ const TensorInfo& inputInfo = GetTensorInfo(inputs[0]);
- InstanceNorm(m_Data, *inputDecoder, *outputEncoder);
+ InstanceNorm(m_Data, inputInfo, *inputDecoder, *outputEncoder);
}
} // namespace armnn
diff --git a/src/backends/reference/workloads/RefPadWorkload.cpp b/src/backends/reference/workloads/RefPadWorkload.cpp
index ea515cae68..f15306d1af 100644
--- a/src/backends/reference/workloads/RefPadWorkload.cpp
+++ b/src/backends/reference/workloads/RefPadWorkload.cpp
@@ -31,6 +31,8 @@ void RefPadWorkload::Execute(std::vector<ITensorHandle*> inputs, std::vector<ITe
armnn::Pad(inputInfo,
outputInfo,
+ inputs[0],
+ outputs[0],
m_Data);
}
diff --git a/src/backends/reference/workloads/RefPreluWorkload.cpp b/src/backends/reference/workloads/RefPreluWorkload.cpp
index b298874334..c1d8de2d01 100644
--- a/src/backends/reference/workloads/RefPreluWorkload.cpp
+++ b/src/backends/reference/workloads/RefPreluWorkload.cpp
@@ -32,6 +32,10 @@ void RefPreluWorkload::Execute(std::vector<ITensorHandle*> inputs, std::vector<I
{
ARMNN_SCOPED_PROFILING_EVENT(Compute::CpuRef, "RefPreluWorkload_Execute");
+ const TensorInfo& inputInfo = GetTensorInfo(inputs[0]);
+ const TensorInfo& alphaInfo = GetTensorInfo(inputs[1]);
+ const TensorInfo& outputInfo = GetTensorInfo(outputs[0]);
+
std::unique_ptr<Decoder<float>> inputDecoder = MakeDecoder<float>(GetTensorInfo(inputs[0]),
inputs[0]->Map());
std::unique_ptr<Decoder<float>> alphaDecoder = MakeDecoder<float>(GetTensorInfo(inputs[1]),
@@ -39,7 +43,7 @@ void RefPreluWorkload::Execute(std::vector<ITensorHandle*> inputs, std::vector<I
std::unique_ptr<Encoder<float>> outputEncoder = MakeEncoder<float>(GetTensorInfo(outputs[0]),
outputs[0]->Map());
- PreluImpl(m_Data, *inputDecoder, *alphaDecoder, *outputEncoder);
+ PreluImpl(inputInfo, alphaInfo, outputInfo, *inputDecoder, *alphaDecoder, *outputEncoder);
}
} // namespace armnn
diff --git a/src/backends/reference/workloads/RefRankWorkload.hpp b/src/backends/reference/workloads/RefRankWorkload.hpp
index 237ae999ce..288dddd21d 100644
--- a/src/backends/reference/workloads/RefRankWorkload.hpp
+++ b/src/backends/reference/workloads/RefRankWorkload.hpp
@@ -32,7 +32,7 @@ private:
{
const int32_t rank = static_cast<int32_t>(GetTensorInfo(inputs[0]).GetNumDimensions());
- std::memcpy(GetOutputTensorData<void>(0, m_Data), &rank, sizeof(int32_t));
+ std::memcpy(outputs[0]->Map(), &rank, sizeof(int32_t));
outputs[0]->Unmap();
}
};
diff --git a/src/backends/reference/workloads/RefStackWorkload.cpp b/src/backends/reference/workloads/RefStackWorkload.cpp
index 20cf3b38f5..31949e967e 100644
--- a/src/backends/reference/workloads/RefStackWorkload.cpp
+++ b/src/backends/reference/workloads/RefStackWorkload.cpp
@@ -32,26 +32,6 @@ void RefStackWorkload::Execute(std::vector<ITensorHandle*> inputs, std::vector<I
{
ARMNN_SCOPED_PROFILING_EVENT(Compute::CpuRef, "RefStackWorkload_Execute");
- // Can perform a simple concatenation when axis == 0
- if (!m_Data.m_Parameters.m_Axis)
- {
- float* output = GetOutputTensorData<float>(0, m_Data);
- ARMNN_ASSERT(output != nullptr);
-
- unsigned int numInputs = m_Data.m_Parameters.m_NumInputs;
- unsigned int inputLength = GetTensorInfo(inputs[0]).GetNumElements();
-
- for (unsigned int inputIdx=0; inputIdx<numInputs; ++inputIdx)
- {
- const float* input = GetInputTensorData<float>(inputIdx, m_Data);
- for (unsigned int elmt=0; elmt<inputLength; ++elmt)
- {
- output[(inputIdx * inputLength) + elmt] = input[elmt];
- }
- }
- return;
- }
-
std::vector<std::unique_ptr<Decoder<float>>> inputDecoders;
for (unsigned int i=0; i<inputs.size(); ++i)
{
diff --git a/src/backends/reference/workloads/RefWorkloadUtils.hpp b/src/backends/reference/workloads/RefWorkloadUtils.hpp
index dfde58fdfe..0d839afc1c 100644
--- a/src/backends/reference/workloads/RefWorkloadUtils.hpp
+++ b/src/backends/reference/workloads/RefWorkloadUtils.hpp
@@ -45,6 +45,12 @@ DataType* GetOutputTensorData(unsigned int idx, const PayloadType& data)
return reinterpret_cast<DataType*>(tensorHandle->Map());
}
+template <typename DataType>
+DataType* GetOutputTensorData(ITensorHandle* tensorHandle)
+{
+ return reinterpret_cast<DataType*>(tensorHandle->Map());
+}
+
template <typename PayloadType>
const float* GetInputTensorDataFloat(unsigned int idx, const PayloadType& data)
{
diff --git a/src/backends/reference/workloads/Stack.cpp b/src/backends/reference/workloads/Stack.cpp
index 386c8992eb..f2bce54d6a 100644
--- a/src/backends/reference/workloads/Stack.cpp
+++ b/src/backends/reference/workloads/Stack.cpp
@@ -24,6 +24,24 @@ void Stack(const StackQueueDescriptor& data,
unsigned int axis = data.m_Parameters.m_Axis;
+ // Can perform a simple concatenation when axis == 0
+ if (!axis)
+ {
+ unsigned int numInputs = data.m_Parameters.m_NumInputs;
+ unsigned int inputLength = inputInfo.GetNumElements();
+
+ for (unsigned int inputIdx=0; inputIdx<numInputs; ++inputIdx)
+ {
+ for (unsigned int elmt=0; elmt<inputLength; ++elmt)
+ {
+ // operator[] moves the decoder/encoder to the given element index
+ (*inputs[inputIdx])[elmt];
+ output[(inputIdx * inputLength) + elmt];
+ // Copy the seeked input element through to the output position
+ output.Set(inputs[inputIdx]->Get());
+ }
+ }
+ return;
+ }
+
// Initialise output data
unsigned int numOutputElements = 1;
for (unsigned int i=0; i<outputNumDims; ++i)