From 01097941ef85073c56cbd1d5f00d7e8ffeb9876d Mon Sep 17 00:00:00 2001 From: Finn Williams Date: Mon, 26 Apr 2021 12:06:34 +0100 Subject: IVGCVSW-5843 Separate memory managers for WorkingMemHandles * Add inter layer memory management to WorkingMemHandle * Change Const layers to be executed once in loadedNetworkConstruction and share tensorHandle between all WorkingMemHandles * Fix various reference workloads pointing to memory in the queueDescriptor Signed-off-by: Finn Williams Change-Id: I69d4b3c5c84d2f5abe4540c3e624ab4f00d88226 --- src/armnn/LoadedNetwork.cpp | 358 +++++++++++++-------- src/armnn/LoadedNetwork.hpp | 26 +- src/armnn/WorkingMemHandle.cpp | 56 ++-- src/armnn/WorkingMemHandle.hpp | 59 +--- .../backendsCommon/TensorHandleFactoryRegistry.hpp | 5 + src/backends/reference/workloads/InstanceNorm.cpp | 2 +- src/backends/reference/workloads/InstanceNorm.hpp | 1 + src/backends/reference/workloads/Pad.cpp | 8 +- src/backends/reference/workloads/Pad.hpp | 2 + src/backends/reference/workloads/PreluImpl.cpp | 8 +- src/backends/reference/workloads/PreluImpl.hpp | 4 +- .../reference/workloads/RefArgMinMaxWorkload.cpp | 4 +- .../reference/workloads/RefGatherWorkload.cpp | 2 +- .../workloads/RefInstanceNormalizationWorkload.cpp | 3 +- .../reference/workloads/RefPadWorkload.cpp | 2 + .../reference/workloads/RefPreluWorkload.cpp | 6 +- .../reference/workloads/RefRankWorkload.hpp | 2 +- .../reference/workloads/RefStackWorkload.cpp | 20 -- .../reference/workloads/RefWorkloadUtils.hpp | 6 + src/backends/reference/workloads/Stack.cpp | 18 ++ 20 files changed, 339 insertions(+), 253 deletions(-) diff --git a/src/armnn/LoadedNetwork.cpp b/src/armnn/LoadedNetwork.cpp index d75a2021b2..85451cb0d8 100644 --- a/src/armnn/LoadedNetwork.cpp +++ b/src/armnn/LoadedNetwork.cpp @@ -161,35 +161,38 @@ LoadedNetwork::LoadedNetwork(std::unique_ptr net, } } } - - for (auto&& layer : order) + if (!networkProperties.m_AsyncEnabled) { - auto& workloadFactory = GetWorkloadFactory(*layer); - - switch (layer->GetType()) + for (auto &&layer : order) { - case LayerType::Input: - case LayerType::MemImport: - { - // If IsImportEnabled is true then we need to set IsMemoryManaged to false when creating TensorHandles - layer->CreateTensorHandles(m_TensorHandleFactoryRegistry, workloadFactory, - !m_NetworkProperties.m_ImportEnabled); - break; - } - default: + auto &workloadFactory = GetWorkloadFactory(*layer); + + switch (layer->GetType()) { - // Look for the layer with 1 OutputSlot which has 1 connection and that connection is an Output Layer - // If Export is enabled disable memory management so we can export, otherwise we do a copy - if((layer->GetNumOutputSlots() == 1) && - (layer->GetOutputSlots()[0].GetNumConnections() == 1) && - (layer->GetOutputSlots()[0].GetConnection(0)->GetOwningLayer().GetType() == LayerType::Output)) + case LayerType::Input: + case LayerType::MemImport: { + // If IsImportEnabled is true then we need to set IsMemoryManaged + // to false when creating TensorHandles layer->CreateTensorHandles(m_TensorHandleFactoryRegistry, workloadFactory, - !m_NetworkProperties.m_ExportEnabled); + !m_NetworkProperties.m_ImportEnabled); + break; } - else + default: { - layer->CreateTensorHandles(m_TensorHandleFactoryRegistry, workloadFactory); + // Look for a layer with 1 OutputSlot which has 1 connection and that connection is an Output Layer + // If Export is enabled disable memory management so we can export, otherwise we do a copy + if ((layer->GetNumOutputSlots() == 1) && + 
(layer->GetOutputSlots()[0].GetNumConnections() == 1) && + (layer->GetOutputSlots()[0].GetConnection(0)->GetOwningLayer().GetType() == LayerType::Output)) + { + layer->CreateTensorHandles(m_TensorHandleFactoryRegistry, workloadFactory, + !m_NetworkProperties.m_ExportEnabled); + } + else + { + layer->CreateTensorHandles(m_TensorHandleFactoryRegistry, workloadFactory); + } } } } @@ -249,7 +252,17 @@ LoadedNetwork::LoadedNetwork(std::unique_ptr net, AddWorkloadStructure(timelineUtils, workload, *layer); } - m_WorkloadQueue.push_back(move(workload)); + // For async networks ConstantWorkloads are managed exclusively by LoadedNetwork + // and are separated out from the other workloads + if (networkProperties.m_AsyncEnabled && layer->GetType() == LayerType::Constant) + { + m_ConstantWorkloads[layer->GetGuid()] = std::move(workload); + } + else + { + m_WorkloadQueue.push_back(move(workload)); + } + // release the constant data in the layer.. layer->ReleaseConstantData(); break; @@ -268,16 +281,50 @@ LoadedNetwork::LoadedNetwork(std::unique_ptr net, timelineUtils->Commit(); } - // Set up memory. - m_OptimizedNetwork->pOptimizedNetworkImpl->GetGraph().AllocateDynamicBuffers(); + if (!networkProperties.m_AsyncEnabled) + { + // Set up memory. + m_OptimizedNetwork->pOptimizedNetworkImpl->GetGraph().AllocateDynamicBuffers(); - // Now that the intermediate tensor memory has been set-up, do any post allocation configuration for each workload. - for (auto& workload : m_WorkloadQueue) + // Now that the intermediate tensor memory has been set-up, + // do any post allocation configuration for each workload. + for (auto &workload : m_WorkloadQueue) + { + workload->PostAllocationConfigure(); + } + } + else { - workload->PostAllocationConfigure(); + AllocateAndExecuteConstantWorkloads(); } } +void LoadedNetwork::AllocateAndExecuteConstantWorkloads() +{ + Graph& order = m_OptimizedNetwork->pOptimizedNetworkImpl->GetGraph(); + for (auto&& layer : order) + { + if (layer->GetType() == LayerType::Constant) + { + const auto& outSlot = layer->GetOutputSlots()[0]; + const auto factoryId = outSlot.GetTensorHandleFactoryId(); + ARMNN_ASSERT(factoryId != ITensorHandleFactory::LegacyFactoryId); + auto& workloadFactory = GetWorkloadFactory(*layer); + + layer->CreateTensorHandles(m_TensorHandleFactoryRegistry, workloadFactory); + ITensorHandle* tensorHandle = outSlot.GetOutputHandler().GetData(); + + m_ConstantTensorHandles[layer->GetGuid()] = tensorHandle; + tensorHandle->Allocate(); + + WorkingMemDescriptor memDesc; + memDesc.m_Outputs.push_back(tensorHandle); + m_ConstantWorkloads[layer->GetGuid()]->ExecuteAsync(memDesc); + } + } +} + + void LoadedNetwork::SendNetworkStructure() { Graph& order = m_OptimizedNetwork->pOptimizedNetworkImpl->GetGraph().TopologicalSort(); @@ -803,9 +850,8 @@ void LoadedNetwork::EnqueueInput(const BindableLayer& layer, { throw InvalidArgumentException("EnqueueInput: given layer not an InputLayer"); } - LayerGuid id = layer.GetOutputSlot(0).GetConnection(0)->GetOwningLayer().GetGuid(); + LayerGuid id = layer.GetGuid(); WorkingMemDescriptor descriptor = context.GetWorkingMemDescriptor(id); - ARMNN_ASSERT_MSG(descriptor.m_Outputs.size() == 1, "Can only handle Input Layer with one output"); MemorySourceFlags importFlags = descriptor.m_Outputs[0]->GetImportFlags(); if (m_NetworkProperties.m_ImportEnabled) // Try import the input tensor @@ -841,7 +887,7 @@ void LoadedNetwork::EnqueueInput(const BindableLayer& layer, memcpy(dst, src, size); }; - for (const auto& input : descriptor.m_Inputs) + for 
(const auto& input : descriptor.m_Outputs) { CopyTensorContentsGeneric(tensorHandle.get(), input, copyFunc); } @@ -856,7 +902,7 @@ void LoadedNetwork::EnqueueOutput(const BindableLayer& layer, const Tensor& outp } ARMNN_ASSERT_MSG(layer.GetNumInputSlots() == 1, "Output Layer should have exactly one input."); - LayerGuid id = layer.GetInputSlot(0).GetConnectedOutputSlot()->GetOwningLayerGuid(); + LayerGuid id = layer.GetGuid(); WorkingMemDescriptor descriptor = handle.GetWorkingMemDescriptor(id); ITensorHandle* inputTensorHandle = descriptor.m_Inputs[0]; @@ -888,8 +934,8 @@ void LoadedNetwork::EnqueueOutput(const BindableLayer& layer, const Tensor& outp if (importOk) { ARMNN_SCOPED_PROFILING_EVENT(Compute::Undefined, "SyncMemGeneric_Execute"); - descriptor.m_Inputs[0]->Map(true); - descriptor.m_Inputs[0]->Unmap(); + inputTensorHandle->Map(true); + inputTensorHandle->Unmap(); } else { @@ -914,10 +960,38 @@ void LoadedNetwork::EnqueueOutput(const BindableLayer& layer, const Tensor& outp }; std::unique_ptr tensorHandle = - std::make_unique(outputTensor.GetInfo(), outputTensor.GetMemoryArea()); + std::make_unique(outputTensor.GetInfo(), + outputTensor.GetMemoryArea()); + + CopyTensorContentsGeneric(inputTensorHandle, tensorHandle.get(), copyFunc); + } +} + + +const armnn::ConstTensor GetInputTensor(const LayerBindingId layerId, const InputTensors& inputTensors) +{ + for (auto inputTensorPair : inputTensors) + { + LayerBindingId id = inputTensorPair.first; + if (id == layerId) + { + return inputTensorPair.second; + } + } + throw InvalidArgumentException("Input does not exist."); +} - CopyTensorContentsGeneric(descriptor.m_Outputs[0], tensorHandle.get(), copyFunc); +const armnn::Tensor GetOutputTensor(const LayerBindingId layerId, const OutputTensors& outputTensors) +{ + for (auto outputTensorPair : outputTensors) + { + LayerBindingId id = outputTensorPair.first; + if (id == layerId) + { + return outputTensorPair.second; + } } + throw InvalidArgumentException("Output does not exist."); } Status LoadedNetwork::Execute(const InputTensors& inputTensors, @@ -971,12 +1045,9 @@ Status LoadedNetwork::Execute(const InputTensors& inputTensors, { ARMNN_SCOPED_PROFILING_EVENT(Compute::Undefined, "PrepareInputs"); - unsigned int i = 0; - for (const BindableLayer* inputLayer : graph.GetInputLayers()) { - EnqueueInput(*inputLayer, inputTensors[i].second, workingMemHandle); - ++i; + EnqueueInput(*inputLayer, GetInputTensor(inputLayer->GetBindingId(), inputTensors), workingMemHandle); } } @@ -1016,130 +1087,153 @@ Status LoadedNetwork::Execute(const InputTensors& inputTensors, // For each output to the network, call EnqueueOutput with the data passed by the user. { ARMNN_SCOPED_PROFILING_EVENT(Compute::Undefined, "PrepareOutputs"); - unsigned int i = static_cast(m_WorkloadQueue.size() - graph.GetNumOutputs()); - - for (const BindableLayer* outputLayer : graph.GetOutputLayers()) + for (const BindableLayer *outputLayer : graph.GetOutputLayers()) { - EnqueueOutput(*outputLayer, outputTensors[i].second, workingMemHandle); - ++i; + EnqueueOutput(*outputLayer, GetOutputTensor(outputLayer->GetBindingId(), outputTensors), workingMemHandle); } } return executionSucceeded ? 
Status::Success : Status::Failure; } -// Need something like the collectors to get the correct tensors for the inputs -void LoadedNetwork::CollectInputTensorHandles( - std::unordered_map >& tensorHandles, - std::vector& inputs, - const armnn::Layer* layer, - const TensorHandleFactoryRegistry& registry, - const bool isMemoryManaged) + +/// Create a new unique WorkingMemHandle object. Create multiple handles if you wish to have +/// overlapped Execution by calling this function from different threads. +std::unique_ptr LoadedNetwork::CreateWorkingMemHandle(NetworkId networkId) { - for (auto&& inputSlot : layer->GetInputSlots()) - { - // The graph must be well-formed at this point. - ARMNN_ASSERT(inputSlot.GetConnection()); - auto outputSlot = inputSlot.GetConnectedOutputSlot(); - auto key = outputSlot->GetOwningLayer().GetGuid(); - auto search = tensorHandles.find(key); + Graph& order = m_OptimizedNetwork->pOptimizedNetworkImpl->GetGraph(); + std::unordered_map > > tensorHandleMap; + std::vector workingMemDescriptors; + std::unordered_map workingMemDescriptorMap; + TensorHandleFactoryRegistry tensorHandleFactoryRegistry; + WorkloadFactoryMap workloadFactoryMap; - if (search == tensorHandles.end()) - { - ITensorHandleFactory::FactoryId factoryId = outputSlot->GetTensorHandleFactoryId(); - const TensorInfo& tensorInfo = outputSlot->GetTensorInfo(); + std::vector> memoryManagers; - ARMNN_ASSERT(factoryId != ITensorHandleFactory::LegacyFactoryId); - ITensorHandleFactory* handleFactory = registry.GetFactory(factoryId); - ARMNN_ASSERT(handleFactory); - std::unique_ptr tensor = handleFactory->CreateTensorHandle(tensorInfo, isMemoryManaged); - ITensorHandle* tensorPtr = tensor.release(); - inputs.push_back(tensorPtr); + for (auto const& backend : m_Backends) + { + if (backend.second->SupportsTensorAllocatorAPI()) + { + backend.second->RegisterTensorHandleFactories(tensorHandleFactoryRegistry); + memoryManagers.emplace_back(tensorHandleFactoryRegistry.GetMemoryManagers().back()); } else { - unsigned int index = outputSlot->CalculateIndexOnOwner(); - inputs.push_back(search->second[index]); + std::shared_ptr memoryManager = backend.second->CreateMemoryManager(); + auto workloadFactory = backend.second->CreateWorkloadFactory( + memoryManager, m_OptimizedNetwork->pOptimizedNetworkImpl->GetModelOptions()); + + workloadFactoryMap.emplace( + std::make_pair(backend.first, std::make_pair(std::move(workloadFactory), memoryManager))); + memoryManagers.emplace_back(memoryManager); } } -} - -void LoadedNetwork::CreateOutputTensorHandles( - std::unordered_map >& tensorHandles, - std::vector& outputs, - const armnn::Layer* layer, - const TensorHandleFactoryRegistry& registry, - const bool isMemoryManaged) -{ - auto guid = layer->GetGuid(); - std::vector tensorHandleVectors; - tensorHandleVectors.reserve(layer->GetNumOutputSlots()); - for (unsigned int idx=0; idx < layer->GetNumOutputSlots(); idx++) + auto GetTensorHandle = [&](Layer* layer, const OutputSlot& outputSlot, bool isMemoryManaged) { - const OutputSlot& slot = layer->GetOutputSlot(idx); - ITensorHandleFactory::FactoryId factoryId = slot.GetTensorHandleFactoryId(); - const TensorInfo& tensorInfo = slot.GetTensorInfo(); + ITensorHandleFactory::FactoryId factoryId = outputSlot.GetTensorHandleFactoryId(); + const TensorInfo& tensorInfo = outputSlot.GetTensorInfo(); - ARMNN_ASSERT(factoryId != ITensorHandleFactory::LegacyFactoryId); - ITensorHandleFactory* handleFactory = registry.GetFactory(factoryId); - ARMNN_ASSERT(handleFactory); - std::unique_ptr tensor = 
handleFactory->CreateTensorHandle(tensorInfo, isMemoryManaged); - ITensorHandle* tensorPtr = tensor.release(); - outputs.push_back(tensorPtr); - tensorHandleVectors.push_back(tensorPtr); - } - tensorHandles.insert({guid, tensorHandleVectors}); -} - -/// Create a new unique WorkingMemHandle object. Create multiple handles if you wish to have -/// overlapped Execution by calling this function from different threads. -std::unique_ptr LoadedNetwork::CreateWorkingMemHandle(NetworkId networkId) -{ - Graph& order = m_OptimizedNetwork->pOptimizedNetworkImpl->GetGraph(); - std::unordered_map > tensorHandles; - std::vector workingMemDescriptors; - std::unordered_map workingMemDescriptorMap; + if (factoryId == ITensorHandleFactory::LegacyFactoryId) + { + BackendId id = layer->GetBackendId(); + ARMNN_NO_DEPRECATE_WARN_BEGIN + return workloadFactoryMap.at(id).first->CreateTensorHandle(tensorInfo, isMemoryManaged); + ARMNN_NO_DEPRECATE_WARN_END + } + else + { + ITensorHandleFactory* handleFactory = tensorHandleFactoryRegistry.GetFactory(factoryId); + ARMNN_ASSERT(handleFactory); + return handleFactory->CreateTensorHandle(tensorInfo, isMemoryManaged); + } + }; + std::unordered_map handleReferenceCounts; for (auto&& layer : order) { - if (layer->GetType() == LayerType::Input || layer->GetType() == LayerType::Output) + WorkingMemDescriptor workingMemDescriptor; + + // Constant layers execution and management is handled during loaded network construction + if (layer->GetType() == LayerType::Constant) { continue; } - WorkingMemDescriptor workingMemDescriptor; + bool isMemoryManaged = true; + bool isInputLayer = true; // Look for the layer with 1 OutputSlot which has 1 connection and that connection is an Output Layer // If Export is enabled disable memory management so we can export, otherwise we do a copy - if((layer->GetNumOutputSlots() == 1) && - (layer->GetOutputSlots()[0].GetNumConnections() == 1) && - (layer->GetOutputSlots()[0].GetConnection(0)->GetOwningLayer().GetType() == LayerType::Output)) + if ((layer->GetNumOutputSlots() == 1) && + (layer->GetOutputSlots()[0].GetNumConnections() == 1) && + (layer->GetOutputSlots()[0].GetConnection(0)->GetOwningLayer().GetType() == LayerType::Output)) { - CollectInputTensorHandles(tensorHandles, - workingMemDescriptor.m_Inputs, - layer, - m_TensorHandleFactoryRegistry, - !m_NetworkProperties.m_ExportEnabled); - CreateOutputTensorHandles(tensorHandles, - workingMemDescriptor.m_Outputs, - layer, - m_TensorHandleFactoryRegistry, - !m_NetworkProperties.m_ExportEnabled); + isMemoryManaged = !m_NetworkProperties.m_ExportEnabled; } - else + else if (layer->GetType() == LayerType::Input || layer->GetType() == LayerType::MemImport) + { + // Input layers/workloads will not be executed so the descriptor is not added to workingMemDescriptors + // However we will still need to manage the tensorHandle + isInputLayer = false; + isMemoryManaged = !m_NetworkProperties.m_ExportEnabled; + } + + // Create a tensor handle for each output slot of a layer + // Once we create it, we start managing its lifetime + for (auto& slot : layer->GetOutputSlots()) + { + tensorHandleMap[layer->GetGuid()].emplace_back(GetTensorHandle(layer, slot, isMemoryManaged)); + ITensorHandle* tensorHandle = tensorHandleMap[layer->GetGuid()].back().get(); + + workingMemDescriptor.m_Outputs.push_back(tensorHandle); + tensorHandle->Manage(); + unsigned int numConnections = slot.GetNumConnections(); + ARMNN_ASSERT(numConnections != 0); + + handleReferenceCounts[tensorHandle] = numConnections; + } + // Loop 
through the input slots in the same layer and decrement the reference counter associated + // to each tensor handle we encounter. + // Once it reaches zero, the lifetime of the tensor handle has ended, and we mark it's memory as available + // so that the next tensor handle with a non overlapping lifetime can share it's memory. + for (auto& slot : layer->GetInputSlots()) { - CollectInputTensorHandles(tensorHandles, - workingMemDescriptor.m_Inputs, - layer, - m_TensorHandleFactoryRegistry); - CreateOutputTensorHandles(tensorHandles, - workingMemDescriptor.m_Outputs, - layer, - m_TensorHandleFactoryRegistry); + ARMNN_ASSERT(slot.GetConnection()); + auto outputSlot = slot.GetConnectedOutputSlot(); + auto key = outputSlot->GetOwningLayer().GetGuid(); + + // Constant layers execution and management is handled during loaded network construction + auto found = m_ConstantTensorHandles.find(key); + if (found != m_ConstantTensorHandles.end()) + { + workingMemDescriptor.m_Inputs.push_back(found->second); + continue; + } + + auto search = tensorHandleMap.find(key); + unsigned int index = outputSlot->CalculateIndexOnOwner(); + ITensorHandle* inputTensorHandle = search->second[index].get(); + workingMemDescriptor.m_Inputs.push_back(inputTensorHandle); + --handleReferenceCounts.at(inputTensorHandle); + if (handleReferenceCounts.at(inputTensorHandle) == 0u) + { + // Stop managing lifetime of tensor handle + inputTensorHandle->Allocate(); + handleReferenceCounts.erase(inputTensorHandle); + } } workingMemDescriptorMap.insert({layer->GetGuid(), workingMemDescriptor}); - workingMemDescriptors.push_back(workingMemDescriptor); + + // Input layers/workloads will not be executed, so the descriptor is not added to workingMemDescriptors + // However we will still need to manage the tensorHandle + if (isInputLayer) + { + workingMemDescriptors.push_back(workingMemDescriptor); + } } + return std::make_unique(networkId, workingMemDescriptors, - workingMemDescriptorMap); + workingMemDescriptorMap, + memoryManagers, + std::move(tensorHandleMap)); } void LoadedNetwork::RegisterDebugCallback(const DebugCallbackFunction& func) diff --git a/src/armnn/LoadedNetwork.hpp b/src/armnn/LoadedNetwork.hpp index 2bcf5c8c08..51092c744e 100644 --- a/src/armnn/LoadedNetwork.hpp +++ b/src/armnn/LoadedNetwork.hpp @@ -74,24 +74,21 @@ public: profiling::ProfilingGuid GetNetworkGuid(); private: + using WorkloadFactoryWithMemoryManager = + std::pair; + + using WorkloadFactoryMap = std::unordered_map; + void AllocateWorkingMemory(std::lock_guard& lock); + void AllocateAndExecuteConstantWorkloads(); + + std::unordered_map m_ConstantTensorHandles; + std::unordered_map > m_ConstantWorkloads; LoadedNetwork(std::unique_ptr net, const INetworkProperties& networkProperties, profiling::ProfilingService& profilingService); - void CollectInputTensorHandles(std::unordered_map >& tensorHandles, - std::vector& inputs, - const armnn::Layer* layer, - const TensorHandleFactoryRegistry& registry, - const bool isMemoryManaged = false); - - void CreateOutputTensorHandles(std::unordered_map >& tensorHandles, - std::vector& outputs, - const armnn::Layer* layer, - const TensorHandleFactoryRegistry& registry, - const bool isMemoryManaged = false); - void EnqueueInput(const BindableLayer& layer, ITensorHandle* tensorHandle, const TensorInfo& tensorInfo); void EnqueueOutput(const BindableLayer& layer, ITensorHandle* tensorHandle, const TensorInfo& tensorInfo); @@ -107,11 +104,6 @@ private: using BackendPtrMap = std::unordered_map; - using 
WorkloadFactoryWithMemoryManager = - std::pair; - - using WorkloadFactoryMap = std::unordered_map; - BackendPtrMap m_Backends; WorkloadFactoryMap m_WorkloadFactories; diff --git a/src/armnn/WorkingMemHandle.cpp b/src/armnn/WorkingMemHandle.cpp index c1a48d482f..0cbef82e83 100644 --- a/src/armnn/WorkingMemHandle.cpp +++ b/src/armnn/WorkingMemHandle.cpp @@ -6,6 +6,7 @@ #include "backendsCommon/CpuTensorHandle.hpp" #include "WorkingMemHandle.hpp" #include "Network.hpp" +#include namespace armnn { @@ -13,36 +14,47 @@ namespace armnn namespace experimental { -WorkingMemHandle::WorkingMemHandle(NetworkId networkId, - std::vector workingMemDescriptors, - std::unordered_map workingMemDescriptorMap) : +WorkingMemHandle::WorkingMemHandle( + NetworkId networkId, + std::vector workingMemDescriptors, + std::unordered_map workingMemDescriptorMap, + std::vector> memoryManagers, + std::unordered_map > > ownedTensorHandles) : m_NetworkId(networkId), m_WorkingMemDescriptors(workingMemDescriptors), m_WorkingMemDescriptorMap(workingMemDescriptorMap), + m_MemoryManagers(memoryManagers), + m_OwnedTensorHandles(std::move(ownedTensorHandles)), m_IsAllocated(false), m_Mutex() -{} +{ +} -void WorkingMemHandle::FreeWorkingMemory() +void WorkingMemHandle::Allocate() { - for (auto workingMemDescriptor : m_WorkingMemDescriptors) + if (m_IsAllocated) + { + return; + } + m_IsAllocated = true; + + for (auto& mgr : m_MemoryManagers) + { + mgr->Acquire(); + } +} + +void WorkingMemHandle::Free() +{ + if (!m_IsAllocated) + { + return; + } + m_IsAllocated = false; + + for (auto& mgr : m_MemoryManagers) { - for (auto input : workingMemDescriptor.m_Inputs) - { - if (input) - { - delete input; - input = nullptr; - } - } - for (auto output : workingMemDescriptor.m_Outputs) - { - if (output) - { - delete output; - output = nullptr; - } - } + mgr->Release(); } } diff --git a/src/armnn/WorkingMemHandle.hpp b/src/armnn/WorkingMemHandle.hpp index cef6fb6fd3..92b0acaec3 100644 --- a/src/armnn/WorkingMemHandle.hpp +++ b/src/armnn/WorkingMemHandle.hpp @@ -26,10 +26,12 @@ class WorkingMemHandle final : public IWorkingMemHandle public: WorkingMemHandle(NetworkId networkId, std::vector workingMemDescriptors, - std::unordered_map workingMemDescriptorMap); + std::unordered_map workingMemDescriptorMap, + std::vector> memoryManagers, + std::unordered_map > > ownedTensorHandles); ~WorkingMemHandle() - { FreeWorkingMemory(); } + { Free(); } NetworkId GetNetworkId() override { @@ -38,50 +40,10 @@ public: /// Allocate the backing memory required for execution. If this is not called, then allocation will be /// deferred to execution time. The mutex must be locked. - void Allocate() override - { - if (m_IsAllocated) - { - return; - } - m_IsAllocated = true; - - // Iterate through all WorkingMemDescriptors calling allocate() on each input and output in turn - for (auto workingMemDescriptor : m_WorkingMemDescriptors) - { - for (auto& input : workingMemDescriptor.m_Inputs) - { - input->Allocate(); - } - for (auto& output : workingMemDescriptor.m_Outputs) - { - output->Allocate(); - } - } - } + void Allocate() override; /// Free the backing memory required for execution. The mutex must be locked. 
- void Free() override - { - if (!m_IsAllocated) - { - return; - } - m_IsAllocated = false; - - // Iterate through all WorkingMemDescriptors calling free() on each input and output in turn - for (auto workingMemDescriptor : m_WorkingMemDescriptors) - { - for (auto& input : workingMemDescriptor.m_Inputs) - { - input->Unmap(); - } - for (auto& output : workingMemDescriptor.m_Outputs) - { - output->Unmap(); - } - } - } + void Free() override; /// IsAllocated returns true if the backing memory is currently allocated. The mutex must be locked. bool IsAllocated() override @@ -111,13 +73,18 @@ public: } private: - void FreeWorkingMemory(); - NetworkId m_NetworkId; std::shared_ptr m_Profiler; std::vector m_WorkingMemDescriptors; std::unordered_map m_WorkingMemDescriptorMap; + + // Vector of IMemoryManagers that manage the WorkingMemHandle's memory + std::vector> m_MemoryManagers; + // TensorHandles owned by this WorkingMemHandle + // constant tensor's can be shared by multiple WorkingMemHandles and so will not be stored here + std::unordered_map > > m_OwnedTensorHandles; + bool m_IsAllocated; std::mutex m_Mutex; }; diff --git a/src/backends/backendsCommon/TensorHandleFactoryRegistry.hpp b/src/backends/backendsCommon/TensorHandleFactoryRegistry.hpp index f926478432..e9e76e73a6 100644 --- a/src/backends/backendsCommon/TensorHandleFactoryRegistry.hpp +++ b/src/backends/backendsCommon/TensorHandleFactoryRegistry.hpp @@ -41,6 +41,11 @@ public: /// Release memory required for inference void ReleaseMemory(); + std::vector>& GetMemoryManagers() + { + return m_MemoryManagers; + } + private: std::vector> m_Factories; std::vector> m_MemoryManagers; diff --git a/src/backends/reference/workloads/InstanceNorm.cpp b/src/backends/reference/workloads/InstanceNorm.cpp index d628c03e5f..b6e616ad49 100644 --- a/src/backends/reference/workloads/InstanceNorm.cpp +++ b/src/backends/reference/workloads/InstanceNorm.cpp @@ -16,10 +16,10 @@ namespace armnn { void InstanceNorm(const InstanceNormalizationQueueDescriptor& data, + const TensorInfo& inputInfo, Decoder& inputDecoder, Encoder& outputEncoder) { - const TensorInfo& inputInfo = GetTensorInfo(data.m_Inputs[0]); const TensorShape inputShape = inputInfo.GetShape(); armnnUtils::DataLayoutIndexed dataLayout(data.m_Parameters.m_DataLayout); diff --git a/src/backends/reference/workloads/InstanceNorm.hpp b/src/backends/reference/workloads/InstanceNorm.hpp index 2e3a18fc4b..6a783732b3 100644 --- a/src/backends/reference/workloads/InstanceNorm.hpp +++ b/src/backends/reference/workloads/InstanceNorm.hpp @@ -14,6 +14,7 @@ namespace armnn { void InstanceNorm(const InstanceNormalizationQueueDescriptor& data, + const TensorInfo& inputInfo, Decoder& inputData, Encoder& outputData); diff --git a/src/backends/reference/workloads/Pad.cpp b/src/backends/reference/workloads/Pad.cpp index 1f8b674c3a..f58dbaea61 100644 --- a/src/backends/reference/workloads/Pad.cpp +++ b/src/backends/reference/workloads/Pad.cpp @@ -38,6 +38,8 @@ namespace armnn void Pad(const TensorInfo& inputInfo, const TensorInfo& outputInfo, + const ITensorHandle* inputHandle, + ITensorHandle* outputHandle, const PadQueueDescriptor& data) { auto padList = data.m_Parameters.m_PadList; @@ -66,15 +68,15 @@ void Pad(const TensorInfo& inputInfo, unsigned int outputHeight = 0; unsigned int outputWidth = 0; - auto inputData = MakeDecoder(inputInfo, data.m_Inputs[0]->Map()); - auto outData = MakeEncoder(outputInfo, data.m_Outputs[0]->Map()); + auto inputData = MakeDecoder(inputInfo, inputHandle->Map()); + auto outData = 
MakeEncoder(outputInfo, outputHandle->Map()); // Fill the output tensor with Pad value first if (outputInfo.IsQuantized()) { // For Quantized types Pad Value should not be quantized with scale and offset of the tensor info auto temporaryInfo = TensorInfo(outputInfo.GetShape(), outputInfo.GetDataType(), 1.0f, 0); - auto outputData = MakeEncoder(temporaryInfo, data.m_Outputs[0]->Map()); + auto outputData = MakeEncoder(temporaryInfo, outputHandle->Map()); FillOutputWithPadValue(*outputData, padValue, numOutputElements); } else diff --git a/src/backends/reference/workloads/Pad.hpp b/src/backends/reference/workloads/Pad.hpp index e7be44e88c..65f64dffed 100644 --- a/src/backends/reference/workloads/Pad.hpp +++ b/src/backends/reference/workloads/Pad.hpp @@ -15,6 +15,8 @@ namespace armnn void Pad(const TensorInfo& inputInfo, const TensorInfo& outputInfo, + const ITensorHandle* inputHandle, + ITensorHandle* outputHandle, const PadQueueDescriptor& data); } //namespace armnn diff --git a/src/backends/reference/workloads/PreluImpl.cpp b/src/backends/reference/workloads/PreluImpl.cpp index 458025bb0a..6df259fa4d 100644 --- a/src/backends/reference/workloads/PreluImpl.cpp +++ b/src/backends/reference/workloads/PreluImpl.cpp @@ -10,15 +10,13 @@ namespace armnn { -void PreluImpl(const PreluQueueDescriptor& data, +void PreluImpl(const TensorInfo& inputInfo, + const TensorInfo& alphaInfo, + const TensorInfo& outputInfo, Decoder& inputData, Decoder& alphaData, Encoder& outputData) { - const TensorInfo& inputInfo = GetTensorInfo(data.m_Inputs[0]); - const TensorInfo& alphaInfo = GetTensorInfo(data.m_Inputs[1]); - const TensorInfo& outputInfo = GetTensorInfo(data.m_Outputs[0]); - const TensorShape& inputShape = inputInfo.GetShape(); const TensorShape& alphaShape = alphaInfo.GetShape(); const TensorShape& outputShape = outputInfo.GetShape(); diff --git a/src/backends/reference/workloads/PreluImpl.hpp b/src/backends/reference/workloads/PreluImpl.hpp index 9299b1c7f7..0b3d3b08e5 100644 --- a/src/backends/reference/workloads/PreluImpl.hpp +++ b/src/backends/reference/workloads/PreluImpl.hpp @@ -13,7 +13,9 @@ namespace armnn { -void PreluImpl(const PreluQueueDescriptor& data, +void PreluImpl(const TensorInfo& inputInfo, + const TensorInfo& alphaInfo, + const TensorInfo& outputInfo, Decoder& inputData, Decoder& alphaData, Encoder& outputData); diff --git a/src/backends/reference/workloads/RefArgMinMaxWorkload.cpp b/src/backends/reference/workloads/RefArgMinMaxWorkload.cpp index 77167a866b..2d635bf6c2 100644 --- a/src/backends/reference/workloads/RefArgMinMaxWorkload.cpp +++ b/src/backends/reference/workloads/RefArgMinMaxWorkload.cpp @@ -41,11 +41,11 @@ void RefArgMinMaxWorkload::Execute(std::vector inputs, std::vect const TensorInfo &outputTensorInfo = GetTensorInfo(outputs[0]); if (outputTensorInfo.GetDataType() == armnn::DataType::Signed32) { - int32_t *output = GetOutputTensorData(0, m_Data); + int32_t *output = GetOutputTensorData(outputs[0]); ArgMinMax(decoder, output, inputTensorInfo, outputTensorInfo, m_Data.m_Parameters.m_Function, m_Data.m_Parameters.m_Axis); } else { - int64_t *output = GetOutputTensorData(0, m_Data); + int64_t *output = GetOutputTensorData(outputs[0]); ArgMinMax(decoder, output, inputTensorInfo, outputTensorInfo, m_Data.m_Parameters.m_Function, m_Data.m_Parameters.m_Axis); } diff --git a/src/backends/reference/workloads/RefGatherWorkload.cpp b/src/backends/reference/workloads/RefGatherWorkload.cpp index 020c067cfb..be3274f00a 100644 --- 
a/src/backends/reference/workloads/RefGatherWorkload.cpp +++ b/src/backends/reference/workloads/RefGatherWorkload.cpp @@ -34,7 +34,7 @@ void RefGatherWorkload::Execute(std::vector inputs, std::vector< std::unique_ptr> decoderPtr = MakeDecoder(inputInfo0, inputs[0]->Map()); Decoder& decoder = *decoderPtr; - const int32_t* indicesData = GetInputTensorData(1, m_Data); + const int32_t* indicesData = reinterpret_cast(inputs[1]->Map()); std::unique_ptr> encoderPtr = MakeEncoder(outputInfo, outputs[0]->Map()); Encoder& encoder = *encoderPtr; diff --git a/src/backends/reference/workloads/RefInstanceNormalizationWorkload.cpp b/src/backends/reference/workloads/RefInstanceNormalizationWorkload.cpp index daee97ae3e..e642dc9b9a 100644 --- a/src/backends/reference/workloads/RefInstanceNormalizationWorkload.cpp +++ b/src/backends/reference/workloads/RefInstanceNormalizationWorkload.cpp @@ -37,8 +37,9 @@ void RefInstanceNormalizationWorkload::Execute(std::vector input inputs[0]->Map()); std::unique_ptr> outputEncoder = MakeEncoder(GetTensorInfo(outputs[0]), outputs[0]->Map()); + const TensorInfo& inputInfo = GetTensorInfo(inputs[0]); - InstanceNorm(m_Data, *inputDecoder, *outputEncoder); + InstanceNorm(m_Data, inputInfo, *inputDecoder, *outputEncoder); } } // namespace armnn diff --git a/src/backends/reference/workloads/RefPadWorkload.cpp b/src/backends/reference/workloads/RefPadWorkload.cpp index ea515cae68..f15306d1af 100644 --- a/src/backends/reference/workloads/RefPadWorkload.cpp +++ b/src/backends/reference/workloads/RefPadWorkload.cpp @@ -31,6 +31,8 @@ void RefPadWorkload::Execute(std::vector inputs, std::vector inputs, std::vector> inputDecoder = MakeDecoder(GetTensorInfo(inputs[0]), inputs[0]->Map()); std::unique_ptr> alphaDecoder = MakeDecoder(GetTensorInfo(inputs[1]), @@ -39,7 +43,7 @@ void RefPreluWorkload::Execute(std::vector inputs, std::vector> outputEncoder = MakeEncoder(GetTensorInfo(outputs[0]), outputs[0]->Map()); - PreluImpl(m_Data, *inputDecoder, *alphaDecoder, *outputEncoder); + PreluImpl(inputInfo, alphaInfo, outputInfo, *inputDecoder, *alphaDecoder, *outputEncoder); } } // namespace armnn diff --git a/src/backends/reference/workloads/RefRankWorkload.hpp b/src/backends/reference/workloads/RefRankWorkload.hpp index 237ae999ce..288dddd21d 100644 --- a/src/backends/reference/workloads/RefRankWorkload.hpp +++ b/src/backends/reference/workloads/RefRankWorkload.hpp @@ -32,7 +32,7 @@ private: { const int32_t rank = static_cast(GetTensorInfo(inputs[0]).GetNumDimensions()); - std::memcpy(GetOutputTensorData(0, m_Data), &rank, sizeof(int32_t)); + std::memcpy(outputs[0]->Map(), &rank, sizeof(int32_t)); outputs[0]->Unmap(); } }; diff --git a/src/backends/reference/workloads/RefStackWorkload.cpp b/src/backends/reference/workloads/RefStackWorkload.cpp index 20cf3b38f5..31949e967e 100644 --- a/src/backends/reference/workloads/RefStackWorkload.cpp +++ b/src/backends/reference/workloads/RefStackWorkload.cpp @@ -32,26 +32,6 @@ void RefStackWorkload::Execute(std::vector inputs, std::vector(0, m_Data); - ARMNN_ASSERT(output != nullptr); - - unsigned int numInputs = m_Data.m_Parameters.m_NumInputs; - unsigned int inputLength = GetTensorInfo(inputs[0]).GetNumElements(); - - for (unsigned int inputIdx=0; inputIdx(inputIdx, m_Data); - for (unsigned int elmt=0; elmt>> inputDecoders; for (unsigned int i=0; i(tensorHandle->Map()); } +template +DataType* GetOutputTensorData(ITensorHandle* tensorHandle) +{ + return reinterpret_cast(tensorHandle->Map()); +} + template const float* GetInputTensorDataFloat(unsigned 
int idx, const PayloadType& data) { diff --git a/src/backends/reference/workloads/Stack.cpp b/src/backends/reference/workloads/Stack.cpp index 386c8992eb..f2bce54d6a 100644 --- a/src/backends/reference/workloads/Stack.cpp +++ b/src/backends/reference/workloads/Stack.cpp @@ -24,6 +24,24 @@ void Stack(const StackQueueDescriptor& data, unsigned int axis = data.m_Parameters.m_Axis; + // Can perform a simple concatenation when axis == 0 + if (!axis) + { + unsigned int numInputs = data.m_Parameters.m_NumInputs; + unsigned int inputLength = inputInfo.GetNumElements(); + + for (unsigned int inputIdx=0; inputIdxGet()); + } + } + return; + } + // Initialise output data unsigned int numOutputElements = 1; for (unsigned int i=0; i
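
Usage sketch (not part of the patch above): with constant layers executed once at load time and each WorkingMemHandle owning its own memory managers and intermediate tensor handles, overlapped execution is achieved by giving every thread its own handle. The sketch below assumes the experimental IRuntime entry points of that era (CreateWorkingMemHandle and the Execute overload taking an IWorkingMemHandle) which forward to the LoadedNetwork methods changed in this patch; the header paths, runtime set-up and the per-thread input/output tensors are illustrative, not part of the change.

    #include <armnn/IRuntime.hpp>
    #include <armnn/IWorkingMemHandle.hpp>
    #include <memory>
    #include <thread>
    #include <vector>

    // Run one inference per thread, each with its own working memory.
    void RunConcurrentInferences(armnn::IRuntime* runtime,
                                 armnn::NetworkId networkId,
                                 const std::vector<armnn::InputTensors>& inputsPerThread,
                                 std::vector<armnn::OutputTensors>& outputsPerThread)
    {
        std::vector<std::thread> threads;
        for (size_t i = 0; i < inputsPerThread.size(); ++i)
        {
            threads.emplace_back([&, i]()
            {
                // Each handle carries its own inter-layer memory (owned tensor handles
                // plus the per-handle memory managers); constant tensor handles are
                // shared between all handles by the LoadedNetwork.
                std::unique_ptr<armnn::experimental::IWorkingMemHandle> workingMemHandle =
                    runtime->CreateWorkingMemHandle(networkId);

                runtime->Execute(*workingMemHandle, inputsPerThread[i], outputsPerThread[i]);
            });
        }

        for (auto& thread : threads)
        {
            thread.join();
        }
    }

Each handle's Allocate() and Free() acquire and release its memory managers (see WorkingMemHandle.cpp above); if Allocate() is not called explicitly, allocation is deferred to execution time.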