From b1aad4270fa8ad5c4aa62e27d564baf723b2cee5 Mon Sep 17 00:00:00 2001 From: Finn Williams Date: Thu, 28 Oct 2021 19:07:32 +0100 Subject: IVGCVSW-6527 Support the new memory API in loaded network * enable external memory management for neon and ref backends * change m_TensorMemoryVector to hold shared pointers * change input layer backend Id to match backend id of connected layer Signed-off-by: Finn Williams Change-Id: I2216a724028312eb101b290df3f224177826b1a0 --- include/armnn/IRuntime.hpp | 14 +- .../armnn/backends/IMemoryOptimizerStrategy.hpp | 4 +- src/armnn/LoadedNetwork.cpp | 610 +++++++++++++++++---- src/armnn/LoadedNetwork.hpp | 38 +- src/armnn/Network.cpp | 16 + src/armnn/WorkingMemHandle.cpp | 28 +- src/armnn/WorkingMemHandle.hpp | 25 +- src/armnn/test/OptimizerTests.cpp | 3 +- src/backends/backendsCommon/DefaultAllocator.hpp | 4 +- src/backends/backendsCommon/MemoryManager.cpp | 2 +- src/backends/backendsCommon/MemoryManager.hpp | 11 +- src/backends/backendsCommon/common.mk | 7 +- .../strategies/SingleAxisPriorityList.cpp | 4 +- .../backendsCommon/test/CompatibilityTests.cpp | 6 +- .../backendsCommon/test/MemoryManagerTests.cpp | 40 +- .../backendsCommon/test/OptimizedNetworkTests.cpp | 4 +- src/backends/cl/ClBackend.hpp | 2 +- src/backends/neon/NeonBackend.hpp | 4 +- src/backends/neon/NeonTensorHandle.hpp | 12 +- src/backends/reference/RefBackend.hpp | 2 +- src/backends/reference/RefTensorHandle.cpp | 2 +- src/backends/reference/RefWorkloadFactory.cpp | 22 +- 22 files changed, 677 insertions(+), 183 deletions(-) diff --git a/include/armnn/IRuntime.hpp b/include/armnn/IRuntime.hpp index 93f8b0fd5b..bdfd9b224b 100644 --- a/include/armnn/IRuntime.hpp +++ b/include/armnn/IRuntime.hpp @@ -43,7 +43,8 @@ struct INetworkProperties m_ProfilingEnabled(profilingEnabled), m_OutputNetworkDetailsMethod(ProfilingDetailsMethod::Undefined), m_InputSource(m_ImportEnabled ? MemorySource::Malloc : MemorySource::Undefined), - m_OutputSource(m_ExportEnabled ? MemorySource::Malloc : MemorySource::Undefined) + m_OutputSource(m_ExportEnabled ? MemorySource::Malloc : MemorySource::Undefined), + m_ExternalMemoryManagementEnabled(false) {} ARMNN_DEPRECATED_MSG_REMOVAL_DATE("Please use INetworkProperties constructor without numThreads argument", "22.02") @@ -58,7 +59,8 @@ struct INetworkProperties m_ProfilingEnabled(profilingEnabled), m_OutputNetworkDetailsMethod(ProfilingDetailsMethod::Undefined), m_InputSource(inputSource), - m_OutputSource(outputSource) + m_OutputSource(outputSource), + m_ExternalMemoryManagementEnabled(false) { armnn::IgnoreUnused(numThreads); } @@ -67,14 +69,16 @@ struct INetworkProperties MemorySource inputSource, MemorySource outputSource, bool profilingEnabled = false, - ProfilingDetailsMethod detailsMethod = ProfilingDetailsMethod::Undefined) + ProfilingDetailsMethod detailsMethod = ProfilingDetailsMethod::Undefined, + bool externalMemoryManagementEnabled = false) : m_ImportEnabled(inputSource != MemorySource::Undefined), m_ExportEnabled(outputSource != MemorySource::Undefined), m_AsyncEnabled(asyncEnabled), m_ProfilingEnabled(profilingEnabled), m_OutputNetworkDetailsMethod(detailsMethod), m_InputSource(inputSource), - m_OutputSource(outputSource) + m_OutputSource(outputSource), + m_ExternalMemoryManagementEnabled(externalMemoryManagementEnabled) {} /// Deprecated and will be removed in future release. 
@@ -91,6 +95,8 @@ struct INetworkProperties const MemorySource m_InputSource; const MemorySource m_OutputSource; + const bool m_ExternalMemoryManagementEnabled; + virtual ~INetworkProperties() {} }; diff --git a/include/armnn/backends/IMemoryOptimizerStrategy.hpp b/include/armnn/backends/IMemoryOptimizerStrategy.hpp index ad5513f8a3..bdb2f5bd30 100644 --- a/include/armnn/backends/IMemoryOptimizerStrategy.hpp +++ b/include/armnn/backends/IMemoryOptimizerStrategy.hpp @@ -19,8 +19,8 @@ struct MemBlock const unsigned int index) : m_StartOfLife(startOfLife), m_EndOfLife(endOfLife), m_MemSize(memSize), m_Offset(offset), m_Index(index) {} - const unsigned int m_StartOfLife; // Y start - const unsigned int m_EndOfLife; // Y end + const unsigned int m_StartOfLife; // Y start inclusive + const unsigned int m_EndOfLife; // Y end inclusive const size_t m_MemSize; // Offset + Memsize = X end size_t m_Offset; // X start diff --git a/src/armnn/LoadedNetwork.cpp b/src/armnn/LoadedNetwork.cpp index 7fb14d0f32..03e5ad5bfe 100644 --- a/src/armnn/LoadedNetwork.cpp +++ b/src/armnn/LoadedNetwork.cpp @@ -131,11 +131,14 @@ LoadedNetwork::LoadedNetwork(std::unique_ptr net, profiler->EnableNetworkDetailsToStdOut(networkProperties.m_OutputNetworkDetailsMethod); - Graph& order = m_OptimizedNetwork->pOptimizedNetworkImpl->GetGraph().TopologicalSort(); //First create tensor handlers, backends and workload factories. //Handlers are created before workloads are. //Because workload creation can modify some of the handlers, //(for example the splitter and concat layers). + + bool useExternalMemoryManager = false; + bool useInternalMemoryManager = false; + Graph& order = m_OptimizedNetwork->pOptimizedNetworkImpl->GetGraph().TopologicalSort(); for (auto&& layer : order) { auto const& backendId = layer->GetBackendId(); @@ -154,25 +157,44 @@ LoadedNetwork::LoadedNetwork(std::unique_ptr net, throw BackendCapabilityException(er); } + if (networkProperties.m_AsyncEnabled && + !HasCapability(BackendOptions::BackendOption{"ExternallyManagedMemory", true}, + backend->GetCapabilities())) + { + std::string er = backend->GetId(); + er += " does not support ExternallyManagedMemory\n"; + er += "AsyncEnabled networks require all backends to support ExternallyManagedMemory"; + throw BackendCapabilityException(er); + } + + if (HasCapability(BackendOptions::BackendOption{"ExternallyManagedMemory", true},backend->GetCapabilities()) + && (m_NetworkProperties.m_ExternalMemoryManagementEnabled || m_NetworkProperties.m_AsyncEnabled)) + { + m_SupportsExternallyManagedMemory[backend->GetId()] = true; + useExternalMemoryManager = true; + } + else + { + m_SupportsExternallyManagedMemory[backend->GetId()] = false; + useInternalMemoryManager = true; + } + + IBackendInternal::IWorkloadFactoryPtr workloadFactory; if (backend->SupportsTensorAllocatorAPI()) { - auto workloadFactory = backend->CreateWorkloadFactory( + workloadFactory = backend->CreateWorkloadFactory( m_TensorHandleFactoryRegistry, m_OptimizedNetwork->pOptimizedNetworkImpl->GetModelOptions(), static_cast(m_NetworkProperties.m_InputSource), static_cast(m_NetworkProperties.m_OutputSource)); - m_WorkloadFactories.emplace( - std::make_pair(backendId, std::make_pair(std::move(workloadFactory), nullptr))); } else { - IBackendInternal::IMemoryManagerSharedPtr memoryManager = backend->CreateMemoryManager(); - auto workloadFactory = backend->CreateWorkloadFactory( - memoryManager, m_OptimizedNetwork->pOptimizedNetworkImpl->GetModelOptions()); - - m_WorkloadFactories.emplace( - 
std::make_pair(backendId, std::make_pair(std::move(workloadFactory), memoryManager))); + m_BackendMemoryMangers.emplace_back(backend->CreateMemoryManager()); + workloadFactory = backend->CreateWorkloadFactory( + m_BackendMemoryMangers.back(), m_OptimizedNetwork->pOptimizedNetworkImpl->GetModelOptions()); } + m_WorkloadFactories[backendId ] = std::move(workloadFactory); } } @@ -181,6 +203,7 @@ LoadedNetwork::LoadedNetwork(std::unique_ptr net, for (auto&& layer : order) { auto& workloadFactory = GetWorkloadFactory(*layer); + bool supportsExternalManager = m_SupportsExternallyManagedMemory[layer->GetBackendId()]; switch (layer->GetType()) { @@ -191,7 +214,12 @@ LoadedNetwork::LoadedNetwork(std::unique_ptr net, // to false when creating TensorHandles layer->CreateTensorHandles(m_TensorHandleFactoryRegistry, workloadFactory, - !m_NetworkProperties.m_ImportEnabled); + !supportsExternalManager && !m_NetworkProperties.m_ImportEnabled); + break; + } + case LayerType::Constant: + { + layer->CreateTensorHandles(m_TensorHandleFactoryRegistry, workloadFactory, true); break; } default: @@ -199,16 +227,18 @@ LoadedNetwork::LoadedNetwork(std::unique_ptr net, // Look for a layer with 1 OutputSlot which has 1 connection and that connection is an Output Layer // If Export is enabled disable memory management so we can export, otherwise we do a copy if ((layer->GetNumOutputSlots() == 1) && - (layer->GetOutputSlots()[0].GetNumConnections() == 1) && - (layer->GetOutputSlots()[0].GetConnection(0)->GetOwningLayer().GetType() == LayerType::Output)) + (layer->GetOutputSlots()[0].GetNumConnections() == 1) && + (layer->GetOutputSlots()[0].GetConnection(0)->GetOwningLayer().GetType() == LayerType::Output)) { layer->CreateTensorHandles(m_TensorHandleFactoryRegistry, workloadFactory, - !m_NetworkProperties.m_ExportEnabled); + !supportsExternalManager && !m_NetworkProperties.m_ExportEnabled); } else { - layer->CreateTensorHandles(m_TensorHandleFactoryRegistry, workloadFactory); + layer->CreateTensorHandles(m_TensorHandleFactoryRegistry, + workloadFactory, + !supportsExternalManager); } } } @@ -251,7 +281,8 @@ LoadedNetwork::LoadedNetwork(std::unique_ptr net, // Inputs and outputs are treated in a special way - see EnqueueInput() and EnqueueOutput(). break; } - default: { + default: + { auto workload = layer->CreateWorkload(workloadFactory); if (!workload) @@ -272,11 +303,16 @@ LoadedNetwork::LoadedNetwork(std::unique_ptr net, // For async networks ConstantWorkloads are managed exclusively by LoadedNetwork // and are separated out from the other workloads - if (networkProperties.m_AsyncEnabled && layer->GetType() == LayerType::Constant) + if((networkProperties.m_AsyncEnabled || useExternalMemoryManager) && + layer->GetType() == LayerType::Constant) { + m_ConstantTensorHandles[layer->GetGuid()] = + layer->GetOutputSlot(0).GetOutputHandler().GetData(); m_ConstantWorkloads[layer->GetGuid()] = std::move(workload); - } else { - m_WorkloadQueue.push_back(move(workload)); + } + else + { + m_WorkloadQueue.push_back(std::move(workload)); } // release the constant data in the layer.. 
@@ -289,7 +325,7 @@ LoadedNetwork::LoadedNetwork(std::unique_ptr net, for (auto&& workloadFactory : m_WorkloadFactories) { - workloadFactory.second.first->AfterWorkloadsCreated(); + workloadFactory.second->AfterWorkloadsCreated(); } if (timelineUtils) @@ -298,26 +334,88 @@ LoadedNetwork::LoadedNetwork(std::unique_ptr net, timelineUtils->Commit(); } + if (useExternalMemoryManager) + { + if (networkProperties.m_AsyncEnabled) + { + CreateMemoryProfileAsync(); + } + else + { + CreateMemoryProfile(); + } + + auto backendStrategyMap = BackendRegistryInstance().GetMemoryOptimizerStrategies(); + for (auto& backendMemoryProfile : m_MemBlockMap) + { + const BackendId& backendId = backendMemoryProfile.first; + if (backendStrategyMap.find(backendId) != backendStrategyMap.end()) + { + m_MemBinMap[backendId] = backendStrategyMap[backendId]->Optimize(backendMemoryProfile.second); + } + else + { + m_MemBinMap[backendId] = m_ConstantStrategy->Optimize(backendMemoryProfile.second); + } + } + + if (!networkProperties.m_AsyncEnabled) + { + m_ExternalMemoryManager = CreateExternalMemoryManger(m_TensorMemory); + + // Sort m_TensorMemory, so it's order matches m_Tensorhandles + std::sort(m_TensorMemory.begin(), m_TensorMemory.end(), + [](const std::pair, MemorySource>& lhs, + const std::pair, MemorySource>& rhs) + { + return lhs.first->m_OutputSlotId < rhs.first->m_OutputSlotId; + }); + } + } + + // Now that the intermediate tensor memory has been set-up, + // do any post allocation configuration for each workload. if (!networkProperties.m_AsyncEnabled) { - // Set up memory. - m_OptimizedNetwork->pOptimizedNetworkImpl->GetGraph().AllocateDynamicBuffers(); + if (useInternalMemoryManager) + { + // Set up memory. + m_OptimizedNetwork->pOptimizedNetworkImpl->GetGraph().AllocateDynamicBuffers(); + } - // Now that the intermediate tensor memory has been set-up, - // do any post allocation configuration for each workload. 
- ARMNN_SCOPED_PROFILING_EVENT(Compute::Undefined, "LoadNetwork_PostAllocationConfigure"); for (auto &workload : m_WorkloadQueue) { workload->PostAllocationConfigure(); } } - else + + if (useExternalMemoryManager) { - AllocateAndExecuteConstantWorkloads(); + if (!networkProperties.m_AsyncEnabled) + { + AllocateAndExecuteConstantWorkloads(); + } + else + { + AllocateAndExecuteConstantWorkloadsAsync(); + } } } void LoadedNetwork::AllocateAndExecuteConstantWorkloads() +{ + ARMNN_SCOPED_PROFILING_EVENT(Compute::Undefined, "LoadNetwork_AllocateAndExecuteConstants"); + for (auto& pair : m_ConstantWorkloads) + { + auto tensorHandle = m_ConstantTensorHandles[pair.first]; + tensorHandle->Allocate(); + pair.second->Execute(); + } +} + + + +void LoadedNetwork::AllocateAndExecuteConstantWorkloadsAsync() { ARMNN_SCOPED_PROFILING_EVENT(Compute::Undefined, "LoadNetwork_AllocateAndExecuteConstants"); Graph& order = m_OptimizedNetwork->pOptimizedNetworkImpl->GetGraph(); @@ -343,7 +441,6 @@ void LoadedNetwork::AllocateAndExecuteConstantWorkloads() } } - void LoadedNetwork::SendNetworkStructure() { ARMNN_SCOPED_PROFILING_EVENT(Compute::Undefined, "LoadNetwork_SendNetworkStructure"); @@ -429,7 +526,7 @@ const IWorkloadFactory& LoadedNetwork::GetWorkloadFactory(const Layer& layer) co CHECK_LOCATION()); } - workloadFactory = it->second.first.get(); + workloadFactory = it->second.get(); ARMNN_ASSERT_MSG(workloadFactory, "No workload factory"); @@ -780,9 +877,19 @@ void LoadedNetwork::AllocateWorkingMemory(std::lock_guard& lock) { return; } - for (auto&& workloadFactory : m_WorkloadFactories) + + if (m_ExternalMemoryManager) + { + m_ExternalMemoryManager->Allocate(); + + for (unsigned int i = 0; i < m_TensorMemory.size(); ++i) + { + m_Tensorhandles[i]->Import(m_TensorMemory[i].first->m_Data, m_TensorMemory[i].second); + } + } + + for (auto&& memoryManager : m_BackendMemoryMangers) { - IBackendInternal::IMemoryManagerSharedPtr memoryManager = workloadFactory.second.second; if (memoryManager) { memoryManager->Acquire(); @@ -795,14 +902,20 @@ void LoadedNetwork::AllocateWorkingMemory(std::lock_guard& lock) void LoadedNetwork::FreeWorkingMemory() { std::lock_guard lockGuard(m_WorkingMemMutex); + if (!m_IsWorkingMemAllocated) { return; } - // Informs the memory managers to release memory in it's respective memory group - for (auto&& workloadFactory : m_WorkloadFactories) + + if (m_ExternalMemoryManager) + { + m_ExternalMemoryManager->Deallocate(); + } + + // Informs the memory managers to release memory in its respective memory group + for (auto&& memoryManager : m_BackendMemoryMangers) { - IBackendInternal::IMemoryManagerSharedPtr memoryManager = workloadFactory.second.second; if (memoryManager) { memoryManager->Release(); @@ -1392,37 +1505,16 @@ Status LoadedNetwork::Execute(const InputTensors& inputTensors, std::unique_ptr LoadedNetwork::CreateWorkingMemHandle(NetworkId networkId) { Graph& order = m_OptimizedNetwork->pOptimizedNetworkImpl->GetGraph(); - std::unordered_map > > tensorHandleMap; - std::vector workingMemDescriptors; - std::unordered_map workingMemDescriptorMap; - TensorHandleFactoryRegistry tensorHandleFactoryRegistry; - WorkloadFactoryMap workloadFactoryMap; - std::vector> memoryManagers; + // Tensors that will need to be allocated internally within armnn + std::vector> managedTensorHandles; + // Tensors that will be allocated externally by the user + std::vector> unmanagedTensorHandles; - for (auto const& backend : m_Backends) - { - if (backend.second->SupportsTensorAllocatorAPI()) - { - 
backend.second->RegisterTensorHandleFactories( - tensorHandleFactoryRegistry, - static_cast(m_NetworkProperties.m_InputSource), - static_cast(m_NetworkProperties.m_OutputSource)); - memoryManagers.emplace_back(tensorHandleFactoryRegistry.GetMemoryManagers().back()); - } - else - { - std::shared_ptr memoryManager = backend.second->CreateMemoryManager(); - auto workloadFactory = backend.second->CreateWorkloadFactory( - memoryManager, m_OptimizedNetwork->pOptimizedNetworkImpl->GetModelOptions()); - - workloadFactoryMap.emplace( - std::make_pair(backend.first, std::make_pair(std::move(workloadFactory), memoryManager))); - memoryManagers.emplace_back(memoryManager); - } - } + std::vector workingMemDescriptors; + std::unordered_map workingMemDescriptorMap; - auto GetTensorHandle = [&](Layer* layer, const OutputSlot& outputSlot, bool isMemoryManaged) + auto GetTensorHandle = [&](Layer* layer, const OutputSlot& outputSlot) { ITensorHandleFactory::FactoryId factoryId = outputSlot.GetTensorHandleFactoryId(); const TensorInfo& tensorInfo = outputSlot.GetTensorInfo(); @@ -1431,28 +1523,30 @@ std::unique_ptr LoadedNetwork::CreateWorkingMemHandle(Network { BackendId id = layer->GetBackendId(); ARMNN_NO_DEPRECATE_WARN_BEGIN - return workloadFactoryMap.at(id).first->CreateTensorHandle(tensorInfo, isMemoryManaged); + return m_WorkloadFactories.at(id)->CreateTensorHandle(tensorInfo, false); ARMNN_NO_DEPRECATE_WARN_END } else { - ITensorHandleFactory* handleFactory = tensorHandleFactoryRegistry.GetFactory(factoryId); + ITensorHandleFactory* handleFactory = m_TensorHandleFactoryRegistry.GetFactory(factoryId); ARMNN_ASSERT(handleFactory); - return handleFactory->CreateTensorHandle(tensorInfo, isMemoryManaged); + return handleFactory->CreateTensorHandle(tensorInfo, false); } }; struct HandleInfo { - unsigned int m_ReferenceCount = 0; - bool isInputLayerHandle = false; - bool isOutputLayerHandle = false; + ITensorHandle* m_TensorHandle; + + bool m_IsInputLayerHandle = false; + bool m_IsOutputLayerHandle = false; WorkingMemHandle::InputMemDescriptorCoords m_InputMemDescriptorCoords; WorkingMemHandle::OutputMemDescriptorCoords m_OutputMemDescriptorCoords; }; - std::unordered_map handleReferenceCounts; + std::unordered_map outputToHandleInfoMap; + unsigned int layerIndex = 0; for (auto&& layer : order) { @@ -1508,27 +1602,33 @@ std::unique_ptr LoadedNetwork::CreateWorkingMemHandle(Network } } - tensorHandleMap[layer->GetGuid()].emplace_back(GetTensorHandle(layer, slot, isMemoryManaged)); - ITensorHandle* tensorHandle = tensorHandleMap[layer->GetGuid()].back().get(); + ITensorHandle* tensorHandle; + if (isMemoryManaged) + { + managedTensorHandles.emplace_back(GetTensorHandle(layer, slot)); + tensorHandle = managedTensorHandles.back().get(); + } + else + { + unmanagedTensorHandles.emplace_back(GetTensorHandle(layer, slot)); + tensorHandle = unmanagedTensorHandles.back().get(); + } workingMemDescriptor.m_Outputs.push_back(tensorHandle); - tensorHandle->Manage(); - unsigned int numConnections = slot.GetNumConnections(); - ARMNN_ASSERT(numConnections != 0); - HandleInfo& handleInfo = handleReferenceCounts[tensorHandle]; - handleInfo.m_ReferenceCount = numConnections; + HandleInfo& handleInfo = outputToHandleInfoMap[&slot]; + handleInfo.m_TensorHandle = tensorHandle; // Store the coordinates of the current layer's OutputSlot that is connected to the OutputLayer if (isConnectedToOutputLayer) { - handleInfo.isOutputLayerHandle = true; + handleInfo.m_IsOutputLayerHandle = true; 
handleInfo.m_OutputMemDescriptorCoords.m_OutputSlotCoords = {layerIndex, slotIndex}; } // Store the LayerBindingId of the InputLayer if (isInputLayer) { - handleInfo.isInputLayerHandle = true; + handleInfo.m_IsInputLayerHandle = true; LayerBindingId bindingId = static_cast(layer)->GetBindingId(); handleInfo.m_InputMemDescriptorCoords.m_LayerBindingId = bindingId; } @@ -1557,20 +1657,19 @@ std::unique_ptr LoadedNetwork::CreateWorkingMemHandle(Network { LayerBindingId bindingId = static_cast(layer)->GetBindingId(); - HandleInfo& handleInfo = handleReferenceCounts[tensorHandle]; - handleInfo.isOutputLayerHandle = true; + HandleInfo& handleInfo = outputToHandleInfoMap[outputSlot]; + handleInfo.m_TensorHandle = tensorHandle; + handleInfo.m_IsOutputLayerHandle = true; handleInfo.m_OutputMemDescriptorCoords.m_LayerBindingIds.push_back(bindingId); handleInfo.m_OutputMemDescriptorCoords.m_InputSlotCoords.push_back({layerIndex, 0}); } continue; } - auto search = tensorHandleMap.find(key); - unsigned int index = outputSlot->CalculateIndexOnOwner(); - ITensorHandle* inputTensorHandle = search->second[index].get(); - workingMemDescriptor.m_Inputs.push_back(inputTensorHandle); + HandleInfo& handleInfo = outputToHandleInfoMap.at(outputSlot); - HandleInfo& handleInfo = handleReferenceCounts.at(inputTensorHandle); + ITensorHandle* inputTensorHandle = handleInfo.m_TensorHandle; + workingMemDescriptor.m_Inputs.push_back(inputTensorHandle); // Store the LayerBindingId of the OutputLayer if (isOutputLayer) @@ -1581,25 +1680,18 @@ std::unique_ptr LoadedNetwork::CreateWorkingMemHandle(Network } // In this case the layer is not an Output Layer but shares its input tensorhandle with an OutputLayer // It will need to be updated as well, if we swap out the tensorhandle - else if (handleInfo.isOutputLayerHandle) + else if (handleInfo.m_IsOutputLayerHandle) { handleInfo.m_OutputMemDescriptorCoords.m_InputSlotCoords.push_back({layerIndex, slot.GetSlotIndex()}); } // Store the coordinates of the InputSlots connected to the InputLayer // There can be more than one InputSlot connected to an InputLayer, so we use a vector - if (handleInfo.isInputLayerHandle) + if (handleInfo.m_IsInputLayerHandle) { std::pair connectionLocation{layerIndex, slot.GetSlotIndex()}; handleInfo.m_InputMemDescriptorCoords.m_InputSlotCoords.emplace_back(connectionLocation); } - - --handleInfo.m_ReferenceCount; - if (handleInfo.m_ReferenceCount == 0u) - { - // Stop managing lifetime of tensor handle - inputTensorHandle->Allocate(); - } } workingMemDescriptorMap.insert({layer->GetGuid(), workingMemDescriptor}); @@ -1612,17 +1704,29 @@ std::unique_ptr LoadedNetwork::CreateWorkingMemHandle(Network } } + std::vector, MemorySource>> tensorMemory; + + auto externalMemoryManager = CreateExternalMemoryManger(tensorMemory); + + // Sort m_TensorMemory, so it's order matches the outputSlot order + std::sort(tensorMemory.begin(), tensorMemory.end(), + [](const std::pair, MemorySource>& lhs, + const std::pair, MemorySource>& rhs) + { + return lhs.first->m_OutputSlotId < rhs.first->m_OutputSlotId; + }); + std::vector inputConnectionsInfo; std::vector outputConnectionsInfo; - for (const auto& handleInfo: handleReferenceCounts) + for (const auto& handleInfo: outputToHandleInfoMap) { - if (handleInfo.second.isOutputLayerHandle) + if (handleInfo.second.m_IsOutputLayerHandle) { outputConnectionsInfo.emplace_back(handleInfo.second.m_OutputMemDescriptorCoords); } - if (handleInfo.second.isInputLayerHandle) + if (handleInfo.second.m_IsInputLayerHandle) { 
inputConnectionsInfo.emplace_back(handleInfo.second.m_InputMemDescriptorCoords); } @@ -1633,8 +1737,10 @@ std::unique_ptr LoadedNetwork::CreateWorkingMemHandle(Network outputConnectionsInfo, workingMemDescriptors, workingMemDescriptorMap, - memoryManagers, - std::move(tensorHandleMap)); + std::move(externalMemoryManager), + std::move(tensorMemory), + std::move(managedTensorHandles), + std::move(unmanagedTensorHandles)); } void LoadedNetwork::RegisterDebugCallback(const DebugCallbackFunction& func) @@ -1645,6 +1751,312 @@ void LoadedNetwork::RegisterDebugCallback(const DebugCallbackFunction& func) } } + +void LoadedNetwork::CreateMemoryProfileAsync() +{ + struct PartialBlock + { + unsigned int m_StartOfLife; + unsigned int m_Lifetime; + + size_t m_MemSize; + unsigned int m_Index; + + BackendId m_BackendId; + }; + + auto align = [](size_t numToAlign) + { + const size_t alignment = sizeof(float); + return ((numToAlign + alignment - 1) / alignment) * alignment; + }; + + std::unordered_map memBlockTrackerMap; + + const bool inputImportingEnabled = m_NetworkProperties.m_InputSource != MemorySource::Undefined; + const bool outputImportingEnabled = m_NetworkProperties.m_OutputSource != MemorySource::Undefined; + + unsigned int timestep = 0; + unsigned int outputIndex = 0; + Graph& order = m_OptimizedNetwork->pOptimizedNetworkImpl->GetGraph().TopologicalSort(); + + for (auto&& layer : order) + { + const LayerType& layerType = layer->GetType(); + // Don't manage memory if importing. + if (layerType == LayerType::Input && inputImportingEnabled) + { + continue; + } + // Don't manage memory if importing. + if (layerType == LayerType::Output && outputImportingEnabled + && layer->GetInputSlot(0).GetConnectedOutputSlot()->GetNumConnections() == 1) + { + continue; + } + // Because Constant Layer memory can not be shared, the memory must persist for the lifetime of execution, + // management is done separately. 
+ if (layerType == LayerType::Constant) + { + continue; + } + + BackendId backendId = layer->GetBackendId(); + for (auto& outputSlot : layer->GetOutputSlots()) + { + if (!m_SupportsExternallyManagedMemory[backendId]) + { + continue; + } + + PartialBlock partialBlock; + + partialBlock.m_StartOfLife = timestep; + + size_t alignedSize = align(outputSlot.GetOutputHandler().GetTensorInfo().GetNumBytes()); + partialBlock.m_MemSize = alignedSize; + partialBlock.m_Index = outputIndex++; + partialBlock.m_Lifetime = outputSlot.GetNumConnections(); + partialBlock.m_BackendId = backendId; + + if (partialBlock.m_Lifetime == 0) + { + m_MemBlockMap[partialBlock.m_BackendId].emplace_back(partialBlock.m_StartOfLife, + partialBlock.m_StartOfLife, + partialBlock.m_MemSize, + 0, + partialBlock.m_Index); + } + else + { + memBlockTrackerMap[&outputSlot] = partialBlock; + } + } + + for (auto& inputSlot : layer->GetInputSlots()) + { + const Layer& connectedInputLayer = inputSlot.GetConnectedOutputSlot()->GetOwningLayer(); + const LayerType& owningLayerType = connectedInputLayer.GetType(); + + if (owningLayerType == LayerType::Constant) + { + continue; + } + if (inputImportingEnabled && owningLayerType == LayerType::Input) + { + continue; + } + + auto outputSlot = inputSlot.GetConnectedOutputSlot(); + + PartialBlock& partialBlock = memBlockTrackerMap.at(outputSlot); + + auto& lifetime = partialBlock.m_Lifetime; + --lifetime; + + if (lifetime == 0) + { + m_MemBlockMap[partialBlock.m_BackendId].emplace_back(partialBlock.m_StartOfLife, + timestep, + partialBlock.m_MemSize, + 0, + partialBlock.m_Index); + } + } + ++timestep; + } +} + +void LoadedNetwork::CreateMemoryProfile() +{ + // Finds the first TensorHandle ancestor of a SubTensorHandle. If the ITensorHandle provided + // is a TensorHandle, the function just returns it + auto TraceSubTensorHandleAncestry = [](ITensorHandle* const subTensorHandle) + { + ITensorHandle* ancestor = subTensorHandle; + while (ancestor && ancestor->GetParent()) + { + ancestor = ancestor->GetParent(); + } + return ancestor; + }; + + struct PartialBlock + { + unsigned int m_StartOfLife; + unsigned int m_Lifetime; + + size_t m_MemSize; + unsigned int m_Index; + + BackendId m_BackendId; + }; + + auto align = [](size_t numToAlign) + { + const size_t alignment = sizeof(float); + return ((numToAlign + alignment - 1) / alignment) * alignment; + }; + + std::unordered_map memBlockTrackerMap; + + const bool inputImportingEnabled = m_NetworkProperties.m_InputSource != MemorySource::Undefined; + const bool outputImportingEnabled = m_NetworkProperties.m_OutputSource != MemorySource::Undefined; + + unsigned int timestep = 0; + unsigned int outputIndex = 0; + Graph& order = m_OptimizedNetwork->pOptimizedNetworkImpl->GetGraph().TopologicalSort(); + + for (auto&& layer : order) + { + const LayerType& layerType = layer->GetType(); + // Don't manage memory if importing. + if (layerType == LayerType::Input && inputImportingEnabled) + { + continue; + } + // Don't manage memory if importing. + if (layerType == LayerType::Output && outputImportingEnabled + && layer->GetInputSlot(0).GetConnectedOutputSlot()->GetNumConnections() == 1) + { + continue; + } + // Because Constant Layer memory can not be shared, the memory must persist for the lifetime of execution, + // management is done separately. 
+ if (layerType == LayerType::Constant) + { + continue; + } + + BackendId backendId = layer->GetBackendId(); + for (auto& outputSlot : layer->GetOutputSlots()) + { + if (!m_SupportsExternallyManagedMemory[backendId]) + { + continue; + } + + ITensorHandle* tensorHandle = outputSlot.GetOutputHandler().GetData(); + tensorHandle = TraceSubTensorHandleAncestry(tensorHandle); + + if (memBlockTrackerMap.find(tensorHandle) == memBlockTrackerMap.end()) + { + PartialBlock partialBlock; + + partialBlock.m_StartOfLife = timestep; + + size_t alignedSize = align(outputSlot.GetOutputHandler().GetTensorInfo().GetNumBytes()); + partialBlock.m_MemSize = alignedSize; + partialBlock.m_Index = outputIndex++; + partialBlock.m_Lifetime = outputSlot.GetNumConnections(); + partialBlock.m_BackendId = backendId; + + if (partialBlock.m_Lifetime == 0) + { + m_MemBlockMap[partialBlock.m_BackendId].emplace_back(partialBlock.m_StartOfLife, + partialBlock.m_StartOfLife, + partialBlock.m_MemSize, + 0, + partialBlock.m_Index); + } + else + { + memBlockTrackerMap[tensorHandle] = partialBlock; + } + m_Tensorhandles.push_back(tensorHandle); + + } + else + { + memBlockTrackerMap.at(tensorHandle).m_Lifetime += outputSlot.GetNumConnections(); + } + } + + for (auto& inputSlot : layer->GetInputSlots()) + { + const Layer& connectedInputLayer = inputSlot.GetConnectedOutputSlot()->GetOwningLayer(); + const LayerType& owningLayerType = connectedInputLayer.GetType(); + + if (owningLayerType == LayerType::Constant) + { + continue; + } + if (inputImportingEnabled && owningLayerType == LayerType::Input) + { + continue; + } + if (!m_SupportsExternallyManagedMemory[connectedInputLayer.GetBackendId()]) + { + continue; + } + + auto outputSlot = inputSlot.GetConnectedOutputSlot(); + + ITensorHandle* tensorHandle = outputSlot->GetOutputHandler().GetData(); + tensorHandle = TraceSubTensorHandleAncestry(tensorHandle); + + PartialBlock& partialBlock = memBlockTrackerMap.at(tensorHandle); + + auto& lifetime = partialBlock.m_Lifetime; + --lifetime; + + if (lifetime == 0) + { + m_MemBlockMap[partialBlock.m_BackendId].emplace_back(partialBlock.m_StartOfLife, + timestep, + partialBlock.m_MemSize, + 0, + partialBlock.m_Index); + } + } + ++timestep; + } + +} + +std::unique_ptr LoadedNetwork::CreateExternalMemoryManger( + std::vector, MemorySource>>& tensorMemoryVec) +{ + std::unique_ptr memoryManager = std::make_unique(); + auto allocatorMap = BackendRegistryInstance().GetAllocators(); + + for (auto& backend : m_MemBinMap) + { + std::vector bufferStorageVec; + + std::shared_ptr backendAllocator; + if (allocatorMap.find(backend.first) != allocatorMap.end()) + { + backendAllocator = allocatorMap[backend.first]; + } + else + { + backendAllocator = m_Backends[backend.first]->GetDefaultAllocator(); + } + + for (auto& memBin : backend.second) + { + BufferStorage bufferStorage; + bufferStorage.m_BufferSize = memBin.m_MemSize; + bufferStorage.m_TensorMemoryVector.reserve(memBin.m_MemBlocks.size()); + + for (auto& memBlock : memBin.m_MemBlocks) + { + auto tensorMemory = std::make_shared(TensorMemory{memBlock.m_Offset, memBlock.m_Index}); + + tensorMemoryVec.emplace_back(tensorMemory, backendAllocator->GetMemorySourceType()); + bufferStorage.m_TensorMemoryVector.emplace_back(tensorMemory); + } + + bufferStorageVec.emplace_back(std::move(bufferStorage)); + } + + memoryManager->StoreMemToAllocate(bufferStorageVec, backendAllocator, 4); + } + + return memoryManager; +} + LayerBindingId LoadedNetwork::ValidateImportedInputID(ImportedInputId id) { try diff --git 
a/src/armnn/LoadedNetwork.hpp b/src/armnn/LoadedNetwork.hpp index 71ceaa3938..35c482cbc7 100644 --- a/src/armnn/LoadedNetwork.hpp +++ b/src/armnn/LoadedNetwork.hpp @@ -10,9 +10,15 @@ #include #include +#include #include #include #include +#include +#include +#include + + #include #include @@ -89,16 +95,16 @@ public: profiling::ProfilingGuid GetNetworkGuid(); private: - using WorkloadFactoryWithMemoryManager = - std::pair; - using WorkloadFactoryMap = std::unordered_map; void AllocateWorkingMemory(std::lock_guard& lock); void AllocateAndExecuteConstantWorkloads(); + void AllocateAndExecuteConstantWorkloadsAsync(); + + std::unordered_map> m_ConstantWorkloads; + std::unordered_map m_ConstantTensorHandles; - std::unordered_map m_ConstantTensorHandles; - std::unordered_map > m_ConstantWorkloads; + std::unique_ptr m_ConstantStrategy = std::make_unique(); LoadedNetwork(std::unique_ptr net, const INetworkProperties& networkProperties, @@ -120,9 +126,18 @@ private: inline LayerBindingId ValidateImportedInputID(ImportedInputId id); inline LayerBindingId ValidateImportedOutputID(ImportedOutputId id); + void CreateMemoryProfile(); + void CreateMemoryProfileAsync(); + + std::unique_ptr CreateExternalMemoryManger( + std::vector, MemorySource>>& tensorMemory); + using BackendPtrMap = std::unordered_map; - BackendPtrMap m_Backends; + BackendPtrMap m_Backends; + std::vector m_BackendMemoryMangers; + + using WorkloadFactoryMap = std::unordered_map; WorkloadFactoryMap m_WorkloadFactories; std::unique_ptr m_OptimizedNetwork; @@ -171,6 +186,17 @@ private: ImportedInputId m_CurImportedInputId = 0; ImportedInputId m_CurImportedOutputId = 0; + + std::unordered_map> m_MemBlockMap; + std::unordered_map> m_MemBinMap; + + std::vector m_Tensorhandles; + + std::vector, MemorySource>> m_TensorMemory; + + std::unique_ptr m_ExternalMemoryManager; + + std::unordered_map m_SupportsExternallyManagedMemory; }; } diff --git a/src/armnn/Network.cpp b/src/armnn/Network.cpp index e00dbfc0fc..17a1da1f6c 100644 --- a/src/armnn/Network.cpp +++ b/src/armnn/Network.cpp @@ -934,6 +934,11 @@ OptimizationResult AssignBackends(OptimizedNetworkImpl* optNetObjPtr, { auto layer = *it; + if (layer->GetType() == LayerType::Input) + { + continue; + } + DataType dataTypeIn = layer->GetNumInputSlots() == 0 ? DataType::Float32 : layer->GetInputSlot(0).GetConnectedOutputSlot()->GetTensorInfo().GetDataType(); DataType dataTypeOut = layer->GetNumOutputSlots() == 0 ? 
DataType::Float32 : @@ -1027,6 +1032,17 @@ OptimizationResult AssignBackends(OptimizedNetworkImpl* optNetObjPtr, } } + for (auto it = firstLayer; it != lastLayer; ++it) + { + auto layer = *it; + + if(layer->GetType() == LayerType::Input) + { + BackendId connectedBackendId = layer->GetOutputSlot(0).GetConnection(0)->GetOwningLayer().GetBackendId(); + layer->SetBackendId(connectedBackendId); + } + } + return result; } diff --git a/src/armnn/WorkingMemHandle.cpp b/src/armnn/WorkingMemHandle.cpp index e2ad52a772..2cb47fbfc7 100644 --- a/src/armnn/WorkingMemHandle.cpp +++ b/src/armnn/WorkingMemHandle.cpp @@ -17,16 +17,20 @@ namespace experimental WorkingMemHandle::WorkingMemHandle(NetworkId networkId, std::vector inputLayerInfo, - std::vector ouputLayerInfo, + std::vector outputLayerInfo, std::vector workingMemDescriptors, std::unordered_map workingMemDescriptorMap, - std::vector> memoryManagers, - std::unordered_map > > ownedTensorHandles) + std::unique_ptr memoryManager, + std::vector, MemorySource>> tensorMemory, + std::vector> managedTensorHandles, + std::vector> unmanagedTensorHandles) : m_NetworkId(networkId) , m_WorkingMemDescriptors(workingMemDescriptors) , m_WorkingMemDescriptorMap(workingMemDescriptorMap) - , m_MemoryManagers(memoryManagers) - , m_OwnedTensorHandles(std::move(ownedTensorHandles)) + , m_MemoryManager(std::move(memoryManager)) + , m_TensorMemory(std::move(tensorMemory)) + , m_ManagedTensorHandles(std::move(managedTensorHandles)) + , m_UnmanagedTensorHandles(std::move(unmanagedTensorHandles)) , m_InputSize(numeric_cast(inputLayerInfo.size())) , m_IsAllocated(false) { @@ -54,7 +58,7 @@ WorkingMemHandle::WorkingMemHandle(NetworkId networkId, } } size_t bindingIdCount = inputLayerInfo.size(); - for (const auto& outputInfo : ouputLayerInfo) + for (const auto& outputInfo : outputLayerInfo) { for (auto bindingId : outputInfo.m_LayerBindingIds) { @@ -88,6 +92,7 @@ WorkingMemHandle::WorkingMemHandle(NetworkId networkId, } } m_BindingIdVec = std::vector(bindingIdCount); + IgnoreUnused(m_UnmanagedTensorHandles); } void WorkingMemHandle::Allocate() @@ -98,9 +103,11 @@ void WorkingMemHandle::Allocate() } m_IsAllocated = true; - for (auto& mgr : m_MemoryManagers) + m_MemoryManager->Allocate(); + + for (unsigned int i = 0; i < m_TensorMemory.size(); ++i) { - mgr->Acquire(); + m_ManagedTensorHandles[i]->Import(m_TensorMemory[i].first->m_Data, m_TensorMemory[i].second); } } @@ -112,10 +119,7 @@ void WorkingMemHandle::Free() } m_IsAllocated = false; - for (auto& mgr : m_MemoryManagers) - { - mgr->Release(); - } + m_MemoryManager->Deallocate(); } void WorkingMemHandle::MemSyncOutputs() diff --git a/src/armnn/WorkingMemHandle.hpp b/src/armnn/WorkingMemHandle.hpp index 9078a8d54c..bca1d2d80c 100644 --- a/src/armnn/WorkingMemHandle.hpp +++ b/src/armnn/WorkingMemHandle.hpp @@ -14,6 +14,7 @@ #include #include +#include namespace armnn { @@ -45,11 +46,13 @@ public: WorkingMemHandle(NetworkId networkId, std::vector inputLayerInfo, - std::vector ouputLayerInfo, + std::vector outputLayerInfo, std::vector workingMemDescriptors, std::unordered_map workingMemDescriptorMap, - std::vector> memoryManagers, - std::unordered_map > > ownedTensorHandles); + std::unique_ptr memoryManager, + std::vector, MemorySource>> tensorMemory, + std::vector> managedTensorHandles, + std::vector> unmanagedTensorHandles); ~WorkingMemHandle() { Free(); } @@ -128,11 +131,17 @@ private: std::vector m_WorkingMemDescriptors; std::unordered_map m_WorkingMemDescriptorMap; - // Vector of IMemoryManagers that manage the 
WorkingMemHandle's memory - std::vector> m_MemoryManagers; - // TensorHandles owned by this WorkingMemHandle - // constant tensor's can be shared by multiple WorkingMemHandles and so will not be stored here - std::unordered_map > > m_OwnedTensorHandles; + std::unique_ptr m_MemoryManager; + + // Memory to be imported into the tensorHandles after allocation + std::vector, MemorySource>> m_TensorMemory; + + + // Tensors that will need to be allocated internally within armnn + std::vector> m_ManagedTensorHandles; + + // Tensors that will be allocated externally by the user + std::vector> m_UnmanagedTensorHandles; std::unordered_map m_InputValidationMap; std::unordered_map m_OutputValidationMap; diff --git a/src/armnn/test/OptimizerTests.cpp b/src/armnn/test/OptimizerTests.cpp index 8416a8dd0d..3cea1b540e 100644 --- a/src/armnn/test/OptimizerTests.cpp +++ b/src/armnn/test/OptimizerTests.cpp @@ -714,7 +714,8 @@ TEST_CASE("BackendHintTest") case armnn::LayerType::Input: { auto inputLayer = PolymorphicDowncast(layer); - CHECK((inputLayer->GetBackendId() == "MockBackend")); + const auto connectedLayerBackendId = inputLayer->GetOutputSlot(0).GetOwningLayer().GetBackendId(); + CHECK((inputLayer->GetBackendId() == connectedLayerBackendId)); break; } case armnn::LayerType::Output: diff --git a/src/backends/backendsCommon/DefaultAllocator.hpp b/src/backends/backendsCommon/DefaultAllocator.hpp index 2451db3ab8..cf0f1774f0 100644 --- a/src/backends/backendsCommon/DefaultAllocator.hpp +++ b/src/backends/backendsCommon/DefaultAllocator.hpp @@ -22,12 +22,12 @@ public: void* allocate(size_t size, size_t alignment = 0) override { IgnoreUnused(alignment); - return ::operator new(size); + return ::operator new(size_t(size)); } void free(void* ptr) override { - std::free(ptr); + ::operator delete(ptr); } armnn::MemorySource GetMemorySourceType() override diff --git a/src/backends/backendsCommon/MemoryManager.cpp b/src/backends/backendsCommon/MemoryManager.cpp index 1c109c3c91..77cab27789 100644 --- a/src/backends/backendsCommon/MemoryManager.cpp +++ b/src/backends/backendsCommon/MemoryManager.cpp @@ -11,7 +11,7 @@ namespace armnn { void MemoryManager::StoreMemToAllocate(std::vector bufferStorageVector, - ICustomAllocator* customAllocator, + std::shared_ptr customAllocator, const size_t typeAlignment) { IgnoreUnused(typeAlignment); diff --git a/src/backends/backendsCommon/MemoryManager.hpp b/src/backends/backendsCommon/MemoryManager.hpp index cbd6fcf9bc..5113b231d3 100644 --- a/src/backends/backendsCommon/MemoryManager.hpp +++ b/src/backends/backendsCommon/MemoryManager.hpp @@ -2,6 +2,7 @@ // Copyright © 2021 Arm Ltd and Contributors. All rights reserved. // SPDX-License-Identifier: MIT // +#pragma once #include @@ -10,7 +11,7 @@ namespace armnn struct Allocator { /// Pointer to @ICustomAllocator. - ICustomAllocator* m_CustomAllocator{}; + std::shared_ptr m_CustomAllocator{}; /// Value which the size of each buffer (actual data size + padding) has to be a multiple of. size_t m_Alignment = 0 ; }; @@ -19,16 +20,16 @@ struct TensorMemory { /// Number of bytes the value is away from the @BufferStorage.m_Buffer. size_t m_Offset{}; - /// Pointer to the tensor value. - void* m_Data = nullptr; /// Identifier to be used by the @LoadedNetwork to order the tensors. unsigned int m_OutputSlotId{}; + /// Pointer to the tensor value. + void* m_Data = nullptr; }; struct BufferStorage { /// Vector of pointer to @TensorMemory. - std::vector m_TensorMemoryVector; + std::vector> m_TensorMemoryVector; /// Total size of the buffer. 
size_t m_BufferSize; /// Pointer to the first element of the buffer. @@ -43,7 +44,7 @@ public: /// @param[in] customAllocator - Pointer to @ICustomAllocator. /// @param[in] typeAlignment - Optional parameter. Value of which the size of each value has to be multiple of. void StoreMemToAllocate(std::vector bufferStorageVector, - ICustomAllocator* customAllocator, + std::shared_ptr customAllocator, size_t typeAlignment = 0); /// Allocate the amount of memory indicated by @m_BufferSize, and diff --git a/src/backends/backendsCommon/common.mk b/src/backends/backendsCommon/common.mk index a77ec06035..56c9d6545a 100644 --- a/src/backends/backendsCommon/common.mk +++ b/src/backends/backendsCommon/common.mk @@ -17,6 +17,7 @@ COMMON_SOURCES := \ MapWorkload.cpp \ MemCopyWorkload.cpp \ MemImportWorkload.cpp \ + MemoryManager.cpp \ MemSyncWorkload.cpp \ OptimizationViews.cpp \ TensorHandleFactoryRegistry.cpp \ @@ -25,7 +26,8 @@ COMMON_SOURCES := \ WorkloadFactory.cpp \ WorkloadUtils.cpp \ memoryOptimizerStrategyLibrary/strategies/ConstantMemoryStrategy.cpp \ - memoryOptimizerStrategyLibrary/strategies/StrategyValidator.cpp \ + memoryOptimizerStrategyLibrary/strategies/SingleAxisPriorityList.cpp \ + memoryOptimizerStrategyLibrary/strategies/StrategyValidator.cpp # COMMON_TEST_SOURCES contains the list of files to be included @@ -104,7 +106,8 @@ COMMON_TEST_SOURCES := \ test/layerTests/TransposeConvolution2dTestImpl.cpp \ test/layerTests/UnidirectionalSequenceLstmTestImpl.cpp \ memoryOptimizerStrategyLibrary/test/ConstMemoryStrategyTests.cpp \ - memoryOptimizerStrategyLibrary/test/ValidatorStrategyTests.cpp + memoryOptimizerStrategyLibrary/test/ValidatorStrategyTests.cpp \ + memoryOptimizerStrategyLibrary/test/SingleAxisPriorityListTests.cpp ifeq ($(ARMNN_REF_ENABLED),1) COMMON_TEST_SOURCES += \ diff --git a/src/backends/backendsCommon/memoryOptimizerStrategyLibrary/strategies/SingleAxisPriorityList.cpp b/src/backends/backendsCommon/memoryOptimizerStrategyLibrary/strategies/SingleAxisPriorityList.cpp index 3afa061681..738b7137a7 100644 --- a/src/backends/backendsCommon/memoryOptimizerStrategyLibrary/strategies/SingleAxisPriorityList.cpp +++ b/src/backends/backendsCommon/memoryOptimizerStrategyLibrary/strategies/SingleAxisPriorityList.cpp @@ -155,9 +155,9 @@ void SingleAxisPriorityList::PlaceBlocks(const std::list& priorityLis // The indexes don't match we need at least two words // Zero the bits to the right of curBlock->m_EndOfLife - remainder = (curBlock->m_EndOfLife +1 - lastWordIndex * wordSize); + remainder = (curBlock->m_EndOfLife - lastWordIndex * wordSize); - size_t lastWord = (1u << remainder) - 1; + size_t lastWord = (1ul << remainder) - 1; lastWord = lastWord << (wordSize - remainder); if(firstWordIndex + 1 == lastWordIndex) diff --git a/src/backends/backendsCommon/test/CompatibilityTests.cpp b/src/backends/backendsCommon/test/CompatibilityTests.cpp index d18a8fbb6c..3685f75986 100644 --- a/src/backends/backendsCommon/test/CompatibilityTests.cpp +++ b/src/backends/backendsCommon/test/CompatibilityTests.cpp @@ -181,7 +181,7 @@ TEST_CASE ("Ref_Backends_Capability_Test") {"ProtectedContentAllocation", false}, {"ConstantTensorsAsInputs", true}, {"PreImportIOTensors", true}, - {"ExternallyManagedMemory", false}, + {"ExternallyManagedMemory", true}, {"MultiAxisPacking", false}}); } @@ -200,7 +200,7 @@ TEST_CASE ("Neon_Backends_Capability_Test") {"ProtectedContentAllocation", false}, {"ConstantTensorsAsInputs", false}, {"PreImportIOTensors", false}, - {"ExternallyManagedMemory", false}, + 
{"ExternallyManagedMemory", true}, {"MultiAxisPacking", false}}); } @@ -219,7 +219,7 @@ TEST_CASE ("Cl_Backends_Capability_Test") {"ProtectedContentAllocation", true}, {"ConstantTensorsAsInputs", false}, {"PreImportIOTensors", false}, - {"ExternallyManagedMemory", false}, + {"ExternallyManagedMemory", true}, {"MultiAxisPacking", false}}); } diff --git a/src/backends/backendsCommon/test/MemoryManagerTests.cpp b/src/backends/backendsCommon/test/MemoryManagerTests.cpp index c873499ef3..662a5c2423 100644 --- a/src/backends/backendsCommon/test/MemoryManagerTests.cpp +++ b/src/backends/backendsCommon/test/MemoryManagerTests.cpp @@ -59,17 +59,18 @@ TEST_CASE("MemoryManagerTest") // Create mock up bufferStorageVector with 2 BufferStorage with the same TensorMemory size_t numTensors = 5; - std::vector tensorMemoryPointerVector(numTensors); - std::vector tensorMemoryVector; + std::vector> tensorMemoryPointerVector(numTensors); + std::vector> tensorMemoryVector; tensorMemoryVector.reserve(numTensors); std::vector offsets(numTensors); std::iota(std::begin(offsets), std::end(offsets), 0); - for (uint32_t idx = 0; idx < tensorMemoryPointerVector.size(); ++idx) + for (uint idx = 0; idx < tensorMemoryPointerVector.size(); ++idx) { - tensorMemoryVector.emplace_back(TensorMemory{offsets[idx], nullptr, 0}); - tensorMemoryPointerVector[idx] = &tensorMemoryVector[idx]; + tensorMemoryVector.emplace_back(std::make_shared(TensorMemory{offsets[idx], 0, nullptr})); + + tensorMemoryPointerVector[idx] = tensorMemoryVector[idx]; } std::vector bufferStorageVector; @@ -77,30 +78,31 @@ TEST_CASE("MemoryManagerTest") bufferStorageVector.emplace_back(BufferStorage{tensorMemoryPointerVector, numTensors}); // Create an instance of the SampleCustomAllocator - SampleCustomAllocator customAllocator = SampleCustomAllocator(); - customAllocator.m_Values = {10, 11, 12, 13, 14}; + std::shared_ptr customAllocator = + std::make_unique(SampleCustomAllocator()); + + customAllocator->m_Values = {10, 11, 12, 13, 14}; // Check that the test was set up correctly - CHECK(customAllocator.m_Values.size() == numTensors); + CHECK(customAllocator->m_Values.size() == numTensors); + size_t bufferVecSize = bufferStorageVector.size(); // Utilise 3 functions in the MemoryManager. Check the counters and the pointer to the values are correct. 
MemoryManager memoryManager; - memoryManager.StoreMemToAllocate(bufferStorageVector, &customAllocator); + memoryManager.StoreMemToAllocate(bufferStorageVector, customAllocator); memoryManager.Allocate(); - CHECK(customAllocator.m_CounterAllocate == bufferStorageVector.size()); - for (const auto& bufferStorage : bufferStorageVector) + CHECK(customAllocator->m_CounterAllocate == bufferVecSize); + + uint idx = 0; + for (auto tensorMemory : tensorMemoryVector) { - uint32_t idx = 0; - for (auto tensorMemory : bufferStorage.m_TensorMemoryVector) - { - auto value = reinterpret_cast(tensorMemory->m_Data); - CHECK(customAllocator.m_Values[idx] == *value); - idx += 1; - } + auto value = reinterpret_cast(tensorMemory->m_Data); + CHECK(customAllocator->m_Values[idx] == *value); + idx += 1; } memoryManager.Deallocate(); - CHECK(customAllocator.m_CounterFree == bufferStorageVector.size()); + CHECK(customAllocator->m_CounterFree == bufferStorageVector.size()); } } diff --git a/src/backends/backendsCommon/test/OptimizedNetworkTests.cpp b/src/backends/backendsCommon/test/OptimizedNetworkTests.cpp index 012737e1d7..b0ee9bee32 100644 --- a/src/backends/backendsCommon/test/OptimizedNetworkTests.cpp +++ b/src/backends/backendsCommon/test/OptimizedNetworkTests.cpp @@ -138,7 +138,7 @@ TEST_CASE("OptimizeValidateDeviceNonSupportLayerWithFallback") // the other layers are supported by CpuRef. // If NEON is not enabled, all layers are supported by CpuRef. #if defined(ARMCOMPUTENEON_ENABLED) - if (layer->GetType() == armnn::LayerType::Input || layer->GetType() == armnn::LayerType::Output) + if (layer->GetType() == armnn::LayerType::Output) { CHECK(layer->GetBackendId() == armnn::Compute::CpuAcc); } @@ -337,7 +337,7 @@ TEST_CASE("OptimizeValidateWorkloadsDuplicateComputeDeviceWithFallback") // the other layers are supported by CpuRef. // If neither NEON, nor CL is enabled, all layers are supported by CpuRef. #if defined(ARMCOMPUTENEON_ENABLED) - if (layer->GetType() == armnn::LayerType::Input || layer->GetType() == armnn::LayerType::Output) + if (layer->GetType() == armnn::LayerType::Output) { CHECK(layer->GetBackendId() == armnn::Compute::CpuAcc); } diff --git a/src/backends/cl/ClBackend.hpp b/src/backends/cl/ClBackend.hpp index 7597d093be..99fe9069ff 100644 --- a/src/backends/cl/ClBackend.hpp +++ b/src/backends/cl/ClBackend.hpp @@ -29,7 +29,7 @@ const BackendCapabilities gpuAccCapabilities("GpuAcc", {"ProtectedContentAllocation", true}, {"ConstantTensorsAsInputs", false}, {"PreImportIOTensors", false}, - {"ExternallyManagedMemory", false}, + {"ExternallyManagedMemory", true}, {"MultiAxisPacking", false}, {"SingleAxisPacking", true} }); diff --git a/src/backends/neon/NeonBackend.hpp b/src/backends/neon/NeonBackend.hpp index 68d60a4c04..e53bacb84a 100644 --- a/src/backends/neon/NeonBackend.hpp +++ b/src/backends/neon/NeonBackend.hpp @@ -10,14 +10,14 @@ namespace armnn { // add new capabilities here.. 
-const BackendCapabilities cpuAccCapabilities("GpuAcc", +const BackendCapabilities cpuAccCapabilities("CpuAcc", { {"NonConstWeights", false}, {"AsyncExecution", false}, {"ProtectedContentAllocation", false}, {"ConstantTensorsAsInputs", false}, {"PreImportIOTensors", false}, - {"ExternallyManagedMemory", false}, + {"ExternallyManagedMemory", true}, {"MultiAxisPacking", false}, {"SingleAxisPacking", true} }); diff --git a/src/backends/neon/NeonTensorHandle.hpp b/src/backends/neon/NeonTensorHandle.hpp index ae8aa5d8c7..dd4c2572f9 100644 --- a/src/backends/neon/NeonTensorHandle.hpp +++ b/src/backends/neon/NeonTensorHandle.hpp @@ -29,7 +29,8 @@ public: NeonTensorHandle(const TensorInfo& tensorInfo) : m_ImportFlags(static_cast(MemorySource::Malloc)), m_Imported(false), - m_IsImportEnabled(false) + m_IsImportEnabled(false), + m_TypeAlignment(GetDataTypeSize(tensorInfo.GetDataType())) { armnn::armcomputetensorutils::BuildArmComputeTensor(m_Tensor, tensorInfo); } @@ -39,7 +40,9 @@ public: MemorySourceFlags importFlags = static_cast(MemorySource::Malloc)) : m_ImportFlags(importFlags), m_Imported(false), - m_IsImportEnabled(false) + m_IsImportEnabled(false), + m_TypeAlignment(GetDataTypeSize(tensorInfo.GetDataType())) + { armnn::armcomputetensorutils::BuildArmComputeTensor(m_Tensor, tensorInfo, dataLayout); @@ -117,9 +120,7 @@ public: { if (source == MemorySource::Malloc && m_IsImportEnabled) { - // Checks the 16 byte memory alignment - constexpr uintptr_t alignment = sizeof(size_t); - if (reinterpret_cast(memory) % alignment) + if (reinterpret_cast(memory) % m_TypeAlignment) { throw MemoryImportException("NeonTensorHandle::Import Attempting to import unaligned memory"); } @@ -263,6 +264,7 @@ private: MemorySourceFlags m_ImportFlags; bool m_Imported; bool m_IsImportEnabled; + const uintptr_t m_TypeAlignment; }; class NeonSubTensorHandle : public IAclTensorHandle diff --git a/src/backends/reference/RefBackend.hpp b/src/backends/reference/RefBackend.hpp index 6114ce6218..da04f22d93 100644 --- a/src/backends/reference/RefBackend.hpp +++ b/src/backends/reference/RefBackend.hpp @@ -16,7 +16,7 @@ const BackendCapabilities cpuRefCapabilities("CpuRef", {"ProtectedContentAllocation", false}, {"ConstantTensorsAsInputs", true}, {"PreImportIOTensors", true}, - {"ExternallyManagedMemory", false}, + {"ExternallyManagedMemory", true}, {"MultiAxisPacking", false}, {"SingleAxisPacking", true} }); diff --git a/src/backends/reference/RefTensorHandle.cpp b/src/backends/reference/RefTensorHandle.cpp index b9e566eace..5229e9d62b 100644 --- a/src/backends/reference/RefTensorHandle.cpp +++ b/src/backends/reference/RefTensorHandle.cpp @@ -122,7 +122,7 @@ bool RefTensorHandle::Import(void* memory, MemorySource source) if (m_IsImportEnabled && source == MemorySource::Malloc) { // Check memory alignment - constexpr uintptr_t alignment = sizeof(size_t); + uintptr_t alignment = GetDataTypeSize(m_TensorInfo.GetDataType()); if (reinterpret_cast(memory) % alignment) { if (m_Imported) diff --git a/src/backends/reference/RefWorkloadFactory.cpp b/src/backends/reference/RefWorkloadFactory.cpp index 75008bc866..36dcd21d32 100644 --- a/src/backends/reference/RefWorkloadFactory.cpp +++ b/src/backends/reference/RefWorkloadFactory.cpp @@ -113,10 +113,14 @@ bool RefWorkloadFactory::IsLayerSupported(const IConnectableLayer& layer, std::unique_ptr RefWorkloadFactory::CreateTensorHandle(const TensorInfo& tensorInfo, const bool isMemoryManaged) const { - // For Ref it is okay to make the TensorHandle memory managed as it can also store a pointer 
- // to unmanaged memory. This also ensures memory alignment. - IgnoreUnused(isMemoryManaged); - return std::make_unique(tensorInfo, m_MemoryManager); + if (isMemoryManaged) + { + return std::make_unique(tensorInfo, m_MemoryManager); + } + else + { + return std::make_unique(tensorInfo, static_cast(MemorySource::Malloc)); + } } std::unique_ptr RefWorkloadFactory::CreateTensorHandle(const TensorInfo& tensorInfo, @@ -126,7 +130,15 @@ std::unique_ptr RefWorkloadFactory::CreateTensorHandle(const Tens // For Ref it is okay to make the TensorHandle memory managed as it can also store a pointer // to unmanaged memory. This also ensures memory alignment. IgnoreUnused(isMemoryManaged, dataLayout); - return std::make_unique(tensorInfo, m_MemoryManager); + + if (isMemoryManaged) + { + return std::make_unique(tensorInfo, m_MemoryManager); + } + else + { + return std::make_unique(tensorInfo, static_cast(MemorySource::Malloc)); + } } std::unique_ptr RefWorkloadFactory::CreateActivation(const ActivationQueueDescriptor& descriptor, -- cgit v1.2.1
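As a usage note, the sketch below shows how a caller might opt into the new path added above by passing externalMemoryManagementEnabled = true through INetworkProperties. It is a minimal sketch, not part of this change: it assumes an IOptimizedNetwork has already been produced by armnn::Optimize, and the helper name LoadWithExternalMemory and the variable names are illustrative placeholders. Backends that report the ExternallyManagedMemory capability (switched on in this change for CpuRef, CpuAcc and GpuAcc) will then use the external memory manager; any other backend falls back to its internal memory manager.

#include <armnn/Exceptions.hpp>
#include <armnn/IRuntime.hpp>
#include <string>
#include <utility>

// Loads an already-optimized network with ArmNN's external memory management enabled,
// using the INetworkProperties constructor extended in IRuntime.hpp above:
//   (asyncEnabled, inputSource, outputSource, profilingEnabled, detailsMethod,
//    externalMemoryManagementEnabled)
armnn::NetworkId LoadWithExternalMemory(armnn::IRuntime& runtime,
                                        armnn::IOptimizedNetworkPtr optNet)
{
    armnn::INetworkProperties properties(false,                           // asyncEnabled
                                         armnn::MemorySource::Undefined,  // no input import
                                         armnn::MemorySource::Undefined,  // no output export
                                         false,                           // profilingEnabled
                                         armnn::ProfilingDetailsMethod::Undefined,
                                         true);                           // externalMemoryManagementEnabled

    armnn::NetworkId netId{};
    std::string errorMessage;
    if (runtime.LoadNetwork(netId, std::move(optNet), errorMessage, properties) != armnn::Status::Success)
    {
        // Surface the runtime's error string if loading fails.
        throw armnn::Exception(errorMessage);
    }
    return netId;
}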