//
// Copyright © 2017 Arm Ltd and Contributors. All rights reserved.
// SPDX-License-Identifier: MIT
//

#include "LoadedNetwork.hpp"
#include "Layer.hpp"
#include "Graph.hpp"
#include "Profiling.hpp"
#include "HeapProfiling.hpp"
#include "WorkingMemHandle.hpp"

#include <armnn/BackendHelper.hpp>
#include <armnn/BackendRegistry.hpp>
#include <armnn/Logging.hpp>
#include <armnn/utility/Assert.hpp>
#include <armnn/utility/IgnoreUnused.hpp>
#include <armnn/utility/NumericCast.hpp>

#include <armnn/backends/IBackendInternal.hpp>
#include <armnn/backends/IMemoryManager.hpp>
#include <armnn/backends/TensorHandle.hpp>
#include <armnn/backends/MemCopyWorkload.hpp>
#include <backendsCommon/MemSyncWorkload.hpp>

#include <fmt/format.h>

namespace armnn
{

using namespace std;
using namespace arm::pipe;

namespace
{

template <typename ExceptionType>
std::string ToErrorMessage(const char* prefix, const ExceptionType& error)
{
    std::stringstream ss;
    ss << prefix << " " << error.what();
    return ss.str();
}

void AddLayerStructure(std::unique_ptr<TimelineUtilityMethods>& timelineUtils,
                       const Layer& layer,
                       ProfilingGuid networkGuid)
{
    // Add layer to the post-optimisation network structure
    std::string layerName = layer.GetNameStr().empty() ? "<Unnamed>" : layer.GetNameStr();
    timelineUtils->CreateNamedTypedChildEntity(layer.GetGuid(),
                                               networkGuid,
                                               layerName,
                                               LabelsAndEventClasses::LAYER_GUID);
    for (auto&& input : layer.GetInputSlots())
    {
        const IOutputSlot* source = input.GetConnectedOutputSlot();
        ARMNN_ASSERT(source != nullptr);
        timelineUtils->CreateConnectionRelationship(ProfilingRelationshipType::RetentionLink,
                                                    source->GetOwningLayerGuid(),
                                                    layer.GetGuid());
    }
}

void AddWorkloadStructure(std::unique_ptr<TimelineUtilityMethods>& timelineUtils,
                          std::unique_ptr<IWorkload>& workload,
                          const Layer& layer)
{
    // Add workload to the post-optimisation network structure
    timelineUtils->CreateTypedEntity(workload->GetGuid(), LabelsAndEventClasses::WORKLOAD_GUID);
    timelineUtils->MarkEntityWithLabel(workload->GetGuid(),
                                       layer.GetBackendId().Get(),
                                       LabelsAndEventClasses::BACKENDID_GUID);

    // Link the workload to the layer
    timelineUtils->CreateRelationship(ProfilingRelationshipType::RetentionLink,
                                      layer.GetGuid(),
                                      workload->GetGuid(),
                                      LabelsAndEventClasses::CHILD_GUID);
}

} // anonymous namespace

std::unique_ptr<LoadedNetwork> LoadedNetwork::MakeLoadedNetwork(std::unique_ptr<IOptimizedNetwork> net,
                                                                std::string& errorMessage,
                                                                const INetworkProperties& networkProperties,
                                                                arm::pipe::IProfilingService* profilingService)
{
    std::unique_ptr<LoadedNetwork> loadedNetwork;

    auto Fail = [&](const std::exception& error) -> std::unique_ptr<LoadedNetwork>
    {
        errorMessage = ToErrorMessage("An error occurred when preparing the network workloads: ", error);
        ARMNN_LOG(error) << errorMessage;

        return std::unique_ptr<LoadedNetwork>();
    };

    try
    {
        loadedNetwork.reset(new LoadedNetwork(std::move(net), networkProperties, profilingService));
    }
    catch (const armnn::RuntimeException& error)
    {
        return Fail(error);
    }
    catch (const armnn::Exception& error)
    {
        return Fail(error);
    }
    catch (const std::runtime_error& error)
    {
        return Fail(error);
    }

    return loadedNetwork;
}
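// Caller-side sketch (illustrative only; names below are examples, not part of this file).
// The usual entry point is IRuntime::LoadNetwork, which forwards to MakeLoadedNetwork above:
//
//   std::string errorMessage;
//   INetworkProperties networkProperties(false, MemorySource::Undefined, MemorySource::Undefined);
//   std::unique_ptr<LoadedNetwork> loaded = LoadedNetwork::MakeLoadedNetwork(
//       std::move(optimizedNet), errorMessage, networkProperties, &profilingService);
//   if (!loaded)
//   {
//       // errorMessage now holds the reason workload preparation failed.
//   }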
LoadedNetwork::LoadedNetwork(std::unique_ptr<IOptimizedNetwork> net,
                             const INetworkProperties& networkProperties,
                             arm::pipe::IProfilingService* profilingService) :
                             m_OptimizedNetwork(std::move(net)),
                             m_NetworkProperties(networkProperties),
                             m_TensorHandleFactoryRegistry(),
                             m_ProfilingService(profilingService)
{
    ARMNN_SCOPED_PROFILING_EVENT(Compute::Undefined, "LoadedNetwork");
    // Get the profiler and register it for the current thread.
    const std::shared_ptr<IProfiler>& profiler = m_OptimizedNetwork->GetProfiler();
    ProfilerManager::GetInstance().RegisterProfiler(profiler.get());

    profiler->EnableProfiling(networkProperties.m_ProfilingEnabled);

    profiler->EnableNetworkDetailsToStdOut(networkProperties.m_OutputNetworkDetailsMethod);

    // First create tensor handlers, backends and workload factories.
    // Handlers are created before workloads because workload creation can modify
    // some of the handlers (for example the splitter and concat layers).
    bool useExternalMemoryManager = false;
    bool useInternalMemoryManager = false;
    Graph& order = m_OptimizedNetwork->pOptimizedNetworkImpl->GetGraph();

    // Ensure topological order
    order.SetLayersOutOfOrder();
    order.TopologicalSort();

    if (!networkProperties.m_AsyncEnabled)
    {
        m_IsInputImported = std::vector<bool>(order.GetNumInputs(), false);
        m_IsOutputImported = std::vector<bool>(order.GetNumOutputs(), false);
    }

    for (auto&& layer : order)
    {
        auto const& backendId = layer->GetBackendId();
        if (m_Backends.count(backendId) == 0)
        {
            auto createBackend = BackendRegistryInstance().GetFactory(backendId);
            auto it = m_Backends.emplace(std::make_pair(backendId, createBackend()));

            IBackendInternal* backend = it.first->second.get();

            if (networkProperties.m_AsyncEnabled &&
                !HasCapability(BackendOptions::BackendOption{"AsyncExecution", true}, backend->GetCapabilities()))
            {
                std::string er = backend->GetId();
                er += " does not support AsyncExecution";
                throw BackendCapabilityException(er);
            }

            if (networkProperties.m_AsyncEnabled &&
                !HasCapability(BackendOptions::BackendOption{"ExternallyManagedMemory", true},
                               backend->GetCapabilities()))
            {
                std::string er = backend->GetId();
                er += " does not support ExternallyManagedMemory\n";
                er += "AsyncEnabled networks require all backends to support ExternallyManagedMemory";
                throw BackendCapabilityException(er);
            }

            if (HasCapability(BackendOptions::BackendOption{"ExternallyManagedMemory", true},
                              backend->GetCapabilities()) &&
                (m_NetworkProperties.m_ExternalMemoryManagementEnabled || m_NetworkProperties.m_AsyncEnabled))
            {
                m_SupportsExternallyManagedMemory[backend->GetId()] = true;
                useExternalMemoryManager = true;
            }
            else
            {
                m_SupportsExternallyManagedMemory[backend->GetId()] = false;
                useInternalMemoryManager = true;
            }

            IBackendInternal::IWorkloadFactoryPtr workloadFactory;
            if (backend->SupportsTensorAllocatorAPI())
            {
                workloadFactory = backend->CreateWorkloadFactory(
                    m_TensorHandleFactoryRegistry,
                    m_OptimizedNetwork->pOptimizedNetworkImpl->GetModelOptions(),
                    static_cast<MemorySourceFlags>(m_NetworkProperties.m_InputSource),
                    static_cast<MemorySourceFlags>(m_NetworkProperties.m_OutputSource));
            }
            else
            {
                m_BackendMemoryMangers.emplace_back(backend->CreateMemoryManager());
                workloadFactory = backend->CreateWorkloadFactory(
                    m_BackendMemoryMangers.back(), m_OptimizedNetwork->pOptimizedNetworkImpl->GetModelOptions());
            }
            m_WorkloadFactories[backendId] = std::move(workloadFactory);
        }
    }

    if (!networkProperties.m_AsyncEnabled)
    {
        for (auto&& layer : order)
        {
            auto& workloadFactory = GetWorkloadFactory(*layer);
            bool supportsExternalManager = m_SupportsExternallyManagedMemory[layer->GetBackendId()];

            switch (layer->GetType())
            {
                case LayerType::Input:
                case LayerType::MemImport:
                {
                    // If IsImportEnabled is true then we need to set IsMemoryManaged
                    // to false when creating TensorHandles
                    layer->CreateTensorHandles(m_TensorHandleFactoryRegistry,
                                               workloadFactory,
                                               !supportsExternalManager && !m_NetworkProperties.m_ImportEnabled);
                    break;
                }
                case LayerType::Constant:
                {
                    layer->CreateTensorHandles(m_TensorHandleFactoryRegistry, workloadFactory, true);
                    break;
                }
                default:
                {
                    // Look for a layer with 1 OutputSlot which has 1 connection and that connection is an Output Layer.
                    // If Export is enabled disable memory management so we can export, otherwise we do a copy.
                    if ((layer->GetNumOutputSlots() == 1) &&
                        (layer->GetOutputSlots()[0].GetNumConnections() == 1) &&
                        (layer->GetOutputSlots()[0].GetConnection(0)->GetOwningLayer().GetType() == LayerType::Output))
                    {
                        layer->CreateTensorHandles(m_TensorHandleFactoryRegistry,
                                                   workloadFactory,
                                                   !supportsExternalManager &&
                                                   !m_NetworkProperties.m_ExportEnabled);
                    }
                    else
                    {
                        layer->CreateTensorHandles(m_TensorHandleFactoryRegistry,
                                                   workloadFactory,
                                                   !supportsExternalManager);
                    }
                }
            }
        }
    }

    ProfilingGuid networkGuid = m_OptimizedNetwork->GetGuid();
    std::unique_ptr<TimelineUtilityMethods> timelineUtils =
        TimelineUtilityMethods::GetTimelineUtils(*m_ProfilingService);
    if (timelineUtils)
    {
        timelineUtils->CreateTypedEntity(networkGuid, LabelsAndEventClasses::NETWORK_GUID);
        // Mark the network with a start of life event
        timelineUtils->RecordEvent(networkGuid, LabelsAndEventClasses::ARMNN_PROFILING_SOL_EVENT_CLASS);
        // and with the process ID
        int processID = arm::pipe::GetCurrentProcessId();
        std::stringstream ss;
        ss << processID;
        timelineUtils->MarkEntityWithLabel(networkGuid, ss.str(), LabelsAndEventClasses::PROCESS_ID_GUID);
    }

    std::vector<IWorkload*> ConstWorkloads;

    // Then create workloads.
    {
        ARMNN_SCOPED_PROFILING_EVENT(Compute::Undefined, "LoadNetwork_CreateWorkloads");
        for (auto&& layer : order)
        {
            if (timelineUtils)
            {
                // Add layer to the post-optimisation network structure
                AddLayerStructure(timelineUtils, *layer, networkGuid);
            }

            const IWorkloadFactory& workloadFactory = GetWorkloadFactory(*layer);

            switch (layer->GetType())
            {
                case LayerType::Input:
                case LayerType::Output:
                {
                    // Inputs and outputs are treated in a special way - see EnqueueInput() and EnqueueOutput().
                    break;
                }
                default:
                {
                    auto workload = layer->CreateWorkload(workloadFactory);

                    if (!workload)
                    {
                        const char* const layerName =
                            layer->GetNameStr().length() != 0 ? layer->GetName() : "<Unnamed>";
                        throw InvalidArgumentException(
                            fmt::format("No workload created for layer (name: '{0}' type: '{1}') (compute '{2}')",
                                        layerName,
                                        static_cast<int>(layer->GetType()),
                                        layer->GetBackendId().Get()));
                    }

                    if (timelineUtils)
                    {
                        // Add workload to the post-optimisation network structure
                        AddWorkloadStructure(timelineUtils, workload, *layer);
                    }

                    // For async networks ConstantWorkloads are managed exclusively by LoadedNetwork
                    // and are separated out from the other workloads
                    if ((networkProperties.m_AsyncEnabled || useExternalMemoryManager) &&
                        layer->GetType() == LayerType::Constant)
                    {
                        m_ConstantTensorHandles[layer->GetGuid()] =
                            layer->GetOutputSlot(0).GetOutputHandler().GetData();
                        m_ConstantWorkloads[layer->GetGuid()] = std::move(workload);
                    }
                    else
                    {
                        m_WorkloadQueue.push_back(std::move(workload));

                        if (layer->GetType() == LayerType::Constant)
                        {
                            // Place the Constant Workloads into a queue so that they can be executed first
                            ConstWorkloads.push_back(m_WorkloadQueue.back().get());
                        }
                    }
                    // Release the constant data in the layer.
                    layer->ReleaseConstantData();
                    break;
                }
            }
        }
    }

    // Gather information about workloads for inputs & outputs
    if (!networkProperties.m_AsyncEnabled && m_WorkloadQueue.size() != 0)
    {
        const int noOfInputs = armnn::numeric_cast<int>(order.GetNumInputs());

        // Get indices of all workloads connected to each input and
        // check if they support tensor handle replacement
        for (const BindableLayer* layer : order.GetInputLayers())
        {
            const auto bindingId = layer->GetBindingId();

            bool supportsReplacement = true;

            for (const auto inputSlot : layer->GetOutputSlot(0).GetConnections())
            {
                auto workloadIndex = std::distance(order.begin(), order.GetPosInGraph(inputSlot->GetOwningLayer()));
                workloadIndex -= noOfInputs;

                m_InputWorkloadSlotPairs[bindingId].emplace_back(
                    WorkloadIndices{armnn::numeric_cast<unsigned int>(workloadIndex), inputSlot->GetSlotIndex()});

                auto workload = m_WorkloadQueue[m_InputWorkloadSlotPairs[bindingId].back().m_WorkloadIndex].get();
                supportsReplacement &= workload->SupportsTensorHandleReplacement();
            }

            ITensorHandleFactory::FactoryId factoryId = layer->GetOutputSlot(0).GetTensorHandleFactoryId();
            // Get matching import factory Id
            ITensorHandleFactory::FactoryId importFactoryId =
                m_TensorHandleFactoryRegistry.GetMatchingImportFactoryId(factoryId);

            ITensorHandleFactory* importFactory = m_TensorHandleFactoryRegistry.GetFactory(importFactoryId);

            if (supportsReplacement && importFactory)
            {
                m_PreImportedInputHandles.emplace_back(
                    bindingId, importFactory->CreateTensorHandle(layer->GetOutputSlot(0).GetTensorInfo(), false));
            }
            else
            {
                m_PreImportedInputHandles.emplace_back(bindingId, nullptr);
            }
        }

        // Get indices of all workloads connected to each output and
        // check if they support tensor handle replacement
        for (const BindableLayer* layer : order.GetOutputLayers())
        {
            const auto bindingId = layer->GetBindingId();

            const auto outputSlot = layer->GetInputSlot(0).GetConnectedOutputSlot();
            auto& indices = m_OutputWorkloadSlotPairs[bindingId];

            auto workloadIndex = std::distance(order.begin(), order.GetPosInGraph(outputSlot->GetOwningLayer()));
            workloadIndex -= noOfInputs;

            indices.m_OutputSlotIndices = WorkloadIndices{numeric_cast<unsigned int>(workloadIndex),
                                                          outputSlot->CalculateIndexOnOwner()};

            bool supportsReplacement = true;
            auto outputWorkload = m_WorkloadQueue[indices.m_OutputSlotIndices.m_WorkloadIndex].get();
            supportsReplacement &= outputWorkload->SupportsTensorHandleReplacement();

            for (auto& inputSlot : outputSlot->GetConnections())
            {
                if (inputSlot->GetOwningLayer().GetType() != LayerType::Output)
                {
                    auto inWorkloadIndex = std::distance(order.begin(),
                                                         order.GetPosInGraph(inputSlot->GetOwningLayer()));
                    inWorkloadIndex -= noOfInputs;
                    indices.m_InputSlotIndices.emplace_back(
                        WorkloadIndices{numeric_cast<unsigned int>(inWorkloadIndex), inputSlot->GetSlotIndex()});
                    auto inputWorkload = m_WorkloadQueue[indices.m_InputSlotIndices.back().m_WorkloadIndex].get();
                    supportsReplacement &= inputWorkload->SupportsTensorHandleReplacement();
                }
            }

            ITensorHandleFactory::FactoryId factoryId = outputSlot->GetTensorHandleFactoryId();
            // Get matching import factory Id
            ITensorHandleFactory::FactoryId importFactoryId =
                m_TensorHandleFactoryRegistry.GetMatchingImportFactoryId(factoryId);
            ITensorHandleFactory* importFactory = m_TensorHandleFactoryRegistry.GetFactory(importFactoryId);

            if (supportsReplacement && importFactory)
            {
                m_PreImportedOutputHandles.emplace_back(
                    bindingId, importFactory->CreateTensorHandle(outputSlot->GetTensorInfo(), false));
            }
            else
            {
                m_PreImportedOutputHandles.emplace_back(bindingId, nullptr);
            }
        }
    }

    for (auto&& workloadFactory : m_WorkloadFactories)
    {
        workloadFactory.second->AfterWorkloadsCreated();
    }

    if (timelineUtils)
    {
        // Commit to send the post-optimisation network structure
        timelineUtils->Commit();
    }

    if (useExternalMemoryManager)
    {
        if (networkProperties.m_AsyncEnabled)
        {
            CreateMemoryProfileAsync();
        }
        else
        {
            CreateMemoryProfile();
        }

        auto backendStrategyMap = BackendRegistryInstance().GetMemoryOptimizerStrategies();
        for (auto& backendMemoryProfile : m_MemBlockMap)
        {
            const BackendId& backendId = backendMemoryProfile.first;
            if (backendStrategyMap.find(backendId) != backendStrategyMap.end())
            {
                m_MemBinMap[backendId] = backendStrategyMap[backendId]->Optimize(backendMemoryProfile.second);
            }
            else
            {
                m_MemBinMap[backendId] = m_ConstantStrategy->Optimize(backendMemoryProfile.second);
            }
        }

        if (!networkProperties.m_AsyncEnabled)
        {
            m_ExternalMemoryManager = CreateExternalMemoryManger(m_TensorMemory);

            // Sort m_TensorMemory, so its order matches m_Tensorhandles
            std::sort(m_TensorMemory.begin(), m_TensorMemory.end(),
                      [](const std::pair<std::shared_ptr<TensorMemory>, MemorySource>& lhs,
                         const std::pair<std::shared_ptr<TensorMemory>, MemorySource>& rhs)
                      {
                          return lhs.first->m_OutputSlotId < rhs.first->m_OutputSlotId;
                      });
        }
    }

    // Now that the intermediate tensor memory has been set up,
    // do any post allocation configuration for each workload.
    if (!networkProperties.m_AsyncEnabled)
    {
        if (useInternalMemoryManager)
        {
            // Set up memory.
            m_OptimizedNetwork->pOptimizedNetworkImpl->GetGraph().AllocateDynamicBuffers();
        }

        for (auto& workload : m_WorkloadQueue)
        {
            workload->PostAllocationConfigure();
        }
    }

    if (useExternalMemoryManager)
    {
        if (!networkProperties.m_AsyncEnabled)
        {
            AllocateAndExecuteConstantWorkloads();
        }
        else
        {
            AllocateAndExecuteConstantWorkloadsAsync();
        }
    }

    // If synchronous, execute all constant layer workloads
    if (!networkProperties.m_AsyncEnabled)
    {
        for (auto workload : ConstWorkloads)
        {
            workload->Execute();
        }
    }
}

void LoadedNetwork::AllocateAndExecuteConstantWorkloads()
{
    ARMNN_SCOPED_PROFILING_EVENT(Compute::Undefined, "LoadNetwork_AllocateAndExecuteConstants");
    for (auto& pair : m_ConstantWorkloads)
    {
        auto tensorHandle = m_ConstantTensorHandles[pair.first];
        tensorHandle->Allocate();
        pair.second->Execute();
    }
}

void LoadedNetwork::AllocateAndExecuteConstantWorkloadsAsync()
{
    ARMNN_SCOPED_PROFILING_EVENT(Compute::Undefined, "LoadNetwork_AllocateAndExecuteConstants");
    Graph& order = m_OptimizedNetwork->pOptimizedNetworkImpl->GetGraph();
    for (auto&& layer : order)
    {
        if (layer->GetType() == LayerType::Constant)
        {
            const auto& outSlot = layer->GetOutputSlots()[0];
            const auto factoryId = outSlot.GetTensorHandleFactoryId();
            ARMNN_ASSERT(factoryId != ITensorHandleFactory::LegacyFactoryId);
            auto& workloadFactory = GetWorkloadFactory(*layer);

            layer->CreateTensorHandles(m_TensorHandleFactoryRegistry, workloadFactory);
            ITensorHandle* tensorHandle = outSlot.GetOutputHandler().GetData();

            m_ConstantTensorHandles[layer->GetGuid()] = tensorHandle;
            tensorHandle->Allocate();

            WorkingMemDescriptor memDesc;
            memDesc.m_Outputs.push_back(tensorHandle);
            m_ConstantWorkloads[layer->GetGuid()]->ExecuteAsync(memDesc);
        }
    }
}
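// Note (assumption about the wider runtime wiring, not visible in this file):
// SendNetworkStructure() below is typically invoked when an external profiling connection
// becomes available after the network was loaded, so the post-optimisation graph topology
// can be replayed to the newly connected profiling client.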
void LoadedNetwork::SendNetworkStructure(arm::pipe::IProfilingService& profilingService)
{
    ARMNN_SCOPED_PROFILING_EVENT(Compute::Undefined, "LoadNetwork_SendNetworkStructure");
    Graph& order = m_OptimizedNetwork->pOptimizedNetworkImpl->GetGraph().TopologicalSort();
    ProfilingGuid networkGuid = m_OptimizedNetwork->GetGuid();

    std::unique_ptr<TimelineUtilityMethods> timelineUtils =
        TimelineUtilityMethods::GetTimelineUtils(profilingService);

    timelineUtils->CreateTypedEntity(networkGuid, LabelsAndEventClasses::NETWORK_GUID);

    for (auto&& layer : order)
    {
        // Add layer to the post-optimisation network structure
        AddLayerStructure(timelineUtils, *layer, networkGuid);

        switch (layer->GetType())
        {
            case LayerType::Input:
            case LayerType::Output:
            {
                // Inputs and outputs are treated in a special way - see EnqueueInput() and EnqueueOutput().
                break;
            }
            default:
            {
                for (auto& workload : m_WorkloadQueue)
                {
                    // Add workload to the post-optimisation network structure
                    AddWorkloadStructure(timelineUtils, workload, *layer);
                }
                break;
            }
        }
    }

    // Commit to send the post-optimisation network structure
    timelineUtils->Commit();
}

ProfilingGuid LoadedNetwork::GetNetworkGuid()
{
    return m_OptimizedNetwork->GetGuid();
}

TensorInfo LoadedNetwork::GetInputTensorInfo(LayerBindingId layerId) const
{
    for (auto&& inputLayer : m_OptimizedNetwork->pOptimizedNetworkImpl->GetGraph().GetInputLayers())
    {
        ARMNN_ASSERT_MSG(inputLayer->GetNumOutputSlots() == 1, "Input layer should have exactly 1 output slot");
        if (inputLayer->GetBindingId() == layerId)
        {
            return inputLayer->GetOutputSlot(0).GetTensorInfo();
        }
    }

    throw InvalidArgumentException(fmt::format("No input layer is associated with id {}", layerId));
}

TensorInfo LoadedNetwork::GetOutputTensorInfo(LayerBindingId layerId) const
{
    for (auto&& outputLayer : m_OptimizedNetwork->pOptimizedNetworkImpl->GetGraph().GetOutputLayers())
    {
        ARMNN_ASSERT_MSG(outputLayer->GetNumInputSlots() == 1, "Output layer should have exactly 1 input slot");
        ARMNN_ASSERT_MSG(outputLayer->GetInputSlot(0).GetConnection(), "Input slot on Output layer must be connected");
        if (outputLayer->GetBindingId() == layerId)
        {
            return outputLayer->GetInputSlot(0).GetConnection()->GetTensorInfo();
        }
    }

    throw InvalidArgumentException(fmt::format("No output layer is associated with id {}", layerId));
}

const IWorkloadFactory& LoadedNetwork::GetWorkloadFactory(const Layer& layer) const
{
    const IWorkloadFactory* workloadFactory = nullptr;

    auto it = m_WorkloadFactories.find(layer.GetBackendId());
    if (it == m_WorkloadFactories.end())
    {
        throw RuntimeException(fmt::format("No workload factory for {0} to be used for layer: {1}",
                                           layer.GetBackendId().Get(),
                                           layer.GetNameStr()),
                               CHECK_LOCATION());
    }

    workloadFactory = it->second.get();

    ARMNN_ASSERT_MSG(workloadFactory, "No workload factory");

    std::string reasonIfUnsupported;
    ARMNN_ASSERT_MSG(IWorkloadFactory::IsLayerSupported(layer,
                                                        {},
                                                        reasonIfUnsupported,
                                                        m_OptimizedNetwork->pOptimizedNetworkImpl->GetModelOptions()),
                     "Factory does not support layer");
    IgnoreUnused(reasonIfUnsupported);
    return *workloadFactory;
}

namespace
{

// Non-copyable class owning accelerator-specific tensor data.
class TensorPin
{
public:
    TensorPin(std::unique_ptr<ITensorHandle> handle, const TensorInfo& info, LayerBindingId id)
        : m_TensorHandle(std::move(handle))
        , m_TensorInfo(info)
        , m_Id(id)
    {
    }

    ITensorHandle* GetTensorHandle() const { return m_TensorHandle.get(); }
    const TensorInfo& GetTensorInfo() const { return m_TensorInfo; }
    LayerBindingId GetBindingId() const { return m_Id; }

private:
    std::unique_ptr<ITensorHandle> m_TensorHandle;
    TensorInfo m_TensorInfo;
    LayerBindingId m_Id;
};

static const TensorPin& GetTensorPin(LayerBindingId id,
                                     const std::vector<TensorPin>& pins,
                                     char const* bindingPointDesc)
{
    auto it = std::find_if(pins.begin(), pins.end(),
                           [id](const TensorPin& pin)
                           {
                               return pin.GetBindingId() == id;
                           });

    if (it != pins.end())
    {
        return *it;
    }
    else
    {
        throw InvalidArgumentException(fmt::format("No tensor supplied for {0} {1}", bindingPointDesc, id));
    }
}

// Stores data that needs to be kept accessible for the entire execution of a workload.
class WorkloadData
{
public:
    WorkloadData(const InputTensors& inputTensors, const OutputTensors& outputTensors)
    {
        m_InputTensorPins.reserve(inputTensors.size());
        m_OutputTensorPins.reserve(outputTensors.size());

        for (auto inputTensorPair : inputTensors)
        {
            auto inputTensor = inputTensorPair.second;

            std::unique_ptr<ITensorHandle> tensorHandle =
                std::make_unique<ConstPassthroughTensorHandle>(inputTensor.GetInfo(), inputTensor.GetMemoryArea());
            LayerBindingId layerId = inputTensorPair.first;

            m_InputTensorPins.emplace_back(std::move(tensorHandle), inputTensor.GetInfo(), layerId);
        }

        for (auto outputTensorPair : outputTensors)
        {
            auto outputTensor = outputTensorPair.second;

            std::unique_ptr<ITensorHandle> tensorHandle =
                std::make_unique<PassthroughTensorHandle>(outputTensor.GetInfo(), outputTensor.GetMemoryArea());
            LayerBindingId layerId = outputTensorPair.first;

            m_OutputTensorPins.emplace_back(std::move(tensorHandle), outputTensor.GetInfo(), layerId);
        }
    }

    const TensorPin& GetInputTensorPin(LayerBindingId id) const
    {
        return GetTensorPin(id, m_InputTensorPins, "input");
    }

    const TensorPin& GetOutputTensorPin(LayerBindingId id) const
    {
        return GetTensorPin(id, m_OutputTensorPins, "output");
    }

private:
    std::vector<TensorPin> m_InputTensorPins;
    std::vector<TensorPin> m_OutputTensorPins;
};

} // anonymous namespace

Status LoadedNetwork::EnqueueWorkload(const InputTensors& inputTensors,
                                      const OutputTensors& outputTensors,
                                      std::vector<ImportedInputId> preImportedInputIds,
                                      std::vector<ImportedOutputId> preImportedOutputIds)
{
    const Graph& graph = m_OptimizedNetwork->pOptimizedNetworkImpl->GetGraph();

    // Walk graph to determine the order of execution.
    if (graph.GetNumLayers() < 2)
    {
        ARMNN_LOG(warning) << "IRuntime::EnqueueWorkload()::Less than two nodes in graph";
        return Status::Failure;
    }

    // Data that must be kept alive for the entire execution of the workload.
    WorkloadData workloadData(inputTensors, outputTensors);

    if (graph.GetNumInputs() != inputTensors.size())
    {
        throw InvalidArgumentException("Number of inputs provided does not match network.");
    }
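    // Caller-side sketch (illustrative; binding ids and tensor infos would come from the
    // parsed network, e.g. via GetNetworkInputBindingInfo/GetNetworkOutputBindingInfo):
    //
    //   InputTensors inputs{ {inputBindingId, ConstTensor(inputInfo, inputData.data())} };
    //   OutputTensors outputs{ {outputBindingId, Tensor(outputInfo, outputData.data())} };
    //   loadedNetwork->EnqueueWorkload(inputs, outputs, {}, {});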
    // For each input to the network, call EnqueueInput with the data passed by the user.
    {
        ARMNN_SCOPED_PROFILING_EVENT(Compute::Undefined, "PrepareInputs");
        m_InputQueue.clear();
        m_InputQueue.reserve(graph.GetNumInputs());

        if (preImportedInputIds.size() > graph.GetNumInputs())
        {
            throw InvalidArgumentException("Invalid number of preImportedInputIds");
        }

        unsigned int inputIndex = 0;
        unsigned int importedInputIdIndex = 0;
        std::sort(preImportedInputIds.begin(), preImportedInputIds.end());
        for (const BindableLayer* inputLayer : graph.GetInputLayers())
        {
            if (importedInputIdIndex < preImportedInputIds.size() &&
                inputIndex == preImportedInputIds[importedInputIdIndex])
            {
                // Only replace tensorhandles if they have not already been replaced
                if (!m_IsInputImported[inputIndex])
                {
                    auto outputTensorHandle = m_PreImportedInputHandles[inputIndex].m_TensorHandle.get();

                    for (const auto& workloadInfo : m_InputWorkloadSlotPairs[inputLayer->GetBindingId()])
                    {
                        auto workload = m_WorkloadQueue[workloadInfo.m_WorkloadIndex].get();
                        workload->ReplaceInputTensorHandle(outputTensorHandle, workloadInfo.m_SlotIndex);
                    }
                    m_IsInputImported[inputIndex] = true;
                }
                importedInputIdIndex++;
            }
            else
            {
                if (m_IsInputImported[inputIndex])
                {
                    OutputHandler& handler = const_cast<OutputHandler&>(inputLayer->GetOutputHandler(0));

                    for (const auto& workloadInfo : m_InputWorkloadSlotPairs[inputLayer->GetBindingId()])
                    {
                        auto workload = m_WorkloadQueue[workloadInfo.m_WorkloadIndex].get();
                        workload->ReplaceInputTensorHandle(handler.GetData(), workloadInfo.m_SlotIndex);
                    }

                    m_IsInputImported[inputIndex] = false;
                }

                // InputTensorHandle is not imported yet, process to enqueue input
                const TensorPin& pin = workloadData.GetInputTensorPin(inputLayer->GetBindingId());
                EnqueueInput(*inputLayer, pin.GetTensorHandle(), pin.GetTensorInfo());
            }
            inputIndex++;
        }
    }
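    // Note on the swap logic above: workloads capture raw ITensorHandle pointers at creation
    // time, so switching between imported and internally managed buffers requires explicitly
    // calling Replace{Input,Output}TensorHandle on every affected workload. m_IsInputImported
    // tracks the current state so handles are only swapped when the import status changes.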
    // For each output to the network, call EnqueueOutput with the data passed by the user.
    {
        ARMNN_SCOPED_PROFILING_EVENT(Compute::Undefined, "PrepareOutputs");
        m_OutputQueue.clear();
        m_OutputQueue.reserve(graph.GetNumOutputs());

        if (preImportedOutputIds.size() > graph.GetNumOutputs())
        {
            throw InvalidArgumentException("Invalid number of preImportedOutputIds");
        }

        unsigned int outputIndex = 0;
        unsigned int importedOutputIdIndex = 0;
        std::sort(preImportedOutputIds.begin(), preImportedOutputIds.end());
        for (const BindableLayer* outputLayer : graph.GetOutputLayers())
        {
            if (importedOutputIdIndex < preImportedOutputIds.size() &&
                outputIndex == preImportedOutputIds[importedOutputIdIndex])
            {
                // Only replace tensorhandles if they have not already been replaced
                ITensorHandle* inputTensorHandle = m_PreImportedOutputHandles[outputIndex].m_TensorHandle.get();

                if (!m_IsOutputImported[outputIndex])
                {
                    const auto bindingId = outputLayer->GetBindingId();
                    const auto& indices = m_OutputWorkloadSlotPairs[bindingId];

                    auto outputWorkload = m_WorkloadQueue[indices.m_OutputSlotIndices.m_WorkloadIndex].get();

                    outputWorkload->ReplaceOutputTensorHandle(inputTensorHandle,
                                                              indices.m_OutputSlotIndices.m_SlotIndex);

                    for (const auto& workloadInfo : indices.m_InputSlotIndices)
                    {
                        auto inputWorkload = m_WorkloadQueue[workloadInfo.m_WorkloadIndex].get();
                        inputWorkload->ReplaceInputTensorHandle(inputTensorHandle, workloadInfo.m_SlotIndex);
                    }
                    m_IsOutputImported[outputIndex] = true;
                }

                ARMNN_ASSERT_MSG(inputTensorHandle != nullptr, "Data should have been allocated.");
                MemSyncQueueDescriptor syncDesc;
                syncDesc.m_Inputs.push_back(inputTensorHandle);
                WorkloadInfo info;
                info.m_InputTensorInfos.push_back(
                    outputLayer->GetInputSlot(0).GetConnectedOutputSlot()->GetTensorInfo());

                auto syncWorkload = std::make_unique<SyncMemGenericWorkload>(syncDesc, info);
                ARMNN_ASSERT_MSG(syncWorkload, "No sync workload created");
                m_OutputQueue.push_back(std::move(syncWorkload));
                importedOutputIdIndex++;
            }
            else
            {
                if (m_IsOutputImported[outputIndex])
                {
                    const auto bindingId = outputLayer->GetBindingId();
                    const auto& indices = m_OutputWorkloadSlotPairs[bindingId];

                    auto outputWorkload = m_WorkloadQueue[indices.m_OutputSlotIndices.m_WorkloadIndex].get();
                    const OutputHandler& outputHandler =
                        outputLayer->GetInputSlot(0).GetConnectedOutputSlot()->GetOutputHandler();

                    outputWorkload->ReplaceOutputTensorHandle(outputHandler.GetData(),
                                                              indices.m_OutputSlotIndices.m_SlotIndex);

                    for (const auto& workloadInfo : indices.m_InputSlotIndices)
                    {
                        auto inputWorkload = m_WorkloadQueue[workloadInfo.m_WorkloadIndex].get();
                        inputWorkload->ReplaceInputTensorHandle(outputHandler.GetData(), workloadInfo.m_SlotIndex);
                    }
                    m_IsOutputImported[outputIndex] = false;
                }

                const TensorPin& pin = workloadData.GetOutputTensorPin(outputLayer->GetBindingId());
                // OutputTensorHandle is not imported yet, process to enqueue Output
                EnqueueOutput(*outputLayer, pin.GetTensorHandle(), pin.GetTensorInfo());
            }
            outputIndex++;
        }
    }

    std::unique_ptr<TimelineUtilityMethods> timelineUtils =
        TimelineUtilityMethods::GetTimelineUtils(*m_ProfilingService);
    ProfilingGuid inferenceGuid = m_ProfilingService->GetNextGuid();
    if (timelineUtils)
    {
        // Add inference timeline trace if profiling is enabled.
        ProfilingGuid networkGuid = m_OptimizedNetwork->GetGuid();
        timelineUtils->CreateTypedEntity(inferenceGuid, LabelsAndEventClasses::INFERENCE_GUID);
        timelineUtils->CreateRelationship(ProfilingRelationshipType::RetentionLink,
                                          networkGuid,
                                          inferenceGuid,
                                          LabelsAndEventClasses::EXECUTION_OF_GUID);
        timelineUtils->RecordEvent(inferenceGuid, LabelsAndEventClasses::ARMNN_PROFILING_SOL_EVENT_CLASS);
    }

    bool executionSucceeded = true;

    {
        if (m_ProfilingService->IsProfilingEnabled())
        {
            m_ProfilingService->IncrementCounterValue(INFERENCES_RUN);
        }
        ARMNN_SCOPED_PROFILING_EVENT(Compute::Undefined, "Execute");
        ARMNN_SCOPED_HEAP_PROFILING("Executing");
        executionSucceeded = Execute(timelineUtils, inferenceGuid);
    }

    if (timelineUtils)
    {
        // Add end of life of the inference timeline if profiling is enabled.
        timelineUtils->RecordEvent(inferenceGuid, LabelsAndEventClasses::ARMNN_PROFILING_EOL_EVENT_CLASS);
        timelineUtils->Commit();
    }

    return executionSucceeded ? Status::Success : Status::Failure;
}

void LoadedNetwork::EnqueueInput(const BindableLayer& layer, ITensorHandle* tensorHandle, const TensorInfo& tensorInfo)
{
    if (layer.GetType() != LayerType::Input)
    {
        throw InvalidArgumentException("EnqueueInput: given layer not an InputLayer");
    }

    if (tensorHandle == nullptr)
    {
        throw InvalidArgumentException("EnqueueInput: tensorHandle must not be NULL");
    }

    InputQueueDescriptor inputQueueDescriptor;
    WorkloadInfo info;

    inputQueueDescriptor.m_Inputs.push_back(tensorHandle);
    info.m_InputTensorInfos.push_back(tensorInfo);

    ARMNN_ASSERT_MSG(layer.GetNumOutputSlots() == 1, "Can only handle Input Layer with one output");
    const OutputHandler& handler = layer.GetOutputHandler();
    const TensorInfo& outputTensorInfo = handler.GetTensorInfo();
    ITensorHandle* outputTensorHandle = handler.GetData();
    ARMNN_ASSERT_MSG(outputTensorHandle != nullptr, "Data should have been allocated.");
    inputQueueDescriptor.m_Outputs.push_back(outputTensorHandle);
    info.m_OutputTensorInfos.push_back(outputTensorInfo);

    MemorySourceFlags importFlags = outputTensorHandle->GetImportFlags();
    bool needMemCopy = true;
    if (m_NetworkProperties.m_ImportEnabled)  // Try import the input tensor
    {
        if (CheckFlag(importFlags, m_NetworkProperties.m_InputSource))
        {
            needMemCopy = false;
            // This assumes a CPU Tensor handle
            void* mem = tensorHandle->Map(false);
            if (outputTensorHandle->Import(mem, m_NetworkProperties.m_InputSource))
            {
                tensorHandle->Unmap();
                return; // No need for a workload since the import has been done.
            }
            tensorHandle->Unmap();
            throw MemoryImportException("EnqueueInput: Memory Import failed");
        }
    }
    if (needMemCopy)
    {
        // Create a mem copy workload for input since we did not import
        std::unique_ptr<IWorkload> inputWorkload =
            std::make_unique<CopyMemGenericWorkload>(inputQueueDescriptor, info);

        ARMNN_ASSERT_MSG(inputWorkload, "No input workload created");

        std::unique_ptr<TimelineUtilityMethods> timelineUtils =
            TimelineUtilityMethods::GetTimelineUtils(*m_ProfilingService);
        if (timelineUtils)
        {
            // Add Input Workload to the post-optimisation network structure
            AddWorkloadStructure(timelineUtils, inputWorkload, layer);
            timelineUtils->Commit();
        }

        m_InputQueue.push_back(std::move(inputWorkload));
    }
}

void LoadedNetwork::EnqueueOutput(const BindableLayer& layer, ITensorHandle* tensorHandle, const TensorInfo& tensorInfo)
{
    if (layer.GetType() != LayerType::Output)
    {
        throw InvalidArgumentException("EnqueueOutput: given layer not an OutputLayer");
    }

    if (tensorHandle == nullptr)
    {
        throw InvalidArgumentException("EnqueueOutput: tensorHandle must not be NULL");
    }

    OutputQueueDescriptor outputQueueDescriptor;
    WorkloadInfo info;

    outputQueueDescriptor.m_Outputs.push_back(tensorHandle);
    info.m_OutputTensorInfos.push_back(tensorInfo);

    ARMNN_ASSERT_MSG(layer.GetNumInputSlots() == 1, "Output Layer should have exactly one input.");

    // Gets the output handler from the previous node.
    const OutputHandler& outputHandler = layer.GetInputSlots()[0].GetConnectedOutputSlot()->GetOutputHandler();

    const TensorInfo& inputTensorInfo = outputHandler.GetTensorInfo();
    ITensorHandle* inputTensorHandle = outputHandler.GetData();
    ARMNN_ASSERT_MSG(inputTensorHandle != nullptr, "Data should have been allocated.");

    // Try import the output tensor.
    // Note: We can only import the output pointer if all of the following hold true:
    // a) The imported pointer is aligned sufficiently
    // b) The tensor has zero padding
    // c) There is only one connection to the OutputSlot and it is to an OutputLayer.
    // d) The output pointer is allocated via malloc. (Other types will be supported in a later release)
    // e) m_IsExportEnabled must be set to true
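    // Illustrative note (exact alignment rules are backend-specific; this is an assumption):
    // with MemorySource::Malloc the caller's output buffer should be malloc-backed and
    // suitably aligned for the tensor's data type, e.g. the data() of a std::vector<float>.
    // An arbitrary offset into a larger allocation may make Import() below return false and
    // raise MemoryExportException rather than silently falling back to a copy.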
    bool needMemCopy = true;
    if (m_NetworkProperties.m_ExportEnabled &&
        (layer.GetInputSlots()[0].GetConnectedOutputSlot()->GetNumConnections() == 1))
    {
        if (layer.GetInputSlots()[0].GetConnectedOutputSlot()->GetOwningLayer().GetType() != LayerType::Input)
        {
            MemorySourceFlags importFlags = inputTensorHandle->GetImportFlags();
            if (CheckFlag(importFlags, m_NetworkProperties.m_OutputSource))
            {
                needMemCopy = false;
                void* mem = tensorHandle->Map(false);
                bool importOk = inputTensorHandle->Import(mem, m_NetworkProperties.m_OutputSource);
                tensorHandle->Unmap();

                if (importOk)
                {
                    // Insert synchronization workload
                    MemSyncQueueDescriptor syncDesc;
                    syncDesc.m_Inputs.push_back(inputTensorHandle);
                    info.m_InputTensorInfos.push_back(inputTensorInfo);
                    auto syncWorkload = std::make_unique<SyncMemGenericWorkload>(syncDesc, info);
                    ARMNN_ASSERT_MSG(syncWorkload, "No sync workload created");
                    m_OutputQueue.push_back(std::move(syncWorkload));
                }
                else
                {
                    throw MemoryExportException("EnqueueOutput: Memory Export failed");
                }
            }
        }
    }
    if (needMemCopy)
    {
        // If we got here then we didn't export the memory, so add an output workload which performs a memcopy.
        outputQueueDescriptor.m_Inputs.push_back(inputTensorHandle);
        info.m_InputTensorInfos.push_back(inputTensorInfo);

        std::unique_ptr<IWorkload> outputWorkload =
            std::make_unique<CopyMemGenericWorkload>(outputQueueDescriptor, info);
        ARMNN_ASSERT_MSG(outputWorkload, "No output workload created");

        std::unique_ptr<TimelineUtilityMethods> timelineUtils =
            TimelineUtilityMethods::GetTimelineUtils(*m_ProfilingService);
        if (timelineUtils)
        {
            // Add Output Workload to the post-optimisation network structure
            AddWorkloadStructure(timelineUtils, outputWorkload, layer);
            timelineUtils->Commit();
        }

        m_OutputQueue.push_back(std::move(outputWorkload));
    }
}

void LoadedNetwork::AllocateWorkingMemory(
#if !defined(ARMNN_DISABLE_THREADS)
    std::lock_guard<std::mutex>& lock
#endif
    )
{
    ARMNN_SCOPED_PROFILING_EVENT(Compute::Undefined, "Working Memory Allocation");

#if !defined(ARMNN_DISABLE_THREADS)
    // This unused parameter makes sure we can only call this function with a valid lock
    IgnoreUnused(lock);
#endif
    if (m_IsWorkingMemAllocated)
    {
        return;
    }

    if (m_ExternalMemoryManager)
    {
        m_ExternalMemoryManager->Allocate();

        for (unsigned int i = 0; i < m_TensorMemory.size(); ++i)
        {
            m_Tensorhandles[i]->Import(m_TensorMemory[i].first->m_Data, m_TensorMemory[i].second);
        }
    }

    for (auto&& memoryManager : m_BackendMemoryMangers)
    {
        if (memoryManager)
        {
            memoryManager->Acquire();
        }
    }
    m_TensorHandleFactoryRegistry.AquireMemory();
    m_IsWorkingMemAllocated = true;
}

void LoadedNetwork::FreeWorkingMemory()
{
#if !defined(ARMNN_DISABLE_THREADS)
    std::lock_guard<std::mutex> lockGuard(m_WorkingMemMutex);
#endif

    if (!m_IsWorkingMemAllocated)
    {
        return;
    }

    if (m_ExternalMemoryManager)
    {
        m_ExternalMemoryManager->Deallocate();
    }

    // Informs the memory managers to release memory in their respective memory groups
    for (auto&& memoryManager : m_BackendMemoryMangers)
    {
        if (memoryManager)
        {
            memoryManager->Release();
        }
    }
    m_TensorHandleFactoryRegistry.ReleaseMemory();
    m_IsWorkingMemAllocated = false;
}

bool LoadedNetwork::Execute(std::unique_ptr<TimelineUtilityMethods>& timelineUtils,
                            ProfilingGuid inferenceGuid)
{
    bool success = true;

    auto Fail = [&](const std::exception& error)
    {
        ARMNN_LOG(error) << "An error occurred attempting to execute a workload: " << error.what();
        success = false;
    };

    try
    {
#if !defined(ARMNN_DISABLE_THREADS)
        std::lock_guard<std::mutex> lockGuard(m_WorkingMemMutex);
        AllocateWorkingMemory(lockGuard);
#else
        AllocateWorkingMemory();
#endif

        ProfilingDynamicGuid workloadInferenceID(0);
        auto ExecuteQueue = [&timelineUtils, &workloadInferenceID, &inferenceGuid](WorkloadQueue& queue)
        {
            for (auto& workload : queue)
            {
                if (timelineUtils)
                {
                    workloadInferenceID =
                        timelineUtils->RecordWorkloadInferenceAndStartOfLifeEvent(workload->GetGuid(), inferenceGuid);
                }
                workload->Execute();
                if (timelineUtils)
                {
                    timelineUtils->RecordEndOfLifeEvent(workloadInferenceID);
                }
            }
        };

        ExecuteQueue(m_InputQueue);
        ExecuteQueue(m_WorkloadQueue);
        ExecuteQueue(m_OutputQueue);
    }
    catch (const RuntimeException& error)
    {
        Fail(error);
    }
    catch (const std::runtime_error& error)
    {
        Fail(error);
    }

    return success;
}

void LoadedNetwork::EnqueueInput(const ConstTensor& inputTensor, ITensorHandle* inputTensorHandle)
{
    if (m_NetworkProperties.m_ImportEnabled)  // Try import the input tensor
    {
        MemorySourceFlags importFlags = inputTensorHandle->GetImportFlags();
        if (CheckFlag(importFlags, m_NetworkProperties.m_InputSource))
        {
            std::unique_ptr<ITensorHandle> tensorHandle =
                std::make_unique<ConstPassthroughTensorHandle>(inputTensor.GetInfo(),
                                                               inputTensor.GetMemoryArea());
            void* mem = tensorHandle->Map(false);

            if (inputTensorHandle->Import(mem, m_NetworkProperties.m_InputSource))
            {
                tensorHandle->Unmap();
                return;
            }
            tensorHandle->Unmap();
            throw MemoryImportException("EnqueueInput: Memory Import failed");
MemoryImportException("EnqueueInput: Memory Import failed"); } else { throw MemoryImportException("EnqueueInput: Memory Import failed, backend does not support Import"); } } else { std::unique_ptr tensorHandle = std::make_unique(inputTensor.GetInfo(), inputTensor.GetMemoryArea()); auto copyFunc = [](void* dst, const void* src, size_t size) { memcpy(dst, src, size); }; CopyTensorContentsGeneric(tensorHandle.get(), inputTensorHandle, copyFunc); } } // Note: We can only import the output pointer if all of the following hold true: // a) The imported pointer is aligned sufficiently // b) The tensor has zero padding // c) There is only one connection to the OutputSlot and it is to an OutputLayer. // d) The output pointer is allocated via malloc. (Other types will be supported in a later release) // e) m_IsExportEnabled must be set to true void LoadedNetwork::ImportOutputTensor(const Tensor& outputTensor, ITensorHandle* outputTensorHandle) { ARMNN_ASSERT_MSG(outputTensorHandle != nullptr, "Data should have been allocated."); MemorySourceFlags importFlags = outputTensorHandle->GetImportFlags(); if (CheckFlag(importFlags, m_NetworkProperties.m_OutputSource)) { std::unique_ptr tensorHandle = std::make_unique(outputTensor.GetInfo(), outputTensor.GetMemoryArea()); void* mem = tensorHandle->Map(false); bool importOk = outputTensorHandle->Import(mem, m_NetworkProperties.m_OutputSource); tensorHandle->Unmap(); if (!importOk) { throw MemoryExportException("ImportOutputTensor: Memory Export failed"); } } else { throw MemoryExportException("ImportOutputTensor: Memory Export failed, attempting to export Input Layer"); } } void CopyToOutputTensor(const Tensor& outputTensor, ITensorHandle* outputTensorHandle) { auto copyFunc = [](void* dst, const void* src, size_t size) { memcpy(dst, src, size); }; std::unique_ptr tensorHandle = std::make_unique(outputTensor.GetInfo(), outputTensor.GetMemoryArea()); CopyTensorContentsGeneric(outputTensorHandle, tensorHandle.get(), copyFunc); } const armnn::ConstTensor GetInputTensor(const LayerBindingId layerId, const InputTensors& inputTensors) { for (auto inputTensorPair : inputTensors) { LayerBindingId id = inputTensorPair.first; if (id == layerId) { return inputTensorPair.second; } } throw InvalidArgumentException("Input does not exist."); } const armnn::Tensor GetOutputTensor(const LayerBindingId layerId, const OutputTensors& outputTensors) { for (auto outputTensorPair : outputTensors) { LayerBindingId id = outputTensorPair.first; if (id == layerId) { return outputTensorPair.second; } } throw InvalidArgumentException("Output does not exist."); } std::vector LoadedNetwork::ImportInputs(const InputTensors& inputTensors, MemorySource forceImportMemorySource) { if (!m_NetworkProperties.m_AsyncEnabled) { // Cannot import if import is not enabled and forceImportMemorySource is undefined if (forceImportMemorySource == MemorySource::Undefined) { throw MemoryImportException("ImportInputs: Memory Import failed, NetworkProperties.m_ImportEnabled"); } if (inputTensors.size() != m_OptimizedNetwork->pOptimizedNetworkImpl->GetGraph().GetNumInputs()) { throw MemoryImportException("ImportInputs: Force Import failed, incorrect number of tensors"); } std::vector importedInputs; Graph& graph = m_OptimizedNetwork->pOptimizedNetworkImpl->GetGraph().TopologicalSort(); unsigned int inputIndex = 0; for (const BindableLayer* inputLayer : graph.GetInputLayers()) { auto outputTensorHandle = m_PreImportedInputHandles[inputIndex].m_TensorHandle.get(); if (!outputTensorHandle) { inputIndex++; continue; 
std::vector<ImportedInputId> LoadedNetwork::ImportInputs(const InputTensors& inputTensors,
                                                         MemorySource forceImportMemorySource)
{
    if (!m_NetworkProperties.m_AsyncEnabled)
    {
        // Cannot import if import is not enabled and forceImportMemorySource is undefined
        if (forceImportMemorySource == MemorySource::Undefined)
        {
            throw MemoryImportException("ImportInputs: Memory Import failed, NetworkProperties.m_ImportEnabled");
        }
        if (inputTensors.size() != m_OptimizedNetwork->pOptimizedNetworkImpl->GetGraph().GetNumInputs())
        {
            throw MemoryImportException("ImportInputs: Force Import failed, incorrect number of tensors");
        }

        std::vector<ImportedInputId> importedInputs;
        Graph& graph = m_OptimizedNetwork->pOptimizedNetworkImpl->GetGraph().TopologicalSort();
        unsigned int inputIndex = 0;
        for (const BindableLayer* inputLayer : graph.GetInputLayers())
        {
            auto outputTensorHandle = m_PreImportedInputHandles[inputIndex].m_TensorHandle.get();

            if (!outputTensorHandle)
            {
                inputIndex++;
                continue;
            }

            auto layerBindingId = inputLayer->GetBindingId();
            auto it = std::find_if(inputTensors.begin(), inputTensors.end(), [=](const auto& inputTensor)
            {
                return inputTensor.first == layerBindingId;
            });

            if (it == inputTensors.end())
            {
                inputIndex++;
                continue;
            }

            const auto& inputTensor = *it;
            std::unique_ptr<ITensorHandle> passThroughTensorHandle =
                std::make_unique<ConstPassthroughTensorHandle>(inputTensor.second.GetInfo(),
                                                               inputTensor.second.GetMemoryArea());

            if (outputTensorHandle->CanBeImported(passThroughTensorHandle->Map(), forceImportMemorySource) &&
                outputTensorHandle->Import(passThroughTensorHandle->Map(), forceImportMemorySource))
            {
                importedInputs.push_back(inputIndex);
            }
            passThroughTensorHandle->Unmap();

            inputIndex++;
        }

        return importedInputs;
    }
    else
    {
        // Import when the import of network properties is enabled
        std::vector<ImportedInputId> importedInputs;
        Graph& graph = m_OptimizedNetwork->pOptimizedNetworkImpl->GetGraph().TopologicalSort();

        for (auto inputTensor : inputTensors)
        {
            auto layerBindingId = inputTensor.first;
            auto it = std::find_if(graph.GetInputLayers().begin(), graph.GetInputLayers().end(), [=](auto* layer)
            {
                return layer->GetBindingId() == layerBindingId;
            });

            if (it == graph.GetInputLayers().end())
            {
                throw MemoryImportException(fmt::format(
                    "ImportInputs: Memory Import failed, unknown LayerBindingId: {}", layerBindingId));
            }

            const Layer* layer = *it;
            if (layer->GetType() != LayerType::Input)
            {
                throw InvalidArgumentException("ImportInputs: given layer not an InputLayer");
            }

            auto& backend = m_Backends.at(layer->GetBackendId());
            if (!HasCapability(BackendOptions::BackendOption{"PreImportIOTensors", true}, backend->GetCapabilities()))
            {
                std::string er = backend->GetId();
                er += " does not have PreImportIOTensors capability";
                throw BackendCapabilityException(er);
            }

            const OutputSlot& outputSlot = layer->GetOutputSlots()[0];

            ITensorHandleFactory::FactoryId factoryId = outputSlot.GetTensorHandleFactoryId();
            const TensorInfo& tensorInfo = outputSlot.GetTensorInfo();

            ITensorHandleFactory* handleFactory = m_TensorHandleFactoryRegistry.GetFactory(factoryId);
            ARMNN_ASSERT(handleFactory);

            ImportedTensorHandlePin importedTensorHandlePin{layerBindingId,
                                                            handleFactory->CreateTensorHandle(tensorInfo, false)};

            ITensorHandle* tensorHandle = importedTensorHandlePin.m_TensorHandle.get();

            if (!CheckFlag(tensorHandle->GetImportFlags(), m_NetworkProperties.m_InputSource))
            {
                throw MemoryImportException(
                    fmt::format("ImportInputs: Memory Import failed, backend: "
                                "{} does not support importing from source {}",
                                factoryId, m_NetworkProperties.m_InputSource));
            }

            std::unique_ptr<ITensorHandle> passThroughTensorHandle =
                std::make_unique<ConstPassthroughTensorHandle>(inputTensor.second.GetInfo(),
                                                               inputTensor.second.GetMemoryArea());

            if (tensorHandle->Import(passThroughTensorHandle->Map(), m_NetworkProperties.m_InputSource))
            {
                importedInputs.push_back(m_CurImportedInputId++);
                passThroughTensorHandle->Unmap();
            }
            else
            {
                passThroughTensorHandle->Unmap();
                throw MemoryImportException("ImportInputs: Memory Import failed");
            }

            m_PreImportedInputHandles.push_back(std::move(importedTensorHandlePin));
        }

        return importedInputs;
    }
}
std::vector<ImportedOutputId> LoadedNetwork::ImportOutputs(const OutputTensors& outputTensors,
                                                           MemorySource forceImportMemorySource)
{
    if (!m_NetworkProperties.m_AsyncEnabled)
    {
        // Cannot import if import is not enabled and forceImportMemorySource is undefined
        if (forceImportMemorySource == MemorySource::Undefined)
        {
            throw MemoryImportException("ImportOutputs: Memory Import failed, NetworkProperties.m_ImportEnabled");
        }
        // If forceImportMemorySource is defined, try import if memory is aligned
        if (outputTensors.size() != m_OptimizedNetwork->pOptimizedNetworkImpl->GetGraph().GetNumOutputs())
        {
            throw MemoryImportException("ImportOutputs: Force Import failed, incorrect number of tensors");
        }
        std::vector<ImportedOutputId> importedOutputs;
        Graph& graph = m_OptimizedNetwork->pOptimizedNetworkImpl->GetGraph().TopologicalSort();

        unsigned int outputIndex = 0;
        for (const BindableLayer* const outputLayer : graph.GetOutputLayers())
        {
            auto inputTensorHandle = m_PreImportedOutputHandles[outputIndex].m_TensorHandle.get();
            if (!inputTensorHandle)
            {
                outputIndex++;
                continue;
            }

            auto layerBindingId = outputLayer->GetBindingId();
            auto it = std::find_if(outputTensors.begin(), outputTensors.end(), [=](const auto& outputTensor)
            {
                return outputTensor.first == layerBindingId;
            });

            if (it == outputTensors.end())
            {
                outputIndex++;
                continue;
            }

            const auto outputTensor = *it;
            // Check if the output memory can be imported
            if (inputTensorHandle->CanBeImported(outputTensor.second.GetMemoryArea(), forceImportMemorySource) &&
                inputTensorHandle->Import(outputTensor.second.GetMemoryArea(), forceImportMemorySource))
            {
                importedOutputs.push_back(outputIndex);
            }
            outputIndex++;
        }
        return importedOutputs;
    }

    std::vector<ImportedOutputId> importedOutputs;
    Graph& graph = m_OptimizedNetwork->pOptimizedNetworkImpl->GetGraph().TopologicalSort();

    for (const auto& outputTensor : outputTensors)
    {
        auto layerBindingId = outputTensor.first;
        auto it = std::find_if(graph.GetOutputLayers().begin(), graph.GetOutputLayers().end(), [=](auto* layer)
        {
            return layer->GetBindingId() == layerBindingId;
        });

        if (it == graph.GetOutputLayers().end())
        {
            throw MemoryImportException(fmt::format("ImportOutputs: Memory Import failed, unknown LayerBindingId: {}",
                                                    layerBindingId));
        }

        const Layer* layer = *it;
        if (layer->GetType() != LayerType::Output)
        {
            throw InvalidArgumentException("ImportOutputs: given layer not an OutputLayer");
        }

        auto& backend = m_Backends.at(layer->GetBackendId());
        if (!HasCapability(BackendOptions::BackendOption{"PreImportIOTensors", true}, backend->GetCapabilities()))
        {
            std::string er = backend->GetId();
            er += " does not have PreImportIOTensors capability";
            throw BackendCapabilityException(er);
        }

        const InputSlot& inputSlot = layer->GetInputSlots()[0];
        ITensorHandleFactory::FactoryId factoryId = inputSlot.GetConnectedOutputSlot()->GetTensorHandleFactoryId();
        const TensorInfo& tensorInfo = inputSlot.GetConnectedOutputSlot()->GetTensorInfo();

        ITensorHandleFactory* handleFactory = m_TensorHandleFactoryRegistry.GetFactory(factoryId);
        ARMNN_ASSERT(handleFactory);

        ImportedTensorHandlePin importedTensorHandlePin{layerBindingId,
                                                        handleFactory->CreateTensorHandle(tensorInfo, false)};

        ITensorHandle* tensorHandle = importedTensorHandlePin.m_TensorHandle.get();

        if (!CheckFlag(tensorHandle->GetImportFlags(), m_NetworkProperties.m_OutputSource))
        {
            throw MemoryImportException(fmt::format("ImportOutputs: Memory Import failed, backend: "
                                                    "{} does not support importing from source {}",
                                                    factoryId, m_NetworkProperties.m_OutputSource));
        }

        if (tensorHandle->Import(outputTensor.second.GetMemoryArea(), m_NetworkProperties.m_OutputSource))
        {
            importedOutputs.push_back(m_CurImportedOutputId++);
        }
        else
        {
            throw MemoryImportException("ImportOutputs: Memory Import failed");
        }

        m_PreImportedOutputHandles.push_back(std::move(importedTensorHandlePin));
    }

    return importedOutputs;
}

void LoadedNetwork::ClearImportedInputs(const std::vector<ImportedInputId> inputIds)
{
    for (auto id : inputIds)
    {
        if (id >= m_PreImportedInputHandles.size())
        {
{}", id)); } auto& importedTensorHandle = m_PreImportedInputHandles[id].m_TensorHandle; if (!importedTensorHandle) { throw InvalidArgumentException( fmt::format("ClearImportedInputs::ImportedInput with id: {} has already been deleted", id)); } // Call Unimport then destroy the tensorHandle importedTensorHandle->Unimport(); importedTensorHandle = {}; } } void LoadedNetwork::ClearImportedOutputs(const std::vector outputIds) { for (auto id : outputIds) { if (id > m_PreImportedOutputHandles.size()) { throw InvalidArgumentException(fmt::format("ClearImportedOutputs::Unknown ImportedOutputId: {}", id)); } auto& importedTensorHandle = m_PreImportedOutputHandles[id].m_TensorHandle; if (!importedTensorHandle) { throw InvalidArgumentException( fmt::format("ClearImportedOutputs::ImportedOutput with id: {} has already been deleted", id)); } // Call Unimport then destroy the tensorHandle importedTensorHandle->Unimport(); importedTensorHandle = {}; } } Status LoadedNetwork::Execute(const InputTensors& inputTensors, const OutputTensors& outputTensors, IWorkingMemHandle& iWorkingMemHandle, std::vector preImportedInputs, std::vector preImportedOutputs) { const Graph& graph = m_OptimizedNetwork->pOptimizedNetworkImpl->GetGraph(); if (inputTensors.size() + preImportedInputs.size() != graph.GetNumInputs()) { if (preImportedInputs.empty()) { throw InvalidArgumentException("LoadedNetwork::Execute: Number of inputs provided does not match network."); } else { throw InvalidArgumentException("LoadedNetwork::Execute: " "Number of inputs + preImportedInputs provided does not match network."); } } if (outputTensors.size() + preImportedOutputs.size() != graph.GetNumOutputs()) { if (preImportedOutputs.empty()) { throw InvalidArgumentException("LoadedNetwork::Execute: " "Number of outputs provided does not match network."); } else { throw InvalidArgumentException("LoadedNetwork::Execute: " "Number of outputs + preImportedOutputs provided does not match network."); } } WorkingMemHandle& workingMemHandle = dynamic_cast(iWorkingMemHandle); // Collect all the given LayerBindingIds and check them for duplicates and unknowns. std::vector& bindingIds = workingMemHandle.GetBindingIdVector(); unsigned int index = 0; for (auto pair : inputTensors) { bindingIds[index++] = pair.first; } for (ImportedInputId id : preImportedInputs) { bindingIds[index++] = ValidateImportedInputID(id); } for (auto pair : outputTensors) { bindingIds[index++] = pair.first; } for (ImportedOutputId id : preImportedOutputs) { bindingIds[index++] = ValidateImportedOutputID(id); } workingMemHandle.ValidateBindingIds(); auto resetMemHandle = [&]() { for (ImportedInputId id: preImportedInputs) { const LayerBindingId layerBindingId = m_PreImportedInputHandles[id].m_LayerBindingId; auto inputHandle = workingMemHandle.GetInputHandle(layerBindingId); auto inputConnections = workingMemHandle.GetInputConnections(layerBindingId); for (auto it : inputConnections) { *it = inputHandle; } } for (ImportedOutputId id: preImportedOutputs) { const LayerBindingId layerBindingId = m_PreImportedOutputHandles[id].m_LayerBindingId; auto outputHandle = workingMemHandle.GetOutputHandle(layerBindingId); auto outputConnections = workingMemHandle.GetOutputConnection(layerBindingId); for (auto it : outputConnections) { *it = outputHandle; } } }; std::unique_ptr timelineUtils = TimelineUtilityMethods::GetTimelineUtils(*m_ProfilingService); ProfilingGuid inferenceGuid = m_ProfilingService->GetNextGuid(); if (timelineUtils) { // Add inference timeline trace if profiling is enabled. 
        ProfilingGuid networkGuid = m_OptimizedNetwork->GetGuid();
        timelineUtils->CreateTypedEntity(inferenceGuid, LabelsAndEventClasses::INFERENCE_GUID);
        timelineUtils->CreateRelationship(ProfilingRelationshipType::RetentionLink,
                                          networkGuid,
                                          inferenceGuid,
                                          LabelsAndEventClasses::EXECUTION_OF_GUID);
        timelineUtils->RecordEvent(inferenceGuid, LabelsAndEventClasses::ARMNN_PROFILING_SOL_EVENT_CLASS);
    }

    bool executionSucceeded = true;

    if (timelineUtils)
    {
        // Add end of life of the inference timeline if profiling is enabled.
        timelineUtils->RecordEvent(inferenceGuid, LabelsAndEventClasses::ARMNN_PROFILING_EOL_EVENT_CLASS);
        timelineUtils->Commit();
    }

    if (!workingMemHandle.IsAllocated())
    {
        workingMemHandle.Allocate();
    }

    {
        ARMNN_SCOPED_PROFILING_EVENT(Compute::Undefined, "PrepareInputs");
        for (auto pair : inputTensors)
        {
            EnqueueInput(pair.second, workingMemHandle.GetInputHandle(pair.first));
        }

        // Swap in the pre-imported inputs if any
        for (ImportedInputId id : preImportedInputs)
        {
            const ImportedTensorHandlePin& importedInputPin = m_PreImportedInputHandles[id];
            const LayerBindingId layerBindingId = m_PreImportedInputHandles[id].m_LayerBindingId;
            const auto& preimportedHandle = importedInputPin.m_TensorHandle;

            auto inputConnections = workingMemHandle.GetInputConnections(layerBindingId);
            for (auto it : inputConnections)
            {
                *it = preimportedHandle.get();
            }
        }
    }
    {
        ARMNN_SCOPED_PROFILING_EVENT(Compute::Undefined, "PrepareOutputs");
        if (m_NetworkProperties.m_ExportEnabled)
        {
            for (auto pair : outputTensors)
            {
                ImportOutputTensor(pair.second, workingMemHandle.GetOutputHandle(pair.first));
            }
        }

        for (ImportedOutputId id : preImportedOutputs)
        {
            const ImportedTensorHandlePin& importedOutputPin = m_PreImportedOutputHandles[id];
            const LayerBindingId layerBindingId = m_PreImportedOutputHandles[id].m_LayerBindingId;
            const auto& preimportedHandle = importedOutputPin.m_TensorHandle;

            auto outputConnections = workingMemHandle.GetOutputConnection(layerBindingId);
            for (auto it : outputConnections)
            {
                *it = preimportedHandle.get();
            }
        }
    }

    auto Fail = [&](const std::exception& error)
    {
        ARMNN_LOG(error) << "An error occurred attempting to execute a workload: " << error.what();
        executionSucceeded = false;
    };
    ProfilingDynamicGuid workloadInferenceID(0);

    try
    {
        for (unsigned int i = 0; i < m_WorkloadQueue.size(); ++i)
        {
            auto& workload = m_WorkloadQueue[i];
            if (timelineUtils)
            {
                workloadInferenceID =
                    timelineUtils->RecordWorkloadInferenceAndStartOfLifeEvent(workload->GetGuid(), inferenceGuid);
            }
            workload->ExecuteAsync(workingMemHandle.GetWorkingMemDescriptorAt(i));

            if (timelineUtils)
            {
                timelineUtils->RecordEndOfLifeEvent(workloadInferenceID);
            }
        }
    }
    catch (const RuntimeException& error)
    {
        resetMemHandle();
        Fail(error);
    }
    catch (const std::runtime_error& error)
    {
        resetMemHandle();
        Fail(error);
    }
    catch (...)
    {
        resetMemHandle();
        throw;
    }

    if (!m_NetworkProperties.m_ExportEnabled)
    {
        for (auto pair : outputTensors)
        {
            CopyToOutputTensor(pair.second, workingMemHandle.GetOutputHandle(pair.first));
        }
    }
    else
    {
        ARMNN_SCOPED_PROFILING_EVENT(Compute::Undefined, "SyncMemGeneric_Execute");
        workingMemHandle.MemSyncOutputs();
    }

    resetMemHandle();

    return executionSucceeded ? Status::Success : Status::Failure;
}
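// Overlapped-execution sketch (illustrative; assumes the network was loaded with
// INetworkProperties::m_AsyncEnabled set and that each thread uses its own handle):
//
//   auto handle0 = loadedNetwork->CreateWorkingMemHandle(networkId);
//   auto handle1 = loadedNetwork->CreateWorkingMemHandle(networkId);
//   std::thread t0([&] { loadedNetwork->Execute(inputs0, outputs0, *handle0, {}, {}); });
//   std::thread t1([&] { loadedNetwork->Execute(inputs1, outputs1, *handle1, {}, {}); });
//   t0.join();
//   t1.join();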
/// Create a new unique WorkingMemHandle object. Create multiple handles if you wish to have
/// overlapped Execution by calling this function from different threads.
std::unique_ptr<IWorkingMemHandle> LoadedNetwork::CreateWorkingMemHandle(NetworkId networkId)
{
    Graph& order = m_OptimizedNetwork->pOptimizedNetworkImpl->GetGraph();

    // Tensors that will need to be allocated internally within armnn
    std::vector<std::unique_ptr<ITensorHandle>> managedTensorHandles;
    // Tensors that will be allocated externally by the user
    std::vector<std::unique_ptr<ITensorHandle>> unmanagedTensorHandles;

    std::vector<WorkingMemDescriptor> workingMemDescriptors;
    std::unordered_map<LayerGuid, WorkingMemDescriptor> workingMemDescriptorMap;

    auto GetTensorHandle = [&](Layer* layer, const OutputSlot& outputSlot)
    {
        ITensorHandleFactory::FactoryId factoryId = outputSlot.GetTensorHandleFactoryId();
        const TensorInfo& tensorInfo = outputSlot.GetTensorInfo();

        if (factoryId == ITensorHandleFactory::LegacyFactoryId)
        {
            BackendId id = layer->GetBackendId();
            ARMNN_NO_DEPRECATE_WARN_BEGIN
            return m_WorkloadFactories.at(id)->CreateTensorHandle(tensorInfo, false);
            ARMNN_NO_DEPRECATE_WARN_END
        }
        else
        {
            ITensorHandleFactory* handleFactory = m_TensorHandleFactoryRegistry.GetFactory(factoryId);
            ARMNN_ASSERT(handleFactory);
            return handleFactory->CreateTensorHandle(tensorInfo, false);
        }
    };

    struct HandleInfo
    {
        ITensorHandle* m_TensorHandle;

        bool m_IsInputLayerHandle = false;
        bool m_IsOutputLayerHandle = false;

        WorkingMemHandle::InputMemDescriptorCoords m_InputMemDescriptorCoords;
        WorkingMemHandle::OutputMemDescriptorCoords m_OutputMemDescriptorCoords;
    };

    std::unordered_map<const OutputSlot*, HandleInfo> outputToHandleInfoMap;

    unsigned int layerIndex = 0;
    for (auto&& layer : order)
    {
        // Constant layers execution and management is handled during loaded network construction
        if (layer->GetType() == LayerType::Constant)
        {
            continue;
        }

        WorkingMemDescriptor workingMemDescriptor;

        bool isMemoryManaged = true;
        bool isInputLayer = false;
        bool isOutputLayer = false;
        bool isConnectedToOutputLayer = false;

        if (layer->GetType() == LayerType::Input || layer->GetType() == LayerType::MemImport)
        {
            // Input layers/workloads will not be executed so the descriptor is not added to workingMemDescriptors
            // However we will still need to manage the tensorHandle
            isInputLayer = true;
            isMemoryManaged = !m_NetworkProperties.m_ImportEnabled;
        }
        else if (layer->GetType() == LayerType::Output)
        {
            isOutputLayer = true;
        }

        unsigned int slotIndex = 0;
        // Create a tensor handle for each output slot of a layer
        // Once we create it, we start managing its lifetime
        for (auto& slot : layer->GetOutputSlots())
        {
            for (unsigned int i = 0; i < slot.GetNumConnections(); ++i)
            {
                if ((slot.GetConnection(i)->GetOwningLayer().GetType() == LayerType::Output))
                {
                    if (!isConnectedToOutputLayer)
                    {
                        isConnectedToOutputLayer = true;
                        // If Export is enabled disable memory management, so we can export, otherwise we do a copy
                        isMemoryManaged = !m_NetworkProperties.m_ExportEnabled;
                    }
                    else
                    {
                        // Importing in this case would likely cause unexpected behaviour, so we disallow it.
                        ARMNN_LOG(warning) << fmt::format("Layer name: '{0}' guid: '{1}' has two or more "
" "This will prevent importing on the connected OutputLayers.", layer->GetName(), layer->GetGuid()); isMemoryManaged = true; } } } ITensorHandle* tensorHandle; if (isMemoryManaged) { managedTensorHandles.emplace_back(GetTensorHandle(layer, slot)); tensorHandle = managedTensorHandles.back().get(); } else { unmanagedTensorHandles.emplace_back(GetTensorHandle(layer, slot)); tensorHandle = unmanagedTensorHandles.back().get(); } workingMemDescriptor.m_Outputs.push_back(tensorHandle); HandleInfo& handleInfo = outputToHandleInfoMap[&slot]; handleInfo.m_TensorHandle = tensorHandle; // Store the coordinates of the current layer's OutputSlot that is connected to the OutputLayer if (isConnectedToOutputLayer) { handleInfo.m_IsOutputLayerHandle = true; handleInfo.m_OutputMemDescriptorCoords.m_OutputSlotCoords = {layerIndex, slotIndex}; } // Store the LayerBindingId of the InputLayer if (isInputLayer) { handleInfo.m_IsInputLayerHandle = true; LayerBindingId bindingId = static_cast(layer)->GetBindingId(); handleInfo.m_InputMemDescriptorCoords.m_LayerBindingId = bindingId; } slotIndex++; } // Loop through the input slots in the same layer and decrement the reference counter associated // to each tensor handle we encounter. // Once it reaches zero, the lifetime of the tensor handle has ended, and we mark its memory as available // so that the next tensor handle with a non overlapping lifetime can share its memory. for (auto& slot : layer->GetInputSlots()) { ARMNN_ASSERT(slot.GetConnection()); auto outputSlot = slot.GetConnectedOutputSlot(); auto key = outputSlot->GetOwningLayer().GetGuid(); // Constant layers execution and management is handled during loaded network construction auto found = m_ConstantTensorHandles.find(key); if (found != m_ConstantTensorHandles.end()) { ITensorHandle* tensorHandle = found->second; workingMemDescriptor.m_Inputs.push_back(tensorHandle); // Odd case where a constant layer is connected to an output layer // We will need to create a HandleInfo to track it if (isOutputLayer) { LayerBindingId bindingId = static_cast(layer)->GetBindingId(); HandleInfo& handleInfo = outputToHandleInfoMap[outputSlot]; handleInfo.m_TensorHandle = tensorHandle; handleInfo.m_IsOutputLayerHandle = true; handleInfo.m_OutputMemDescriptorCoords.m_LayerBindingIds.push_back(bindingId); handleInfo.m_OutputMemDescriptorCoords.m_InputSlotCoords.push_back({layerIndex, 0}); } continue; } HandleInfo& handleInfo = outputToHandleInfoMap.at(outputSlot); ITensorHandle* inputTensorHandle = handleInfo.m_TensorHandle; workingMemDescriptor.m_Inputs.push_back(inputTensorHandle); // Store the LayerBindingId of the OutputLayer if (isOutputLayer) { LayerBindingId bindingId = static_cast(layer)->GetBindingId(); handleInfo.m_OutputMemDescriptorCoords.m_LayerBindingIds.push_back(bindingId); handleInfo.m_OutputMemDescriptorCoords.m_InputSlotCoords.push_back({layerIndex, 0}); } // In this case the layer is not an Output Layer but shares its input tensorhandle with an OutputLayer // It will need to be updated as well, if we swap out the tensorhandle else if (handleInfo.m_IsOutputLayerHandle) { handleInfo.m_OutputMemDescriptorCoords.m_InputSlotCoords.push_back({layerIndex, slot.GetSlotIndex()}); } // Store the coordinates of the InputSlots connected to the InputLayer // There can be more than one InputSlot connected to an InputLayer, so we use a vector if (handleInfo.m_IsInputLayerHandle) { std::pair connectionLocation{layerIndex, slot.GetSlotIndex()}; 
        workingMemDescriptorMap.insert({layer->GetGuid(), workingMemDescriptor});

        // Input/Output layers/workloads will not be executed, so the descriptor is not added to
        // workingMemDescriptors. However, we will still need to manage the tensorHandle.
        if (!isInputLayer)
        {
            workingMemDescriptors.push_back(workingMemDescriptor);
            layerIndex++;
        }
    }

    std::vector<std::pair<std::shared_ptr<TensorMemory>, MemorySource>> tensorMemory;

    auto externalMemoryManager = CreateExternalMemoryManger(tensorMemory);

    // Sort m_TensorMemory, so its order matches the outputSlot order
    std::sort(tensorMemory.begin(), tensorMemory.end(),
              [](const std::pair<std::shared_ptr<TensorMemory>, MemorySource>& lhs,
                 const std::pair<std::shared_ptr<TensorMemory>, MemorySource>& rhs)
              {
                  return lhs.first->m_OutputSlotId < rhs.first->m_OutputSlotId;
              });

    std::vector<WorkingMemHandle::InputMemDescriptorCoords> inputConnectionsInfo;
    std::vector<WorkingMemHandle::OutputMemDescriptorCoords> outputConnectionsInfo;

    for (const auto& handleInfo: outputToHandleInfoMap)
    {
        if (handleInfo.second.m_IsOutputLayerHandle)
        {
            outputConnectionsInfo.emplace_back(handleInfo.second.m_OutputMemDescriptorCoords);
        }

        if (handleInfo.second.m_IsInputLayerHandle)
        {
            inputConnectionsInfo.emplace_back(handleInfo.second.m_InputMemDescriptorCoords);
        }
    }

    return std::make_unique<WorkingMemHandle>(networkId,
                                              inputConnectionsInfo,
                                              outputConnectionsInfo,
                                              workingMemDescriptors,
                                              workingMemDescriptorMap,
                                              std::move(externalMemoryManager),
                                              std::move(tensorMemory),
                                              std::move(managedTensorHandles),
                                              std::move(unmanagedTensorHandles));
}

void LoadedNetwork::RegisterDebugCallback(const DebugCallbackFunction& func)
{
    for (auto&& workloadPtr: m_WorkloadQueue)
    {
        workloadPtr.get()->RegisterDebugCallback(func);
    }
}

void LoadedNetwork::CreateMemoryProfileAsync()
{
    struct PartialBlock
    {
        unsigned int m_StartOfLife;
        unsigned int m_Lifetime;

        size_t m_MemSize;
        unsigned int m_Index;

        BackendId m_BackendId;
    };

    auto align = [](size_t numToAlign)
    {
        const size_t alignment = sizeof(float);
        return ((numToAlign + alignment - 1) / alignment) * alignment;
    };

    std::unordered_map<const OutputSlot*, PartialBlock> memBlockTrackerMap;

    const bool inputImportingEnabled = m_NetworkProperties.m_InputSource != MemorySource::Undefined;
    const bool outputImportingEnabled = m_NetworkProperties.m_OutputSource != MemorySource::Undefined;

    unsigned int timestep = 0;
    unsigned int outputIndex = 0;
    Graph& order = m_OptimizedNetwork->pOptimizedNetworkImpl->GetGraph().TopologicalSort();
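    // Build per-backend lifetime profiles for externally managed memory. Each eligible output
    // slot becomes a PartialBlock whose reference count (m_Lifetime) starts at the slot's
    // connection count and is decremented by every consuming input slot; when it reaches zero,
    // the interval [m_StartOfLife, timestep] is recorded in m_MemBlockMap so that blocks with
    // non-overlapping lifetimes can later share memory. Sizes are padded to a multiple of
    // sizeof(float), e.g. align(10) == 12 when sizeof(float) == 4.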
    for (auto&& layer : order)
    {
        const LayerType& layerType = layer->GetType();
        // Don't manage memory if importing.
        if (layerType == LayerType::Input && inputImportingEnabled)
        {
            continue;
        }
        // Don't manage memory if importing.
        if (layerType == LayerType::Output && outputImportingEnabled
            && layer->GetInputSlot(0).GetConnectedOutputSlot()->GetNumConnections() == 1)
        {
            continue;
        }
        // Because Constant Layer memory cannot be shared, the memory must persist for the lifetime
        // of execution; its management is done separately.
        if (layerType == LayerType::Constant)
        {
            continue;
        }

        BackendId backendId = layer->GetBackendId();
        for (auto& outputSlot : layer->GetOutputSlots())
        {
            if (!m_SupportsExternallyManagedMemory[backendId])
            {
                continue;
            }

            PartialBlock partialBlock;

            partialBlock.m_StartOfLife = timestep;

            size_t alignedSize = align(outputSlot.GetOutputHandler().GetTensorInfo().GetNumBytes());
            partialBlock.m_MemSize = alignedSize;
            partialBlock.m_Index = outputIndex++;
            partialBlock.m_Lifetime = outputSlot.GetNumConnections();
            partialBlock.m_BackendId = backendId;

            if (partialBlock.m_Lifetime == 0)
            {
                m_MemBlockMap[partialBlock.m_BackendId].emplace_back(partialBlock.m_StartOfLife,
                                                                     partialBlock.m_StartOfLife,
                                                                     partialBlock.m_MemSize,
                                                                     0,
                                                                     partialBlock.m_Index);
            }
            else
            {
                memBlockTrackerMap[&outputSlot] = partialBlock;
            }
        }

        for (auto& inputSlot : layer->GetInputSlots())
        {
            const Layer& connectedInputLayer = inputSlot.GetConnectedOutputSlot()->GetOwningLayer();
            const LayerType& owningLayerType = connectedInputLayer.GetType();

            if (owningLayerType == LayerType::Constant)
            {
                continue;
            }
            if (inputImportingEnabled && owningLayerType == LayerType::Input)
            {
                continue;
            }

            auto outputSlot = inputSlot.GetConnectedOutputSlot();

            PartialBlock& partialBlock = memBlockTrackerMap.at(outputSlot);

            auto& lifetime = partialBlock.m_Lifetime;
            --lifetime;

            if (lifetime == 0)
            {
                m_MemBlockMap[partialBlock.m_BackendId].emplace_back(partialBlock.m_StartOfLife,
                                                                     timestep,
                                                                     partialBlock.m_MemSize,
                                                                     0,
                                                                     partialBlock.m_Index);
            }
        }
        ++timestep;
    }
}

void LoadedNetwork::CreateMemoryProfile()
{
    // Finds the first TensorHandle ancestor of a SubTensorHandle. If the ITensorHandle provided
    // is a TensorHandle, the function just returns it.
    auto TraceSubTensorHandleAncestry = [](ITensorHandle* const subTensorHandle)
    {
        ITensorHandle* ancestor = subTensorHandle;
        while (ancestor && ancestor->GetParent())
        {
            ancestor = ancestor->GetParent();
        }
        return ancestor;
    };

    struct PartialBlock
    {
        unsigned int m_StartOfLife;
        unsigned int m_Lifetime;

        size_t m_MemSize;
        unsigned int m_Index;

        BackendId m_BackendId;
    };

    auto align = [](size_t numToAlign)
    {
        const size_t alignment = sizeof(float);
        return ((numToAlign + alignment - 1) / alignment) * alignment;
    };

    std::unordered_map<ITensorHandle*, PartialBlock> memBlockTrackerMap;

    const bool inputImportingEnabled = m_NetworkProperties.m_InputSource != MemorySource::Undefined;
    const bool outputImportingEnabled = m_NetworkProperties.m_OutputSource != MemorySource::Undefined;

    unsigned int timestep = 0;
    unsigned int outputIndex = 0;
    Graph& order = m_OptimizedNetwork->pOptimizedNetworkImpl->GetGraph().TopologicalSort();
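    // Same liveness walk as CreateMemoryProfileAsync(), except that blocks are keyed by the
    // root ITensorHandle (found via TraceSubTensorHandleAncestry) rather than by OutputSlot,
    // so sub-tensors extend the lifetime of their parent handle's block instead of being
    // profiled as separate blocks.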
    for (auto&& layer : order)
    {
        const LayerType& layerType = layer->GetType();
        // Don't manage memory if importing.
        if (layerType == LayerType::Input && inputImportingEnabled)
        {
            continue;
        }
        // Don't manage memory if importing.
        if (layerType == LayerType::Output && outputImportingEnabled
            && layer->GetInputSlot(0).GetConnectedOutputSlot()->GetNumConnections() == 1)
        {
            continue;
        }
        // Because Constant Layer memory cannot be shared, the memory must persist for the lifetime
        // of execution; its management is done separately.
        if (layerType == LayerType::Constant)
        {
            continue;
        }

        BackendId backendId = layer->GetBackendId();
        for (auto& outputSlot : layer->GetOutputSlots())
        {
            if (!m_SupportsExternallyManagedMemory[backendId])
            {
                continue;
            }

            ITensorHandle* tensorHandle = outputSlot.GetOutputHandler().GetData();
            tensorHandle = TraceSubTensorHandleAncestry(tensorHandle);

            if (memBlockTrackerMap.find(tensorHandle) == memBlockTrackerMap.end())
            {
                PartialBlock partialBlock;

                partialBlock.m_StartOfLife = timestep;

                size_t alignedSize = align(outputSlot.GetOutputHandler().GetTensorInfo().GetNumBytes());
                partialBlock.m_MemSize = alignedSize;
                partialBlock.m_Index = outputIndex++;
                partialBlock.m_Lifetime = outputSlot.GetNumConnections();
                partialBlock.m_BackendId = backendId;

                if (partialBlock.m_Lifetime == 0)
                {
                    m_MemBlockMap[partialBlock.m_BackendId].emplace_back(partialBlock.m_StartOfLife,
                                                                         partialBlock.m_StartOfLife,
                                                                         partialBlock.m_MemSize,
                                                                         0,
                                                                         partialBlock.m_Index);
                }
                else
                {
                    memBlockTrackerMap[tensorHandle] = partialBlock;
                }
                m_Tensorhandles.push_back(tensorHandle);
            }
            else
            {
                memBlockTrackerMap.at(tensorHandle).m_Lifetime += outputSlot.GetNumConnections();
            }
        }

        for (auto& inputSlot : layer->GetInputSlots())
        {
            const Layer& connectedInputLayer = inputSlot.GetConnectedOutputSlot()->GetOwningLayer();
            const LayerType& owningLayerType = connectedInputLayer.GetType();

            if (owningLayerType == LayerType::Constant)
            {
                continue;
            }
            if (inputImportingEnabled && owningLayerType == LayerType::Input)
            {
                continue;
            }
            if (!m_SupportsExternallyManagedMemory[connectedInputLayer.GetBackendId()])
            {
                continue;
            }

            auto outputSlot = inputSlot.GetConnectedOutputSlot();

            ITensorHandle* tensorHandle = outputSlot->GetOutputHandler().GetData();
            tensorHandle = TraceSubTensorHandleAncestry(tensorHandle);

            PartialBlock& partialBlock = memBlockTrackerMap.at(tensorHandle);

            auto& lifetime = partialBlock.m_Lifetime;
            --lifetime;

            if (lifetime == 0)
            {
                m_MemBlockMap[partialBlock.m_BackendId].emplace_back(partialBlock.m_StartOfLife,
                                                                     timestep,
                                                                     partialBlock.m_MemSize,
                                                                     0,
                                                                     partialBlock.m_Index);
            }
        }
        ++timestep;
    }
}

std::unique_ptr<MemoryManager> LoadedNetwork::CreateExternalMemoryManger(
    std::vector<std::pair<std::shared_ptr<TensorMemory>, MemorySource>>& tensorMemoryVec)
{
    std::unique_ptr<MemoryManager> memoryManager = std::make_unique<MemoryManager>();
    auto allocatorMap = BackendRegistryInstance().GetAllocators();

    for (auto& backend : m_MemBinMap)
    {
        std::vector<BufferStorage> bufferStorageVec;

        std::shared_ptr<ICustomAllocator> backendAllocator;

        if (allocatorMap.find(backend.first) != allocatorMap.end())
        {
            backendAllocator = allocatorMap[backend.first];
        }
        else
        {
            backendAllocator = m_Backends[backend.first]->GetDefaultAllocator();
        }

        for (auto& memBin : backend.second)
        {
            BufferStorage bufferStorage;
            bufferStorage.m_BufferSize = memBin.m_MemSize;
            bufferStorage.m_TensorMemoryVector.reserve(memBin.m_MemBlocks.size());

            for (auto& memBlock : memBin.m_MemBlocks)
            {
                auto tensorMemory = std::make_shared<TensorMemory>(TensorMemory{memBlock.m_Offset, memBlock.m_Index});

                tensorMemoryVec.emplace_back(tensorMemory, backendAllocator->GetMemorySourceType());
                bufferStorage.m_TensorMemoryVector.emplace_back(tensorMemory);
            }

            bufferStorageVec.emplace_back(std::move(bufferStorage));
        }

        memoryManager->StoreMemToAllocate(bufferStorageVec, backendAllocator, 4);
    }

    return memoryManager;
}
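// Both validators below resolve a pre-imported handle ID to its LayerBindingId, throwing
// InvalidArgumentException if the ID is unknown or the handle has already been deleted.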
LayerBindingId LoadedNetwork::ValidateImportedInputID(ImportedInputId id)
{
    try
    {
        const auto& importedTensorHandlePin = m_PreImportedInputHandles.at(id);
        if (!importedTensorHandlePin.m_TensorHandle)
        {
            throw InvalidArgumentException(fmt::format("LoadedNetwork::Execute: "
                                                       "PreImportedInput: {} has been deleted", id));
        }
        return importedTensorHandlePin.m_LayerBindingId;
    }
    catch (const std::out_of_range&)
    {
        throw InvalidArgumentException(fmt::format("LoadedNetwork::Execute: Unknown ImportedInputId: {}", id));
    }
}

LayerBindingId LoadedNetwork::ValidateImportedOutputID(ImportedOutputId id)
{
    try
    {
        const auto& importedTensorHandlePin = m_PreImportedOutputHandles.at(id);
        if (!importedTensorHandlePin.m_TensorHandle)
        {
            throw InvalidArgumentException(fmt::format("LoadedNetwork::Execute: "
                                                       "PreImportedOutput: {} has been deleted", id));
        }
        return importedTensorHandlePin.m_LayerBindingId;
    }
    catch (const std::out_of_range&)
    {
        throw InvalidArgumentException(fmt::format("LoadedNetwork::Execute: Unknown ImportedOutputId: {}", id));
    }
}

} // namespace armnn