From e2af6f4322a1e2b8b3c391fb721a6a80c281477f Mon Sep 17 00:00:00 2001
From: Narumol Prangnawarat
Date: Fri, 28 Jan 2022 17:59:18 +0000
Subject: IVGCVSW-6552 Add support for aligned host memory

* Add AllocatedData functions to OutputHandler
* Enable import of aligned memory in ImportInputs
* Enable import of aligned memory in ImportOutputs
* Allow input and output to be imported when the memory is aligned
* Implement Reconfigure function on ClConvolution2dWorkload
* End-to-end tests on Ref and Cl to ensure that input and output memory
  are imported when aligned

Signed-off-by: Narumol Prangnawarat
Change-Id: I9e5e4c26d1ac2f1d806803ade5f64c6479c51718
---
 include/armnn/IRuntime.hpp                         |  15 +-
 src/armnn/LoadedNetwork.cpp                        | 378 +++++++++++++++++----
 src/armnn/LoadedNetwork.hpp                        |  15 +-
 src/armnn/OutputHandler.hpp                        |   5 +
 src/armnn/Runtime.cpp                              |  34 +-
 src/armnn/Runtime.hpp                              |  10 +-
 src/backends/cl/ClBackend.cpp                      |  18 +
 src/backends/cl/ClImportTensorHandle.hpp           |  12 +-
 src/backends/cl/test/ClCreateWorkloadTests.cpp     |  61 ++++
 src/backends/cl/test/ClImportTensorHandleTests.cpp | 153 +++++++++
 src/backends/cl/workloads/ClBaseWorkload.hpp       |  25 +-
 .../cl/workloads/ClConvolution2dWorkload.cpp       |  17 +-
 .../cl/workloads/ClConvolution2dWorkload.hpp       |   8 +
 src/backends/neon/workloads/NeonBaseWorkload.hpp   |  25 +-
 src/backends/reference/test/RefEndToEndTests.cpp   |  87 +++++
 15 files changed, 758 insertions(+), 105 deletions(-)

diff --git a/include/armnn/IRuntime.hpp b/include/armnn/IRuntime.hpp
index d85a3e3724..042271fc2b 100644
--- a/include/armnn/IRuntime.hpp
+++ b/include/armnn/IRuntime.hpp
@@ -216,18 +216,19 @@ public:
     TensorInfo GetInputTensorInfo(NetworkId networkId, LayerBindingId layerId) const;
     TensorInfo GetOutputTensorInfo(NetworkId networkId, LayerBindingId layerId) const;
 
-
     /// ImportInputs separates the importing and mapping of InputTensors from network execution.
     /// Allowing for a set of InputTensors to be imported and mapped once, but used in execution many times.
     /// This function is not thread safe and must not be used while other threads are calling Execute().
-    /// Only compatible with AsyncEnabled networks
-    std::vector<ImportedInputId> ImportInputs(NetworkId networkId, const InputTensors& inputTensors);
+    /// Only compatible with AsyncEnabled networks and aligned memory import
+    std::vector<ImportedInputId> ImportInputs(NetworkId networkId, const InputTensors& inputTensors,
+                                              MemorySource forceImportMemorySource = MemorySource::Undefined);
 
     /// ImportOutputs separates the importing and mapping of OutputTensors from network execution.
    /// Allowing for a set of OutputTensors to be imported and mapped once, but used in execution many times.
    /// This function is not thread safe and must not be used while other threads are calling Execute().
-    /// Only compatible with AsyncEnabled networks
-    std::vector<ImportedOutputId> ImportOutputs(NetworkId networkId, const OutputTensors& outputTensors);
+    /// Only compatible with AsyncEnabled networks and aligned memory import
+    std::vector<ImportedOutputId> ImportOutputs(NetworkId networkId, const OutputTensors& outputTensors,
+                                                MemorySource forceImportMemorySource = MemorySource::Undefined);
 
     /// Un-import and delete the imported InputTensor/s
     /// This function is not thread safe and must not be used while other threads are calling Execute().
@@ -242,7 +243,9 @@ public:
     /// Evaluates a network using input in inputTensors and outputs filled into outputTensors
     Status EnqueueWorkload(NetworkId networkId,
                            const InputTensors& inputTensors,
-                           const OutputTensors& outputTensors);
+                           const OutputTensors& outputTensors,
+                           std::vector<ImportedInputId> preImportedInputIds = {},
+                           std::vector<ImportedOutputId> preImportedOutputIds = {});
 
     /// This is an experimental function.
     /// Evaluates a network using input in inputTensors and outputs filled into outputTensors.
diff --git a/src/armnn/LoadedNetwork.cpp b/src/armnn/LoadedNetwork.cpp
index 1d1aae53a5..45891f7dc3 100644
--- a/src/armnn/LoadedNetwork.cpp
+++ b/src/armnn/LoadedNetwork.cpp
@@ -314,21 +314,22 @@ LoadedNetwork::LoadedNetwork(std::unique_ptr<IOptimizedNetwork> net,
 {
     if (layer->GetNumInputSlots() >= 1)
     {
-        unsigned int slotIndex = 0;
+        unsigned int inputSlotIndex = 0;
         for (auto& inputSlot : layer->GetInputSlots())
         {
             if (inputSlot.GetOwningLayer().GetType() == LayerType::Input)
             {
-                m_InputWorkloadSlotPairs.push_back(
-                    std::make_pair(m_WorkloadQueue.size(), slotIndex));
+                auto inputLayer = PolymorphicDowncast<BindableLayer*>(&inputSlot.GetOwningLayer());
+                m_InputWorkloadSlotPairs[inputLayer->GetBindingId()] =
+                    std::make_pair(m_WorkloadQueue.size(), inputSlotIndex);
             }
-            ++slotIndex;
+            ++inputSlotIndex;
         }
     }
     if (layer->GetNumOutputSlots() >= 1)
     {
-        unsigned int slotIndex = 0;
+        unsigned int outputSlotIndex = 0;
         for (auto& outputSlot : layer->GetOutputSlots())
         {
             for (unsigned int i = 0; i < outputSlot.GetNumConnections(); i++)
@@ -337,12 +338,14 @@ LoadedNetwork::LoadedNetwork(std::unique_ptr<IOptimizedNetwork> net,
                 // Add its index within layer->GetOutputSlots() to m_OutputWorkloadSlotPairs
                 if (outputSlot.GetConnection(i)->GetOwningLayer().GetType() == LayerType::Output)
                 {
-                    m_OutputWorkloadSlotPairs.push_back(
-                        std::make_pair(m_WorkloadQueue.size(), slotIndex));
+                    auto outputLayer = PolymorphicDowncast<BindableLayer*>(
+                        &outputSlot.GetConnection(i)->GetOwningLayer());
+                    m_OutputWorkloadSlotPairs[outputLayer->GetBindingId()] =
+                        std::make_pair(m_WorkloadQueue.size(), outputSlotIndex);
                     continue;
                 }
             }
-            ++slotIndex;
+            ++outputSlotIndex;
         }
     }
     m_WorkloadQueue.push_back(std::move(workload));
@@ -667,7 +670,9 @@ private:
 }
 
 Status LoadedNetwork::EnqueueWorkload(const InputTensors& inputTensors,
-                                      const OutputTensors& outputTensors)
+                                      const OutputTensors& outputTensors,
+                                      std::vector<ImportedInputId> preImportedInputIds,
+                                      std::vector<ImportedOutputId> preImportedOutputIds)
 {
     const Graph& graph = m_OptimizedNetwork->pOptimizedNetworkImpl->GetGraph();
 
@@ -691,10 +696,26 @@ Status LoadedNetwork::EnqueueWorkload(const InputTensors& inputTensors,
         ARMNN_SCOPED_PROFILING_EVENT(Compute::Undefined, "PrepareInputs");
         m_InputQueue.clear();
         m_InputQueue.reserve(graph.GetNumInputs());
+
         for (const BindableLayer* inputLayer : graph.GetInputLayers())
         {
-            const TensorPin& pin = workloadData.GetInputTensorPin(inputLayer->GetBindingId());
-            EnqueueInput(*inputLayer, pin.GetTensorHandle(), pin.GetTensorInfo());
+            if (preImportedInputIds.size() != m_PreImportedInputHandles.size())
+            {
+                throw InvalidArgumentException("Invalid number of preImportedInputIds");
+            }
+            auto layerBindingId = inputLayer->GetBindingId();
+            auto it = std::find_if(preImportedInputIds.begin(), preImportedInputIds.end(),
+                                   [=](auto preImportedInputId)
+            {
+                return m_PreImportedInputHandles[preImportedInputId].m_LayerBindingId == layerBindingId;
+            });
+
+            if (it == preImportedInputIds.end())
+            {
+                // InputTensorHandle is not imported yet, proceed to enqueue the input
+                const TensorPin& pin = workloadData.GetInputTensorPin(inputLayer->GetBindingId());
+                EnqueueInput(*inputLayer,
                             pin.GetTensorHandle(), pin.GetTensorInfo());
+            }
         }
     }
 
@@ -703,12 +724,57 @@ Status LoadedNetwork::EnqueueWorkload(const InputTensors& inputTensors,
         ARMNN_SCOPED_PROFILING_EVENT(Compute::Undefined, "PrepareOutputs");
         m_OutputQueue.clear();
         m_OutputQueue.reserve(graph.GetNumOutputs());
+
         for (const BindableLayer* outputLayer : graph.GetOutputLayers())
         {
+            if (preImportedOutputIds.size() != m_PreImportedOutputHandles.size())
+            {
+                throw InvalidArgumentException("Invalid number of preImportedOutputIds");
+            }
+            auto layerBindingId = outputLayer->GetBindingId();
+            auto it = std::find_if(preImportedOutputIds.begin(), preImportedOutputIds.end(),
+                                   [=](auto preImportedOutputId)
+            {
+                return m_PreImportedOutputHandles[preImportedOutputId].m_LayerBindingId == layerBindingId;
+            });
+
             const TensorPin& pin = workloadData.GetOutputTensorPin(outputLayer->GetBindingId());
-            EnqueueOutput(*outputLayer, pin.GetTensorHandle(), pin.GetTensorInfo());
+
+            if (it == preImportedOutputIds.end())
+            {
+                // OutputTensorHandle is not imported yet, proceed to enqueue the output
+                EnqueueOutput(*outputLayer, pin.GetTensorHandle(), pin.GetTensorInfo());
+            }
+            else
+            {
+                // Insert a synchronization workload for the imported output
+                OutputQueueDescriptor outputQueueDescriptor;
+                WorkloadInfo info;
+
+                outputQueueDescriptor.m_Outputs.push_back(pin.GetTensorHandle());
+                info.m_OutputTensorInfos.push_back(pin.GetTensorInfo());
+
+                // Gets the output handler from the previous node.
+                const OutputHandler& outputHandler =
+                    outputLayer->GetInputSlots()[0].GetConnectedOutputSlot()->GetOutputHandler();
+
+                const TensorInfo& inputTensorInfo = outputHandler.GetTensorInfo();
+                ITensorHandle* inputTensorHandle = outputHandler.GetData();
+                ARMNN_ASSERT_MSG(inputTensorHandle != nullptr, "Data should have been allocated.");
+                MemSyncQueueDescriptor syncDesc;
+                syncDesc.m_Inputs.push_back(inputTensorHandle);
+                info.m_InputTensorInfos.push_back(inputTensorInfo);
+                auto syncWorkload = std::make_unique<SyncMemGenericWorkload>(syncDesc, info);
+                ARMNN_ASSERT_MSG(syncWorkload, "No sync workload created");
+                m_OutputQueue.push_back(std::move(syncWorkload));
+            }
         }
     }
 
+    // Clear m_PreImportedInputHandles and m_PreImportedOutputHandles
+    m_PreImportedInputHandles.clear();
+    m_PreImportedOutputHandles.clear();
+    m_CurImportedInputId = 0;
+    m_CurImportedOutputId = 0;
 
     std::unique_ptr<TimelineUtilityMethods> timelineUtils =
         TimelineUtilityMethods::GetTimelineUtils(m_ProfilingService);
@@ -1120,90 +1186,260 @@ const armnn::Tensor GetOutputTensor(const LayerBindingId layerId, const OutputTe
     throw InvalidArgumentException("Output does not exist.");
 }
 
-std::vector<ImportedInputId> LoadedNetwork::ImportInputs(const InputTensors& inputTensors)
+std::vector<ImportedInputId> LoadedNetwork::ImportInputs(const InputTensors& inputTensors,
+                                                         MemorySource forceImportMemorySource)
 {
-    if (!m_NetworkProperties.m_ImportEnabled) // Try import the input tensor
-    {
-        throw MemoryImportException("ImportInputs: Memory Import failed, NetworkProperties.m_ImportEnabled");
-    }
-
-    std::vector<ImportedInputId> importedInputs;
-    Graph& graph = m_OptimizedNetwork->pOptimizedNetworkImpl->GetGraph().TopologicalSort();
-
-    for (auto inputTensor : inputTensors)
+    if (!m_NetworkProperties.m_ImportEnabled)
     {
-        auto layerBindingId = inputTensor.first;
-        auto it = std::find_if(graph.GetInputLayers().begin(), graph.GetInputLayers().end(), [=](auto* layer)
+        // Cannot import if import is not enabled and forceImportMemorySource is undefined
+        if (forceImportMemorySource == MemorySource::Undefined)
         {
-            return layer->GetBindingId() == layerBindingId;
-        });
-
-        if (it == graph.GetInputLayers().end())
+
+            throw MemoryImportException("ImportInputs: Memory Import failed, NetworkProperties.m_ImportEnabled");
+        }
+        // If forceImportMemorySource is defined, try import if memory is aligned
+        if (inputTensors.size() != m_OptimizedNetwork->pOptimizedNetworkImpl->GetGraph().GetNumInputs())
+        {
-            throw MemoryImportException(fmt::format("ImportInputs: Memory Import failed, unknown LayerBindingId: {}",
-                                                    layerBindingId));
+            throw MemoryImportException("ImportInputs: Force Import failed, incorrect number of tensors");
+        }
 
-        const Layer* layer = *it;
-        if (layer->GetType() != LayerType::Input)
+        std::vector<ImportedInputId> importedInputs;
+        Graph& graph = m_OptimizedNetwork->pOptimizedNetworkImpl->GetGraph().TopologicalSort();
+        for (auto inputTensor : inputTensors)
+        {
-            throw InvalidArgumentException("ImportInputs: given layer not an InputLayer");
+            auto layerBindingId = inputTensor.first;
+            auto it = std::find_if(graph.GetInputLayers().begin(), graph.GetInputLayers().end(), [=](auto* layer)
+            {
+                return layer->GetBindingId() == layerBindingId;
+            });
+
+            if (it == graph.GetInputLayers().end())
+            {
+                throw MemoryImportException(fmt::format(
+                    "ImportInputs: Memory Import failed, unknown LayerBindingId: {}", layerBindingId));
+            }
+
+            const Layer* layer = *it;
+            if (layer->GetType() != LayerType::Input)
+            {
+                throw InvalidArgumentException("ImportInputs: given layer not an InputLayer");
+            }
+            const OutputSlot& outputSlot = layer->GetOutputSlots()[0];
+            ITensorHandleFactory::FactoryId factoryId = outputSlot.GetTensorHandleFactoryId();
+            // Get matching import factory Id
+            ITensorHandleFactory::FactoryId importFactoryId =
+                m_TensorHandleFactoryRegistry.GetMatchingImportFactoryId(factoryId);
+            ITensorHandleFactory* importFactory =
+                m_TensorHandleFactoryRegistry.GetFactory(importFactoryId, forceImportMemorySource);
+            if (!importFactory)
+            {
+                throw MemoryImportException("ImportInputs: Force Import failed, cannot find matching Import Factory");
+            }
+
+            OutputHandler& handler = const_cast<OutputHandler&>(layer->GetOutputHandler(0));
+            handler.SetAllocatedData();
+            handler.CreateTensorHandles(*importFactory, false);
+            ITensorHandle* outputTensorHandle = handler.GetData();
+            std::unique_ptr<ITensorHandle> passThroughTensorHandle =
+                std::make_unique<ConstPassthroughTensorHandle>(inputTensor.second.GetInfo(),
+                                                               inputTensor.second.GetMemoryArea());
+            // Check if the input memory can be imported
+            if (outputTensorHandle->CanBeImported(passThroughTensorHandle->Map(), forceImportMemorySource))
+            {
+                passThroughTensorHandle->Unmap();
+                if (outputTensorHandle->Import(passThroughTensorHandle->Map(), forceImportMemorySource))
+                {
+                    passThroughTensorHandle->Unmap();
+                    try
+                    {
+                        m_WorkloadQueue[m_InputWorkloadSlotPairs[layerBindingId].first].get()->ReplaceInputTensorHandle(
+                            outputTensorHandle, m_InputWorkloadSlotPairs[layerBindingId].second);
+                        importedInputs.push_back(m_CurImportedInputId++);
+                        // For force import, we want OutputHandler to own the TensorHandle,
+                        // so we do not move the TensorHandle to m_PreImportedInputHandles as in AsyncEnabled networks
+                        ImportedTensorHandlePin importedTensorHandlePin{layerBindingId, nullptr};
+                        m_PreImportedInputHandles.push_back(std::move(importedTensorHandlePin));
+                    }
+                    catch(armnn::UnimplementedException& e)
+                    {
+                        IgnoreUnused(e);
+                        // Method not implemented; cannot use the imported tensor, fall back to allocated data
+                        handler.UseAllocatedData();
+                    }
+                }
+            }
+            else
+            {
+                // Cannot import, use allocated data
+                handler.UseAllocatedData();
+            }
+        }
 
-        auto& backend = m_Backends.at(layer->GetBackendId());
-        if (!HasCapability(BackendOptions::BackendOption{"PreImportIOTensors", true}, backend->GetCapabilities()))
+        return importedInputs;
+    }
+    else
+    {
+        // Import when import is enabled in the network properties
+        std::vector<ImportedInputId> importedInputs;
+        Graph& graph = m_OptimizedNetwork->pOptimizedNetworkImpl->GetGraph().TopologicalSort();
+
+        for (auto inputTensor : inputTensors)
+        {
-            std::string er = backend->GetId();
-            er += " does not have PreImportIOTensors capability";
-            throw BackendCapabilityException(er);
-        }
+            auto layerBindingId = inputTensor.first;
+            auto it = std::find_if(graph.GetInputLayers().begin(), graph.GetInputLayers().end(), [=](auto* layer)
+            {
+                return layer->GetBindingId() == layerBindingId;
+            });
+
+            if (it == graph.GetInputLayers().end())
+            {
+                throw MemoryImportException(fmt::format(
+                    "ImportInputs: Memory Import failed, unknown LayerBindingId: {}", layerBindingId));
+            }
 
-        const OutputSlot& outputSlot = layer->GetOutputSlots()[0];
+            const Layer* layer = *it;
+            if (layer->GetType() != LayerType::Input)
+            {
+                throw InvalidArgumentException("ImportInputs: given layer not an InputLayer");
+            }
 
-        ITensorHandleFactory::FactoryId factoryId = outputSlot.GetTensorHandleFactoryId();
-        const TensorInfo& tensorInfo = outputSlot.GetTensorInfo();
+            auto& backend = m_Backends.at(layer->GetBackendId());
+            if (!HasCapability(BackendOptions::BackendOption{"PreImportIOTensors", true}, backend->GetCapabilities()))
+            {
+                std::string er = backend->GetId();
+                er += " does not have PreImportIOTensors capability";
+                throw BackendCapabilityException(er);
+            }
 
-        ITensorHandleFactory* handleFactory = m_TensorHandleFactoryRegistry.GetFactory(factoryId);
-        ARMNN_ASSERT(handleFactory);
+            const OutputSlot& outputSlot = layer->GetOutputSlots()[0];
 
-        ImportedTensorHandlePin importedTensorHandlePin{layerBindingId,
-                                                        handleFactory->CreateTensorHandle(tensorInfo, false)};
+            ITensorHandleFactory::FactoryId factoryId = outputSlot.GetTensorHandleFactoryId();
+            const TensorInfo& tensorInfo = outputSlot.GetTensorInfo();
 
-        ITensorHandle* tensorHandle = importedTensorHandlePin.m_TensorHandle.get();
+            ITensorHandleFactory* handleFactory = m_TensorHandleFactoryRegistry.GetFactory(factoryId);
+            ARMNN_ASSERT(handleFactory);
 
-        if (!CheckFlag(tensorHandle->GetImportFlags(), m_NetworkProperties.m_InputSource))
-        {
-            throw MemoryImportException(
-                fmt::format("ImportInputs: Memory Import failed, backend: {} does not support importing from source {}"
-                            , factoryId, m_NetworkProperties.m_InputSource));
-        }
+            ImportedTensorHandlePin importedTensorHandlePin{layerBindingId,
+                                                            handleFactory->CreateTensorHandle(tensorInfo, false)};
+
+            ITensorHandle* tensorHandle = importedTensorHandlePin.m_TensorHandle.get();
+
+            if (!CheckFlag(tensorHandle->GetImportFlags(), m_NetworkProperties.m_InputSource))
+            {
+                throw MemoryImportException(
+                    fmt::format("ImportInputs: Memory Import failed, backend: "
+                                "{} does not support importing from source {}"
+                                , factoryId, m_NetworkProperties.m_InputSource));
+            }
+
+            std::unique_ptr<ITensorHandle> passThroughTensorHandle =
+                std::make_unique<ConstPassthroughTensorHandle>(inputTensor.second.GetInfo(),
+                                                               inputTensor.second.GetMemoryArea());
+
+            if (tensorHandle->Import(passThroughTensorHandle->Map(), m_NetworkProperties.m_InputSource))
+            {
+                importedInputs.push_back(m_CurImportedInputId++);
+                passThroughTensorHandle->Unmap();
+            }
+            else
+            {
+                passThroughTensorHandle->Unmap();
+                throw MemoryImportException("ImportInputs: Memory Import failed");
+            }
 
-        std::unique_ptr<ITensorHandle> passThroughTensorHandle =
-            std::make_unique<ConstPassthroughTensorHandle>(inputTensor.second.GetInfo(),
-                                                           inputTensor.second.GetMemoryArea());
+            m_PreImportedInputHandles.push_back(std::move(importedTensorHandlePin));
+        }
+        return importedInputs;
+    }
+}
 
-    if (tensorHandle->Import(passThroughTensorHandle->Map(), m_NetworkProperties.m_InputSource))
+std::vector<ImportedOutputId> LoadedNetwork::ImportOutputs(const OutputTensors& outputTensors,
+                                                           MemorySource forceImportMemorySource)
+{
+    if (!m_NetworkProperties.m_ExportEnabled)
+    {
+        // Cannot import if export is not enabled and forceImportMemorySource is undefined
+        if (forceImportMemorySource == MemorySource::Undefined)
+        {
-        importedInputs.push_back(m_CurImportedInputId++);
-        passThroughTensorHandle->Unmap();
+            throw MemoryImportException("ImportOutputs: Memory Import failed, NetworkProperties.m_ExportEnabled");
+        }
-    else
+        // If forceImportMemorySource is defined, try import if memory is aligned
+        if (outputTensors.size() != m_OptimizedNetwork->pOptimizedNetworkImpl->GetGraph().GetNumOutputs())
+        {
-        passThroughTensorHandle->Unmap();
-        throw MemoryImportException("ImportInputs: Memory Import failed");
+            throw MemoryImportException("ImportOutputs: Force Import failed, incorrect number of tensors");
+        }
+        std::vector<ImportedOutputId> importedOutputs;
+        Graph& graph = m_OptimizedNetwork->pOptimizedNetworkImpl->GetGraph().TopologicalSort();
+        for (auto outputTensor : outputTensors)
+        {
+            auto layerBindingId = outputTensor.first;
+            auto it = std::find_if(graph.GetOutputLayers().begin(), graph.GetOutputLayers().end(), [=](auto* layer)
+            {
+                return layer->GetBindingId() == layerBindingId;
+            });
 
-    m_PreImportedInputHandles.push_back(std::move(importedTensorHandlePin));
-}
+            if (it == graph.GetOutputLayers().end())
+            {
+                throw MemoryImportException(fmt::format("ImportOutputs: Memory Import failed, "
                                                        "unknown LayerBindingId: {}",
                                                        layerBindingId));
+            }
 
-    return importedInputs;
-}
+            const Layer* layer = *it;
+            if (layer->GetType() != LayerType::Output)
+            {
+                throw InvalidArgumentException("ImportOutputs: given layer not an OutputLayer");
+            }
 
-std::vector<ImportedOutputId> LoadedNetwork::ImportOutputs(const OutputTensors& outputTensors)
-{
-    if (!m_NetworkProperties.m_ExportEnabled) // Try import the output tensor
-    {
-        throw MemoryImportException("ImportOutputs: Memory Import failed, NetworkProperties.m_ImportEnabled");
+            const OutputSlot* outputSlot = layer->GetInputSlots()[0].GetConnectedOutputSlot();
+            ITensorHandleFactory::FactoryId factoryId = outputSlot->GetTensorHandleFactoryId();
+            ITensorHandleFactory::FactoryId importFactoryId =
+                m_TensorHandleFactoryRegistry.GetMatchingImportFactoryId(factoryId);
+            ITensorHandleFactory* importFactory =
+                m_TensorHandleFactoryRegistry.GetFactory(importFactoryId, forceImportMemorySource);
+            if (!importFactory)
+            {
+                throw MemoryImportException("ImportOutputs: Force Import failed, cannot find matching Import Factory");
+            }
+
+            OutputHandler& outputHandler =
+                const_cast<OutputHandler&>(layer->GetInputSlots()[0].GetConnectedOutputSlot()->GetOutputHandler());
+            outputHandler.SetAllocatedData();
+            outputHandler.CreateTensorHandles(*importFactory, false);
+            ITensorHandle* inputTensorHandle = outputHandler.GetData();
+
+            // Check if the output memory can be imported
+            if (inputTensorHandle->CanBeImported(outputTensor.second.GetMemoryArea(), forceImportMemorySource))
+            {
+                if (inputTensorHandle->Import(outputTensor.second.GetMemoryArea(), forceImportMemorySource))
+                {
+                    try
+                    {
+                        m_WorkloadQueue[m_OutputWorkloadSlotPairs[layerBindingId].first].get()->
+                            ReplaceOutputTensorHandle(inputTensorHandle,
                                                      m_OutputWorkloadSlotPairs[layerBindingId].second);
+                        importedOutputs.push_back(m_CurImportedOutputId++);
+                        // For force import, we want OutputHandler to own the TensorHandle,
+                        // so we do not move the TensorHandle to m_PreImportedOutputHandles as in AsyncEnabled networks
+                        ImportedTensorHandlePin importedTensorHandlePin{layerBindingId, nullptr};
+                        m_PreImportedOutputHandles.push_back(std::move(importedTensorHandlePin));
+                    }
+                    catch(armnn::UnimplementedException& e)
+                    {
+                        IgnoreUnused(e);
+                        // Method not implemented; cannot use the imported tensor, fall back to allocated data
+                        outputHandler.UseAllocatedData();
+                    }
+                }
+            }
+            else
+            {
+                // Cannot import, use allocated memory
+                outputHandler.UseAllocatedData();
+            }
+        }
+        return importedOutputs;
+    }
 
     std::vector<ImportedOutputId> importedOutputs;
diff --git a/src/armnn/LoadedNetwork.hpp b/src/armnn/LoadedNetwork.hpp
index 9de6307938..f637dec8eb 100644
--- a/src/armnn/LoadedNetwork.hpp
+++ b/src/armnn/LoadedNetwork.hpp
@@ -55,14 +55,18 @@ public:
     TensorInfo GetInputTensorInfo(LayerBindingId layerId) const;
     TensorInfo GetOutputTensorInfo(LayerBindingId layerId) const;
 
-    std::vector<ImportedInputId> ImportInputs(const InputTensors& inputTensors);
-    std::vector<ImportedOutputId> ImportOutputs(const OutputTensors& outputTensors);
+    std::vector<ImportedInputId> ImportInputs(const InputTensors& inputTensors,
+                                              MemorySource forceImportMemorySource = MemorySource::Undefined);
+    std::vector<ImportedOutputId> ImportOutputs(const OutputTensors& outputTensors,
+                                                MemorySource forceImportMemorySource = MemorySource::Undefined);
 
     void ClearImportedInputs(const std::vector<ImportedInputId> inputIds);
     void ClearImportedOutputs(const std::vector<ImportedOutputId> outputIds);
 
     /// Single thread execution of the loaded network
-    Status EnqueueWorkload(const InputTensors& inputTensors, const OutputTensors& outputTensors);
+    Status EnqueueWorkload(const InputTensors& inputTensors, const OutputTensors& outputTensors,
+                           std::vector<ImportedInputId> preImportedInputIds = {},
+                           std::vector<ImportedOutputId> preImportedOutputIds = {});
 
     /// Thread safe execution of the loaded network
     Status Execute(const InputTensors& inputTensors,
@@ -200,8 +204,9 @@ private:
 
     // A set of vectors to record the workload queue indexes and their corresponding Input/Output Slot indexes
     // which are connected to Inputs and Outputs for the network.
-    std::vector<std::pair<size_t, unsigned int>> m_InputWorkloadSlotPairs;
-    std::vector<std::pair<size_t, unsigned int>> m_OutputWorkloadSlotPairs;
+    std::unordered_map<LayerBindingId, std::pair<size_t, unsigned int>> m_InputWorkloadSlotPairs;
+    std::unordered_map<LayerBindingId, std::pair<size_t, unsigned int>> m_OutputWorkloadSlotPairs;
+
 };
 
 }
diff --git a/src/armnn/OutputHandler.hpp b/src/armnn/OutputHandler.hpp
index 41a49af031..3fd2519ed5 100644
--- a/src/armnn/OutputHandler.hpp
+++ b/src/armnn/OutputHandler.hpp
@@ -50,10 +50,15 @@ public:
     void SetData(std::unique_ptr<ITensorHandle> data) { m_TensorHandle = std::move(data); }
 
+    void SetAllocatedData() { m_AllocatedTensorHandle = std::move(m_TensorHandle); }
+
+    void UseAllocatedData() { m_TensorHandle = std::move(m_AllocatedTensorHandle); }
+
     /// @brief Returns true if SetTensorInfo() has been called at least once on this.
     bool IsTensorInfoSet() const { return m_bTensorInfoSet; }
 
 private:
     std::unique_ptr<ITensorHandle> m_TensorHandle;
+    std::unique_ptr<ITensorHandle> m_AllocatedTensorHandle;
     TensorInfo m_TensorInfo;
     bool m_bTensorInfoSet = false;
 };
diff --git a/src/armnn/Runtime.cpp b/src/armnn/Runtime.cpp
index 2752e7209c..95fb8a3abb 100644
--- a/src/armnn/Runtime.cpp
+++ b/src/armnn/Runtime.cpp
@@ -77,14 +77,16 @@ armnn::TensorInfo IRuntime::GetOutputTensorInfo(NetworkId networkId, LayerBindin
     return pRuntimeImpl->GetOutputTensorInfo(networkId, layerId);
 }
 
-std::vector<ImportedInputId> IRuntime::ImportInputs(NetworkId networkId, const InputTensors& inputTensors)
+std::vector<ImportedInputId> IRuntime::ImportInputs(NetworkId networkId, const InputTensors& inputTensors,
+                                                    MemorySource forceImportMemorySource)
 {
-    return pRuntimeImpl->ImportInputs(networkId, inputTensors);
+    return pRuntimeImpl->ImportInputs(networkId, inputTensors, forceImportMemorySource);
 }
 
-std::vector<ImportedOutputId> IRuntime::ImportOutputs(NetworkId networkId, const OutputTensors& outputTensors)
+std::vector<ImportedOutputId> IRuntime::ImportOutputs(NetworkId networkId, const OutputTensors& outputTensors,
+                                                      MemorySource forceImportMemorySource)
 {
-    return pRuntimeImpl->ImportOutputs(networkId, outputTensors);
+    return pRuntimeImpl->ImportOutputs(networkId, outputTensors, forceImportMemorySource);
 }
 
 void IRuntime::ClearImportedInputs(NetworkId networkId, const std::vector<ImportedInputId> inputIds)
@@ -98,9 +100,12 @@ void IRuntime::ClearImportedOutputs(NetworkId networkId, const std::vector<Impor
 Status IRuntime::EnqueueWorkload(NetworkId networkId,
                                  const InputTensors& inputTensors,
-                                 const OutputTensors& outputTensors)
+                                 const OutputTensors& outputTensors,
+                                 std::vector<ImportedInputId> preImportedInputIds,
+                                 std::vector<ImportedOutputId> preImportedOutputIds)
 {
-    return pRuntimeImpl->EnqueueWorkload(networkId, inputTensors, outputTensors);
+    return pRuntimeImpl->EnqueueWorkload(networkId, inputTensors, outputTensors,
+                                         preImportedInputIds, preImportedOutputIds);
 }
 
 Status IRuntime::Execute(IWorkingMemHandle& workingMemHandle,
@@ -566,14 +571,16 @@ TensorInfo RuntimeImpl::GetOutputTensorInfo(NetworkId networkId, LayerBindingId
     return GetLoadedNetworkPtr(networkId)->GetOutputTensorInfo(layerId);
 }
 
-std::vector<ImportedInputId> RuntimeImpl::ImportInputs(NetworkId networkId, const InputTensors& inputTensors)
+std::vector<ImportedInputId> RuntimeImpl::ImportInputs(NetworkId networkId, const InputTensors& inputTensors,
+                                                       MemorySource forceImportMemorySource)
 {
-    return GetLoadedNetworkPtr(networkId)->ImportInputs(inputTensors);
+    return GetLoadedNetworkPtr(networkId)->ImportInputs(inputTensors, forceImportMemorySource);
 }
 
-std::vector<ImportedOutputId> RuntimeImpl::ImportOutputs(NetworkId networkId, const OutputTensors& outputTensors)
+std::vector<ImportedOutputId> RuntimeImpl::ImportOutputs(NetworkId networkId, const OutputTensors& outputTensors,
+                                                         MemorySource forceImportMemorySource)
 {
-    return GetLoadedNetworkPtr(networkId)->ImportOutputs(outputTensors);
+    return GetLoadedNetworkPtr(networkId)->ImportOutputs(outputTensors, forceImportMemorySource);
 }
 
 void RuntimeImpl::ClearImportedInputs(NetworkId networkId, const std::vector<ImportedInputId> inputIds)
@@ -587,7 +594,9 @@ void RuntimeImpl::ClearImportedOutputs(NetworkId networkId, const std::vector<Im
 Status RuntimeImpl::EnqueueWorkload(NetworkId networkId,
                                     const InputTensors& inputTensors,
-                                    const OutputTensors& outputTensors)
+                                    const OutputTensors& outputTensors,
+                                    std::vector<ImportedInputId> preImportedInputIds,
+                                    std::vector<ImportedOutputId> preImportedOutputIds)
 {
     const auto startTime = armnn::GetTimeNow();
 
@@ -617,7 +626,8 @@ Status RuntimeImpl::EnqueueWorkload(NetworkId networkId,
     }
     lastId=networkId;
 
-    auto status = loadedNetwork->EnqueueWorkload(inputTensors, outputTensors);
+    auto status = loadedNetwork->EnqueueWorkload(inputTensors, outputTensors,
+                                                 preImportedInputIds, preImportedOutputIds);
 
     ARMNN_LOG(info) << "Execution time: " << std::setprecision(2)
                     << std::fixed << armnn::GetTimeDuration(startTime).count() << " ms.";
diff --git a/src/armnn/Runtime.hpp b/src/armnn/Runtime.hpp
index 4052bb6d3a..bd37013ad0 100644
--- a/src/armnn/Runtime.hpp
+++ b/src/armnn/Runtime.hpp
@@ -55,8 +55,10 @@ public:
     armnn::TensorInfo GetInputTensorInfo(NetworkId networkId, LayerBindingId layerId) const;
     armnn::TensorInfo GetOutputTensorInfo(NetworkId networkId, LayerBindingId layerId) const;
 
-    std::vector<ImportedInputId> ImportInputs(NetworkId networkId, const InputTensors& inputTensors);
-    std::vector<ImportedOutputId> ImportOutputs(NetworkId networkId, const OutputTensors& outputTensors);
+    std::vector<ImportedInputId> ImportInputs(NetworkId networkId, const InputTensors& inputTensors,
+                                              MemorySource forceImportMemorySource = MemorySource::Undefined);
+    std::vector<ImportedOutputId> ImportOutputs(NetworkId networkId, const OutputTensors& outputTensors,
+                                                MemorySource forceImportMemorySource = MemorySource::Undefined);
 
     void ClearImportedInputs(NetworkId networkId, const std::vector<ImportedInputId> inputIds);
     void ClearImportedOutputs(NetworkId networkId, const std::vector<ImportedOutputId> outputIds);
@@ -64,7 +66,9 @@ public:
     // Evaluates network using input in inputTensors, outputs filled into outputTensors.
     Status EnqueueWorkload(NetworkId networkId,
                            const InputTensors& inputTensors,
-                           const OutputTensors& outputTensors);
+                           const OutputTensors& outputTensors,
+                           std::vector<ImportedInputId> preImportedInputIds = {},
+                           std::vector<ImportedOutputId> preImportedOutputIds = {});
 
     /// This is an experimental function.
     /// Evaluates a network using input in inputTensors and outputs filled into outputTensors.
diff --git a/src/backends/cl/ClBackend.cpp b/src/backends/cl/ClBackend.cpp
index 8abb16ccca..0fc5da78d1 100644
--- a/src/backends/cl/ClBackend.cpp
+++ b/src/backends/cl/ClBackend.cpp
@@ -133,6 +133,15 @@ IBackendInternal::IWorkloadFactoryPtr ClBackend::CreateWorkloadFactory(
     MemorySourceFlags inputFlags,
     MemorySourceFlags outputFlags) const
 {
+    // To allow force import when inputFlags/outputFlags are Undefined, set them to Malloc
+    if (inputFlags == static_cast<MemorySourceFlags>(MemorySource::Undefined))
+    {
+        inputFlags = static_cast<MemorySourceFlags>(MemorySource::Malloc);
+    }
+    if (outputFlags == static_cast<MemorySourceFlags>(MemorySource::Undefined))
+    {
+        outputFlags = static_cast<MemorySourceFlags>(MemorySource::Malloc);
+    }
     std::shared_ptr<ClMemoryManager> memoryManager;
     if (m_UsingCustomAllocator)
     {
@@ -193,6 +202,15 @@ void ClBackend::RegisterTensorHandleFactories(TensorHandleFactoryRegistry& regis
     MemorySourceFlags inputFlags,
     MemorySourceFlags outputFlags)
 {
+    // To allow force import when inputFlags/outputFlags are Undefined, set them to Malloc
+    if (inputFlags == static_cast<MemorySourceFlags>(MemorySource::Undefined))
+    {
+        inputFlags = static_cast<MemorySourceFlags>(MemorySource::Malloc);
+    }
+    if (outputFlags == static_cast<MemorySourceFlags>(MemorySource::Undefined))
+    {
+        outputFlags = static_cast<MemorySourceFlags>(MemorySource::Malloc);
+    }
     std::shared_ptr<ClMemoryManager> memoryManager;
     if (m_UsingCustomAllocator)
    {
diff --git a/src/backends/cl/ClImportTensorHandle.hpp b/src/backends/cl/ClImportTensorHandle.hpp
index a236a70d7c..54710d8135 100644
--- a/src/backends/cl/ClImportTensorHandle.hpp
+++ b/src/backends/cl/ClImportTensorHandle.hpp
@@ -205,7 +205,11 @@ public:
     // We do this to match the behaviour of the Import function later on.
     auto cachelineAlignment =
         arm_compute::CLKernelLibrary::get().get_device().getInfo<CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE>();
-    auto roundedSize = cachelineAlignment + totalBytes - (totalBytes % cachelineAlignment);
+    auto roundedSize = totalBytes;
+    if (totalBytes % cachelineAlignment != 0)
+    {
+        roundedSize = cachelineAlignment + totalBytes - (totalBytes % cachelineAlignment);
+    }
 
     cl_int error = CL_SUCCESS;
     cl_mem buffer;
@@ -252,7 +256,11 @@ private:
     // This does not change the size of the buffer, only the size of the mapping the buffer is mapped to
     auto cachelineAlignment =
         arm_compute::CLKernelLibrary::get().get_device().getInfo<CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE>();
-    auto roundedSize = cachelineAlignment + totalBytes - (totalBytes % cachelineAlignment);
+    auto roundedSize = totalBytes;
+    if (totalBytes % cachelineAlignment != 0)
+    {
+        roundedSize = cachelineAlignment + totalBytes - (totalBytes % cachelineAlignment);
+    }
 
     cl_int error = CL_SUCCESS;
     cl_mem buffer;
diff --git a/src/backends/cl/test/ClCreateWorkloadTests.cpp b/src/backends/cl/test/ClCreateWorkloadTests.cpp
index d8b2d4f786..4a28205ade 100644
--- a/src/backends/cl/test/ClCreateWorkloadTests.cpp
+++ b/src/backends/cl/test/ClCreateWorkloadTests.cpp
@@ -11,11 +11,14 @@
 #include
 #include
 #include
+#include
 
 #include
 #include
 #include
+#include
+#include
 #include
 #include
 #include
@@ -355,6 +358,64 @@ TEST_CASE_FIXTURE(ClContextControlFixture, "CreateConvolution2dFastMathEnabledWo
     ARMNN_ASSERT(conv2dWorkload->GetConvolutionMethod() == arm_compute::ConvolutionMethod::WINOGRAD);
 }
 
+TEST_CASE_FIXTURE(ClContextControlFixture, "ClReplaceInputOutputConvolution2dWorkload")
+{
+    // Create Convolution2dWorkload with ClTensorHandle input and output
+    // Then replace the input and output with ClImportTensorHandle
+    Graph graph;
+    ClWorkloadFactory factory =
+        ClWorkloadFactoryHelper::GetFactory(ClWorkloadFactoryHelper::GetMemoryManager());
+
+    auto workload =
+        CreateConvolution2dWorkloadTest<ClConvolution2dWorkload, armnn::DataType::Float32>(factory,
+                                                                                           graph,
+                                                                                           DataLayout::NHWC);
+
+    TensorShape inputShape = std::initializer_list<unsigned int>({2, 8, 16, 3});
+    TensorShape outputShape = std::initializer_list<unsigned int>({2, 2, 10, 2});
+
+    // Checks that outputs and inputs are as we expect them (see definition of CreateConvolution2dWorkloadTest).
+    Convolution2dQueueDescriptor queueDescriptor = workload->GetData();
+    auto inputHandle = PolymorphicDowncast<ITensorHandle*>(queueDescriptor.m_Inputs[0]);
+    auto outputHandle = PolymorphicDowncast<ITensorHandle*>(queueDescriptor.m_Outputs[0]);
+    CHECK((inputHandle->GetShape() == inputShape));
+    CHECK((outputHandle->GetShape() == outputShape));
+    // The input and output handles are created correctly as ClTensorHandle
+    CHECK((dynamic_cast<ClTensorHandle*>(inputHandle) != nullptr));
+    CHECK((dynamic_cast<ClTensorHandle*>(outputHandle) != nullptr));
+
+    // Replace with ImportTensorHandle
+    ClImportTensorHandleFactory importFactory(static_cast<MemorySourceFlags>(MemorySource::Malloc),
+                                              static_cast<MemorySourceFlags>(MemorySource::Malloc));
+
+    TensorInfo inputInfo({ 2, 8, 16, 3 }, DataType::Float32);
+    TensorInfo outputInfo({ 2, 2, 10, 2 }, DataType::Float32);
+
+    // Create TensorHandles for memory import
+    auto inputImportHandle = importFactory.CreateTensorHandle(inputInfo);
+    auto outputImportHandle = importFactory.CreateTensorHandle(outputInfo);
+
+    // Calling ReplaceInputTensorHandle and ReplaceOutputTensorHandle does not throw an exception
+    // as the Reconfigure function is implemented
+    workload->ReplaceInputTensorHandle(inputImportHandle.get(), 0);
+    workload->ReplaceOutputTensorHandle(outputImportHandle.get(), 0);
+
+    // Correctly replaced with the import handles with correct information
+    queueDescriptor = workload->GetData();
+    auto replacedInputHandle = PolymorphicDowncast<ITensorHandle*>(queueDescriptor.m_Inputs[0]);
+    auto replacedOutputHandle = PolymorphicDowncast<ITensorHandle*>(queueDescriptor.m_Outputs[0]);
+    CHECK((replacedInputHandle->GetShape() == inputShape));
+    CHECK((replacedOutputHandle->GetShape() == outputShape));
+
+    CHECK((inputImportHandle.get() == replacedInputHandle));
+    CHECK((outputImportHandle.get() == replacedOutputHandle));
+
+    CHECK((dynamic_cast<ClTensorHandle*>(replacedInputHandle) == nullptr));
+    CHECK((dynamic_cast<ClImportTensorHandle*>(replacedInputHandle) != nullptr));
+    CHECK((dynamic_cast<ClTensorHandle*>(replacedOutputHandle) == nullptr));
+    CHECK((dynamic_cast<ClImportTensorHandle*>(replacedOutputHandle) != nullptr));
+}
+
 TEST_CASE_FIXTURE(ClContextControlFixture, "CreateConvolution2dClCompiledContextWorkload")
 {
     using namespace armnn;
diff --git a/src/backends/cl/test/ClImportTensorHandleTests.cpp b/src/backends/cl/test/ClImportTensorHandleTests.cpp
index 3d702642aa..161765484d 100644
--- a/src/backends/cl/test/ClImportTensorHandleTests.cpp
+++ b/src/backends/cl/test/ClImportTensorHandleTests.cpp
@@ -274,4 +274,157 @@ TEST_CASE("ClCanBeImportedAlignedMemory")
 // we can be confident that it will be successfully imported. All other cases will need to be handled by the user.
 }
 
+TEST_CASE_FIXTURE(ClContextControlFixture, "ClForceImportConv2dEndToEnd")
+{
+    // Create runtime in which test will run
+    IRuntime::CreationOptions options;
+    IRuntimePtr runtime(armnn::IRuntime::Create(options));
+
+    // build up the structure of the network
+    INetworkPtr network(INetwork::Create());
+
+    armnn::TensorInfo inputInfo({ 1, 3, 4, 1 }, DataType::Float32);
+    armnn::TensorInfo kernelInfo({ 1, 3, 3, 1 }, DataType::Float32);
+    armnn::TensorInfo outputInfo({ 1, 3, 4, 1 }, DataType::Float32);
+
+    kernelInfo.SetConstant(true);
+
+    std::vector<float> kernel =
+    {
+        4, 5, 6,
+        0, 0, 0,
+        3, 2, 1
+    };
+
+    const std::vector<float> expectedOutput =
+    {
+        23, 41, 33, 21,
+        44, 65, 76, 52,
+        82, 85, 79, 42
+    };
+
+    unsigned int numElements = inputInfo.GetNumElements();
+    size_t totalBytes = numElements * sizeof(float);
+
+    IConnectableLayer* const inputLayer = network->AddInputLayer(0, "input");
+    ARMNN_ASSERT(inputLayer);
+
+    armnn::ConstTensor weights(kernelInfo, kernel);
+
+    armnn::Convolution2dDescriptor convDesc2d;
+    convDesc2d.m_StrideX = 1;
+    convDesc2d.m_StrideY = 1;
+    convDesc2d.m_PadLeft = 1;
+    convDesc2d.m_PadRight = 1;
+    convDesc2d.m_PadTop = 1;
+    convDesc2d.m_PadBottom = 1;
+    convDesc2d.m_DataLayout = DataLayout::NHWC;
+    armnn::IConnectableLayer* const convLayer = network->AddConvolution2dLayer(convDesc2d,
+                                                                               weights,
+                                                                               armnn::EmptyOptional(),
+                                                                               "conv");
+    ARMNN_ASSERT(convLayer);
+
+    inputLayer->GetOutputSlot(0).Connect(convLayer->GetInputSlot(0));
+    inputLayer->GetOutputSlot(0).SetTensorInfo(inputInfo);
+
+    IConnectableLayer* output = network->AddOutputLayer(0, "output");
+    convLayer->GetOutputSlot(0).Connect(output->GetInputSlot(0));
+    convLayer->GetOutputSlot(0).SetTensorInfo(outputInfo);
+
+    // Optimize the network
+    OptimizerOptions optOptions;
+    optOptions.m_ImportEnabled = false;
+    std::vector<BackendId> backends = {armnn::Compute::GpuAcc};
+    IOptimizedNetworkPtr optNet = Optimize(*network, backends, runtime->GetDeviceSpec(), optOptions);
+    CHECK(optNet);
+
+    // Loads it into the runtime.
+    NetworkId netId;
+    std::string ignoredErrorMessage;
+    // Import is left disabled in the network properties; the tensors are force-imported below
+    INetworkProperties networkProperties(false, MemorySource::Undefined, MemorySource::Undefined);
+    runtime->LoadNetwork(netId, std::move(optNet), ignoredErrorMessage, networkProperties);
+
+    // Creates structures for input & output
+    const size_t alignment =
+        arm_compute::CLKernelLibrary::get().get_device().getInfo<CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE>();
+    size_t space = totalBytes + alignment + alignment;
+    auto inputData = std::make_unique<uint8_t[]>(space);
+    void* alignedInputPtr = inputData.get();
+    CHECK(std::align(alignment, totalBytes, alignedInputPtr, space));
+
+    // Fill the aligned input buffer
+    auto* inputPtr = reinterpret_cast<float*>(alignedInputPtr);
+    inputPtr[0] = 1;
+    inputPtr[1] = 5;
+    inputPtr[2] = 2;
+    inputPtr[3] = 3;
+    inputPtr[4] = 8;
+    inputPtr[5] = 7;
+    inputPtr[6] = 3;
+    inputPtr[7] = 6;
+    inputPtr[8] = 3;
+    inputPtr[9] = 3;
+    inputPtr[10] = 9;
+    inputPtr[11] = 1;
+
+    auto outputData = std::make_unique<uint8_t[]>(space);
+    void* alignedOutputPtr = outputData.get();
+    CHECK(std::align(alignment, totalBytes, alignedOutputPtr, space));
+    auto* outputPtr = reinterpret_cast<float*>(alignedOutputPtr);
+    std::fill_n(outputPtr, numElements, -10.0f);
+
+    TensorInfo inputTensorInfo = runtime->GetInputTensorInfo(netId, 0);
+    inputTensorInfo.SetConstant(true);
+    InputTensors inputTensors
+    {
+        {0, armnn::ConstTensor(inputTensorInfo, alignedInputPtr)},
+    };
+    OutputTensors outputTensors
+    {
+        {0, armnn::Tensor(runtime->GetOutputTensorInfo(netId, 0), alignedOutputPtr)}
+    };
+
+    runtime->GetProfiler(netId)->EnableProfiling(true);
+
+    INFO("Run ImportInputs");
+    std::vector<ImportedInputId> importedInputIds =
+        runtime->ImportInputs(netId, inputTensors, MemorySource::Malloc);
+    std::vector<ImportedOutputId> importedOutputIds =
+        runtime->ImportOutputs(netId, outputTensors, MemorySource::Malloc);
+
+    // Do the inference
+    runtime->EnqueueWorkload(netId, inputTensors, outputTensors, importedInputIds, importedOutputIds);
+
+    // Retrieve the Profiler.Print() output to get the workload execution
+    ProfilerManager& profilerManager = armnn::ProfilerManager::GetInstance();
+    std::stringstream ss;
+    profilerManager.GetProfiler()->Print(ss);
+    std::string dump = ss.str();
+
+    // Contains Convolution2dWorkload
+    std::size_t found = dump.find("Convolution2dWorkload");
+    CHECK(found != std::string::npos);
+
+    // Contains SyncMemGeneric
+    found = dump.find("SyncMemGeneric");
+    CHECK(found != std::string::npos);
+
+    // Does not contain CopyMemGeneric
+    found = dump.find("CopyMemGeneric");
+    CHECK(found == std::string::npos);
+
+    runtime->UnloadNetwork(netId);
+
+    // Check the output is as expected
+    auto* outputResult = reinterpret_cast<float*>(alignedOutputPtr);
+    CHECK(outputResult);
+    CHECK(std::equal(outputResult, outputResult + numElements, expectedOutput.begin(), expectedOutput.end()));
+}
+
 }
diff --git a/src/backends/cl/workloads/ClBaseWorkload.hpp b/src/backends/cl/workloads/ClBaseWorkload.hpp
index e74fc84f4f..03417e33ae 100644
--- a/src/backends/cl/workloads/ClBaseWorkload.hpp
+++ b/src/backends/cl/workloads/ClBaseWorkload.hpp
@@ -20,17 +20,38 @@ public:
     // Replace input tensor handle with the given TensorHandle and call Reconfigure()
     void ReplaceInputTensorHandle(ITensorHandle* tensorHandle, unsigned int slot) override
     {
+        ITensorHandle* backupHandle = this->m_Data.m_Inputs[slot];
         this->m_Data.m_Inputs[slot] = tensorHandle;
-        Reconfigure();
+        try
+        {
+            Reconfigure();
+        }
+        catch(armnn::UnimplementedException& e)
+        {
+            // Cannot reconfigure, revert the slot back and throw the exception.
+            this->m_Data.m_Inputs[slot] = backupHandle;
+            throw e;
+        }
     }
 
     // Replace output tensor handle with the given TensorHandle and call Reconfigure()
     void ReplaceOutputTensorHandle(ITensorHandle* tensorHandle, unsigned int slot) override
     {
+        ITensorHandle* backupHandle = this->m_Data.m_Outputs[slot];
         this->m_Data.m_Outputs[slot] = tensorHandle;
-        Reconfigure();
+        try
+        {
+            Reconfigure();
+        }
+        catch(armnn::UnimplementedException& e)
+        {
+            // Cannot reconfigure, revert the slot back and throw the exception.
+            this->m_Data.m_Outputs[slot] = backupHandle;
+            throw e;
+        }
     }
 
+protected:
     // Reconfigure the workload configuration. Throw armnn::UnimplementedException by default.
     virtual void Reconfigure()
     {
diff --git a/src/backends/cl/workloads/ClConvolution2dWorkload.cpp b/src/backends/cl/workloads/ClConvolution2dWorkload.cpp
index 705e92d307..cdfa885f67 100644
--- a/src/backends/cl/workloads/ClConvolution2dWorkload.cpp
+++ b/src/backends/cl/workloads/ClConvolution2dWorkload.cpp
@@ -90,6 +90,10 @@ ClConvolution2dWorkload::ClConvolution2dWorkload(const Convolution2dQueueDescrip
     arm_compute::ICLTensor& input = static_cast<IClTensorHandle*>(m_Data.m_Inputs[0])->GetTensor();
     arm_compute::ICLTensor& output = static_cast<IClTensorHandle*>(m_Data.m_Outputs[0])->GetTensor();
 
+    // Create proxy tensors and set the initial tensor handles on them
+    m_InputProxy = std::make_unique<ICLTensorProxy>(&input);
+    m_OutputProxy = std::make_unique<ICLTensorProxy>(&output);
+
     arm_compute::DataLayout aclDataLayout = ConvertDataLayout(m_Data.m_Parameters.m_DataLayout);
     input.info()->set_data_layout(aclDataLayout);
     output.info()->set_data_layout(aclDataLayout);
@@ -101,10 +105,10 @@
     {
         ARMNN_SCOPED_PROFILING_EVENT(Compute::Undefined, "ClConvolution2dWorkload_configure");
         m_ConvolutionLayer.configure(clCompileContext,
-                                     &input,
+                                     m_InputProxy.get(),
                                      m_KernelTensor.get(),
                                      m_BiasTensor.get(),
-                                     &output,
+                                     m_OutputProxy.get(),
                                      padStrideInfo,
                                      arm_compute::WeightsInfo(),
                                      aclDilationInfo,
@@ -174,4 +178,13 @@ void ClConvolution2dWorkload::FreeUnusedTensors()
     FreeTensorIfUnused(m_BiasTensor);
 }
 
+void ClConvolution2dWorkload::Reconfigure()
+{
+    ARMNN_SCOPED_PROFILING_EVENT(Compute::Undefined, "ClConvolution2dWorkload_Reconfigure");
+    arm_compute::ICLTensor& input = static_cast<IClTensorHandle*>(m_Data.m_Inputs[0])->GetTensor();
+    arm_compute::ICLTensor& output = static_cast<IClTensorHandle*>(m_Data.m_Outputs[0])->GetTensor();
+    m_InputProxy->set(&input);
+    m_OutputProxy->set(&output);
+}
+
 } //namespace armnn
diff --git a/src/backends/cl/workloads/ClConvolution2dWorkload.hpp b/src/backends/cl/workloads/ClConvolution2dWorkload.hpp
index 8a4599df47..891d5096cd 100644
--- a/src/backends/cl/workloads/ClConvolution2dWorkload.hpp
+++ b/src/backends/cl/workloads/ClConvolution2dWorkload.hpp
@@ -13,6 +13,8 @@
 #include
 #include
 
+#include
+
 #include
 
 namespace armnn
@@ -38,6 +40,9 @@ public:
 
     arm_compute::ConvolutionMethod GetConvolutionMethod() const;
 
+protected:
+    void Reconfigure() override;
+
 private:
     mutable arm_compute::CLConvolutionLayer m_ConvolutionLayer;
 
@@ -47,6 +52,9 @@ private:
     arm_compute::ConvolutionMethod m_ConvolutionMethod;
 
     void FreeUnusedTensors();
+
+    std::unique_ptr<ICLTensorProxy> m_InputProxy;
+    std::unique_ptr<ICLTensorProxy> m_OutputProxy;
 };
 
 } //namespace armnn
diff --git a/src/backends/neon/workloads/NeonBaseWorkload.hpp b/src/backends/neon/workloads/NeonBaseWorkload.hpp
index a92f35a173..63f3539164 100644
--- a/src/backends/neon/workloads/NeonBaseWorkload.hpp
+++ b/src/backends/neon/workloads/NeonBaseWorkload.hpp
@@ -20,17 +20,38 @@ public:
     // Replace input tensor handle with the given TensorHandle and call Reconfigure()
     void ReplaceInputTensorHandle(ITensorHandle* tensorHandle, unsigned int slot) override
     {
+        ITensorHandle* backupHandle = this->m_Data.m_Inputs[slot];
         this->m_Data.m_Inputs[slot] = tensorHandle;
-        Reconfigure();
+        try
+        {
+            Reconfigure();
+        }
+        catch(armnn::UnimplementedException& e)
+        {
+            // Cannot reconfigure, revert the slot back and throw the exception.
+            this->m_Data.m_Inputs[slot] = backupHandle;
+            throw e;
+        }
     }
 
     // Replace output tensor handle with the given TensorHandle and call Reconfigure()
     void ReplaceOutputTensorHandle(ITensorHandle* tensorHandle, unsigned int slot) override
     {
+        ITensorHandle* backupHandle = this->m_Data.m_Outputs[slot];
         this->m_Data.m_Outputs[slot] = tensorHandle;
-        Reconfigure();
+        try
+        {
+            Reconfigure();
+        }
+        catch(armnn::UnimplementedException& e)
+        {
+            // Cannot reconfigure, revert the slot back and throw the exception.
+            this->m_Data.m_Outputs[slot] = backupHandle;
+            throw e;
+        }
     }
 
+protected:
     // Reconfigure the workload configuration. Throw armnn::UnimplementedException by default.
     virtual void Reconfigure()
     {
diff --git a/src/backends/reference/test/RefEndToEndTests.cpp b/src/backends/reference/test/RefEndToEndTests.cpp
index 4444f5c361..7a6cf97936 100644
--- a/src/backends/reference/test/RefEndToEndTests.cpp
+++ b/src/backends/reference/test/RefEndToEndTests.cpp
@@ -1378,6 +1378,93 @@ TEST_CASE("RefRankEndToEndTestQSymmS8")
     RankEndToEnd<armnn::DataType::QSymmS8>(defaultBackends);
 }
 
+TEST_CASE("RefForceImportTest")
+{
+    using namespace armnn;
+
+    std::vector<BackendId> backends = defaultBackends;
+
+    IRuntime::CreationOptions options;
+    IRuntimePtr runtime(IRuntime::Create(options));
+
+    // Builds up the structure of the network.
+    INetworkPtr net(INetwork::Create());
+
+    IConnectableLayer* input = net->AddInputLayer(0);
+
+    ActivationDescriptor descriptor;
+    descriptor.m_Function = ActivationFunction::Square;
+    IConnectableLayer* activationLayer = net->AddActivationLayer(descriptor);
+
+    IConnectableLayer* output = net->AddOutputLayer(0);
+
+    input->GetOutputSlot(0).Connect(activationLayer->GetInputSlot(0));
+    activationLayer->GetOutputSlot(0).Connect(output->GetInputSlot(0));
+
+    input->GetOutputSlot(0).SetTensorInfo(TensorInfo({ 1, 1, 1, 4 }, DataType::Float32, 0.0f, 0, true));
+    activationLayer->GetOutputSlot(0).SetTensorInfo(TensorInfo({ 1, 1, 1, 4 }, DataType::Float32));
+
+    IOptimizedNetworkPtr optNet = Optimize(*net, backends, runtime->GetDeviceSpec());
+
+    // Load it into the runtime. It should pass.
+    NetworkId netId;
+    std::string ignoredErrorMessage;
+
+    INetworkProperties networkProperties(false, MemorySource::Undefined, MemorySource::Undefined);
+
+    CHECK(runtime->LoadNetwork(netId, std::move(optNet), ignoredErrorMessage, networkProperties)
+          == Status::Success);
+
+    // Creates structures for input & output
+    std::vector<float> inputData
+    {
+        1.0f, 2.0f, 3.0f, 4.0f
+    };
+
+    std::vector<float> outputData(4);
+
+    std::vector<float> expectedOutput
+    {
+        1.0f, 4.0f, 9.0f, 16.0f
+    };
+
+    InputTensors inputTensors
+    {
+        {0, armnn::ConstTensor(runtime->GetInputTensorInfo(netId, 0), inputData.data())},
+    };
+    OutputTensors outputTensors
+    {
+        {0, armnn::Tensor(runtime->GetOutputTensorInfo(netId, 0), outputData.data())}
+    };
+
+    runtime->GetProfiler(netId)->EnableProfiling(true);
+
+    std::vector<ImportedInputId> importedInputIds =
+        runtime->ImportInputs(netId, inputTensors, MemorySource::Malloc);
+    std::vector<ImportedOutputId> importedOutputIds =
+        runtime->ImportOutputs(netId, outputTensors, MemorySource::Malloc);
+
+    // Do the inference and force the import as the memory is aligned.
+    runtime->EnqueueWorkload(netId, inputTensors, outputTensors, importedInputIds, importedOutputIds);
+
+    // Retrieve the Profiler.Print() output to get the workload execution
+    ProfilerManager& profilerManager = armnn::ProfilerManager::GetInstance();
+    std::stringstream ss;
+    profilerManager.GetProfiler()->Print(ss);
+    std::string dump = ss.str();
+
+    // Check there is a SyncMemGeneric workload as we exported
+    int count = SubStringCounter(dump, "SyncMemGeneric");
+    CHECK(count == 1);
+
+    // Shouldn't be any CopyMemGeneric workloads
+    count = SubStringCounter(dump, "CopyMemGeneric");
+    CHECK(count == 0);
+
+    // Check the output is correct
+    CHECK(std::equal(outputData.begin(), outputData.end(), expectedOutput.begin(), expectedOutput.end()));
+}
+
 #if !defined(__ANDROID__)
 // Only run these tests on non Android platforms
 TEST_CASE("RefImportNonAlignedPointerTest")
-- 
cgit v1.2.1
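
For reference, the client-side flow this patch enables condenses to the following sketch (based on the RefForceImportTest and ClForceImportConv2dEndToEnd tests above; `runtime`, `netId`, `inputTensors` and `outputTensors` are assumed to be prepared as in those tests, with the user buffers aligned to the backend's required boundary):

    // Minimal sketch of the force-import flow, assuming runtime/netId/tensors
    // are set up as in the end-to-end tests above and buffers are aligned.
    std::vector<armnn::ImportedInputId> importedInputIds =
        runtime->ImportInputs(netId, inputTensors, armnn::MemorySource::Malloc);
    std::vector<armnn::ImportedOutputId> importedOutputIds =
        runtime->ImportOutputs(netId, outputTensors, armnn::MemorySource::Malloc);

    // Passing the pre-imported IDs lets EnqueueWorkload skip the usual
    // input/output copies for those tensors; imported outputs are synchronised
    // with a SyncMemGeneric workload instead of copied with CopyMemGeneric.
    runtime->EnqueueWorkload(netId, inputTensors, outputTensors,
                             importedInputIds, importedOutputIds);

Note that, as implemented in LoadedNetwork above, a tensor that cannot be imported (for example because its memory is misaligned) simply falls back to the handler's allocated data and does not receive an imported ID, so execution still succeeds via the normal copy path.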