author     Narumol Prangnawarat <narumol.prangnawarat@arm.com>   2022-01-28 17:59:18 +0000
committer  Jim Flynn <jim.flynn@arm.com>                         2022-01-31 12:53:51 +0000
commit     e2af6f4322a1e2b8b3c391fb721a6a80c281477f (patch)
tree       b0dd53289e27304a6d724821459cb0f4b6343a39
parent     fd313fef775ed210f8dab84452ea382a0b4164b0 (diff)
download   armnn-e2af6f4322a1e2b8b3c391fb721a6a80c281477f.tar.gz
IVGCVSW-6552 Add support of aligned host memory
* Add AllocatedData functions to OutputHandler
* Enable import aligned memory in ImportInputs
* Enable import aligned memory in ImportOutputs
* Allow to import input and output if the memory is aligned
* Implement Reconfigure function on ClConvolution2dWorkload
* End-to-end test on Ref and Cl to ensure that input and output memory are imported when aligned

Signed-off-by: Narumol Prangnawarat <narumol.prangnawarat@arm.com>
Change-Id: I9e5e4c26d1ac2f1d806803ade5f64c6479c51718
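For context, a minimal sketch of how a caller exercises the new force-import path (network construction and tensor setup elided; the calls mirror the public API changes and the tests further down):

    // Sketch: force import of aligned user buffers on a network loaded
    // without m_ImportEnabled. Assumes runtime, netId, inputTensors and
    // outputTensors are already set up and the user buffers are
    // cache-line aligned.
    std::vector<armnn::ImportedInputId> importedInputIds =
        runtime->ImportInputs(netId, inputTensors, armnn::MemorySource::Malloc);
    std::vector<armnn::ImportedOutputId> importedOutputIds =
        runtime->ImportOutputs(netId, outputTensors, armnn::MemorySource::Malloc);

    // Pre-imported ids tell EnqueueWorkload to skip the copies for those
    // tensors; a SyncMemGeneric workload is queued for imported outputs.
    runtime->EnqueueWorkload(netId, inputTensors, outputTensors,
                             importedInputIds, importedOutputIds);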
-rw-r--r--  include/armnn/IRuntime.hpp                           |  15
-rw-r--r--  src/armnn/LoadedNetwork.cpp                          | 378
-rw-r--r--  src/armnn/LoadedNetwork.hpp                          |  15
-rw-r--r--  src/armnn/OutputHandler.hpp                          |   5
-rw-r--r--  src/armnn/Runtime.cpp                                |  34
-rw-r--r--  src/armnn/Runtime.hpp                                |  10
-rw-r--r--  src/backends/cl/ClBackend.cpp                        |  18
-rw-r--r--  src/backends/cl/ClImportTensorHandle.hpp             |  12
-rw-r--r--  src/backends/cl/test/ClCreateWorkloadTests.cpp       |  61
-rw-r--r--  src/backends/cl/test/ClImportTensorHandleTests.cpp   | 153
-rw-r--r--  src/backends/cl/workloads/ClBaseWorkload.hpp         |  25
-rw-r--r--  src/backends/cl/workloads/ClConvolution2dWorkload.cpp|  17
-rw-r--r--  src/backends/cl/workloads/ClConvolution2dWorkload.hpp|   8
-rw-r--r--  src/backends/neon/workloads/NeonBaseWorkload.hpp     |  25
-rw-r--r--  src/backends/reference/test/RefEndToEndTests.cpp     |  87
15 files changed, 758 insertions, 105 deletions
diff --git a/include/armnn/IRuntime.hpp b/include/armnn/IRuntime.hpp
index d85a3e3724..042271fc2b 100644
--- a/include/armnn/IRuntime.hpp
+++ b/include/armnn/IRuntime.hpp
@@ -216,18 +216,19 @@ public:
TensorInfo GetInputTensorInfo(NetworkId networkId, LayerBindingId layerId) const;
TensorInfo GetOutputTensorInfo(NetworkId networkId, LayerBindingId layerId) const;
-
/// ImportInputs separates the importing and mapping of InputTensors from network execution.
/// Allowing for a set of InputTensors to be imported and mapped once, but used in execution many times.
/// This function is not thread safe and must not be used while other threads are calling Execute().
- /// Only compatible with AsyncEnabled networks
- std::vector<ImportedInputId> ImportInputs(NetworkId networkId, const InputTensors& inputTensors);
+ /// Only compatible with AsyncEnabled networks and aligned memory import
+ std::vector<ImportedInputId> ImportInputs(NetworkId networkId, const InputTensors& inputTensors,
+ MemorySource forceImportMemorySource = MemorySource::Undefined);
/// ImportOutputs separates the importing and mapping of OutputTensors from network execution.
/// Allowing for a set of OutputTensors to be imported and mapped once, but used in execution many times.
/// This function is not thread safe and must not be used while other threads are calling Execute().
- /// Only compatible with AsyncEnabled networks
- std::vector<ImportedOutputId> ImportOutputs(NetworkId networkId, const OutputTensors& outputTensors);
+ /// Only compatible with AsyncEnabled networks and aligned memory import
+ std::vector<ImportedOutputId> ImportOutputs(NetworkId networkId, const OutputTensors& outputTensors,
+ MemorySource forceImportMemorySource = MemorySource::Undefined);
/// Un-import and delete the imported InputTensor/s
/// This function is not thread safe and must not be used while other threads are calling Execute().
@@ -242,7 +243,9 @@ public:
/// Evaluates a network using input in inputTensors and outputs filled into outputTensors
Status EnqueueWorkload(NetworkId networkId,
const InputTensors& inputTensors,
- const OutputTensors& outputTensors);
+ const OutputTensors& outputTensors,
+ std::vector<ImportedInputId> preImportedInputIds = {},
+ std::vector<ImportedOutputId> preImportedOutputIds = {});
/// This is an experimental function.
/// Evaluates a network using input in inputTensors and outputs filled into outputTensors.
diff --git a/src/armnn/LoadedNetwork.cpp b/src/armnn/LoadedNetwork.cpp
index 1d1aae53a5..45891f7dc3 100644
--- a/src/armnn/LoadedNetwork.cpp
+++ b/src/armnn/LoadedNetwork.cpp
@@ -314,21 +314,22 @@ LoadedNetwork::LoadedNetwork(std::unique_ptr<IOptimizedNetwork> net,
{
if (layer->GetNumInputSlots() >= 1)
{
- unsigned int slotIndex = 0;
+ unsigned int inputSlotIndex = 0;
for (auto& inputSlot : layer->GetInputSlots())
{
if (inputSlot.GetOwningLayer().GetType() == LayerType::Input)
{
- m_InputWorkloadSlotPairs.push_back(
- std::make_pair(m_WorkloadQueue.size(), slotIndex));
+ auto inputLayer = PolymorphicDowncast<InputLayer*>(&inputSlot.GetOwningLayer());
+ m_InputWorkloadSlotPairs[inputLayer->GetBindingId()] =
+ std::make_pair(m_WorkloadQueue.size(), inputSlotIndex);
}
- ++slotIndex;
+ ++inputSlotIndex;
}
}
if (layer->GetNumOutputSlots() >= 1)
{
- unsigned int slotIndex = 0;
+ unsigned int outputSlotIndex = 0;
for (auto& outputSlot : layer->GetOutputSlots())
{
for (unsigned int i = 0; i < outputSlot.GetNumConnections(); i++)
@@ -337,12 +338,14 @@ LoadedNetwork::LoadedNetwork(std::unique_ptr<IOptimizedNetwork> net,
// Add its index within layer->GetOutputSlots() to m_OutputWorkloadSlotPairs
if (outputSlot.GetConnection(i)->GetOwningLayer().GetType() == LayerType::Output)
{
- m_OutputWorkloadSlotPairs.push_back(
- std::make_pair(m_WorkloadQueue.size(), slotIndex));
+ auto outputLayer = PolymorphicDowncast<OutputLayer*>(
+ &outputSlot.GetConnection(i)->GetOwningLayer());
+ m_OutputWorkloadSlotPairs[outputLayer->GetBindingId()] =
+ std::make_pair(m_WorkloadQueue.size(), outputSlotIndex);
continue;
}
}
- ++slotIndex;
+ ++outputSlotIndex;
}
}
m_WorkloadQueue.push_back(std::move(workload));
@@ -667,7 +670,9 @@ private:
}
Status LoadedNetwork::EnqueueWorkload(const InputTensors& inputTensors,
- const OutputTensors& outputTensors)
+ const OutputTensors& outputTensors,
+ std::vector<ImportedInputId> preImportedInputIds,
+ std::vector<ImportedOutputId> preImportedOutputIds)
{
const Graph& graph = m_OptimizedNetwork->pOptimizedNetworkImpl->GetGraph();
@@ -691,10 +696,26 @@ Status LoadedNetwork::EnqueueWorkload(const InputTensors& inputTensors,
ARMNN_SCOPED_PROFILING_EVENT(Compute::Undefined, "PrepareInputs");
m_InputQueue.clear();
m_InputQueue.reserve(graph.GetNumInputs());
+
for (const BindableLayer* inputLayer : graph.GetInputLayers())
{
- const TensorPin& pin = workloadData.GetInputTensorPin(inputLayer->GetBindingId());
- EnqueueInput(*inputLayer, pin.GetTensorHandle(), pin.GetTensorInfo());
+ if (preImportedInputIds.size() != m_PreImportedInputHandles.size())
+ {
+ throw InvalidArgumentException("Invalid number of preImportedInputIds");
+ }
+ auto layerBindingId = inputLayer->GetBindingId();
+ auto it = std::find_if(preImportedInputIds.begin(), preImportedInputIds.end(),
+ [=](auto preImportedInputId)
+ {
+ return m_PreImportedInputHandles[preImportedInputId].m_LayerBindingId == layerBindingId;
+ });
+
+ if (it == preImportedInputIds.end())
+ {
+ // InputTensorHandle is not imported yet, proceed to enqueue the input
+ const TensorPin& pin = workloadData.GetInputTensorPin(inputLayer->GetBindingId());
+ EnqueueInput(*inputLayer, pin.GetTensorHandle(), pin.GetTensorInfo());
+ }
}
}
@@ -703,12 +724,57 @@ Status LoadedNetwork::EnqueueWorkload(const InputTensors& inputTensors,
ARMNN_SCOPED_PROFILING_EVENT(Compute::Undefined, "PrepareOutputs");
m_OutputQueue.clear();
m_OutputQueue.reserve(graph.GetNumOutputs());
+
for (const BindableLayer* outputLayer : graph.GetOutputLayers())
{
+ if (preImportedOutputIds.size() != m_PreImportedOutputHandles.size())
+ {
+ throw InvalidArgumentException("Invalid number of preImportedOutputIds");
+ }
+ auto layerBindingId = outputLayer->GetBindingId();
+ auto it = std::find_if(preImportedOutputIds.begin(), preImportedOutputIds.end(),
+ [=](auto preImportedOutputId)
+ {
+ return m_PreImportedOutputHandles[preImportedOutputId].m_LayerBindingId == layerBindingId;
+ });
+
const TensorPin& pin = workloadData.GetOutputTensorPin(outputLayer->GetBindingId());
- EnqueueOutput(*outputLayer, pin.GetTensorHandle(), pin.GetTensorInfo());
+
+ if (it == preImportedOutputIds.end())
+ {
+ // OutputTensorHandle is not imported yet, proceed to enqueue the output
+ EnqueueOutput(*outputLayer, pin.GetTensorHandle(), pin.GetTensorInfo());
+ }
+ else
+ {
+ // Insert synchronization workload for the imported output
+ OutputQueueDescriptor outputQueueDescriptor;
+ WorkloadInfo info;
+
+ outputQueueDescriptor.m_Outputs.push_back(pin.GetTensorHandle());
+ info.m_OutputTensorInfos.push_back(pin.GetTensorInfo());
+
+ // Gets the output handler from the previous node.
+ const OutputHandler& outputHandler =
+ outputLayer->GetInputSlots()[0].GetConnectedOutputSlot()->GetOutputHandler();
+
+ const TensorInfo& inputTensorInfo = outputHandler.GetTensorInfo();
+ ITensorHandle* inputTensorHandle = outputHandler.GetData();
+ ARMNN_ASSERT_MSG(inputTensorHandle != nullptr, "Data should have been allocated.");
+ MemSyncQueueDescriptor syncDesc;
+ syncDesc.m_Inputs.push_back(inputTensorHandle);
+ info.m_InputTensorInfos.push_back(inputTensorInfo);
+ auto syncWorkload = std::make_unique<SyncMemGenericWorkload>(syncDesc, info);
+ ARMNN_ASSERT_MSG(syncWorkload, "No sync workload created");
+ m_OutputQueue.push_back(std::move(syncWorkload));
+ }
}
}
+ // Clear the pre-imported handles and reset the imported id counters
+ m_PreImportedInputHandles.clear();
+ m_PreImportedOutputHandles.clear();
+ m_CurImportedInputId = 0;
+ m_CurImportedOutputId = 0;
std::unique_ptr<TimelineUtilityMethods> timelineUtils =
TimelineUtilityMethods::GetTimelineUtils(m_ProfilingService);
@@ -1120,90 +1186,260 @@ const armnn::Tensor GetOutputTensor(const LayerBindingId layerId, const OutputTe
throw InvalidArgumentException("Output does not exist.");
}
-std::vector<ImportedInputId> LoadedNetwork::ImportInputs(const InputTensors& inputTensors)
+std::vector<ImportedInputId> LoadedNetwork::ImportInputs(const InputTensors& inputTensors,
+ MemorySource forceImportMemorySource)
{
- if (!m_NetworkProperties.m_ImportEnabled) // Try import the input tensor
- {
- throw MemoryImportException("ImportInputs: Memory Import failed, NetworkProperties.m_ImportEnabled");
- }
-
- std::vector<ImportedInputId> importedInputs;
- Graph& graph = m_OptimizedNetwork->pOptimizedNetworkImpl->GetGraph().TopologicalSort();
-
- for (auto inputTensor : inputTensors)
+ if (!m_NetworkProperties.m_ImportEnabled)
{
- auto layerBindingId = inputTensor.first;
- auto it = std::find_if(graph.GetInputLayers().begin(), graph.GetInputLayers().end(), [=](auto* layer)
+ // Cannot import if import is not enabled and forceImportMemorySource is undefined
+ if (forceImportMemorySource == MemorySource::Undefined)
{
- return layer->GetBindingId() == layerBindingId;
- });
-
- if (it == graph.GetInputLayers().end())
+ throw MemoryImportException("ImportInputs: Memory Import failed, NetworkProperties.m_ImportEnabled");
+ }
+ // If forceImportMemorySource is defined, try to import if the memory is aligned
+ if (inputTensors.size() != m_OptimizedNetwork->pOptimizedNetworkImpl->GetGraph().GetNumInputs())
{
- throw MemoryImportException(fmt::format("ImportInputs: Memory Import failed, unknown LayerBindingId: {}",
- layerBindingId));
+ throw MemoryImportException("ImportInputs: Force Import failed, incorrect number of tensors");
}
- const Layer* layer = *it;
- if (layer->GetType() != LayerType::Input)
+ std::vector<ImportedInputId> importedInputs;
+ Graph& graph = m_OptimizedNetwork->pOptimizedNetworkImpl->GetGraph().TopologicalSort();
+ for (auto inputTensor : inputTensors)
{
- throw InvalidArgumentException("ImportInputs: given layer not an InputLayer");
+ auto layerBindingId = inputTensor.first;
+ auto it = std::find_if(graph.GetInputLayers().begin(), graph.GetInputLayers().end(), [=](auto* layer)
+ {
+ return layer->GetBindingId() == layerBindingId;
+ });
+
+ if (it == graph.GetInputLayers().end())
+ {
+ throw MemoryImportException(fmt::format(
+ "ImportInputs: Memory Import failed, unknown LayerBindingId: {}", layerBindingId));
+ }
+
+ const Layer* layer = *it;
+ if (layer->GetType() != LayerType::Input)
+ {
+ throw InvalidArgumentException("ImportInputs: given layer not an InputLayer");
+ }
+ const OutputSlot& outputSlot = layer->GetOutputSlots()[0];
+ ITensorHandleFactory::FactoryId factoryId = outputSlot.GetTensorHandleFactoryId();
+ // Get matching import factory Id
+ ITensorHandleFactory::FactoryId importFactoryId =
+ m_TensorHandleFactoryRegistry.GetMatchingImportFactoryId(factoryId);
+ ITensorHandleFactory* importFactory =
+ m_TensorHandleFactoryRegistry.GetFactory(importFactoryId, forceImportMemorySource);
+ if (!importFactory)
+ {
+ throw MemoryImportException("ImportInputs: Force Import failed, cannot find matching Import Factory");
+ }
+
+ OutputHandler& handler = const_cast<OutputHandler&>(layer->GetOutputHandler(0));
+ handler.SetAllocatedData();
+ handler.CreateTensorHandles(*importFactory, false);
+ ITensorHandle* outputTensorHandle = handler.GetData();
+ std::unique_ptr<ITensorHandle> passThroughTensorHandle =
+ std::make_unique<ConstPassthroughTensorHandle>(inputTensor.second.GetInfo(),
+ inputTensor.second.GetMemoryArea());
+ // Check if the input memory can be imported
+ if (outputTensorHandle->CanBeImported(passThroughTensorHandle->Map(), forceImportMemorySource))
+ {
+ passThroughTensorHandle->Unmap();
+ if (outputTensorHandle->Import(passThroughTensorHandle->Map(), forceImportMemorySource))
+ {
+ passThroughTensorHandle->Unmap();
+ try
+ {
+ m_WorkloadQueue[m_InputWorkloadSlotPairs[layerBindingId].first].get()->ReplaceInputTensorHandle(
+ outputTensorHandle, m_InputWorkloadSlotPairs[layerBindingId].second);
+ importedInputs.push_back(m_CurImportedInputId++);
+ // For force import, we want OutputHandler to own the TensorHandle,
+ // so we do not move the TensorHandle to m_PreImportedInputHandles as in AsyncEnabled networks
+ ImportedTensorHandlePin importedTensorHandlePin{layerBindingId, nullptr};
+ m_PreImportedInputHandles.push_back(std::move(importedTensorHandlePin));
+ }
+ catch(armnn::UnimplementedException& e)
+ {
+ IgnoreUnused(e);
+ // Method not implemented; cannot use the imported tensor, fall back to allocated data
+ handler.UseAllocatedData();
+ }
+ }
+ }
+ else
+ {
+ // Cannot import, use allocated data
+ handler.UseAllocatedData();
+ }
+
}
- auto& backend = m_Backends.at(layer->GetBackendId());
- if (!HasCapability(BackendOptions::BackendOption{"PreImportIOTensors", true}, backend->GetCapabilities()))
+ return importedInputs;
+ }
+ else
+ {
+ // Import when import is enabled in the network properties
+ std::vector<ImportedInputId> importedInputs;
+ Graph& graph = m_OptimizedNetwork->pOptimizedNetworkImpl->GetGraph().TopologicalSort();
+
+ for (auto inputTensor : inputTensors)
{
- std::string er = backend->GetId();
- er += " does not have PreImportIOTensors capability";
- throw BackendCapabilityException(er);
- }
+ auto layerBindingId = inputTensor.first;
+ auto it = std::find_if(graph.GetInputLayers().begin(), graph.GetInputLayers().end(), [=](auto* layer)
+ {
+ return layer->GetBindingId() == layerBindingId;
+ });
+
+ if (it == graph.GetInputLayers().end())
+ {
+ throw MemoryImportException(fmt::format(
+ "ImportInputs: Memory Import failed, unknown LayerBindingId: {}", layerBindingId));
+ }
- const OutputSlot& outputSlot = layer->GetOutputSlots()[0];
+ const Layer* layer = *it;
+ if (layer->GetType() != LayerType::Input)
+ {
+ throw InvalidArgumentException("ImportInputs: given layer not an InputLayer");
+ }
- ITensorHandleFactory::FactoryId factoryId = outputSlot.GetTensorHandleFactoryId();
- const TensorInfo& tensorInfo = outputSlot.GetTensorInfo();
+ auto& backend = m_Backends.at(layer->GetBackendId());
+ if (!HasCapability(BackendOptions::BackendOption{"PreImportIOTensors", true}, backend->GetCapabilities()))
+ {
+ std::string er = backend->GetId();
+ er += " does not have PreImportIOTensors capability";
+ throw BackendCapabilityException(er);
+ }
- ITensorHandleFactory* handleFactory = m_TensorHandleFactoryRegistry.GetFactory(factoryId);
- ARMNN_ASSERT(handleFactory);
+ const OutputSlot& outputSlot = layer->GetOutputSlots()[0];
- ImportedTensorHandlePin importedTensorHandlePin{layerBindingId,
- handleFactory->CreateTensorHandle(tensorInfo, false)};
+ ITensorHandleFactory::FactoryId factoryId = outputSlot.GetTensorHandleFactoryId();
+ const TensorInfo& tensorInfo = outputSlot.GetTensorInfo();
- ITensorHandle* tensorHandle = importedTensorHandlePin.m_TensorHandle.get();
+ ITensorHandleFactory* handleFactory = m_TensorHandleFactoryRegistry.GetFactory(factoryId);
+ ARMNN_ASSERT(handleFactory);
- if (!CheckFlag(tensorHandle->GetImportFlags(), m_NetworkProperties.m_InputSource))
- {
- throw MemoryImportException(
- fmt::format("ImportInputs: Memory Import failed, backend: {} does not support importing from source {}"
- , factoryId, m_NetworkProperties.m_InputSource));
- }
+ ImportedTensorHandlePin importedTensorHandlePin{layerBindingId,
+ handleFactory->CreateTensorHandle(tensorInfo, false)};
+
+ ITensorHandle* tensorHandle = importedTensorHandlePin.m_TensorHandle.get();
+
+ if (!CheckFlag(tensorHandle->GetImportFlags(), m_NetworkProperties.m_InputSource))
+ {
+ throw MemoryImportException(
+ fmt::format("ImportInputs: Memory Import failed, backend: "
+ "{} does not support importing from source {}"
+ , factoryId, m_NetworkProperties.m_InputSource));
+ }
+
+ std::unique_ptr<ITensorHandle> passThroughTensorHandle =
+ std::make_unique<ConstPassthroughTensorHandle>(inputTensor.second.GetInfo(),
+ inputTensor.second.GetMemoryArea());
+
+ if (tensorHandle->Import(passThroughTensorHandle->Map(), m_NetworkProperties.m_InputSource))
+ {
+ importedInputs.push_back(m_CurImportedInputId++);
+ passThroughTensorHandle->Unmap();
+ }
+ else
+ {
+ passThroughTensorHandle->Unmap();
+ throw MemoryImportException("ImportInputs: Memory Import failed");
+ }
- std::unique_ptr<ITensorHandle> passThroughTensorHandle =
- std::make_unique<ConstPassthroughTensorHandle>(inputTensor.second.GetInfo(),
- inputTensor.second.GetMemoryArea());
+ m_PreImportedInputHandles.push_back(std::move(importedTensorHandlePin));
+ }
+ return importedInputs;
+ }
+}
- if (tensorHandle->Import(passThroughTensorHandle->Map(), m_NetworkProperties.m_InputSource))
+std::vector<ImportedOutputId> LoadedNetwork::ImportOutputs(const OutputTensors& outputTensors,
+ MemorySource forceImportMemorySource)
+{
+ if (!m_NetworkProperties.m_ExportEnabled)
+ {
+ // Cannot import if import is not enabled and forceImportMemorySource is undefined
+ if (forceImportMemorySource == MemorySource::Undefined)
{
- importedInputs.push_back(m_CurImportedInputId++);
- passThroughTensorHandle->Unmap();
+ throw MemoryImportException("ImportOutputs: Memory Import failed, NetworkProperties.m_ImportEnabled");
}
- else
+ // If forceImportMemorySource is defined, try to import if the memory is aligned
+ if (outputTensors.size() != m_OptimizedNetwork->pOptimizedNetworkImpl->GetGraph().GetNumOutputs())
{
- passThroughTensorHandle->Unmap();
- throw MemoryImportException("ImportInputs: Memory Import failed");
+ throw MemoryImportException("ImportOutputs: Force Import failed, incorrect number of tensors");
}
+ std::vector<ImportedInputId> importedOutputs;
+ Graph& graph = m_OptimizedNetwork->pOptimizedNetworkImpl->GetGraph().TopologicalSort();
+ for (auto outputTensor : outputTensors)
+ {
+ auto layerBindingId = outputTensor.first;
+ auto it = std::find_if(graph.GetOutputLayers().begin(), graph.GetOutputLayers().end(), [=](auto* layer)
+ {
+ return layer->GetBindingId() == layerBindingId;
+ });
- m_PreImportedInputHandles.push_back(std::move(importedTensorHandlePin));
- }
+ if (it == graph.GetOutputLayers().end())
+ {
+ throw MemoryImportException(fmt::format("ImportOutputs: Memory Import failed, "
+ "unknown LayerBindingId: {}",
+ layerBindingId));
+ }
- return importedInputs;
-}
+ const Layer* layer = *it;
+ if (layer->GetType() != LayerType::Output)
+ {
+ throw InvalidArgumentException("ImportOutputs: given layer not an OutputLayer");
+ }
-std::vector<ImportedOutputId> LoadedNetwork::ImportOutputs(const OutputTensors& outputTensors)
-{
- if (!m_NetworkProperties.m_ExportEnabled) // Try import the output tensor
- {
- throw MemoryImportException("ImportOutputs: Memory Import failed, NetworkProperties.m_ImportEnabled");
+ const OutputSlot* outputSlot = layer->GetInputSlots()[0].GetConnectedOutputSlot();
+ ITensorHandleFactory::FactoryId factoryId = outputSlot->GetTensorHandleFactoryId();
+ ITensorHandleFactory::FactoryId importFactoryId =
+ m_TensorHandleFactoryRegistry.GetMatchingImportFactoryId(factoryId);
+ ITensorHandleFactory* importFactory =
+ m_TensorHandleFactoryRegistry.GetFactory(importFactoryId, forceImportMemorySource);
+ if (!importFactory)
+ {
+ throw MemoryImportException("ImportOutputs: Force Import failed, cannot find matching Import Factory");
+ }
+
+ OutputHandler& outputHandler =
+ const_cast<OutputHandler&>(layer->GetInputSlots()[0].GetConnectedOutputSlot()->GetOutputHandler());
+ outputHandler.SetAllocatedData();
+ // Recreate the handle with the import factory before reading it
+ outputHandler.CreateTensorHandles(*importFactory, false);
+ ITensorHandle* inputTensorHandle = outputHandler.GetData();
+
+ // Check if the output memory can be imported
+ if (inputTensorHandle->CanBeImported(outputTensor.second.GetMemoryArea(), forceImportMemorySource))
+ {
+ if (inputTensorHandle->Import(outputTensor.second.GetMemoryArea(), forceImportMemorySource))
+ {
+ try
+ {
+ m_WorkloadQueue[m_OutputWorkloadSlotPairs[layerBindingId].first].get()->
+ ReplaceOutputTensorHandle(inputTensorHandle,
+ m_OutputWorkloadSlotPairs[layerBindingId].second);
+ importedOutputs.push_back(m_CurImportedOutputId++);
+ // For force import, we want OutputHandler to own the TensorHandle,
+ // so we do not move the TensorHandle to m_PreImportedOutputHandles as in AsyncEnabled networks
+ ImportedTensorHandlePin importedTensorHandlePin{layerBindingId, nullptr};
+ m_PreImportedOutputHandles.push_back(std::move(importedTensorHandlePin));
+ }
+ catch(armnn::UnimplementedException& e)
+ {
+ IgnoreUnused(e);
+ // Method not implemented; cannot use the imported tensor, fall back to allocated data
+ outputHandler.UseAllocatedData();
+ }
+ }
+ }
+ else
+ {
+ // Cannot import, use allocated memory
+ outputHandler.UseAllocatedData();
+ }
+ }
+ return importedOutputs;
}
std::vector<ImportedOutputId> importedOutputs;
diff --git a/src/armnn/LoadedNetwork.hpp b/src/armnn/LoadedNetwork.hpp
index 9de6307938..f637dec8eb 100644
--- a/src/armnn/LoadedNetwork.hpp
+++ b/src/armnn/LoadedNetwork.hpp
@@ -55,14 +55,18 @@ public:
TensorInfo GetInputTensorInfo(LayerBindingId layerId) const;
TensorInfo GetOutputTensorInfo(LayerBindingId layerId) const;
- std::vector<ImportedInputId> ImportInputs(const InputTensors& inputTensors);
- std::vector<ImportedOutputId> ImportOutputs(const OutputTensors& outputTensors);
+ std::vector<ImportedInputId> ImportInputs(const InputTensors& inputTensors,
+ MemorySource forceImportMemorySource = MemorySource::Undefined);
+ std::vector<ImportedOutputId> ImportOutputs(const OutputTensors& outputTensors,
+ MemorySource forceImportMemorySource = MemorySource::Undefined);
void ClearImportedInputs(const std::vector<ImportedInputId> inputIds);
void ClearImportedOutputs(const std::vector<ImportedOutputId> outputIds);
/// Single thread execution of the loaded network
- Status EnqueueWorkload(const InputTensors& inputTensors, const OutputTensors& outputTensors);
+ Status EnqueueWorkload(const InputTensors& inputTensors, const OutputTensors& outputTensors,
+ std::vector<ImportedInputId> preImportedInputIds = {},
+ std::vector<ImportedOutputId> preImportedOutputIds = {});
/// Thread safe execution of the loaded network
Status Execute(const InputTensors& inputTensors,
@@ -200,8 +204,9 @@ private:
// A set of vectors to record the workload queue indexes and their corresponding Input/Output Slot indexes
// which are connected to Inputs and Outputs for the network.
- std::vector<std::pair<unsigned int, unsigned int>> m_InputWorkloadSlotPairs;
- std::vector<std::pair<unsigned int, unsigned int>> m_OutputWorkloadSlotPairs;
+ std::unordered_map<LayerBindingId, std::pair<unsigned int, unsigned int>> m_InputWorkloadSlotPairs;
+ std::unordered_map<LayerBindingId, std::pair<unsigned int, unsigned int>> m_OutputWorkloadSlotPairs;
+
};
}
diff --git a/src/armnn/OutputHandler.hpp b/src/armnn/OutputHandler.hpp
index 41a49af031..3fd2519ed5 100644
--- a/src/armnn/OutputHandler.hpp
+++ b/src/armnn/OutputHandler.hpp
@@ -50,10 +50,15 @@ public:
void SetData(std::unique_ptr<ITensorHandle> data) { m_TensorHandle = std::move(data); }
+ void SetAllocatedData() { m_AllocatedTensorHandle = std::move(m_TensorHandle); }
+
+ void UseAllocatedData() { m_TensorHandle = std::move(m_AllocatedTensorHandle); }
+
/// @brief Returns true if SetTensorInfo() has been called at least once on this.
bool IsTensorInfoSet() const { return m_bTensorInfoSet; }
private:
std::unique_ptr<ITensorHandle> m_TensorHandle;
+ std::unique_ptr<ITensorHandle> m_AllocatedTensorHandle;
TensorInfo m_TensorInfo;
bool m_bTensorInfoSet = false;
};
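The two new functions form a stash/restore pair: SetAllocatedData() parks the currently allocated handle before CreateTensorHandles() overwrites m_TensorHandle with an import-capable one, and UseAllocatedData() restores it if the import cannot be used. A self-contained toy model of that behaviour (illustrative only; armnn stores ITensorHandle, not int):

    #include <cassert>
    #include <memory>
    #include <utility>

    // Toy model of OutputHandler's new stash/restore behaviour.
    struct ToyHandler
    {
        std::unique_ptr<int> m_TensorHandle = std::make_unique<int>(1); // "allocated"
        std::unique_ptr<int> m_AllocatedTensorHandle;

        void SetAllocatedData() { m_AllocatedTensorHandle = std::move(m_TensorHandle); }
        void UseAllocatedData() { m_TensorHandle = std::move(m_AllocatedTensorHandle); }
    };

    int main()
    {
        ToyHandler h;
        h.SetAllocatedData();                         // stash before creating an import handle
        h.m_TensorHandle = std::make_unique<int>(2);  // stand-in for CreateTensorHandles()
        h.UseAllocatedData();                         // import failed: restore the original
        assert(*h.m_TensorHandle == 1);
    }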
diff --git a/src/armnn/Runtime.cpp b/src/armnn/Runtime.cpp
index 2752e7209c..95fb8a3abb 100644
--- a/src/armnn/Runtime.cpp
+++ b/src/armnn/Runtime.cpp
@@ -77,14 +77,16 @@ armnn::TensorInfo IRuntime::GetOutputTensorInfo(NetworkId networkId, LayerBindin
return pRuntimeImpl->GetOutputTensorInfo(networkId, layerId);
}
-std::vector<ImportedInputId> IRuntime::ImportInputs(NetworkId networkId, const InputTensors& inputTensors)
+std::vector<ImportedInputId> IRuntime::ImportInputs(NetworkId networkId, const InputTensors& inputTensors,
+ MemorySource forceImportMemorySource)
{
- return pRuntimeImpl->ImportInputs(networkId, inputTensors);
+ return pRuntimeImpl->ImportInputs(networkId, inputTensors, forceImportMemorySource);
}
-std::vector<ImportedOutputId> IRuntime::ImportOutputs(NetworkId networkId, const OutputTensors& outputTensors)
+std::vector<ImportedOutputId> IRuntime::ImportOutputs(NetworkId networkId, const OutputTensors& outputTensors,
+ MemorySource forceImportMemorySource)
{
- return pRuntimeImpl->ImportOutputs(networkId, outputTensors);
+ return pRuntimeImpl->ImportOutputs(networkId, outputTensors, forceImportMemorySource);
}
void IRuntime::ClearImportedInputs(NetworkId networkId, const std::vector<ImportedInputId> inputIds)
@@ -98,9 +100,12 @@ void IRuntime::ClearImportedOutputs(NetworkId networkId, const std::vector<Impor
Status IRuntime::EnqueueWorkload(NetworkId networkId,
const InputTensors& inputTensors,
- const OutputTensors& outputTensors)
+ const OutputTensors& outputTensors,
+ std::vector<ImportedInputId> preImportedInputIds,
+ std::vector<ImportedOutputId> preImportedOutputIds)
{
- return pRuntimeImpl->EnqueueWorkload(networkId, inputTensors, outputTensors);
+ return pRuntimeImpl->EnqueueWorkload(networkId, inputTensors, outputTensors,
+ preImportedInputIds, preImportedOutputIds);
}
Status IRuntime::Execute(IWorkingMemHandle& workingMemHandle,
@@ -566,14 +571,16 @@ TensorInfo RuntimeImpl::GetOutputTensorInfo(NetworkId networkId, LayerBindingId
return GetLoadedNetworkPtr(networkId)->GetOutputTensorInfo(layerId);
}
-std::vector<ImportedInputId> RuntimeImpl::ImportInputs(NetworkId networkId, const InputTensors& inputTensors)
+std::vector<ImportedInputId> RuntimeImpl::ImportInputs(NetworkId networkId, const InputTensors& inputTensors,
+ MemorySource forceImportMemorySource)
{
- return GetLoadedNetworkPtr(networkId)->ImportInputs(inputTensors);
+ return GetLoadedNetworkPtr(networkId)->ImportInputs(inputTensors, forceImportMemorySource);
}
-std::vector<ImportedOutputId> RuntimeImpl::ImportOutputs(NetworkId networkId, const OutputTensors& outputTensors)
+std::vector<ImportedOutputId> RuntimeImpl::ImportOutputs(NetworkId networkId, const OutputTensors& outputTensors,
+ MemorySource forceImportMemorySource)
{
- return GetLoadedNetworkPtr(networkId)->ImportOutputs(outputTensors);
+ return GetLoadedNetworkPtr(networkId)->ImportOutputs(outputTensors, forceImportMemorySource);
}
void RuntimeImpl::ClearImportedInputs(NetworkId networkId, const std::vector<ImportedInputId> inputIds)
@@ -587,7 +594,9 @@ void RuntimeImpl::ClearImportedOutputs(NetworkId networkId, const std::vector<Im
Status RuntimeImpl::EnqueueWorkload(NetworkId networkId,
const InputTensors& inputTensors,
- const OutputTensors& outputTensors)
+ const OutputTensors& outputTensors,
+ std::vector<ImportedInputId> preImportedInputIds,
+ std::vector<ImportedOutputId> preImportedOutputIds)
{
const auto startTime = armnn::GetTimeNow();
@@ -617,7 +626,8 @@ Status RuntimeImpl::EnqueueWorkload(NetworkId networkId,
}
lastId=networkId;
- auto status = loadedNetwork->EnqueueWorkload(inputTensors, outputTensors);
+ auto status = loadedNetwork->EnqueueWorkload(inputTensors, outputTensors,
+ preImportedInputIds, preImportedOutputIds);
ARMNN_LOG(info) << "Execution time: " << std::setprecision(2)
<< std::fixed << armnn::GetTimeDuration(startTime).count() << " ms.";
diff --git a/src/armnn/Runtime.hpp b/src/armnn/Runtime.hpp
index 4052bb6d3a..bd37013ad0 100644
--- a/src/armnn/Runtime.hpp
+++ b/src/armnn/Runtime.hpp
@@ -55,8 +55,10 @@ public:
armnn::TensorInfo GetInputTensorInfo(NetworkId networkId, LayerBindingId layerId) const;
armnn::TensorInfo GetOutputTensorInfo(NetworkId networkId, LayerBindingId layerId) const;
- std::vector<ImportedInputId> ImportInputs(NetworkId networkId, const InputTensors& inputTensors);
- std::vector<ImportedOutputId> ImportOutputs(NetworkId networkId, const OutputTensors& outputTensors);
+ std::vector<ImportedInputId> ImportInputs(NetworkId networkId, const InputTensors& inputTensors,
+ MemorySource forceImportMemorySource = MemorySource::Undefined);
+ std::vector<ImportedOutputId> ImportOutputs(NetworkId networkId, const OutputTensors& outputTensors,
+ MemorySource forceImportMemorySource = MemorySource::Undefined);
void ClearImportedInputs(NetworkId networkId, const std::vector<ImportedInputId> inputIds);
void ClearImportedOutputs(NetworkId networkId, const std::vector<ImportedOutputId> outputIds);
@@ -64,7 +66,9 @@ public:
// Evaluates network using input in inputTensors, outputs filled into outputTensors.
Status EnqueueWorkload(NetworkId networkId,
const InputTensors& inputTensors,
- const OutputTensors& outputTensors);
+ const OutputTensors& outputTensors,
+ std::vector<ImportedInputId> preImportedInputIds = {},
+ std::vector<ImportedOutputId> preImportedOutputIds = {});
/// This is an experimental function.
/// Evaluates a network using input in inputTensors and outputs filled into outputTensors.
diff --git a/src/backends/cl/ClBackend.cpp b/src/backends/cl/ClBackend.cpp
index 8abb16ccca..0fc5da78d1 100644
--- a/src/backends/cl/ClBackend.cpp
+++ b/src/backends/cl/ClBackend.cpp
@@ -133,6 +133,15 @@ IBackendInternal::IWorkloadFactoryPtr ClBackend::CreateWorkloadFactory(
MemorySourceFlags inputFlags,
MemorySourceFlags outputFlags) const
{
+ // To allow force import if inputFlags/outputFlags are Undefined, set them to Malloc
+ if (inputFlags == static_cast<MemorySourceFlags>(MemorySource::Undefined))
+ {
+ inputFlags = static_cast<MemorySourceFlags>(MemorySource::Malloc);
+ }
+ if (outputFlags == static_cast<MemorySourceFlags>(MemorySource::Undefined))
+ {
+ outputFlags = static_cast<MemorySourceFlags>(MemorySource::Malloc);
+ }
std::shared_ptr<ClMemoryManager> memoryManager;
if (m_UsingCustomAllocator)
{
@@ -193,6 +202,15 @@ void ClBackend::RegisterTensorHandleFactories(TensorHandleFactoryRegistry& regis
MemorySourceFlags inputFlags,
MemorySourceFlags outputFlags)
{
+ // To allow force import if inputFlags/outputFlags are Undefined, set them to Malloc
+ if (inputFlags == static_cast<MemorySourceFlags>(MemorySource::Undefined))
+ {
+ inputFlags = static_cast<MemorySourceFlags>(MemorySource::Malloc);
+ }
+ if (outputFlags == static_cast<MemorySourceFlags>(MemorySource::Undefined))
+ {
+ outputFlags = static_cast<MemorySourceFlags>(MemorySource::Malloc);
+ }
std::shared_ptr<ClMemoryManager> memoryManager;
if (m_UsingCustomAllocator)
{
diff --git a/src/backends/cl/ClImportTensorHandle.hpp b/src/backends/cl/ClImportTensorHandle.hpp
index a236a70d7c..54710d8135 100644
--- a/src/backends/cl/ClImportTensorHandle.hpp
+++ b/src/backends/cl/ClImportTensorHandle.hpp
@@ -205,7 +205,11 @@ public:
// We do this to match the behaviour of the Import function later on.
auto cachelineAlignment =
arm_compute::CLKernelLibrary::get().get_device().getInfo<CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE>();
- auto roundedSize = cachelineAlignment + totalBytes - (totalBytes % cachelineAlignment);
+ auto roundedSize = totalBytes;
+ if (totalBytes % cachelineAlignment != 0)
+ {
+ roundedSize = cachelineAlignment + totalBytes - (totalBytes % cachelineAlignment);
+ }
cl_int error = CL_SUCCESS;
cl_mem buffer;
@@ -252,7 +256,11 @@ private:
// This does not change the size of the buffer, only the size of the mapping the buffer is mapped to
auto cachelineAlignment =
arm_compute::CLKernelLibrary::get().get_device().getInfo<CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE>();
- auto roundedSize = cachelineAlignment + totalBytes - (totalBytes % cachelineAlignment);
+ auto roundedSize = totalBytes;
+ if (totalBytes % cachelineAlignment != 0)
+ {
+ roundedSize = cachelineAlignment + totalBytes - (totalBytes % cachelineAlignment);
+ }
cl_int error = CL_SUCCESS;
cl_mem buffer;
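The added guard makes the rounding a no-op when totalBytes is already a multiple of the cache-line size; previously such sizes were padded by a full extra cache line. A standalone check of the arithmetic (64-byte cache lines assumed for the example):

    #include <cassert>
    #include <cstddef>

    // Round size up to the next multiple of alignment, as in the fixed code above.
    std::size_t RoundToCacheLine(std::size_t totalBytes, std::size_t cachelineAlignment)
    {
        std::size_t roundedSize = totalBytes;
        if (totalBytes % cachelineAlignment != 0)
        {
            roundedSize = cachelineAlignment + totalBytes - (totalBytes % cachelineAlignment);
        }
        return roundedSize;
    }

    int main()
    {
        assert(RoundToCacheLine(100, 64) == 128); // padded up to the next cache line
        assert(RoundToCacheLine(128, 64) == 128); // already aligned: unchanged (old code gave 192)
    }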
diff --git a/src/backends/cl/test/ClCreateWorkloadTests.cpp b/src/backends/cl/test/ClCreateWorkloadTests.cpp
index d8b2d4f786..4a28205ade 100644
--- a/src/backends/cl/test/ClCreateWorkloadTests.cpp
+++ b/src/backends/cl/test/ClCreateWorkloadTests.cpp
@@ -11,11 +11,14 @@
#include <armnn/utility/PolymorphicDowncast.hpp>
#include <armnn/backends/MemCopyWorkload.hpp>
#include <armnnTestUtils/TensorCopyUtils.hpp>
+#include <TensorHelpers.hpp>
#include <armnnTestUtils/WorkloadTestUtils.hpp>
#include <aclCommon/test/CreateWorkloadClNeon.hpp>
#include <aclCommon/ArmComputeTensorUtils.hpp>
+#include <cl/ClImportTensorHandle.hpp>
+#include <cl/ClImportTensorHandleFactory.hpp>
#include <cl/ClTensorHandle.hpp>
#include <cl/ClWorkloadFactory.hpp>
#include <cl/workloads/ClWorkloads.hpp>
@@ -355,6 +358,64 @@ TEST_CASE_FIXTURE(ClContextControlFixture, "CreateConvolution2dFastMathEnabledWo
ARMNN_ASSERT(conv2dWorkload->GetConvolutionMethod() == arm_compute::ConvolutionMethod::WINOGRAD);
}
+TEST_CASE_FIXTURE(ClContextControlFixture, "ClReplaceInputOutputConvolution2dWorkload")
+{
+ // Create Convolution2dWorkload with ClTensorHandle input and output
+ // Then replace the input and output with ClImportTensorHandle
+ Graph graph;
+ ClWorkloadFactory factory =
+ ClWorkloadFactoryHelper::GetFactory(ClWorkloadFactoryHelper::GetMemoryManager());
+
+ auto workload =
+ CreateConvolution2dWorkloadTest<ClConvolution2dWorkload, DataType::Float32>(factory,
+ graph,
+ DataLayout::NHWC);
+
+ TensorShape inputShape = std::initializer_list<unsigned int>({2, 8, 16, 3});
+ TensorShape outputShape = std::initializer_list<unsigned int>({2, 2, 10, 2});
+
+ // Checks that outputs and inputs are as we expect them (see definition of CreateConvolution2dWorkloadTest).
+ Convolution2dQueueDescriptor queueDescriptor = workload->GetData();
+ auto inputHandle = PolymorphicDowncast<ITensorHandle*>(queueDescriptor.m_Inputs[0]);
+ auto outputHandle = PolymorphicDowncast<ITensorHandle*>(queueDescriptor.m_Outputs[0]);
+ CHECK((inputHandle->GetShape() == inputShape));
+ CHECK((outputHandle->GetShape() == outputShape));
+ // The input and output handles are created correctly as ClTensorHandle
+ CHECK((dynamic_cast<ClTensorHandle*>(inputHandle) != nullptr));
+ CHECK((dynamic_cast<ClTensorHandle*>(outputHandle) != nullptr));
+
+ // Replace with ImportTensorHandle
+ ClImportTensorHandleFactory importFactory(static_cast<MemorySourceFlags>(MemorySource::Malloc),
+ static_cast<MemorySourceFlags>(MemorySource::Malloc));
+
+ TensorInfo inputInfo({ 2, 8, 16, 3 }, DataType::Float32);
+ TensorInfo outputInfo({ 2, 2, 10, 2 }, DataType::Float32);
+
+ // create TensorHandle for memory import
+ auto inputImportHandle = importFactory.CreateTensorHandle(inputInfo);
+ auto outputImportHandle = importFactory.CreateTensorHandle(outputInfo);
+
+ // Calling ReplaceInputTensorHandle and ReplaceOutputTensorHandle does not throw an exception
+ // because the Reconfigure function is implemented
+ workload->ReplaceInputTensorHandle(inputImportHandle.get(), 0);
+ workload->ReplaceOutputTensorHandle(outputImportHandle.get(), 0);
+
+ // Verify the handles were replaced with the import handles and carry the correct information
+ queueDescriptor = workload->GetData();
+ auto replacedInputHandle = PolymorphicDowncast<ITensorHandle*>(queueDescriptor.m_Inputs[0]);
+ auto replacedOutputHandle = PolymorphicDowncast<ITensorHandle*>(queueDescriptor.m_Outputs[0]);
+ CHECK((replacedInputHandle->GetShape() == inputShape));
+ CHECK((replacedOutputHandle->GetShape() == outputShape));
+
+ CHECK((inputImportHandle.get() == replacedInputHandle));
+ CHECK((outputImportHandle.get() == replacedOutputHandle));
+
+ CHECK((dynamic_cast<ClTensorHandle*>(replacedInputHandle) == nullptr));
+ CHECK((dynamic_cast<ClImportTensorHandle*>(replacedInputHandle) != nullptr));
+ CHECK((dynamic_cast<ClTensorHandle*>(replacedOutputHandle) == nullptr));
+ CHECK((dynamic_cast<ClImportTensorHandle*>(replacedOutputHandle) != nullptr));
+}
+
TEST_CASE_FIXTURE(ClContextControlFixture, "CreateConvolution2dClCompiledContextWorkload")
{
using namespace armnn;
diff --git a/src/backends/cl/test/ClImportTensorHandleTests.cpp b/src/backends/cl/test/ClImportTensorHandleTests.cpp
index 3d702642aa..161765484d 100644
--- a/src/backends/cl/test/ClImportTensorHandleTests.cpp
+++ b/src/backends/cl/test/ClImportTensorHandleTests.cpp
@@ -274,4 +274,157 @@ TEST_CASE("ClCanBeImportedAlignedMemory")
// we can be confident that it will be successfully imported. All other cases will need to be handled by the user.
}
+TEST_CASE_FIXTURE(ClContextControlFixture, "ClForceImportConv2dEndToEnd")
+{
+ // Create runtime in which test will run
+ IRuntime::CreationOptions options;
+ IRuntimePtr runtime(armnn::IRuntime::Create(options));
+
+ // build up the structure of the network
+ INetworkPtr network(INetwork::Create());
+
+ armnn::TensorInfo inputInfo({ 1, 3, 4, 1 }, DataType::Float32);
+ armnn::TensorInfo kernelInfo({ 1, 3, 3, 1 }, DataType::Float32);
+ armnn::TensorInfo outputInfo({ 1, 3, 4, 1 }, DataType::Float32);
+
+ kernelInfo.SetConstant(true);
+
+ std::vector<float> kernel =
+ {
+ 4, 5, 6,
+ 0, 0, 0,
+ 3, 2, 1
+ };
+
+ const std::vector<float> expectedOutput =
+ {
+ 23, 41, 33, 21,
+ 44, 65, 76, 52,
+ 82, 85, 79, 42
+ };
+
+ unsigned int numElements = inputInfo.GetNumElements();
+ size_t totalBytes = numElements * sizeof(float);
+
+ IConnectableLayer* const inputLayer = network->AddInputLayer(0, "input");
+ ARMNN_ASSERT(inputLayer);
+
+ armnn::ConstTensor weights(kernelInfo, kernel);
+
+ armnn::Convolution2dDescriptor convDesc2d;
+ convDesc2d.m_StrideX = 1;
+ convDesc2d.m_StrideY = 1;
+ convDesc2d.m_PadLeft = 1;
+ convDesc2d.m_PadRight = 1;
+ convDesc2d.m_PadTop = 1;
+ convDesc2d.m_PadBottom = 1;
+ convDesc2d.m_DataLayout = DataLayout::NHWC;
+ armnn::IConnectableLayer* const convLayer = network->AddConvolution2dLayer(convDesc2d,
+ weights,
+ armnn::EmptyOptional(),
+ "conv");
+ ARMNN_ASSERT(convLayer);
+
+ inputLayer->GetOutputSlot(0).Connect(convLayer->GetInputSlot(0));
+ inputLayer->GetOutputSlot(0).SetTensorInfo(inputInfo);
+
+ IConnectableLayer* output = network->AddOutputLayer(0, "output");
+ convLayer->GetOutputSlot(0).Connect(output->GetInputSlot(0));
+ convLayer->GetOutputSlot(0).SetTensorInfo(outputInfo);
+
+ // Optimize the network
+ OptimizerOptions optOptions;
+ optOptions.m_ImportEnabled = false;
+ std::vector<armnn::BackendId> backends = {armnn::Compute::GpuAcc};
+ IOptimizedNetworkPtr optNet = Optimize(*network, backends, runtime->GetDeviceSpec(), optOptions);
+ CHECK(optNet);
+
+ // Loads it into the runtime.
+ NetworkId netId;
+ std::string ignoredErrorMessage;
+ // Import is left disabled in the network properties; it is forced per call below
+ INetworkProperties networkProperties(false, MemorySource::Undefined, MemorySource::Undefined);
+ runtime->LoadNetwork(netId, std::move(optNet), ignoredErrorMessage, networkProperties);
+
+ // Creates structures for input & output
+ const size_t alignment =
+ arm_compute::CLKernelLibrary::get().get_device().getInfo<CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE>();
+ size_t space = totalBytes + alignment + alignment;
+ auto inputData = std::make_unique<uint8_t[]>(space);
+ void* alignedInputPtr = inputData.get();
+ CHECK(std::align(alignment, totalBytes, alignedInputPtr, space));
+
+ // Fill the aligned input buffer
+ auto* inputPtr = reinterpret_cast<float*>(alignedInputPtr);
+ inputPtr[0] = 1;
+ inputPtr[1] = 5;
+ inputPtr[2] = 2;
+ inputPtr[3] = 3;
+ inputPtr[4] = 8;
+ inputPtr[5] = 7;
+ inputPtr[6] = 3;
+ inputPtr[7] = 6;
+ inputPtr[8] = 3;
+ inputPtr[9] = 3;
+ inputPtr[10] = 9;
+ inputPtr[11] = 1;
+
+ auto outputData = std::make_unique<uint8_t[]>(space);
+ void* alignedOutputPtr = outputData.get();
+ CHECK(std::align(alignment, totalBytes, alignedOutputPtr, space));
+ auto* outputPtr = reinterpret_cast<float*>(alignedOutputPtr);
+ std::fill_n(outputPtr, numElements, -10.0f);
+
+ TensorInfo inputTensorInfo = runtime->GetInputTensorInfo(netId, 0);
+ inputTensorInfo.SetConstant(true);
+ InputTensors inputTensors
+ {
+ {0,armnn::ConstTensor(inputTensorInfo, alignedInputPtr)},
+ };
+ OutputTensors outputTensors
+ {
+ {0,armnn::Tensor(runtime->GetOutputTensorInfo(netId, 0), alignedOutputPtr)}
+ };
+
+ runtime->GetProfiler(netId)->EnableProfiling(true);
+
+ INFO("Run ImportInputs");
+ std::vector<ImportedInputId> importedInputIds =
+ runtime->ImportInputs(netId, inputTensors, MemorySource::Malloc);
+ std::vector<ImportedOutputId> importedOutputIds =
+ runtime->ImportOutputs(netId, outputTensors, MemorySource::Malloc);
+
+ // Do the inference
+ runtime->EnqueueWorkload(netId, inputTensors, outputTensors, importedInputIds, importedOutputIds);
+
+ // Retrieve the Profiler.Print() output to get the workload execution
+ ProfilerManager& profilerManager = armnn::ProfilerManager::GetInstance();
+ std::stringstream ss;
+ profilerManager.GetProfiler()->Print(ss);;
+ std::string dump = ss.str();
+
+ // Contains Convolution2dWorkload
+ std::size_t found = dump.find("Convolution2dWorkload");
+ CHECK(found != std::string::npos);
+
+ // Contains SyncMemGeneric
+ found = dump.find("SyncMemGeneric");
+ CHECK(found != std::string::npos);
+
+ // Does not contain CopyMemGeneric
+ found = dump.find("CopyMemGeneric");
+ CHECK(found == std::string::npos);
+
+ runtime->UnloadNetwork(netId);
+
+ // Retrieve the output buffer
+ auto* outputResult = reinterpret_cast<float*>(alignedOutputPtr);
+ CHECK(outputResult);
+
+ // Check the output is correct
+ CHECK(std::equal(outputResult, outputResult + numElements, expectedOutput.begin(), expectedOutput.end()));
+}
+
}
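The test's aligned-buffer setup is a reusable pattern: over-allocate by two alignments and let std::align find the first aligned address. A standalone version (64-byte alignment assumed in place of the CL cache-line query):

    #include <cassert>
    #include <cstdint>
    #include <memory>

    int main()
    {
        const std::size_t alignment = 64;   // stand-in for CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE
        const std::size_t totalBytes = 48;  // payload size
        std::size_t space = totalBytes + alignment + alignment; // over-allocate
        auto data = std::make_unique<uint8_t[]>(space);
        void* aligned = data.get();
        // std::align advances 'aligned' to the first suitably aligned address in the buffer.
        assert(std::align(alignment, totalBytes, aligned, space) != nullptr);
        assert(reinterpret_cast<std::uintptr_t>(aligned) % alignment == 0);
    }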
diff --git a/src/backends/cl/workloads/ClBaseWorkload.hpp b/src/backends/cl/workloads/ClBaseWorkload.hpp
index e74fc84f4f..03417e33ae 100644
--- a/src/backends/cl/workloads/ClBaseWorkload.hpp
+++ b/src/backends/cl/workloads/ClBaseWorkload.hpp
@@ -20,17 +20,38 @@ public:
// Replace input tensor handle with the given TensorHandle and call Reconfigure()
void ReplaceInputTensorHandle(ITensorHandle* tensorHandle, unsigned int slot) override
{
+ ITensorHandle* backupHandle = this->m_Data.m_Inputs[slot];
this->m_Data.m_Inputs[slot] = tensorHandle;
- Reconfigure();
+ try
+ {
+ Reconfigure();
+ }
+ catch(armnn::UnimplementedException& e)
+ {
+ // Cannot reconfigure, revert the slot back and throw the exception.
+ this->m_Data.m_Inputs[slot] = backupHandle;
+ throw e;
+ }
}
// Replace output tensor handle with the given TensorHandle and call Reconfigure()
void ReplaceOutputTensorHandle(ITensorHandle* tensorHandle, unsigned int slot) override
{
+ ITensorHandle* backupHandle = this->m_Data.m_Outputs[slot];
this->m_Data.m_Outputs[slot] = tensorHandle;
- Reconfigure();
+ try
+ {
+ Reconfigure();
+ }
+ catch(armnn::UnimplementedException& e)
+ {
+ // Cannot reconfigure, revert the slot back and throw the exception.
+ this->m_Data.m_Inputs[slot] = backupHandle;
+ throw e;
+ }
}
+protected:
// Reconfigure the workload configuration. Throw armnn::UnimplementedException by default.
virtual void Reconfigure()
{
diff --git a/src/backends/cl/workloads/ClConvolution2dWorkload.cpp b/src/backends/cl/workloads/ClConvolution2dWorkload.cpp
index 705e92d307..cdfa885f67 100644
--- a/src/backends/cl/workloads/ClConvolution2dWorkload.cpp
+++ b/src/backends/cl/workloads/ClConvolution2dWorkload.cpp
@@ -90,6 +90,10 @@ ClConvolution2dWorkload::ClConvolution2dWorkload(const Convolution2dQueueDescrip
arm_compute::ICLTensor& input = static_cast<IClTensorHandle*>(m_Data.m_Inputs[0])->GetTensor();
arm_compute::ICLTensor& output = static_cast<IClTensorHandle*>(m_Data.m_Outputs[0])->GetTensor();
+ // Create proxy tensors and point them at the initial input and output tensors
+ m_InputProxy = std::make_unique<ICLTensorProxy>(&input);
+ m_OutputProxy = std::make_unique<ICLTensorProxy>(&output);
+
arm_compute::DataLayout aclDataLayout = ConvertDataLayout(m_Data.m_Parameters.m_DataLayout);
input.info()->set_data_layout(aclDataLayout);
output.info()->set_data_layout(aclDataLayout);
@@ -101,10 +105,10 @@ ClConvolution2dWorkload::ClConvolution2dWorkload(const Convolution2dQueueDescrip
{
ARMNN_SCOPED_PROFILING_EVENT(Compute::Undefined, "ClConvolution2dWorkload_configure");
m_ConvolutionLayer.configure(clCompileContext,
- &input,
+ m_InputProxy.get(),
m_KernelTensor.get(),
m_BiasTensor.get(),
- &output,
+ m_OutputProxy.get(),
padStrideInfo,
arm_compute::WeightsInfo(),
aclDilationInfo,
@@ -174,4 +178,13 @@ void ClConvolution2dWorkload::FreeUnusedTensors()
FreeTensorIfUnused(m_BiasTensor);
}
+void ClConvolution2dWorkload::Reconfigure()
+{
+ ARMNN_SCOPED_PROFILING_EVENT(Compute::Undefined, "ClConvolution2dWorkload_Reconfigure");
+ arm_compute::ICLTensor& input = static_cast<IClTensorHandle*>(m_Data.m_Inputs[0])->GetTensor();
+ arm_compute::ICLTensor& output = static_cast<IClTensorHandle*>(m_Data.m_Outputs[0])->GetTensor();
+ m_InputProxy->set(&input);
+ m_OutputProxy->set(&output);
+}
+
} //namespace armnn
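Routing the ACL function through ICLTensorProxy is what makes Reconfigure() cheap: CLConvolutionLayer is configured once against the proxies, and Reconfigure() merely repoints them at the replacement tensors instead of rebuilding the layer. A toy model of that indirection (illustrative only; not armnn's actual ICLTensorProxy):

    #include <cassert>

    // Toy illustration of the proxy indirection: consumers hold the proxy,
    // so retargeting the proxy retargets every consumer at once.
    struct Tensor { int id; };

    struct TensorProxy
    {
        explicit TensorProxy(Tensor* delegate) : m_Delegate(delegate) {}
        void set(Tensor* delegate) { m_Delegate = delegate; }
        Tensor* get() const { return m_Delegate; }
    private:
        Tensor* m_Delegate;
    };

    int main()
    {
        Tensor original{0};
        Tensor imported{1};
        TensorProxy proxy(&original);   // a consumer is "configured" against the proxy once
        proxy.set(&imported);           // Reconfigure(): repoint without reconfiguring
        assert(proxy.get()->id == 1);
    }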
diff --git a/src/backends/cl/workloads/ClConvolution2dWorkload.hpp b/src/backends/cl/workloads/ClConvolution2dWorkload.hpp
index 8a4599df47..891d5096cd 100644
--- a/src/backends/cl/workloads/ClConvolution2dWorkload.hpp
+++ b/src/backends/cl/workloads/ClConvolution2dWorkload.hpp
@@ -13,6 +13,8 @@
#include <arm_compute/runtime/CL/functions/CLConvolutionLayer.h>
#include <arm_compute/runtime/MemoryManagerOnDemand.h>
+#include <cl/ICLTensorProxy.hpp>
+
#include <memory>
namespace armnn
@@ -38,6 +40,9 @@ public:
arm_compute::ConvolutionMethod GetConvolutionMethod() const;
+protected:
+ void Reconfigure() override;
+
private:
mutable arm_compute::CLConvolutionLayer m_ConvolutionLayer;
@@ -47,6 +52,9 @@ private:
arm_compute::ConvolutionMethod m_ConvolutionMethod;
void FreeUnusedTensors();
+
+ std::unique_ptr<ICLTensorProxy> m_InputProxy;
+ std::unique_ptr<ICLTensorProxy> m_OutputProxy;
};
} //namespace armnn
diff --git a/src/backends/neon/workloads/NeonBaseWorkload.hpp b/src/backends/neon/workloads/NeonBaseWorkload.hpp
index a92f35a173..63f3539164 100644
--- a/src/backends/neon/workloads/NeonBaseWorkload.hpp
+++ b/src/backends/neon/workloads/NeonBaseWorkload.hpp
@@ -20,17 +20,38 @@ public:
// Replace input tensor handle with the given TensorHandle and call Reconfigure()
void ReplaceInputTensorHandle(ITensorHandle* tensorHandle, unsigned int slot) override
{
+ ITensorHandle* backupHandle = this->m_Data.m_Inputs[slot];
this->m_Data.m_Inputs[slot] = tensorHandle;
- Reconfigure();
+ try
+ {
+ Reconfigure();
+ }
+ catch(armnn::UnimplementedException& e)
+ {
+ // Cannot reconfigure, revert the slot back and throw the exception.
+ this->m_Data.m_Inputs[slot] = backupHandle;
+ throw e;
+ }
}
// Replace output tensor handle with the given TensorHandle and call Reconfigure()
void ReplaceOutputTensorHandle(ITensorHandle* tensorHandle, unsigned int slot) override
{
+ ITensorHandle* backupHandle = this->m_Data.m_Outputs[slot];
this->m_Data.m_Outputs[slot] = tensorHandle;
- Reconfigure();
+ try
+ {
+ Reconfigure();
+ }
+ catch(armnn::UnimplementedException& e)
+ {
+ // Cannot reconfigure, revert the slot back and throw the exception.
+ this->m_Data.m_Inputs[slot] = backupHandle;
+ throw e;
+ }
}
+protected:
// Reconfigure the workload configuration. Throw armnn::UnimplementedException by default.
virtual void Reconfigure()
{
diff --git a/src/backends/reference/test/RefEndToEndTests.cpp b/src/backends/reference/test/RefEndToEndTests.cpp
index 4444f5c361..7a6cf97936 100644
--- a/src/backends/reference/test/RefEndToEndTests.cpp
+++ b/src/backends/reference/test/RefEndToEndTests.cpp
@@ -1378,6 +1378,93 @@ TEST_CASE("RefRankEndToEndTestQSymmS8")
RankEndToEnd<armnn::DataType::QSymmS8>(defaultBackends);
}
+TEST_CASE("RefForceImportTest")
+{
+ using namespace armnn;
+
+ std::vector<BackendId> backends = defaultBackends;
+
+ IRuntime::CreationOptions options;
+ IRuntimePtr runtime(IRuntime::Create(options));
+
+ // Builds up the structure of the network.
+ INetworkPtr net(INetwork::Create());
+
+ IConnectableLayer* input = net->AddInputLayer(0);
+
+ ActivationDescriptor descriptor;
+ descriptor.m_Function = ActivationFunction::Square;
+ IConnectableLayer* activationLayer = net->AddActivationLayer(descriptor);
+
+ IConnectableLayer* output = net->AddOutputLayer(0);
+
+ input->GetOutputSlot(0).Connect(activationLayer->GetInputSlot(0));
+ activationLayer->GetOutputSlot(0).Connect(output->GetInputSlot(0));
+
+ input->GetOutputSlot(0).SetTensorInfo(TensorInfo({ 1, 1, 1, 4 }, DataType::Float32, 0.0f, 0, true));
+ activationLayer->GetOutputSlot(0).SetTensorInfo(TensorInfo({ 1, 1, 1, 4 }, DataType::Float32));
+
+ IOptimizedNetworkPtr optNet = Optimize(*net, backends, runtime->GetDeviceSpec());
+
+ // Load it into the runtime. It should pass.
+ NetworkId netId;
+ std::string ignoredErrorMessage;
+
+ INetworkProperties networkProperties(false, MemorySource::Undefined, MemorySource::Undefined);
+
+ CHECK(runtime->LoadNetwork(netId, std::move(optNet),ignoredErrorMessage, networkProperties)
+ == Status::Success);
+
+ // Creates structures for input & output
+ std::vector<float> inputData
+ {
+ 1.0f, 2.0f, 3.0f, 4.0f
+ };
+
+ std::vector<float> outputData(4);
+
+ std::vector<float> expectedOutput
+ {
+ 1.0f, 4.0f, 9.0f, 16.0f
+ };
+
+ InputTensors inputTensors
+ {
+ {0,armnn::ConstTensor(runtime->GetInputTensorInfo(netId, 0), inputData.data())},
+ };
+ OutputTensors outputTensors
+ {
+ {0,armnn::Tensor(runtime->GetOutputTensorInfo(netId, 0), outputData.data())}
+ };
+
+ runtime->GetProfiler(netId)->EnableProfiling(true);
+
+ std::vector<ImportedInputId> importedInputIds =
+ runtime->ImportInputs(netId, inputTensors, MemorySource::Malloc);
+ std::vector<ImportedOutputId> importedOutputIds =
+ runtime->ImportOutputs(netId, outputTensors, MemorySource::Malloc);
+
+ // Do the inference and force the import as the memory is aligned.
+ runtime->EnqueueWorkload(netId, inputTensors, outputTensors, importedInputIds, importedOutputIds);
+
+ // Retrieve the Profiler.Print() output to get the workload execution
+ ProfilerManager& profilerManager = armnn::ProfilerManager::GetInstance();
+ std::stringstream ss;
+ profilerManager.GetProfiler()->Print(ss);;
+ std::string dump = ss.str();
+
+ // Check there is a SyncMemGeneric workload as we exported
+ int count = SubStringCounter(dump, "SyncMemGeneric");
+ CHECK(count == 1);
+
+ // Shouldn't be any CopyMemGeneric workloads
+ count = SubStringCounter(dump, "CopyMemGeneric");
+ CHECK(count == 0);
+
+ // Check the output is correct
+ CHECK(std::equal(outputData.begin(), outputData.end(), expectedOutput.begin(), expectedOutput.end()));
+}
+
#if !defined(__ANDROID__)
// Only run these tests on non Android platforms
TEST_CASE("RefImportNonAlignedPointerTest")