From 0c76b23726a86a46da44c3f52348db53f73ae242 Mon Sep 17 00:00:00 2001 From: Narumol Prangnawarat Date: Fri, 7 May 2021 17:52:36 +0100 Subject: IVGCVSW-5818 Enable import on GPU Signed-off-by: Narumol Prangnawarat Change-Id: I4e4eb107aa2bfa09625840d738001f33152e6792 --- include/armnn/IRuntime.hpp | 6 +- include/armnn/backends/IBackendInternal.hpp | 13 +++ include/armnn/backends/ITensorHandleFactory.hpp | 3 +- src/armnn/Layer.cpp | 12 +-- src/armnn/Layer.hpp | 3 +- src/armnn/LoadedNetwork.cpp | 31 +++--- src/armnn/Network.cpp | 75 +++++++++++--- src/armnn/layers/ConcatLayer.cpp | 13 +-- src/armnn/layers/ConcatLayer.hpp | 3 +- src/armnn/layers/OutputLayer.hpp | 5 +- src/armnn/layers/SplitterLayer.cpp | 13 +-- src/armnn/layers/SplitterLayer.hpp | 3 +- src/armnn/test/TensorHandleStrategyTest.cpp | 8 +- src/backends/backendsCommon/IBackendInternal.cpp | 18 ++++ src/backends/cl/ClBackend.cpp | 41 +++++++- src/backends/cl/ClBackend.hpp | 13 ++- src/backends/cl/ClImportTensorHandleFactory.cpp | 20 ++++ src/backends/cl/ClImportTensorHandleFactory.hpp | 6 ++ src/backends/cl/backend.mk | 1 + src/backends/cl/test/ClFallbackTests.cpp | 91 ++++++++++++----- src/backends/cl/test/ClImportTensorHandleTests.cpp | 112 ++++++++++++++++++++- src/backends/neon/test/NeonFallbackTests.cpp | 73 ++++++++++---- 22 files changed, 436 insertions(+), 127 deletions(-) diff --git a/include/armnn/IRuntime.hpp b/include/armnn/IRuntime.hpp index f296a5f564..870e027f33 100644 --- a/include/armnn/IRuntime.hpp +++ b/include/armnn/IRuntime.hpp @@ -38,9 +38,9 @@ struct INetworkProperties , m_ExportEnabled(exportEnabled) , m_AsyncEnabled(asyncEnabled) , m_NumThreads(numThreads) - , m_InputSource(MemorySource::Undefined) - , m_OutputSource(MemorySource::Undefined) - {} + , m_InputSource(m_ImportEnabled ? MemorySource::Malloc : MemorySource::Undefined) + , m_OutputSource(m_ExportEnabled ? 
MemorySource::Malloc : MemorySource::Undefined) + {} INetworkProperties(bool asyncEnabled, MemorySource m_InputSource, diff --git a/include/armnn/backends/IBackendInternal.hpp b/include/armnn/backends/IBackendInternal.hpp index 8035cff456..135d279c21 100644 --- a/include/armnn/backends/IBackendInternal.hpp +++ b/include/armnn/backends/IBackendInternal.hpp @@ -126,6 +126,12 @@ public: class TensorHandleFactoryRegistry& tensorHandleFactoryRegistry, const ModelOptions& modelOptions) const; + virtual IWorkloadFactoryPtr CreateWorkloadFactory( + class TensorHandleFactoryRegistry& tensorHandleFactoryRegistry, + const ModelOptions& modelOptions, + MemorySourceFlags inputFlags, + MemorySourceFlags outputFlags) const; + /// Create the runtime context of the backend /// /// Implementations may return a default-constructed IBackendContextPtr if @@ -162,6 +168,13 @@ public: /// IWorkloadFactory::CreateTensor()/IWorkloadFactory::CreateSubtensor() methods must be implemented. virtual void RegisterTensorHandleFactories(class TensorHandleFactoryRegistry& /*registry*/) {} + /// (Optional) Register TensorHandleFactories + /// Either this method or CreateMemoryManager() and + /// IWorkloadFactory::CreateTensor()/IWorkloadFactory::CreateSubtensor() methods must be implemented. 
+ virtual void RegisterTensorHandleFactories(class TensorHandleFactoryRegistry& registry, + MemorySourceFlags inputFlags, + MemorySourceFlags outputFlags); + /// Returns the version of the Backend API static constexpr BackendVersion GetApiVersion() { return BackendVersion(1, 0); } diff --git a/include/armnn/backends/ITensorHandleFactory.hpp b/include/armnn/backends/ITensorHandleFactory.hpp index ae2f44e8c6..501d97b852 100644 --- a/include/armnn/backends/ITensorHandleFactory.hpp +++ b/include/armnn/backends/ITensorHandleFactory.hpp @@ -20,6 +20,7 @@ namespace armnn enum class CapabilityClass { PaddingRequired = 1, + FallbackImportDisabled = 2, // add new enum values here @@ -80,7 +81,7 @@ public: virtual bool SupportsSubTensors() const = 0; - virtual bool SupportsMapUnmap() const final { return true; } + virtual bool SupportsMapUnmap() const { return true; } virtual MemorySourceFlags GetExportFlags() const { return 0; } virtual MemorySourceFlags GetImportFlags() const { return 0; } diff --git a/src/armnn/Layer.cpp b/src/armnn/Layer.cpp index e0d988d8ea..7761063650 100644 --- a/src/armnn/Layer.cpp +++ b/src/armnn/Layer.cpp @@ -249,8 +249,7 @@ void Layer::SetAdditionalInfo(QueueDescriptor& descriptor) const void Layer::CreateTensorHandles(const TensorHandleFactoryRegistry& registry, const IWorkloadFactory& workloadFactory, - const bool IsMemoryManaged, - MemorySource memSource) + const bool IsMemoryManaged) { for (unsigned int idx=0; idx < GetNumOutputSlots(); idx++) { @@ -266,14 +265,7 @@ void Layer::CreateTensorHandles(const TensorHandleFactoryRegistry& registry, else { ITensorHandleFactory* handleFactory; - if (memSource == MemorySource::Undefined ) - { - handleFactory = registry.GetFactory(factoryId); - } - else - { - handleFactory = registry.GetFactory(factoryId, memSource); - } + handleFactory = registry.GetFactory(factoryId); ARMNN_ASSERT(handleFactory); handler.CreateTensorHandles(*handleFactory, IsMemoryManaged); } diff --git a/src/armnn/Layer.hpp 
b/src/armnn/Layer.hpp index 76f9b41f4c..0e0883c1cd 100644 --- a/src/armnn/Layer.hpp +++ b/src/armnn/Layer.hpp @@ -275,8 +275,7 @@ public: virtual void CreateTensorHandles(const TensorHandleFactoryRegistry& registry, const IWorkloadFactory& factory, - const bool IsMemoryManaged = true, - MemorySource memSource = MemorySource::Undefined); + const bool IsMemoryManaged = true); /// Creates a dynamically-allocated copy of this layer. /// @param graph - The Graph into which this Layer is being cloned. diff --git a/src/armnn/LoadedNetwork.cpp b/src/armnn/LoadedNetwork.cpp index 67de00f0f3..53a9e18863 100644 --- a/src/armnn/LoadedNetwork.cpp +++ b/src/armnn/LoadedNetwork.cpp @@ -150,7 +150,9 @@ LoadedNetwork::LoadedNetwork(std::unique_ptr net, if (backend->SupportsTensorAllocatorAPI()) { auto workloadFactory = backend->CreateWorkloadFactory( - m_TensorHandleFactoryRegistry, m_OptimizedNetwork->pOptimizedNetworkImpl->GetModelOptions()); + m_TensorHandleFactoryRegistry, m_OptimizedNetwork->pOptimizedNetworkImpl->GetModelOptions(), + static_cast(m_NetworkProperties.m_InputSource), + static_cast(m_NetworkProperties.m_OutputSource)); m_WorkloadFactories.emplace( std::make_pair(backendId, std::make_pair(std::move(workloadFactory), nullptr))); } @@ -188,8 +190,7 @@ LoadedNetwork::LoadedNetwork(std::unique_ptr net, // to false when creating TensorHandles layer->CreateTensorHandles(m_TensorHandleFactoryRegistry, workloadFactory, - !m_NetworkProperties.m_ImportEnabled, - m_NetworkProperties.m_InputSource); + !m_NetworkProperties.m_ImportEnabled); break; } default: @@ -202,8 +203,7 @@ LoadedNetwork::LoadedNetwork(std::unique_ptr net, { layer->CreateTensorHandles(m_TensorHandleFactoryRegistry, workloadFactory, - !m_NetworkProperties.m_ExportEnabled, - m_NetworkProperties.m_OutputSource); + !m_NetworkProperties.m_ExportEnabled); } else { @@ -643,12 +643,12 @@ void LoadedNetwork::EnqueueInput(const BindableLayer& layer, ITensorHandle* tens bool needMemCopy = true; if 
(m_NetworkProperties.m_ImportEnabled) // Try import the input tensor { - if(CheckFlag(importFlags, MemorySource::Malloc) ) + if(CheckFlag(importFlags, m_NetworkProperties.m_InputSource)) { needMemCopy = false; // This assumes a CPU Tensor handle void* mem = tensorHandle->Map(false); - if (outputTensorHandle->Import(mem, MemorySource::Malloc)) + if (outputTensorHandle->Import(mem, m_NetworkProperties.m_InputSource)) { tensorHandle->Unmap(); return; // No need for a workload since the import has been done. @@ -718,11 +718,11 @@ void LoadedNetwork::EnqueueOutput(const BindableLayer& layer, ITensorHandle* ten if(layer.GetInputSlots()[0].GetConnectedOutputSlot()->GetOwningLayer().GetType() != LayerType::Input) { MemorySourceFlags importFlags = inputTensorHandle->GetImportFlags(); - if (CheckFlag(importFlags, MemorySource::Malloc)) + if (CheckFlag(importFlags, m_NetworkProperties.m_OutputSource)) { needMemCopy = false; void *mem = tensorHandle->Map(false); - bool importOk = inputTensorHandle->Import(mem, MemorySource::Malloc); + bool importOk = inputTensorHandle->Import(mem, m_NetworkProperties.m_OutputSource); tensorHandle->Unmap(); if (importOk) @@ -1013,7 +1013,7 @@ void LoadedNetwork::EnqueueInput(const BindableLayer& layer, MemorySourceFlags importFlags = descriptor.m_Outputs[0]->GetImportFlags(); if (m_NetworkProperties.m_ImportEnabled) // Try import the input tensor { - if (CheckFlag(importFlags, MemorySource::Malloc) ) + if (CheckFlag(importFlags, m_NetworkProperties.m_InputSource) ) { // This assumes a CPU Tensor handle std::unique_ptr tensorHandle = @@ -1021,7 +1021,7 @@ void LoadedNetwork::EnqueueInput(const BindableLayer& layer, inputTensor.GetMemoryArea()); void* mem = tensorHandle->Map(false); - if (descriptor.m_Outputs[0]->Import(mem, MemorySource::Malloc)) + if (descriptor.m_Outputs[0]->Import(mem, m_NetworkProperties.m_InputSource)) { tensorHandle->Unmap(); return; @@ -1078,14 +1078,14 @@ void LoadedNetwork::EnqueueOutput(const BindableLayer& layer, 
const Tensor& outp if (layer.GetInputSlots()[0].GetConnectedOutputSlot()->GetOwningLayer().GetType() != LayerType::Input) { MemorySourceFlags importFlags = inputTensorHandle->GetImportFlags(); - if (CheckFlag(importFlags, MemorySource::Malloc)) + if (CheckFlag(importFlags, m_NetworkProperties.m_OutputSource)) { std::unique_ptr tensorHandle = std::make_unique(outputTensor.GetInfo(), outputTensor.GetMemoryArea()); void* mem = tensorHandle->Map(false); - bool importOk = inputTensorHandle->Import(mem, MemorySource::Malloc); + bool importOk = inputTensorHandle->Import(mem, m_NetworkProperties.m_OutputSource); tensorHandle->Unmap(); if (importOk) @@ -1270,7 +1270,10 @@ std::unique_ptr LoadedNetwork::CreateWorkingMemHandle(Network { if (backend.second->SupportsTensorAllocatorAPI()) { - backend.second->RegisterTensorHandleFactories(tensorHandleFactoryRegistry); + backend.second->RegisterTensorHandleFactories( + tensorHandleFactoryRegistry, + static_cast(m_NetworkProperties.m_InputSource), + static_cast(m_NetworkProperties.m_OutputSource)); memoryManagers.emplace_back(tensorHandleFactoryRegistry.GetMemoryManagers().back()); } else diff --git a/src/armnn/Network.cpp b/src/armnn/Network.cpp index b79576c87e..f097e677d7 100644 --- a/src/armnn/Network.cpp +++ b/src/armnn/Network.cpp @@ -1165,7 +1165,8 @@ bool RequiresCopy(ITensorHandleFactory::FactoryId src, // Find the handle factory for the input layer which results in fewest required copies. 
ITensorHandleFactory::FactoryId CalculateSlotOptionForInput(BackendsMap& backends, OutputSlot& slot, - TensorHandleFactoryRegistry& registry) + TensorHandleFactoryRegistry& registry, + bool importEnabled) { Layer& layer = slot.GetOwningLayer(); ARMNN_ASSERT(layer.GetType() == LayerType::Input); @@ -1191,6 +1192,7 @@ ITensorHandleFactory::FactoryId CalculateSlotOptionForInput(BackendsMap& backend for (auto&& connection : slot.GetConnections()) { + const Layer& connectedLayer = connection->GetOwningLayer(); auto toBackend = backends.find(connectedLayer.GetBackendId()); @@ -1208,11 +1210,12 @@ ITensorHandleFactory::FactoryId CalculateSlotOptionForInput(BackendsMap& backend // Input layers use the mem copy workload or import, so the selected factory must // support either the map/unmap API or Import API ITensorHandleFactory* factory = registry.GetFactory(dst); - if (!factory->SupportsMapUnmap() && - !CheckFlag(factory->GetImportFlags(), MemorySource::Malloc)) // Just support cpu mem imports for now + if (importEnabled && factory->GetImportFlags() == 0) + { + continue; + } + else if (!importEnabled && !factory->SupportsMapUnmap()) { - // The current tensor handle factory does not support the map/unmap or import - // strategy, move to the next one continue; } @@ -1257,7 +1260,8 @@ ITensorHandleFactory::FactoryId CalculateSlotOptionForOutput(BackendsMap& backen // when considering all connections. ITensorHandleFactory::FactoryId CalculateSlotOption(BackendsMap& backends, OutputSlot& outputSlot, - TensorHandleFactoryRegistry& registry) + TensorHandleFactoryRegistry& registry, + bool importEnabled) { // First ensure the from backends can support the TensorHandeAPI Layer& layer = outputSlot.GetOwningLayer(); @@ -1268,14 +1272,13 @@ ITensorHandleFactory::FactoryId CalculateSlotOption(BackendsMap& backends, return ITensorHandleFactory::LegacyFactoryId; } - // Connections to Output Layers requires support for map/unmap on the TensorHandle. 
- bool requiresMapUnmap = false; + bool outputConnection = false; for (auto&& connection : outputSlot.GetConnections()) { const Layer& connectedLayer = connection->GetOwningLayer(); if (connectedLayer.GetType() == LayerType::Output) { - requiresMapUnmap = true; + outputConnection = true; } } @@ -1286,8 +1289,48 @@ ITensorHandleFactory::FactoryId CalculateSlotOption(BackendsMap& backends, std::map factoryScores; for (auto&& pref : srcPrefs) { - if (requiresMapUnmap) // Only consider factories that support map/unmap if required + if (importEnabled) + { + ITensorHandleFactory* factory = registry.GetFactory(pref); + if (outputConnection) + { + // Check if this is fallback case + bool fallbackConnection = false; + for (auto&& inputSlot : layer.GetInputSlots()) + { + if (inputSlot.GetConnectedOutputSlot()->GetOwningLayer().GetBackendId() != layer.GetBackendId()) + { + fallbackConnection = true; + } + } + if (fallbackConnection) + { + auto factoryCap = factory->GetCapabilities(&layer, &layer, CapabilityClass::FallbackImportDisabled); + // Cannot use factory import if fallback import is not supported. + if (!factoryCap.empty()) + { + continue; + } + } + else if (factory->GetExportFlags() == 0) + { + continue; + } + } + if (!outputConnection) + { + auto factoryCap = factory->GetCapabilities(&layer, &layer, CapabilityClass::FallbackImportDisabled); + // Cannot use factory import if fallback import is not supported. 
+ if (!factoryCap.empty()) + { + continue; + } + } + + } + else { + // Only consider factories that support map/unmap ITensorHandleFactory* factory = registry.GetFactory(pref); if (!factory->SupportsMapUnmap()) { @@ -1296,6 +1339,7 @@ ITensorHandleFactory::FactoryId CalculateSlotOption(BackendsMap& backends, } } + auto it = factoryScores.find(pref); if (it == factoryScores.end()) { @@ -1417,15 +1461,18 @@ EdgeStrategy CalculateEdgeStrategy(BackendsMap& backends, if (!dstFactory) { continue; } - if ((dstFactory->GetImportFlags() & srcFactory->GetExportFlags()) != 0) { auto srcCapability = srcFactory->GetCapabilities(&layer, &layer, CapabilityClass::PaddingRequired); auto dstCapability = dstFactory->GetCapabilities(&connectedLayer, &connectedLayer, CapabilityClass::PaddingRequired); + auto srcFallback = srcFactory->GetCapabilities(&layer, &layer, CapabilityClass::FallbackImportDisabled); + auto dstFallback = dstFactory->GetCapabilities(&connectedLayer, + &connectedLayer, + CapabilityClass::FallbackImportDisabled); // Do not require memory copy if the source and destination do not require padding. 
- if (srcCapability.empty() && dstCapability.empty()) + if (srcCapability.empty() && dstCapability.empty() && srcFallback.empty() && dstFallback.empty()) { return EdgeStrategy::ExportToTarget; } @@ -1477,13 +1524,13 @@ OptimizationResult SelectTensorHandleStrategy(Graph& optGraph, switch(layer->GetType()) { case LayerType::Input: - slotOption = CalculateSlotOptionForInput(backends, outputSlot, registry); + slotOption = CalculateSlotOptionForInput(backends, outputSlot, registry, importEnabled); break; case LayerType::Output: slotOption = CalculateSlotOptionForOutput(backends, outputSlot, registry); break; default: - slotOption = CalculateSlotOption(backends, outputSlot, registry); + slotOption = CalculateSlotOption(backends, outputSlot, registry, importEnabled); break; } outputSlot.SetTensorHandleFactory(slotOption); diff --git a/src/armnn/layers/ConcatLayer.cpp b/src/armnn/layers/ConcatLayer.cpp index 3a20e1b3f6..238fdb66d9 100644 --- a/src/armnn/layers/ConcatLayer.cpp +++ b/src/armnn/layers/ConcatLayer.cpp @@ -179,8 +179,7 @@ void ConcatLayer::CreateTensors(const TensorHandleFactoryRegistry& registry, void ConcatLayer::CreateTensorHandles(const TensorHandleFactoryRegistry& registry, const IWorkloadFactory& workloadFactory, - const bool isMemoryManaged, - MemorySource memSource) + const bool isMemoryManaged) { OutputSlot& slot = GetOutputSlot(0); ITensorHandleFactory::FactoryId factoryId = slot.GetTensorHandleFactoryId(); @@ -191,15 +190,7 @@ void ConcatLayer::CreateTensorHandles(const TensorHandleFactoryRegistry& registr } else { - ITensorHandleFactory* handleFactory; - if (memSource == MemorySource::Undefined) - { - handleFactory = registry.GetFactory(factoryId); - } - else - { - handleFactory = registry.GetFactory(factoryId, memSource); - } + ITensorHandleFactory* handleFactory = registry.GetFactory(factoryId); ARMNN_ASSERT(handleFactory); CreateTensors(registry, *handleFactory, isMemoryManaged); } diff --git a/src/armnn/layers/ConcatLayer.hpp 
b/src/armnn/layers/ConcatLayer.hpp index 6a43318382..4315d66436 100644 --- a/src/armnn/layers/ConcatLayer.hpp +++ b/src/armnn/layers/ConcatLayer.hpp @@ -27,8 +27,7 @@ public: /// @param [in] MemorySource Determine the source of memory e.g Malloc virtual void CreateTensorHandles(const TensorHandleFactoryRegistry& registry, const IWorkloadFactory& factory, - const bool IsMemoryManaged = true, - MemorySource memSource = MemorySource::Undefined) override; + const bool IsMemoryManaged = true) override; /// Creates a dynamically-allocated copy of this layer. /// @param [in] graph The graph into which this layer is being cloned. diff --git a/src/armnn/layers/OutputLayer.hpp b/src/armnn/layers/OutputLayer.hpp index fc6a8aa6b2..408a28a6f3 100644 --- a/src/armnn/layers/OutputLayer.hpp +++ b/src/armnn/layers/OutputLayer.hpp @@ -26,10 +26,9 @@ public: /// @param [in] IsMemoryManaged Determine whether or not to assign a memory manager during creation virtual void CreateTensorHandles(const TensorHandleFactoryRegistry& registry, const IWorkloadFactory& factory, - const bool isMemoryManaged = true, - MemorySource memSource = MemorySource::Undefined) override + const bool isMemoryManaged = true) override { - IgnoreUnused(registry, factory, isMemoryManaged, memSource); + IgnoreUnused(registry, factory, isMemoryManaged); } /// Creates a dynamically-allocated copy of this layer. 
diff --git a/src/armnn/layers/SplitterLayer.cpp b/src/armnn/layers/SplitterLayer.cpp index adef9aa1a2..5e6622e13a 100644 --- a/src/armnn/layers/SplitterLayer.cpp +++ b/src/armnn/layers/SplitterLayer.cpp @@ -177,8 +177,7 @@ void SplitterLayer::CreateTensors(const TensorHandleFactoryRegistry& registry, void SplitterLayer::CreateTensorHandles(const TensorHandleFactoryRegistry& registry, const IWorkloadFactory& workloadFactory, - const bool isMemoryManaged, - MemorySource memSource) + const bool isMemoryManaged) { OutputSlot& slot = GetOutputSlot(0); ITensorHandleFactory::FactoryId factoryId = slot.GetTensorHandleFactoryId(); @@ -189,15 +188,7 @@ void SplitterLayer::CreateTensorHandles(const TensorHandleFactoryRegistry& regis } else { - ITensorHandleFactory* handleFactory; - if (memSource == MemorySource::Undefined) - { - handleFactory = registry.GetFactory(factoryId); - } - else - { - handleFactory = registry.GetFactory(factoryId, memSource); - } + ITensorHandleFactory* handleFactory = registry.GetFactory(factoryId); ARMNN_ASSERT(handleFactory); CreateTensors(registry, *handleFactory, isMemoryManaged); } diff --git a/src/armnn/layers/SplitterLayer.hpp b/src/armnn/layers/SplitterLayer.hpp index 075b136da9..f90696b1ad 100644 --- a/src/armnn/layers/SplitterLayer.hpp +++ b/src/armnn/layers/SplitterLayer.hpp @@ -26,8 +26,7 @@ public: /// @param [in] IsMemoryManaged Determine whether or not to assign a memory manager during creation virtual void CreateTensorHandles(const TensorHandleFactoryRegistry& registry, const IWorkloadFactory& factory, - const bool IsMemoryManaged = true, - MemorySource memSource = MemorySource::Undefined) override; + const bool IsMemoryManaged = true) override; /// Creates a dynamically-allocated copy of this layer. /// @param [in] graph The graph into which this layer is being cloned. 
diff --git a/src/armnn/test/TensorHandleStrategyTest.cpp b/src/armnn/test/TensorHandleStrategyTest.cpp index c7aa30f701..47d0666414 100644 --- a/src/armnn/test/TensorHandleStrategyTest.cpp +++ b/src/armnn/test/TensorHandleStrategyTest.cpp @@ -139,7 +139,8 @@ public: { "TestHandleFactoryA1", "TestHandleFactoryA2", - "TestHandleFactoryB1" + "TestHandleFactoryB1", + "TestHandleFactoryD1" }; } @@ -252,7 +253,7 @@ public: std::vector GetHandleFactoryPreferences() const override { return std::vector{ - "TestHandleFactoryD1" + "TestHandleFactoryD1", }; } @@ -279,6 +280,7 @@ BOOST_AUTO_TEST_CASE(RegisterFactories) BOOST_TEST(backendA.GetHandleFactoryPreferences()[0] == "TestHandleFactoryA1"); BOOST_TEST(backendA.GetHandleFactoryPreferences()[1] == "TestHandleFactoryA2"); BOOST_TEST(backendA.GetHandleFactoryPreferences()[2] == "TestHandleFactoryB1"); + BOOST_TEST(backendA.GetHandleFactoryPreferences()[3] == "TestHandleFactoryD1"); TensorHandleFactoryRegistry registry; backendA.RegisterTensorHandleFactories(registry); @@ -351,7 +353,7 @@ BOOST_AUTO_TEST_CASE(TensorHandleSelectionStrategy) OutputSlot& softmaxLayer4Out = softmaxLayer4->GetOutputSlot(0); // Check that the correct factory was selected - BOOST_TEST(inputLayerOut.GetTensorHandleFactoryId() == "TestHandleFactoryA1"); + BOOST_TEST(inputLayerOut.GetTensorHandleFactoryId() == "TestHandleFactoryD1"); BOOST_TEST(softmaxLayer1Out.GetTensorHandleFactoryId() == "TestHandleFactoryB1"); BOOST_TEST(softmaxLayer2Out.GetTensorHandleFactoryId() == "TestHandleFactoryB1"); BOOST_TEST(softmaxLayer3Out.GetTensorHandleFactoryId() == "TestHandleFactoryC1"); diff --git a/src/backends/backendsCommon/IBackendInternal.cpp b/src/backends/backendsCommon/IBackendInternal.cpp index b08dff84ed..31706eb1e7 100644 --- a/src/backends/backendsCommon/IBackendInternal.cpp +++ b/src/backends/backendsCommon/IBackendInternal.cpp @@ -76,6 +76,17 @@ IBackendInternal::IWorkloadFactoryPtr IBackendInternal::CreateWorkloadFactory( return 
CreateWorkloadFactory(tensorHandleFactoryRegistry); } +IBackendInternal::IWorkloadFactoryPtr IBackendInternal::CreateWorkloadFactory( + class TensorHandleFactoryRegistry& tensorHandleFactoryRegistry, + const ModelOptions& modelOptions, + MemorySourceFlags inputFlags, + MemorySourceFlags outputFlags) const +{ + IgnoreUnused(inputFlags); + IgnoreUnused(outputFlags); + return CreateWorkloadFactory(tensorHandleFactoryRegistry, modelOptions); +} + IBackendInternal::IBackendContextPtr IBackendInternal::CreateBackendContext(const IRuntime::CreationOptions&) const { return IBackendContextPtr{}; @@ -147,6 +158,13 @@ bool IBackendInternal::SupportsTensorAllocatorAPI() const return !GetHandleFactoryPreferences().empty(); } +void IBackendInternal::RegisterTensorHandleFactories(class TensorHandleFactoryRegistry& registry, + MemorySourceFlags /*inputFlags*/, + MemorySourceFlags /*outputFlags*/) +{ + return RegisterTensorHandleFactories(registry); +} + ITensorHandleFactory::FactoryId IBackendInternal::GetBackwardCompatibleFavoriteHandleFactory() { auto favorites = GetHandleFactoryPreferences(); diff --git a/src/backends/cl/ClBackend.cpp b/src/backends/cl/ClBackend.cpp index f97cb4bba8..35770d9219 100644 --- a/src/backends/cl/ClBackend.cpp +++ b/src/backends/cl/ClBackend.cpp @@ -4,12 +4,13 @@ // #include "ClBackend.hpp" +#include "ClBackendContext.hpp" #include "ClBackendId.hpp" #include "ClBackendModelContext.hpp" -#include "ClWorkloadFactory.hpp" -#include "ClBackendContext.hpp" +#include "ClImportTensorHandleFactory.hpp" #include "ClLayerSupport.hpp" #include "ClTensorHandleFactory.hpp" +#include "ClWorkloadFactory.hpp" #include #include @@ -71,6 +72,8 @@ IBackendInternal::IWorkloadFactoryPtr ClBackend::CreateWorkloadFactory( registry.RegisterMemoryManager(memoryManager); registry.RegisterFactory(std::make_unique(memoryManager)); + registry.RegisterFactory(std::make_unique( + static_cast(MemorySource::Malloc), static_cast(MemorySource::Malloc))); return std::make_unique( 
PolymorphicPointerDowncast(memoryManager)); @@ -83,6 +86,24 @@ IBackendInternal::IWorkloadFactoryPtr ClBackend::CreateWorkloadFactory( registry.RegisterMemoryManager(memoryManager); registry.RegisterFactory(std::make_unique(memoryManager)); + registry.RegisterFactory(std::make_unique( + static_cast(MemorySource::Malloc), static_cast(MemorySource::Malloc))); + + return std::make_unique( + PolymorphicPointerDowncast(memoryManager), CreateBackendSpecificModelContext(modelOptions)); +} + +IBackendInternal::IWorkloadFactoryPtr ClBackend::CreateWorkloadFactory( + TensorHandleFactoryRegistry& registry, + const ModelOptions& modelOptions, + MemorySourceFlags inputFlags, + MemorySourceFlags outputFlags) const +{ + auto memoryManager = std::make_shared(std::make_unique()); + + registry.RegisterMemoryManager(memoryManager); + registry.RegisterFactory(std::make_unique(memoryManager)); + registry.RegisterFactory(std::make_unique(inputFlags, outputFlags)); return std::make_unique( PolymorphicPointerDowncast(memoryManager), CreateBackendSpecificModelContext(modelOptions)); @@ -90,7 +111,8 @@ IBackendInternal::IWorkloadFactoryPtr ClBackend::CreateWorkloadFactory( std::vector ClBackend::GetHandleFactoryPreferences() const { - return std::vector {ClTensorHandleFactory::GetIdStatic()}; + return std::vector {ClTensorHandleFactory::GetIdStatic(), + ClImportTensorHandleFactory::GetIdStatic()}; } void ClBackend::RegisterTensorHandleFactories(TensorHandleFactoryRegistry& registry) @@ -99,6 +121,19 @@ void ClBackend::RegisterTensorHandleFactories(TensorHandleFactoryRegistry& regis registry.RegisterMemoryManager(mgr); registry.RegisterFactory(std::make_unique(mgr)); + registry.RegisterFactory(std::make_unique( + static_cast(MemorySource::Malloc), static_cast(MemorySource::Malloc))); +} + +void ClBackend::RegisterTensorHandleFactories(TensorHandleFactoryRegistry& registry, + MemorySourceFlags inputFlags, + MemorySourceFlags outputFlags) +{ + auto mgr = std::make_shared(std::make_unique()); + 
+ registry.RegisterMemoryManager(mgr); + registry.RegisterFactory(std::make_unique(mgr)); + registry.RegisterFactory(std::make_unique(inputFlags, outputFlags)); } IBackendInternal::IBackendContextPtr ClBackend::CreateBackendContext(const IRuntime::CreationOptions& options) const diff --git a/src/backends/cl/ClBackend.hpp b/src/backends/cl/ClBackend.hpp index f9a5745eb3..252d87edea 100644 --- a/src/backends/cl/ClBackend.hpp +++ b/src/backends/cl/ClBackend.hpp @@ -30,16 +30,25 @@ public: IBackendInternal::IWorkloadFactoryPtr CreateWorkloadFactory( TensorHandleFactoryRegistry& registry) const override; - IWorkloadFactoryPtr CreateWorkloadFactory( const IMemoryManagerSharedPtr& memoryManager, - const ModelOptions& modelOptions) const override; + IWorkloadFactoryPtr CreateWorkloadFactory(const IMemoryManagerSharedPtr& memoryManager, + const ModelOptions& modelOptions) const override; IWorkloadFactoryPtr CreateWorkloadFactory(class TensorHandleFactoryRegistry& tensorHandleFactoryRegistry, const ModelOptions& modelOptions) const override; + IWorkloadFactoryPtr CreateWorkloadFactory(class TensorHandleFactoryRegistry& tensorHandleFactoryRegistry, + const ModelOptions& modelOptions, + MemorySourceFlags inputFlags, + MemorySourceFlags outputFlags) const override; + std::vector GetHandleFactoryPreferences() const override; void RegisterTensorHandleFactories(TensorHandleFactoryRegistry& registry) override; + void RegisterTensorHandleFactories(TensorHandleFactoryRegistry& registry, + MemorySourceFlags inputFlags, + MemorySourceFlags outputFlags) override; + IBackendInternal::IBackendContextPtr CreateBackendContext(const IRuntime::CreationOptions&) const override; IBackendInternal::IBackendProfilingContextPtr CreateBackendProfilingContext( const IRuntime::CreationOptions&, IBackendProfilingPtr& backendProfiling) override; diff --git a/src/backends/cl/ClImportTensorHandleFactory.cpp b/src/backends/cl/ClImportTensorHandleFactory.cpp index 594e05423e..26d5f9c47a 100644 --- 
a/src/backends/cl/ClImportTensorHandleFactory.cpp +++ b/src/backends/cl/ClImportTensorHandleFactory.cpp @@ -106,6 +106,11 @@ bool ClImportTensorHandleFactory::SupportsSubTensors() const return true; } +bool ClImportTensorHandleFactory::SupportsMapUnmap() const +{ + return false; +} + MemorySourceFlags ClImportTensorHandleFactory::GetExportFlags() const { return m_ExportFlags; @@ -116,4 +121,19 @@ MemorySourceFlags ClImportTensorHandleFactory::GetImportFlags() const return m_ImportFlags; } +std::vector ClImportTensorHandleFactory::GetCapabilities(const IConnectableLayer* layer, + const IConnectableLayer* connectedLayer, + CapabilityClass capabilityClass) +{ + IgnoreUnused(layer); + IgnoreUnused(connectedLayer); + std::vector capabilities; + if (capabilityClass == CapabilityClass::FallbackImportDisabled) + { + Capability paddingCapability(CapabilityClass::FallbackImportDisabled, true); + capabilities.push_back(paddingCapability); + } + return capabilities; +} + } // namespace armnn \ No newline at end of file diff --git a/src/backends/cl/ClImportTensorHandleFactory.hpp b/src/backends/cl/ClImportTensorHandleFactory.hpp index ee2f84efda..7e22949647 100644 --- a/src/backends/cl/ClImportTensorHandleFactory.hpp +++ b/src/backends/cl/ClImportTensorHandleFactory.hpp @@ -58,10 +58,16 @@ public: bool SupportsSubTensors() const override; + bool SupportsMapUnmap() const override; + MemorySourceFlags GetExportFlags() const override; MemorySourceFlags GetImportFlags() const override; + std::vector GetCapabilities(const IConnectableLayer* layer, + const IConnectableLayer* connectedLayer, + CapabilityClass capabilityClass) override; + private: MemorySourceFlags m_ImportFlags; MemorySourceFlags m_ExportFlags; diff --git a/src/backends/cl/backend.mk b/src/backends/cl/backend.mk index 976f614cff..e6c289cf39 100644 --- a/src/backends/cl/backend.mk +++ b/src/backends/cl/backend.mk @@ -20,6 +20,7 @@ BACKEND_SOURCES := \ ClContextControl.cpp \ ClContextDeserializer.cpp \ 
ClContextSerializer.cpp \ + ClImportTensorHandleFactory.cpp \ ClLayerSupport.cpp \ ClRegistryInitializer.cpp \ ClTensorHandleFactory.cpp \ diff --git a/src/backends/cl/test/ClFallbackTests.cpp b/src/backends/cl/test/ClFallbackTests.cpp index eec3afe447..183b8caa2e 100644 --- a/src/backends/cl/test/ClFallbackTests.cpp +++ b/src/backends/cl/test/ClFallbackTests.cpp @@ -11,7 +11,7 @@ BOOST_AUTO_TEST_SUITE(ClFallback) -BOOST_AUTO_TEST_CASE(ClImportEnabledFallbackToNeon, * boost::unit_test::disabled()) +BOOST_AUTO_TEST_CASE(ClImportEnabledFallbackToNeon) { using namespace armnn; @@ -34,7 +34,7 @@ BOOST_AUTO_TEST_CASE(ClImportEnabledFallbackToNeon, * boost::unit_test::disabled add->GetOutputSlot(0).Connect(sub->GetInputSlot(1)); sub->GetOutputSlot(0).Connect(output->GetInputSlot(0)); - TensorInfo info = TensorInfo({ 1, 2, 3, 2 }, DataType::Float32); + TensorInfo info = TensorInfo({ 1, 2, 4, 2 }, DataType::Float32); input0->GetOutputSlot(0).SetTensorInfo(info); input1->GetOutputSlot(0).SetTensorInfo(info); @@ -82,30 +82,49 @@ BOOST_AUTO_TEST_CASE(ClImportEnabledFallbackToNeon, * boost::unit_test::disabled runtime->LoadNetwork(netId, std::move(optNet), ignoredErrorMessage, networkProperties); // Creates structures for input & output - std::vector inputData0 + std::vector inputValue0 { - 1.0f, 1.0f, 2.0f, 2.0f, 2.0f, 3.0f, 4.0f, 4.0f, 5.0f, 5.0f, 6.0f, 6.0f + 1.0f, 1.0f, 2.0f, 2.0f, 2.0f, 3.0f, 4.0f, 4.0f, 5.0f, 5.0f, 6.0f, 6.0f, 1.0f, 1.0f, 2.0f, 2.0f }; - std::vector inputData1 + std::vector inputValue1 { - 0.0f, 1.0f, 1.0f, 2.0f, 3.0f, 3.0f, 3.0f, 4.0f, 4.0f, 5.0f, 5.0f, 6.0f + 0.0f, 1.0f, 1.0f, 2.0f, 3.0f, 3.0f, 3.0f, 4.0f, 4.0f, 5.0f, 5.0f, 6.0f, 0.0f, 1.0f, 1.0f, 2.0f }; std::vector inputData2 { - 12.0f, 11.0f, 10.0f, 9.0f, 8.0f, 7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f + 12.0f, 11.0f, 10.0f, 9.0f, 8.0f, 7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 12.0f, 11.0f, 10.0f, 9.0f }; - std::vector outputData(12); + std::vector outputData(16); std::vector expectedOutput { - 
11.0f, 9.0f, 7.0f, 5.0f, 3.0f, 1.0f, -1.0f, -3.0f, -5.0f, -7.0f, -9.0f, -11.0f + 11.0f, 9.0f, 7.0f, 5.0f, 3.0f, 1.0f, -1.0f, -3.0f, -5.0f, -7.0f, -9.0f, -11.0f, 11.0f, 9.0f, 7.0f, 5.0f }; + // Prepare aligned data + unsigned int numElements = info.GetNumElements(); + size_t totalBytes = numElements * sizeof(float); + const size_t alignment = 64; + size_t space = totalBytes + alignment + alignment; + auto inputData0 = std::make_unique(space); + void* alignedInputPtr0 = inputData0.get(); + BOOST_CHECK(std::align(alignment, totalBytes, alignedInputPtr0, space)); + + auto* intputPtr0 = reinterpret_cast(alignedInputPtr0); + std::copy(inputValue0.begin(), inputValue0.end(), intputPtr0); + + auto inputData1 = std::make_unique(space); + void* alignedInputPtr1 = inputData1.get(); + BOOST_CHECK(std::align(alignment, totalBytes, alignedInputPtr1, space)); + + auto* intputPtr1 = reinterpret_cast(alignedInputPtr1); + std::copy(inputValue1.begin(), inputValue1.end(), intputPtr1); + InputTensors inputTensors { - { 0, armnn::ConstTensor(runtime->GetInputTensorInfo(netId, 0), inputData0.data()) }, - { 1, armnn::ConstTensor(runtime->GetInputTensorInfo(netId, 1), inputData1.data()) }, + { 0, armnn::ConstTensor(runtime->GetInputTensorInfo(netId, 0), alignedInputPtr0) }, + { 1, armnn::ConstTensor(runtime->GetInputTensorInfo(netId, 1), alignedInputPtr1) }, { 2, armnn::ConstTensor(runtime->GetInputTensorInfo(netId, 2), inputData2.data()) } }; OutputTensors outputTensors @@ -134,6 +153,8 @@ BOOST_AUTO_TEST_CASE(ClImportEnabledFallbackToNeon, * boost::unit_test::disabled // Check output is as expected BOOST_TEST(outputData == expectedOutput); + + runtime->UnloadNetwork(netId); } BOOST_AUTO_TEST_CASE(ClImportDisabledFallbackToNeon) @@ -258,7 +279,7 @@ BOOST_AUTO_TEST_CASE(ClImportDisabledFallbackToNeon) BOOST_TEST(outputData == expectedOutput); } -BOOST_AUTO_TEST_CASE(ClImportEnabledFallbackSubgraphToNeon, * boost::unit_test::disabled()) 
+BOOST_AUTO_TEST_CASE(ClImportEnabledFallbackSubgraphToNeon) { using namespace armnn; @@ -269,6 +290,10 @@ BOOST_AUTO_TEST_CASE(ClImportEnabledFallbackSubgraphToNeon, * boost::unit_test:: INetworkPtr net(INetwork::Create()); Pooling2dDescriptor desc; + desc.m_PoolWidth = 2; + desc.m_PoolHeight = 2; + desc.m_StrideX = 2; + desc.m_StrideY = 2; IConnectableLayer* input0 = net->AddInputLayer(0, "input0"); IConnectableLayer* input1 = net->AddInputLayer(1, "input1"); @@ -285,8 +310,8 @@ BOOST_AUTO_TEST_CASE(ClImportEnabledFallbackSubgraphToNeon, * boost::unit_test:: sub->GetOutputSlot(0).Connect(pooling->GetInputSlot(0)); pooling->GetOutputSlot(0).Connect(output->GetInputSlot(0)); - TensorInfo info = TensorInfo({ 1, 2, 3, 2 }, DataType::Float32); - TensorInfo poolingInfo = TensorInfo({ 1, 2, 1, 1 }, DataType::Float32); + TensorInfo info = TensorInfo({ 1, 2, 4, 2 }, DataType::Float32); + TensorInfo poolingInfo = TensorInfo({ 1, 2, 2, 1 }, DataType::Float32); input0->GetOutputSlot(0).SetTensorInfo(info); input1->GetOutputSlot(0).SetTensorInfo(info); @@ -340,27 +365,45 @@ BOOST_AUTO_TEST_CASE(ClImportEnabledFallbackSubgraphToNeon, * boost::unit_test:: runtime->LoadNetwork(netId, std::move(optNet), ignoredErrorMessage, networkProperties); // Creates structures for input & output - std::vector inputData0 + std::vector inputValue0 { - 1.0f, 1.0f, 2.0f, 2.0f, 2.0f, 3.0f, 4.0f, 4.0f, 5.0f, 5.0f, 6.0f, 6.0f + 1.0f, 1.0f, 2.0f, 2.0f, 2.0f, 3.0f, 4.0f, 4.0f, 5.0f, 5.0f, 6.0f, 6.0f, 1.0f, 1.0f, 2.0f, 2.0f }; - std::vector inputData1 + std::vector inputValue1 { - 0.0f, 1.0f, 1.0f, 2.0f, 3.0f, 3.0f, 3.0f, 4.0f, 4.0f, 5.0f, 5.0f, 6.0f + 0.0f, 1.0f, 1.0f, 2.0f, 3.0f, 3.0f, 3.0f, 4.0f, 4.0f, 5.0f, 5.0f, 6.0f, 0.0f, 1.0f, 1.0f, 2.0f }; std::vector inputData2 { - 12.0f, 11.0f, 10.0f, 9.0f, 8.0f, 7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f + 12.0f, 11.0f, 10.0f, 9.0f, 8.0f, 7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 12.0f, 11.0f, 10.0f, 9.0f }; - std::vector outputData(2); + std::vector 
outputData(4); - std::vector expectedOutput{ 11.0f, -1.0f }; + std::vector expectedOutput{ 11.0f, 3.0f, -5.0f, 11.0f }; + + unsigned int numElements = info.GetNumElements(); + size_t totalBytes = numElements * sizeof(float); + const size_t alignment = 64; + size_t space = totalBytes + alignment + alignment; + auto inputData0 = std::make_unique(space); + void* alignedInputPtr0 = inputData0.get(); + BOOST_CHECK(std::align(alignment, totalBytes, alignedInputPtr0, space)); + + auto* intputPtr0 = reinterpret_cast(alignedInputPtr0); + std::copy(inputValue0.begin(), inputValue0.end(), intputPtr0); + + auto inputData1 = std::make_unique(space); + void* alignedInputPtr1 = inputData1.get(); + BOOST_CHECK(std::align(alignment, totalBytes, alignedInputPtr1, space)); + + auto* intputPtr1 = reinterpret_cast(alignedInputPtr1); + std::copy(inputValue1.begin(), inputValue1.end(), intputPtr1); InputTensors inputTensors { - { 0, armnn::ConstTensor(runtime->GetInputTensorInfo(netId, 0), inputData0.data()) }, - { 1, armnn::ConstTensor(runtime->GetInputTensorInfo(netId, 1), inputData1.data()) }, + { 0, armnn::ConstTensor(runtime->GetInputTensorInfo(netId, 0), alignedInputPtr0) }, + { 1, armnn::ConstTensor(runtime->GetInputTensorInfo(netId, 1), alignedInputPtr1) }, { 2, armnn::ConstTensor(runtime->GetInputTensorInfo(netId, 2), inputData2.data()) } }; OutputTensors outputTensors @@ -393,6 +436,8 @@ BOOST_AUTO_TEST_CASE(ClImportEnabledFallbackSubgraphToNeon, * boost::unit_test:: // Check output is as expected BOOST_TEST(outputData == expectedOutput); + + runtime->UnloadNetwork(netId); } BOOST_AUTO_TEST_CASE(ClImportDisableFallbackSubgraphToNeon) diff --git a/src/backends/cl/test/ClImportTensorHandleTests.cpp b/src/backends/cl/test/ClImportTensorHandleTests.cpp index bfb74af801..85ff35f0af 100644 --- a/src/backends/cl/test/ClImportTensorHandleTests.cpp +++ b/src/backends/cl/test/ClImportTensorHandleTests.cpp @@ -11,6 +11,9 @@ #include +#include +#include + using namespace armnn; 
BOOST_AUTO_TEST_SUITE(ClImportTensorHandleTests) @@ -38,7 +41,7 @@ BOOST_FIXTURE_TEST_CASE(ClMallocImport, ClContextControlFixture) const size_t totalBytes = tensor.info()->total_size(); const size_t alignment = arm_compute::CLKernelLibrary::get().get_device().getInfo(); - size_t space = totalBytes + alignment; + size_t space = totalBytes + alignment + alignment; auto testData = std::make_unique(space); void* alignedPtr = testData.get(); BOOST_CHECK(std::align(alignment, totalBytes, alignedPtr, space)); @@ -57,7 +60,7 @@ BOOST_FIXTURE_TEST_CASE(ClMallocImport, ClContextControlFixture) // Validate result by checking that the output has no negative values for(unsigned int i = 0; i < numElements; ++i) { - BOOST_ASSERT(typedPtr[i] >= 0); + BOOST_TEST(typedPtr[i] >= 0); } } @@ -78,7 +81,7 @@ BOOST_FIXTURE_TEST_CASE(ClIncorrectMemorySourceImport, ClContextControlFixture) const size_t totalBytes = tensor.info()->total_size(); const size_t alignment = arm_compute::CLKernelLibrary::get().get_device().getInfo(); - size_t space = totalBytes + alignment; + size_t space = totalBytes + alignment + alignment; auto testData = std::make_unique(space); void* alignedPtr = testData.get(); BOOST_CHECK(std::align(alignment, totalBytes, alignedPtr, space)); @@ -108,4 +111,105 @@ BOOST_FIXTURE_TEST_CASE(ClInvalidMemorySourceImport, ClContextControlFixture) BOOST_CHECK_THROW(handle->Import(inputData.data(), invalidMemSource), MemoryImportException); } -BOOST_AUTO_TEST_SUITE_END() \ No newline at end of file +BOOST_FIXTURE_TEST_CASE(ClImportEndToEnd, ClContextControlFixture) +{ + // Create runtime in which test will run + IRuntime::CreationOptions options; + IRuntimePtr runtime(armnn::IRuntime::Create(options)); + + // build up the structure of the network + INetworkPtr net(INetwork::Create()); + + IConnectableLayer* input = net->AddInputLayer(0, "Input"); + + ActivationDescriptor descriptor; + descriptor.m_Function = ActivationFunction::ReLu; + IConnectableLayer* activation = 
net->AddActivationLayer(descriptor, "Activation"); + + IConnectableLayer* output = net->AddOutputLayer(0, "Output"); + + input->GetOutputSlot(0).Connect(activation->GetInputSlot(0)); + activation->GetOutputSlot(0).Connect(output->GetInputSlot(0)); + + TensorInfo tensorInfo = TensorInfo({ 1, 24, 16, 3 }, DataType::Float32); + unsigned int numElements = tensorInfo.GetNumElements(); + size_t totalBytes = numElements * sizeof(float); + + input->GetOutputSlot(0).SetTensorInfo(tensorInfo); + activation->GetOutputSlot(0).SetTensorInfo(tensorInfo); + + // Optimize the network + OptimizerOptions optOptions; + optOptions.m_ImportEnabled = true; + std::vector backends = {armnn::Compute::GpuAcc}; + IOptimizedNetworkPtr optNet = Optimize(*net, backends, runtime->GetDeviceSpec(), optOptions); + BOOST_CHECK(optNet); + + // Loads it into the runtime. + NetworkId netId; + std::string ignoredErrorMessage; + // Enable Importing + INetworkProperties networkProperties(false, MemorySource::Malloc, MemorySource::Malloc); + runtime->LoadNetwork(netId, std::move(optNet), ignoredErrorMessage, networkProperties); + + // Creates structures for input & output + const size_t alignment = + arm_compute::CLKernelLibrary::get().get_device().getInfo(); + size_t space = totalBytes + alignment + alignment; + auto inputData = std::make_unique(space); + void* alignedInputPtr = inputData.get(); + BOOST_CHECK(std::align(alignment, totalBytes, alignedInputPtr, space)); + + // Input with negative values + auto* intputPtr = reinterpret_cast(alignedInputPtr); + std::fill_n(intputPtr, numElements, -5.0f); + + auto outputData = std::make_unique(space); + void* alignedOutputPtr = outputData.get(); + BOOST_CHECK(std::align(alignment, totalBytes, alignedOutputPtr, space)); + + InputTensors inputTensors + { + {0,armnn::ConstTensor(runtime->GetInputTensorInfo(netId, 0), alignedInputPtr)}, + }; + OutputTensors outputTensors + { + {0,armnn::Tensor(runtime->GetOutputTensorInfo(netId, 0), alignedOutputPtr)} + }; + + 
runtime->GetProfiler(netId)->EnableProfiling(true); + + // Do the inference + runtime->EnqueueWorkload(netId, inputTensors, outputTensors); + + // Retrieve the Profiler.Print() output to get the workload execution + ProfilerManager& profilerManager = armnn::ProfilerManager::GetInstance(); + std::stringstream ss; + profilerManager.GetProfiler()->Print(ss);; + std::string dump = ss.str(); + + // Contains ActivationWorkload + std::size_t found = dump.find("ActivationWorkload"); + BOOST_TEST(found != std::string::npos); + + // Contains SyncMemGeneric + found = dump.find("SyncMemGeneric"); + BOOST_TEST(found != std::string::npos); + + // Does not contain CopyMemGeneric + found = dump.find("CopyMemGeneric"); + BOOST_TEST(found == std::string::npos); + + // Check output is as expected + // Validate result by checking that the output has no negative values + auto* outputResult = reinterpret_cast(alignedOutputPtr); + BOOST_TEST(outputResult); + for(unsigned int i = 0; i < numElements; ++i) + { + BOOST_TEST(outputResult[i] >= 0); + } + + runtime->UnloadNetwork(netId); +} + +BOOST_AUTO_TEST_SUITE_END() diff --git a/src/backends/neon/test/NeonFallbackTests.cpp b/src/backends/neon/test/NeonFallbackTests.cpp index 8dc592db5d..383a5f654c 100644 --- a/src/backends/neon/test/NeonFallbackTests.cpp +++ b/src/backends/neon/test/NeonFallbackTests.cpp @@ -16,7 +16,7 @@ BOOST_AUTO_TEST_CASE(FallbackImportToCpuAcc) { using namespace armnn; - // Create a mock backend object + // Create a mock backend object MockImportBackendInitialiser initialiser; // Register the Mock Backend auto backendObjPtr = CreateBackendObject(MockImportBackendId()); BOOST_TEST((backendObjPtr != nullptr)); @@ -677,7 +677,7 @@ BOOST_AUTO_TEST_CASE(FallbackDisableImportFromCpuAcc) } #if defined(ARMCOMPUTECL_ENABLED) -BOOST_AUTO_TEST_CASE(NeonImportEnabledFallbackToCl, * boost::unit_test::disabled()) +BOOST_AUTO_TEST_CASE(NeonImportEnabledFallbackToCl) { using namespace armnn; @@ -700,7 +700,7 @@ 
BOOST_AUTO_TEST_CASE(NeonImportEnabledFallbackToCl, * boost::unit_test::disabled add->GetOutputSlot(0).Connect(sub->GetInputSlot(1)); sub->GetOutputSlot(0).Connect(output->GetInputSlot(0)); - TensorInfo info = TensorInfo({ 1, 2, 3, 2 }, DataType::Float32); + TensorInfo info = TensorInfo({ 1, 2, 4, 2 }, DataType::Float32); input0->GetOutputSlot(0).SetTensorInfo(info); input1->GetOutputSlot(0).SetTensorInfo(info); @@ -752,29 +752,43 @@ BOOST_AUTO_TEST_CASE(NeonImportEnabledFallbackToCl, * boost::unit_test::disabled // Creates structures for input & output std::vector inputData0 { - 1.0f, 1.0f, 2.0f, 2.0f, 2.0f, 3.0f, 4.0f, 4.0f, 5.0f, 5.0f, 6.0f, 6.0f + 1.0f, 1.0f, 2.0f, 2.0f, 2.0f, 3.0f, 4.0f, 4.0f, 5.0f, 5.0f, 6.0f, 6.0f, 1.0f, 1.0f, 2.0f, 2.0f }; std::vector inputData1 { - 0.0f, 1.0f, 1.0f, 2.0f, 3.0f, 3.0f, 3.0f, 4.0f, 4.0f, 5.0f, 5.0f, 6.0f + 0.0f, 1.0f, 1.0f, 2.0f, 3.0f, 3.0f, 3.0f, 4.0f, 4.0f, 5.0f, 5.0f, 6.0f, 0.0f, 1.0f, 1.0f, 2.0f }; std::vector inputData2 { - 12.0f, 11.0f, 10.0f, 9.0f, 8.0f, 7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f + 12.0f, 11.0f, 10.0f, 9.0f, 8.0f, 7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 12.0f, 11.0f, 10.0f, 9.0f }; - std::vector outputData(12); + std::vector outputData(16); std::vector expectedOutput { - 11.0f, 9.0f, 7.0f, 5.0f, 3.0f, 1.0f, -1.0f, -3.0f, -5.0f, -7.0f, -9.0f, -11.0f + 11.0f, 9.0f, 7.0f, 5.0f, 3.0f, 1.0f, -1.0f, -3.0f, -5.0f, -7.0f, -9.0f, -11.0f, 11.0f, 9.0f, 7.0f, 5.0f }; + // Creates structures for input & output + unsigned int numElements = info.GetNumElements(); + size_t totalBytes = numElements * sizeof(float); + + // Prepare aligned data + const size_t alignment = 64; + size_t space = totalBytes + alignment + alignment; + auto inputData = std::make_unique(space); + void* alignedInputPtr = inputData.get(); + BOOST_CHECK(std::align(alignment, totalBytes, alignedInputPtr, space)); + + auto* intputPtr = reinterpret_cast(alignedInputPtr); + std::copy(inputData2.begin(), inputData2.end(), intputPtr); + InputTensors 
inputTensors { { 0, armnn::ConstTensor(runtime->GetInputTensorInfo(netId, 0), inputData0.data()) }, { 1, armnn::ConstTensor(runtime->GetInputTensorInfo(netId, 1), inputData1.data()) }, - { 2, armnn::ConstTensor(runtime->GetInputTensorInfo(netId, 2), inputData2.data()) } + { 2, armnn::ConstTensor(runtime->GetInputTensorInfo(netId, 2), alignedInputPtr) } }; OutputTensors outputTensors { @@ -801,7 +815,11 @@ BOOST_AUTO_TEST_CASE(NeonImportEnabledFallbackToCl, * boost::unit_test::disabled BOOST_TEST(found != std::string::npos); // Check output is as expected - BOOST_TEST(outputData == expectedOutput); + for(unsigned int i = 0; i < numElements; ++i) + { + BOOST_TEST(outputData[i] == expectedOutput[i]); + } + runtime->UnloadNetwork(netId); } BOOST_AUTO_TEST_CASE(NeonImportDisabledFallbackToCl) @@ -926,7 +944,7 @@ BOOST_AUTO_TEST_CASE(NeonImportDisabledFallbackToCl) BOOST_TEST(outputData == expectedOutput); } -BOOST_AUTO_TEST_CASE(NeonImportEnabledFallbackSubgraphToCl, * boost::unit_test::disabled()) +BOOST_AUTO_TEST_CASE(NeonImportEnabledFallbackSubgraphToCl) { using namespace armnn; @@ -937,6 +955,10 @@ BOOST_AUTO_TEST_CASE(NeonImportEnabledFallbackSubgraphToCl, * boost::unit_test:: INetworkPtr net(INetwork::Create()); Pooling2dDescriptor desc; + desc.m_PoolWidth = 2; + desc.m_PoolHeight = 2; + desc.m_StrideX = 2; + desc.m_StrideY = 2; IConnectableLayer* input0 = net->AddInputLayer(0, "input0"); IConnectableLayer* input1 = net->AddInputLayer(1, "input1"); @@ -953,8 +975,8 @@ BOOST_AUTO_TEST_CASE(NeonImportEnabledFallbackSubgraphToCl, * boost::unit_test:: sub->GetOutputSlot(0).Connect(pooling->GetInputSlot(0)); pooling->GetOutputSlot(0).Connect(output->GetInputSlot(0)); - TensorInfo info = TensorInfo({ 1, 2, 3, 2 }, DataType::Float32); - TensorInfo poolingInfo = TensorInfo({ 1, 2, 1, 1 }, DataType::Float32); + TensorInfo info = TensorInfo({ 1, 2, 4, 2 }, DataType::Float32); + TensorInfo poolingInfo = TensorInfo({ 1, 2, 2, 1 }, DataType::Float32); 
input0->GetOutputSlot(0).SetTensorInfo(info); input1->GetOutputSlot(0).SetTensorInfo(info); @@ -1012,26 +1034,38 @@ BOOST_AUTO_TEST_CASE(NeonImportEnabledFallbackSubgraphToCl, * boost::unit_test:: // Creates structures for input & output std::vector inputData0 { - 1.0f, 1.0f, 2.0f, 2.0f, 2.0f, 3.0f, 4.0f, 4.0f, 5.0f, 5.0f, 6.0f, 6.0f + 1.0f, 1.0f, 2.0f, 2.0f, 2.0f, 3.0f, 4.0f, 4.0f, 5.0f, 5.0f, 6.0f, 6.0f, 1.0f, 1.0f, 2.0f, 2.0f }; std::vector inputData1 { - 0.0f, 1.0f, 1.0f, 2.0f, 3.0f, 3.0f, 3.0f, 4.0f, 4.0f, 5.0f, 5.0f, 6.0f + 0.0f, 1.0f, 1.0f, 2.0f, 3.0f, 3.0f, 3.0f, 4.0f, 4.0f, 5.0f, 5.0f, 6.0f, 0.0f, 1.0f, 1.0f, 2.0f }; std::vector inputData2 { - 12.0f, 11.0f, 10.0f, 9.0f, 8.0f, 7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f + 12.0f, 11.0f, 10.0f, 9.0f, 8.0f, 7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 12.0f, 11.0f, 10.0f, 9.0f }; - std::vector outputData(2); + std::vector outputData(4); - std::vector expectedOutput{ 11.0f, -1.0f }; + std::vector expectedOutput{ 11.0f, 3.0f, -5.0f, 11.0f }; + + // Prepare aligned data + unsigned int numElements = info.GetNumElements(); + size_t totalBytes = numElements * sizeof(float); + const size_t alignment = 64; + size_t space = totalBytes + alignment + alignment; + auto inputData = std::make_unique(space); + void* alignedInputPtr = inputData.get(); + BOOST_CHECK(std::align(alignment, totalBytes, alignedInputPtr, space)); + + auto* intputPtr = reinterpret_cast(alignedInputPtr); + std::copy(inputData2.begin(), inputData2.end(), intputPtr); InputTensors inputTensors { { 0, armnn::ConstTensor(runtime->GetInputTensorInfo(netId, 0), inputData0.data()) }, { 1, armnn::ConstTensor(runtime->GetInputTensorInfo(netId, 1), inputData1.data()) }, - { 2, armnn::ConstTensor(runtime->GetInputTensorInfo(netId, 2), inputData2.data()) } + { 2, armnn::ConstTensor(runtime->GetInputTensorInfo(netId, 2), alignedInputPtr) } }; OutputTensors outputTensors { @@ -1067,6 +1101,7 @@ BOOST_AUTO_TEST_CASE(NeonImportEnabledFallbackSubgraphToCl, * 
boost::unit_test:: // Check output is as expected BOOST_TEST(outputData == expectedOutput); + runtime->UnloadNetwork(netId); } BOOST_AUTO_TEST_CASE(NeonImportDisableFallbackSubgraphToCl) -- cgit v1.2.1