author    Narumol Prangnawarat <narumol.prangnawarat@arm.com>  2020-10-30 16:06:55 +0000
committer Narumol Prangnawarat <narumol.prangnawarat@arm.com>  2020-10-30 18:29:37 +0000
commit    265e53e61b472f7de9897b0dbcff1661e3f576cc (patch)
tree      a99253a1d9fe8297830be83accd6d7c08fe9b44f
parent    25d80eed552df4d0346d1f245d1e6264d7b477f3 (diff)
download  armnn-265e53e61b472f7de9897b0dbcff1661e3f576cc.tar.gz
IVGCVSW-5322 Fix segfault between Neon and Cl layers
* Fallback to memory copy if memory import is not supported
* Remove direct compatibility between Neon and Cl Tensors
* Unit tests fallback from Neon to Cl and Cl to Neon

Signed-off-by: Narumol Prangnawarat <narumol.prangnawarat@arm.com>
Change-Id: Iec00a77423fb23b37a6b1aefee1b2ec4d649efca
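[Editor's note] The core of the change in LoadedNetwork.cpp is a fallback pattern: instead of throwing when the target backend cannot import memory, a needMemCopy flag records that the import path was not taken, and a CopyMemGenericWorkload is enqueued instead. A minimal standalone sketch of that control flow follows; BackendSupportsImport, TryImport, and EnqueueCopyWorkload are hypothetical stand-ins for illustration, not the ArmNN API.

#include <iostream>
#include <stdexcept>

// Hypothetical stand-ins for the ArmNN calls involved.
bool BackendSupportsImport() { return false; } // cf. CheckFlag(importFlags, MemorySource::Malloc)
bool TryImport()             { return true;  } // cf. ITensorHandle::Import(mem, MemorySource::Malloc)
void EnqueueCopyWorkload()   { std::cout << "CopyMemGenericWorkload\n"; }

void EnqueueInputSketch()
{
    bool needMemCopy = true;
    if (BackendSupportsImport())
    {
        needMemCopy = false; // the import path owns the transfer
        if (!TryImport())
        {
            throw std::runtime_error("EnqueueInput: Memory Import failed");
        }
    }
    if (needMemCopy)
    {
        // Before this commit, an else branch threw when import was unsupported;
        // copying instead avoids the segfault between Neon and Cl layers.
        EnqueueCopyWorkload();
    }
}

int main()
{
    EnqueueInputSketch(); // prints "CopyMemGenericWorkload" since import is unsupported here
    return 0;
}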
-rw-r--r--  src/armnn/LoadedNetwork.cpp                               |  20
-rw-r--r--  src/backends/backendsCommon/test/CompatibilityTests.cpp   |   2
-rw-r--r--  src/backends/cl/ClTensorHandleFactory.cpp                 |  10
-rw-r--r--  src/backends/cl/test/CMakeLists.txt                       |   6
-rw-r--r--  src/backends/cl/test/ClFallbackTests.cpp                  | 538
-rw-r--r--  src/backends/neon/NeonBackend.cpp                         |   3
-rw-r--r--  src/backends/neon/test/NeonFallbackTests.cpp              | 532
7 files changed, 1088 insertions, 23 deletions
diff --git a/src/armnn/LoadedNetwork.cpp b/src/armnn/LoadedNetwork.cpp
index 00ac90b121..b5a1b392b4 100644
--- a/src/armnn/LoadedNetwork.cpp
+++ b/src/armnn/LoadedNetwork.cpp
@@ -570,10 +570,12 @@ void LoadedNetwork::EnqueueInput(const BindableLayer& layer, ITensorHandle* tens
info.m_OutputTensorInfos.push_back(outputTensorInfo);
MemorySourceFlags importFlags = outputTensorHandle->GetImportFlags();
+ bool needMemCopy = true;
if (m_IsImportEnabled) // Try import the input tensor
{
if(CheckFlag(importFlags, MemorySource::Malloc) )
{
+ needMemCopy = false;
// This assumes a CPU Tensor handle
void* mem = tensorHandle->Map(false);
if (outputTensorHandle->Import(mem, MemorySource::Malloc))
@@ -584,12 +586,8 @@ void LoadedNetwork::EnqueueInput(const BindableLayer& layer, ITensorHandle* tens
tensorHandle->Unmap();
throw MemoryImportException("EnqueueInput: Memory Import failed");
}
- else
- {
- throw MemoryImportException("EnqueueInput: Memory Import failed, backend does not support Import");
- }
}
- else
+ if (needMemCopy)
{
// Create a mem copy workload for input since we did not import
std::unique_ptr<IWorkload> inputWorkload = std::make_unique<CopyMemGenericWorkload>(inputQueueDescriptor, info);
@@ -643,6 +641,7 @@ void LoadedNetwork::EnqueueOutput(const BindableLayer& layer, ITensorHandle* ten
// c) There is only one connection to the OutputSlot and it is to an OutputLayer.
// d) The output pointer is allocated via malloc. (Other types will be supported in a later release)
// e) m_IsExportEnabled must be set to true
+ bool needMemCopy = true;
if (m_IsExportEnabled && (layer.GetInputSlots()[0].GetConnectedOutputSlot()->GetNumConnections() == 1))
{
if(layer.GetInputSlots()[0].GetConnectedOutputSlot()->GetOwningLayer().GetType() != LayerType::Input)
@@ -650,6 +649,7 @@ void LoadedNetwork::EnqueueOutput(const BindableLayer& layer, ITensorHandle* ten
MemorySourceFlags importFlags = inputTensorHandle->GetImportFlags();
if (CheckFlag(importFlags, MemorySource::Malloc))
{
+ needMemCopy = false;
void *mem = tensorHandle->Map(false);
bool importOk = inputTensorHandle->Import(mem, MemorySource::Malloc);
tensorHandle->Unmap();
@@ -669,17 +669,9 @@ void LoadedNetwork::EnqueueOutput(const BindableLayer& layer, ITensorHandle* ten
throw MemoryExportException("EnqueueOutput: Memory Export failed");
}
}
- else
- {
- throw MemoryExportException("EnqueueOutput: Memory Export failed, backend does not support Export");
- }
- }
- else
- {
- throw MemoryExportException("EnqueueOutput: Memory Export failed, attempting to export Input Layer");
}
}
- else
+ if (needMemCopy)
{
const Layer& connectedLayer = layer.GetInputSlots()[0].GetConnectedOutputSlot()->GetOwningLayer();
// Do not add MemCopy Layer if OutputLayer is already connected the MemCopy Layer
diff --git a/src/backends/backendsCommon/test/CompatibilityTests.cpp b/src/backends/backendsCommon/test/CompatibilityTests.cpp
index 90aa76e3f3..b69e11253d 100644
--- a/src/backends/backendsCommon/test/CompatibilityTests.cpp
+++ b/src/backends/backendsCommon/test/CompatibilityTests.cpp
@@ -15,7 +15,7 @@
using namespace armnn;
-BOOST_AUTO_TEST_SUITE(BackendsCompatibility)
+BOOST_AUTO_TEST_SUITE(BackendsCompatibility, * boost::unit_test::disabled())
BOOST_AUTO_TEST_CASE(Neon_Cl_DirectCompatibility_Test)
{
diff --git a/src/backends/cl/ClTensorHandleFactory.cpp b/src/backends/cl/ClTensorHandleFactory.cpp
index 33995f7b34..237f27a4ed 100644
--- a/src/backends/cl/ClTensorHandleFactory.cpp
+++ b/src/backends/cl/ClTensorHandleFactory.cpp
@@ -73,10 +73,11 @@ std::unique_ptr<ITensorHandle> ClTensorHandleFactory::CreateTensorHandle(const T
const bool IsMemoryManaged) const
{
std::unique_ptr<ClTensorHandle> tensorHandle = std::make_unique<ClTensorHandle>(tensorInfo);
- if (IsMemoryManaged)
+ if (!IsMemoryManaged)
{
- tensorHandle->SetMemoryGroup(m_MemoryManager->GetInterLayerMemoryGroup());
+ ARMNN_LOG(warning) << "ClTensorHandleFactory only supports memory managed tensor handles.";
}
+ tensorHandle->SetMemoryGroup(m_MemoryManager->GetInterLayerMemoryGroup());
return tensorHandle;
}
@@ -85,10 +86,11 @@ std::unique_ptr<ITensorHandle> ClTensorHandleFactory::CreateTensorHandle(const T
const bool IsMemoryManaged) const
{
std::unique_ptr<ClTensorHandle> tensorHandle = std::make_unique<ClTensorHandle>(tensorInfo, dataLayout);
- if (IsMemoryManaged)
+ if (!IsMemoryManaged)
{
- tensorHandle->SetMemoryGroup(m_MemoryManager->GetInterLayerMemoryGroup());
+ ARMNN_LOG(warning) << "ClTensorHandleFactory only supports memory managed tensor handles.";
}
+ tensorHandle->SetMemoryGroup(m_MemoryManager->GetInterLayerMemoryGroup());
return tensorHandle;
}
diff --git a/src/backends/cl/test/CMakeLists.txt b/src/backends/cl/test/CMakeLists.txt
index b0b330931d..2cf00d106b 100644
--- a/src/backends/cl/test/CMakeLists.txt
+++ b/src/backends/cl/test/CMakeLists.txt
@@ -23,6 +23,12 @@ if (ARMNNREF)
)
endif()
+if (ARMCOMPUTENEON)
+ list(APPEND armnnClBackendUnitTests_sources
+ ClFallbackTests.cpp
+ )
+endif()
+
add_library(armnnClBackendUnitTests OBJECT ${armnnClBackendUnitTests_sources})
target_include_directories(armnnClBackendUnitTests PRIVATE ${PROJECT_SOURCE_DIR}/src/armnn)
target_include_directories(armnnClBackendUnitTests PRIVATE ${PROJECT_SOURCE_DIR}/src/armnnUtils)
diff --git a/src/backends/cl/test/ClFallbackTests.cpp b/src/backends/cl/test/ClFallbackTests.cpp
new file mode 100644
index 0000000000..5885cbe8ef
--- /dev/null
+++ b/src/backends/cl/test/ClFallbackTests.cpp
@@ -0,0 +1,538 @@
+//
+// Copyright © 2020 Arm Ltd and Contributors. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#include <backendsCommon/test/CommonTestUtils.hpp>
+
+#include <test/GraphUtils.hpp>
+
+#include <boost/test/unit_test.hpp>
+
+BOOST_AUTO_TEST_SUITE(ClFallback)
+
+BOOST_AUTO_TEST_CASE(ClImportEnabledFallbackToNeon)
+{
+ using namespace armnn;
+
+ IRuntime::CreationOptions options;
+ IRuntimePtr runtime(IRuntime::Create(options));
+
+ // Builds up the structure of the network.
+ INetworkPtr net(INetwork::Create());
+
+ IConnectableLayer* input0 = net->AddInputLayer(0, "input0");
+ IConnectableLayer* input1 = net->AddInputLayer(1, "input1");
+ IConnectableLayer* input2 = net->AddInputLayer(2, "input2");
+ IConnectableLayer* add = net->AddAdditionLayer("add");
+ IConnectableLayer* sub = net->AddSubtractionLayer("sub");
+ IConnectableLayer* output = net->AddOutputLayer(0, "output");
+
+ input0->GetOutputSlot(0).Connect(add->GetInputSlot(0));
+ input1->GetOutputSlot(0).Connect(add->GetInputSlot(1));
+ input2->GetOutputSlot(0).Connect(sub->GetInputSlot(0));
+ add->GetOutputSlot(0).Connect(sub->GetInputSlot(1));
+ sub->GetOutputSlot(0).Connect(output->GetInputSlot(0));
+
+ TensorInfo info = TensorInfo({ 1, 2, 3, 2 }, DataType::Float32);
+
+ input0->GetOutputSlot(0).SetTensorInfo(info);
+ input1->GetOutputSlot(0).SetTensorInfo(info);
+ input2->GetOutputSlot(0).SetTensorInfo(info);
+ add->GetOutputSlot(0).SetTensorInfo(info);
+ sub->GetOutputSlot(0).SetTensorInfo(info);
+
+ std::vector<BackendId> backends = { Compute::GpuAcc, Compute::CpuAcc };
+ // Use BackendSelectionHint to specify CpuAcc for Subtraction layer
+ sub->BackendSelectionHint(backends[1]);
+
+ // optimize the network
+ OptimizerOptions optOptions;
+ optOptions.m_ImportEnabled = true;
+ IOptimizedNetworkPtr optNet = Optimize(*net, backends, runtime->GetDeviceSpec(), optOptions);
+
+ OptimizedNetwork* optNetObjPtr = PolymorphicDowncast<OptimizedNetwork*>(optNet.get());
+ Graph& graph = optNetObjPtr->GetGraph();
+
+ armnn::Layer* const layer0 = GetFirstLayerWithName(graph, "input0");
+ armnn::Layer* const layer1 = GetFirstLayerWithName(graph, "input1");
+ armnn::Layer* const layer2 = GetFirstLayerWithName(graph, "input2");
+ armnn::Layer* const layer3 = GetFirstLayerWithName(graph, "add");
+ armnn::Layer* const layer4 = GetFirstLayerWithName(graph, "[ add (0) -> sub (1) ]");
+ armnn::Layer* const layer5 = GetFirstLayerWithName(graph, "sub");
+ armnn::Layer* const layer6 = GetFirstLayerWithName(graph, "output");
+
+ // Checks order is valid.
+ BOOST_TEST(CheckOrder(graph, layer0, layer1));
+ BOOST_TEST(CheckOrder(graph, layer1, layer2));
+ BOOST_TEST(CheckOrder(graph, layer2, layer3));
+ BOOST_TEST(CheckOrder(graph, layer3, layer4));
+ BOOST_TEST(CheckOrder(graph, layer4, layer5));
+ BOOST_TEST(CheckOrder(graph, layer5, layer6));
+
+ // Use memory import between backends
+ BOOST_TEST((layer4->GetType() == LayerType::MemCopy));
+
+ // Correctly use backend hint
+ BOOST_TEST((layer5->GetBackendId() == Compute::CpuAcc ));
+
+ // Load it into the runtime. It should pass.
+ NetworkId netId;
+ std::string ignoredErrorMessage;
+ INetworkProperties networkProperties(true, true);
+
+ runtime->LoadNetwork(netId, std::move(optNet), ignoredErrorMessage, networkProperties);
+
+ // Creates structures for input & output
+ std::vector<float> inputData0
+ {
+ 1.0f, 1.0f, 2.0f, 2.0f, 2.0f, 3.0f, 4.0f, 4.0f, 5.0f, 5.0f, 6.0f, 6.0f
+ };
+ std::vector<float> inputData1
+ {
+ 0.0f, 1.0f, 1.0f, 2.0f, 3.0f, 3.0f, 3.0f, 4.0f, 4.0f, 5.0f, 5.0f, 6.0f
+ };
+ std::vector<float> inputData2
+ {
+ 12.0f, 11.0f, 10.0f, 9.0f, 8.0f, 7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f
+ };
+
+ std::vector<float> outputData(12);
+
+ std::vector<float> expectedOutput
+ {
+ 11.0f, 9.0f, 7.0f, 5.0f, 3.0f, 1.0f, -1.0f, -3.0f, -5.0f, -7.0f, -9.0f, -11.0f
+ };
+
+ InputTensors inputTensors
+ {
+ { 0, armnn::ConstTensor(runtime->GetInputTensorInfo(netId, 0), inputData0.data()) },
+ { 1, armnn::ConstTensor(runtime->GetInputTensorInfo(netId, 1), inputData1.data()) },
+ { 2, armnn::ConstTensor(runtime->GetInputTensorInfo(netId, 2), inputData2.data()) }
+ };
+ OutputTensors outputTensors
+ {
+ { 0,armnn::Tensor(runtime->GetOutputTensorInfo(netId, 0), outputData.data()) }
+ };
+
+ runtime->GetProfiler(netId)->EnableProfiling(true);
+
+ // Do the inference
+ runtime->EnqueueWorkload(netId, inputTensors, outputTensors);
+
+ // Retrieve the Profiler.Print() output to get the workload execution
+ ProfilerManager& profilerManager = armnn::ProfilerManager::GetInstance();
+ std::stringstream ss;
+ profilerManager.GetProfiler()->Print(ss);
+ std::string dump = ss.str();
+
+ // Executed Subtraction using CpuAcc
+ std::size_t found = dump.find("NeonSubtractionWorkload_Execute");
+ BOOST_TEST(found != std::string::npos);
+
+ // Contains CopyMemGeneric
+ found = dump.find("CopyMemGeneric");
+ BOOST_TEST(found != std::string::npos);
+
+ // Check output is as expected
+ BOOST_TEST(outputData == expectedOutput);
+}
+
+BOOST_AUTO_TEST_CASE(ClImportDisabledFallbackToNeon)
+{
+ using namespace armnn;
+
+ IRuntime::CreationOptions options;
+ IRuntimePtr runtime(IRuntime::Create(options));
+
+ // Builds up the structure of the network.
+ INetworkPtr net(INetwork::Create());
+
+ IConnectableLayer* input0 = net->AddInputLayer(0, "input0");
+ IConnectableLayer* input1 = net->AddInputLayer(1, "input1");
+ IConnectableLayer* input2 = net->AddInputLayer(2, "input2");
+ IConnectableLayer* add = net->AddAdditionLayer("add");
+ IConnectableLayer* sub = net->AddSubtractionLayer("sub");
+ IConnectableLayer* output = net->AddOutputLayer(0, "output");
+
+ input0->GetOutputSlot(0).Connect(add->GetInputSlot(0));
+ input1->GetOutputSlot(0).Connect(add->GetInputSlot(1));
+ input2->GetOutputSlot(0).Connect(sub->GetInputSlot(0));
+ add->GetOutputSlot(0).Connect(sub->GetInputSlot(1));
+ sub->GetOutputSlot(0).Connect(output->GetInputSlot(0));
+
+ TensorInfo info = TensorInfo({ 1, 2, 3, 2 }, DataType::Float32);
+
+ input0->GetOutputSlot(0).SetTensorInfo(info);
+ input1->GetOutputSlot(0).SetTensorInfo(info);
+ input2->GetOutputSlot(0).SetTensorInfo(info);
+ add->GetOutputSlot(0).SetTensorInfo(info);
+ sub->GetOutputSlot(0).SetTensorInfo(info);
+
+ std::vector<BackendId> backends = { Compute::GpuAcc, Compute::CpuAcc };
+ // Use BackendSelectionHint to specify CpuAcc for Subtraction layer
+ sub->BackendSelectionHint(backends[1]);
+
+ // optimize the network
+ OptimizerOptions optOptions;
+ IOptimizedNetworkPtr optNet = Optimize(*net, backends, runtime->GetDeviceSpec(), optOptions);
+
+ OptimizedNetwork* optNetObjPtr = PolymorphicDowncast<OptimizedNetwork*>(optNet.get());
+ Graph& graph = optNetObjPtr->GetGraph();
+
+ armnn::Layer* const layer0 = GetFirstLayerWithName(graph, "input0");
+ armnn::Layer* const layer1 = GetFirstLayerWithName(graph, "input1");
+ armnn::Layer* const layer2 = GetFirstLayerWithName(graph, "input2");
+ armnn::Layer* const layer3 = GetFirstLayerWithName(graph, "add");
+ armnn::Layer* const layer4 = GetFirstLayerWithName(graph, "[ add (0) -> sub (1) ]");
+ armnn::Layer* const layer5 = GetFirstLayerWithName(graph, "sub");
+ armnn::Layer* const layer6 = GetFirstLayerWithName(graph, "output");
+
+ // Checks order is valid.
+ BOOST_TEST(CheckOrder(graph, layer0, layer1));
+ BOOST_TEST(CheckOrder(graph, layer1, layer2));
+ BOOST_TEST(CheckOrder(graph, layer2, layer3));
+ BOOST_TEST(CheckOrder(graph, layer3, layer4));
+ BOOST_TEST(CheckOrder(graph, layer4, layer5));
+ BOOST_TEST(CheckOrder(graph, layer5, layer6));
+
+ // Use memory import between backends
+ BOOST_TEST((layer4->GetType() == LayerType::MemCopy));
+
+ // Correctly use backend hint
+ BOOST_TEST((layer5->GetBackendId() == Compute::CpuAcc ));
+
+ // Load it into the runtime. It should pass.
+ NetworkId netId;
+ runtime->LoadNetwork(netId, std::move(optNet));
+
+ // Creates structures for input & output
+ std::vector<float> inputData0
+ {
+ 1.0f, 1.0f, 2.0f, 2.0f, 2.0f, 3.0f, 4.0f, 4.0f, 5.0f, 5.0f, 6.0f, 6.0f
+ };
+ std::vector<float> inputData1
+ {
+ 0.0f, 1.0f, 1.0f, 2.0f, 3.0f, 3.0f, 3.0f, 4.0f, 4.0f, 5.0f, 5.0f, 6.0f
+ };
+ std::vector<float> inputData2
+ {
+ 12.0f, 11.0f, 10.0f, 9.0f, 8.0f, 7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f
+ };
+
+ std::vector<float> outputData(12);
+
+ std::vector<float> expectedOutput
+ {
+ 11.0f, 9.0f, 7.0f, 5.0f, 3.0f, 1.0f, -1.0f, -3.0f, -5.0f, -7.0f, -9.0f, -11.0f
+ };
+
+ InputTensors inputTensors
+ {
+ { 0, armnn::ConstTensor(runtime->GetInputTensorInfo(netId, 0), inputData0.data()) },
+ { 1, armnn::ConstTensor(runtime->GetInputTensorInfo(netId, 1), inputData1.data()) },
+ { 2, armnn::ConstTensor(runtime->GetInputTensorInfo(netId, 2), inputData2.data()) }
+ };
+ OutputTensors outputTensors
+ {
+ { 0,armnn::Tensor(runtime->GetOutputTensorInfo(netId, 0), outputData.data()) }
+ };
+
+ runtime->GetProfiler(netId)->EnableProfiling(true);
+
+ // Do the inference
+ runtime->EnqueueWorkload(netId, inputTensors, outputTensors);
+
+ // Retrieve the Profiler.Print() output to get the workload execution
+ ProfilerManager& profilerManager = armnn::ProfilerManager::GetInstance();
+ std::stringstream ss;
+ profilerManager.GetProfiler()->Print(ss);
+ std::string dump = ss.str();
+
+ // Executed Subtraction using CpuAcc
+ std::size_t found = dump.find("NeonSubtractionWorkload_Execute");
+ BOOST_TEST(found != std::string::npos);
+
+ // Contains CopyMemGeneric
+ found = dump.find("CopyMemGeneric");
+ BOOST_TEST(found != std::string::npos);
+
+ // Check output is as expected
+ BOOST_TEST(outputData == expectedOutput);
+}
+
+BOOST_AUTO_TEST_CASE(ClImportEnabledFallbackSubgraphToNeon)
+{
+ using namespace armnn;
+
+ IRuntime::CreationOptions options;
+ IRuntimePtr runtime(IRuntime::Create(options));
+
+ // Builds up the structure of the network.
+ INetworkPtr net(INetwork::Create());
+
+ Pooling2dDescriptor desc;
+
+ IConnectableLayer* input0 = net->AddInputLayer(0, "input0");
+ IConnectableLayer* input1 = net->AddInputLayer(1, "input1");
+ IConnectableLayer* input2 = net->AddInputLayer(2, "input2");
+ IConnectableLayer* add = net->AddAdditionLayer("add");
+ IConnectableLayer* sub = net->AddSubtractionLayer("sub");
+ IConnectableLayer* pooling = net->AddPooling2dLayer(desc, "pooling");
+ IConnectableLayer* output = net->AddOutputLayer(0, "output");
+
+ input0->GetOutputSlot(0).Connect(add->GetInputSlot(0));
+ input1->GetOutputSlot(0).Connect(add->GetInputSlot(1));
+ input2->GetOutputSlot(0).Connect(sub->GetInputSlot(0));
+ add->GetOutputSlot(0).Connect(sub->GetInputSlot(1));
+ sub->GetOutputSlot(0).Connect(pooling->GetInputSlot(0));
+ pooling->GetOutputSlot(0).Connect(output->GetInputSlot(0));
+
+ TensorInfo info = TensorInfo({ 1, 2, 3, 2 }, DataType::Float32);
+ TensorInfo poolingInfo = TensorInfo({ 1, 2, 1, 1 }, DataType::Float32);
+
+ input0->GetOutputSlot(0).SetTensorInfo(info);
+ input1->GetOutputSlot(0).SetTensorInfo(info);
+ input2->GetOutputSlot(0).SetTensorInfo(info);
+ add->GetOutputSlot(0).SetTensorInfo(info);
+ sub->GetOutputSlot(0).SetTensorInfo(info);
+ pooling->GetOutputSlot(0).SetTensorInfo(poolingInfo);
+
+ std::vector<BackendId> backends = { Compute::GpuAcc, Compute::CpuAcc };
+ // Use BackendSelectionHint to specify CpuAcc for Subtraction layer
+ sub->BackendSelectionHint(backends[1]);
+
+ // optimize the network
+ OptimizerOptions optOptions;
+ optOptions.m_ImportEnabled = true;
+ IOptimizedNetworkPtr optNet = Optimize(*net, backends, runtime->GetDeviceSpec(), optOptions);
+
+ OptimizedNetwork* optNetObjPtr = PolymorphicDowncast<OptimizedNetwork*>(optNet.get());
+ Graph& graph = optNetObjPtr->GetGraph();
+
+ armnn::Layer* const layer0 = GetFirstLayerWithName(graph, "input0");
+ armnn::Layer* const layer1 = GetFirstLayerWithName(graph, "input1");
+ armnn::Layer* const layer2 = GetFirstLayerWithName(graph, "input2");
+ armnn::Layer* const layer3 = GetFirstLayerWithName(graph, "add");
+ armnn::Layer* const layer4 = GetFirstLayerWithName(graph, "[ add (0) -> sub (1) ]");
+ armnn::Layer* const layer5 = GetFirstLayerWithName(graph, "sub");
+ armnn::Layer* const layer6 = GetFirstLayerWithName(graph, "[ sub (0) -> pooling (0) ]");
+ armnn::Layer* const layer7 = GetFirstLayerWithName(graph, "pooling");
+ armnn::Layer* const layer8 = GetFirstLayerWithName(graph, "output");
+
+ // Checks order is valid.
+ BOOST_TEST(CheckOrder(graph, layer0, layer1));
+ BOOST_TEST(CheckOrder(graph, layer1, layer2));
+ BOOST_TEST(CheckOrder(graph, layer2, layer3));
+ BOOST_TEST(CheckOrder(graph, layer3, layer4));
+ BOOST_TEST(CheckOrder(graph, layer4, layer5));
+ BOOST_TEST(CheckOrder(graph, layer5, layer6));
+ BOOST_TEST(CheckOrder(graph, layer6, layer7));
+ BOOST_TEST(CheckOrder(graph, layer7, layer8));
+
+ // Use memory import between backends
+ BOOST_TEST((layer4->GetType() == LayerType::MemCopy));
+ BOOST_TEST((layer6->GetType() == LayerType::MemCopy));
+
+ // Correctly use backend hint
+ BOOST_TEST((layer5->GetBackendId() == Compute::CpuAcc ));
+
+ // Load it into the runtime. It should pass.
+ NetworkId netId;
+ std::string ignoredErrorMessage;
+ INetworkProperties networkProperties(true, true);
+
+ runtime->LoadNetwork(netId, std::move(optNet), ignoredErrorMessage, networkProperties);
+
+ // Creates structures for input & output
+ std::vector<float> inputData0
+ {
+ 1.0f, 1.0f, 2.0f, 2.0f, 2.0f, 3.0f, 4.0f, 4.0f, 5.0f, 5.0f, 6.0f, 6.0f
+ };
+ std::vector<float> inputData1
+ {
+ 0.0f, 1.0f, 1.0f, 2.0f, 3.0f, 3.0f, 3.0f, 4.0f, 4.0f, 5.0f, 5.0f, 6.0f
+ };
+ std::vector<float> inputData2
+ {
+ 12.0f, 11.0f, 10.0f, 9.0f, 8.0f, 7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f
+ };
+
+ std::vector<float> outputData(2);
+
+ std::vector<float> expectedOutput{ 11.0f, -1.0f };
+
+ InputTensors inputTensors
+ {
+ { 0, armnn::ConstTensor(runtime->GetInputTensorInfo(netId, 0), inputData0.data()) },
+ { 1, armnn::ConstTensor(runtime->GetInputTensorInfo(netId, 1), inputData1.data()) },
+ { 2, armnn::ConstTensor(runtime->GetInputTensorInfo(netId, 2), inputData2.data()) }
+ };
+ OutputTensors outputTensors
+ {
+ { 0,armnn::Tensor(runtime->GetOutputTensorInfo(netId, 0), outputData.data()) }
+ };
+
+ runtime->GetProfiler(netId)->EnableProfiling(true);
+
+ // Do the inference
+ runtime->EnqueueWorkload(netId, inputTensors, outputTensors);
+
+ // Retrieve the Profiler.Print() output to get the workload execution
+ ProfilerManager& profilerManager = armnn::ProfilerManager::GetInstance();
+ std::stringstream ss;
+ profilerManager.GetProfiler()->Print(ss);
+ std::string dump = ss.str();
+
+ // Executed Subtraction using CpuAcc
+ std::size_t found = dump.find("NeonSubtractionWorkload_Execute");
+ BOOST_TEST(found != std::string::npos);
+
+ // Correctly switch back to GpuAcc
+ found = dump.find("ClPooling2dWorkload_Execute");
+ BOOST_TEST(found != std::string::npos);
+
+ // Contains CopyMemGeneric
+ found = dump.find("CopyMemGeneric");
+ BOOST_TEST(found != std::string::npos);
+
+ // Check output is as expected
+ BOOST_TEST(outputData == expectedOutput);
+}
+
+BOOST_AUTO_TEST_CASE(ClImportDisableFallbackSubgraphToNeon)
+{
+ using namespace armnn;
+
+ IRuntime::CreationOptions options;
+ IRuntimePtr runtime(IRuntime::Create(options));
+
+ // Builds up the structure of the network.
+ INetworkPtr net(INetwork::Create());
+
+ Pooling2dDescriptor desc;
+
+ IConnectableLayer* input0 = net->AddInputLayer(0, "input0");
+ IConnectableLayer* input1 = net->AddInputLayer(1, "input1");
+ IConnectableLayer* input2 = net->AddInputLayer(2, "input2");
+ IConnectableLayer* add = net->AddAdditionLayer("add");
+ IConnectableLayer* sub = net->AddSubtractionLayer("sub");
+ IConnectableLayer* pooling = net->AddPooling2dLayer(desc, "pooling");
+ IConnectableLayer* output = net->AddOutputLayer(0, "output");
+
+ input0->GetOutputSlot(0).Connect(add->GetInputSlot(0));
+ input1->GetOutputSlot(0).Connect(add->GetInputSlot(1));
+ input2->GetOutputSlot(0).Connect(sub->GetInputSlot(0));
+ add->GetOutputSlot(0).Connect(sub->GetInputSlot(1));
+ sub->GetOutputSlot(0).Connect(pooling->GetInputSlot(0));
+ pooling->GetOutputSlot(0).Connect(output->GetInputSlot(0));
+
+ TensorInfo info = TensorInfo({ 1, 2, 3, 2 }, DataType::Float32);
+ TensorInfo poolingInfo = TensorInfo({ 1, 2, 1, 1 }, DataType::Float32);
+
+ input0->GetOutputSlot(0).SetTensorInfo(info);
+ input1->GetOutputSlot(0).SetTensorInfo(info);
+ input2->GetOutputSlot(0).SetTensorInfo(info);
+ add->GetOutputSlot(0).SetTensorInfo(info);
+ sub->GetOutputSlot(0).SetTensorInfo(info);
+ pooling->GetOutputSlot(0).SetTensorInfo(poolingInfo);
+
+ std::vector<BackendId> backends = { Compute::GpuAcc, Compute::CpuAcc };
+ // Use BackendSelectionHint to specify CpuAcc for Subtraction layer
+ sub->BackendSelectionHint(backends[1]);
+
+ // optimize the network
+ OptimizerOptions optOptions;
+ IOptimizedNetworkPtr optNet = Optimize(*net, backends, runtime->GetDeviceSpec(), optOptions);
+
+ OptimizedNetwork* optNetObjPtr = PolymorphicDowncast<OptimizedNetwork*>(optNet.get());
+ Graph& graph = optNetObjPtr->GetGraph();
+
+ armnn::Layer* const layer0 = GetFirstLayerWithName(graph, "input0");
+ armnn::Layer* const layer1 = GetFirstLayerWithName(graph, "input1");
+ armnn::Layer* const layer2 = GetFirstLayerWithName(graph, "input2");
+ armnn::Layer* const layer3 = GetFirstLayerWithName(graph, "add");
+ armnn::Layer* const layer4 = GetFirstLayerWithName(graph, "[ add (0) -> sub (1) ]");
+ armnn::Layer* const layer5 = GetFirstLayerWithName(graph, "sub");
+ armnn::Layer* const layer6 = GetFirstLayerWithName(graph, "[ sub (0) -> pooling (0) ]");
+ armnn::Layer* const layer7 = GetFirstLayerWithName(graph, "pooling");
+ armnn::Layer* const layer8 = GetFirstLayerWithName(graph, "output");
+
+ // Checks order is valid.
+ BOOST_TEST(CheckOrder(graph, layer0, layer1));
+ BOOST_TEST(CheckOrder(graph, layer1, layer2));
+ BOOST_TEST(CheckOrder(graph, layer2, layer3));
+ BOOST_TEST(CheckOrder(graph, layer3, layer4));
+ BOOST_TEST(CheckOrder(graph, layer4, layer5));
+ BOOST_TEST(CheckOrder(graph, layer5, layer6));
+ BOOST_TEST(CheckOrder(graph, layer6, layer7));
+ BOOST_TEST(CheckOrder(graph, layer7, layer8));
+
+ // Use memory import between backends
+ BOOST_TEST((layer4->GetType() == LayerType::MemCopy));
+ BOOST_TEST((layer6->GetType() == LayerType::MemCopy));
+
+ // Correctly use backend hint
+ BOOST_TEST((layer5->GetBackendId() == Compute::CpuAcc ));
+
+ // Load it into the runtime. It should pass.
+ NetworkId netId;
+ runtime->LoadNetwork(netId, std::move(optNet));
+
+ // Creates structures for input & output
+ std::vector<float> inputData0
+ {
+ 1.0f, 1.0f, 2.0f, 2.0f, 2.0f, 3.0f, 4.0f, 4.0f, 5.0f, 5.0f, 6.0f, 6.0f
+ };
+ std::vector<float> inputData1
+ {
+ 0.0f, 1.0f, 1.0f, 2.0f, 3.0f, 3.0f, 3.0f, 4.0f, 4.0f, 5.0f, 5.0f, 6.0f
+ };
+ std::vector<float> inputData2
+ {
+ 12.0f, 11.0f, 10.0f, 9.0f, 8.0f, 7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f
+ };
+
+ std::vector<float> outputData(2);
+
+ std::vector<float> expectedOutput{ 11.0f, -1.0f };
+
+ InputTensors inputTensors
+ {
+ { 0, armnn::ConstTensor(runtime->GetInputTensorInfo(netId, 0), inputData0.data()) },
+ { 1, armnn::ConstTensor(runtime->GetInputTensorInfo(netId, 1), inputData1.data()) },
+ { 2, armnn::ConstTensor(runtime->GetInputTensorInfo(netId, 2), inputData2.data()) }
+ };
+ OutputTensors outputTensors
+ {
+ { 0,armnn::Tensor(runtime->GetOutputTensorInfo(netId, 0), outputData.data()) }
+ };
+
+ runtime->GetProfiler(netId)->EnableProfiling(true);
+
+ // Do the inference
+ runtime->EnqueueWorkload(netId, inputTensors, outputTensors);
+
+ // Retrieve the Profiler.Print() output to get the workload execution
+ ProfilerManager& profilerManager = armnn::ProfilerManager::GetInstance();
+ std::stringstream ss;
+ profilerManager.GetProfiler()->Print(ss);
+ std::string dump = ss.str();
+
+ // Executed Subtraction using CpuAcc
+ std::size_t found = dump.find("NeonSubtractionWorkload_Execute");
+ BOOST_TEST(found != std::string::npos);
+
+ // Correctly switch back to GpuAcc
+ found = dump.find("ClPooling2dWorkload_Execute");
+ BOOST_TEST(found != std::string::npos);
+
+ // Contains CopyMemGeneric
+ found = dump.find("CopyMemGeneric");
+ BOOST_TEST(found != std::string::npos);
+
+ // Check output is as expected
+ BOOST_TEST(outputData == expectedOutput);
+}
+
+BOOST_AUTO_TEST_SUITE_END()
diff --git a/src/backends/neon/NeonBackend.cpp b/src/backends/neon/NeonBackend.cpp
index 642c19dbe4..9862ddbd70 100644
--- a/src/backends/neon/NeonBackend.cpp
+++ b/src/backends/neon/NeonBackend.cpp
@@ -129,8 +129,7 @@ OptimizationViews NeonBackend::OptimizeSubgraphView(const SubgraphView& subgraph
std::vector<ITensorHandleFactory::FactoryId> NeonBackend::GetHandleFactoryPreferences() const
{
- return std::vector<ITensorHandleFactory::FactoryId>() = {"Arm/Neon/TensorHandleFactory",
- "Arm/Cl/TensorHandleFactory"};
+ return std::vector<ITensorHandleFactory::FactoryId>() = { NeonTensorHandleFactory::GetIdStatic() };
}
void NeonBackend::RegisterTensorHandleFactories(class TensorHandleFactoryRegistry& registry)
diff --git a/src/backends/neon/test/NeonFallbackTests.cpp b/src/backends/neon/test/NeonFallbackTests.cpp
index 9a07ed236f..fd7fbbc4d5 100644
--- a/src/backends/neon/test/NeonFallbackTests.cpp
+++ b/src/backends/neon/test/NeonFallbackTests.cpp
@@ -12,8 +12,6 @@
BOOST_AUTO_TEST_SUITE(NeonFallback)
-std::vector<armnn::BackendId> defaultBackends = { armnn::Compute::CpuAcc };
-
BOOST_AUTO_TEST_CASE(FallbackImportToCpuAcc)
{
using namespace armnn;
@@ -684,4 +682,534 @@ BOOST_AUTO_TEST_CASE(FallbackDisableImportFromCpuAcc)
BOOST_TEST(outputData == expectedOutput);
}
+#if defined(ARMCOMPUTECL_ENABLED)
+BOOST_AUTO_TEST_CASE(NeonImportEnabledFallbackToCl)
+{
+ using namespace armnn;
+
+ IRuntime::CreationOptions options;
+ IRuntimePtr runtime(IRuntime::Create(options));
+
+ // Builds up the structure of the network.
+ INetworkPtr net(INetwork::Create());
+
+ IConnectableLayer* input0 = net->AddInputLayer(0, "input0");
+ IConnectableLayer* input1 = net->AddInputLayer(1, "input1");
+ IConnectableLayer* input2 = net->AddInputLayer(2, "input2");
+ IConnectableLayer* add = net->AddAdditionLayer("add");
+ IConnectableLayer* sub = net->AddSubtractionLayer("sub");
+ IConnectableLayer* output = net->AddOutputLayer(0, "output");
+
+ input0->GetOutputSlot(0).Connect(add->GetInputSlot(0));
+ input1->GetOutputSlot(0).Connect(add->GetInputSlot(1));
+ input2->GetOutputSlot(0).Connect(sub->GetInputSlot(0));
+ add->GetOutputSlot(0).Connect(sub->GetInputSlot(1));
+ sub->GetOutputSlot(0).Connect(output->GetInputSlot(0));
+
+ TensorInfo info = TensorInfo({ 1, 2, 3, 2 }, DataType::Float32);
+
+ input0->GetOutputSlot(0).SetTensorInfo(info);
+ input1->GetOutputSlot(0).SetTensorInfo(info);
+ input2->GetOutputSlot(0).SetTensorInfo(info);
+ add->GetOutputSlot(0).SetTensorInfo(info);
+ sub->GetOutputSlot(0).SetTensorInfo(info);
+
+ std::vector<BackendId> backends = { Compute::CpuAcc, Compute::GpuAcc };
+ // Use BackendSelectionHint to specify GpuAcc for Subtraction layer
+ sub->BackendSelectionHint(backends[1]);
+
+ // optimize the network
+ OptimizerOptions optOptions;
+ optOptions.m_ImportEnabled = true;
+ IOptimizedNetworkPtr optNet = Optimize(*net, backends, runtime->GetDeviceSpec(), optOptions);
+
+ OptimizedNetwork* optNetObjPtr = PolymorphicDowncast<OptimizedNetwork*>(optNet.get());
+ Graph& graph = optNetObjPtr->GetGraph();
+
+ armnn::Layer* const layer0 = GetFirstLayerWithName(graph, "input0");
+ armnn::Layer* const layer1 = GetFirstLayerWithName(graph, "input1");
+ armnn::Layer* const layer2 = GetFirstLayerWithName(graph, "input2");
+ armnn::Layer* const layer3 = GetFirstLayerWithName(graph, "add");
+ armnn::Layer* const layer4 = GetFirstLayerWithName(graph, "[ add (0) -> sub (1) ]");
+ armnn::Layer* const layer5 = GetFirstLayerWithName(graph, "sub");
+ armnn::Layer* const layer6 = GetFirstLayerWithName(graph, "output");
+
+ // Checks order is valid.
+ BOOST_TEST(CheckOrder(graph, layer0, layer1));
+ BOOST_TEST(CheckOrder(graph, layer1, layer2));
+ BOOST_TEST(CheckOrder(graph, layer2, layer3));
+ BOOST_TEST(CheckOrder(graph, layer3, layer4));
+ BOOST_TEST(CheckOrder(graph, layer4, layer5));
+ BOOST_TEST(CheckOrder(graph, layer5, layer6));
+
+ // Use memory import between backends
+ BOOST_TEST((layer4->GetType() == LayerType::MemCopy));
+
+ // Correctly use backend hint
+ BOOST_TEST((layer5->GetBackendId() == Compute::GpuAcc ));
+
+ // Load it into the runtime. It should pass.
+ NetworkId netId;
+ std::string ignoredErrorMessage;
+ INetworkProperties networkProperties(true, true);
+
+ runtime->LoadNetwork(netId, std::move(optNet), ignoredErrorMessage, networkProperties);
+
+ // Creates structures for input & output
+ std::vector<float> inputData0
+ {
+ 1.0f, 1.0f, 2.0f, 2.0f, 2.0f, 3.0f, 4.0f, 4.0f, 5.0f, 5.0f, 6.0f, 6.0f
+ };
+ std::vector<float> inputData1
+ {
+ 0.0f, 1.0f, 1.0f, 2.0f, 3.0f, 3.0f, 3.0f, 4.0f, 4.0f, 5.0f, 5.0f, 6.0f
+ };
+ std::vector<float> inputData2
+ {
+ 12.0f, 11.0f, 10.0f, 9.0f, 8.0f, 7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f
+ };
+
+ std::vector<float> outputData(12);
+
+ std::vector<float> expectedOutput
+ {
+ 11.0f, 9.0f, 7.0f, 5.0f, 3.0f, 1.0f, -1.0f, -3.0f, -5.0f, -7.0f, -9.0f, -11.0f
+ };
+
+ InputTensors inputTensors
+ {
+ { 0, armnn::ConstTensor(runtime->GetInputTensorInfo(netId, 0), inputData0.data()) },
+ { 1, armnn::ConstTensor(runtime->GetInputTensorInfo(netId, 1), inputData1.data()) },
+ { 2, armnn::ConstTensor(runtime->GetInputTensorInfo(netId, 2), inputData2.data()) }
+ };
+ OutputTensors outputTensors
+ {
+ { 0,armnn::Tensor(runtime->GetOutputTensorInfo(netId, 0), outputData.data()) }
+ };
+
+ runtime->GetProfiler(netId)->EnableProfiling(true);
+
+ // Do the inference
+ runtime->EnqueueWorkload(netId, inputTensors, outputTensors);
+
+ // Retrieve the Profiler.Print() output to get the workload execution
+ ProfilerManager& profilerManager = armnn::ProfilerManager::GetInstance();
+ std::stringstream ss;
+ profilerManager.GetProfiler()->Print(ss);
+ std::string dump = ss.str();
+
+ // Executed Subtraction using GpuAcc
+ std::size_t found = dump.find("ClSubtractionWorkload_Execute");
+ BOOST_TEST(found != std::string::npos);
+
+ // Contains CopyMemGeneric
+ found = dump.find("CopyMemGeneric");
+ BOOST_TEST(found != std::string::npos);
+
+ // Check output is as expected
+ BOOST_TEST(outputData == expectedOutput);
+}
+
+BOOST_AUTO_TEST_CASE(NeonImportDisabledFallbackToCl)
+{
+ using namespace armnn;
+
+ IRuntime::CreationOptions options;
+ IRuntimePtr runtime(IRuntime::Create(options));
+
+ // Builds up the structure of the network.
+ INetworkPtr net(INetwork::Create());
+
+ IConnectableLayer* input0 = net->AddInputLayer(0, "input0");
+ IConnectableLayer* input1 = net->AddInputLayer(1, "input1");
+ IConnectableLayer* input2 = net->AddInputLayer(2, "input2");
+ IConnectableLayer* add = net->AddAdditionLayer("add");
+ IConnectableLayer* sub = net->AddSubtractionLayer("sub");
+ IConnectableLayer* output = net->AddOutputLayer(0, "output");
+
+ input0->GetOutputSlot(0).Connect(add->GetInputSlot(0));
+ input1->GetOutputSlot(0).Connect(add->GetInputSlot(1));
+ input2->GetOutputSlot(0).Connect(sub->GetInputSlot(0));
+ add->GetOutputSlot(0).Connect(sub->GetInputSlot(1));
+ sub->GetOutputSlot(0).Connect(output->GetInputSlot(0));
+
+ TensorInfo info = TensorInfo({ 1, 2, 3, 2 }, DataType::Float32);
+
+ input0->GetOutputSlot(0).SetTensorInfo(info);
+ input1->GetOutputSlot(0).SetTensorInfo(info);
+ input2->GetOutputSlot(0).SetTensorInfo(info);
+ add->GetOutputSlot(0).SetTensorInfo(info);
+ sub->GetOutputSlot(0).SetTensorInfo(info);
+
+ std::vector<BackendId> backends = { Compute::CpuAcc, Compute::GpuAcc };
+ // Use BackendSelectionHint to specify GpuAcc for Subtraction layer
+ sub->BackendSelectionHint(backends[1]);
+
+ // optimize the network
+ OptimizerOptions optOptions;
+ IOptimizedNetworkPtr optNet = Optimize(*net, backends, runtime->GetDeviceSpec(), optOptions);
+
+ OptimizedNetwork* optNetObjPtr = PolymorphicDowncast<OptimizedNetwork*>(optNet.get());
+ Graph& graph = optNetObjPtr->GetGraph();
+
+ armnn::Layer* const layer0 = GetFirstLayerWithName(graph, "input0");
+ armnn::Layer* const layer1 = GetFirstLayerWithName(graph, "input1");
+ armnn::Layer* const layer2 = GetFirstLayerWithName(graph, "input2");
+ armnn::Layer* const layer3 = GetFirstLayerWithName(graph, "add");
+ armnn::Layer* const layer4 = GetFirstLayerWithName(graph, "[ add (0) -> sub (1) ]");
+ armnn::Layer* const layer5 = GetFirstLayerWithName(graph, "sub");
+ armnn::Layer* const layer6 = GetFirstLayerWithName(graph, "output");
+
+ // Checks order is valid.
+ BOOST_TEST(CheckOrder(graph, layer0, layer1));
+ BOOST_TEST(CheckOrder(graph, layer1, layer2));
+ BOOST_TEST(CheckOrder(graph, layer2, layer3));
+ BOOST_TEST(CheckOrder(graph, layer3, layer4));
+ BOOST_TEST(CheckOrder(graph, layer4, layer5));
+ BOOST_TEST(CheckOrder(graph, layer5, layer6));
+
+ // Use memory import between backends
+ BOOST_TEST((layer4->GetType() == LayerType::MemCopy));
+
+ // Correctly use backend hint
+ BOOST_TEST((layer5->GetBackendId() == Compute::GpuAcc ));
+
+ // Load it into the runtime. It should pass.
+ NetworkId netId;
+ runtime->LoadNetwork(netId, std::move(optNet));
+
+ // Creates structures for input & output
+ std::vector<float> inputData0
+ {
+ 1.0f, 1.0f, 2.0f, 2.0f, 2.0f, 3.0f, 4.0f, 4.0f, 5.0f, 5.0f, 6.0f, 6.0f
+ };
+ std::vector<float> inputData1
+ {
+ 0.0f, 1.0f, 1.0f, 2.0f, 3.0f, 3.0f, 3.0f, 4.0f, 4.0f, 5.0f, 5.0f, 6.0f
+ };
+ std::vector<float> inputData2
+ {
+ 12.0f, 11.0f, 10.0f, 9.0f, 8.0f, 7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f
+ };
+
+ std::vector<float> outputData(12);
+
+ std::vector<float> expectedOutput
+ {
+ 11.0f, 9.0f, 7.0f, 5.0f, 3.0f, 1.0f, -1.0f, -3.0f, -5.0f, -7.0f, -9.0f, -11.0f
+ };
+
+ InputTensors inputTensors
+ {
+ { 0, armnn::ConstTensor(runtime->GetInputTensorInfo(netId, 0), inputData0.data()) },
+ { 1, armnn::ConstTensor(runtime->GetInputTensorInfo(netId, 1), inputData1.data()) },
+ { 2, armnn::ConstTensor(runtime->GetInputTensorInfo(netId, 2), inputData2.data()) }
+ };
+ OutputTensors outputTensors
+ {
+ { 0,armnn::Tensor(runtime->GetOutputTensorInfo(netId, 0), outputData.data()) }
+ };
+
+ runtime->GetProfiler(netId)->EnableProfiling(true);
+
+ // Do the inference
+ runtime->EnqueueWorkload(netId, inputTensors, outputTensors);
+
+ // Retrieve the Profiler.Print() output to get the workload execution
+ ProfilerManager& profilerManager = armnn::ProfilerManager::GetInstance();
+ std::stringstream ss;
+ profilerManager.GetProfiler()->Print(ss);
+ std::string dump = ss.str();
+
+ // Executed Subtraction using GpuAcc
+ std::size_t found = dump.find("ClSubtractionWorkload_Execute");
+ BOOST_TEST(found != std::string::npos);
+
+ // Contains CopyMemGeneric
+ found = dump.find("CopyMemGeneric");
+ BOOST_TEST(found != std::string::npos);
+
+ // Check output is as expected
+ BOOST_TEST(outputData == expectedOutput);
+}
+
+BOOST_AUTO_TEST_CASE(NeonImportEnabledFallbackSubgraphToCl)
+{
+ using namespace armnn;
+
+ IRuntime::CreationOptions options;
+ IRuntimePtr runtime(IRuntime::Create(options));
+
+ // Builds up the structure of the network.
+ INetworkPtr net(INetwork::Create());
+
+ Pooling2dDescriptor desc;
+
+ IConnectableLayer* input0 = net->AddInputLayer(0, "input0");
+ IConnectableLayer* input1 = net->AddInputLayer(1, "input1");
+ IConnectableLayer* input2 = net->AddInputLayer(2, "input2");
+ IConnectableLayer* add = net->AddAdditionLayer("add");
+ IConnectableLayer* sub = net->AddSubtractionLayer("sub");
+ IConnectableLayer* pooling = net->AddPooling2dLayer(desc, "pooling");
+ IConnectableLayer* output = net->AddOutputLayer(0, "output");
+
+ input0->GetOutputSlot(0).Connect(add->GetInputSlot(0));
+ input1->GetOutputSlot(0).Connect(add->GetInputSlot(1));
+ input2->GetOutputSlot(0).Connect(sub->GetInputSlot(0));
+ add->GetOutputSlot(0).Connect(sub->GetInputSlot(1));
+ sub->GetOutputSlot(0).Connect(pooling->GetInputSlot(0));
+ pooling->GetOutputSlot(0).Connect(output->GetInputSlot(0));
+
+ TensorInfo info = TensorInfo({ 1, 2, 3, 2 }, DataType::Float32);
+ TensorInfo poolingInfo = TensorInfo({ 1, 2, 1, 1 }, DataType::Float32);
+
+ input0->GetOutputSlot(0).SetTensorInfo(info);
+ input1->GetOutputSlot(0).SetTensorInfo(info);
+ input2->GetOutputSlot(0).SetTensorInfo(info);
+ add->GetOutputSlot(0).SetTensorInfo(info);
+ sub->GetOutputSlot(0).SetTensorInfo(info);
+ pooling->GetOutputSlot(0).SetTensorInfo(poolingInfo);
+
+ std::vector<BackendId> backends = { Compute::CpuAcc, Compute::GpuAcc };
+ // Use BackendSelectionHint to specify GpuAcc for Subtraction layer
+ sub->BackendSelectionHint(backends[1]);
+
+ // optimize the network
+ OptimizerOptions optOptions;
+ optOptions.m_ImportEnabled = true;
+ IOptimizedNetworkPtr optNet = Optimize(*net, backends, runtime->GetDeviceSpec(), optOptions);
+
+ OptimizedNetwork* optNetObjPtr = PolymorphicDowncast<OptimizedNetwork*>(optNet.get());
+ Graph& graph = optNetObjPtr->GetGraph();
+
+ armnn::Layer* const layer0 = GetFirstLayerWithName(graph, "input0");
+ armnn::Layer* const layer1 = GetFirstLayerWithName(graph, "input1");
+ armnn::Layer* const layer2 = GetFirstLayerWithName(graph, "input2");
+ armnn::Layer* const layer3 = GetFirstLayerWithName(graph, "add");
+ armnn::Layer* const layer4 = GetFirstLayerWithName(graph, "[ add (0) -> sub (1) ]");
+ armnn::Layer* const layer5 = GetFirstLayerWithName(graph, "sub");
+ armnn::Layer* const layer6 = GetFirstLayerWithName(graph, "[ sub (0) -> pooling (0) ]");
+ armnn::Layer* const layer7 = GetFirstLayerWithName(graph, "pooling");
+ armnn::Layer* const layer8 = GetFirstLayerWithName(graph, "output");
+
+ // Checks order is valid.
+ BOOST_TEST(CheckOrder(graph, layer0, layer1));
+ BOOST_TEST(CheckOrder(graph, layer1, layer2));
+ BOOST_TEST(CheckOrder(graph, layer2, layer3));
+ BOOST_TEST(CheckOrder(graph, layer3, layer4));
+ BOOST_TEST(CheckOrder(graph, layer4, layer5));
+ BOOST_TEST(CheckOrder(graph, layer5, layer6));
+ BOOST_TEST(CheckOrder(graph, layer6, layer7));
+ BOOST_TEST(CheckOrder(graph, layer7, layer8));
+
+ // Use memory import between backends
+ BOOST_TEST((layer4->GetType() == LayerType::MemCopy));
+ BOOST_TEST((layer6->GetType() == LayerType::MemCopy));
+
+ // Correctly use backend hint
+ BOOST_TEST((layer5->GetBackendId() == Compute::GpuAcc ));
+
+ // Load it into the runtime. It should pass.
+ NetworkId netId;
+ std::string ignoredErrorMessage;
+ INetworkProperties networkProperties(true, true);
+
+ runtime->LoadNetwork(netId, std::move(optNet), ignoredErrorMessage, networkProperties);
+
+ // Creates structures for input & output
+ std::vector<float> inputData0
+ {
+ 1.0f, 1.0f, 2.0f, 2.0f, 2.0f, 3.0f, 4.0f, 4.0f, 5.0f, 5.0f, 6.0f, 6.0f
+ };
+ std::vector<float> inputData1
+ {
+ 0.0f, 1.0f, 1.0f, 2.0f, 3.0f, 3.0f, 3.0f, 4.0f, 4.0f, 5.0f, 5.0f, 6.0f
+ };
+ std::vector<float> inputData2
+ {
+ 12.0f, 11.0f, 10.0f, 9.0f, 8.0f, 7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f
+ };
+
+ std::vector<float> outputData(2);
+
+ std::vector<float> expectedOutput{ 11.0f, -1.0f };
+
+ InputTensors inputTensors
+ {
+ { 0, armnn::ConstTensor(runtime->GetInputTensorInfo(netId, 0), inputData0.data()) },
+ { 1, armnn::ConstTensor(runtime->GetInputTensorInfo(netId, 1), inputData1.data()) },
+ { 2, armnn::ConstTensor(runtime->GetInputTensorInfo(netId, 2), inputData2.data()) }
+ };
+ OutputTensors outputTensors
+ {
+ { 0,armnn::Tensor(runtime->GetOutputTensorInfo(netId, 0), outputData.data()) }
+ };
+
+ runtime->GetProfiler(netId)->EnableProfiling(true);
+
+ // Do the inference
+ runtime->EnqueueWorkload(netId, inputTensors, outputTensors);
+
+ // Retrieve the Profiler.Print() output to get the workload execution
+ ProfilerManager& profilerManager = armnn::ProfilerManager::GetInstance();
+ std::stringstream ss;
+ profilerManager.GetProfiler()->Print(ss);
+ std::string dump = ss.str();
+
+ // Executed Subtraction using GpuAcc
+ std::size_t found = dump.find("ClSubtractionWorkload_Execute");
+ BOOST_TEST(found != std::string::npos);
+
+ // Correctly switch back to CpuAcc
+ found = dump.find("NeonPooling2dWorkload_Execute");
+ BOOST_TEST(found != std::string::npos);
+
+ // Contains CopyMemGeneric
+ found = dump.find("CopyMemGeneric");
+ BOOST_TEST(found != std::string::npos);
+
+ // Contains SyncMemGeneric for output
+ found = dump.find("SyncMemGeneric");
+ BOOST_TEST(found != std::string::npos);
+
+ // Check output is as expected
+ BOOST_TEST(outputData == expectedOutput);
+}
+
+BOOST_AUTO_TEST_CASE(NeonImportDisableFallbackSubgraphToCl)
+{
+ using namespace armnn;
+
+ IRuntime::CreationOptions options;
+ IRuntimePtr runtime(IRuntime::Create(options));
+
+ // Builds up the structure of the network.
+ INetworkPtr net(INetwork::Create());
+
+ Pooling2dDescriptor desc;
+
+ IConnectableLayer* input0 = net->AddInputLayer(0, "input0");
+ IConnectableLayer* input1 = net->AddInputLayer(1, "input1");
+ IConnectableLayer* input2 = net->AddInputLayer(2, "input2");
+ IConnectableLayer* add = net->AddAdditionLayer("add");
+ IConnectableLayer* sub = net->AddSubtractionLayer("sub");
+ IConnectableLayer* pooling = net->AddPooling2dLayer(desc, "pooling");
+ IConnectableLayer* output = net->AddOutputLayer(0, "output");
+
+ input0->GetOutputSlot(0).Connect(add->GetInputSlot(0));
+ input1->GetOutputSlot(0).Connect(add->GetInputSlot(1));
+ input2->GetOutputSlot(0).Connect(sub->GetInputSlot(0));
+ add->GetOutputSlot(0).Connect(sub->GetInputSlot(1));
+ sub->GetOutputSlot(0).Connect(pooling->GetInputSlot(0));
+ pooling->GetOutputSlot(0).Connect(output->GetInputSlot(0));
+
+ TensorInfo info = TensorInfo({ 1, 2, 3, 2 }, DataType::Float32);
+ TensorInfo poolingInfo = TensorInfo({ 1, 2, 1, 1 }, DataType::Float32);
+
+ input0->GetOutputSlot(0).SetTensorInfo(info);
+ input1->GetOutputSlot(0).SetTensorInfo(info);
+ input2->GetOutputSlot(0).SetTensorInfo(info);
+ add->GetOutputSlot(0).SetTensorInfo(info);
+ sub->GetOutputSlot(0).SetTensorInfo(info);
+ pooling->GetOutputSlot(0).SetTensorInfo(poolingInfo);
+
+ std::vector<BackendId> backends = { Compute::CpuAcc, Compute::GpuAcc };
+ // Use BackendSelectionHint to specify GpuAcc for Subtraction layer
+ sub->BackendSelectionHint(backends[1]);
+
+ // optimize the network
+ OptimizerOptions optOptions;
+ IOptimizedNetworkPtr optNet = Optimize(*net, backends, runtime->GetDeviceSpec(), optOptions);
+
+ OptimizedNetwork* optNetObjPtr = PolymorphicDowncast<OptimizedNetwork*>(optNet.get());
+ Graph& graph = optNetObjPtr->GetGraph();
+
+ armnn::Layer* const layer0 = GetFirstLayerWithName(graph, "input0");
+ armnn::Layer* const layer1 = GetFirstLayerWithName(graph, "input1");
+ armnn::Layer* const layer2 = GetFirstLayerWithName(graph, "input2");
+ armnn::Layer* const layer3 = GetFirstLayerWithName(graph, "add");
+ armnn::Layer* const layer4 = GetFirstLayerWithName(graph, "[ add (0) -> sub (1) ]");
+ armnn::Layer* const layer5 = GetFirstLayerWithName(graph, "sub");
+ armnn::Layer* const layer6 = GetFirstLayerWithName(graph, "[ sub (0) -> pooling (0) ]");
+ armnn::Layer* const layer7 = GetFirstLayerWithName(graph, "pooling");
+ armnn::Layer* const layer8 = GetFirstLayerWithName(graph, "output");
+
+ // Checks order is valid.
+ BOOST_TEST(CheckOrder(graph, layer0, layer1));
+ BOOST_TEST(CheckOrder(graph, layer1, layer2));
+ BOOST_TEST(CheckOrder(graph, layer2, layer3));
+ BOOST_TEST(CheckOrder(graph, layer3, layer4));
+ BOOST_TEST(CheckOrder(graph, layer4, layer5));
+ BOOST_TEST(CheckOrder(graph, layer5, layer6));
+ BOOST_TEST(CheckOrder(graph, layer6, layer7));
+ BOOST_TEST(CheckOrder(graph, layer7, layer8));
+
+ // Use memory import between backends
+ BOOST_TEST((layer4->GetType() == LayerType::MemCopy));
+ BOOST_TEST((layer6->GetType() == LayerType::MemCopy));
+
+ // Correctly use backend hint
+ BOOST_TEST((layer5->GetBackendId() == Compute::GpuAcc ));
+
+ // Load it into the runtime. It should pass.
+ NetworkId netId;
+ runtime->LoadNetwork(netId, std::move(optNet));
+
+ // Creates structures for input & output
+ std::vector<float> inputData0
+ {
+ 1.0f, 1.0f, 2.0f, 2.0f, 2.0f, 3.0f, 4.0f, 4.0f, 5.0f, 5.0f, 6.0f, 6.0f
+ };
+ std::vector<float> inputData1
+ {
+ 0.0f, 1.0f, 1.0f, 2.0f, 3.0f, 3.0f, 3.0f, 4.0f, 4.0f, 5.0f, 5.0f, 6.0f
+ };
+ std::vector<float> inputData2
+ {
+ 12.0f, 11.0f, 10.0f, 9.0f, 8.0f, 7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f
+ };
+
+ std::vector<float> outputData(2);
+
+ std::vector<float> expectedOutput{ 11.0f, -1.0f };
+
+ InputTensors inputTensors
+ {
+ { 0, armnn::ConstTensor(runtime->GetInputTensorInfo(netId, 0), inputData0.data()) },
+ { 1, armnn::ConstTensor(runtime->GetInputTensorInfo(netId, 1), inputData1.data()) },
+ { 2, armnn::ConstTensor(runtime->GetInputTensorInfo(netId, 2), inputData2.data()) }
+ };
+ OutputTensors outputTensors
+ {
+ { 0,armnn::Tensor(runtime->GetOutputTensorInfo(netId, 0), outputData.data()) }
+ };
+
+ runtime->GetProfiler(netId)->EnableProfiling(true);
+
+ // Do the inference
+ runtime->EnqueueWorkload(netId, inputTensors, outputTensors);
+
+ // Retrieve the Profiler.Print() output to get the workload execution
+ ProfilerManager& profilerManager = armnn::ProfilerManager::GetInstance();
+ std::stringstream ss;
+ profilerManager.GetProfiler()->Print(ss);
+ std::string dump = ss.str();
+
+ // Executed Subtraction using GpuAcc
+ std::size_t found = dump.find("ClSubtractionWorkload_Execute");
+ BOOST_TEST(found != std::string::npos);
+
+ // Correctly switch back to CpuAcc
+ found = dump.find("NeonPooling2dWorkload_Execute");
+ BOOST_TEST(found != std::string::npos);
+
+ // Contains CopyMemGeneric
+ found = dump.find("CopyMemGeneric");
+ BOOST_TEST(found != std::string::npos);
+
+ // Check output is as expected
+ BOOST_TEST(outputData == expectedOutput);
+}
+#endif
+
BOOST_AUTO_TEST_SUITE_END()