From 265e53e61b472f7de9897b0dbcff1661e3f576cc Mon Sep 17 00:00:00 2001
From: Narumol Prangnawarat <narumol.prangnawarat@arm.com>
Date: Fri, 30 Oct 2020 16:06:55 +0000
Subject: IVGCVSW-5322 Fix segfault between Neon and Cl layers

* Fallback to memory copy if memory import is not supported
* Remove direct compatibility between Neon and Cl Tensors
* Unit tests fallback from Neon to Cl and Cl to Neon

Signed-off-by: Narumol Prangnawarat <narumol.prangnawarat@arm.com>
Change-Id: Iec00a77423fb23b37a6b1aefee1b2ec4d649efca
---
 src/armnn/LoadedNetwork.cpp                        |  20 +-
 .../backendsCommon/test/CompatibilityTests.cpp     |   2 +-
 src/backends/cl/ClTensorHandleFactory.cpp          |  10 +-
 src/backends/cl/test/CMakeLists.txt                |   6 +
 src/backends/cl/test/ClFallbackTests.cpp           | 538 +++++++++++++++++++++
 src/backends/neon/NeonBackend.cpp                  |   3 +-
 src/backends/neon/test/NeonFallbackTests.cpp       | 532 +++++++++++++++++++-
 7 files changed, 1088 insertions(+), 23 deletions(-)
 create mode 100644 src/backends/cl/test/ClFallbackTests.cpp

diff --git a/src/armnn/LoadedNetwork.cpp b/src/armnn/LoadedNetwork.cpp
index 00ac90b121..b5a1b392b4 100644
--- a/src/armnn/LoadedNetwork.cpp
+++ b/src/armnn/LoadedNetwork.cpp
@@ -570,10 +570,12 @@ void LoadedNetwork::EnqueueInput(const BindableLayer& layer, ITensorHandle* tens
     info.m_OutputTensorInfos.push_back(outputTensorInfo);
 
     MemorySourceFlags importFlags = outputTensorHandle->GetImportFlags();
+    bool needMemCopy = true;
     if (m_IsImportEnabled)  // Try import the input tensor
     {
         if(CheckFlag(importFlags, MemorySource::Malloc) )
         {
+            needMemCopy = false;
             // This assumes a CPU Tensor handle
             void* mem = tensorHandle->Map(false);
             if (outputTensorHandle->Import(mem, MemorySource::Malloc))
@@ -584,12 +586,8 @@ void LoadedNetwork::EnqueueInput(const BindableLayer& layer, ITensorHandle* tens
             tensorHandle->Unmap();
             throw MemoryImportException("EnqueueInput: Memory Import failed");
         }
-        else
-        {
-            throw MemoryImportException("EnqueueInput: Memory Import failed, backend does not support Import");
-        }
     }
-    else
+    if (needMemCopy)
     {
         // Create a mem copy workload for input since we did not import
         std::unique_ptr<IWorkload> inputWorkload = std::make_unique<CopyMemGenericWorkload>(inputQueueDescriptor, info);
@@ -643,6 +641,7 @@ void LoadedNetwork::EnqueueOutput(const BindableLayer& layer, ITensorHandle* ten
     // c) There is only one connection to the OutputSlot and it is to an OutputLayer.
     // d) The output pointer is allocated via malloc. (Other types will be supported in a later release)
     // e) m_IsExportEnabled must be set to true
+    bool needMemCopy = true;
     if (m_IsExportEnabled && (layer.GetInputSlots()[0].GetConnectedOutputSlot()->GetNumConnections() == 1))
     {
         if(layer.GetInputSlots()[0].GetConnectedOutputSlot()->GetOwningLayer().GetType() != LayerType::Input)
@@ -650,6 +649,7 @@ void LoadedNetwork::EnqueueOutput(const BindableLayer& layer, ITensorHandle* ten
             MemorySourceFlags importFlags = inputTensorHandle->GetImportFlags();
             if (CheckFlag(importFlags, MemorySource::Malloc))
             {
+                needMemCopy = false;
                 void *mem = tensorHandle->Map(false);
                 bool importOk = inputTensorHandle->Import(mem, MemorySource::Malloc);
                 tensorHandle->Unmap();
@@ -669,17 +669,9 @@ void LoadedNetwork::EnqueueOutput(const BindableLayer& layer, ITensorHandle* ten
                     throw MemoryExportException("EnqueueOutput: Memory Export failed");
                 }
             }
-            else
-            {
-                throw MemoryExportException("EnqueueOutput: Memory Export failed, backend does not support Export");
-            }
-        }
-        else
-        {
-            throw MemoryExportException("EnqueueOutput: Memory Export failed, attempting to export Input Layer");
-        }
     }
-    else
+    if (needMemCopy)
     {
         const Layer& connectedLayer = layer.GetInputSlots()[0].GetConnectedOutputSlot()->GetOwningLayer();
         // Do not add MemCopy Layer if OutputLayer is already connected the MemCopy Layer
diff --git a/src/backends/backendsCommon/test/CompatibilityTests.cpp b/src/backends/backendsCommon/test/CompatibilityTests.cpp
index 90aa76e3f3..b69e11253d 100644
--- a/src/backends/backendsCommon/test/CompatibilityTests.cpp
+++ b/src/backends/backendsCommon/test/CompatibilityTests.cpp
@@ -15,7 +15,7 @@
 
 using namespace armnn;
 
-BOOST_AUTO_TEST_SUITE(BackendsCompatibility)
+BOOST_AUTO_TEST_SUITE(BackendsCompatibility, * boost::unit_test::disabled())
 
 BOOST_AUTO_TEST_CASE(Neon_Cl_DirectCompatibility_Test)
 {
diff --git a/src/backends/cl/ClTensorHandleFactory.cpp b/src/backends/cl/ClTensorHandleFactory.cpp
index 33995f7b34..237f27a4ed 100644
--- a/src/backends/cl/ClTensorHandleFactory.cpp
+++ b/src/backends/cl/ClTensorHandleFactory.cpp
@@ -73,10 +73,11 @@ std::unique_ptr<ITensorHandle> ClTensorHandleFactory::CreateTensorHandle(const T
                                                                          const bool IsMemoryManaged) const
 {
     std::unique_ptr<ClTensorHandle> tensorHandle = std::make_unique<ClTensorHandle>(tensorInfo);
-    if (IsMemoryManaged)
+    if (!IsMemoryManaged)
     {
-        tensorHandle->SetMemoryGroup(m_MemoryManager->GetInterLayerMemoryGroup());
+        ARMNN_LOG(warning) << "ClTensorHandleFactory only has support for memory managed.";
     }
+    tensorHandle->SetMemoryGroup(m_MemoryManager->GetInterLayerMemoryGroup());
     return tensorHandle;
 }
 
@@ -85,10 +86,11 @@ std::unique_ptr<ITensorHandle> ClTensorHandleFactory::CreateTensorHandle(const T
                                                                          const bool IsMemoryManaged) const
 {
     std::unique_ptr<ClTensorHandle> tensorHandle = std::make_unique<ClTensorHandle>(tensorInfo, dataLayout);
-    if (IsMemoryManaged)
+    if (!IsMemoryManaged)
     {
-        tensorHandle->SetMemoryGroup(m_MemoryManager->GetInterLayerMemoryGroup());
+        ARMNN_LOG(warning) << "ClTensorHandleFactory only has support for memory managed.";
     }
+    tensorHandle->SetMemoryGroup(m_MemoryManager->GetInterLayerMemoryGroup());
     return tensorHandle;
 }
 
diff --git a/src/backends/cl/test/CMakeLists.txt b/src/backends/cl/test/CMakeLists.txt
index b0b330931d..2cf00d106b 100644
--- a/src/backends/cl/test/CMakeLists.txt
+++ b/src/backends/cl/test/CMakeLists.txt
@@ -23,6 +23,12 @@ if (ARMNNREF)
     )
 endif()
 
+if (ARMCOMPUTENEON)
+    list(APPEND armnnClBackendUnitTests_sources
+        ClFallbackTests.cpp
+    )
+endif()
+
 add_library(armnnClBackendUnitTests OBJECT ${armnnClBackendUnitTests_sources})
 target_include_directories(armnnClBackendUnitTests PRIVATE ${PROJECT_SOURCE_DIR}/src/armnn)
 target_include_directories(armnnClBackendUnitTests PRIVATE ${PROJECT_SOURCE_DIR}/src/armnnUtils)
diff --git a/src/backends/cl/test/ClFallbackTests.cpp b/src/backends/cl/test/ClFallbackTests.cpp
new file mode 100644
index 0000000000..5885cbe8ef
--- /dev/null
+++ b/src/backends/cl/test/ClFallbackTests.cpp
@@ -0,0 +1,538 @@
+//
+// Copyright © 2020 Arm Ltd and Contributors. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#include <backendsCommon/test/CommonTestUtils.hpp>
+
+#include <test/GraphUtils.hpp>
+
+#include <boost/test/unit_test.hpp>
+
+BOOST_AUTO_TEST_SUITE(ClFallback)
+
+BOOST_AUTO_TEST_CASE(ClImportEnabledFallbackToNeon)
+{
+    using namespace armnn;
+
+    IRuntime::CreationOptions options;
+    IRuntimePtr runtime(IRuntime::Create(options));
+
+    // Builds up the structure of the network.
+    INetworkPtr net(INetwork::Create());
+
+    IConnectableLayer* input0 = net->AddInputLayer(0, "input0");
+    IConnectableLayer* input1 = net->AddInputLayer(1, "input1");
+    IConnectableLayer* input2 = net->AddInputLayer(2, "input2");
+    IConnectableLayer* add = net->AddAdditionLayer("add");
+    IConnectableLayer* sub = net->AddSubtractionLayer("sub");
+    IConnectableLayer* output = net->AddOutputLayer(0, "output");
+
+    input0->GetOutputSlot(0).Connect(add->GetInputSlot(0));
+    input1->GetOutputSlot(0).Connect(add->GetInputSlot(1));
+    input2->GetOutputSlot(0).Connect(sub->GetInputSlot(0));
+    add->GetOutputSlot(0).Connect(sub->GetInputSlot(1));
+    sub->GetOutputSlot(0).Connect(output->GetInputSlot(0));
+
+    TensorInfo info = TensorInfo({ 1, 2, 3, 2 }, DataType::Float32);
+
+    input0->GetOutputSlot(0).SetTensorInfo(info);
+    input1->GetOutputSlot(0).SetTensorInfo(info);
+    input2->GetOutputSlot(0).SetTensorInfo(info);
+    add->GetOutputSlot(0).SetTensorInfo(info);
+    sub->GetOutputSlot(0).SetTensorInfo(info);
+
+    std::vector<BackendId> backends = { Compute::GpuAcc, Compute::CpuAcc };
+    // Use BackendSelectionHint to specify CpuAcc for Subtraction layer
+    sub->BackendSelectionHint(backends[1]);
+
+    // optimize the network
+    OptimizerOptions optOptions;
+    optOptions.m_ImportEnabled = true;
+    IOptimizedNetworkPtr optNet = Optimize(*net, backends, runtime->GetDeviceSpec(), optOptions);
+
+    OptimizedNetwork* optNetObjPtr = PolymorphicDowncast<OptimizedNetwork*>(optNet.get());
+    Graph& graph = optNetObjPtr->GetGraph();
+
+    armnn::Layer* const layer0 = GetFirstLayerWithName(graph, "input0");
+    armnn::Layer* const layer1 = GetFirstLayerWithName(graph, "input1");
+    armnn::Layer* const layer2 = GetFirstLayerWithName(graph, "input2");
+    armnn::Layer* const layer3 = GetFirstLayerWithName(graph, "add");
+    armnn::Layer* const layer4 = GetFirstLayerWithName(graph, "[ add (0) -> sub (1) ]");
+    armnn::Layer* const layer5 = GetFirstLayerWithName(graph, "sub");
+    armnn::Layer* const layer6 = GetFirstLayerWithName(graph, "output");
+
+    // Checks order is valid.
+    BOOST_TEST(CheckOrder(graph, layer0, layer1));
+    BOOST_TEST(CheckOrder(graph, layer1, layer2));
+    BOOST_TEST(CheckOrder(graph, layer2, layer3));
+    BOOST_TEST(CheckOrder(graph, layer3, layer4));
+    BOOST_TEST(CheckOrder(graph, layer4, layer5));
+    BOOST_TEST(CheckOrder(graph, layer5, layer6));
+
+    // Use memory import between backends
+    BOOST_TEST((layer4->GetType() == LayerType::MemCopy));
+
+    // Correctly use backend hint
+    BOOST_TEST((layer5->GetBackendId() == Compute::CpuAcc));
+
+    // Load it into the runtime. It should pass.
+    NetworkId netId;
+    std::string ignoredErrorMessage;
+    INetworkProperties networkProperties(true, true);
+
+    runtime->LoadNetwork(netId, std::move(optNet), ignoredErrorMessage, networkProperties);
+
+    // Creates structures for input & output
+    std::vector<float> inputData0
+    {
+        1.0f, 1.0f, 2.0f, 2.0f, 2.0f, 3.0f, 4.0f, 4.0f, 5.0f, 5.0f, 6.0f, 6.0f
+    };
+    std::vector<float> inputData1
+    {
+        0.0f, 1.0f, 1.0f, 2.0f, 3.0f, 3.0f, 3.0f, 4.0f, 4.0f, 5.0f, 5.0f, 6.0f
+    };
+    std::vector<float> inputData2
+    {
+        12.0f, 11.0f, 10.0f, 9.0f, 8.0f, 7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f
+    };
+
+    std::vector<float> outputData(12);
+
+    std::vector<float> expectedOutput
+    {
+        11.0f, 9.0f, 7.0f, 5.0f, 3.0f, 1.0f, -1.0f, -3.0f, -5.0f, -7.0f, -9.0f, -11.0f
+    };
+
+    InputTensors inputTensors
+    {
+        { 0, armnn::ConstTensor(runtime->GetInputTensorInfo(netId, 0), inputData0.data()) },
+        { 1, armnn::ConstTensor(runtime->GetInputTensorInfo(netId, 1), inputData1.data()) },
+        { 2, armnn::ConstTensor(runtime->GetInputTensorInfo(netId, 2), inputData2.data()) }
+    };
+    OutputTensors outputTensors
+    {
+        { 0, armnn::Tensor(runtime->GetOutputTensorInfo(netId, 0), outputData.data()) }
+    };
+
+    runtime->GetProfiler(netId)->EnableProfiling(true);
+
+    // Do the inference
+    runtime->EnqueueWorkload(netId, inputTensors, outputTensors);
+
+    // Retrieve the Profiler.Print() output to get the workload execution
+    ProfilerManager& profilerManager = armnn::ProfilerManager::GetInstance();
+    std::stringstream ss;
+    profilerManager.GetProfiler()->Print(ss);
+    std::string dump = ss.str();
+
+    // Executed Subtraction using CpuAcc
+    std::size_t found = dump.find("NeonSubtractionWorkload_Execute");
+    BOOST_TEST(found != std::string::npos);
+
+    // Contain CopyMemGeneric
+    found = dump.find("CopyMemGeneric");
+    BOOST_TEST(found != std::string::npos);
+
+    // Check output is as expected
+    BOOST_TEST(outputData == expectedOutput);
+}
+
+BOOST_AUTO_TEST_CASE(ClImportDisabledFallbackToNeon)
+{
+    using namespace armnn;
+
+    IRuntime::CreationOptions options;
+    IRuntimePtr runtime(IRuntime::Create(options));
+
+    // Builds up the structure of the network.
+    INetworkPtr net(INetwork::Create());
+
+    IConnectableLayer* input0 = net->AddInputLayer(0, "input0");
+    IConnectableLayer* input1 = net->AddInputLayer(1, "input1");
+    IConnectableLayer* input2 = net->AddInputLayer(2, "input2");
+    IConnectableLayer* add = net->AddAdditionLayer("add");
+    IConnectableLayer* sub = net->AddSubtractionLayer("sub");
+    IConnectableLayer* output = net->AddOutputLayer(0, "output");
+
+    input0->GetOutputSlot(0).Connect(add->GetInputSlot(0));
+    input1->GetOutputSlot(0).Connect(add->GetInputSlot(1));
+    input2->GetOutputSlot(0).Connect(sub->GetInputSlot(0));
+    add->GetOutputSlot(0).Connect(sub->GetInputSlot(1));
+    sub->GetOutputSlot(0).Connect(output->GetInputSlot(0));
+
+    TensorInfo info = TensorInfo({ 1, 2, 3, 2 }, DataType::Float32);
+
+    input0->GetOutputSlot(0).SetTensorInfo(info);
+    input1->GetOutputSlot(0).SetTensorInfo(info);
+    input2->GetOutputSlot(0).SetTensorInfo(info);
+    add->GetOutputSlot(0).SetTensorInfo(info);
+    sub->GetOutputSlot(0).SetTensorInfo(info);
+
+    std::vector<BackendId> backends = { Compute::GpuAcc, Compute::CpuAcc };
+    // Use BackendSelectionHint to specify CpuAcc for Subtraction layer
+    sub->BackendSelectionHint(backends[1]);
+
+    // optimize the network
+    OptimizerOptions optOptions;
+    IOptimizedNetworkPtr optNet = Optimize(*net, backends, runtime->GetDeviceSpec(), optOptions);
+
+    OptimizedNetwork* optNetObjPtr = PolymorphicDowncast<OptimizedNetwork*>(optNet.get());
+    Graph& graph = optNetObjPtr->GetGraph();
+
+    armnn::Layer* const layer0 = GetFirstLayerWithName(graph, "input0");
+    armnn::Layer* const layer1 = GetFirstLayerWithName(graph, "input1");
+    armnn::Layer* const layer2 = GetFirstLayerWithName(graph, "input2");
+    armnn::Layer* const layer3 = GetFirstLayerWithName(graph, "add");
+    armnn::Layer* const layer4 = GetFirstLayerWithName(graph, "[ add (0) -> sub (1) ]");
+    armnn::Layer* const layer5 = GetFirstLayerWithName(graph, "sub");
+    armnn::Layer* const layer6 = GetFirstLayerWithName(graph, "output");
+
+    // Checks order is valid.
+    BOOST_TEST(CheckOrder(graph, layer0, layer1));
+    BOOST_TEST(CheckOrder(graph, layer1, layer2));
+    BOOST_TEST(CheckOrder(graph, layer2, layer3));
+    BOOST_TEST(CheckOrder(graph, layer3, layer4));
+    BOOST_TEST(CheckOrder(graph, layer4, layer5));
+    BOOST_TEST(CheckOrder(graph, layer5, layer6));
+
+    // Use memory import between backends
+    BOOST_TEST((layer4->GetType() == LayerType::MemCopy));
+
+    // Correctly use backend hint
+    BOOST_TEST((layer5->GetBackendId() == Compute::CpuAcc));
+
+    // Load it into the runtime. It should pass.
+    NetworkId netId;
+    runtime->LoadNetwork(netId, std::move(optNet));
+
+    // Creates structures for input & output
+    std::vector<float> inputData0
+    {
+        1.0f, 1.0f, 2.0f, 2.0f, 2.0f, 3.0f, 4.0f, 4.0f, 5.0f, 5.0f, 6.0f, 6.0f
+    };
+    std::vector<float> inputData1
+    {
+        0.0f, 1.0f, 1.0f, 2.0f, 3.0f, 3.0f, 3.0f, 4.0f, 4.0f, 5.0f, 5.0f, 6.0f
+    };
+    std::vector<float> inputData2
+    {
+        12.0f, 11.0f, 10.0f, 9.0f, 8.0f, 7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f
+    };
+
+    std::vector<float> outputData(12);
+
+    std::vector<float> expectedOutput
+    {
+        11.0f, 9.0f, 7.0f, 5.0f, 3.0f, 1.0f, -1.0f, -3.0f, -5.0f, -7.0f, -9.0f, -11.0f
+    };
+
+    InputTensors inputTensors
+    {
+        { 0, armnn::ConstTensor(runtime->GetInputTensorInfo(netId, 0), inputData0.data()) },
+        { 1, armnn::ConstTensor(runtime->GetInputTensorInfo(netId, 1), inputData1.data()) },
+        { 2, armnn::ConstTensor(runtime->GetInputTensorInfo(netId, 2), inputData2.data()) }
+    };
+    OutputTensors outputTensors
+    {
+        { 0, armnn::Tensor(runtime->GetOutputTensorInfo(netId, 0), outputData.data()) }
+    };
+
+    runtime->GetProfiler(netId)->EnableProfiling(true);
+
+    // Do the inference
+    runtime->EnqueueWorkload(netId, inputTensors, outputTensors);
+
+    // Retrieve the Profiler.Print() output to get the workload execution
+    ProfilerManager& profilerManager = armnn::ProfilerManager::GetInstance();
+    std::stringstream ss;
+    profilerManager.GetProfiler()->Print(ss);
+    std::string dump = ss.str();
+
+    // Executed Subtraction using CpuAcc
+    std::size_t found = dump.find("NeonSubtractionWorkload_Execute");
+    BOOST_TEST(found != std::string::npos);
+
+    // Contain CopyMemGeneric
+    found = dump.find("CopyMemGeneric");
+    BOOST_TEST(found != std::string::npos);
+
+    // Check output is as expected
+    BOOST_TEST(outputData == expectedOutput);
+}
+
+BOOST_AUTO_TEST_CASE(ClImportEnabledFallbackSubgraphToNeon)
+{
+    using namespace armnn;
+
+    IRuntime::CreationOptions options;
+    IRuntimePtr runtime(IRuntime::Create(options));
+
+    // Builds up the structure of the network.
+    INetworkPtr net(INetwork::Create());
+
+    Pooling2dDescriptor desc;
+
+    IConnectableLayer* input0 = net->AddInputLayer(0, "input0");
+    IConnectableLayer* input1 = net->AddInputLayer(1, "input1");
+    IConnectableLayer* input2 = net->AddInputLayer(2, "input2");
+    IConnectableLayer* add = net->AddAdditionLayer("add");
+    IConnectableLayer* sub = net->AddSubtractionLayer("sub");
+    IConnectableLayer* pooling = net->AddPooling2dLayer(desc, "pooling");
+    IConnectableLayer* output = net->AddOutputLayer(0, "output");
+
+    input0->GetOutputSlot(0).Connect(add->GetInputSlot(0));
+    input1->GetOutputSlot(0).Connect(add->GetInputSlot(1));
+    input2->GetOutputSlot(0).Connect(sub->GetInputSlot(0));
+    add->GetOutputSlot(0).Connect(sub->GetInputSlot(1));
+    sub->GetOutputSlot(0).Connect(pooling->GetInputSlot(0));
+    pooling->GetOutputSlot(0).Connect(output->GetInputSlot(0));
+
+    TensorInfo info = TensorInfo({ 1, 2, 3, 2 }, DataType::Float32);
+    TensorInfo poolingInfo = TensorInfo({ 1, 2, 1, 1 }, DataType::Float32);
+
+    input0->GetOutputSlot(0).SetTensorInfo(info);
+    input1->GetOutputSlot(0).SetTensorInfo(info);
+    input2->GetOutputSlot(0).SetTensorInfo(info);
+    add->GetOutputSlot(0).SetTensorInfo(info);
+    sub->GetOutputSlot(0).SetTensorInfo(info);
+    pooling->GetOutputSlot(0).SetTensorInfo(poolingInfo);
+
+    std::vector<BackendId> backends = { Compute::GpuAcc, Compute::CpuAcc };
+    // Use BackendSelectionHint to specify CpuAcc for Subtraction layer
+    sub->BackendSelectionHint(backends[1]);
+
+    // optimize the network
+    OptimizerOptions optOptions;
+    optOptions.m_ImportEnabled = true;
+    IOptimizedNetworkPtr optNet = Optimize(*net, backends, runtime->GetDeviceSpec(), optOptions);
+
+    OptimizedNetwork* optNetObjPtr = PolymorphicDowncast<OptimizedNetwork*>(optNet.get());
+    Graph& graph = optNetObjPtr->GetGraph();
+
+    armnn::Layer* const layer0 = GetFirstLayerWithName(graph, "input0");
+    armnn::Layer* const layer1 = GetFirstLayerWithName(graph, "input1");
+    armnn::Layer* const layer2 = GetFirstLayerWithName(graph, "input2");
+    armnn::Layer* const layer3 = GetFirstLayerWithName(graph, "add");
+    armnn::Layer* const layer4 = GetFirstLayerWithName(graph, "[ add (0) -> sub (1) ]");
+    armnn::Layer* const layer5 = GetFirstLayerWithName(graph, "sub");
+    armnn::Layer* const layer6 = GetFirstLayerWithName(graph, "[ sub (0) -> pooling (0) ]");
+    armnn::Layer* const layer7 = GetFirstLayerWithName(graph, "pooling");
+    armnn::Layer* const layer8 = GetFirstLayerWithName(graph, "output");
+
+    // Checks order is valid.
+    BOOST_TEST(CheckOrder(graph, layer0, layer1));
+    BOOST_TEST(CheckOrder(graph, layer1, layer2));
+    BOOST_TEST(CheckOrder(graph, layer2, layer3));
+    BOOST_TEST(CheckOrder(graph, layer3, layer4));
+    BOOST_TEST(CheckOrder(graph, layer4, layer5));
+    BOOST_TEST(CheckOrder(graph, layer5, layer6));
+    BOOST_TEST(CheckOrder(graph, layer6, layer7));
+    BOOST_TEST(CheckOrder(graph, layer7, layer8));
+
+    // Use memory import between backends
+    BOOST_TEST((layer4->GetType() == LayerType::MemCopy));
+    BOOST_TEST((layer6->GetType() == LayerType::MemCopy));
+
+    // Correctly use backend hint
+    BOOST_TEST((layer5->GetBackendId() == Compute::CpuAcc));
+
+    // Load it into the runtime. It should pass.
+    NetworkId netId;
+    std::string ignoredErrorMessage;
+    INetworkProperties networkProperties(true, true);
+
+    runtime->LoadNetwork(netId, std::move(optNet), ignoredErrorMessage, networkProperties);
+
+    // Creates structures for input & output
+    std::vector<float> inputData0
+    {
+        1.0f, 1.0f, 2.0f, 2.0f, 2.0f, 3.0f, 4.0f, 4.0f, 5.0f, 5.0f, 6.0f, 6.0f
+    };
+    std::vector<float> inputData1
+    {
+        0.0f, 1.0f, 1.0f, 2.0f, 3.0f, 3.0f, 3.0f, 4.0f, 4.0f, 5.0f, 5.0f, 6.0f
+    };
+    std::vector<float> inputData2
+    {
+        12.0f, 11.0f, 10.0f, 9.0f, 8.0f, 7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f
+    };
+
+    std::vector<float> outputData(2);
+
+    std::vector<float> expectedOutput{ 11.0f, -1.0f };
+
+    InputTensors inputTensors
+    {
+        { 0, armnn::ConstTensor(runtime->GetInputTensorInfo(netId, 0), inputData0.data()) },
+        { 1, armnn::ConstTensor(runtime->GetInputTensorInfo(netId, 1), inputData1.data()) },
+        { 2, armnn::ConstTensor(runtime->GetInputTensorInfo(netId, 2), inputData2.data()) }
+    };
+    OutputTensors outputTensors
+    {
+        { 0, armnn::Tensor(runtime->GetOutputTensorInfo(netId, 0), outputData.data()) }
+    };
+
+    runtime->GetProfiler(netId)->EnableProfiling(true);
+
+    // Do the inference
+    runtime->EnqueueWorkload(netId, inputTensors, outputTensors);
+
+    // Retrieve the Profiler.Print() output to get the workload execution
+    ProfilerManager& profilerManager = armnn::ProfilerManager::GetInstance();
+    std::stringstream ss;
+    profilerManager.GetProfiler()->Print(ss);
+    std::string dump = ss.str();
+
+    // Executed Subtraction using CpuAcc
+    std::size_t found = dump.find("NeonSubtractionWorkload_Execute");
+    BOOST_TEST(found != std::string::npos);
+
+    // Correctly switch back to GpuAcc
+    found = dump.find("ClPooling2dWorkload_Execute");
+    BOOST_TEST(found != std::string::npos);
+
+    // Contain CopyMemGeneric
+    found = dump.find("CopyMemGeneric");
+    BOOST_TEST(found != std::string::npos);
+
+    // Check output is as expected
+    BOOST_TEST(outputData == expectedOutput);
+}
+
+BOOST_AUTO_TEST_CASE(ClImportDisableFallbackSubgraphToNeon)
+{
+    using namespace armnn;
+
+    IRuntime::CreationOptions options;
+    IRuntimePtr runtime(IRuntime::Create(options));
+
+    // Builds up the structure of the network.
+    INetworkPtr net(INetwork::Create());
+
+    Pooling2dDescriptor desc;
+
+    IConnectableLayer* input0 = net->AddInputLayer(0, "input0");
+    IConnectableLayer* input1 = net->AddInputLayer(1, "input1");
+    IConnectableLayer* input2 = net->AddInputLayer(2, "input2");
+    IConnectableLayer* add = net->AddAdditionLayer("add");
+    IConnectableLayer* sub = net->AddSubtractionLayer("sub");
+    IConnectableLayer* pooling = net->AddPooling2dLayer(desc, "pooling");
+    IConnectableLayer* output = net->AddOutputLayer(0, "output");
+
+    input0->GetOutputSlot(0).Connect(add->GetInputSlot(0));
+    input1->GetOutputSlot(0).Connect(add->GetInputSlot(1));
+    input2->GetOutputSlot(0).Connect(sub->GetInputSlot(0));
+    add->GetOutputSlot(0).Connect(sub->GetInputSlot(1));
+    sub->GetOutputSlot(0).Connect(pooling->GetInputSlot(0));
+    pooling->GetOutputSlot(0).Connect(output->GetInputSlot(0));
+
+    TensorInfo info = TensorInfo({ 1, 2, 3, 2 }, DataType::Float32);
+    TensorInfo poolingInfo = TensorInfo({ 1, 2, 1, 1 }, DataType::Float32);
+
+    input0->GetOutputSlot(0).SetTensorInfo(info);
+    input1->GetOutputSlot(0).SetTensorInfo(info);
+    input2->GetOutputSlot(0).SetTensorInfo(info);
+    add->GetOutputSlot(0).SetTensorInfo(info);
+    sub->GetOutputSlot(0).SetTensorInfo(info);
+    pooling->GetOutputSlot(0).SetTensorInfo(poolingInfo);
+
+    std::vector<BackendId> backends = { Compute::GpuAcc, Compute::CpuAcc };
+    // Use BackendSelectionHint to specify CpuAcc for Subtraction layer
+    sub->BackendSelectionHint(backends[1]);
+
+    // optimize the network
+    OptimizerOptions optOptions;
+    IOptimizedNetworkPtr optNet = Optimize(*net, backends, runtime->GetDeviceSpec(), optOptions);
+
+    OptimizedNetwork* optNetObjPtr = PolymorphicDowncast<OptimizedNetwork*>(optNet.get());
+    Graph& graph = optNetObjPtr->GetGraph();
+
+    armnn::Layer* const layer0 = GetFirstLayerWithName(graph, "input0");
+    armnn::Layer* const layer1 = GetFirstLayerWithName(graph, "input1");
+    armnn::Layer* const layer2 = GetFirstLayerWithName(graph, "input2");
+    armnn::Layer* const layer3 = GetFirstLayerWithName(graph, "add");
+    armnn::Layer* const layer4 = GetFirstLayerWithName(graph, "[ add (0) -> sub (1) ]");
+    armnn::Layer* const layer5 = GetFirstLayerWithName(graph, "sub");
+    armnn::Layer* const layer6 = GetFirstLayerWithName(graph, "[ sub (0) -> pooling (0) ]");
+    armnn::Layer* const layer7 = GetFirstLayerWithName(graph, "pooling");
+    armnn::Layer* const layer8 = GetFirstLayerWithName(graph, "output");
+
+    // Checks order is valid.
+    BOOST_TEST(CheckOrder(graph, layer0, layer1));
+    BOOST_TEST(CheckOrder(graph, layer1, layer2));
+    BOOST_TEST(CheckOrder(graph, layer2, layer3));
+    BOOST_TEST(CheckOrder(graph, layer3, layer4));
+    BOOST_TEST(CheckOrder(graph, layer4, layer5));
+    BOOST_TEST(CheckOrder(graph, layer5, layer6));
+    BOOST_TEST(CheckOrder(graph, layer6, layer7));
+    BOOST_TEST(CheckOrder(graph, layer7, layer8));
+
+    // Use memory import between backends
+    BOOST_TEST((layer4->GetType() == LayerType::MemCopy));
+    BOOST_TEST((layer6->GetType() == LayerType::MemCopy));
+
+    // Correctly use backend hint
+    BOOST_TEST((layer5->GetBackendId() == Compute::CpuAcc));
+
+    // Load it into the runtime. It should pass.
+    NetworkId netId;
+    runtime->LoadNetwork(netId, std::move(optNet));
+
+    // Creates structures for input & output
+    std::vector<float> inputData0
+    {
+        1.0f, 1.0f, 2.0f, 2.0f, 2.0f, 3.0f, 4.0f, 4.0f, 5.0f, 5.0f, 6.0f, 6.0f
+    };
+    std::vector<float> inputData1
+    {
+        0.0f, 1.0f, 1.0f, 2.0f, 3.0f, 3.0f, 3.0f, 4.0f, 4.0f, 5.0f, 5.0f, 6.0f
+    };
+    std::vector<float> inputData2
+    {
+        12.0f, 11.0f, 10.0f, 9.0f, 8.0f, 7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f
+    };
+
+    std::vector<float> outputData(2);
+
+    std::vector<float> expectedOutput{ 11.0f, -1.0f };
+
+    InputTensors inputTensors
+    {
+        { 0, armnn::ConstTensor(runtime->GetInputTensorInfo(netId, 0), inputData0.data()) },
+        { 1, armnn::ConstTensor(runtime->GetInputTensorInfo(netId, 1), inputData1.data()) },
+        { 2, armnn::ConstTensor(runtime->GetInputTensorInfo(netId, 2), inputData2.data()) }
+    };
+    OutputTensors outputTensors
+    {
+        { 0, armnn::Tensor(runtime->GetOutputTensorInfo(netId, 0), outputData.data()) }
+    };
+
+    runtime->GetProfiler(netId)->EnableProfiling(true);
+
+    // Do the inference
+    runtime->EnqueueWorkload(netId, inputTensors, outputTensors);
+
+    // Retrieve the Profiler.Print() output to get the workload execution
+    ProfilerManager& profilerManager = armnn::ProfilerManager::GetInstance();
+    std::stringstream ss;
+    profilerManager.GetProfiler()->Print(ss);
+    std::string dump = ss.str();
+
+    // Executed Subtraction using CpuAcc
+    std::size_t found = dump.find("NeonSubtractionWorkload_Execute");
+    BOOST_TEST(found != std::string::npos);
+
+    // Correctly switch back to GpuAcc
+    found = dump.find("ClPooling2dWorkload_Execute");
+    BOOST_TEST(found != std::string::npos);
+
+    // Contain CopyMemGeneric
+    found = dump.find("CopyMemGeneric");
+    BOOST_TEST(found != std::string::npos);
+
+    // Check output is as expected
+    BOOST_TEST(outputData == expectedOutput);
+}
+
+BOOST_AUTO_TEST_SUITE_END()
diff --git a/src/backends/neon/NeonBackend.cpp b/src/backends/neon/NeonBackend.cpp
index 642c19dbe4..9862ddbd70 100644
--- a/src/backends/neon/NeonBackend.cpp
+++ b/src/backends/neon/NeonBackend.cpp
@@ -129,8 +129,7 @@ OptimizationViews NeonBackend::OptimizeSubgraphView(const SubgraphView& subgraph
 
 std::vector<ITensorHandleFactory::FactoryId> NeonBackend::GetHandleFactoryPreferences() const
 {
-    return std::vector<ITensorHandleFactory::FactoryId>() = {"Arm/Neon/TensorHandleFactory",
-                                                             "Arm/Cl/TensorHandleFactory"};
+    return std::vector<ITensorHandleFactory::FactoryId>() = { NeonTensorHandleFactory::GetIdStatic() };
 }
 
 void NeonBackend::RegisterTensorHandleFactories(class TensorHandleFactoryRegistry& registry)
diff --git a/src/backends/neon/test/NeonFallbackTests.cpp b/src/backends/neon/test/NeonFallbackTests.cpp
index 9a07ed236f..fd7fbbc4d5 100644
--- a/src/backends/neon/test/NeonFallbackTests.cpp
+++ b/src/backends/neon/test/NeonFallbackTests.cpp
@@ -12,8 +12,6 @@
 
 BOOST_AUTO_TEST_SUITE(NeonFallback)
 
-std::vector<BackendId> defaultBackends = { armnn::Compute::CpuAcc };
-
 BOOST_AUTO_TEST_CASE(FallbackImportToCpuAcc)
 {
     using namespace armnn;
@@ -684,4 +682,534 @@ BOOST_AUTO_TEST_CASE(FallbackDisableImportFromCpuAcc)
     BOOST_TEST(outputData == expectedOutput);
 }
 
+#if defined(ARMCOMPUTECL_ENABLED)
+BOOST_AUTO_TEST_CASE(NeonImportEnabledFallbackToCl)
+{
+    using namespace armnn;
+
+    IRuntime::CreationOptions options;
+    IRuntimePtr runtime(IRuntime::Create(options));
+
+    // Builds up the structure of the network.
+    INetworkPtr net(INetwork::Create());
+
+    IConnectableLayer* input0 = net->AddInputLayer(0, "input0");
+    IConnectableLayer* input1 = net->AddInputLayer(1, "input1");
+    IConnectableLayer* input2 = net->AddInputLayer(2, "input2");
+    IConnectableLayer* add = net->AddAdditionLayer("add");
+    IConnectableLayer* sub = net->AddSubtractionLayer("sub");
+    IConnectableLayer* output = net->AddOutputLayer(0, "output");
+
+    input0->GetOutputSlot(0).Connect(add->GetInputSlot(0));
+    input1->GetOutputSlot(0).Connect(add->GetInputSlot(1));
+    input2->GetOutputSlot(0).Connect(sub->GetInputSlot(0));
+    add->GetOutputSlot(0).Connect(sub->GetInputSlot(1));
+    sub->GetOutputSlot(0).Connect(output->GetInputSlot(0));
+
+    TensorInfo info = TensorInfo({ 1, 2, 3, 2 }, DataType::Float32);
+
+    input0->GetOutputSlot(0).SetTensorInfo(info);
+    input1->GetOutputSlot(0).SetTensorInfo(info);
+    input2->GetOutputSlot(0).SetTensorInfo(info);
+    add->GetOutputSlot(0).SetTensorInfo(info);
+    sub->GetOutputSlot(0).SetTensorInfo(info);
+
+    std::vector<BackendId> backends = { Compute::CpuAcc, Compute::GpuAcc };
+    // Use BackendSelectionHint to specify GpuAcc for Subtraction layer
+    sub->BackendSelectionHint(backends[1]);
+
+    // optimize the network
+    OptimizerOptions optOptions;
+    optOptions.m_ImportEnabled = true;
+    IOptimizedNetworkPtr optNet = Optimize(*net, backends, runtime->GetDeviceSpec(), optOptions);
+
+    OptimizedNetwork* optNetObjPtr = PolymorphicDowncast<OptimizedNetwork*>(optNet.get());
+    Graph& graph = optNetObjPtr->GetGraph();
+
+    armnn::Layer* const layer0 = GetFirstLayerWithName(graph, "input0");
+    armnn::Layer* const layer1 = GetFirstLayerWithName(graph, "input1");
+    armnn::Layer* const layer2 = GetFirstLayerWithName(graph, "input2");
+    armnn::Layer* const layer3 = GetFirstLayerWithName(graph, "add");
+    armnn::Layer* const layer4 = GetFirstLayerWithName(graph, "[ add (0) -> sub (1) ]");
+    armnn::Layer* const layer5 = GetFirstLayerWithName(graph, "sub");
+    armnn::Layer* const layer6 = GetFirstLayerWithName(graph, "output");
+
+    // Checks order is valid.
+    BOOST_TEST(CheckOrder(graph, layer0, layer1));
+    BOOST_TEST(CheckOrder(graph, layer1, layer2));
+    BOOST_TEST(CheckOrder(graph, layer2, layer3));
+    BOOST_TEST(CheckOrder(graph, layer3, layer4));
+    BOOST_TEST(CheckOrder(graph, layer4, layer5));
+    BOOST_TEST(CheckOrder(graph, layer5, layer6));
+
+    // Use memory import between backends
+    BOOST_TEST((layer4->GetType() == LayerType::MemCopy));
+
+    // Correctly use backend hint
+    BOOST_TEST((layer5->GetBackendId() == Compute::GpuAcc));
+
+    // Load it into the runtime. It should pass.
+    NetworkId netId;
+    std::string ignoredErrorMessage;
+    INetworkProperties networkProperties(true, true);
+
+    runtime->LoadNetwork(netId, std::move(optNet), ignoredErrorMessage, networkProperties);
+
+    // Creates structures for input & output
+    std::vector<float> inputData0
+    {
+        1.0f, 1.0f, 2.0f, 2.0f, 2.0f, 3.0f, 4.0f, 4.0f, 5.0f, 5.0f, 6.0f, 6.0f
+    };
+    std::vector<float> inputData1
+    {
+        0.0f, 1.0f, 1.0f, 2.0f, 3.0f, 3.0f, 3.0f, 4.0f, 4.0f, 5.0f, 5.0f, 6.0f
+    };
+    std::vector<float> inputData2
+    {
+        12.0f, 11.0f, 10.0f, 9.0f, 8.0f, 7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f
+    };
+
+    std::vector<float> outputData(12);
+
+    std::vector<float> expectedOutput
+    {
+        11.0f, 9.0f, 7.0f, 5.0f, 3.0f, 1.0f, -1.0f, -3.0f, -5.0f, -7.0f, -9.0f, -11.0f
+    };
+
+    InputTensors inputTensors
+    {
+        { 0, armnn::ConstTensor(runtime->GetInputTensorInfo(netId, 0), inputData0.data()) },
+        { 1, armnn::ConstTensor(runtime->GetInputTensorInfo(netId, 1), inputData1.data()) },
+        { 2, armnn::ConstTensor(runtime->GetInputTensorInfo(netId, 2), inputData2.data()) }
+    };
+    OutputTensors outputTensors
+    {
+        { 0, armnn::Tensor(runtime->GetOutputTensorInfo(netId, 0), outputData.data()) }
+    };
+
+    runtime->GetProfiler(netId)->EnableProfiling(true);
+
+    // Do the inference
+    runtime->EnqueueWorkload(netId, inputTensors, outputTensors);
+
+    // Retrieve the Profiler.Print() output to get the workload execution
+    ProfilerManager& profilerManager = armnn::ProfilerManager::GetInstance();
+    std::stringstream ss;
+    profilerManager.GetProfiler()->Print(ss);
+    std::string dump = ss.str();
+
+    // Executed Subtraction using GpuAcc
+    std::size_t found = dump.find("ClSubtractionWorkload_Execute");
+    BOOST_TEST(found != std::string::npos);
+
+    // Contain CopyMemGeneric
+    found = dump.find("CopyMemGeneric");
+    BOOST_TEST(found != std::string::npos);
+
+    // Check output is as expected
+    BOOST_TEST(outputData == expectedOutput);
+}
+
+BOOST_AUTO_TEST_CASE(NeonImportDisabledFallbackToCl)
+{
+    using namespace armnn;
+
+    IRuntime::CreationOptions options;
+    IRuntimePtr runtime(IRuntime::Create(options));
+
+    // Builds up the structure of the network.
+    INetworkPtr net(INetwork::Create());
+
+    IConnectableLayer* input0 = net->AddInputLayer(0, "input0");
+    IConnectableLayer* input1 = net->AddInputLayer(1, "input1");
+    IConnectableLayer* input2 = net->AddInputLayer(2, "input2");
+    IConnectableLayer* add = net->AddAdditionLayer("add");
+    IConnectableLayer* sub = net->AddSubtractionLayer("sub");
+    IConnectableLayer* output = net->AddOutputLayer(0, "output");
+
+    input0->GetOutputSlot(0).Connect(add->GetInputSlot(0));
+    input1->GetOutputSlot(0).Connect(add->GetInputSlot(1));
+    input2->GetOutputSlot(0).Connect(sub->GetInputSlot(0));
+    add->GetOutputSlot(0).Connect(sub->GetInputSlot(1));
+    sub->GetOutputSlot(0).Connect(output->GetInputSlot(0));
+
+    TensorInfo info = TensorInfo({ 1, 2, 3, 2 }, DataType::Float32);
+
+    input0->GetOutputSlot(0).SetTensorInfo(info);
+    input1->GetOutputSlot(0).SetTensorInfo(info);
+    input2->GetOutputSlot(0).SetTensorInfo(info);
+    add->GetOutputSlot(0).SetTensorInfo(info);
+    sub->GetOutputSlot(0).SetTensorInfo(info);
+
+    std::vector<BackendId> backends = { Compute::CpuAcc, Compute::GpuAcc };
+    // Use BackendSelectionHint to specify GpuAcc for Subtraction layer
+    sub->BackendSelectionHint(backends[1]);
+
+    // optimize the network
+    OptimizerOptions optOptions;
+    IOptimizedNetworkPtr optNet = Optimize(*net, backends, runtime->GetDeviceSpec(), optOptions);
+
+    OptimizedNetwork* optNetObjPtr = PolymorphicDowncast<OptimizedNetwork*>(optNet.get());
+    Graph& graph = optNetObjPtr->GetGraph();
+
+    armnn::Layer* const layer0 = GetFirstLayerWithName(graph, "input0");
+    armnn::Layer* const layer1 = GetFirstLayerWithName(graph, "input1");
+    armnn::Layer* const layer2 = GetFirstLayerWithName(graph, "input2");
+    armnn::Layer* const layer3 = GetFirstLayerWithName(graph, "add");
+    armnn::Layer* const layer4 = GetFirstLayerWithName(graph, "[ add (0) -> sub (1) ]");
+    armnn::Layer* const layer5 = GetFirstLayerWithName(graph, "sub");
+    armnn::Layer* const layer6 = GetFirstLayerWithName(graph, "output");
+
+    // Checks order is valid.
+    BOOST_TEST(CheckOrder(graph, layer0, layer1));
+    BOOST_TEST(CheckOrder(graph, layer1, layer2));
+    BOOST_TEST(CheckOrder(graph, layer2, layer3));
+    BOOST_TEST(CheckOrder(graph, layer3, layer4));
+    BOOST_TEST(CheckOrder(graph, layer4, layer5));
+    BOOST_TEST(CheckOrder(graph, layer5, layer6));
+
+    // Use memory import between backends
+    BOOST_TEST((layer4->GetType() == LayerType::MemCopy));
+
+    // Correctly use backend hint
+    BOOST_TEST((layer5->GetBackendId() == Compute::GpuAcc));
+
+    // Load it into the runtime. It should pass.
+    NetworkId netId;
+    runtime->LoadNetwork(netId, std::move(optNet));
+
+    // Creates structures for input & output
+    std::vector<float> inputData0
+    {
+        1.0f, 1.0f, 2.0f, 2.0f, 2.0f, 3.0f, 4.0f, 4.0f, 5.0f, 5.0f, 6.0f, 6.0f
+    };
+    std::vector<float> inputData1
+    {
+        0.0f, 1.0f, 1.0f, 2.0f, 3.0f, 3.0f, 3.0f, 4.0f, 4.0f, 5.0f, 5.0f, 6.0f
+    };
+    std::vector<float> inputData2
+    {
+        12.0f, 11.0f, 10.0f, 9.0f, 8.0f, 7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f
+    };
+
+    std::vector<float> outputData(12);
+
+    std::vector<float> expectedOutput
+    {
+        11.0f, 9.0f, 7.0f, 5.0f, 3.0f, 1.0f, -1.0f, -3.0f, -5.0f, -7.0f, -9.0f, -11.0f
+    };
+
+    InputTensors inputTensors
+    {
+        { 0, armnn::ConstTensor(runtime->GetInputTensorInfo(netId, 0), inputData0.data()) },
+        { 1, armnn::ConstTensor(runtime->GetInputTensorInfo(netId, 1), inputData1.data()) },
+        { 2, armnn::ConstTensor(runtime->GetInputTensorInfo(netId, 2), inputData2.data()) }
+    };
+    OutputTensors outputTensors
+    {
+        { 0, armnn::Tensor(runtime->GetOutputTensorInfo(netId, 0), outputData.data()) }
+    };
+
+    runtime->GetProfiler(netId)->EnableProfiling(true);
+
+    // Do the inference
+    runtime->EnqueueWorkload(netId, inputTensors, outputTensors);
+
+    // Retrieve the Profiler.Print() output to get the workload execution
+    ProfilerManager& profilerManager = armnn::ProfilerManager::GetInstance();
+    std::stringstream ss;
+    profilerManager.GetProfiler()->Print(ss);
+    std::string dump = ss.str();
+
+    // Executed Subtraction using GpuAcc
+    std::size_t found = dump.find("ClSubtractionWorkload_Execute");
+    BOOST_TEST(found != std::string::npos);
+
+    // Contain CopyMemGeneric
+    found = dump.find("CopyMemGeneric");
+    BOOST_TEST(found != std::string::npos);
+
+    // Check output is as expected
+    BOOST_TEST(outputData == expectedOutput);
+}
+
+BOOST_AUTO_TEST_CASE(NeonImportEnabledFallbackSubgraphToCl)
+{
+    using namespace armnn;
+
+    IRuntime::CreationOptions options;
+    IRuntimePtr runtime(IRuntime::Create(options));
+
+    // Builds up the structure of the network.
+    INetworkPtr net(INetwork::Create());
+
+    Pooling2dDescriptor desc;
+
+    IConnectableLayer* input0 = net->AddInputLayer(0, "input0");
+    IConnectableLayer* input1 = net->AddInputLayer(1, "input1");
+    IConnectableLayer* input2 = net->AddInputLayer(2, "input2");
+    IConnectableLayer* add = net->AddAdditionLayer("add");
+    IConnectableLayer* sub = net->AddSubtractionLayer("sub");
+    IConnectableLayer* pooling = net->AddPooling2dLayer(desc, "pooling");
+    IConnectableLayer* output = net->AddOutputLayer(0, "output");
+
+    input0->GetOutputSlot(0).Connect(add->GetInputSlot(0));
+    input1->GetOutputSlot(0).Connect(add->GetInputSlot(1));
+    input2->GetOutputSlot(0).Connect(sub->GetInputSlot(0));
+    add->GetOutputSlot(0).Connect(sub->GetInputSlot(1));
+    sub->GetOutputSlot(0).Connect(pooling->GetInputSlot(0));
+    pooling->GetOutputSlot(0).Connect(output->GetInputSlot(0));
+
+    TensorInfo info = TensorInfo({ 1, 2, 3, 2 }, DataType::Float32);
+    TensorInfo poolingInfo = TensorInfo({ 1, 2, 1, 1 }, DataType::Float32);
+
+    input0->GetOutputSlot(0).SetTensorInfo(info);
+    input1->GetOutputSlot(0).SetTensorInfo(info);
+    input2->GetOutputSlot(0).SetTensorInfo(info);
+    add->GetOutputSlot(0).SetTensorInfo(info);
+    sub->GetOutputSlot(0).SetTensorInfo(info);
+    pooling->GetOutputSlot(0).SetTensorInfo(poolingInfo);
+
+    std::vector<BackendId> backends = { Compute::CpuAcc, Compute::GpuAcc };
+    // Use BackendSelectionHint to specify GpuAcc for Subtraction layer
+    sub->BackendSelectionHint(backends[1]);
+
+    // optimize the network
+    OptimizerOptions optOptions;
+    optOptions.m_ImportEnabled = true;
+    IOptimizedNetworkPtr optNet = Optimize(*net, backends, runtime->GetDeviceSpec(), optOptions);
+
+    OptimizedNetwork* optNetObjPtr = PolymorphicDowncast<OptimizedNetwork*>(optNet.get());
+    Graph& graph = optNetObjPtr->GetGraph();
+
+    armnn::Layer* const layer0 = GetFirstLayerWithName(graph, "input0");
+    armnn::Layer* const layer1 = GetFirstLayerWithName(graph, "input1");
+    armnn::Layer* const layer2 = GetFirstLayerWithName(graph, "input2");
+    armnn::Layer* const layer3 = GetFirstLayerWithName(graph, "add");
+    armnn::Layer* const layer4 = GetFirstLayerWithName(graph, "[ add (0) -> sub (1) ]");
+    armnn::Layer* const layer5 = GetFirstLayerWithName(graph, "sub");
+    armnn::Layer* const layer6 = GetFirstLayerWithName(graph, "[ sub (0) -> pooling (0) ]");
+    armnn::Layer* const layer7 = GetFirstLayerWithName(graph, "pooling");
+    armnn::Layer* const layer8 = GetFirstLayerWithName(graph, "output");
+
+    // Checks order is valid.
+    BOOST_TEST(CheckOrder(graph, layer0, layer1));
+    BOOST_TEST(CheckOrder(graph, layer1, layer2));
+    BOOST_TEST(CheckOrder(graph, layer2, layer3));
+    BOOST_TEST(CheckOrder(graph, layer3, layer4));
+    BOOST_TEST(CheckOrder(graph, layer4, layer5));
+    BOOST_TEST(CheckOrder(graph, layer5, layer6));
+    BOOST_TEST(CheckOrder(graph, layer6, layer7));
+    BOOST_TEST(CheckOrder(graph, layer7, layer8));
+
+    // Use memory import between backends
+    BOOST_TEST((layer4->GetType() == LayerType::MemCopy));
+    BOOST_TEST((layer6->GetType() == LayerType::MemCopy));
+
+    // Correctly use backend hint
+    BOOST_TEST((layer5->GetBackendId() == Compute::GpuAcc));
+
+    // Load it into the runtime. It should pass.
+    NetworkId netId;
+    std::string ignoredErrorMessage;
+    INetworkProperties networkProperties(true, true);
+
+    runtime->LoadNetwork(netId, std::move(optNet), ignoredErrorMessage, networkProperties);
+
+    // Creates structures for input & output
+    std::vector<float> inputData0
+    {
+        1.0f, 1.0f, 2.0f, 2.0f, 2.0f, 3.0f, 4.0f, 4.0f, 5.0f, 5.0f, 6.0f, 6.0f
+    };
+    std::vector<float> inputData1
+    {
+        0.0f, 1.0f, 1.0f, 2.0f, 3.0f, 3.0f, 3.0f, 4.0f, 4.0f, 5.0f, 5.0f, 6.0f
+    };
+    std::vector<float> inputData2
+    {
+        12.0f, 11.0f, 10.0f, 9.0f, 8.0f, 7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f
+    };
+
+    std::vector<float> outputData(2);
+
+    std::vector<float> expectedOutput{ 11.0f, -1.0f };
+
+    InputTensors inputTensors
+    {
+        { 0, armnn::ConstTensor(runtime->GetInputTensorInfo(netId, 0), inputData0.data()) },
+        { 1, armnn::ConstTensor(runtime->GetInputTensorInfo(netId, 1), inputData1.data()) },
+        { 2, armnn::ConstTensor(runtime->GetInputTensorInfo(netId, 2), inputData2.data()) }
+    };
+    OutputTensors outputTensors
+    {
+        { 0, armnn::Tensor(runtime->GetOutputTensorInfo(netId, 0), outputData.data()) }
+    };
+
+    runtime->GetProfiler(netId)->EnableProfiling(true);
+
+    // Do the inference
+    runtime->EnqueueWorkload(netId, inputTensors, outputTensors);
+
+    // Retrieve the Profiler.Print() output to get the workload execution
+    ProfilerManager& profilerManager = armnn::ProfilerManager::GetInstance();
+    std::stringstream ss;
+    profilerManager.GetProfiler()->Print(ss);
+    std::string dump = ss.str();
+
+    // Executed Subtraction using GpuAcc
+    std::size_t found = dump.find("ClSubtractionWorkload_Execute");
+    BOOST_TEST(found != std::string::npos);
+
+    // Correctly switch back to CpuAcc
+    found = dump.find("NeonPooling2dWorkload_Execute");
+    BOOST_TEST(found != std::string::npos);
+
+    // Contain CopyMemGeneric
+    found = dump.find("CopyMemGeneric");
+    BOOST_TEST(found != std::string::npos);
+
+    // Contains SyncMemGeneric for output
+    found = dump.find("SyncMemGeneric");
+    BOOST_TEST(found != std::string::npos);
+
+    // Check output is as expected
+    BOOST_TEST(outputData == expectedOutput);
+}
+
+BOOST_AUTO_TEST_CASE(NeonImportDisableFallbackSubgraphToCl)
+{
+    using namespace armnn;
+
+    IRuntime::CreationOptions options;
+    IRuntimePtr runtime(IRuntime::Create(options));
+
+    // Builds up the structure of the network.
+    INetworkPtr net(INetwork::Create());
+
+    Pooling2dDescriptor desc;
+
+    IConnectableLayer* input0 = net->AddInputLayer(0, "input0");
+    IConnectableLayer* input1 = net->AddInputLayer(1, "input1");
+    IConnectableLayer* input2 = net->AddInputLayer(2, "input2");
+    IConnectableLayer* add = net->AddAdditionLayer("add");
+    IConnectableLayer* sub = net->AddSubtractionLayer("sub");
+    IConnectableLayer* pooling = net->AddPooling2dLayer(desc, "pooling");
+    IConnectableLayer* output = net->AddOutputLayer(0, "output");
+
+    input0->GetOutputSlot(0).Connect(add->GetInputSlot(0));
+    input1->GetOutputSlot(0).Connect(add->GetInputSlot(1));
+    input2->GetOutputSlot(0).Connect(sub->GetInputSlot(0));
+    add->GetOutputSlot(0).Connect(sub->GetInputSlot(1));
+    sub->GetOutputSlot(0).Connect(pooling->GetInputSlot(0));
+    pooling->GetOutputSlot(0).Connect(output->GetInputSlot(0));
+
+    TensorInfo info = TensorInfo({ 1, 2, 3, 2 }, DataType::Float32);
+    TensorInfo poolingInfo = TensorInfo({ 1, 2, 1, 1 }, DataType::Float32);
+
+    input0->GetOutputSlot(0).SetTensorInfo(info);
+    input1->GetOutputSlot(0).SetTensorInfo(info);
+    input2->GetOutputSlot(0).SetTensorInfo(info);
+    add->GetOutputSlot(0).SetTensorInfo(info);
+    sub->GetOutputSlot(0).SetTensorInfo(info);
+    pooling->GetOutputSlot(0).SetTensorInfo(poolingInfo);
+
+    std::vector<BackendId> backends = { Compute::CpuAcc, Compute::GpuAcc };
+    // Use BackendSelectionHint to specify GpuAcc for Subtraction layer
+    sub->BackendSelectionHint(backends[1]);
+
+    // optimize the network
+    OptimizerOptions optOptions;
+    IOptimizedNetworkPtr optNet = Optimize(*net, backends, runtime->GetDeviceSpec(), optOptions);
+
+    OptimizedNetwork* optNetObjPtr = PolymorphicDowncast<OptimizedNetwork*>(optNet.get());
+    Graph& graph = optNetObjPtr->GetGraph();
+
+    armnn::Layer* const layer0 = GetFirstLayerWithName(graph, "input0");
+    armnn::Layer* const layer1 = GetFirstLayerWithName(graph, "input1");
+    armnn::Layer* const layer2 = GetFirstLayerWithName(graph, "input2");
+    armnn::Layer* const layer3 = GetFirstLayerWithName(graph, "add");
+    armnn::Layer* const layer4 = GetFirstLayerWithName(graph, "[ add (0) -> sub (1) ]");
+    armnn::Layer* const layer5 = GetFirstLayerWithName(graph, "sub");
+    armnn::Layer* const layer6 = GetFirstLayerWithName(graph, "[ sub (0) -> pooling (0) ]");
+    armnn::Layer* const layer7 = GetFirstLayerWithName(graph, "pooling");
+    armnn::Layer* const layer8 = GetFirstLayerWithName(graph, "output");
+
+    // Checks order is valid.
+    BOOST_TEST(CheckOrder(graph, layer0, layer1));
+    BOOST_TEST(CheckOrder(graph, layer1, layer2));
+    BOOST_TEST(CheckOrder(graph, layer2, layer3));
+    BOOST_TEST(CheckOrder(graph, layer3, layer4));
+    BOOST_TEST(CheckOrder(graph, layer4, layer5));
+    BOOST_TEST(CheckOrder(graph, layer5, layer6));
+    BOOST_TEST(CheckOrder(graph, layer6, layer7));
+    BOOST_TEST(CheckOrder(graph, layer7, layer8));
+
+    // Use memory import between backends
+    BOOST_TEST((layer4->GetType() == LayerType::MemCopy));
+    BOOST_TEST((layer6->GetType() == LayerType::MemCopy));
+
+    // Correctly use backend hint
+    BOOST_TEST((layer5->GetBackendId() == Compute::GpuAcc));
+
+    // Load it into the runtime. It should pass.
+    NetworkId netId;
+    runtime->LoadNetwork(netId, std::move(optNet));
+
+    // Creates structures for input & output
+    std::vector<float> inputData0
+    {
+        1.0f, 1.0f, 2.0f, 2.0f, 2.0f, 3.0f, 4.0f, 4.0f, 5.0f, 5.0f, 6.0f, 6.0f
+    };
+    std::vector<float> inputData1
+    {
+        0.0f, 1.0f, 1.0f, 2.0f, 3.0f, 3.0f, 3.0f, 4.0f, 4.0f, 5.0f, 5.0f, 6.0f
+    };
+    std::vector<float> inputData2
+    {
+        12.0f, 11.0f, 10.0f, 9.0f, 8.0f, 7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f
+    };
+
+    std::vector<float> outputData(2);
+
+    std::vector<float> expectedOutput{ 11.0f, -1.0f };
+
+    InputTensors inputTensors
+    {
+        { 0, armnn::ConstTensor(runtime->GetInputTensorInfo(netId, 0), inputData0.data()) },
+        { 1, armnn::ConstTensor(runtime->GetInputTensorInfo(netId, 1), inputData1.data()) },
+        { 2, armnn::ConstTensor(runtime->GetInputTensorInfo(netId, 2), inputData2.data()) }
+    };
+    OutputTensors outputTensors
+    {
+        { 0, armnn::Tensor(runtime->GetOutputTensorInfo(netId, 0), outputData.data()) }
+    };
+
+    runtime->GetProfiler(netId)->EnableProfiling(true);
+
+    // Do the inference
+    runtime->EnqueueWorkload(netId, inputTensors, outputTensors);
+
+    // Retrieve the Profiler.Print() output to get the workload execution
+    ProfilerManager& profilerManager = armnn::ProfilerManager::GetInstance();
+    std::stringstream ss;
+    profilerManager.GetProfiler()->Print(ss);
+    std::string dump = ss.str();
+
+    // Executed Subtraction using GpuAcc
+    std::size_t found = dump.find("ClSubtractionWorkload_Execute");
+    BOOST_TEST(found != std::string::npos);
+
+    // Correctly switch back to CpuAcc
+    found = dump.find("NeonPooling2dWorkload_Execute");
+    BOOST_TEST(found != std::string::npos);
+
+    // Contain CopyMemGeneric
+    found = dump.find("CopyMemGeneric");
+    BOOST_TEST(found != std::string::npos);
+
+    // Check output is as expected
+    BOOST_TEST(outputData == expectedOutput);
+}
+#endif
+
 BOOST_AUTO_TEST_SUITE_END()
-- 
cgit v1.2.1
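
The control-flow change in LoadedNetwork::EnqueueInput and EnqueueOutput can be summarised by the minimal sketch below: try zero-copy memory import first, and fall back to an explicit copy instead of throwing when the backend cannot import. The TensorHandle type, the supportsImport flag, and this EnqueueInput signature are simplified, hypothetical stand-ins for illustration only, not Arm NN's real interfaces.

    #include <cstring>
    #include <iostream>
    #include <stdexcept>
    #include <vector>

    // Hypothetical stand-in for a backend tensor handle (not the real ITensorHandle).
    struct TensorHandle
    {
        bool supportsImport = false;      // models GetImportFlags() supporting MemorySource::Malloc
        const void* imported = nullptr;   // aliased external memory on the import path
        std::vector<char> storage;        // backend-owned memory on the copy path

        bool Import(const void* mem)
        {
            if (!supportsImport)
            {
                return false;
            }
            imported = mem;               // zero-copy: alias the caller's buffer
            return true;
        }
    };

    // Mirrors the patched control flow: needMemCopy stays true unless the import
    // path is taken, so an unsupported import degrades to a copy instead of throwing.
    void EnqueueInput(TensorHandle& dst, const void* src, size_t bytes, bool importEnabled)
    {
        bool needMemCopy = true;
        if (importEnabled && dst.supportsImport)  // stands in for CheckFlag(importFlags, Malloc)
        {
            needMemCopy = false;
            if (!dst.Import(src))
            {
                throw std::runtime_error("EnqueueInput: Memory Import failed");
            }
        }
        if (needMemCopy)
        {
            // Fallback: copy into backend-owned memory (the CopyMemGenericWorkload role).
            dst.storage.resize(bytes);
            std::memcpy(dst.storage.data(), src, bytes);
        }
    }

    int main()
    {
        std::vector<float> input{ 1.0f, 2.0f, 3.0f };
        const size_t bytes = input.size() * sizeof(float);

        TensorHandle importCapable{ true };   // import path taken, no copy
        TensorHandle copyOnly{ false };       // falls back to memcpy, no exception

        EnqueueInput(importCapable, input.data(), bytes, true);
        EnqueueInput(copyOnly, input.data(), bytes, true);

        std::cout << "imported: " << (importCapable.imported != nullptr)
                  << ", copied bytes: " << copyOnly.storage.size() << "\n";
    }

Before this patch the equivalent branch threw a MemoryImportException or MemoryExportException whenever the backend did not support import or export; the unit tests above assert the new fallback by checking for CopyMemGeneric in the profiler dump.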