From 06e0300ccf279c6b0fcbb5ef3b6fa36e00229492 Mon Sep 17 00:00:00 2001
From: Teresa Charlin
Date: Thu, 15 Oct 2020 13:16:07 +0100
Subject: IVGCVSW-5314 Create OptimizeForExclusiveConnection

 * FuseBatchNorm class has been added to facilitate testing
 * Only Convolution2D FP32 is fused for now

Signed-off-by: Teresa Charlin
Change-Id: I049c4770946ddca21b08516d4c9f4d0d22bf9b45
---
 src/armnn/Network.cpp                     |   3 +-
 src/armnn/optimizations/All.hpp           |   1 +
 src/armnn/optimizations/FuseBatchNorm.hpp | 152 +++++++++++++++
 src/armnn/optimizations/Optimization.hpp  |  56 ++++++
 src/armnn/test/OptimizerTests.cpp         | 308 +++++++++++++++++++++++++++++-
 5 files changed, 510 insertions(+), 10 deletions(-)
 create mode 100644 src/armnn/optimizations/FuseBatchNorm.hpp

diff --git a/src/armnn/Network.cpp b/src/armnn/Network.cpp
index 373f9992b4..6578b8445f 100644
--- a/src/armnn/Network.cpp
+++ b/src/armnn/Network.cpp
@@ -1054,7 +1054,8 @@ IOptimizedNetworkPtr Optimize(const INetwork& inNetwork,
                                                 OptimizeConsecutiveReshapes(),
                                                 FoldPadIntoConvolution2d(),
                                                 PermuteAndBatchToSpaceAsDepthToSpace(),
-                                                TransposeAndBatchToSpaceAsDepthToSpace()));
+                                                TransposeAndBatchToSpaceAsDepthToSpace(),
+                                                FuseBatchNormIntoConvolution2D()));
 
     // If Fp32 to Fp16 optimization is set convert Fp32 network to Fp16
     if (options.m_ReduceFp32ToFp16)
diff --git a/src/armnn/optimizations/All.hpp b/src/armnn/optimizations/All.hpp
index e89c36b834..d042616ba4 100644
--- a/src/armnn/optimizations/All.hpp
+++ b/src/armnn/optimizations/All.hpp
@@ -10,6 +10,7 @@
 #include "ConvertFp32NetworkToBf16.hpp"
 #include "ConvertFp32NetworkToFp16.hpp"
 #include "FoldPadIntoConvolution2d.hpp"
+#include "FuseBatchNorm.hpp"
 #include "MovePermuteUp.hpp"
 #include "MoveTransposeUp.hpp"
 #include "OptimizeConsecutiveReshapes.hpp"
diff --git a/src/armnn/optimizations/FuseBatchNorm.hpp b/src/armnn/optimizations/FuseBatchNorm.hpp
new file mode 100644
index 0000000000..e8e8c5d77f
--- /dev/null
+++ b/src/armnn/optimizations/FuseBatchNorm.hpp
@@ -0,0 +1,152 @@
+//
+// Copyright © 2020 Arm Ltd and Contributors. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#pragma once
+
+#include "Optimization.hpp"
+
+#include <armnnUtils/DataLayoutIndexed.hpp>
+
+namespace armnn
+{
+namespace optimizations
+{
+
+template <typename ConvLayer, typename BatchNormLayer>
+class FuseBatchNorm
+{
+public:
+    /// Run for every exclusive connection between any base Convolution layer and a child BatchNorm layer,
+    /// for non-quantized layers.
+    /// The child will be removed, and the base will be removed if it is left unconnected. A new Convolution
+    /// layer will be added; its weights and bias will be calculated from the weights and bias of the base
+    /// Convolution layer combined with the parameters of the child BatchNorm layer.
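+    ///
+    /// The fused parameters follow from substituting the convolution output (weights * x + bias)
+    /// into the batch norm definition, BN(z) = gamma * (z - mean) / sqrt(variance + epsilon) + beta:
+    ///
+    ///     fusedWeights = (gamma / sqrt(variance + epsilon)) * weights
+    ///     fusedBias    = (gamma * (bias - mean)) / sqrt(variance + epsilon) + beta
+    ///
+    /// so that fusedWeights * x + fusedBias == BN(weights * x + bias) for every input x.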
+    void Run(Graph& graph, InputSlot& connection) const
+    {
+        Layer& base  = connection.GetConnectedOutputSlot()->GetOwningLayer();
+        Layer& child = connection.GetOwningLayer();
+
+        ARMNN_ASSERT(base.GetType() == LayerType::Convolution2d);
+        ARMNN_ASSERT(child.GetType() == LayerType::BatchNormalization);
+
+        if (base.GetDataType() == DataType::Float32 && child.GetDataType() == DataType::Float32)
+        {
+            OutputSlot* parentOut = base.GetInputSlot(0).GetConnectedOutputSlot();
+            auto convLayer      = PolymorphicDowncast<ConvLayer*>(&base);
+            auto batchNormLayer = PolymorphicDowncast<BatchNormLayer*>(&child);
+
+            // Read convolution and batch norm parameters
+            BatchNormalizationDescriptor batchNormDescriptor = batchNormLayer->GetParameters();
+            auto epsilon = batchNormDescriptor.m_Eps;
+
+            ConstTensor betaTensor (batchNormLayer->m_Beta->GetTensorInfo(),  batchNormLayer->m_Beta->Map(true));
+            ConstTensor gammaTensor(batchNormLayer->m_Gamma->GetTensorInfo(), batchNormLayer->m_Gamma->Map(true));
+            ConstTensor meanTensor (batchNormLayer->m_Mean->GetTensorInfo(),  batchNormLayer->m_Mean->Map(true));
+            ConstTensor varTensor  (batchNormLayer->m_Variance->GetTensorInfo(),
+                                    batchNormLayer->m_Variance->Map(true));
+
+            auto convDescriptor = convLayer->GetParameters();
+            ConstTensor weightsTensor(convLayer->m_Weight->GetTensorInfo(), convLayer->m_Weight->Map(true));
+
+            armnnUtils::DataLayoutIndexed dataLayout(convDescriptor.m_DataLayout);
+            auto weightsShape = convLayer->m_Weight->GetTensorInfo().GetShape();
+            const unsigned int outputChannels = weightsShape[0];
+            const unsigned int inputChannels  = weightsShape[dataLayout.GetChannelsIndex()];
+            const unsigned int weightsHeight  = weightsShape[dataLayout.GetHeightIndex()];
+            const unsigned int weightsWidth   = weightsShape[dataLayout.GetWidthIndex()];
+
+            const auto* weightsBuffer = static_cast<const float*>(weightsTensor.GetMemoryArea());
+            const auto* betaBuffer    = static_cast<const float*>(betaTensor.GetMemoryArea());
+            const auto* gammaBuffer   = static_cast<const float*>(gammaTensor.GetMemoryArea());
+            const auto* meanBuffer    = static_cast<const float*>(meanTensor.GetMemoryArea());
+            const auto* varBuffer     = static_cast<const float*>(varTensor.GetMemoryArea());
+
+            std::vector<float> weightsVector (weightsBuffer, weightsBuffer + weightsTensor.GetNumElements());
+            std::vector<float> betaVector    (betaBuffer,    betaBuffer    + betaTensor.GetNumElements());
+            std::vector<float> gammaVector   (gammaBuffer,   gammaBuffer   + gammaTensor.GetNumElements());
+            std::vector<float> meanVector    (meanBuffer,    meanBuffer    + meanTensor.GetNumElements());
+            std::vector<float> varianceVector(varBuffer,     varBuffer     + varTensor.GetNumElements());
+
+            // fusedWeights = (gamma * weights) / sqrt(variance + epsilon)
+            std::vector<float> fusedWeightsVector(weightsVector.size());
+
+            unsigned int i = 0;
+            for (unsigned int cOut = 0; cOut < outputChannels; ++cOut)
+            {
+                // The scale factor depends only on the output channel, so the order of
+                // the three inner loops has no effect on the result.
+                auto mult = gammaVector[cOut] / sqrtf(varianceVector[cOut] + epsilon);
+                for (unsigned int cInput = 0; cInput < inputChannels; ++cInput)
+                {
+                    for (unsigned int h = 0; h < weightsHeight; ++h)
+                    {
+                        for (unsigned int w = 0; w < weightsWidth; ++w)
+                        {
+                            fusedWeightsVector[i] = mult * weightsVector[i];
+                            i++;
+                        }
+                    }
+                }
+            }
+            ConstTensor fusedWeightsTensor(convLayer->m_Weight->GetTensorInfo(), fusedWeightsVector);
+
+            // fusedBias = (gamma * (bias - mean)) / sqrt(variance + epsilon) + beta
+            std::vector<float> fusedBiasVector(outputChannels);
+            if (convDescriptor.m_BiasEnabled)
+            {
+                ARMNN_ASSERT_MSG(convLayer->m_Bias != nullptr,
+                                 "FuseBatchNorm: Bias data should not be null if bias is enabled.");
+
+                ConstTensor biasTensor(convLayer->m_Bias->GetTensorInfo(), convLayer->m_Bias->Map(true));
+                const auto* biasBuffer = static_cast<const float*>(biasTensor.GetMemoryArea());
+                std::vector<float> biasVector(biasBuffer, biasBuffer + biasTensor.GetNumElements());
+
+                for (unsigned int cOut = 0; cOut < outputChannels; ++cOut)
+                {
+                    fusedBiasVector[cOut] = ((gammaVector[cOut] * (biasVector[cOut] - meanVector[cOut])) /
+                                             sqrtf(varianceVector[cOut] + epsilon)) + betaVector[cOut];
+                }
+            }
+            else
+            {
+                convDescriptor.m_BiasEnabled = true;
+                std::vector<float> biasVector(outputChannels, 0);
+
+                for (unsigned int cOut = 0; cOut < outputChannels; ++cOut)
+                {
+                    fusedBiasVector[cOut] = ((gammaVector[cOut] * (biasVector[cOut] - meanVector[cOut])) /
+                                             sqrtf(varianceVector[cOut] + epsilon)) + betaVector[cOut];
+                }
+            }
+            ConstTensor fusedBiasTensor(TensorInfo({outputChannels}, DataType::Float32), fusedBiasVector);
+
+            // Insert the new convolution layer that has the batch norm parameters fused into it
+            const std::string name = std::string("fused-") + child.GetName() + std::string("-into-") + base.GetName();
+            auto& newConv2dLayer = *graph.InsertNewLayer<ConvLayer>(base.GetInputSlot(0),
+                                                                    convDescriptor,
+                                                                    name.c_str());
+            newConv2dLayer.m_Weight = std::make_unique<ScopedCpuTensorHandle>(fusedWeightsTensor);
+            newConv2dLayer.m_Bias   = std::make_unique<ScopedCpuTensorHandle>(ConstTensor(fusedBiasTensor));
+
+            // Reconnect with the original parent.
+            newConv2dLayer.GetOutputSlot().MoveAllConnections(*parentOut);
+            // The parent is now the new convolution2d layer.
+            parentOut = &newConv2dLayer.GetOutputSlot();
+
+            // Move the connections in the child's output to the parent layer.
+            // The child layer will be removed as it is left unconnected.
+            // The base layer will be removed if it is left unconnected.
+            child.GetOutputSlot().MoveAllConnections(*parentOut);
+        }
+    }
+
+protected:
+    FuseBatchNorm()  = default;
+    ~FuseBatchNorm() = default;
+};
+
+using FuseBatchNormIntoConvolution2D =
+    OptimizeForExclusiveConnection<Convolution2dLayer,
+                                   BatchNormalizationLayer,
+                                   FuseBatchNorm<Convolution2dLayer, BatchNormalizationLayer>>;
+
+} // namespace optimizations
+} // namespace armnn
\ No newline at end of file
diff --git a/src/armnn/optimizations/Optimization.hpp b/src/armnn/optimizations/Optimization.hpp
index 1796ac842b..320cae2b75 100644
--- a/src/armnn/optimizations/Optimization.hpp
+++ b/src/armnn/optimizations/Optimization.hpp
@@ -122,4 +122,60 @@ public:
     using OptimizeForTypeImpl<BaseType, OptimizeForConnectionImpl<BaseType, ChildType, Wrapped>>::OptimizeForTypeImpl;
 };
 
+/// Wrapper Optimization class that calls Wrapped::Run for every exclusive connection BaseType -> ChildType.
+/// - The Wrapped class mustn't remove the base layer. The optimizer will remove it if it is left
+///   unconnected after applying each optimization.
+/// - The Wrapped class mustn't affect existing connections in the same output. It might add new ones.
+/// - Children layers are removed if left unconnected after applying the wrapped optimization.
+template <typename BaseType, typename ChildType, typename Wrapped>
+class OptimizeForExclusiveConnectionImpl : public Wrapped
+{
+public:
+    using Wrapped::Wrapped;
+
+    void Run(Graph& graph, BaseType& base) const
+    {
+        for (auto output = base.BeginOutputSlots(); output != base.EndOutputSlots(); ++output)
+        {
+            // Only an output slot with a single connection qualifies: that is what makes
+            // the BaseType -> ChildType connection exclusive.
+            if (output->GetNumConnections() == 1)
+            {
+                for (auto&& childInput : output->GetConnections())
+                {
+                    if (childInput->GetOwningLayer().GetType() == LayerEnumOf<ChildType>())
+                    {
+                        Wrapped::Run(graph, *childInput);
+                    }
+                }
+
+                // Removes unconnected children.
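+                // The index only advances when a child is kept: erasing a fully unconnected
+                // child also removes its connection from this output slot and shifts the
+                // remaining connections down, so erasing and incrementing together would
+                // skip the next child.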
+                for (unsigned int i = 0; i < output->GetNumConnections();)
+                {
+                    Layer* child = &output->GetConnection(i)->GetOwningLayer();
+
+                    if (child->IsOutputUnconnected())
+                    {
+                        graph.EraseLayer(child);
+                    }
+                    else
+                    {
+                        ++i;
+                    }
+                }
+            }
+        }
+    }
+
+protected:
+    ~OptimizeForExclusiveConnectionImpl() = default;
+};
+
+template <typename BaseType, typename ChildType, typename Wrapped>
+class OptimizeForExclusiveConnection final
+    : public OptimizeForTypeImpl<BaseType, OptimizeForExclusiveConnectionImpl<BaseType, ChildType, Wrapped>>
+{
+public:
+    using OptimizeForTypeImpl<BaseType,
+                              OptimizeForExclusiveConnectionImpl<BaseType, ChildType, Wrapped>>::OptimizeForTypeImpl;
+};
+
 } // namespace armnn
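The identity the pass relies on can also be sanity-checked in isolation. A minimal standalone
sketch (not part of the patch; the file name and constants are illustrative):

    // scalar_fusion_check.cpp: per-element check that BN(conv(x)) == fusedConv(x).
    #include <cassert>
    #include <cmath>

    int main()
    {
        const float w = 2.0f, b = 3.0f;         // convolution weight and bias
        const float gamma = 0.5f, beta = 0.1f;  // batch norm scale and offset
        const float mean = 0.2f;
        const float variance = 1.3f;
        const float epsilon = 1e-4f;
        const float x = 4.0f;                   // an arbitrary input element

        // Unfused: convolution followed by batch normalisation.
        const float conv    = w * x + b;
        const float unfused = gamma * (conv - mean) / std::sqrt(variance + epsilon) + beta;

        // Fused: a single convolution with rescaled weight and adjusted bias,
        // mirroring the computation in FuseBatchNorm::Run.
        const float mult  = gamma / std::sqrt(variance + epsilon);
        const float fused = (mult * w) * x + (mult * (b - mean) + beta);

        assert(std::fabs(unfused - fused) < 1e-4f);
        return 0;
    }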
diff --git a/src/armnn/test/OptimizerTests.cpp b/src/armnn/test/OptimizerTests.cpp
index 3af50ecf3a..879905bda8 100644
--- a/src/armnn/test/OptimizerTests.cpp
+++ b/src/armnn/test/OptimizerTests.cpp
@@ -597,11 +597,11 @@ BOOST_AUTO_TEST_CASE(FoldPadLayerIntoConvolution2dLayer)
     };
 
     BOOST_TEST(CheckSequence(graph.cbegin(),
-        graph.cend(),
-        &IsLayerOfType<armnn::InputLayer>,
-        &IsLayerOfType<armnn::PadLayer>,
-        checkSimpleConv2d,
-        &IsLayerOfType<armnn::OutputLayer>));
+                             graph.cend(),
+                             &IsLayerOfType<armnn::InputLayer>,
+                             &IsLayerOfType<armnn::PadLayer>,
+                             checkSimpleConv2d,
+                             &IsLayerOfType<armnn::OutputLayer>));
 
     armnn::Optimizer::Pass(graph, armnn::MakeOptimizations(FoldPadIntoConvolution2d()));
 
@@ -622,10 +622,10 @@ BOOST_AUTO_TEST_CASE(FoldPadLayerIntoConvolution2dLayer)
     };
 
     BOOST_TEST(CheckSequence(graph.cbegin(),
-        graph.cend(),
-        &IsLayerOfType<armnn::InputLayer>,
-        checkPadFoldedIntoConv2d,
-        &IsLayerOfType<armnn::OutputLayer>));
+                             graph.cend(),
+                             &IsLayerOfType<armnn::InputLayer>,
+                             checkPadFoldedIntoConv2d,
+                             &IsLayerOfType<armnn::OutputLayer>));
 }
 
@@ -798,4 +798,294 @@ BOOST_AUTO_TEST_CASE(BackendHintTest)
     }
 }
 
+BOOST_AUTO_TEST_CASE(OptimizeForExclusiveConnections_fuse_Test)
+{
+    using namespace armnn;
+
+    // Define layers information
+    Convolution2dDescriptor convolution2dDescriptor;
+    convolution2dDescriptor.m_BiasEnabled = false;
+    convolution2dDescriptor.m_DataLayout  = DataLayout::NHWC;
+    BatchNormalizationDescriptor batchNormDescriptor;
+    batchNormDescriptor.m_DataLayout = DataLayout::NHWC;
+
+    const unsigned int inputDimensionSizes[]   = {1, 4, 4, 3};  // NHWCin
+    const unsigned int weightsDimensionSizes[] = {1, 2, 2, 3};  // CoutHWCin
+    const unsigned int outputDimensionSizes[]  = {1, 3, 3, 1};  // NHWCout
+    const unsigned int outputChannelSize[]     = {outputDimensionSizes[3]};  // Cout
+
+    TensorInfo inputInfo (4, inputDimensionSizes, DataType::Float32);
+    TensorInfo outputInfo(4, outputDimensionSizes, DataType::Float32);
+
+    std::vector<float> weightsVector = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12};
+    ConstTensor weights(TensorInfo(4, weightsDimensionSizes, DataType::Float32), weightsVector);
+
+    std::vector<float> betaVector     = {0.1f};
+    std::vector<float> gammaVector    = {0.5f};
+    std::vector<float> meanVector     = {0};
+    std::vector<float> varianceVector = {1};
+    ConstTensor beta    (TensorInfo(1, outputChannelSize, DataType::Float32), betaVector);
+    ConstTensor gamma   (TensorInfo(1, outputChannelSize, DataType::Float32), gammaVector);
+    ConstTensor mean    (TensorInfo(1, outputChannelSize, DataType::Float32), meanVector);
+    ConstTensor variance(TensorInfo(1, outputChannelSize, DataType::Float32), varianceVector);
+
+    // Define the network
+    Graph graph;
+    auto input     = graph.AddLayer<InputLayer>(0, "input");
+    auto conv      = graph.AddLayer<Convolution2dLayer>(convolution2dDescriptor, "convolution");
+    auto batchNorm = graph.AddLayer<BatchNormalizationLayer>(batchNormDescriptor, "batchNorm");
+    auto output    = graph.AddLayer<OutputLayer>(0, "output");
+
+    // Set layer information
+    input    ->GetOutputSlot().SetTensorInfo(inputInfo);
+    conv     ->GetOutputSlot().SetTensorInfo(outputInfo);
+    batchNorm->GetOutputSlot().SetTensorInfo(outputInfo);
+    conv     ->m_Weight   = std::make_unique<ScopedCpuTensorHandle>(weights);
+    batchNorm->m_Beta     = std::make_unique<ScopedCpuTensorHandle>(beta);
+    batchNorm->m_Gamma    = std::make_unique<ScopedCpuTensorHandle>(gamma);
+    batchNorm->m_Mean     = std::make_unique<ScopedCpuTensorHandle>(mean);
+    batchNorm->m_Variance = std::make_unique<ScopedCpuTensorHandle>(variance);
+    if (convolution2dDescriptor.m_BiasEnabled)
+    {
+        std::vector<float> biasVector = {11};
+        ConstTensor bias(TensorInfo(1, outputChannelSize, DataType::Float32), biasVector);
+        conv->m_Bias = std::make_unique<ScopedCpuTensorHandle>(bias);
+    }
+
+    // Connect layers
+    input    ->GetOutputSlot(0).Connect(conv     ->GetInputSlot(0));
+    conv     ->GetOutputSlot(0).Connect(batchNorm->GetInputSlot(0));
+    batchNorm->GetOutputSlot(0).Connect(output   ->GetInputSlot(0));
+
+    BOOST_CHECK(4 == graph.GetNumLayers());
+    BOOST_TEST(CheckSequence(graph.cbegin(),
+                             graph.cend(),
+                             &IsLayerOfType<armnn::InputLayer>,
+                             &IsLayerOfType<armnn::Convolution2dLayer>,
+                             &IsLayerOfType<armnn::BatchNormalizationLayer>,
+                             &IsLayerOfType<armnn::OutputLayer>));
+
+    // Optimize graph
+    armnn::Optimizer::Pass(graph, MakeOptimizations(FuseBatchNormIntoConvolution2D()));
+
+    auto checkFusedConv2d = [](const armnn::Layer* const layer) -> bool
+    {
+        return IsLayerOfType<armnn::Convolution2dLayer>(layer) &&
+               (layer->GetNameStr() == "fused-batchNorm-into-convolution");
+    };
+
+    BOOST_CHECK(3 == graph.GetNumLayers());
+    BOOST_TEST(CheckSequence(graph.cbegin(),
+                             graph.cend(),
+                             &IsLayerOfType<armnn::InputLayer>,
+                             checkFusedConv2d,
+                             &IsLayerOfType<armnn::OutputLayer>));
+}
+
+BOOST_AUTO_TEST_CASE(OptimizeForExclusiveConnections_notFuse_Test)
+{
+    // Define the network
+    Graph graph;
+    Convolution2dDescriptor convolution2dDescriptor;
+    BatchNormalizationDescriptor batchNormDescriptor;
+
+    auto input     = graph.AddLayer<InputLayer>(0, "input");
+    auto conv      = graph.AddLayer<Convolution2dLayer>(convolution2dDescriptor, "convolution");
+    auto batchNorm = graph.AddLayer<BatchNormalizationLayer>(batchNormDescriptor, "batchNorm");
+    auto output    = graph.AddLayer<OutputLayer>(0, "output");
+    auto output2   = graph.AddLayer<OutputLayer>(1, "output2");
+
+    // Connect layers
+    input    ->GetOutputSlot(0).Connect(conv     ->GetInputSlot(0));
+    conv     ->GetOutputSlot(0).Connect(batchNorm->GetInputSlot(0));
+    batchNorm->GetOutputSlot(0).Connect(output   ->GetInputSlot(0));
+    conv     ->GetOutputSlot(0).Connect(output2  ->GetInputSlot(0));
+
+    BOOST_CHECK(5 == graph.GetNumLayers());
+    BOOST_TEST(CheckSequence(graph.cbegin(),
+                             graph.cend(),
+                             &IsLayerOfType<armnn::InputLayer>,
+                             &IsLayerOfType<armnn::Convolution2dLayer>,
+                             &IsLayerOfType<armnn::BatchNormalizationLayer>,
+                             &IsLayerOfType<armnn::OutputLayer>,
+                             &IsLayerOfType<armnn::OutputLayer>));
+
+    // Optimize graph
+    armnn::Optimizer::Pass(graph, armnn::MakeOptimizations(FuseBatchNormIntoConvolution2D()));
+
+    // The convolution's output feeds both the batch norm and a second output layer, so the
+    // connection is not exclusive and no fusion takes place.
+    BOOST_CHECK(5 == graph.GetNumLayers());
+    BOOST_TEST(CheckSequence(graph.cbegin(),
+                             graph.cend(),
+                             &IsLayerOfType<armnn::InputLayer>,
+                             &IsLayerOfType<armnn::Convolution2dLayer>,
+                             &IsLayerOfType<armnn::BatchNormalizationLayer>,
+                             &IsLayerOfType<armnn::OutputLayer>,
+                             &IsLayerOfType<armnn::OutputLayer>));
+}
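+
+// The end-to-end case below builds the same Conv2d + BatchNorm model twice and runs both on the
+// reference backend: in the first network the Conv2d -> BatchNorm connection is exclusive and is
+// fused, while the second network adds a second output fed by the convolution, so it stays
+// unfused. The outputs of the two networks are then compared element by element.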
+BOOST_AUTO_TEST_CASE(Fuse_batchNorm_into_Conv2D_Float32_Test)
+{
+    using namespace armnn;
+
+    // Define layers information
+    Convolution2dDescriptor convolution2dDescriptor;
+    convolution2dDescriptor.m_BiasEnabled = false;
+    convolution2dDescriptor.m_DataLayout  = DataLayout::NHWC;
+    convolution2dDescriptor.m_StrideX     = 1;
+    convolution2dDescriptor.m_StrideY     = 1;
+    BatchNormalizationDescriptor batchNormDescriptor;
+    batchNormDescriptor.m_DataLayout = DataLayout::NHWC;
+
+    const unsigned int inputDimensionSizes[]   = {1, 4, 4, 3};  // NHWCin
+    const unsigned int weightsDimensionSizes[] = {4, 2, 2, 3};  // CoutHWCin
+    const unsigned int outputDimensionSizes[]  = {1, 3, 3, 4};  // NHWCout
+    const unsigned int outputChannelSize[]     = {outputDimensionSizes[3]};  // Cout
+
+    TensorInfo inputInfo (4, inputDimensionSizes, DataType::Float32);
+    TensorInfo outputInfo(4, outputDimensionSizes, DataType::Float32);
+
+    std::vector<float> weightsVector = { 1,  2,  3,  4,  5,  6,  7,  8,  9,  10,  11,  12,
+                                        11, 12, 13, 14, 15, 16, 17, 18, 19, 110, 111, 112,
+                                        21, 22, 23, 24, 25, 26, 27, 28, 29, 210, 211, 212,
+                                        31, 32, 33, 34, 35, 36, 37, 38, 39, 310, 311, 312};
+    TensorInfo  weightsInfo(4, weightsDimensionSizes, DataType::Float32);
+    ConstTensor weights(weightsInfo, weightsVector);
+    std::vector<float> biasVector = {3.3f, 3.2f, 3.1f, 3.0f};
+    TensorInfo  biasInfo(1, outputChannelSize, DataType::Float32);
+    ConstTensor bias(biasInfo, biasVector);
+    Optional<ConstTensor> optionalBias = Optional<ConstTensor>(bias);
+
+    std::vector<float> betaVector     = {0.0f, 0.2f, 0.3f, 0.4f};
+    std::vector<float> gammaVector    = {0.5f, 0.6f, 0.7f, 0.8f};
+    std::vector<float> meanVector     = {0.1f, 0.2f, 0.3f, 0.4f};
+    std::vector<float> varianceVector = {1.0f, 1.1f, 1.2f, 1.3f};
+    ConstTensor beta    (TensorInfo(1, outputChannelSize, DataType::Float32), betaVector);
+    ConstTensor gamma   (TensorInfo(1, outputChannelSize, DataType::Float32), gammaVector);
+    ConstTensor mean    (TensorInfo(1, outputChannelSize, DataType::Float32), meanVector);
+    ConstTensor variance(TensorInfo(1, outputChannelSize, DataType::Float32), varianceVector);
+
+    auto inputSize  = inputDimensionSizes[0] * inputDimensionSizes[1] *
+                      inputDimensionSizes[2] * inputDimensionSizes[3];
+    auto outputSize = outputDimensionSizes[0] * outputDimensionSizes[1] *
+                      outputDimensionSizes[2] * outputDimensionSizes[3];
+
+    // FIRST NETWORK: Fused
+
+    // Construct ArmNN network
+    NetworkId networkIdentifier;
+    INetworkPtr network = INetwork::Create();
+    IConnectableLayer* inputLayer     = network->AddInputLayer(0);
+    IConnectableLayer* convLayer      = network->AddConvolution2dLayer(convolution2dDescriptor,
+                                                                       weights,
+                                                                       optionalBias,
+                                                                       "convolution");
+    IConnectableLayer* batchNormLayer = network->AddBatchNormalizationLayer(batchNormDescriptor,
+                                                                            mean,
+                                                                            variance,
+                                                                            beta,
+                                                                            gamma,
+                                                                            "batchNorm");
+    IConnectableLayer* outputLayer    = network->AddOutputLayer(0);
+
+    inputLayer    ->GetOutputSlot(0).Connect(convLayer     ->GetInputSlot(0));
+    convLayer     ->GetOutputSlot(0).Connect(batchNormLayer->GetInputSlot(0));
+    batchNormLayer->GetOutputSlot(0).Connect(outputLayer   ->GetInputSlot(0));
+
+    // Create ArmNN runtime
+    IRuntime::CreationOptions options;  // default options
+    IRuntimePtr run = IRuntime::Create(options);
+
+    // Set the tensors in the network.
+    inputLayer    ->GetOutputSlot(0).SetTensorInfo(inputInfo);
+    convLayer     ->GetOutputSlot(0).SetTensorInfo(outputInfo);
+    batchNormLayer->GetOutputSlot(0).SetTensorInfo(outputInfo);
+
+    // Optimise ArmNN network
+    IOptimizedNetworkPtr optNet = Optimize(*network, {Compute::CpuRef}, run->GetDeviceSpec());
+    if (!optNet)
+    {
+        // This shouldn't happen for this simple sample, with the reference backend.
+        // But in general usage Optimize could fail if the hardware at runtime cannot
+        // support the model that has been provided.
+        std::cerr << "Error: Failed to optimise the input network." << std::endl;
+    }
+
+    // Load graph into runtime
+    run->LoadNetwork(networkIdentifier, std::move(optNet));
+
+    // Create structures for inputs and outputs.
+    std::vector<float> inputData(inputSize, 128);
+    std::vector<float> outputData(outputSize);
+
+    InputTensors  inputTensors {{0, ConstTensor(run->GetInputTensorInfo(networkIdentifier, 0), inputData.data())}};
+    OutputTensors outputTensors{{0, Tensor(run->GetOutputTensorInfo(networkIdentifier, 0), outputData.data())}};
+
+    // Execute network
+    run->EnqueueWorkload(networkIdentifier, inputTensors, outputTensors);
+
+    // SECOND NETWORK: NotFused
+
+    // Construct ArmNN network
+    NetworkId networkIdentifierNotFused;
+    INetworkPtr networkNotFused = INetwork::Create();
+    IConnectableLayer* inputLayerNotFused     = networkNotFused->AddInputLayer(0);
+    IConnectableLayer* convLayerNotFused      = networkNotFused->AddConvolution2dLayer(convolution2dDescriptor,
+                                                                                       weights,
+                                                                                       optionalBias,
+                                                                                       "convolution");
+    IConnectableLayer* batchNormLayerNotFused = networkNotFused->AddBatchNormalizationLayer(batchNormDescriptor,
+                                                                                            mean,
+                                                                                            variance,
+                                                                                            beta,
+                                                                                            gamma,
+                                                                                            "batchNorm");
+    IConnectableLayer* outputLayerNotFused    = networkNotFused->AddOutputLayer(0);
+    IConnectableLayer* output2LayerNotFused   = networkNotFused->AddOutputLayer(1);
+
+    inputLayerNotFused    ->GetOutputSlot(0).Connect(convLayerNotFused     ->GetInputSlot(0));
+    convLayerNotFused     ->GetOutputSlot(0).Connect(batchNormLayerNotFused->GetInputSlot(0));
+    batchNormLayerNotFused->GetOutputSlot(0).Connect(outputLayerNotFused   ->GetInputSlot(0));
+    convLayerNotFused     ->GetOutputSlot(0).Connect(output2LayerNotFused  ->GetInputSlot(0));
+
+    // Create ArmNN runtime
+    IRuntimePtr runNotFused = IRuntime::Create(options);
+
+    // Set the tensors in the network.
+    inputLayerNotFused    ->GetOutputSlot(0).SetTensorInfo(inputInfo);
+    convLayerNotFused     ->GetOutputSlot(0).SetTensorInfo(outputInfo);
+    batchNormLayerNotFused->GetOutputSlot(0).SetTensorInfo(outputInfo);
+
+    // Optimise ArmNN network
+    IOptimizedNetworkPtr optNetNotFused = Optimize(*networkNotFused, {Compute::CpuRef}, runNotFused->GetDeviceSpec());
+    if (!optNetNotFused)
+    {
+        // This shouldn't happen for this simple sample, with the reference backend.
+        // But in general usage Optimize could fail if the hardware at runtime cannot
+        // support the model that has been provided.
+        std::cerr << "Error: Failed to optimise the input network." << std::endl;
+    }
+
+    // Load graph into runtime
+    runNotFused->LoadNetwork(networkIdentifierNotFused, std::move(optNetNotFused));
+
+    // Create structures for inputs and outputs.
+    std::vector<float> inputDataNotFused(inputSize, 128);
+    std::vector<float> outputDataNotFused(outputSize);
+    std::vector<float> outputData2NotFused(outputSize);
+
+    InputTensors inputTensorsNotFused{
+        {0, ConstTensor(runNotFused->GetInputTensorInfo(networkIdentifierNotFused, 0), inputDataNotFused.data())}};
+    OutputTensors outputTensorsNotFused{
+        {0, Tensor(runNotFused->GetOutputTensorInfo(networkIdentifierNotFused, 0), outputDataNotFused.data())},
+        {1, Tensor(runNotFused->GetOutputTensorInfo(networkIdentifierNotFused, 1), outputData2NotFused.data())}};
+
+    // Execute network
+    runNotFused->EnqueueWorkload(networkIdentifierNotFused, inputTensorsNotFused, outputTensorsNotFused);
+
+    // Check that the output of the fused convolution matches the output of the batchNorm in the
+    // "NotFused" network (BOOST_CHECK_CLOSE takes a percentage tolerance, so the two outputs
+    // must agree to within 0.001%).
+    for (unsigned int n = 0; n < outputData.size(); ++n)
+    {
+        BOOST_CHECK_CLOSE(outputData[n], outputDataNotFused[n], 0.001);
+    }
+}
+
 BOOST_AUTO_TEST_SUITE_END()
-- 
cgit v1.2.1