From 07810fc2fcdd34db74222d90cc73ef12a88e7b78 Mon Sep 17 00:00:00 2001 From: Mike Kelly Date: Thu, 12 Nov 2020 10:58:48 +0000 Subject: IVGCVSW-5328-5329 Fuse Activation * Added Fused Activation Optimization to both CL and Neon backends. * Added Fused Activation support to all the CL and Neon workloads that support it. * Changed ProfilingTest network to be a Convolution layer followed by an Abs layer rather than an Activation layer. * Added IBackendInternal::OptimizeSubgraphView function that can accept a ModelOptions. * Network will now call OptimizeSubgraphView passing in the ModelOptions. Signed-off-by: Keith Davis Signed-off-by: Mike Kelly Signed-off-by: Teresa Charlin Change-Id: Ib536ac3cbafc7d9b35c139ad9a65b7735262cd9d --- Android.mk | 1 + CMakeLists.txt | 1 + include/armnn/backends/IBackendInternal.hpp | 3 + src/armnn/Network.cpp | 4 +- src/armnn/layers/FullyConnectedLayer.cpp | 1 - src/armnn/test/OptimizerTests.cpp | 24 +- .../test/optimizations/FuseActivationTests.cpp | 789 +++++++++++++++++++++ src/backends/aclCommon/ArmComputeSubgraphUtils.hpp | 145 ++++ src/backends/aclCommon/ArmComputeUtils.hpp | 40 +- src/backends/aclCommon/CMakeLists.txt | 1 + src/backends/backendsCommon/IBackendInternal.cpp | 7 + src/backends/backendsCommon/WorkloadData.hpp | 2 +- src/backends/cl/ClBackend.cpp | 263 ++++++- src/backends/cl/ClBackend.hpp | 3 +- src/backends/cl/ClLayerSupport.cpp | 27 +- src/backends/cl/workloads/ClAdditionWorkload.cpp | 15 +- src/backends/cl/workloads/ClAdditionWorkload.hpp | 3 +- .../ClBatchNormalizationFloatWorkload.cpp | 22 +- .../ClBatchNormalizationFloatWorkload.hpp | 3 +- .../cl/workloads/ClConvolution2dWorkload.cpp | 14 +- .../cl/workloads/ClConvolution2dWorkload.hpp | 3 +- .../workloads/ClDepthwiseConvolutionWorkload.cpp | 14 +- .../workloads/ClDepthwiseConvolutionWorkload.hpp | 3 +- .../cl/workloads/ClDivisionFloatWorkload.cpp | 19 +- .../cl/workloads/ClDivisionFloatWorkload.hpp | 3 +- .../cl/workloads/ClFullyConnectedWorkload.cpp | 13 +- .../cl/workloads/ClFullyConnectedWorkload.hpp | 3 +- .../cl/workloads/ClMultiplicationWorkload.cpp | 20 +- .../cl/workloads/ClMultiplicationWorkload.hpp | 3 +- .../cl/workloads/ClSubtractionWorkload.cpp | 16 +- .../cl/workloads/ClSubtractionWorkload.hpp | 3 +- src/backends/neon/NeonBackend.cpp | 246 ++++++- src/backends/neon/NeonLayerSupport.cpp | 27 +- .../neon/workloads/NeonAdditionWorkload.cpp | 15 +- .../neon/workloads/NeonAdditionWorkload.hpp | 4 +- .../workloads/NeonBatchNormalizationWorkload.cpp | 17 +- .../workloads/NeonBatchNormalizationWorkload.hpp | 3 +- .../neon/workloads/NeonConvolution2dWorkload.cpp | 15 +- .../neon/workloads/NeonConvolution2dWorkload.hpp | 3 +- .../workloads/NeonDepthwiseConvolutionWorkload.cpp | 25 +- .../workloads/NeonDepthwiseConvolutionWorkload.hpp | 4 +- .../neon/workloads/NeonDivisionWorkload.cpp | 20 +- .../neon/workloads/NeonDivisionWorkload.hpp | 5 +- .../neon/workloads/NeonFullyConnectedWorkload.cpp | 16 +- .../neon/workloads/NeonFullyConnectedWorkload.hpp | 3 +- .../neon/workloads/NeonMultiplicationWorkload.cpp | 16 +- .../neon/workloads/NeonMultiplicationWorkload.hpp | 4 +- .../neon/workloads/NeonSubtractionWorkload.cpp | 17 +- .../neon/workloads/NeonSubtractionWorkload.hpp | 4 +- src/profiling/test/ProfilingTestUtils.cpp | 128 ++-- 50 files changed, 1852 insertions(+), 188 deletions(-) create mode 100644 src/armnn/test/optimizations/FuseActivationTests.cpp create mode 100644 src/backends/aclCommon/ArmComputeSubgraphUtils.hpp diff --git a/Android.mk b/Android.mk index 
e8bf4b668e..d683c2312f 100644 --- a/Android.mk +++ b/Android.mk @@ -370,6 +370,7 @@ LOCAL_SRC_FILES := \ src/armnn/test/optimizations/ConvertConstantsHalfToFloatTests.cpp \ src/armnn/test/optimizations/Fp32NetworkToBf16ConverterTests.cpp \ src/armnn/test/optimizations/Fp32NetworkToFp16ConverterTests.cpp \ + src/armnn/test/optimizations/FuseActivationTests.cpp \ src/armnn/test/optimizations/InsertDebugLayerTests.cpp \ src/armnn/test/optimizations/MovePermuteUpTests.cpp \ src/armnn/test/optimizations/OptimizeConsecutiveReshapesTests.cpp \ diff --git a/CMakeLists.txt b/CMakeLists.txt index 240767f43b..30b03dce04 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -647,6 +647,7 @@ if(BUILD_UNIT_TESTS) src/armnn/test/optimizations/ConvertConstantsHalfToFloatTests.cpp src/armnn/test/optimizations/Fp32NetworkToBf16ConverterTests.cpp src/armnn/test/optimizations/Fp32NetworkToFp16ConverterTests.cpp + src/armnn/test/optimizations/FuseActivationTests.cpp src/armnn/test/optimizations/FuseBatchNormTests.cpp src/armnn/test/optimizations/InsertDebugLayerTests.cpp src/armnn/test/optimizations/MovePermuteUpTests.cpp diff --git a/include/armnn/backends/IBackendInternal.hpp b/include/armnn/backends/IBackendInternal.hpp index 5f1b413d83..c7ed8efa78 100644 --- a/include/armnn/backends/IBackendInternal.hpp +++ b/include/armnn/backends/IBackendInternal.hpp @@ -147,6 +147,9 @@ public: virtual OptimizationViews OptimizeSubgraphView(const SubgraphView& subgraph) const; + virtual OptimizationViews OptimizeSubgraphView(const SubgraphView& subgraph, + const ModelOptions& modelOptions) const; + bool SupportsTensorAllocatorAPI() const; ITensorHandleFactory::FactoryId GetBackwardCompatibleFavoriteHandleFactory(); diff --git a/src/armnn/Network.cpp b/src/armnn/Network.cpp index 5c55641c82..d41f2f6fa7 100644 --- a/src/armnn/Network.cpp +++ b/src/armnn/Network.cpp @@ -537,6 +537,7 @@ BackendsMap CreateSupportedBackends(TensorHandleFactoryRegistry& handleFactoryRe OptimizationResult ApplyBackendOptimizations(OptimizedNetwork* optNetObjPtr, BackendSettings& backendSettings, BackendsMap& backends, + const ModelOptions& modelOptions, Optional&> errMessages) { ARMNN_ASSERT(optNetObjPtr); @@ -572,7 +573,7 @@ OptimizationResult ApplyBackendOptimizations(OptimizedNetwork* optNetObjPtr, for (auto& subgraph : subgraphs) { // Try to optimize the current sub-graph - OptimizationViews optimizationViews = backendObjPtr->OptimizeSubgraphView(*subgraph); + OptimizationViews optimizationViews = backendObjPtr->OptimizeSubgraphView(*subgraph, modelOptions); ARMNN_ASSERT(optimizationViews.Validate(*subgraph)); // Optimization attempted, check the resulting optimized sub-graph @@ -1111,6 +1112,7 @@ IOptimizedNetworkPtr Optimize(const INetwork& inNetwork, OptimizationResult backendOptimizationResult = ApplyBackendOptimizations(optNetObjPtr, backendSettings, backends, + options.m_ModelOptions, messages); if (backendOptimizationResult.m_Error) { diff --git a/src/armnn/layers/FullyConnectedLayer.cpp b/src/armnn/layers/FullyConnectedLayer.cpp index 0dc138b761..ca7a0cc4bb 100644 --- a/src/armnn/layers/FullyConnectedLayer.cpp +++ b/src/armnn/layers/FullyConnectedLayer.cpp @@ -26,7 +26,6 @@ std::unique_ptr FullyConnectedLayer::CreateWorkload(const IWorkloadFa FullyConnectedQueueDescriptor descriptor; - SetAdditionalInfo(descriptor); descriptor.m_Weight = m_Weight.get(); if (m_Param.m_BiasEnabled) { diff --git a/src/armnn/test/OptimizerTests.cpp b/src/armnn/test/OptimizerTests.cpp index 0179589bf4..e7eab9d00d 100644 --- 
a/src/armnn/test/OptimizerTests.cpp +++ b/src/armnn/test/OptimizerTests.cpp @@ -810,10 +810,10 @@ BOOST_AUTO_TEST_CASE(OptimizeForExclusiveConnectionsFuseTest) std::vector weightsVector = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}; ConstTensor weights(TensorInfo(4, weightsDimensionSizes, DataType::Float32), weightsVector); - std::vector betaVector = {0.1f}; - std::vector gammaVector = {0.5f}; - std::vector meanVector = {0}; - std::vector varianceVector = {1}; + std::vector betaVector = { 0.1f }; + std::vector gammaVector = { 0.5f }; + std::vector meanVector = { 0 }; + std::vector varianceVector = { 1 }; ConstTensor beta(TensorInfo(1, outputChannelSize, DataType::Float32), betaVector); ConstTensor gamma(TensorInfo(1, outputChannelSize, DataType::Float32), gammaVector); ConstTensor mean(TensorInfo(1, outputChannelSize, DataType::Float32), meanVector); @@ -830,7 +830,7 @@ BOOST_AUTO_TEST_CASE(OptimizeForExclusiveConnectionsFuseTest) input->GetOutputSlot().SetTensorInfo(inputInfo); conv->GetOutputSlot().SetTensorInfo(outputInfo); batchNorm->GetOutputSlot().SetTensorInfo(outputInfo); - conv ->m_Weight = std::make_unique(weights); + conv->m_Weight = std::make_unique(weights); batchNorm->m_Beta = std::make_unique(beta); batchNorm->m_Gamma = std::make_unique(gamma); batchNorm->m_Mean = std::make_unique(mean); @@ -843,9 +843,9 @@ BOOST_AUTO_TEST_CASE(OptimizeForExclusiveConnectionsFuseTest) } // Connect layers - input ->GetOutputSlot(0).Connect(conv ->GetInputSlot(0)); - conv ->GetOutputSlot(0).Connect(batchNorm->GetInputSlot(0)); - batchNorm ->GetOutputSlot(0).Connect(output ->GetInputSlot(0)); + input->GetOutputSlot(0).Connect(conv->GetInputSlot(0)); + conv->GetOutputSlot(0).Connect(batchNorm->GetInputSlot(0)); + batchNorm->GetOutputSlot(0).Connect(output->GetInputSlot(0)); BOOST_CHECK(4 == graph.GetNumLayers()); BOOST_TEST(CheckSequence(graph.cbegin(), @@ -887,10 +887,10 @@ BOOST_AUTO_TEST_CASE(OptimizeForExclusiveConnectionsWithoutFuseTest) auto output2 = graph.AddLayer(1, "output2"); // Connect layers - input ->GetOutputSlot(0).Connect(conv ->GetInputSlot(0)); - conv ->GetOutputSlot(0).Connect(batchNorm->GetInputSlot(0)); - batchNorm ->GetOutputSlot(0).Connect(output ->GetInputSlot(0)); - conv ->GetOutputSlot(0).Connect(output2 ->GetInputSlot(0)); + input->GetOutputSlot(0).Connect(conv->GetInputSlot(0)); + conv->GetOutputSlot(0).Connect(batchNorm->GetInputSlot(0)); + batchNorm->GetOutputSlot(0).Connect(output->GetInputSlot(0)); + conv->GetOutputSlot(0).Connect(output2->GetInputSlot(0)); BOOST_CHECK(5 == graph.GetNumLayers()); BOOST_TEST(CheckSequence(graph.cbegin(), diff --git a/src/armnn/test/optimizations/FuseActivationTests.cpp b/src/armnn/test/optimizations/FuseActivationTests.cpp new file mode 100644 index 0000000000..0e855977a0 --- /dev/null +++ b/src/armnn/test/optimizations/FuseActivationTests.cpp @@ -0,0 +1,789 @@ +// +// Copyright © 2020 Arm Ltd and Contributors. All rights reserved. 
+// SPDX-License-Identifier: MIT +// + +#include "LayersFwd.hpp" + +#include +#include +#include +#include + +#include + +#include +#include + +using namespace armnn; + +BOOST_AUTO_TEST_SUITE(Optimizer) + +namespace +{ +const float g_qScale = 1.0f; +const int32_t g_qOffset = 0; + +template +std::vector GetVector(unsigned int size, float initial, float increment) +{ + std::vector typeVector(size, initial); + std::vector vector(size); + + if (size > 1) + { + for (unsigned int i = 0; i < size; ++i) + { + vector[i] = T(initial + (increment * static_cast(i))); + } + } + return vector; +} + +template> +struct Convolution2dTest +{ + using LayerType = armnn::Convolution2dLayer; + static std::string GetReceiverLayerName() { return "Convolution2d"; }; + static const bool isElementWise = false; + + static TensorShape GetInputShape() { return TensorShape( {1, 4, 4, 3}); } // NHWCin + static TensorShape GetOutputShape() { return TensorShape( {1, 3, 3, 4}); } // NHWCout + static TensorShape GetWeightsShape() { return TensorShape( {4, 2, 2, 3}); } // CoutHWCin + + constexpr static const unsigned int inputSize = 48; // batchIn * heightIn * widthIn * channelIn + constexpr static const unsigned int outputSize = 36; // batchOut * heightOut * widthOut * channelOut + + static IConnectableLayer* AddReceiverLayer(INetwork* network, + const char* name) + { + Convolution2dDescriptor descriptor; + descriptor.m_BiasEnabled = false; + descriptor.m_DataLayout = DataLayout::NHWC; + descriptor.m_StrideX = 1; + descriptor.m_StrideY = 1; + + std::vector weightsData = { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, + 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, + 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, + 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42}; + std::vector weightsVector = armnnUtils::QuantizedVector(weightsData, g_qScale, g_qOffset); + TensorInfo weightsInfo(GetWeightsShape(), ArmnnType, g_qScale, g_qOffset); + ConstTensor weights(weightsInfo, weightsVector); + Optional optionalBias; + + return network->AddConvolution2dLayer(descriptor, weights, optionalBias, name); + } +}; + +template> +struct DepthwiseConvolution2dTest +{ +public: + using LayerType = armnn::DepthwiseConvolution2dLayer; + static std::string GetReceiverLayerName() { return "DepthwiseConvolution2d"; }; + static const bool isElementWise = false; + + static TensorShape GetInputShape() { return TensorShape( {1, 4, 4, 3}); } // NHWCin + static TensorShape GetOutputShape() { return TensorShape( {1, 3, 3, 12}); } // NHWCout + static TensorShape GetWeightsShape() { return TensorShape( {4, 3, 2, 2}); } // MCinHW + + constexpr static const unsigned int inputSize = 48; //batchIn * heightIn * widthIn * channelIn; + constexpr static const unsigned int outputSize = 108; //batchOut * heightOut * widthOut * channelOut; + + static IConnectableLayer* AddReceiverLayer(INetwork* network, + const char* name) + { + DepthwiseConvolution2dDescriptor descriptor; + descriptor.m_BiasEnabled = false; + descriptor.m_DataLayout = DataLayout::NHWC; + descriptor.m_StrideX = 1; + descriptor.m_StrideY = 1; + + std::vector weightsData = { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, + 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, + 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, + 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42}; + std::vector weightsVector = armnnUtils::QuantizedVector(weightsData, g_qScale, g_qOffset); + TensorInfo weightsInfo(GetWeightsShape(), ArmnnType, g_qScale, g_qOffset); + ConstTensor weights(weightsInfo, weightsVector); + Optional optionalBias; + + return 
network->AddDepthwiseConvolution2dLayer(descriptor, weights, optionalBias, name); + } +}; + +template> +struct FullyConnectedTest +{ +public: + using LayerType = armnn::FullyConnectedLayer; + static std::string GetReceiverLayerName() { return "FullyConnected"; }; + static const bool isElementWise = false; + + static TensorShape GetInputShape() { return TensorShape( {2, 5, 1, 1}); } // NCinHW + static TensorShape GetOutputShape() { return TensorShape( {2, 3}); } // NCout + static TensorShape GetWeightsShape() { return TensorShape( {5, 3}); } // CinCout + + constexpr static const unsigned int inputSize = 10; // batchIn * heightIn * widthIn * channelIn + constexpr static const unsigned int outputSize = 6; // batchOut * heightOut * widthOut * channelOut + + static IConnectableLayer* AddReceiverLayer(INetwork* network, + const char* name) + { + FullyConnectedDescriptor descriptor; + descriptor.m_BiasEnabled = false; + + std::vector weightsData = { 1, 2, 3, 4, 5, + 6, 7, 8, 9, 10, + 11, 12, 13, 14, 15}; + std::vector weightsVector = armnnUtils::QuantizedVector(weightsData, g_qScale, g_qOffset); + TensorInfo weightsInfo(GetWeightsShape(), ArmnnType, g_qScale, g_qOffset); + ConstTensor weights(weightsInfo, weightsVector); + Optional optionalBias; + + return network->AddFullyConnectedLayer(descriptor, weights, optionalBias, name); + } +}; + +template> +struct BatchNormTest +{ +public: + using LayerType = armnn::BatchNormalizationLayer; + static std::string GetReceiverLayerName() { return "BatchNorm"; }; + static const bool isElementWise = false; + + static TensorShape GetInputShape() { return TensorShape( {1, 4, 4, 3}); } // NHWCin + static TensorShape GetOutputShape() { return TensorShape( {1, 4, 4, 3}); } // NHWCout + + constexpr static const unsigned int inputSize = 48; // batchIn * heightIn * widthIn * channelIn + constexpr static const unsigned int outputSize = 48; // batchOut * heightOut * widthOut * channelOut + + static IConnectableLayer* AddReceiverLayer(INetwork* network, + const char* name) + { + BatchNormalizationDescriptor descriptor; + descriptor.m_DataLayout = DataLayout::NHWC; + + std::vector betaVector = GetVector(GetOutputShape()[3], 0.0f, 0.2f); + std::vector gammaVector = GetVector(GetOutputShape()[3], 0.5f, 0.1f); + std::vector meanVector = GetVector(GetOutputShape()[3], 0.1f, 0.1f); + std::vector varianceVector = GetVector(GetOutputShape()[3], 1.0f, 0.1f); + + const unsigned int outputChannelSize[] = { GetOutputShape()[3] }; + ConstTensor beta(TensorInfo(1, outputChannelSize, ArmnnType), betaVector); + ConstTensor gamma(TensorInfo(1, outputChannelSize, ArmnnType), gammaVector); + ConstTensor mean(TensorInfo(1, outputChannelSize, ArmnnType), meanVector); + ConstTensor variance(TensorInfo(1, outputChannelSize, ArmnnType), varianceVector); + + return network->AddBatchNormalizationLayer(descriptor, mean, variance, beta, gamma, name); + } +}; + +template> +struct MultiplicationTest +{ + using LayerType = armnn::MultiplicationLayer; + static std::string GetReceiverLayerName() { return "Multiplication"; }; + static const bool isElementWise = true; + + static TensorShape GetInputShape() { return TensorShape( {1, 4, 4, 3}); } // NHWCin + static TensorShape GetOutputShape() { return TensorShape( {1, 4, 4, 3}); } // NHWCout + + constexpr static const unsigned int inputSize = 48; // batchIn * heightIn * widthIn * channelIn + constexpr static const unsigned int outputSize = 48; // batchOut * heightOut * widthOut * channelOut + + static IConnectableLayer* AddReceiverLayer(INetwork* network, 
+ const char* name) + { + return network->AddMultiplicationLayer(name); + } +}; + +template> +struct AdditionTest +{ + using LayerType = armnn::AdditionLayer; + static std::string GetReceiverLayerName() { return "Addition"; }; + static const bool isElementWise = true; + + static TensorShape GetInputShape() { return TensorShape( {1, 4, 4, 3}); } // NHWCin + static TensorShape GetOutputShape() { return TensorShape( {1, 4, 4, 3}); } // NHWCout + + constexpr static const unsigned int inputSize = 48; // batchIn * heightIn * widthIn * channelIn + constexpr static const unsigned int outputSize = 48; // batchOut * heightOut * widthOut * channelOut + + static IConnectableLayer* AddReceiverLayer(INetwork* network, + const char* name) + { + return network->AddAdditionLayer(name); + } +}; + +template> +struct SubtractionTest +{ + using LayerType = armnn::SubtractionLayer; + static std::string GetReceiverLayerName() { return "Subtraction"; }; + static const bool isElementWise = true; + + static TensorShape GetInputShape() { return TensorShape( {1, 4, 4, 3}); } // NHWCin + static TensorShape GetOutputShape() { return TensorShape( {1, 4, 4, 3}); } // NHWCout + + constexpr static const unsigned int inputSize = 48; // batchIn * heightIn * widthIn * channelIn + constexpr static const unsigned int outputSize = 48; // batchOut * heightOut * widthOut * channelOut + + static IConnectableLayer* AddReceiverLayer(INetwork* network, + const char* name) + { + return network->AddSubtractionLayer(name); + } +}; + +template> +struct DivisionTest +{ + using LayerType = armnn::DivisionLayer; + static std::string GetReceiverLayerName() { return "Division"; }; + static const bool isElementWise = true; + + static TensorShape GetInputShape() { return TensorShape( {1, 4, 4, 3}); } // NHWCin + static TensorShape GetOutputShape() { return TensorShape( {1, 4, 4, 3}); } // NHWCout + + constexpr static const unsigned int inputSize = 48; // batchIn * heightIn * widthIn * channelIn + constexpr static const unsigned int outputSize = 48; // batchOut * heightOut * widthOut * channelOut + + static IConnectableLayer* AddReceiverLayer(INetwork* network, + const char* name) + { + return network->AddDivisionLayer(name); + } +}; + +} // namespace + +template +INetworkPtr CreatNetwork(ActivationDescriptor activationDescriptor, bool preventFusing) +{ + // Create a network + INetworkPtr network = INetwork::Create(); + + IConnectableLayer* inputLayer = network->AddInputLayer(0); + + IConnectableLayer* receiverLayer = LayerTest::AddReceiverLayer(network.get(), + "receiverLayer"); + + IConnectableLayer* activationLayer = network->AddActivationLayer(activationDescriptor, + "activation"); + + IConnectableLayer* outputLayer = network->AddOutputLayer(0); + IConnectableLayer* output2Layer = preventFusing?network->AddOutputLayer(1):nullptr; + + // Define layers information + TensorInfo inputInfo(LayerTest::GetInputShape(), ArmnnType, g_qScale, g_qOffset); + TensorInfo outputInfo(LayerTest::GetOutputShape(), ArmnnType, g_qScale, g_qOffset); + + // Set layer information + inputLayer->GetOutputSlot(0).SetTensorInfo(inputInfo); + receiverLayer->GetOutputSlot(0).SetTensorInfo(outputInfo); + activationLayer->GetOutputSlot(0).SetTensorInfo(outputInfo); + + // Connect layers + inputLayer->GetOutputSlot(0).Connect(receiverLayer->GetInputSlot(0)); + receiverLayer->GetOutputSlot(0).Connect(activationLayer->GetInputSlot(0)); + activationLayer->GetOutputSlot(0).Connect(outputLayer->GetInputSlot(0)); + + if (LayerTest::isElementWise) + { + 
inputLayer->GetOutputSlot(0).Connect(receiverLayer->GetInputSlot(1)); + } + if (preventFusing) + { + receiverLayer->GetOutputSlot(0).Connect(output2Layer->GetInputSlot(0)); + } + + return network; +} + +template> +void FuseActivationIntoPreviousLayerTest(ActivationDescriptor activationDescriptor, float tolerance, armnn::Compute +backendId) +{ + // FIRST NETWORK: Fused + // Construct ArmNN network + INetworkPtr networkFused = CreatNetwork(activationDescriptor, false); + + // Create ArmNN runtime + IRuntimePtr run = IRuntime::Create(IRuntime::CreationOptions()); // default options + + // Optimise ArmNN network + IOptimizedNetworkPtr optNetFused = Optimize(*networkFused, {backendId}, run->GetDeviceSpec()); + + Graph graphFused = PolymorphicDowncast(optNetFused.get())->GetGraph(); + + auto checkFusedConv2d = [](const armnn::Layer* const layer)->bool { + return IsLayerOfType(layer) && + (layer->GetNameStr() == "fused-activation-into-receiverLayer"); + }; + + BOOST_CHECK_MESSAGE(3 == graphFused.GetNumLayers(), LayerTest::GetReceiverLayerName()); + BOOST_TEST(CheckSequence(graphFused.cbegin(), + graphFused.cend(), + &IsLayerOfType, + checkFusedConv2d, + &IsLayerOfType)); + + // Load network into runtime + NetworkId networkIdentifier; + BOOST_TEST(run->LoadNetwork(networkIdentifier, std::move(optNetFused)) == Status::Success); + + //Creates structures for inputs and outputs. + std::vector data = GetVector(LayerTest::inputSize, 1.0f, 0.1f); + std::vector inputDataFused = armnnUtils::QuantizedVector(data, g_qScale, g_qOffset); + std::vector outputDataFused(LayerTest::outputSize); + + InputTensors inputTensorsFused{ + {0, ConstTensor(run->GetInputTensorInfo(networkIdentifier, 0), inputDataFused.data())}}; + OutputTensors outputTensorsFused{ + {0, Tensor(run->GetOutputTensorInfo(networkIdentifier, 0), outputDataFused.data())}}; + + // Execute network + run->EnqueueWorkload(networkIdentifier, inputTensorsFused, outputTensorsFused); + + // SECOND NETWORK: NotFused + // Construct ArmNN network + INetworkPtr networkNotFused = CreatNetwork(activationDescriptor, true); + + // Create ArmNN runtime + IRuntimePtr runNotFused = IRuntime::Create(IRuntime::CreationOptions()); // default options + + // Optimise ArmNN network + IOptimizedNetworkPtr optNetNotFused = Optimize(*networkNotFused, {backendId}, runNotFused->GetDeviceSpec()); + + Graph graphNotFused = PolymorphicDowncast(optNetNotFused.get())->GetGraph(); + + BOOST_CHECK(5 == graphNotFused.GetNumLayers()); + BOOST_TEST(CheckSequence(graphNotFused.cbegin(), + graphNotFused.cend(), + &IsLayerOfType, + &IsLayerOfType, + &IsLayerOfType, + &IsLayerOfType, + &IsLayerOfType)); + + // Load network into runtime + NetworkId networkIdentifierNotFused; + BOOST_TEST(runNotFused->LoadNetwork(networkIdentifierNotFused, std::move(optNetNotFused)) == Status::Success); + + //Creates structures for inputs and outputs. 
+ std::vector inputDataNotFused = armnnUtils::QuantizedVector(data, g_qScale, g_qOffset); + std::vector outputDataNotFused(LayerTest::outputSize); + std::vector outputData2NotFused(LayerTest::outputSize); + + InputTensors inputTensorsNotFused{ + {0, ConstTensor(runNotFused->GetInputTensorInfo(networkIdentifierNotFused, 0), inputDataNotFused.data())}}; + OutputTensors outputTensorsNotFused{ + {0, Tensor(runNotFused->GetOutputTensorInfo(networkIdentifierNotFused, 0), outputDataNotFused.data())}, + {1, Tensor(runNotFused->GetOutputTensorInfo(networkIdentifierNotFused, 1), outputData2NotFused.data())}}; + + // Execute network + runNotFused->EnqueueWorkload(networkIdentifierNotFused, inputTensorsNotFused, outputTensorsNotFused); + + // Check the output of the fused-activation matches with the output of the activation in the "NotFused" network + for (unsigned int n = 0; n < outputDataFused.size(); ++n) + { + BOOST_CHECK_CLOSE(static_cast(outputDataFused[n]), static_cast(outputDataNotFused[n]), + T(tolerance)); + } +} + +#if defined(ARMCOMPUTENEON_ENABLED) +// ReLu fused into Receiver Layers Float32 +BOOST_AUTO_TEST_CASE(FuseReLUIntoConvFloat32CpuAccTest) +{ + ActivationDescriptor activationDescriptor; + activationDescriptor.m_Function = ActivationFunction::ReLu; + + FuseActivationIntoPreviousLayerTest, DataType::Float32> + (activationDescriptor, 0.0001f, armnn::Compute::CpuAcc); +} +BOOST_AUTO_TEST_CASE(FuseReLUIntoDWConvFloat32CpuAccTest) +{ + ActivationDescriptor activationDescriptor; + activationDescriptor.m_Function = ActivationFunction::ReLu; + + FuseActivationIntoPreviousLayerTest, DataType::Float32> + (activationDescriptor, 0.0001f, armnn::Compute::CpuAcc); +} +BOOST_AUTO_TEST_CASE(FuseReLUIntoFullyConnectedFloat32CpuAccTest) +{ + ActivationDescriptor activationDescriptor; + activationDescriptor.m_Function = ActivationFunction::ReLu; + + FuseActivationIntoPreviousLayerTest, DataType::Float32> + (activationDescriptor, 0.0001f, armnn::Compute::CpuAcc); +} +BOOST_AUTO_TEST_CASE(FuseReLUIntoBatchNormFloat32CpuAccTest) +{ + ActivationDescriptor activationDescriptor; + activationDescriptor.m_Function = ActivationFunction::ReLu; + + FuseActivationIntoPreviousLayerTest, DataType::Float32> + (activationDescriptor, 0.0001f, armnn::Compute::CpuAcc); +} + +// BoundedReLu fused into Receiver Layers Float32 +BOOST_AUTO_TEST_CASE(FuseBoundedReLUIntoConvFloat32CpuAccTest) +{ + ActivationDescriptor activationDescriptor; + activationDescriptor.m_Function = ActivationFunction::BoundedReLu; + activationDescriptor.m_A = 1.0f; + activationDescriptor.m_B = -1.0f; + + FuseActivationIntoPreviousLayerTest, DataType::Float32> + (activationDescriptor, 0.0001f, armnn::Compute::CpuAcc); +} +BOOST_AUTO_TEST_CASE(FuseBoundedReLUIntoDWConvFloat32CpuAccTest) +{ + ActivationDescriptor activationDescriptor; + activationDescriptor.m_Function = ActivationFunction::BoundedReLu; + activationDescriptor.m_A = 1.0f; + activationDescriptor.m_B = -1.0f; + + FuseActivationIntoPreviousLayerTest < DepthwiseConvolution2dTest < DataType::Float32 > , DataType::Float32 > + (activationDescriptor, 0.0001f, armnn::Compute::CpuAcc); +} +BOOST_AUTO_TEST_CASE(FuseBoundedReLUIntoFullyConnectedFloat32CpuAccTest) +{ + ActivationDescriptor activationDescriptor; + activationDescriptor.m_Function = ActivationFunction::BoundedReLu; + activationDescriptor.m_A = 1.0f; + activationDescriptor.m_B = -1.0f; + + FuseActivationIntoPreviousLayerTest, DataType::Float32> + (activationDescriptor, 0.0001f, armnn::Compute::CpuAcc); +} 
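// Note (reference only, not part of the patch): for BoundedReLu, ArmNN's
// ActivationDescriptor uses m_A as the upper bound and m_B as the lower
// bound, i.e. out = std::min(m_A, std::max(m_B, x)). The m_A = 1.0f,
// m_B = -1.0f cases above and below therefore clamp the receiver's output
// into [-1, 1].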
+BOOST_AUTO_TEST_CASE(FuseBoundedReLUIntoBatchNormFloat32CpuAccTest) +{ + ActivationDescriptor activationDescriptor; + activationDescriptor.m_Function = ActivationFunction::BoundedReLu; + activationDescriptor.m_A = 1.0f; + activationDescriptor.m_B = -1.0f; + + FuseActivationIntoPreviousLayerTest, DataType::Float32> + (activationDescriptor, 0.0001f, armnn::Compute::CpuAcc); +} + +// ReLU fused into Receiver Layers QAsymmU8 +BOOST_AUTO_TEST_CASE(FuseReLUIntoConvQAsymmU8CpuAccTest) +{ + ActivationDescriptor activationDescriptor; + activationDescriptor.m_Function = ActivationFunction::ReLu; + + FuseActivationIntoPreviousLayerTest, DataType::QAsymmU8> + (activationDescriptor, 0.0001f, armnn::Compute::CpuAcc); +} +BOOST_AUTO_TEST_CASE(FuseReLUIntoDWConvQAsymmU8CpuAccTest) +{ + ActivationDescriptor activationDescriptor; + activationDescriptor.m_Function = ActivationFunction::ReLu; + + FuseActivationIntoPreviousLayerTest, DataType::QAsymmU8> + (activationDescriptor, 0.0001f, armnn::Compute::CpuAcc); +} +BOOST_AUTO_TEST_CASE(FuseReLUIntoFullyConnectedQAsymmU8CpuAccTest) +{ + ActivationDescriptor activationDescriptor; + activationDescriptor.m_Function = ActivationFunction::ReLu; + + FuseActivationIntoPreviousLayerTest, DataType::QAsymmU8> + (activationDescriptor, 0.0001f, armnn::Compute::CpuAcc); +} + +// HardSwish fused into Receiver Layers Float32 +BOOST_AUTO_TEST_CASE(FuseHardSwishIntoConvFloat32CpuAccTest) +{ + ActivationDescriptor activationDescriptor; + activationDescriptor.m_Function = ActivationFunction::HardSwish; + + FuseActivationIntoPreviousLayerTest, DataType::Float32> + (activationDescriptor, 0.0001f, armnn::Compute::CpuAcc); +} + +// TanH fused into Receiver Layers Float32 +BOOST_AUTO_TEST_CASE(FuseTanHIntoConvFloat32CpuAccTest) +{ + ActivationDescriptor activationDescriptor; + activationDescriptor.m_Function = ActivationFunction::TanH; + + FuseActivationIntoPreviousLayerTest, DataType::Float32> + (activationDescriptor, 0.0001f, armnn::Compute::CpuAcc); +} +#endif + +#if defined(ARMCOMPUTECL_ENABLED) +// ReLu fused into Receiver Layers Float32 +BOOST_AUTO_TEST_CASE(FuseReLUIntoConvFloat32GpuAccTest) +{ + ActivationDescriptor activationDescriptor; + activationDescriptor.m_Function = ActivationFunction::ReLu; + + FuseActivationIntoPreviousLayerTest, DataType::Float32> + (activationDescriptor, 0.0001f, armnn::Compute::GpuAcc); +} +BOOST_AUTO_TEST_CASE(FuseReLUIntoDWConvFloat32GpuAccTest) +{ + ActivationDescriptor activationDescriptor; + activationDescriptor.m_Function = ActivationFunction::ReLu; + + FuseActivationIntoPreviousLayerTest, DataType::Float32> + (activationDescriptor, 0.0001f, armnn::Compute::GpuAcc); +} +BOOST_AUTO_TEST_CASE(FuseReLUIntoFullyConnectedFloat32GpuAccTest) +{ + ActivationDescriptor activationDescriptor; + activationDescriptor.m_Function = ActivationFunction::ReLu; + + FuseActivationIntoPreviousLayerTest, DataType::Float32> + (activationDescriptor, 0.0001f, armnn::Compute::GpuAcc); +} +BOOST_AUTO_TEST_CASE(FuseReLUIntoBatchNormFloat32GpuAccTest) +{ + ActivationDescriptor activationDescriptor; + activationDescriptor.m_Function = ActivationFunction::ReLu; + + FuseActivationIntoPreviousLayerTest, DataType::Float32> + (activationDescriptor, 0.0001f, armnn::Compute::GpuAcc); +} +BOOST_AUTO_TEST_CASE(FuseReLUIntoMulFloat32GpuAccTest) +{ + ActivationDescriptor activationDescriptor; + activationDescriptor.m_Function = ActivationFunction::ReLu; + + FuseActivationIntoPreviousLayerTest, DataType::Float32> + (activationDescriptor, 0.0001f, armnn::Compute::GpuAcc); +} 
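// Note (schematic only, not part of the patch): for the element-wise
// receivers exercised in the GpuAcc cases, CreatNetwork() connects the
// single input layer to both input slots of the receiver, so the graph
// under test is
//
//     Input ──┬──> Multiplication ──> Activation(ReLu) ──> Output
//             └────────^
//
// and the backend's OptimizeSubgraphView() substitutes a single receiver
// layer that carries the ActivationDescriptor as additional info:
//
//     Input ──┬──> Multiplication{ActivationDescriptor} ──> Output
//             └────────^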
+BOOST_AUTO_TEST_CASE(FuseReLUIntoAddFloat32GpuAccTest) +{ + ActivationDescriptor activationDescriptor; + activationDescriptor.m_Function = ActivationFunction::ReLu; + + FuseActivationIntoPreviousLayerTest, DataType::Float32> + (activationDescriptor, 0.0001f, armnn::Compute::GpuAcc); +} +BOOST_AUTO_TEST_CASE(FuseReLUIntoSubFloat32GpuAccTest) +{ + ActivationDescriptor activationDescriptor; + activationDescriptor.m_Function = ActivationFunction::ReLu; + + FuseActivationIntoPreviousLayerTest, DataType::Float32> + (activationDescriptor, 0.0001f, armnn::Compute::GpuAcc); +} +BOOST_AUTO_TEST_CASE(FuseReLUIntoDivFloat32GpuAccTest) +{ + ActivationDescriptor activationDescriptor; + activationDescriptor.m_Function = ActivationFunction::ReLu; + + FuseActivationIntoPreviousLayerTest, DataType::Float32> + (activationDescriptor, 0.0001f, armnn::Compute::GpuAcc); +} + +// BoundedReLu fused into Receiver Layers Float32 +BOOST_AUTO_TEST_CASE(FuseBoundedReLUIntoConvFloat32GpuAccTest) +{ + ActivationDescriptor activationDescriptor; + activationDescriptor.m_Function = ActivationFunction::BoundedReLu; + activationDescriptor.m_A = 1.0f; + activationDescriptor.m_B = -1.0f; + + FuseActivationIntoPreviousLayerTest, DataType::Float32> + (activationDescriptor, 0.0001f, armnn::Compute::GpuAcc); +} +BOOST_AUTO_TEST_CASE(FuseBoundedReLUIntoDWConvFloat32GpuAccTest) +{ + ActivationDescriptor activationDescriptor; + activationDescriptor.m_Function = ActivationFunction::BoundedReLu; + activationDescriptor.m_A = 1.0f; + activationDescriptor.m_B = -1.0f; + + FuseActivationIntoPreviousLayerTest, DataType::Float32> + (activationDescriptor, 0.0001f, armnn::Compute::GpuAcc); +} +BOOST_AUTO_TEST_CASE(FuseBoundedReLUIntoFullyConnectedFloat32GpuAccTest) +{ + ActivationDescriptor activationDescriptor; + activationDescriptor.m_Function = ActivationFunction::BoundedReLu; + activationDescriptor.m_A = 1.0f; + activationDescriptor.m_B = -1.0f; + + FuseActivationIntoPreviousLayerTest, DataType::Float32> + (activationDescriptor, 0.0001f, armnn::Compute::GpuAcc); +} +BOOST_AUTO_TEST_CASE(FuseBoundedReLUIntoBatchNormFloat32GpuAccTest) +{ + ActivationDescriptor activationDescriptor; + activationDescriptor.m_Function = ActivationFunction::BoundedReLu; + activationDescriptor.m_A = 1.0f; + activationDescriptor.m_B = -1.0f; + + FuseActivationIntoPreviousLayerTest, DataType::Float32> + (activationDescriptor, 0.0001f, armnn::Compute::GpuAcc); +} +BOOST_AUTO_TEST_CASE(FuseBoundedReLUIntoMulFloat32GpuAccTest) +{ + ActivationDescriptor activationDescriptor; + activationDescriptor.m_Function = ActivationFunction::BoundedReLu; + activationDescriptor.m_A = 1.0f; + activationDescriptor.m_B = -1.0f; + + FuseActivationIntoPreviousLayerTest, DataType::Float32> + (activationDescriptor, 0.0001f, armnn::Compute::GpuAcc); +} +BOOST_AUTO_TEST_CASE(FuseBoundedReLUIntoAddFloat32GpuAccTest) +{ + ActivationDescriptor activationDescriptor; + activationDescriptor.m_Function = ActivationFunction::BoundedReLu; + activationDescriptor.m_A = 1.0f; + activationDescriptor.m_B = -1.0f; + + FuseActivationIntoPreviousLayerTest, DataType::Float32> + (activationDescriptor, 0.0001f, armnn::Compute::GpuAcc); +} +BOOST_AUTO_TEST_CASE(FuseBoundedReLUIntoSubFloat32GpuAccTest) +{ + ActivationDescriptor activationDescriptor; + activationDescriptor.m_Function = ActivationFunction::BoundedReLu; + activationDescriptor.m_A = 1.0f; + activationDescriptor.m_B = -1.0f; + + FuseActivationIntoPreviousLayerTest, DataType::Float32> + (activationDescriptor, 0.0001f, armnn::Compute::GpuAcc); +} 
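// Note (reference only, not part of the patch): the HardSwish cases further
// below use the conventional definition f(x) = x * relu6(x + 3) / 6, e.g.
//
//     inline float HardSwishRef(float x)   // hypothetical reference helper
//     {
//         return x * std::min(std::max(x + 3.0f, 0.0f), 6.0f) / 6.0f;
//     }
//
// Also, BOOST_CHECK_CLOSE treats its tolerance argument as a percentage, so
// the 0.0001f passed to these tests requires the fused and unfused outputs
// to agree to within 0.0001%.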
+BOOST_AUTO_TEST_CASE(FuseBoundedReLUIntoDivFloat32GpuAccTest) +{ + ActivationDescriptor activationDescriptor; + activationDescriptor.m_Function = ActivationFunction::BoundedReLu; + activationDescriptor.m_A = 1.0f; + activationDescriptor.m_B = -1.0f; + + FuseActivationIntoPreviousLayerTest, DataType::Float32> + (activationDescriptor, 0.0001f, armnn::Compute::GpuAcc); +} + +// ReLU fused into Receiver Layers QAsymmU8 +BOOST_AUTO_TEST_CASE(FuseReLUQIntoConvAsymmU8GpuAccTest) +{ + ActivationDescriptor activationDescriptor; + activationDescriptor.m_Function = ActivationFunction::ReLu; + + FuseActivationIntoPreviousLayerTest, DataType::QAsymmU8> + (activationDescriptor, 0.0001f, armnn::Compute::GpuAcc); +} +BOOST_AUTO_TEST_CASE(FuseReLUQIntoDWConvAsymmU8GpuAccTest) +{ + ActivationDescriptor activationDescriptor; + activationDescriptor.m_Function = ActivationFunction::ReLu; + + FuseActivationIntoPreviousLayerTest, DataType::QAsymmU8> + (activationDescriptor, 0.0001f, armnn::Compute::GpuAcc); +} +BOOST_AUTO_TEST_CASE(FuseReLUQIntoFullyConnectedAsymmU8GpuAccTest) +{ + ActivationDescriptor activationDescriptor; + activationDescriptor.m_Function = ActivationFunction::ReLu; + + FuseActivationIntoPreviousLayerTest, DataType::QAsymmU8> + (activationDescriptor, 0.0001f, armnn::Compute::GpuAcc); +} + +// HardSwish fused into Receiver Layers Float32 +BOOST_AUTO_TEST_CASE(FuseHardSwishIntoConvFloat32GpuAccTest) +{ + ActivationDescriptor activationDescriptor; + activationDescriptor.m_Function = ActivationFunction::HardSwish; + + FuseActivationIntoPreviousLayerTest, DataType::Float32> + (activationDescriptor, 0.0001f, armnn::Compute::GpuAcc); +} +BOOST_AUTO_TEST_CASE(FuseHardSwishIntoMulFloat32GpuAccTest) +{ + ActivationDescriptor activationDescriptor; + activationDescriptor.m_Function = ActivationFunction::HardSwish; + + FuseActivationIntoPreviousLayerTest, DataType::Float32> + (activationDescriptor, 0.0001f, armnn::Compute::GpuAcc); +} +BOOST_AUTO_TEST_CASE(FuseHardSwishIntoAddFloat32GpuAccTest) +{ + ActivationDescriptor activationDescriptor; + activationDescriptor.m_Function = ActivationFunction::HardSwish; + + FuseActivationIntoPreviousLayerTest, DataType::Float32> + (activationDescriptor, 0.0001f, armnn::Compute::GpuAcc); +} +BOOST_AUTO_TEST_CASE(FuseHardSwishIntoSubFloat32GpuAccTest) +{ + ActivationDescriptor activationDescriptor; + activationDescriptor.m_Function = ActivationFunction::HardSwish; + + FuseActivationIntoPreviousLayerTest, DataType::Float32> + (activationDescriptor, 0.0001f, armnn::Compute::GpuAcc); +} +BOOST_AUTO_TEST_CASE(FuseHardSwishIntoDivFloat32GpuAccTest) +{ + ActivationDescriptor activationDescriptor; + activationDescriptor.m_Function = ActivationFunction::HardSwish; + + FuseActivationIntoPreviousLayerTest, DataType::Float32> + (activationDescriptor, 0.0001f, armnn::Compute::GpuAcc); +} + +// TanH fused into Receiver Layers Float32 +BOOST_AUTO_TEST_CASE(FuseTanHIntoConvFloat32GpuAccTest) +{ + ActivationDescriptor activationDescriptor; + activationDescriptor.m_Function = ActivationFunction::TanH; + + FuseActivationIntoPreviousLayerTest, DataType::Float32> + (activationDescriptor, 0.0001f, armnn::Compute::GpuAcc); +} +BOOST_AUTO_TEST_CASE(FuseTanHIntoMulFloat32GpuAccTest) +{ + ActivationDescriptor activationDescriptor; + activationDescriptor.m_Function = ActivationFunction::TanH; + + FuseActivationIntoPreviousLayerTest, DataType::Float32> + (activationDescriptor, 0.0001f, armnn::Compute::GpuAcc); +} +BOOST_AUTO_TEST_CASE(FuseTanHIntoAddFloat32GpuAccTest) +{ + ActivationDescriptor 
activationDescriptor;
+    activationDescriptor.m_Function = ActivationFunction::TanH;
+
+    FuseActivationIntoPreviousLayerTest<AdditionTest<DataType::Float32>, DataType::Float32>
+        (activationDescriptor, 0.0001f, armnn::Compute::GpuAcc);
+}
+BOOST_AUTO_TEST_CASE(FuseTanHIntoSubFloat32GpuAccTest)
+{
+    ActivationDescriptor activationDescriptor;
+    activationDescriptor.m_Function = ActivationFunction::TanH;
+
+    FuseActivationIntoPreviousLayerTest<SubtractionTest<DataType::Float32>, DataType::Float32>
+        (activationDescriptor, 0.0001f, armnn::Compute::GpuAcc);
+}
+BOOST_AUTO_TEST_CASE(FuseTanHIntoDivFloat32GpuAccTest)
+{
+    ActivationDescriptor activationDescriptor;
+    activationDescriptor.m_Function = ActivationFunction::TanH;
+
+    FuseActivationIntoPreviousLayerTest<DivisionTest<DataType::Float32>, DataType::Float32>
+        (activationDescriptor, 0.0001f, armnn::Compute::GpuAcc);
+}
+#endif
+
+BOOST_AUTO_TEST_SUITE_END()
\ No newline at end of file
diff --git a/src/backends/aclCommon/ArmComputeSubgraphUtils.hpp b/src/backends/aclCommon/ArmComputeSubgraphUtils.hpp
new file mode 100644
index 0000000000..79744ecf97
--- /dev/null
+++ b/src/backends/aclCommon/ArmComputeSubgraphUtils.hpp
@@ -0,0 +1,145 @@
+//
+// Copyright © 2020 Arm Ltd and Contributors. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#pragma once
+
+#include <armnn/backends/OptimizationViews.hpp>
+
+namespace armnn
+{
+
+namespace
+{
+
+//
+// This helper only works if the layers that the inputs connect to are not selected.
+//
+SubgraphView::InputSlots CreateInputsFrom(const std::vector<Layer*>& layers)
+{
+    SubgraphView::InputSlots result;
+    for (auto&& layer : layers)
+    {
+        for (auto&& it = layer->BeginInputSlots(); it != layer->EndInputSlots(); ++it)
+        {
+            result.push_back(&(*it));
+        }
+    }
+    return result;
+}
+
+//
+// This helper only works if the layers that the outputs connect to are not selected.
+//
+SubgraphView::OutputSlots CreateOutputsFrom(const std::vector<Layer*>& layers)
+{
+    SubgraphView::OutputSlots result;
+    for (auto&& layer : layers)
+    {
+        for (auto&& it = layer->BeginOutputSlots(); it != layer->EndOutputSlots(); ++it)
+        {
+            result.push_back(&(*it));
+        }
+    }
+    return result;
+}
+
+} // namespace
+
+inline const TensorInfo GetOverriddenDataType(const TensorInfo& info, Optional<DataType> type)
+{
+    if (!type)
+    {
+        return info;
+    }
+
+    return TensorInfo(info.GetShape(), type.value(), info.GetQuantizationScale(), info.GetQuantizationOffset());
+}
+
+inline armnn::Optional<armnn::DataType> GetOptionalBiasTypeFromWeightsType(armnn::Optional<armnn::DataType> weightsType)
+{
+    if (!weightsType)
+    {
+        return weightsType;
+    }
+
+    switch(weightsType.value())
+    {
+        case armnn::DataType::BFloat16:
+        case armnn::DataType::Float16:
+        case armnn::DataType::Float32:
+            return weightsType;
+        case armnn::DataType::QAsymmS8:
+            return armnn::DataType::Signed32;
+        case armnn::DataType::QAsymmU8:
+            return armnn::DataType::Signed32;
+        case armnn::DataType::QSymmS16:
+            return armnn::DataType::Signed32;
+        default:
+            ARMNN_ASSERT_MSG(false, "GetOptionalBiasTypeFromWeightsType(): Unsupported data type.");
+    }
+    return armnn::EmptyOptional();
+}
+
+template<typename LayerType>
+LayerType* FuseLayerWithoutParameters(OptimizationViews& optimizationViews,
+                                      LayerType* baseLayer,
+                                      ActivationLayer* activationLayer,
+                                      ActivationDescriptor& activationDesc,
+                                      std::string name)
+{
+    LayerType* replacementLayer = optimizationViews.GetGraph().AddLayer<LayerType>(name.c_str());
+
+    replacementLayer->SetAdditionalInfoForObject(std::make_shared<ActivationDescriptor>(activationDesc));
+
+    SubgraphView substitutionSubgraph(CreateInputsFrom({baseLayer}),
+                                      CreateOutputsFrom({activationLayer}),
+                                      {baseLayer, activationLayer});
+    SubgraphView replacementSubgraph(replacementLayer);
+
+    optimizationViews.AddSubstitution({substitutionSubgraph, replacementSubgraph});
+    return replacementLayer;
+}
+
+template<typename LayerType>
+LayerType* FuseLayerWithParameters(OptimizationViews& optimizationViews,
+                                   LayerType* baseLayer,
+                                   ActivationLayer* activationLayer,
+                                   ActivationDescriptor& activationDesc,
+                                   std::string name)
+{
+    LayerType* replacementLayer = optimizationViews.GetGraph().AddLayer<LayerType>(baseLayer->GetParameters(),
+                                                                                   name.c_str());
+
+    replacementLayer->SetAdditionalInfoForObject(std::make_shared<ActivationDescriptor>(activationDesc));
+
+    SubgraphView substitutionSubgraph(CreateInputsFrom({baseLayer}),
+                                      CreateOutputsFrom({activationLayer}),
+                                      {baseLayer, activationLayer});
+    SubgraphView replacementSubgraph(replacementLayer);
+
+    optimizationViews.AddSubstitution({substitutionSubgraph, replacementSubgraph});
+    return replacementLayer;
+}
+
+template<typename LayerType>
+LayerType* FuseLayerWithWeightsAndBiases(OptimizationViews& optimizationViews,
+                                         LayerType* baseLayer,
+                                         ActivationLayer* activationLayer,
+                                         ActivationDescriptor& activationDesc,
+                                         std::string name)
+{
+    LayerType* replacementLayer = FuseLayerWithParameters(optimizationViews,
+                                                          baseLayer,
+                                                          activationLayer,
+                                                          activationDesc,
+                                                          name);
+
+    replacementLayer->m_Weight = std::move(baseLayer->m_Weight);
+    replacementLayer->m_Bias = std::move(baseLayer->m_Bias);
+
+    return replacementLayer;
+}
+
+} // namespace armnn
diff --git a/src/backends/aclCommon/ArmComputeUtils.hpp b/src/backends/aclCommon/ArmComputeUtils.hpp
index 6b1f975350..adcf8281d2 100644
--- a/src/backends/aclCommon/ArmComputeUtils.hpp
+++ b/src/backends/aclCommon/ArmComputeUtils.hpp
@@ -9,6 +9,8 @@
 #include
 #include
+#include "../../../../clframework/arm_compute/core/Types.h"
+#include "../backendsCommon/WorkloadData.hpp"
 
 namespace armnn
 {
@@ -77,6 +79,30 @@ ConvertActivationDescriptorToAclActivationLayerInfo(const ActivationDescriptor&
                                                     actDesc.m_A, actDesc.m_B);
 }
 
+inline arm_compute::ActivationLayerInfo
+ConvertActivationDescriptorToAclActivationLayerInfo(const ActivationDescriptor* activationDescPtr)
+{
+    if (activationDescPtr != nullptr)
+    {
+        return ConvertActivationDescriptorToAclActivationLayerInfo(static_cast<ActivationDescriptor>(
+            *activationDescPtr));
+    }
+    return arm_compute::ActivationLayerInfo();
+}
+
+inline arm_compute::ActivationLayerInfo
+ConvertAdditionalInfoToAclActivationLayerInfo(const QueueDescriptor& queueDescriptor)
+{
+    const ActivationDescriptor* activationDescPtr = queueDescriptor.GetAdditionalInformation<ActivationDescriptor>();
+
+    if (activationDescPtr != nullptr)
+    {
+        return ConvertActivationDescriptorToAclActivationLayerInfo(static_cast<ActivationDescriptor>(
+            *activationDescPtr));
+    }
+    return arm_compute::ActivationLayerInfo();
+}
+
 inline arm_compute::ComparisonOperation ConvertComparisonOperationToAcl(const ComparisonDescriptor& descriptor)
 {
     switch (descriptor.m_Operation)
@@ -130,10 +156,22 @@ ConvertNormalizationAlgorithmChannelToAclNormType(NormalizationAlgorithmChannel
 }
 
 inline arm_compute::FullyConnectedLayerInfo
-ConvertFullyConnectedDescriptorToAclFullyConnectedLayerInfo(const FullyConnectedDescriptor& fullyConnectedDesc)
+ConvertFullyConnectedDescriptorToAclFullyConnectedLayerInfo(const FullyConnectedDescriptor& fullyConnectedDesc,
+                                                            const ActivationDescriptor* activationDesc)
+{
+    arm_compute::FullyConnectedLayerInfo fc_info;
+    fc_info.transpose_weights = fullyConnectedDesc.m_TransposeWeightMatrix;
+    fc_info.activation_info = ConvertActivationDescriptorToAclActivationLayerInfo(activationDesc);
+    return fc_info;
+}
+
+inline arm_compute::FullyConnectedLayerInfo
+ConvertFullyConnectedDescriptorToAclFullyConnectedLayerInfo(const FullyConnectedDescriptor& fullyConnectedDesc, + arm_compute::ActivationLayerInfo activationLayerInfo) { arm_compute::FullyConnectedLayerInfo fc_info; fc_info.transpose_weights = fullyConnectedDesc.m_TransposeWeightMatrix; + fc_info.activation_info = activationLayerInfo; return fc_info; } diff --git a/src/backends/aclCommon/CMakeLists.txt b/src/backends/aclCommon/CMakeLists.txt index fa80437f2d..dac663b20c 100644 --- a/src/backends/aclCommon/CMakeLists.txt +++ b/src/backends/aclCommon/CMakeLists.txt @@ -7,6 +7,7 @@ list(APPEND armnnAclCommon_sources ArmComputeTensorHandle.hpp ArmComputeTensorUtils.hpp ArmComputeTensorUtils.cpp + ArmComputeSubgraphUtils.hpp ArmComputeUtils.hpp BaseMemoryManager.cpp BaseMemoryManager.hpp diff --git a/src/backends/backendsCommon/IBackendInternal.cpp b/src/backends/backendsCommon/IBackendInternal.cpp index 81fc515b98..b08dff84ed 100644 --- a/src/backends/backendsCommon/IBackendInternal.cpp +++ b/src/backends/backendsCommon/IBackendInternal.cpp @@ -3,6 +3,7 @@ // SPDX-License-Identifier: MIT // +#include #include namespace armnn @@ -135,6 +136,12 @@ OptimizationViews IBackendInternal::OptimizeSubgraphView(const SubgraphView& sub return result; } +OptimizationViews IBackendInternal::OptimizeSubgraphView(const SubgraphView& subgraph, + const ModelOptions& /*modelOptions*/) const +{ + return OptimizeSubgraphView(subgraph); +} + bool IBackendInternal::SupportsTensorAllocatorAPI() const { return !GetHandleFactoryPreferences().empty(); diff --git a/src/backends/backendsCommon/WorkloadData.hpp b/src/backends/backendsCommon/WorkloadData.hpp index dd39d312b7..0a232dc515 100644 --- a/src/backends/backendsCommon/WorkloadData.hpp +++ b/src/backends/backendsCommon/WorkloadData.hpp @@ -36,7 +36,7 @@ struct QueueDescriptor unsigned int numExpectedOut) const; template - const T* GetAdditionalInformation() + const T* GetAdditionalInformation() const { return static_cast(m_AdditionalInfoObject); } diff --git a/src/backends/cl/ClBackend.cpp b/src/backends/cl/ClBackend.cpp index 6254b0a32a..57a5851650 100644 --- a/src/backends/cl/ClBackend.cpp +++ b/src/backends/cl/ClBackend.cpp @@ -12,16 +12,28 @@ #include "ClTensorHandleFactory.hpp" #include +#include +#include +#include #include #include #include - #include +#include "workloads/ClAdditionWorkload.hpp" +#include "workloads/ClBatchNormalizationFloatWorkload.hpp" +#include "workloads/ClConvolution2dWorkload.hpp" +#include "workloads/ClDepthwiseConvolutionWorkload.hpp" +#include "workloads/ClDivisionFloatWorkload.hpp" +#include "workloads/ClFullyConnectedWorkload.hpp" +#include "workloads/ClMultiplicationWorkload.hpp" +#include "workloads/ClSubtractionWorkload.hpp" + #include +#include #include namespace armnn @@ -129,11 +141,256 @@ IBackendInternal::ILayerSupportSharedPtr ClBackend::GetLayerSupport(const ModelO return layerSupport; } -OptimizationViews ClBackend::OptimizeSubgraphView(const SubgraphView& subgraph) const +OptimizationViews ClBackend::OptimizeSubgraphView(const SubgraphView& subgraph, + const ModelOptions& modelOptions) const { OptimizationViews optimizationViews; - optimizationViews.AddUntouchedSubgraph(SubgraphView(subgraph)); + auto it = subgraph.end(); + bool isFastMathEnabled = false; + +#if defined(ARMCOMPUTECL_ENABLED) + IBackendInternal::IBackendSpecificModelContextPtr modelContextPtr = CreateBackendSpecificModelContext(modelOptions); + + if (modelContextPtr) + { + auto clModelOptions = dynamic_cast(modelContextPtr.get()); + if (clModelOptions) 
+ { + isFastMathEnabled = clModelOptions->IsFastMathEnabled(); + } + } +#endif + + while (it != subgraph.begin()) + { + --it; + Layer& base = **it; + + if ((base.GetType() == LayerType::DepthwiseConvolution2d || base.GetType() == LayerType::Convolution2d + || base.GetType() == LayerType::BatchNormalization || base.GetType() == LayerType::FullyConnected + || base.GetType() == LayerType::Addition || base.GetType() == LayerType::Multiplication + || base.GetType() == LayerType::Subtraction || base.GetType() == LayerType::Division) + && (base.GetAdditionalInformation() == nullptr)) + { + for (auto output = base.BeginOutputSlots(); output != base.EndOutputSlots(); ++output) + { + if (output->GetNumConnections() == 1) + { + for (auto&& childInput : output->GetConnections()) + { + if (childInput->GetOwningLayer().GetType() == LayerType::Activation) + { + Layer& child = childInput->GetOwningLayer(); + + auto* activationLayer = PolymorphicDowncast(&child); + + const std::string name = std::string("fused-") + child.GetName() + std::string("-into-") + + base.GetName(); + + // Get params from activation layer + ActivationDescriptor activationDesc = activationLayer->GetParameters(); + + if (base.GetType() == LayerType::Convolution2d) + { + Convolution2dLayer* baseLayer = PolymorphicDowncast(&base); + + Optional biases; + + if (baseLayer->GetParameters().m_BiasEnabled) + { + biases = GetOverriddenDataType(baseLayer->m_Bias->GetTensorInfo(), + GetOptionalBiasTypeFromWeightsType( + baseLayer->m_Weight->GetTensorInfo().GetDataType())); + } + + arm_compute::Status status = ClConvolution2dWorkloadValidate( + baseLayer->GetInputSlot(0).GetConnectedOutputSlot()->GetTensorInfo(), + activationLayer->GetInputSlot(0).GetConnectedOutputSlot()->GetTensorInfo(), + baseLayer->GetParameters(), + baseLayer->m_Weight->GetTensorInfo(), + biases, + isFastMathEnabled, + &activationDesc); + + if (status) + { + FuseLayerWithWeightsAndBiases(optimizationViews, + baseLayer, + activationLayer, + activationDesc, + name); + } + } + else if (base.GetType() == LayerType::DepthwiseConvolution2d) + { + DepthwiseConvolution2dLayer* baseLayer = + PolymorphicDowncast(&base); + + Optional biases; + + if (baseLayer->GetParameters().m_BiasEnabled) + { + biases = GetOverriddenDataType(baseLayer->m_Bias->GetTensorInfo(), + GetOptionalBiasTypeFromWeightsType( + baseLayer->m_Weight->GetTensorInfo().GetDataType())); + } + + arm_compute::Status status = ClDepthwiseConvolutionWorkloadValidate( + baseLayer->GetInputSlot(0).GetConnectedOutputSlot()->GetTensorInfo(), + activationLayer->GetInputSlot(0).GetConnectedOutputSlot()->GetTensorInfo(), + baseLayer->GetParameters(), + baseLayer->m_Weight->GetTensorInfo(), + biases, + &activationDesc); + + if (status) + { + FuseLayerWithWeightsAndBiases(optimizationViews, + baseLayer, + activationLayer, + activationDesc, + name); + } + } + else if (base.GetType() == LayerType::FullyConnected) + { + FullyConnectedLayer* baseLayer = PolymorphicDowncast(&base); + + arm_compute::Status status = ClFullyConnectedWorkloadValidate( + baseLayer->GetInputSlot(0).GetConnectedOutputSlot()->GetTensorInfo(), + activationLayer->GetInputSlot(0).GetConnectedOutputSlot()->GetTensorInfo(), + baseLayer->m_Weight->GetTensorInfo(), + baseLayer->m_Bias->GetTensorInfo(), + baseLayer->GetParameters(), + &activationDesc); + + if (status) + { + FuseLayerWithWeightsAndBiases(optimizationViews, + baseLayer, + activationLayer, + activationDesc, + name); + } + } + else if (base.GetType() == LayerType::BatchNormalization) + { + 
BatchNormalizationLayer* baseLayer = + PolymorphicDowncast(&base); + + arm_compute::Status status = ClBatchNormalizationValidate( + baseLayer->GetInputSlot(0).GetConnectedOutputSlot()->GetTensorInfo(), + activationLayer->GetInputSlot(0).GetConnectedOutputSlot()->GetTensorInfo(), + baseLayer->m_Mean->GetTensorInfo(), + baseLayer->m_Variance->GetTensorInfo(), + baseLayer->m_Beta->GetTensorInfo(), + baseLayer->m_Gamma->GetTensorInfo(), + baseLayer->GetParameters(), + &activationDesc); + + if (status) + { + BatchNormalizationLayer* replacementLayer = + FuseLayerWithParameters(optimizationViews, + baseLayer, + activationLayer, + activationDesc, + name); + + replacementLayer->m_Beta = std::move(baseLayer->m_Beta); + replacementLayer->m_Gamma = std::move(baseLayer->m_Gamma); + replacementLayer->m_Mean = std::move(baseLayer->m_Mean); + replacementLayer->m_Variance = std::move(baseLayer->m_Variance); + } + } + else if (base.GetType() == LayerType::Addition) + { + AdditionLayer* baseLayer = PolymorphicDowncast(&base); + + arm_compute::Status status = ClAdditionValidate( + baseLayer->GetInputSlot(0).GetConnectedOutputSlot()->GetTensorInfo(), + baseLayer->GetInputSlot(1).GetConnectedOutputSlot()->GetTensorInfo(), + activationLayer->GetInputSlot(0).GetConnectedOutputSlot()->GetTensorInfo(), + &activationDesc); + + if (status) + { + FuseLayerWithoutParameters(optimizationViews, + baseLayer, + activationLayer, + activationDesc, + name); + } + } + else if (base.GetType() == LayerType::Division) + { + DivisionLayer* baseLayer = PolymorphicDowncast(&base); + + arm_compute::Status status = ClDivisionWorkloadValidate( + baseLayer->GetInputSlot(0).GetConnectedOutputSlot()->GetTensorInfo(), + baseLayer->GetInputSlot(1).GetConnectedOutputSlot()->GetTensorInfo(), + activationLayer->GetInputSlot(0).GetConnectedOutputSlot()->GetTensorInfo(), + &activationDesc); + + if (status) + { + FuseLayerWithoutParameters(optimizationViews, + baseLayer, + activationLayer, + activationDesc, + name); + } + } + else if (base.GetType() == LayerType::Multiplication) + { + MultiplicationLayer* baseLayer = PolymorphicDowncast(&base); + + arm_compute::Status status = ClMultiplicationWorkloadValidate( + baseLayer->GetInputSlot(0).GetConnectedOutputSlot()->GetTensorInfo(), + baseLayer->GetInputSlot(1).GetConnectedOutputSlot()->GetTensorInfo(), + activationLayer->GetInputSlot(0).GetConnectedOutputSlot()->GetTensorInfo(), + &activationDesc); + + if (status) + { + FuseLayerWithoutParameters(optimizationViews, + baseLayer, + activationLayer, + activationDesc, + name); + } + } + else if (base.GetType() == LayerType::Subtraction) + { + SubtractionLayer* baseLayer = PolymorphicDowncast(&base); + + arm_compute::Status status = ClSubtractionValidate( + baseLayer->GetInputSlot(0).GetConnectedOutputSlot()->GetTensorInfo(), + baseLayer->GetInputSlot(1).GetConnectedOutputSlot()->GetTensorInfo(), + activationLayer->GetInputSlot(0).GetConnectedOutputSlot()->GetTensorInfo(), + &activationDesc); + + if (status) + { + FuseLayerWithoutParameters(optimizationViews, + baseLayer, + activationLayer, + activationDesc, + name); + } + } + } + } + } + } + } + } + // end each optimization + if (optimizationViews.GetSubstitutions().empty()) + { + optimizationViews.AddUntouchedSubgraph(SubgraphView(subgraph)); + } return optimizationViews; } diff --git a/src/backends/cl/ClBackend.hpp b/src/backends/cl/ClBackend.hpp index af5534e0d0..2b19fc5b33 100644 --- a/src/backends/cl/ClBackend.hpp +++ b/src/backends/cl/ClBackend.hpp @@ -44,7 +44,8 @@ public: 
IBackendInternal::ILayerSupportSharedPtr GetLayerSupport() const override; IBackendInternal::ILayerSupportSharedPtr GetLayerSupport(const ModelOptions& modelOptions) const override; - OptimizationViews OptimizeSubgraphView(const SubgraphView& subgraph) const override; + OptimizationViews OptimizeSubgraphView(const SubgraphView& subgraph, + const ModelOptions& modelOptions) const override; IBackendInternal::IBackendSpecificModelContextPtr CreateBackendSpecificModelContext( const ModelOptions& modelOptions) const override; diff --git a/src/backends/cl/ClLayerSupport.cpp b/src/backends/cl/ClLayerSupport.cpp index 7c1466e0e1..cce5c9b3bd 100644 --- a/src/backends/cl/ClLayerSupport.cpp +++ b/src/backends/cl/ClLayerSupport.cpp @@ -197,7 +197,8 @@ bool ClLayerSupport::IsAdditionSupported(const TensorInfo& input0, reasonIfUnsupported, input0, input1, - output); + output, + nullptr); } bool ClLayerSupport::IsArgMinMaxSupported(const TensorInfo& input, @@ -230,7 +231,8 @@ bool ClLayerSupport::IsBatchNormalizationSupported(const TensorInfo& input, var, beta, gamma, - descriptor); + descriptor, + nullptr); } bool ClLayerSupport::IsBatchToSpaceNdSupported(const TensorInfo& input, @@ -357,7 +359,8 @@ bool ClLayerSupport::IsConvolution2dSupported(const TensorInfo& input, descriptor, weights, biases, - isFastMathEnabled); + isFastMathEnabled, + nullptr); } bool ClLayerSupport::IsDequantizeSupported(const TensorInfo& input, @@ -395,7 +398,8 @@ bool ClLayerSupport::IsDepthwiseConvolutionSupported(const TensorInfo& input, output, descriptor, weights, - biases); + biases, + nullptr); } bool ClLayerSupport::IsDilatedDepthwiseConvolutionSupported(const TensorInfo& input, @@ -411,7 +415,8 @@ bool ClLayerSupport::IsDilatedDepthwiseConvolutionSupported(const TensorInfo& in output, descriptor, weights, - biases); + biases, + nullptr); } @@ -424,7 +429,8 @@ bool ClLayerSupport::IsDivisionSupported(const TensorInfo& input0, reasonIfUnsupported, input0, input1, - output); + output, + nullptr); } bool ClLayerSupport::IsElementwiseUnarySupported(const TensorInfo& input, @@ -494,7 +500,8 @@ bool ClLayerSupport::IsFullyConnectedSupported(const TensorInfo& input, output, weights, biases, - descriptor); + descriptor, + nullptr); } bool ClLayerSupport::IsGatherSupported(const TensorInfo& input0, @@ -639,7 +646,8 @@ bool ClLayerSupport::IsMultiplicationSupported(const TensorInfo& input0, reasonIfUnsupported, input0, input1, - output); + output, + nullptr); } bool ClLayerSupport::IsNormalizationSupported(const TensorInfo& input, @@ -911,7 +919,8 @@ bool ClLayerSupport::IsSubtractionSupported(const TensorInfo& input0, reasonIfUnsupported, input0, input1, - output); + output, + nullptr); } bool ClLayerSupport::IsTransposeConvolution2dSupported(const TensorInfo& input, diff --git a/src/backends/cl/workloads/ClAdditionWorkload.cpp b/src/backends/cl/workloads/ClAdditionWorkload.cpp index 18e2400ccd..7e75a04110 100644 --- a/src/backends/cl/workloads/ClAdditionWorkload.cpp +++ b/src/backends/cl/workloads/ClAdditionWorkload.cpp @@ -8,6 +8,7 @@ #include #include #include +#include #include "ClWorkloadUtils.hpp" @@ -26,7 +27,10 @@ ClAdditionWorkload::ClAdditionWorkload(const AdditionQueueDescriptor& descriptor arm_compute::ICLTensor& input0 = static_cast(this->m_Data.m_Inputs[0])->GetTensor(); arm_compute::ICLTensor& input1 = static_cast(this->m_Data.m_Inputs[1])->GetTensor(); arm_compute::ICLTensor& output = static_cast(this->m_Data.m_Outputs[0])->GetTensor(); - m_Layer.configure(&input0, &input1, &output, g_AclConvertPolicy); + + const 
arm_compute::ActivationLayerInfo activationInfo = ConvertAdditionalInfoToAclActivationLayerInfo(descriptor); + + m_Layer.configure(&input0, &input1, &output, g_AclConvertPolicy, activationInfo); } void ClAdditionWorkload::Execute() const @@ -37,16 +41,21 @@ void ClAdditionWorkload::Execute() const arm_compute::Status ClAdditionValidate(const TensorInfo& input0, const TensorInfo& input1, - const TensorInfo& output) + const TensorInfo& output, + const ActivationDescriptor* activationDescriptor) { const arm_compute::TensorInfo aclInput0Info = BuildArmComputeTensorInfo(input0); const arm_compute::TensorInfo aclInput1Info = BuildArmComputeTensorInfo(input1); const arm_compute::TensorInfo aclOutputInfo = BuildArmComputeTensorInfo(output); + const arm_compute::ActivationLayerInfo activationInfo = ConvertActivationDescriptorToAclActivationLayerInfo( + activationDescriptor); + const arm_compute::Status aclStatus = arm_compute::CLArithmeticAddition::validate(&aclInput0Info, &aclInput1Info, &aclOutputInfo, - g_AclConvertPolicy); + g_AclConvertPolicy, + activationInfo); return aclStatus; } diff --git a/src/backends/cl/workloads/ClAdditionWorkload.hpp b/src/backends/cl/workloads/ClAdditionWorkload.hpp index 62bd0ae20b..372c4bc6f7 100644 --- a/src/backends/cl/workloads/ClAdditionWorkload.hpp +++ b/src/backends/cl/workloads/ClAdditionWorkload.hpp @@ -25,5 +25,6 @@ private: arm_compute::Status ClAdditionValidate(const TensorInfo& input0, const TensorInfo& input1, - const TensorInfo& output); + const TensorInfo& output, + const ActivationDescriptor* activationDescriptor = nullptr); } //namespace armnn diff --git a/src/backends/cl/workloads/ClBatchNormalizationFloatWorkload.cpp b/src/backends/cl/workloads/ClBatchNormalizationFloatWorkload.cpp index fa0be85100..68942e2a01 100644 --- a/src/backends/cl/workloads/ClBatchNormalizationFloatWorkload.cpp +++ b/src/backends/cl/workloads/ClBatchNormalizationFloatWorkload.cpp @@ -4,12 +4,16 @@ // #include "ClBatchNormalizationFloatWorkload.hpp" +#include "ClWorkloadUtils.hpp" + #include + #include + #include -#include +#include -#include "ClWorkloadUtils.hpp" +#include namespace armnn { @@ -21,7 +25,8 @@ arm_compute::Status ClBatchNormalizationValidate(const TensorInfo& input, const TensorInfo& var, const TensorInfo& beta, const TensorInfo& gamma, - const BatchNormalizationDescriptor &desc) + const BatchNormalizationDescriptor& desc, + const ActivationDescriptor* activationDescriptor) { const arm_compute::TensorInfo aclInputInfo = armcomputetensorutils::BuildArmComputeTensorInfo(input, desc.m_DataLayout); @@ -36,13 +41,17 @@ arm_compute::Status ClBatchNormalizationValidate(const TensorInfo& input, const arm_compute::TensorInfo aclGammaInfo = armcomputetensorutils::BuildArmComputeTensorInfo(gamma, desc.m_DataLayout); + const arm_compute::ActivationLayerInfo activationInfo = ConvertActivationDescriptorToAclActivationLayerInfo( + activationDescriptor); + return arm_compute::CLBatchNormalizationLayer::validate(&aclInputInfo, &aclOutputInfo, &aclMeanInfo, &aclVarInfo, &aclBetaInfo, &aclGammaInfo, - desc.m_Eps); + desc.m_Eps, + activationInfo); } ClBatchNormalizationFloatWorkload::ClBatchNormalizationFloatWorkload( @@ -70,13 +79,16 @@ ClBatchNormalizationFloatWorkload::ClBatchNormalizationFloatWorkload( input.info()->set_data_layout(aclDataLayout); output.info()->set_data_layout(aclDataLayout); + const arm_compute::ActivationLayerInfo activationInfo = ConvertAdditionalInfoToAclActivationLayerInfo(descriptor); + m_Layer.configure(&input, &output, m_Mean.get(), 
m_Variance.get(), m_Beta.get(), m_Gamma.get(), - m_Data.m_Parameters.m_Eps); + m_Data.m_Parameters.m_Eps, + activationInfo); InitializeArmComputeClTensorData(*m_Mean, m_Data.m_Mean); InitializeArmComputeClTensorData(*m_Variance, m_Data.m_Variance); diff --git a/src/backends/cl/workloads/ClBatchNormalizationFloatWorkload.hpp b/src/backends/cl/workloads/ClBatchNormalizationFloatWorkload.hpp index e94bef20ac..ef5778309e 100644 --- a/src/backends/cl/workloads/ClBatchNormalizationFloatWorkload.hpp +++ b/src/backends/cl/workloads/ClBatchNormalizationFloatWorkload.hpp @@ -19,7 +19,8 @@ arm_compute::Status ClBatchNormalizationValidate(const TensorInfo& input, const TensorInfo& var, const TensorInfo& beta, const TensorInfo& gamma, - const BatchNormalizationDescriptor& desc); + const BatchNormalizationDescriptor& desc, + const ActivationDescriptor* activationDescriptor = nullptr); class ClBatchNormalizationFloatWorkload : public FloatWorkload { diff --git a/src/backends/cl/workloads/ClConvolution2dWorkload.cpp b/src/backends/cl/workloads/ClConvolution2dWorkload.cpp index 7b52f2784f..50cb9ded37 100644 --- a/src/backends/cl/workloads/ClConvolution2dWorkload.cpp +++ b/src/backends/cl/workloads/ClConvolution2dWorkload.cpp @@ -25,7 +25,8 @@ arm_compute::Status ClConvolution2dWorkloadValidate(const TensorInfo& input, const Convolution2dDescriptor& descriptor, const TensorInfo& weights, const Optional& biases, - bool isFastMathEnabled) + bool isFastMathEnabled, + const ActivationDescriptor* activationDescriptor) { const arm_compute::TensorInfo aclInputInfo = BuildArmComputeTensorInfo(input, descriptor.m_DataLayout); const arm_compute::TensorInfo aclOutputInfo = BuildArmComputeTensorInfo(output, descriptor.m_DataLayout); @@ -47,6 +48,9 @@ arm_compute::Status ClConvolution2dWorkloadValidate(const TensorInfo& input, arm_compute::PadStrideInfo layerInfo = BuildArmComputePadStrideInfo(descriptor); + const arm_compute::ActivationLayerInfo activationInfo = ConvertActivationDescriptorToAclActivationLayerInfo( + activationDescriptor); + return arm_compute::CLConvolutionLayer::validate(&aclInputInfo, &aclWeightsInfo, optionalAclBiasesInfo, @@ -54,7 +58,7 @@ arm_compute::Status ClConvolution2dWorkloadValidate(const TensorInfo& input, layerInfo, arm_compute::WeightsInfo(), aclDilationInfo, - arm_compute::ActivationLayerInfo(), + activationInfo, isFastMathEnabled); } @@ -91,6 +95,8 @@ ClConvolution2dWorkload::ClConvolution2dWorkload(const Convolution2dQueueDescrip arm_compute::PadStrideInfo padStrideInfo = BuildArmComputePadStrideInfo(m_Data.m_Parameters); + const arm_compute::ActivationLayerInfo activationInfo = ConvertAdditionalInfoToAclActivationLayerInfo(descriptor); + m_ConvolutionLayer.configure(&input, m_KernelTensor.get(), m_BiasTensor.get(), @@ -98,7 +104,7 @@ ClConvolution2dWorkload::ClConvolution2dWorkload(const Convolution2dQueueDescrip padStrideInfo, arm_compute::WeightsInfo(), aclDilationInfo, - arm_compute::ActivationLayerInfo(), + activationInfo, isFastMathEnabled); m_ConvolutionMethod = @@ -107,7 +113,7 @@ ClConvolution2dWorkload::ClConvolution2dWorkload(const Convolution2dQueueDescrip output.info(), padStrideInfo, arm_compute::WeightsInfo(), - arm_compute::ActivationLayerInfo(), + activationInfo, arm_compute::CLScheduler::get().target(), aclDilationInfo, isFastMathEnabled); diff --git a/src/backends/cl/workloads/ClConvolution2dWorkload.hpp b/src/backends/cl/workloads/ClConvolution2dWorkload.hpp index f769422a0a..70170b569d 100644 --- a/src/backends/cl/workloads/ClConvolution2dWorkload.hpp +++ 
b/src/backends/cl/workloads/ClConvolution2dWorkload.hpp @@ -23,7 +23,8 @@ arm_compute::Status ClConvolution2dWorkloadValidate(const TensorInfo& input, const Convolution2dDescriptor& descriptor, const TensorInfo& weights, const Optional& biases, - bool isFastMathEnabled = false); + bool isFastMathEnabled = false, + const ActivationDescriptor* activationDescriptor = nullptr); class ClConvolution2dWorkload : public BaseWorkload { diff --git a/src/backends/cl/workloads/ClDepthwiseConvolutionWorkload.cpp b/src/backends/cl/workloads/ClDepthwiseConvolutionWorkload.cpp index 8704b1276f..53f16848eb 100644 --- a/src/backends/cl/workloads/ClDepthwiseConvolutionWorkload.cpp +++ b/src/backends/cl/workloads/ClDepthwiseConvolutionWorkload.cpp @@ -8,11 +8,13 @@ #include #include "ClWorkloadUtils.hpp" +#include #include #include #include #include #include +#include #include @@ -25,7 +27,8 @@ arm_compute::Status ClDepthwiseConvolutionWorkloadValidate(const TensorInfo& inp const TensorInfo& output, const DepthwiseConvolution2dDescriptor& descriptor, const TensorInfo& weights, - const Optional& biases) + const Optional& biases, + const ActivationDescriptor* activationDescriptor) { const arm_compute::TensorInfo aclInputInfo = BuildArmComputeTensorInfo(input, descriptor.m_DataLayout); const arm_compute::TensorInfo aclOutputInfo = BuildArmComputeTensorInfo(output, descriptor.m_DataLayout); @@ -56,13 +59,16 @@ arm_compute::Status ClDepthwiseConvolutionWorkloadValidate(const TensorInfo& inp descriptor.m_DilationX, descriptor.m_DilationY); + const arm_compute::ActivationLayerInfo activationInfo = ConvertActivationDescriptorToAclActivationLayerInfo( + activationDescriptor); + return arm_compute::CLDepthwiseConvolutionLayer::validate(&aclInputInfo, &aclWeightsInfo, optionalAclBiasesInfo, &aclOutputInfo, aclPadStrideInfo, aclDepthMultiplier, - arm_compute::ActivationLayerInfo(), + activationInfo, aclDilationInfo); } @@ -114,6 +120,8 @@ ClDepthwiseConvolutionWorkload::ClDepthwiseConvolutionWorkload( arm_compute::PadStrideInfo padStrideInfo = BuildArmComputePadStrideInfo(m_Data.m_Parameters); + const arm_compute::ActivationLayerInfo activationInfo = ConvertAdditionalInfoToAclActivationLayerInfo(descriptor); + m_DepthwiseConvolutionLayer = std::make_unique(); static_cast(m_DepthwiseConvolutionLayer.get())->configure( &input, @@ -122,7 +130,7 @@ ClDepthwiseConvolutionWorkload::ClDepthwiseConvolutionWorkload( &output, padStrideInfo, depthMultiplier, - arm_compute::ActivationLayerInfo(), + activationInfo, aclDilationInfo); ARMNN_ASSERT(m_DepthwiseConvolutionLayer); diff --git a/src/backends/cl/workloads/ClDepthwiseConvolutionWorkload.hpp b/src/backends/cl/workloads/ClDepthwiseConvolutionWorkload.hpp index fc277b9947..c75913737d 100644 --- a/src/backends/cl/workloads/ClDepthwiseConvolutionWorkload.hpp +++ b/src/backends/cl/workloads/ClDepthwiseConvolutionWorkload.hpp @@ -18,7 +18,8 @@ arm_compute::Status ClDepthwiseConvolutionWorkloadValidate(const TensorInfo& inp const TensorInfo& output, const DepthwiseConvolution2dDescriptor& descriptor, const TensorInfo& weights, - const Optional& biases); + const Optional& biases, + const ActivationDescriptor* activationDescriptor = nullptr); class ClDepthwiseConvolutionWorkload : public BaseWorkload { diff --git a/src/backends/cl/workloads/ClDivisionFloatWorkload.cpp b/src/backends/cl/workloads/ClDivisionFloatWorkload.cpp index 2a27f8a9bc..c79e55ebdd 100644 --- a/src/backends/cl/workloads/ClDivisionFloatWorkload.cpp +++ b/src/backends/cl/workloads/ClDivisionFloatWorkload.cpp @@ -4,9 
+4,12 @@ // #include "ClDivisionFloatWorkload.hpp" -#include + +#include #include +#include + #include "ClWorkloadUtils.hpp" namespace armnn @@ -14,13 +17,17 @@ namespace armnn arm_compute::Status ClDivisionWorkloadValidate(const TensorInfo& input0, const TensorInfo& input1, - const TensorInfo& output) + const TensorInfo& output, + const ActivationDescriptor* activationDescriptor) { const arm_compute::TensorInfo aclInput1 = armcomputetensorutils::BuildArmComputeTensorInfo(input0); const arm_compute::TensorInfo aclInput2 = armcomputetensorutils::BuildArmComputeTensorInfo(input1); const arm_compute::TensorInfo aclOutput = armcomputetensorutils::BuildArmComputeTensorInfo(output); - return arm_compute::CLArithmeticDivision::validate(&aclInput1, &aclInput2, &aclOutput); + const arm_compute::ActivationLayerInfo activationInfo = ConvertActivationDescriptorToAclActivationLayerInfo( + activationDescriptor); + + return arm_compute::CLArithmeticDivision::validate(&aclInput1, &aclInput2, &aclOutput, activationInfo); } @@ -33,8 +40,10 @@ ClDivisionFloatWorkload::ClDivisionFloatWorkload(const DivisionQueueDescriptor& arm_compute::ICLTensor& input0 = static_cast(m_Data.m_Inputs[0])->GetTensor(); arm_compute::ICLTensor& input1 = static_cast(m_Data.m_Inputs[1])->GetTensor(); arm_compute::ICLTensor& output = static_cast(m_Data.m_Outputs[0])->GetTensor(); - // Construct - m_ArithmeticDivision.configure(&input0, &input1, &output); + + const arm_compute::ActivationLayerInfo activationInfo = ConvertAdditionalInfoToAclActivationLayerInfo(descriptor); + + m_ArithmeticDivision.configure(&input0, &input1, &output, activationInfo); } void ClDivisionFloatWorkload::Execute() const diff --git a/src/backends/cl/workloads/ClDivisionFloatWorkload.hpp b/src/backends/cl/workloads/ClDivisionFloatWorkload.hpp index ddca87d78a..71d27ed5b5 100644 --- a/src/backends/cl/workloads/ClDivisionFloatWorkload.hpp +++ b/src/backends/cl/workloads/ClDivisionFloatWorkload.hpp @@ -14,7 +14,8 @@ namespace armnn arm_compute::Status ClDivisionWorkloadValidate(const TensorInfo& input0, const TensorInfo& input1, - const TensorInfo& output); + const TensorInfo& output, + const ActivationDescriptor* activationDescriptor = nullptr); class ClDivisionFloatWorkload : public FloatWorkload { diff --git a/src/backends/cl/workloads/ClFullyConnectedWorkload.cpp b/src/backends/cl/workloads/ClFullyConnectedWorkload.cpp index 60eb138b42..eaec639f28 100644 --- a/src/backends/cl/workloads/ClFullyConnectedWorkload.cpp +++ b/src/backends/cl/workloads/ClFullyConnectedWorkload.cpp @@ -20,7 +20,8 @@ arm_compute::Status ClFullyConnectedWorkloadValidate(const TensorInfo& input, const TensorInfo& output, const TensorInfo& weights, const TensorInfo& biases, - const FullyConnectedDescriptor& descriptor) + const FullyConnectedDescriptor& descriptor, + const ActivationDescriptor* activationDescriptor) { const arm_compute::TensorInfo aclInput = BuildArmComputeTensorInfo(input); const arm_compute::TensorInfo aclOutput = BuildArmComputeTensorInfo(output); @@ -35,7 +36,7 @@ arm_compute::Status ClFullyConnectedWorkloadValidate(const TensorInfo& input, } const arm_compute::FullyConnectedLayerInfo fullyConnectedLayerInfo = - ConvertFullyConnectedDescriptorToAclFullyConnectedLayerInfo(descriptor); + ConvertFullyConnectedDescriptorToAclFullyConnectedLayerInfo(descriptor, activationDescriptor); return arm_compute::CLFullyConnectedLayer::validate(&aclInput, &aclWeights, @@ -63,9 +64,11 @@ ClFullyConnectedWorkload::ClFullyConnectedWorkload(const FullyConnectedQueueDesc 
arm_compute::ICLTensor& input = static_cast(m_Data.m_Inputs[0])->GetTensor(); arm_compute::ICLTensor& output = static_cast(m_Data.m_Outputs[0])->GetTensor(); - // Construct - arm_compute::FullyConnectedLayerInfo fc_info; - fc_info.transpose_weights = m_Data.m_Parameters.m_TransposeWeightMatrix; + const arm_compute::ActivationLayerInfo activationInfo = ConvertAdditionalInfoToAclActivationLayerInfo(descriptor); + + arm_compute::FullyConnectedLayerInfo fc_info = + ConvertFullyConnectedDescriptorToAclFullyConnectedLayerInfo(descriptor.m_Parameters, activationInfo); + m_FullyConnectedLayer.configure(&input, m_WeightsTensor.get(), m_BiasesTensor.get(), &output, fc_info); InitializeArmComputeClTensorData(*m_WeightsTensor, m_Data.m_Weight); diff --git a/src/backends/cl/workloads/ClFullyConnectedWorkload.hpp b/src/backends/cl/workloads/ClFullyConnectedWorkload.hpp index e13436eaa5..311b59498b 100644 --- a/src/backends/cl/workloads/ClFullyConnectedWorkload.hpp +++ b/src/backends/cl/workloads/ClFullyConnectedWorkload.hpp @@ -19,7 +19,8 @@ arm_compute::Status ClFullyConnectedWorkloadValidate(const TensorInfo& input, const TensorInfo& output, const TensorInfo& weights, const TensorInfo& biases, - const FullyConnectedDescriptor& descriptor); + const FullyConnectedDescriptor& descriptor, + const ActivationDescriptor* activationDescriptor = nullptr); class ClFullyConnectedWorkload : public armnn::BaseWorkload { diff --git a/src/backends/cl/workloads/ClMultiplicationWorkload.cpp b/src/backends/cl/workloads/ClMultiplicationWorkload.cpp index e9b75c3f10..46a1c4bc59 100644 --- a/src/backends/cl/workloads/ClMultiplicationWorkload.cpp +++ b/src/backends/cl/workloads/ClMultiplicationWorkload.cpp @@ -4,8 +4,12 @@ // #include "ClMultiplicationWorkload.hpp" -#include + +#include #include + +#include + #include "ClWorkloadUtils.hpp" namespace armnn @@ -13,7 +17,8 @@ namespace armnn arm_compute::Status ClMultiplicationWorkloadValidate(const TensorInfo& input0, const TensorInfo& input1, - const TensorInfo& output) + const TensorInfo& output, + const ActivationDescriptor* activationDescriptor) { const arm_compute::TensorInfo aclInput1 = armcomputetensorutils::BuildArmComputeTensorInfo(input0); const arm_compute::TensorInfo aclInput2 = armcomputetensorutils::BuildArmComputeTensorInfo(input1); @@ -23,6 +28,9 @@ arm_compute::Status ClMultiplicationWorkloadValidate(const TensorInfo& input0, arm_compute::ConvertPolicy::SATURATE : arm_compute::ConvertPolicy::WRAP; + const arm_compute::ActivationLayerInfo activationInfo = ConvertActivationDescriptorToAclActivationLayerInfo( + activationDescriptor); + // At the time of writing, configure() will fail if a rounding policy other than TO_ZERO is supplied to it, // when providing a scale of 1.0 for F32 tensors, even though the provided rounding policy appears to be // ignored for F32 tensors. 
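Note: most of the validate() overloads in this patch funnel their new trailing argument through a pair of helpers added to aclCommon/ArmComputeUtils.hpp. As a minimal sketch of their contract, inferred from the call sites in these hunks rather than copied from the patch: a null ActivationDescriptor maps to a default-constructed (disabled) arm_compute::ActivationLayerInfo, so the extra parameter is always safe to pass.

    // Sketch only; behaviour inferred from the call sites above, not verbatim source.
    // ConvertActivationFunctionToAclActivationFunction is the pre-existing helper in
    // ArmComputeUtils.hpp that maps armnn::ActivationFunction to the ACL enum.
    inline arm_compute::ActivationLayerInfo
    ConvertActivationDescriptorToAclActivationLayerInfo(const ActivationDescriptor& actDesc)
    {
        return arm_compute::ActivationLayerInfo(
            ConvertActivationFunctionToAclActivationFunction(actDesc.m_Function),
            actDesc.m_A,   // e.g. the upper bound for BoundedReLu
            actDesc.m_B);  // e.g. the lower bound for BoundedReLu
    }

    inline arm_compute::ActivationLayerInfo
    ConvertActivationDescriptorToAclActivationLayerInfo(const ActivationDescriptor* activationDescPtr)
    {
        if (activationDescPtr != nullptr)
        {
            return ConvertActivationDescriptorToAclActivationLayerInfo(*activationDescPtr);
        }
        return arm_compute::ActivationLayerInfo(); // disabled: the pre-fusion behaviour
    }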
@@ -31,7 +39,8 @@ arm_compute::Status ClMultiplicationWorkloadValidate(const TensorInfo& input0, &aclOutput, 1.0f, convertPolicy, - arm_compute::RoundingPolicy::TO_ZERO); + arm_compute::RoundingPolicy::TO_ZERO, + activationInfo); } @@ -50,13 +59,16 @@ ClMultiplicationWorkload::ClMultiplicationWorkload(const MultiplicationQueueDesc arm_compute::ConvertPolicy::SATURATE : arm_compute::ConvertPolicy::WRAP; + const arm_compute::ActivationLayerInfo activationInfo = ConvertAdditionalInfoToAclActivationLayerInfo(descriptor); + // Construct m_PixelWiseMultiplication.configure(&input0, &input1, &output, 1.0f, convertPolicy, - arm_compute::RoundingPolicy::TO_NEAREST_EVEN); + arm_compute::RoundingPolicy::TO_NEAREST_EVEN, + activationInfo); } void ClMultiplicationWorkload::Execute() const diff --git a/src/backends/cl/workloads/ClMultiplicationWorkload.hpp b/src/backends/cl/workloads/ClMultiplicationWorkload.hpp index 732bb16dcc..461449cc35 100644 --- a/src/backends/cl/workloads/ClMultiplicationWorkload.hpp +++ b/src/backends/cl/workloads/ClMultiplicationWorkload.hpp @@ -14,7 +14,8 @@ namespace armnn arm_compute::Status ClMultiplicationWorkloadValidate(const TensorInfo& input0, const TensorInfo& input1, - const TensorInfo& output); + const TensorInfo& output, + const ActivationDescriptor* activationDescriptor = nullptr); class ClMultiplicationWorkload : public BaseWorkload { diff --git a/src/backends/cl/workloads/ClSubtractionWorkload.cpp b/src/backends/cl/workloads/ClSubtractionWorkload.cpp index 38154eb4d7..c9fb556383 100644 --- a/src/backends/cl/workloads/ClSubtractionWorkload.cpp +++ b/src/backends/cl/workloads/ClSubtractionWorkload.cpp @@ -7,9 +7,11 @@ #include #include +#include #include #include "ClWorkloadUtils.hpp" +#include <armnn/ArmNN.hpp> namespace armnn { @@ -26,7 +28,10 @@ ClSubtractionWorkload::ClSubtractionWorkload(const SubtractionQueueDescriptor& d arm_compute::ICLTensor& input0 = static_cast(this->m_Data.m_Inputs[0])->GetTensor(); arm_compute::ICLTensor& input1 = static_cast(this->m_Data.m_Inputs[1])->GetTensor(); arm_compute::ICLTensor& output = static_cast(this->m_Data.m_Outputs[0])->GetTensor(); - m_Layer.configure(&input0, &input1, &output, g_AclConvertPolicy); + + const arm_compute::ActivationLayerInfo activationInfo = ConvertAdditionalInfoToAclActivationLayerInfo(descriptor); + + m_Layer.configure(&input0, &input1, &output, g_AclConvertPolicy, activationInfo); } void ClSubtractionWorkload::Execute() const @@ -37,16 +42,21 @@ void ClSubtractionWorkload::Execute() const arm_compute::Status ClSubtractionValidate(const TensorInfo& input0, const TensorInfo& input1, - const TensorInfo& output) + const TensorInfo& output, + const ActivationDescriptor* activationDescriptor) { const arm_compute::TensorInfo aclInput0Info = BuildArmComputeTensorInfo(input0); const arm_compute::TensorInfo aclInput1Info = BuildArmComputeTensorInfo(input1); const arm_compute::TensorInfo aclOutputInfo = BuildArmComputeTensorInfo(output); + const arm_compute::ActivationLayerInfo activationInfo = ConvertActivationDescriptorToAclActivationLayerInfo( + activationDescriptor); + const arm_compute::Status aclStatus = arm_compute::CLArithmeticSubtraction::validate(&aclInput0Info, &aclInput1Info, &aclOutputInfo, - g_AclConvertPolicy); + g_AclConvertPolicy, + activationInfo); return aclStatus; } diff --git a/src/backends/cl/workloads/ClSubtractionWorkload.hpp b/src/backends/cl/workloads/ClSubtractionWorkload.hpp index da6d17c6ac..9f51de645b 100644 ---
a/src/backends/cl/workloads/ClSubtractionWorkload.hpp +++ b/src/backends/cl/workloads/ClSubtractionWorkload.hpp @@ -25,5 +25,6 @@ private: arm_compute::Status ClSubtractionValidate(const TensorInfo& input0, const TensorInfo& input1, - const TensorInfo& output); + const TensorInfo& output, + const ActivationDescriptor* activationDescriptor = nullptr); } //namespace armnn diff --git a/src/backends/neon/NeonBackend.cpp b/src/backends/neon/NeonBackend.cpp index 9862ddbd70..150bc345db 100644 --- a/src/backends/neon/NeonBackend.cpp +++ b/src/backends/neon/NeonBackend.cpp @@ -11,7 +11,10 @@ #include "NeonTensorHandleFactory.hpp" #include +#include +#include +#include #include #include @@ -19,8 +22,18 @@ #include +#include "workloads/NeonAdditionWorkload.hpp" +#include "workloads/NeonBatchNormalizationWorkload.hpp" +#include "workloads/NeonConvolution2dWorkload.hpp" +#include "workloads/NeonDepthwiseConvolutionWorkload.hpp" +#include "workloads/NeonDivisionWorkload.hpp" +#include "workloads/NeonFullyConnectedWorkload.hpp" +#include "workloads/NeonMultiplicationWorkload.hpp" +#include "workloads/NeonSubtractionWorkload.hpp" + #include +#include #include namespace armnn @@ -122,7 +135,238 @@ OptimizationViews NeonBackend::OptimizeSubgraphView(const SubgraphView& subgraph { OptimizationViews optimizationViews; - optimizationViews.AddUntouchedSubgraph(SubgraphView(subgraph)); + auto it = subgraph.end(); + + while (it != subgraph.begin()) + { + --it; + Layer& base = **it; + + if ((base.GetType() == LayerType::DepthwiseConvolution2d || base.GetType() == LayerType::Convolution2d + || base.GetType() == LayerType::BatchNormalization || base.GetType() == LayerType::FullyConnected + || base.GetType() == LayerType::Addition || base.GetType() == LayerType::Multiplication + || base.GetType() == LayerType::Subtraction || base.GetType() == LayerType::Division) + && (base.GetAdditionalInformation() == nullptr)) + { + for (auto output = base.BeginOutputSlots(); output != base.EndOutputSlots(); ++output) + { + if (output->GetNumConnections() == 1) + { + for (auto&& childInput : output->GetConnections()) + { + if (childInput->GetOwningLayer().GetType() == LayerType::Activation) + { + Layer& child = childInput->GetOwningLayer(); + + auto* activationLayer = PolymorphicDowncast(&child); + + const std::string name = std::string("fused-") + child.GetName() + std::string("-into-") + + base.GetName(); + + // Get params from activation layer + ActivationDescriptor activationDesc = activationLayer->GetParameters(); + + if (base.GetType() == LayerType::Convolution2d) + { + Convolution2dLayer* baseLayer = PolymorphicDowncast(&base); + + Optional biases; + + if (baseLayer->GetParameters().m_BiasEnabled) + { + biases = GetOverriddenDataType(baseLayer->m_Bias->GetTensorInfo(), + GetOptionalBiasTypeFromWeightsType( + baseLayer->m_Weight->GetTensorInfo().GetDataType())); + } + + arm_compute::Status status = NeonConvolution2dWorkloadValidate( + baseLayer->GetInputSlot(0).GetConnectedOutputSlot()->GetTensorInfo(), + activationLayer->GetInputSlot(0).GetConnectedOutputSlot()->GetTensorInfo(), + baseLayer->GetParameters(), + baseLayer->m_Weight->GetTensorInfo(), + biases, + false, + &activationDesc); + + if (status) + { + FuseLayerWithWeightsAndBiases(optimizationViews, + baseLayer, + activationLayer, + activationDesc, + name); + } + } + else if (base.GetType() == LayerType::DepthwiseConvolution2d) + { + DepthwiseConvolution2dLayer* baseLayer = + PolymorphicDowncast(&base); + + Optional biases; + + if 
(baseLayer->GetParameters().m_BiasEnabled) + { + biases = GetOverriddenDataType(baseLayer->m_Bias->GetTensorInfo(), + GetOptionalBiasTypeFromWeightsType( + baseLayer->m_Weight->GetTensorInfo().GetDataType())); + } + + arm_compute::Status status = NeonDepthwiseConvolutionWorkloadValidate( + baseLayer->GetInputSlot(0).GetConnectedOutputSlot()->GetTensorInfo(), + activationLayer->GetInputSlot(0).GetConnectedOutputSlot()->GetTensorInfo(), + baseLayer->GetParameters(), + baseLayer->m_Weight->GetTensorInfo(), + biases, + &activationDesc); + + if (status) + { + FuseLayerWithWeightsAndBiases(optimizationViews, + baseLayer, + activationLayer, + activationDesc, + name); + } + } + else if (base.GetType() == LayerType::FullyConnected) + { + FullyConnectedLayer* baseLayer = PolymorphicDowncast(&base); + + arm_compute::Status status = NeonFullyConnectedWorkloadValidate( + baseLayer->GetInputSlot(0).GetConnectedOutputSlot()->GetTensorInfo(), + activationLayer->GetInputSlot(0).GetConnectedOutputSlot()->GetTensorInfo(), + baseLayer->m_Weight->GetTensorInfo(), + baseLayer->m_Bias->GetTensorInfo(), + baseLayer->GetParameters(), + &activationDesc); + + if (status) + { + FuseLayerWithWeightsAndBiases(optimizationViews, + baseLayer, + activationLayer, + activationDesc, + name); + } + } + else if (base.GetType() == LayerType::BatchNormalization) + { + BatchNormalizationLayer* baseLayer = + PolymorphicDowncast(&base); + + arm_compute::Status status = NeonBatchNormalizationValidate( + baseLayer->GetInputSlot(0).GetConnectedOutputSlot()->GetTensorInfo(), + activationLayer->GetInputSlot(0).GetConnectedOutputSlot()->GetTensorInfo(), + baseLayer->m_Mean->GetTensorInfo(), + baseLayer->m_Variance->GetTensorInfo(), + baseLayer->m_Beta->GetTensorInfo(), + baseLayer->m_Gamma->GetTensorInfo(), + baseLayer->GetParameters(), + &activationDesc); + + if (status) + { + BatchNormalizationLayer* replacementLayer = + FuseLayerWithParameters( + optimizationViews, + baseLayer, + activationLayer, + activationDesc, + name); + + replacementLayer->m_Beta = std::move(baseLayer->m_Beta); + replacementLayer->m_Gamma = std::move(baseLayer->m_Gamma); + replacementLayer->m_Mean = std::move(baseLayer->m_Mean); + replacementLayer->m_Variance = std::move(baseLayer->m_Variance); + } + } + else if (base.GetType() == LayerType::Addition) + { + AdditionLayer* baseLayer = PolymorphicDowncast(&base); + + arm_compute::Status status = NeonAdditionWorkloadValidate( + baseLayer->GetInputSlot(0).GetConnectedOutputSlot()->GetTensorInfo(), + baseLayer->GetInputSlot(1).GetConnectedOutputSlot()->GetTensorInfo(), + activationLayer->GetInputSlot(0).GetConnectedOutputSlot()->GetTensorInfo(), + &activationDesc); + + if (status) + { + FuseLayerWithoutParameters(optimizationViews, + baseLayer, + activationLayer, + activationDesc, + name); + } + } + else if (base.GetType() == LayerType::Division) + { + DivisionLayer* baseLayer = PolymorphicDowncast(&base); + + arm_compute::Status status = NeonDivisionWorkloadValidate( + baseLayer->GetInputSlot(0).GetConnectedOutputSlot()->GetTensorInfo(), + baseLayer->GetInputSlot(1).GetConnectedOutputSlot()->GetTensorInfo(), + activationLayer->GetInputSlot(0).GetConnectedOutputSlot()->GetTensorInfo(), + &activationDesc); + + if (status) + { + FuseLayerWithoutParameters(optimizationViews, + baseLayer, + activationLayer, + activationDesc, + name); + } + } + else if (base.GetType() == LayerType::Multiplication) + { + MultiplicationLayer* baseLayer = PolymorphicDowncast(&base); + + arm_compute::Status status = 
NeonMultiplicationWorkloadValidate( + baseLayer->GetInputSlot(0).GetConnectedOutputSlot()->GetTensorInfo(), + baseLayer->GetInputSlot(1).GetConnectedOutputSlot()->GetTensorInfo(), + activationLayer->GetInputSlot(0).GetConnectedOutputSlot()->GetTensorInfo(), + &activationDesc); + + if (status) + { + FuseLayerWithoutParameters(optimizationViews, + baseLayer, + activationLayer, + activationDesc, + name); + } + } + else if (base.GetType() == LayerType::Subtraction) + { + SubtractionLayer* baseLayer = PolymorphicDowncast(&base); + + arm_compute::Status status = NeonSubtractionWorkloadValidate( + baseLayer->GetInputSlot(0).GetConnectedOutputSlot()->GetTensorInfo(), + baseLayer->GetInputSlot(1).GetConnectedOutputSlot()->GetTensorInfo(), + activationLayer->GetInputSlot(0).GetConnectedOutputSlot()->GetTensorInfo(), + &activationDesc); + + if (status) + { + FuseLayerWithoutParameters(optimizationViews, + baseLayer, + activationLayer, + activationDesc, + name); + } + } + } + } + } + } + } + } + + if (optimizationViews.GetSubstitutions().empty()) + { + optimizationViews.AddUntouchedSubgraph(SubgraphView(subgraph)); + } return optimizationViews; } diff --git a/src/backends/neon/NeonLayerSupport.cpp b/src/backends/neon/NeonLayerSupport.cpp index 0084dbd03f..f55d1c8df6 100644 --- a/src/backends/neon/NeonLayerSupport.cpp +++ b/src/backends/neon/NeonLayerSupport.cpp @@ -167,7 +167,8 @@ bool NeonLayerSupport::IsAdditionSupported(const TensorInfo& input0, reasonIfUnsupported, input0, input1, - output); + output, + nullptr); } bool NeonLayerSupport::IsArgMinMaxSupported(const TensorInfo& input, @@ -199,7 +200,8 @@ bool NeonLayerSupport::IsBatchNormalizationSupported(const TensorInfo& input, var, beta, gamma, - descriptor); + descriptor, + nullptr); } bool NeonLayerSupport::IsBatchToSpaceNdSupported(const TensorInfo& input, @@ -345,7 +347,8 @@ bool NeonLayerSupport::IsConvolution2dSupported(const TensorInfo& input, descriptor, weights, biases, - isFastMathEnabled); + isFastMathEnabled, + nullptr); } bool NeonLayerSupport::IsDepthToSpaceSupported(const TensorInfo& input, @@ -373,7 +376,8 @@ bool NeonLayerSupport::IsDepthwiseConvolutionSupported(const TensorInfo& input, output, descriptor, weights, - biases); + biases, + nullptr); } bool NeonLayerSupport::IsDequantizeSupported(const TensorInfo& input, @@ -399,7 +403,8 @@ bool NeonLayerSupport::IsDilatedDepthwiseConvolutionSupported(const TensorInfo& output, descriptor, weights, - biases); + biases, + nullptr); } bool NeonLayerSupport::IsElementwiseUnarySupported(const TensorInfo& input, @@ -474,7 +479,8 @@ bool NeonLayerSupport::IsFullyConnectedSupported(const TensorInfo& input, output, weights, biases, - descriptor); + descriptor, + nullptr); } bool NeonLayerSupport::IsGatherSupported(const TensorInfo& input0, @@ -611,7 +617,8 @@ bool NeonLayerSupport::IsMultiplicationSupported(const TensorInfo& input0, reasonIfUnsupported, input0, input1, - output); + output, + nullptr); } bool NeonLayerSupport::IsDivisionSupported(const TensorInfo& input0, @@ -623,7 +630,8 @@ bool NeonLayerSupport::IsDivisionSupported(const TensorInfo& input0, reasonIfUnsupported, input0, input1, - output); + output, + nullptr); } bool NeonLayerSupport::IsNormalizationSupported(const TensorInfo& input, @@ -911,7 +919,8 @@ bool NeonLayerSupport::IsSubtractionSupported(const TensorInfo& input0, reasonIfUnsupported, input0, input1, - output); + output, + nullptr); } bool NeonLayerSupport::IsTransposeConvolution2dSupported(const TensorInfo& input, diff --git 
a/src/backends/neon/workloads/NeonAdditionWorkload.cpp b/src/backends/neon/workloads/NeonAdditionWorkload.cpp index cb0c8a471f..9300b317a9 100644 --- a/src/backends/neon/workloads/NeonAdditionWorkload.cpp +++ b/src/backends/neon/workloads/NeonAdditionWorkload.cpp @@ -7,6 +7,8 @@ #include "NeonWorkloadUtils.hpp" #include +#include + #include #include @@ -17,16 +19,21 @@ namespace armnn arm_compute::Status NeonAdditionWorkloadValidate(const TensorInfo& input0, const TensorInfo& input1, - const TensorInfo& output) + const TensorInfo& output, + const ActivationDescriptor* activationDescriptor) { const arm_compute::TensorInfo aclInput0 = armcomputetensorutils::BuildArmComputeTensorInfo(input0); const arm_compute::TensorInfo aclInput1 = armcomputetensorutils::BuildArmComputeTensorInfo(input1); const arm_compute::TensorInfo aclOutput = armcomputetensorutils::BuildArmComputeTensorInfo(output); + const arm_compute::ActivationLayerInfo activationInfo = ConvertActivationDescriptorToAclActivationLayerInfo( + activationDescriptor); + return arm_compute::NEArithmeticAddition::validate(&aclInput0, &aclInput1, &aclOutput, - arm_compute::ConvertPolicy::SATURATE); + arm_compute::ConvertPolicy::SATURATE, + activationInfo); } @@ -40,8 +47,10 @@ NeonAdditionWorkload::NeonAdditionWorkload(const AdditionQueueDescriptor& descri arm_compute::ITensor& input2 = PolymorphicDowncast(m_Data.m_Inputs[1])->GetTensor(); arm_compute::ITensor& output = PolymorphicDowncast(m_Data.m_Outputs[0])->GetTensor(); + const arm_compute::ActivationLayerInfo activationInfo = ConvertAdditionalInfoToAclActivationLayerInfo(descriptor); + auto layer = std::make_unique(); - layer->configure(&input1, &input2, &output, arm_compute::ConvertPolicy::SATURATE); + layer->configure(&input1, &input2, &output, arm_compute::ConvertPolicy::SATURATE, activationInfo); m_AddLayer.reset(layer.release()); } diff --git a/src/backends/neon/workloads/NeonAdditionWorkload.hpp b/src/backends/neon/workloads/NeonAdditionWorkload.hpp index 826fb1f3dd..8e43cbdb6d 100644 --- a/src/backends/neon/workloads/NeonAdditionWorkload.hpp +++ b/src/backends/neon/workloads/NeonAdditionWorkload.hpp @@ -8,6 +8,7 @@ #include #include +#include #include namespace armnn @@ -15,7 +16,8 @@ namespace armnn arm_compute::Status NeonAdditionWorkloadValidate(const TensorInfo& input0, const TensorInfo& input1, - const TensorInfo& output); + const TensorInfo& output, + const ActivationDescriptor* activationDescriptor = nullptr); class NeonAdditionWorkload : public BaseWorkload { diff --git a/src/backends/neon/workloads/NeonBatchNormalizationWorkload.cpp b/src/backends/neon/workloads/NeonBatchNormalizationWorkload.cpp index ff777dbf9b..33480faf69 100644 --- a/src/backends/neon/workloads/NeonBatchNormalizationWorkload.cpp +++ b/src/backends/neon/workloads/NeonBatchNormalizationWorkload.cpp @@ -8,7 +8,10 @@ #include "NeonWorkloadUtils.hpp" #include +#include + #include + #include #include @@ -24,7 +27,8 @@ arm_compute::Status NeonBatchNormalizationValidate(const TensorInfo& input, const TensorInfo& var, const TensorInfo& beta, const TensorInfo& gamma, - const BatchNormalizationDescriptor& descriptor) + const BatchNormalizationDescriptor& descriptor, + const ActivationDescriptor* activationDescriptor) { const arm_compute::TensorInfo aclInputInfo = armcomputetensorutils::BuildArmComputeTensorInfo(input, descriptor.m_DataLayout); @@ -39,13 +43,17 @@ arm_compute::Status NeonBatchNormalizationValidate(const TensorInfo& input, const arm_compute::TensorInfo aclGammaInfo = 
armcomputetensorutils::BuildArmComputeTensorInfo(gamma, descriptor.m_DataLayout); + const arm_compute::ActivationLayerInfo activationInfo = ConvertActivationDescriptorToAclActivationLayerInfo( + activationDescriptor); + return arm_compute::NEBatchNormalizationLayer::validate(&aclInputInfo, &aclOutputInfo, &aclMeanInfo, &aclVarInfo, &aclBetaInfo, &aclGammaInfo, - descriptor.m_Eps); + descriptor.m_Eps, + activationInfo); } NeonBatchNormalizationWorkload::NeonBatchNormalizationWorkload( @@ -73,6 +81,8 @@ NeonBatchNormalizationWorkload::NeonBatchNormalizationWorkload( m_Beta = std::make_unique(); BuildArmComputeTensor(*m_Beta, m_Data.m_Beta->GetTensorInfo()); + const arm_compute::ActivationLayerInfo activationInfo = ConvertAdditionalInfoToAclActivationLayerInfo(descriptor); + auto layer = std::make_unique(); layer->configure(&input, &output, @@ -80,7 +90,8 @@ NeonBatchNormalizationWorkload::NeonBatchNormalizationWorkload( m_Variance.get(), m_Beta.get(), m_Gamma.get(), - m_Data.m_Parameters.m_Eps); + m_Data.m_Parameters.m_Eps, + activationInfo); m_Layer.reset(layer.release()); InitializeArmComputeTensorData(*m_Mean, m_Data.m_Mean); diff --git a/src/backends/neon/workloads/NeonBatchNormalizationWorkload.hpp b/src/backends/neon/workloads/NeonBatchNormalizationWorkload.hpp index 3619ea0d73..fea778fb1c 100644 --- a/src/backends/neon/workloads/NeonBatchNormalizationWorkload.hpp +++ b/src/backends/neon/workloads/NeonBatchNormalizationWorkload.hpp @@ -21,7 +21,8 @@ arm_compute::Status NeonBatchNormalizationValidate(const TensorInfo& input, const TensorInfo& var, const TensorInfo& beta, const TensorInfo& gamma, - const BatchNormalizationDescriptor& descriptor); + const BatchNormalizationDescriptor& descriptor, + const ActivationDescriptor* activationDescriptor = nullptr); class NeonBatchNormalizationWorkload : public BaseWorkload { diff --git a/src/backends/neon/workloads/NeonConvolution2dWorkload.cpp b/src/backends/neon/workloads/NeonConvolution2dWorkload.cpp index af6f1aee78..fd8be17dfd 100644 --- a/src/backends/neon/workloads/NeonConvolution2dWorkload.cpp +++ b/src/backends/neon/workloads/NeonConvolution2dWorkload.cpp @@ -6,6 +6,7 @@ #include "NeonConvolution2dWorkload.hpp" #include +#include #include #include #include @@ -25,7 +26,8 @@ arm_compute::Status NeonConvolution2dWorkloadValidate(const TensorInfo& input, const Convolution2dDescriptor& descriptor, const TensorInfo& weights, const Optional& biases, - bool isFastMathEnabled) + bool isFastMathEnabled, + const ActivationDescriptor* activationDescriptor) { const arm_compute::TensorInfo aclInputInfo = BuildArmComputeTensorInfo(input, descriptor.m_DataLayout); const arm_compute::TensorInfo aclOutputInfo = BuildArmComputeTensorInfo(output, descriptor.m_DataLayout); @@ -47,6 +49,9 @@ arm_compute::Status NeonConvolution2dWorkloadValidate(const TensorInfo& input, arm_compute::PadStrideInfo layerInfo = BuildArmComputePadStrideInfo(descriptor); + const arm_compute::ActivationLayerInfo activationInfo = ConvertActivationDescriptorToAclActivationLayerInfo( + activationDescriptor); + return arm_compute::NEConvolutionLayer::validate(&aclInputInfo, &aclWeightsInfo, optionalAclBiasesInfo, @@ -54,7 +59,7 @@ arm_compute::Status NeonConvolution2dWorkloadValidate(const TensorInfo& input, layerInfo, arm_compute::WeightsInfo(), aclDilationInfo, - arm_compute::ActivationLayerInfo(), + activationInfo, isFastMathEnabled); } @@ -92,6 +97,8 @@ NeonConvolution2dWorkload::NeonConvolution2dWorkload( const arm_compute::Size2D aclDilationInfo = 
BuildArmComputeSize2D(m_Data.m_Parameters.m_DilationX, m_Data.m_Parameters.m_DilationY); + const arm_compute::ActivationLayerInfo activationInfo = ConvertAdditionalInfoToAclActivationLayerInfo(descriptor); + auto convolutionLayer = std::make_unique(memoryManager); convolutionLayer->configure(&input, m_KernelTensor.get(), @@ -100,7 +107,7 @@ NeonConvolution2dWorkload::NeonConvolution2dWorkload( padStrideInfo, arm_compute::WeightsInfo(), aclDilationInfo, - arm_compute::ActivationLayerInfo(), + activationInfo, isFastMathEnabled); m_ConvolutionMethod = @@ -110,7 +117,7 @@ NeonConvolution2dWorkload::NeonConvolution2dWorkload( padStrideInfo, arm_compute::WeightsInfo(), aclDilationInfo, - arm_compute::ActivationLayerInfo(), + activationInfo, isFastMathEnabled); m_ConvolutionLayer.reset(convolutionLayer.release()); diff --git a/src/backends/neon/workloads/NeonConvolution2dWorkload.hpp b/src/backends/neon/workloads/NeonConvolution2dWorkload.hpp index 860d78ba7e..4b6e58ce41 100644 --- a/src/backends/neon/workloads/NeonConvolution2dWorkload.hpp +++ b/src/backends/neon/workloads/NeonConvolution2dWorkload.hpp @@ -21,7 +21,8 @@ arm_compute::Status NeonConvolution2dWorkloadValidate(const TensorInfo& input, const Convolution2dDescriptor& descriptor, const TensorInfo& weights, const Optional& biases, - bool isFastMathEnabled = false); + bool isFastMathEnabled = false, + const ActivationDescriptor* activationDescriptor = nullptr); class NeonConvolution2dWorkload : public BaseWorkload { diff --git a/src/backends/neon/workloads/NeonDepthwiseConvolutionWorkload.cpp b/src/backends/neon/workloads/NeonDepthwiseConvolutionWorkload.cpp index a9a3c75bfd..db6bcc3ecb 100644 --- a/src/backends/neon/workloads/NeonDepthwiseConvolutionWorkload.cpp +++ b/src/backends/neon/workloads/NeonDepthwiseConvolutionWorkload.cpp @@ -10,6 +10,7 @@ #include #include +#include #include @@ -29,7 +30,8 @@ arm_compute::Status NeonDepthwiseConvolutionWorkloadValidate(const TensorInfo& i const TensorInfo& output, const DepthwiseConvolution2dDescriptor& descriptor, const TensorInfo& weights, - const Optional& biases) + const Optional& biases, + const ActivationDescriptor* activationDescriptor) { const arm_compute::TensorInfo aclInputInfo = BuildArmComputeTensorInfo(input, descriptor.m_DataLayout); const arm_compute::TensorInfo aclOutputInfo = BuildArmComputeTensorInfo(output, descriptor.m_DataLayout); @@ -59,13 +61,16 @@ arm_compute::Status NeonDepthwiseConvolutionWorkloadValidate(const TensorInfo& i const arm_compute::Size2D aclDilationInfo = BuildArmComputeSize2D( descriptor.m_DilationX,descriptor.m_DilationY); + const arm_compute::ActivationLayerInfo activationInfo = ConvertActivationDescriptorToAclActivationLayerInfo( + activationDescriptor); + return arm_compute::NEDepthwiseConvolutionLayer::validate(&aclInputInfo, &aclWeightsInfo, optionalAclBiasesInfo, &aclOutputInfo, aclPadStrideInfo, aclDepthMultiplier, - arm_compute::ActivationLayerInfo(), + activationInfo, aclDilationInfo); } @@ -116,16 +121,18 @@ NeonDepthwiseConvolutionWorkload::NeonDepthwiseConvolutionWorkload( arm_compute::PadStrideInfo padStrideInfo = BuildArmComputePadStrideInfo(m_Data.m_Parameters); + const arm_compute::ActivationLayerInfo activationInfo = ConvertAdditionalInfoToAclActivationLayerInfo(descriptor); + m_pDepthwiseConvolutionLayer = std::make_unique(); static_cast( m_pDepthwiseConvolutionLayer.get())->configure(&input, - m_KernelTensor.get(), - m_BiasTensor.get(), - &output, - padStrideInfo, - depthMultiplier, - arm_compute::ActivationLayerInfo(), - 
aclDilationInfo); + m_KernelTensor.get(), + m_BiasTensor.get(), + &output, + padStrideInfo, + depthMultiplier, + activationInfo, + aclDilationInfo); ARMNN_ASSERT(m_pDepthwiseConvolutionLayer); diff --git a/src/backends/neon/workloads/NeonDepthwiseConvolutionWorkload.hpp b/src/backends/neon/workloads/NeonDepthwiseConvolutionWorkload.hpp index 85932d3f9a..d257b91638 100644 --- a/src/backends/neon/workloads/NeonDepthwiseConvolutionWorkload.hpp +++ b/src/backends/neon/workloads/NeonDepthwiseConvolutionWorkload.hpp @@ -19,7 +19,9 @@ arm_compute::Status NeonDepthwiseConvolutionWorkloadValidate(const TensorInfo& i const TensorInfo& output, const DepthwiseConvolution2dDescriptor& descriptor, const TensorInfo& weights, - const Optional& biases); + const Optional& biases, + const ActivationDescriptor* activationDescriptor + = nullptr); class NeonDepthwiseConvolutionWorkload : public BaseWorkload { diff --git a/src/backends/neon/workloads/NeonDivisionWorkload.cpp b/src/backends/neon/workloads/NeonDivisionWorkload.cpp index fc353f136d..1a26d9510a 100644 --- a/src/backends/neon/workloads/NeonDivisionWorkload.cpp +++ b/src/backends/neon/workloads/NeonDivisionWorkload.cpp @@ -6,23 +6,31 @@ #include "NeonDivisionWorkload.hpp" #include +#include + #include + #include namespace armnn { arm_compute::Status NeonDivisionWorkloadValidate(const TensorInfo& input0, - const TensorInfo& input1, - const TensorInfo& output) + const TensorInfo& input1, + const TensorInfo& output, + const ActivationDescriptor* activationDescriptor) { const arm_compute::TensorInfo aclInput0 = armcomputetensorutils::BuildArmComputeTensorInfo(input0); const arm_compute::TensorInfo aclInput1 = armcomputetensorutils::BuildArmComputeTensorInfo(input1); const arm_compute::TensorInfo aclOutput = armcomputetensorutils::BuildArmComputeTensorInfo(output); + const arm_compute::ActivationLayerInfo activationInfo = ConvertActivationDescriptorToAclActivationLayerInfo( + activationDescriptor); + return arm_compute::NEElementwiseDivision::validate(&aclInput0, - &aclInput1, - &aclOutput); + &aclInput1, + &aclOutput, + activationInfo); } NeonDivisionWorkload::NeonDivisionWorkload(const DivisionQueueDescriptor& descriptor, @@ -35,7 +43,9 @@ NeonDivisionWorkload::NeonDivisionWorkload(const DivisionQueueDescriptor& descri arm_compute::ITensor& input1 = PolymorphicDowncast(m_Data.m_Inputs[1])->GetTensor(); arm_compute::ITensor& output = PolymorphicDowncast(m_Data.m_Outputs[0])->GetTensor(); - m_DivLayer.configure(&input0, &input1, &output); + const arm_compute::ActivationLayerInfo activationInfo = ConvertAdditionalInfoToAclActivationLayerInfo(descriptor); + + m_DivLayer.configure(&input0, &input1, &output, activationInfo); } void NeonDivisionWorkload::Execute() const diff --git a/src/backends/neon/workloads/NeonDivisionWorkload.hpp b/src/backends/neon/workloads/NeonDivisionWorkload.hpp index 2405d9a4ab..fffe02fc00 100644 --- a/src/backends/neon/workloads/NeonDivisionWorkload.hpp +++ b/src/backends/neon/workloads/NeonDivisionWorkload.hpp @@ -13,8 +13,9 @@ namespace armnn { arm_compute::Status NeonDivisionWorkloadValidate(const TensorInfo& input0, - const TensorInfo& input1, - const TensorInfo& output); + const TensorInfo& input1, + const TensorInfo& output, + const ActivationDescriptor* activationDescriptor = nullptr); class NeonDivisionWorkload : public BaseWorkload { diff --git a/src/backends/neon/workloads/NeonFullyConnectedWorkload.cpp b/src/backends/neon/workloads/NeonFullyConnectedWorkload.cpp index e808c60c0c..31489a0c32 100644 --- 
a/src/backends/neon/workloads/NeonFullyConnectedWorkload.cpp +++ b/src/backends/neon/workloads/NeonFullyConnectedWorkload.cpp @@ -6,9 +6,12 @@ #include "NeonFullyConnectedWorkload.hpp" #include "NeonWorkloadUtils.hpp" + #include #include + #include + #include #include @@ -21,7 +24,8 @@ arm_compute::Status NeonFullyConnectedWorkloadValidate(const TensorInfo& input, const TensorInfo& output, const TensorInfo& weights, const TensorInfo& biases, - const FullyConnectedDescriptor& descriptor) + const FullyConnectedDescriptor& descriptor, + const ActivationDescriptor* activationDescriptor) { const arm_compute::TensorInfo aclInput = BuildArmComputeTensorInfo(input); const arm_compute::TensorInfo aclOutput = BuildArmComputeTensorInfo(output); @@ -36,8 +40,7 @@ arm_compute::Status NeonFullyConnectedWorkloadValidate(const TensorInfo& input, } const arm_compute::FullyConnectedLayerInfo fullyConnectedLayerInfo = - ConvertFullyConnectedDescriptorToAclFullyConnectedLayerInfo(descriptor); - + ConvertFullyConnectedDescriptorToAclFullyConnectedLayerInfo(descriptor, activationDescriptor); return arm_compute::NEFullyConnectedLayer::validate(&aclInput, &aclWeights, @@ -64,9 +67,10 @@ NeonFullyConnectedWorkload::NeonFullyConnectedWorkload(const FullyConnectedQueue BuildArmComputeTensor(*m_BiasesTensor, m_Data.m_Bias->GetTensorInfo()); } - // Construct - arm_compute::FullyConnectedLayerInfo fc_info; - fc_info.transpose_weights = m_Data.m_Parameters.m_TransposeWeightMatrix; + const arm_compute::ActivationLayerInfo activationInfo = ConvertAdditionalInfoToAclActivationLayerInfo(descriptor); + + arm_compute::FullyConnectedLayerInfo fc_info = + ConvertFullyConnectedDescriptorToAclFullyConnectedLayerInfo(descriptor.m_Parameters, activationInfo); auto layer = std::make_unique(memoryManager); layer->configure(&input, m_WeightsTensor.get(), m_BiasesTensor.get(), &output, fc_info); diff --git a/src/backends/neon/workloads/NeonFullyConnectedWorkload.hpp b/src/backends/neon/workloads/NeonFullyConnectedWorkload.hpp index 1cd8be109a..8dc7fdcd6c 100644 --- a/src/backends/neon/workloads/NeonFullyConnectedWorkload.hpp +++ b/src/backends/neon/workloads/NeonFullyConnectedWorkload.hpp @@ -21,7 +21,8 @@ arm_compute::Status NeonFullyConnectedWorkloadValidate(const TensorInfo& input, const TensorInfo& output, const TensorInfo& weights, const TensorInfo& biases, - const FullyConnectedDescriptor& descriptor); + const FullyConnectedDescriptor& descriptor, + const ActivationDescriptor* activationDescriptor = nullptr); class NeonFullyConnectedWorkload : public BaseWorkload { diff --git a/src/backends/neon/workloads/NeonMultiplicationWorkload.cpp b/src/backends/neon/workloads/NeonMultiplicationWorkload.cpp index 6f78b8eacc..e4ed195922 100644 --- a/src/backends/neon/workloads/NeonMultiplicationWorkload.cpp +++ b/src/backends/neon/workloads/NeonMultiplicationWorkload.cpp @@ -7,6 +7,8 @@ #include "NeonWorkloadUtils.hpp" +#include + #include #include @@ -16,7 +18,8 @@ namespace armnn arm_compute::Status NeonMultiplicationWorkloadValidate(const TensorInfo& input0, const TensorInfo& input1, - const TensorInfo& output) + const TensorInfo& output, + const ActivationDescriptor* activationDescriptor) { const arm_compute::TensorInfo aclInput1 = armcomputetensorutils::BuildArmComputeTensorInfo(input0); const arm_compute::TensorInfo aclInput2 = armcomputetensorutils::BuildArmComputeTensorInfo(input1); @@ -26,6 +29,9 @@ arm_compute::Status NeonMultiplicationWorkloadValidate(const TensorInfo& input0, arm_compute::ConvertPolicy::SATURATE : 
arm_compute::ConvertPolicy::WRAP; + const arm_compute::ActivationLayerInfo activationInfo = ConvertActivationDescriptorToAclActivationLayerInfo( + activationDescriptor); + // At the time of writing, configure() will fail if a rounding policy other than TO_ZERO is supplied to it, // when providing a scale of 1.0 for F32 tensors, even though the provided rounding policy appears to be // ignored for F32 tensors. @@ -34,7 +40,8 @@ arm_compute::Status NeonMultiplicationWorkloadValidate(const TensorInfo& input0, &aclOutput, 1.0f, convertPolicy, - arm_compute::RoundingPolicy::TO_ZERO); + arm_compute::RoundingPolicy::TO_ZERO, + activationInfo); } NeonMultiplicationWorkload::NeonMultiplicationWorkload(const MultiplicationQueueDescriptor& descriptor, @@ -52,6 +59,8 @@ NeonMultiplicationWorkload::NeonMultiplicationWorkload(const MultiplicationQueue arm_compute::ConvertPolicy::SATURATE : arm_compute::ConvertPolicy::WRAP; + const arm_compute::ActivationLayerInfo activationInfo = ConvertAdditionalInfoToAclActivationLayerInfo(descriptor); + // At the time of writing, configure() will fail if a rounding policy other than TO_ZERO is supplied to it, // when providing a scale of 1.0 for F32 tensors, even though the provided rounding policy appears to be // ignored for F32 tensors. @@ -61,7 +70,8 @@ NeonMultiplicationWorkload::NeonMultiplicationWorkload(const MultiplicationQueue &output, 1.0f, convertPolicy, - arm_compute::RoundingPolicy::TO_ZERO); + arm_compute::RoundingPolicy::TO_ZERO, + activationInfo); m_PixelWiseMultiplication.reset(layer.release()); } diff --git a/src/backends/neon/workloads/NeonMultiplicationWorkload.hpp b/src/backends/neon/workloads/NeonMultiplicationWorkload.hpp index bfbaf776c1..d2bcd04482 100644 --- a/src/backends/neon/workloads/NeonMultiplicationWorkload.hpp +++ b/src/backends/neon/workloads/NeonMultiplicationWorkload.hpp @@ -8,6 +8,7 @@ #include #include +#include #include #include @@ -16,7 +17,8 @@ namespace armnn { arm_compute::Status NeonMultiplicationWorkloadValidate(const TensorInfo& input0, const TensorInfo& input1, - const TensorInfo& output); + const TensorInfo& output, + const ActivationDescriptor* activationDescriptor = nullptr); class NeonMultiplicationWorkload : public BaseWorkload { diff --git a/src/backends/neon/workloads/NeonSubtractionWorkload.cpp b/src/backends/neon/workloads/NeonSubtractionWorkload.cpp index ccc2bfe58b..21f0f6fa41 100644 --- a/src/backends/neon/workloads/NeonSubtractionWorkload.cpp +++ b/src/backends/neon/workloads/NeonSubtractionWorkload.cpp @@ -6,8 +6,12 @@ #include "NeonSubtractionWorkload.hpp" #include "NeonWorkloadUtils.hpp" + #include +#include + #include + #include #include @@ -17,16 +21,21 @@ namespace armnn arm_compute::Status NeonSubtractionWorkloadValidate(const TensorInfo& input0, const TensorInfo& input1, - const TensorInfo& output) + const TensorInfo& output, + const ActivationDescriptor* activationDescriptor) { const arm_compute::TensorInfo aclInput0 = armcomputetensorutils::BuildArmComputeTensorInfo(input0); const arm_compute::TensorInfo aclInput1 = armcomputetensorutils::BuildArmComputeTensorInfo(input1); const arm_compute::TensorInfo aclOutput = armcomputetensorutils::BuildArmComputeTensorInfo(output); + const arm_compute::ActivationLayerInfo activationInfo = ConvertActivationDescriptorToAclActivationLayerInfo( + activationDescriptor); + return arm_compute::NEArithmeticSubtraction::validate(&aclInput0, &aclInput1, &aclOutput, - arm_compute::ConvertPolicy::SATURATE); + arm_compute::ConvertPolicy::SATURATE, + activationInfo); } 
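Note: on the configure() side, each workload constructor recovers the fused activation from its queue descriptor instead of taking an extra argument. A plausible shape for ConvertAdditionalInfoToAclActivationLayerInfo, assuming QueueDescriptor exposes a typed GetAdditionalInformation<T>() accessor over the object attached by the backend optimizer via Layer::SetAdditionalInformation (names assumed, not verbatim):

    // Sketch only; the accessor is an assumption based on the GetAdditionalInformation()
    // check visible in the NeonBackend.cpp hunk above.
    inline arm_compute::ActivationLayerInfo
    ConvertAdditionalInfoToAclActivationLayerInfo(const QueueDescriptor& queueDescriptor)
    {
        // Returns nullptr when no Activation layer was fused into this workload.
        const ActivationDescriptor* activationDescPtr =
            queueDescriptor.GetAdditionalInformation<ActivationDescriptor>();
        return ConvertActivationDescriptorToAclActivationLayerInfo(activationDescPtr);
    }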
NeonSubtractionWorkload::NeonSubtractionWorkload(const SubtractionQueueDescriptor& descriptor, @@ -39,8 +48,10 @@ NeonSubtractionWorkload::NeonSubtractionWorkload(const SubtractionQueueDescripto arm_compute::ITensor& input2 = PolymorphicDowncast(m_Data.m_Inputs[1])->GetTensor(); arm_compute::ITensor& output = PolymorphicDowncast(m_Data.m_Outputs[0])->GetTensor(); + const arm_compute::ActivationLayerInfo activationInfo = ConvertAdditionalInfoToAclActivationLayerInfo(descriptor); + auto layer = std::make_unique(); - layer->configure(&input1, &input2, &output, arm_compute::ConvertPolicy::SATURATE); + layer->configure(&input1, &input2, &output, arm_compute::ConvertPolicy::SATURATE, activationInfo); m_SubLayer.reset(layer.release()); } diff --git a/src/backends/neon/workloads/NeonSubtractionWorkload.hpp b/src/backends/neon/workloads/NeonSubtractionWorkload.hpp index 3326f8bf4a..19d0811a18 100644 --- a/src/backends/neon/workloads/NeonSubtractionWorkload.hpp +++ b/src/backends/neon/workloads/NeonSubtractionWorkload.hpp @@ -8,6 +8,7 @@ #include #include +#include #include #include @@ -17,7 +18,8 @@ namespace armnn arm_compute::Status NeonSubtractionWorkloadValidate(const TensorInfo& input0, const TensorInfo& input1, - const TensorInfo& output); + const TensorInfo& output, + const ActivationDescriptor* activationDescriptor = nullptr); class NeonSubtractionWorkload : public BaseWorkload { diff --git a/src/profiling/test/ProfilingTestUtils.cpp b/src/profiling/test/ProfilingTestUtils.cpp index 09639bfae7..93d0b10d4b 100644 --- a/src/profiling/test/ProfilingTestUtils.cpp +++ b/src/profiling/test/ProfilingTestUtils.cpp @@ -413,20 +413,20 @@ void VerifyPostOptimisationStructureTestImpl(armnn::BackendId backendId) conv2dDesc.m_BiasEnabled = true; IConnectableLayer* conv2d = net->AddConvolution2dLayer(conv2dDesc, weights, optionalBiases); - // Activation layer - armnn::ActivationDescriptor activationDesc; - armnn::IConnectableLayer* const activation = net->AddActivationLayer(activationDesc, "activation"); + // Abs layer + armnn::ElementwiseUnaryDescriptor absDesc; + armnn::IConnectableLayer* const abs = net->AddElementwiseUnaryLayer(absDesc, "abs"); // Output layer IConnectableLayer* output = net->AddOutputLayer(0, "output"); input->GetOutputSlot(0).Connect(conv2d->GetInputSlot(0)); - conv2d->GetOutputSlot(0).Connect(activation->GetInputSlot(0)); - activation->GetOutputSlot(0).Connect(output->GetInputSlot(0)); + conv2d->GetOutputSlot(0).Connect(abs->GetInputSlot(0)); + abs->GetOutputSlot(0).Connect(output->GetInputSlot(0)); input->GetOutputSlot(0).SetTensorInfo(inputInfo); conv2d->GetOutputSlot(0).SetTensorInfo(outputInfo); - activation->GetOutputSlot(0).SetTensorInfo(outputInfo); + abs->GetOutputSlot(0).SetTensorInfo(outputInfo); // optimize the network std::vector backends = { backendId }; @@ -633,70 +633,70 @@ void VerifyPostOptimisationStructureTestImpl(armnn::BackendId backendId) offset); BOOST_TEST_MESSAGE("CONV2D LAYER - WORKLOAD CHILD RELATIONSHIP OK"); - // Activation layer - // Activation layer entity - VerifyTimelineEntityBinaryPacketData(activation->GetGuid(), readableData, offset); - BOOST_TEST_MESSAGE("ACTIVATION ENTITY OK"); + // Abs layer + // Abs layer entity + VerifyTimelineEntityBinaryPacketData(abs->GetGuid(), readableData, offset); + BOOST_TEST_MESSAGE("ABS ENTITY OK"); // Name entity - ProfilingGuid activationLabelGuid = VerifyTimelineLabelBinaryPacketData( - EmptyOptional(), "activation", readableData, offset); - BOOST_TEST_MESSAGE("ACTIVATION NAME LABEL OK"); + ProfilingGuid 
absLabelGuid = VerifyTimelineLabelBinaryPacketData( + EmptyOptional(), "abs", readableData, offset); + BOOST_TEST_MESSAGE("ABS NAME LABEL OK"); // Entity - Name relationship VerifyTimelineRelationshipBinaryPacketData(ProfilingRelationshipType::LabelLink, EmptyOptional(), - activation->GetGuid(), - activationLabelGuid, + abs->GetGuid(), + absLabelGuid, LabelsAndEventClasses::NAME_GUID, readableData, offset); - BOOST_TEST_MESSAGE("ACTIVATION LAYER - NAME RELATIONSHIP OK"); + BOOST_TEST_MESSAGE("ABS LAYER - NAME RELATIONSHIP OK"); // Entity - Type relationship VerifyTimelineRelationshipBinaryPacketData(ProfilingRelationshipType::LabelLink, EmptyOptional(), - activation->GetGuid(), + abs->GetGuid(), LabelsAndEventClasses::LAYER_GUID, LabelsAndEventClasses::TYPE_GUID, readableData, offset); - BOOST_TEST_MESSAGE("ACTIVATION LAYER TYPE RELATIONSHIP OK"); + BOOST_TEST_MESSAGE("ABS LAYER TYPE RELATIONSHIP OK"); - // Network - Activation layer relationship + // Network - Abs layer relationship VerifyTimelineRelationshipBinaryPacketData(ProfilingRelationshipType::RetentionLink, EmptyOptional(), optNetGuid, - activation->GetGuid(), + abs->GetGuid(), LabelsAndEventClasses::CHILD_GUID, readableData, offset); - BOOST_TEST_MESSAGE("NETWORK - ACTIVATION LAYER CHILD RELATIONSHIP OK"); + BOOST_TEST_MESSAGE("NETWORK - ABS LAYER CHILD RELATIONSHIP OK"); - // Conv2d layer - Activation layer relationship + // Conv2d layer - Abs layer relationship VerifyTimelineRelationshipBinaryPacketData(ProfilingRelationshipType::RetentionLink, EmptyOptional(), conv2d->GetGuid(), - activation->GetGuid(), + abs->GetGuid(), LabelsAndEventClasses::CONNECTION_GUID, readableData, offset); - BOOST_TEST_MESSAGE("CONV2D LAYER - ACTIVATION LAYER CONNECTION OK"); + BOOST_TEST_MESSAGE("CONV2D LAYER - ABS LAYER CONNECTION OK"); - // Activation workload - // Activation workload entity - ProfilingGuid activationWorkloadGuid = VerifyTimelineEntityBinaryPacketData(EmptyOptional(), readableData, offset); - BOOST_TEST_MESSAGE("ACTIVATION WORKLOAD ENTITY OK"); + // Abs workload + // Abs workload entity + ProfilingGuid absWorkloadGuid = VerifyTimelineEntityBinaryPacketData(EmptyOptional(), readableData, offset); + BOOST_TEST_MESSAGE("ABS WORKLOAD ENTITY OK"); // Entity - Type relationship VerifyTimelineRelationshipBinaryPacketData(ProfilingRelationshipType::LabelLink, EmptyOptional(), - activationWorkloadGuid, + absWorkloadGuid, LabelsAndEventClasses::WORKLOAD_GUID, LabelsAndEventClasses::TYPE_GUID, readableData, offset); - BOOST_TEST_MESSAGE("ACTIVATION WORKLAD TYPE RELATIONSHIP OK"); + BOOST_TEST_MESSAGE("ABS WORKLOAD TYPE RELATIONSHIP OK"); // BackendId entity VerifyTimelineLabelBinaryPacketData(EmptyOptional(), backendId.Get(), readableData, offset); @@ -705,22 +705,22 @@ void VerifyPostOptimisationStructureTestImpl(armnn::BackendId backendId) // Entity - BackendId relationship VerifyTimelineRelationshipBinaryPacketData(ProfilingRelationshipType::LabelLink, EmptyOptional(), - activationWorkloadGuid, + absWorkloadGuid, backendIdLabelGuid, LabelsAndEventClasses::BACKENDID_GUID, readableData, offset); - BOOST_TEST_MESSAGE("ACTIVATION WORKLOAD BACKEND ID RELATIONSHIP OK"); + BOOST_TEST_MESSAGE("ABS WORKLOAD BACKEND ID RELATIONSHIP OK"); - // Activation layer - Activation workload relationship + // Abs layer - Abs workload relationship VerifyTimelineRelationshipBinaryPacketData(ProfilingRelationshipType::RetentionLink, EmptyOptional(), - activation->GetGuid(), - activationWorkloadGuid, + abs->GetGuid(), + absWorkloadGuid,
                                                LabelsAndEventClasses::CHILD_GUID,
                                                readableData,
                                                offset);
-    BOOST_TEST_MESSAGE("ACTIVATION LAYER - WORKLOAD CHILD RELATIONSHIP OK");
+    BOOST_TEST_MESSAGE("ABS LAYER - WORKLOAD CHILD RELATIONSHIP OK");
 
     // Output layer
     // Output layer entity
@@ -761,15 +761,15 @@ void VerifyPostOptimisationStructureTestImpl(armnn::BackendId backendId)
                                                offset);
     BOOST_TEST_MESSAGE("NETWORK - OUTPUT LAYER CHILD RELATIONSHIP OK");
 
-    // Activation layer - Output layer relationship
+    // Abs layer - Output layer relationship
     VerifyTimelineRelationshipBinaryPacketData(ProfilingRelationshipType::RetentionLink,
                                                EmptyOptional(),
-                                               activation->GetGuid(),
+                                               abs->GetGuid(),
                                                output->GetGuid(),
                                                LabelsAndEventClasses::CONNECTION_GUID,
                                                readableData,
                                                offset);
-    BOOST_TEST_MESSAGE("ACTIVATION LAYER - OUTPUT LAYER CONNECTION OK");
+    BOOST_TEST_MESSAGE("ABS LAYER - OUTPUT LAYER CONNECTION OK");
 
     bufferManager.MarkRead(readableBuffer);
 
@@ -1100,73 +1100,73 @@ void VerifyPostOptimisationStructureTestImpl(armnn::BackendId backendId)
                                                offset);
     BOOST_TEST_MESSAGE("CONV2D WORKLOAD EXECUTION END OF LIFE RELATIONSHIP OK");
 
-    // Activation workload execution
-    // Activation workload execution entity
-    ProfilingGuid activationWorkloadExecutionGuid = VerifyTimelineEntityBinaryPacketData(
+    // Abs workload execution
+    // Abs workload execution entity
+    ProfilingGuid absWorkloadExecutionGuid = VerifyTimelineEntityBinaryPacketData(
         EmptyOptional(), readableData, offset);
-    BOOST_TEST_MESSAGE("ACTIVATION WORKLOAD EXECUTION ENTITY OK");
+    BOOST_TEST_MESSAGE("ABS WORKLOAD EXECUTION ENTITY OK");
 
     // Entity - Type relationship
     VerifyTimelineRelationshipBinaryPacketData(ProfilingRelationshipType::LabelLink,
                                                EmptyOptional(),
-                                               activationWorkloadExecutionGuid,
+                                               absWorkloadExecutionGuid,
                                                LabelsAndEventClasses::WORKLOAD_EXECUTION_GUID,
                                                LabelsAndEventClasses::TYPE_GUID,
                                                readableData,
                                                offset);
-    BOOST_TEST_MESSAGE("ACTIVATION WORKLOAD EXECUTION TYPE RELATIONSHIP OK");
+    BOOST_TEST_MESSAGE("ABS WORKLOAD EXECUTION TYPE RELATIONSHIP OK");
 
     // Inference - Workload execution relationship
    VerifyTimelineRelationshipBinaryPacketData(ProfilingRelationshipType::RetentionLink,
                                                EmptyOptional(),
                                                inferenceGuid,
-                                               activationWorkloadExecutionGuid,
+                                               absWorkloadExecutionGuid,
                                                LabelsAndEventClasses::CHILD_GUID,
                                                readableData,
                                                offset);
-    BOOST_TEST_MESSAGE("INFERENCE - ACTIVATION WORKLOAD EXECUTION CHILD RELATIONSHIP OK");
+    BOOST_TEST_MESSAGE("INFERENCE - ABS WORKLOAD EXECUTION CHILD RELATIONSHIP OK");
 
     // Workload - Workload execution relationship
     VerifyTimelineRelationshipBinaryPacketData(ProfilingRelationshipType::RetentionLink,
                                                EmptyOptional(),
-                                               activationWorkloadGuid,
-                                               activationWorkloadExecutionGuid,
+                                               absWorkloadGuid,
+                                               absWorkloadExecutionGuid,
                                                LabelsAndEventClasses::EXECUTION_OF_GUID,
                                                readableData,
                                                offset);
-    BOOST_TEST_MESSAGE("ACTIVATION WORKLOAD - ACTIVATION WORKLOAD EXECUTION RELATIONSHIP OK");
+    BOOST_TEST_MESSAGE("ABS WORKLOAD - ABS WORKLOAD EXECUTION RELATIONSHIP OK");
 
-    // Start Activation workload execution life
+    // Start Abs workload execution life
     // Event packet - timeline, threadId, eventGuid
-    ProfilingGuid activationWorkloadExecutionSOLEventGuid = VerifyTimelineEventBinaryPacket(
+    ProfilingGuid absWorkloadExecutionSOLEventGuid = VerifyTimelineEventBinaryPacket(
         EmptyOptional(), EmptyOptional(), EmptyOptional(), readableData, offset);
-    BOOST_TEST_MESSAGE("ACTIVATION WORKLOAD EXECUTION START OF LIFE EVENT OK");
+    BOOST_TEST_MESSAGE("ABS WORKLOAD EXECUTION START OF LIFE EVENT OK");
 
-    // Activation workload execution - event relationship
+    // Abs workload execution - event relationship
     VerifyTimelineRelationshipBinaryPacketData(ProfilingRelationshipType::ExecutionLink,
                                                EmptyOptional(),
-                                               activationWorkloadExecutionGuid,
-                                               activationWorkloadExecutionSOLEventGuid,
+                                               absWorkloadExecutionGuid,
+                                               absWorkloadExecutionSOLEventGuid,
                                                LabelsAndEventClasses::ARMNN_PROFILING_SOL_EVENT_CLASS,
                                                readableData,
                                                offset);
-    BOOST_TEST_MESSAGE("ACTIVATION WORKLOAD EXECUTION START OF LIFE RELATIONSHIP OK");
+    BOOST_TEST_MESSAGE("ABS WORKLOAD EXECUTION START OF LIFE RELATIONSHIP OK");
 
-    // End of Activation workload execution life
+    // End of Abs workload execution life
     // Event packet - timeline, threadId, eventGuid
-    ProfilingGuid activationWorkloadExecutionEOLEventGuid = VerifyTimelineEventBinaryPacket(
+    ProfilingGuid absWorkloadExecutionEOLEventGuid = VerifyTimelineEventBinaryPacket(
         EmptyOptional(), EmptyOptional(), EmptyOptional(), readableData, offset);
-    BOOST_TEST_MESSAGE("ACTIVATION WORKLOAD EXECUTION END OF LIFE EVENT OK");
+    BOOST_TEST_MESSAGE("ABS WORKLOAD EXECUTION END OF LIFE EVENT OK");
 
-    // Activation workload execution - event relationship
+    // Abs workload execution - event relationship
     VerifyTimelineRelationshipBinaryPacketData(ProfilingRelationshipType::ExecutionLink,
                                                EmptyOptional(),
-                                               activationWorkloadExecutionGuid,
-                                               activationWorkloadExecutionEOLEventGuid,
+                                               absWorkloadExecutionGuid,
+                                               absWorkloadExecutionEOLEventGuid,
                                                LabelsAndEventClasses::ARMNN_PROFILING_EOL_EVENT_CLASS,
                                                readableData,
                                                offset);
-    BOOST_TEST_MESSAGE("ACTIVATION WORKLOAD EXECUTION END OF LIFE RELATIONSHIP OK");
+    BOOST_TEST_MESSAGE("ABS WORKLOAD EXECUTION END OF LIFE RELATIONSHIP OK");
 
     // Output workload execution
     // Output workload execution entity
-- 
cgit v1.2.1
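
For review context, here is a minimal sketch (not part of the patch) of how the new optional activationDescriptor parameter on the Validate functions is meant to be used: the backend asks the Validate function whether NEON can fold an activation into the arithmetic kernel before committing to a fused workload. The wrapper function, include path, and tensor arguments below are illustrative assumptions; only NeonSubtractionWorkloadValidate and ActivationDescriptor come from the patch itself.

    // C++ sketch: probing for Sub + ReLu fusion support (hypothetical helper).
    #include <armnn/Descriptors.hpp>
    #include <armnn/Tensor.hpp>
    #include "NeonSubtractionWorkload.hpp" // illustrative include path

    bool CanFuseSubtractionWithReLu(const armnn::TensorInfo& input0,
                                    const armnn::TensorInfo& input1,
                                    const armnn::TensorInfo& output)
    {
        // Describe the activation we would like folded into the kernel.
        armnn::ActivationDescriptor reluDesc;
        reluDesc.m_Function = armnn::ActivationFunction::ReLu;

        // Passing a descriptor makes Validate check the fused configuration;
        // the default nullptr keeps the pre-patch, unfused behaviour.
        arm_compute::Status status =
            armnn::NeonSubtractionWorkloadValidate(input0, input1, output, &reluDesc);

        return status.error_code() == arm_compute::ErrorCode::OK;
    }

This is the shape of the check the new ClBackend/NeonBackend OptimizeSubgraphView paths perform before collapsing an arithmetic layer and the activation that follows it into a single workload.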