diff options
author | Tracy Narine <tracy.narine@arm.com> | 2023-09-20 14:19:07 +0100 |
---|---|---|
committer | Tracy Narine <tracy.narine@arm.com> | 2023-09-28 14:25:16 +0100 |
commit | 6440ce89abb06e090d2b3cf91bafc14277072475 (patch) | |
tree | c55682891a0f01f3edbf5dad58720ded7af3fc64 | |
parent | 9a418d850333119e219fb05addc57b56cdc60a7e (diff) | |
download | armnn-6440ce89abb06e090d2b3cf91bafc14277072475.tar.gz |
IVGCVSW-7504 Create a backend specific optimization to fuse ADD+MUL+Add+(Activation) in CpuAcc
* Adding CpuAcc backend optimization to fuse add+mul+add into one layer
* Tests added/enhanced
* Also added optional extended parameter to Graph::Print()
and throw macros that could be used in place of assert
Signed-off-by: Tracy Narine <tracy.narine@arm.com>
Signed-off-by: Teresa Charlin <teresa.charlinreyes@arm.com>
Change-Id: I5f8d094b969a130d8c2c7b4da07426313a9fea76
-rw-r--r-- | CMakeLists.txt | 1 | ||||
-rw-r--r-- | include/armnn/Exceptions.hpp | 12 | ||||
-rw-r--r-- | src/armnn/Graph.cpp | 23 | ||||
-rw-r--r-- | src/armnn/Graph.hpp | 4 | ||||
-rw-r--r-- | src/armnn/test/optimizations/AddMulAddTests.cpp | 311 | ||||
-rw-r--r-- | src/backends/aclCommon/ArmComputeSubgraphUtils.hpp | 21 | ||||
-rw-r--r-- | src/backends/backendsCommon/SubgraphUtils.hpp | 160 | ||||
-rw-r--r-- | src/backends/backendsCommon/test/layerTests/AddMulAddTestImpl.hpp | 18 | ||||
-rw-r--r-- | src/backends/neon/CMakeLists.txt | 1 | ||||
-rw-r--r-- | src/backends/neon/NeonBackend.cpp | 82 | ||||
-rw-r--r-- | src/backends/neon/NeonBackendOptimizationUtils.hpp | 215 | ||||
-rw-r--r-- | src/backends/neon/test/NeonLayerTests.cpp | 2 |
12 files changed, 840 insertions, 10 deletions
diff --git a/CMakeLists.txt b/CMakeLists.txt index a89e3fd4f7..04b71513b0 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -597,6 +597,7 @@ if(BUILD_UNIT_TESTS) src/armnn/test/ObservableTest.cpp src/armnn/test/OptimizerTests.cpp src/armnn/test/optimizations/AddBroadcastReshapeLayerTests.cpp + src/armnn/test/optimizations/AddMulAddTests.cpp src/armnn/test/optimizations/BroadcastToTests.cpp src/armnn/test/optimizations/ConvertConstDequantisationLayersToConstLayersTest.cpp src/armnn/test/optimizations/ConvertConstPermuteLayersToConstLayersTest.cpp diff --git a/include/armnn/Exceptions.hpp b/include/armnn/Exceptions.hpp index 19b7f87e5a..1fa7083d5a 100644 --- a/include/armnn/Exceptions.hpp +++ b/include/armnn/Exceptions.hpp @@ -1,5 +1,5 @@ // -// Copyright © 2017 Arm Ltd. All rights reserved. +// Copyright © 2017-2023 Arm Ltd. All rights reserved. // SPDX-License-Identifier: MIT // #pragma once @@ -201,3 +201,13 @@ void ConditionalThrowIfNotEqual(const std::string& message, } // namespace armnn #define CHECK_LOCATION() armnn::CheckLocation(__func__, __FILE__, __LINE__) + +// Use to throw rather than assert +#define ARMNN_THROW_MSG_IF_FALSE(_cond, _except, _str) \ + do { if (!(static_cast<bool>(_cond))) {throw _except(_str);} } while(0) +#define ARMNN_THROW_IF_FALSE(_cond, _except) \ + ARMNN_THROW_MSG_IF_FALSE(_cond, _except, #_cond) +#define ARMNN_THROW_INVALIDARG_MSG_IF_FALSE(_cond, _str) \ + ARMNN_THROW_MSG_IF_FALSE(_cond, armnn::InvalidArgumentException, _str) +#define ARMNN_THROW_INVALIDARG_IF_FALSE(_cond) \ + ARMNN_THROW_MSG_IF_FALSE(_cond, armnn::InvalidArgumentException, #_cond) diff --git a/src/armnn/Graph.cpp b/src/armnn/Graph.cpp index cf6f20f82b..f7fbba783e 100644 --- a/src/armnn/Graph.cpp +++ b/src/armnn/Graph.cpp @@ -65,7 +65,7 @@ Graph::Graph(const Graph& other) } } -Status Graph::Print() const +Status Graph::Print(bool extended) const { if (m_Layers.empty()) { @@ -80,8 +80,15 @@ Status Graph::Print() const auto numInputSlots = it->GetNumInputSlots(); 
auto numOutputSlots = it->GetNumOutputSlots(); + std::string guid; + if (extended) + { + guid += ":"; + guid += std::to_string(it->GetGuid()); + } ARMNN_LOG(info) << it->GetName() << ":" << GetLayerTypeAsCString(it->GetType()) << ":" << it->GetBackendId().Get() + << guid << " has " << numInputSlots << " input slots" << " and " << numOutputSlots << " output slots."; @@ -97,6 +104,13 @@ Status Graph::Print() const message << inputTensorShape[dim] << ","; } message << " ]"; + if (extended) + { + message << " Scale: " << i.GetConnectedOutputSlot()->GetTensorInfo().GetQuantizationScale(); + message << " Offset: " << i.GetConnectedOutputSlot()->GetTensorInfo().GetQuantizationOffset(); + message << " The input slot is connected to: "; + message << i.GetConnectedOutputSlot()->GetOwningIConnectableLayer().GetGuid(); + } ARMNN_LOG(info) << message.str(); } @@ -113,6 +127,13 @@ Status Graph::Print() const message << outputTensorShape[dim] << ","; } message << " ]"; + if (extended) + { + message << " Scale: " << layer->GetOutputSlots()[i].GetTensorInfo().GetQuantizationScale(); + message << " Offset: " << layer->GetOutputSlots()[i].GetTensorInfo().GetQuantizationOffset(); + message << " The output slot is connected to: "; + message << layer->GetOutputSlots()[i].GetConnection(0)->GetOwningIConnectableLayer().GetGuid(); + } ARMNN_LOG(info) << message.str(); } ARMNN_LOG(info) << "\n"; diff --git a/src/armnn/Graph.hpp b/src/armnn/Graph.hpp index 1b87751e9b..aa543c1357 100644 --- a/src/armnn/Graph.hpp +++ b/src/armnn/Graph.hpp @@ -1,5 +1,5 @@ // -// Copyright © 2017 Arm Ltd and Contributors. All rights reserved. +// Copyright © 2017-2023 Arm Ltd and Contributors. All rights reserved. 
// SPDX-License-Identifier: MIT // #pragma once @@ -140,7 +140,7 @@ public: }); } - Status Print() const; + Status Print(bool extended = false) const; Status SerializeToDot(std::ostream& stream); diff --git a/src/armnn/test/optimizations/AddMulAddTests.cpp b/src/armnn/test/optimizations/AddMulAddTests.cpp new file mode 100644 index 0000000000..fababa030a --- /dev/null +++ b/src/armnn/test/optimizations/AddMulAddTests.cpp @@ -0,0 +1,311 @@ +// +// Copyright © 2023 Arm Ltd and Contributors. All rights reserved. +// SPDX-License-Identifier: MIT +// + +#include <GraphUtils.hpp> +#include <TestUtils.hpp> +#include <ResolveType.hpp> +#include <armnnUtils/QuantizeHelper.hpp> + +#include <armnn/INetwork.hpp> + +#include <doctest/doctest.h> + +using namespace armnn; + +namespace +{ +template<armnn::DataType ArmnnType, typename T = armnn::ResolveType<ArmnnType>> +void AddMulAddTest(Compute backendId, bool addOutput, bool addRelu) +{ + const TensorInfo input0TensorInfo({ 1, 2, 2, 3 }, + ArmnnType, + IsQuantizedType<T>() ? 0.25f : 1, + IsQuantizedType<T>() ? 10 : 0, + true); + const TensorInfo input1TensorInfo({ 1, 2, 2, 3 }, + ArmnnType, + IsQuantizedType<T>() ? 0.25f : 1, + IsQuantizedType<T>() ? 11 : 0, + true); + const TensorInfo mulInput1TensorInfo({ 3 }, + ArmnnType, + IsQuantizedType<T>() ? 0.25f : 1, + IsQuantizedType<T>() ? 12 : 0, + true); + const TensorInfo addInput1TensorInfo({ 3 }, + ArmnnType, + IsQuantizedType<T>() ? 0.25f : 1, + IsQuantizedType<T>() ? 13 : 0, + true); + const TensorInfo output0TensorInfo({ 1, 2, 2, 3 }, + ArmnnType, + IsQuantizedType<T>() ? 0.5f : 1, + IsQuantizedType<T>() ? 14 : 0); + const TensorInfo output1TensorInfo({ 1, 2, 2, 3 }, + ArmnnType, + IsQuantizedType<T>() ? 0.5f : 1, + IsQuantizedType<T>() ? 
15 : 0); + + std::vector<float> input0Data + { + 0.0f, 0.0f, 0.0f, + 1.0f, 1.0f, 1.0f, + -1.0f, -1.0f, -1.0f, + -2.0f, -2.0f, -2.0f + }; + std::vector<float> input1Data + { + 0.0f, 0.0f, 0.0f, + 1.0f, 1.0f, 1.0f, + -1.0f, -1.0f, -1.0f, + -2.0f, -2.0f, -2.0f + }; + std::vector<float> mulInput1Data + { + 2.0f, 1.0f, 1.0f + }; + std::vector<float> addInput1Data + { + 3.0f, 0.0f, 0.0f + }; + std::vector<float> output0ExpectedData = + { + 0.0f, 0.0f, 0.0f, + 2.0f, 2.0f, 2.0f, + -2.0f, -2.0f, -2.0f, + -4.0f, -4.0f, -4.0f + }; + std::vector<float> output1ExpectedData = + { + 3.0f, 0.0f, 0.0f, + 7.0f, 2.0f, 2.0f, + -1.0f, -2.0f, -2.0f, + -5.0f, -4.0f, -4.0f + }; + std::vector<float> output1ReluExpectedData = + { + 3.0f, 0.0f, 0.0f, + 7.0f, 2.0f, 2.0f, + 0.0f, 0.0f, 0.0f, + 0.0f, 0.0f, 0.0f + }; + + std::vector<T> input0 = armnnUtils::QuantizedVector<T>(input0Data, + input0TensorInfo.GetQuantizationScale(), + input0TensorInfo.GetQuantizationOffset()); + std::vector<T> input1 = armnnUtils::QuantizedVector<T>(input1Data, + input1TensorInfo.GetQuantizationScale(), + input1TensorInfo.GetQuantizationOffset()); + std::vector<T> mulInput1 = armnnUtils::QuantizedVector<T>(mulInput1Data, + mulInput1TensorInfo.GetQuantizationScale(), + mulInput1TensorInfo.GetQuantizationOffset()); + std::vector<T> addInput1 = armnnUtils::QuantizedVector<T>(addInput1Data, + addInput1TensorInfo.GetQuantizationScale(), + addInput1TensorInfo.GetQuantizationOffset()); + std::vector<T> output0Expected = armnnUtils::QuantizedVector<T>(output0ExpectedData, + output0TensorInfo.GetQuantizationScale(), + output0TensorInfo.GetQuantizationOffset()); + std::vector<T> output1Expected = armnnUtils::QuantizedVector<T>(output1ExpectedData, + output1TensorInfo.GetQuantizationScale(), + output1TensorInfo.GetQuantizationOffset()); + std::vector<T> output1ReluExpected = armnnUtils::QuantizedVector<T>(output1ReluExpectedData, + output1TensorInfo.GetQuantizationScale(), + output1TensorInfo.GetQuantizationOffset()); + + 
std::vector<T> output0Actual(output0TensorInfo.GetNumElements()); + std::vector<T> output1Actual(output1TensorInfo.GetNumElements()); + + // Create a network + INetworkPtr network = INetwork::Create(); + + IConnectableLayer* const input0Layer = network->AddInputLayer(0); + IConnectableLayer* const input1Layer = network->AddInputLayer(1); + IConnectableLayer* const add0Layer = network->AddElementwiseBinaryLayer(BinaryOperation::Add, "add0"); + IConnectableLayer* outputAddLayer = nullptr; + if (addOutput) + { + outputAddLayer = network->AddOutputLayer(0); + } + + auto constMulInput1Tensor = armnn::ConstTensor(mulInput1TensorInfo, mulInput1); + IConnectableLayer* const mulInput1Layer = network->AddConstantLayer(constMulInput1Tensor, "mulInput1"); + IConnectableLayer* const mulLayer = network->AddElementwiseBinaryLayer(BinaryOperation::Mul, "mul"); + + auto constAddInput1Tensor = armnn::ConstTensor(addInput1TensorInfo, addInput1); + IConnectableLayer* const addInput1Layer = network->AddConstantLayer(constAddInput1Tensor, "addInput1"); + IConnectableLayer* const add1Layer = network->AddElementwiseBinaryLayer(BinaryOperation::Add, "add1"); + IConnectableLayer* const outputLayer = network->AddOutputLayer(1); + + IConnectableLayer* relu = nullptr; + if (addRelu) + { + relu = network->AddActivationLayer(ActivationFunction::ReLu, "relu"); + } + + input0Layer->GetOutputSlot(0).SetTensorInfo(input0TensorInfo); + input1Layer->GetOutputSlot(0).SetTensorInfo(input1TensorInfo); + add0Layer->GetOutputSlot(0).SetTensorInfo(output0TensorInfo); + mulInput1Layer->GetOutputSlot(0).SetTensorInfo(mulInput1TensorInfo); + mulLayer->GetOutputSlot(0).SetTensorInfo(input0TensorInfo); + addInput1Layer->GetOutputSlot(0).SetTensorInfo(addInput1TensorInfo); + add1Layer->GetOutputSlot(0).SetTensorInfo(output1TensorInfo); + if (addRelu) + { + relu->GetOutputSlot(0).SetTensorInfo(output1TensorInfo); + } + + input0Layer->GetOutputSlot(0).Connect(add0Layer->GetInputSlot(0)); + 
input1Layer->GetOutputSlot(0).Connect(add0Layer->GetInputSlot(1)); + if (addOutput) + { + add0Layer->GetOutputSlot(0).Connect(outputAddLayer->GetInputSlot(0)); + } + + add0Layer->GetOutputSlot(0).Connect(mulLayer->GetInputSlot(0)); + mulInput1Layer->GetOutputSlot(0).Connect(mulLayer->GetInputSlot(1)); + mulLayer->GetOutputSlot(0).Connect(add1Layer->GetInputSlot(0)); + addInput1Layer->GetOutputSlot(0).Connect(add1Layer->GetInputSlot(1)); + + if (addRelu) + { + add1Layer->GetOutputSlot(0).Connect(relu->GetInputSlot(0)); + relu->GetOutputSlot(0).Connect(outputLayer->GetInputSlot(0)); + } + else + { + add1Layer->GetOutputSlot(0).Connect(outputLayer->GetInputSlot(0)); + } + + // Create ArmNN runtime + IRuntimePtr run = IRuntime::Create(IRuntime::CreationOptions()); + + // Optimise ArmNN network + IOptimizedNetworkPtr optNet = Optimize(*network, {backendId}, run->GetDeviceSpec()); + + Graph& graph = GetGraphForTesting(optNet.get()); + + // There are 9 add layers above, so we should be left with 7 layers in the graph after the optimization. + // Numbers reduced by 1 in addOutput == false case + unsigned int expectedLayerNum = (addOutput) ? 
7 : 6; + CHECK((graph.GetNumLayers() == expectedLayerNum)); + + if (addOutput) + { + CHECK(CheckSequence(graph.cbegin(), + graph.cend(), + &IsLayerOfType<InputLayer>, + &IsLayerOfType<InputLayer>, + &IsLayerOfType<ConstantLayer>, + &IsLayerOfType<ConstantLayer>, + &IsLayerOfType<FusedLayer>, + &IsLayerOfType<OutputLayer>, + &IsLayerOfType<OutputLayer>)); + } + else + { + CHECK(CheckSequence(graph.cbegin(), + graph.cend(), + &IsLayerOfType<InputLayer>, + &IsLayerOfType<InputLayer>, + &IsLayerOfType<ConstantLayer>, + &IsLayerOfType<ConstantLayer>, + &IsLayerOfType<FusedLayer>, + &IsLayerOfType<OutputLayer>)); + } + + // Load network into runtime + NetworkId networkIdentifier; + run->LoadNetwork(networkIdentifier, std::move(optNet)); + + // Create input and output tensors + InputTensors inputTensors + { + {0, armnn::ConstTensor(input0TensorInfo, input0.data())}, + {1, armnn::ConstTensor(input1TensorInfo, input1.data())} + }; + OutputTensors outputTensors; + if (addOutput) + { + outputTensors.push_back( + {0, armnn::Tensor(run->GetOutputTensorInfo(networkIdentifier, 0), output0Actual.data())}); + outputTensors.push_back( + {1, armnn::Tensor(run->GetOutputTensorInfo(networkIdentifier, 1), output1Actual.data())}); + } + else + { + outputTensors.push_back( + {1, armnn::Tensor(run->GetOutputTensorInfo(networkIdentifier, 1), output1Actual.data())}); + } + + // Run inference + run->EnqueueWorkload(networkIdentifier, inputTensors, outputTensors); + + // Checks the results + if (addOutput) + { + CHECK(output0Actual == output0Expected); + } + + if (addRelu) + { + CHECK(output1Actual == output1ReluExpected); + } + else + { + CHECK(output1Actual == output1Expected); + } + +} +} + +#if defined(ARMCOMPUTENEON_ENABLED) +TEST_SUITE("Optimizer_AddMulAdd") +{ + +TEST_CASE("AddMulAdd2OutputsFloat32Test") +{ + AddMulAddTest<DataType::Float32>(Compute::CpuAcc, true, false); +} + +TEST_CASE("AddMulAdd2OutputsInt8Test") +{ + AddMulAddTest<DataType::QAsymmS8>(Compute::CpuAcc, true, false); 
+} + +TEST_CASE("AddMulAdd2OutputsUint8Test") +{ + AddMulAddTest<DataType::QAsymmU8>(Compute::CpuAcc, true, false); +} + +TEST_CASE("AddMulAdd1OutputFloat32Test") +{ + AddMulAddTest<DataType::Float32>(Compute::CpuAcc, false, false); +} + +TEST_CASE("AddMulAdd1OutputInt8Test") +{ + AddMulAddTest<DataType::QAsymmS8>(Compute::CpuAcc, false, false); +} + +TEST_CASE("AddMulAdd1OutputUint8Test") +{ + AddMulAddTest<DataType::QAsymmU8>(Compute::CpuAcc, false, false); +} + +// +// Relu tests +// +TEST_CASE("AddMulAddRelu2OutputsFloat32Test") +{ + AddMulAddTest<DataType::Float32>(Compute::CpuAcc, true, true); +} + +TEST_CASE("AddMulAddRelu1OutputFloat32Test") +{ + AddMulAddTest<DataType::Float32>(Compute::CpuAcc, false, true); +} + +} +#endif
\ No newline at end of file diff --git a/src/backends/aclCommon/ArmComputeSubgraphUtils.hpp b/src/backends/aclCommon/ArmComputeSubgraphUtils.hpp index 90c0fd5890..a44acb0f54 100644 --- a/src/backends/aclCommon/ArmComputeSubgraphUtils.hpp +++ b/src/backends/aclCommon/ArmComputeSubgraphUtils.hpp @@ -356,4 +356,25 @@ void ReplaceLayers(OptimizationViews& optimizationViews, optimizationViews.AddSubstitution({substitutionSubgraph, replacementSubgraph}); } +// +// Substitute a multi-layer subgraph with one new layer +// +template<typename LayerType> +void ReplaceMultipleLayers(OptimizationViews& optimizationViews, + std::vector<IConnectableLayer*>& originalLayers, + LayerType* baseLayer, + const std::vector<SlotList> inputLayersSlotLists, + const std::vector<SlotList> outputLayersSlotLists) +{ + std::list<IConnectableLayer*> originalLayerList(originalLayers.begin(), originalLayers.end()); + + SubgraphView substitutionSubgraph( + std::move(originalLayerList), + CreateIInputsFromSlotLists<armnn::IConnectableLayer>(originalLayers, inputLayersSlotLists), + CreateIOutputsFromSlotLists<armnn::IConnectableLayer>(originalLayers, outputLayersSlotLists)); + SubgraphView replacementSubgraph(baseLayer); + + optimizationViews.AddSubstitution({substitutionSubgraph, replacementSubgraph}); +} + } // namespace armnn diff --git a/src/backends/backendsCommon/SubgraphUtils.hpp b/src/backends/backendsCommon/SubgraphUtils.hpp index 9f2cdba6ef..6a9e8f1b76 100644 --- a/src/backends/backendsCommon/SubgraphUtils.hpp +++ b/src/backends/backendsCommon/SubgraphUtils.hpp @@ -161,6 +161,53 @@ SubgraphView::IOutputSlots CreateIOutputsFrom(const std::vector<armnn::IConnecta return result; } +// Type used to hold the slot numbers to create the lists from. 
There should +// be a SlotList for each layer in the layers list +typedef std::vector<int> SlotList; + +template<typename ILayerType> +SubgraphView::IInputSlots CreateIInputsFromSlotLists(const std::vector<ILayerType*>& layers, + const std::vector<SlotList>& layersSlotLists) +{ + ARMNN_THROW_INVALIDARG_IF_FALSE(layersSlotLists.size() == layers.size()); + + SubgraphView::IInputSlots result; + + for (unsigned int layerIdx = 0; layerIdx < layers.size(); ++layerIdx) + { + const SlotList& slotList = layersSlotLists[layerIdx]; + for (unsigned int slotIdx = 0 ; slotIdx < layers[layerIdx]->GetNumInputSlots(); ++slotIdx) + { + if (std::find(slotList.begin(), slotList.end(), slotIdx) != slotList.end()) + { + result.push_back(&(layers[layerIdx]->GetInputSlot(slotIdx))); + } + } + } + return result; +} + +template<typename ILayerType> +SubgraphView::IOutputSlots CreateIOutputsFromSlotLists(const std::vector<ILayerType*>& layers, + const std::vector<SlotList>& layersSlotLists) +{ + ARMNN_THROW_INVALIDARG_IF_FALSE(layersSlotLists.size() == layers.size()); + + SubgraphView::IOutputSlots result; + for (unsigned int layerIdx = 0; layerIdx < layers.size(); ++layerIdx) + { + const SlotList& slotList = layersSlotLists[layerIdx]; + for (unsigned int slotIdx = 0; slotIdx < layers[layerIdx]->GetNumOutputSlots(); ++slotIdx) + { + bool foundIt = std::find(slotList.begin(), slotList.end(), slotIdx) != slotList.end(); + if (foundIt) + { + result.push_back(&(layers[layerIdx]->GetOutputSlot(slotIdx))); + } + } + } + return result; +} } inline bool IsNCHW(armnn::Layer& layer) @@ -308,4 +355,117 @@ LayerType* FoldPadIntoAveragePool2d(OptimizationViews& optimizationViews, return replacementLayer; } +// +// Layer sequence detection such as add + mul + add ( + optional activation ) +// + +inline bool IsSequenceLayerType(Layer& layer, LayerType type) +{ + return layer.GetType() == type; +} + +inline bool IsSequenceLayerType(Layer& layer, BinaryOperation type) +{ + return (layer.GetType() == 
LayerType::ElementwiseBinary) && + (PolymorphicDowncast<ElementwiseBinaryLayer*>(&layer)->GetParameters().m_Operation == type); +} + +// Detect a layer sequence and activation if specified. The activation must be at the end of the sequence. +template<typename TYPE> +bool IsLayerSequence(Layer& currentLayer, + TYPE first, + TYPE second, + TYPE third, + Layer* layerList[4], + bool handleValidActivates, + const std::vector<ActivationFunction>& validActivates) +{ + auto PreviousLayer = [](Layer& layer) + { + return &layer.GetInputSlot(0).GetConnectedOutputSlot()->GetOwningLayer(); + }; + + auto NextLayer = [](Layer& layer) + { + return &layer.GetOutputSlot(0).GetConnection(0)->GetOwningLayer(); + }; + + auto LayerIncomingConnectionDataType = [](Layer& layer) + { + return layer.GetInputSlot(0).GetTensorInfo().GetDataType(); + }; + + bool result = false; + + // Match in reverse so there is only 1 connection to check + if (IsSequenceLayerType(currentLayer, third)) + { + // Save DataType of third layer + DataType dataType = LayerIncomingConnectionDataType(currentLayer); + + // Save third layer + layerList[2] = &currentLayer; + + // Check the layers that proceed this one for the requested grouping + Layer *prevLayer = PreviousLayer(currentLayer); + if (prevLayer && IsSequenceLayerType(*prevLayer, second)) + { + bool dataTypesMatch = (dataType == LayerIncomingConnectionDataType(*prevLayer)); + if (! dataTypesMatch) + { + return result; + } + + layerList[1] = prevLayer; + prevLayer = PreviousLayer(*prevLayer); + if (prevLayer && IsSequenceLayerType(*prevLayer, first)) + { + dataTypesMatch = (dataType == LayerIncomingConnectionDataType(*prevLayer)); + if (! dataTypesMatch) + { + return result; + } + + layerList[0] = prevLayer; + + // Detected the first 3 layers if we get to this point so now + // check to see if we have a valid activation. If there is no activation + // then the sequence still matches. 
+ if (handleValidActivates) + { + Layer *nextLayer = NextLayer(currentLayer); + if (nextLayer) + { + if (IsSequenceLayerType(*nextLayer, LayerType::Activation)) + { + // This layer is an activation, so it must be a valid type for the sequence + ActivationFunction activationFunction = + PolymorphicDowncast<ActivationLayer*>(nextLayer)->GetParameters().m_Function; + long count = std::count(validActivates.cbegin(), + validActivates.cend(), + activationFunction); + if (count > 0) + { + layerList[3] = nextLayer; + result = true; + } + } + else + { + // Next layer is not an activation so sequence still matches + result = true; + } + } + } + else + { + result = true; + } + } + } + } + + return result; +} + } // namespace armnn diff --git a/src/backends/backendsCommon/test/layerTests/AddMulAddTestImpl.hpp b/src/backends/backendsCommon/test/layerTests/AddMulAddTestImpl.hpp index 9dece9be3b..39d2219954 100644 --- a/src/backends/backendsCommon/test/layerTests/AddMulAddTestImpl.hpp +++ b/src/backends/backendsCommon/test/layerTests/AddMulAddTestImpl.hpp @@ -39,18 +39,18 @@ std::vector<LayerTestResult<T,4>> AddMulAddTest(armnn::IWorkloadFactory& workloa if (IsQuantizedType<T>()) { input0TensorInfo.SetQuantizationScale(0.25f); - input0TensorInfo.SetQuantizationOffset(128); + input0TensorInfo.SetQuantizationOffset(10); input1TensorInfo.SetQuantizationScale(0.25f); - input1TensorInfo.SetQuantizationOffset(128); + input1TensorInfo.SetQuantizationOffset(11); mulInput1TensorInfo.SetQuantizationScale(0.25f); - mulInput1TensorInfo.SetQuantizationOffset(128); + mulInput1TensorInfo.SetQuantizationOffset(12); addInput1TensorInfo.SetQuantizationScale(0.25f); - addInput1TensorInfo.SetQuantizationOffset(128); + addInput1TensorInfo.SetQuantizationOffset(13); output0TensorInfo.SetQuantizationScale(0.5f); - output0TensorInfo.SetQuantizationOffset(120); + output0TensorInfo.SetQuantizationOffset(14); output1TensorInfo.SetQuantizationScale(0.5f); - output1TensorInfo.SetQuantizationOffset(120); + 
output1TensorInfo.SetQuantizationOffset(15); } std::vector<float> input0Data @@ -140,6 +140,12 @@ std::vector<LayerTestResult<T,4>> AddMulAddTest(armnn::IWorkloadFactory& workloa } AddOutputToWorkload(fusedQueueDescriptor, info, output1TensorInfo, output1Handle.get()); + if (addOutput) + { + AddOutputToWorkload(fusedQueueDescriptor, info, output0TensorInfo, output0Handle.get()); + } + AddOutputToWorkload(fusedQueueDescriptor, info, output1TensorInfo, output1Handle.get()); + std::unique_ptr<IWorkload> workload = workloadFactory.CreateWorkload(LayerType::Fused, fusedQueueDescriptor, info); diff --git a/src/backends/neon/CMakeLists.txt b/src/backends/neon/CMakeLists.txt index 1c077731c4..8ceeef386b 100644 --- a/src/backends/neon/CMakeLists.txt +++ b/src/backends/neon/CMakeLists.txt @@ -8,6 +8,7 @@ if(ARMCOMPUTENEON) NeonBackend.cpp NeonBackend.hpp NeonBackendId.hpp + NeonBackendOptimizationUtils.hpp NeonBackendModelContext.hpp NeonBackendModelContext.cpp NeonInterceptorScheduler.hpp diff --git a/src/backends/neon/NeonBackend.cpp b/src/backends/neon/NeonBackend.cpp index b5719db007..7311098631 100644 --- a/src/backends/neon/NeonBackend.cpp +++ b/src/backends/neon/NeonBackend.cpp @@ -9,6 +9,7 @@ #include "NeonWorkloadFactory.hpp" #include "NeonLayerSupport.hpp" #include "NeonTensorHandleFactory.hpp" +#include "NeonBackendOptimizationUtils.hpp" #include <armnn/BackendRegistry.hpp> #include <armnn/Descriptors.hpp> @@ -28,6 +29,7 @@ #include <neon/workloads/NeonDepthwiseConvolutionWorkload.hpp> #include <neon/workloads/NeonDivisionWorkload.hpp> #include <neon/workloads/NeonFullyConnectedWorkload.hpp> +#include <neon/workloads/NeonFusedWorkload.hpp> #include <neon/workloads/NeonMultiplicationWorkload.hpp> #include <neon/workloads/NeonReduceWorkload.hpp> #include <neon/workloads/NeonSubtractionWorkload.hpp> @@ -524,6 +526,86 @@ OptimizationViews NeonBackend::OptimizeSubgraphView(const SubgraphView& subgraph } RemoveReshapeLayer(baseLayer, untouched, optimizationViews); } + + 
// Replace Add/Mul/Add where possible + Layer* layerList[4] = {nullptr, nullptr, nullptr, nullptr}; + const std::vector<ActivationFunction> validActivates = { ActivationFunction::ReLu, + ActivationFunction::BoundedReLu }; + if (IsLayerSequence<BinaryOperation>(base, + BinaryOperation::Add, BinaryOperation::Mul, BinaryOperation::Add, + layerList, + true, // handleValidActivates + validActivates)) + { + bool fuseReLu = false; + unsigned int numInputs = 0; + unsigned int numOutputs = 0; + std::vector<TensorInfo> inputInfos; + std::vector<TensorInfo> outputInfos; + const ActivationDescriptor* activationDescriptor = nullptr; + + if (BuildAddMulAddTensorInfoLists<Layer>(layerList, + numInputs, + numOutputs, + inputInfos, + outputInfos, + activationDescriptor, + fuseReLu)) + { + // Create the new Add/Mul/Add layer and set the Relu activation function + FusedDescriptor fusedDescriptor(numInputs, numOutputs, FusedKernelType::AddMulAdd); + arm_compute::Status status = NeonFusedWorkloadValidate({inputInfos.begin(), inputInfos.end()}, + {outputInfos.begin(), outputInfos.end()}, + fusedDescriptor, + activationDescriptor); + if (status) + { + std::string fusedName; + GetFusedName(layerList, fusedName); + + IConnectableLayer* addMulAddLayer = + optimizationViews.GetINetwork()->AddFusedLayer(fusedDescriptor, fusedName.c_str()); + + if (fuseReLu) + { + FusedLayer* addMulAddFusedLayer = PolymorphicDowncast<FusedLayer*>(addMulAddLayer); + addMulAddFusedLayer->SetAdditionalInfoForObject( + std::make_shared<ActivationDescriptor>(*activationDescriptor)); + } + + // Update the graph + std::vector<IConnectableLayer*> originalLayers; + for (unsigned int layerIdx = 0; layerIdx < 4; ++layerIdx) + { + if (layerList[layerIdx]) + { + originalLayers.push_back(layerList[layerIdx]); + } + } + + std::vector<SlotList> inputLayersSlotLists, outputLayersSlotLists; + BuildAddMulAddSlotLists<SlotList>(fuseReLu, + outputInfos.size() > 1, + inputLayersSlotLists, + outputLayersSlotLists); + + 
ReplaceMultipleLayers<FusedLayer>(optimizationViews, + originalLayers, + PolymorphicDowncast<FusedLayer*>(addMulAddLayer), + inputLayersSlotLists, + outputLayersSlotLists); + + // Remove unused layers + for (unsigned int layerIdx = 0; layerIdx < 4; ++layerIdx) + { + if (layerList[layerIdx]) + { + untouched.erase(layerList[layerIdx]->GetGuid()); + } + } + } + } + } } if (optimizationViews.GetSubstitutions().empty() && optimizationViews.GetDeletedSubgraphs().empty()) diff --git a/src/backends/neon/NeonBackendOptimizationUtils.hpp b/src/backends/neon/NeonBackendOptimizationUtils.hpp new file mode 100644 index 0000000000..3a8bf46599 --- /dev/null +++ b/src/backends/neon/NeonBackendOptimizationUtils.hpp @@ -0,0 +1,215 @@ +// +// Copyright © 2023 Arm Ltd and Contributors. All rights reserved. +// SPDX-License-Identifier: MIT +// + +#pragma once + +#include <aclCommon/ArmComputeSubgraphUtils.hpp> + +namespace armnn +{ + +// Changes shapes of the form [1, 1, ..., W] to [ W ] +inline bool CollapseLeadingUnitDimensions(const TensorInfo& in, TensorInfo& out) +{ + unsigned int numDimensions = in.GetNumDimensions(); + for (unsigned int i = 0; i < (numDimensions-1); ++i) + { + if (in.GetShape()[i] != 1) + { + return false; + } + } + + unsigned int w = in.GetShape()[numDimensions-1]; + out = in; + out.SetShape({w}); + + return true; +} + +// +// Build slot and tensor info lists for Add/Mul/Add replacement +// +template<typename SlotListType> +void BuildAddMulAddSlotLists(bool handleReLu, + bool multipleOutputs, + std::vector<SlotListType>& inputLayersSlotLists, + std::vector<SlotListType>& outputLayersSlotLists) +{ + // Build input slot list + inputLayersSlotLists.push_back({0, 1}); // Add + inputLayersSlotLists.push_back({1}); // Mul + inputLayersSlotLists.push_back({1}); // Add + if (handleReLu) + { + inputLayersSlotLists.push_back({}); // Relu + } + + // Build output slot list + if (multipleOutputs) + { + outputLayersSlotLists.push_back({0}); // Add + } + else + { + 
outputLayersSlotLists.push_back({}); // Add + } + outputLayersSlotLists.push_back({}); // Mul + if (handleReLu) + { + outputLayersSlotLists.push_back({}); // Add + outputLayersSlotLists.push_back({0}); // Relu + } + else + { + outputLayersSlotLists.push_back({0}); // Add + } +} + +inline void GetFusedName(Layer *layerList[4], std::string& fusedName) +{ + // Build the fused name string + fusedName = "fused"; + for (unsigned int layerIdx = 0; layerIdx< 4; ++layerIdx) + { + if (! layerList[layerIdx]) + { + break; + } + fusedName += "-"; + fusedName += layerList[layerIdx]->GetNameStr(); + } +} + +template<typename Type> +bool BuildAddMulAddTensorInfoLists(Type* layerList[4], + unsigned int& numInputs, + unsigned int& numOutputs, + std::vector<TensorInfo>& inputInfos, + std::vector<TensorInfo>& outputInfos, + const ActivationDescriptor*& activationDescriptor, + bool& fuseReLu) +{ + ARMNN_THROW_INVALIDARG_IF_FALSE(layerList[0]); + ARMNN_THROW_INVALIDARG_IF_FALSE(layerList[1]); + ARMNN_THROW_INVALIDARG_IF_FALSE(layerList[2]); + + ARMNN_THROW_INVALIDARG_IF_FALSE(IsSequenceLayerType(*layerList[0], BinaryOperation::Add)); + ARMNN_THROW_INVALIDARG_IF_FALSE(IsSequenceLayerType(*layerList[1], BinaryOperation::Mul)); + ARMNN_THROW_INVALIDARG_IF_FALSE(IsSequenceLayerType(*layerList[2], BinaryOperation::Add)); + + fuseReLu = (layerList[3] != nullptr); + if (fuseReLu) + { + activationDescriptor = &PolymorphicDowncast<ActivationLayer *>(layerList[3])->GetParameters(); + ARMNN_THROW_INVALIDARG_IF_FALSE((activationDescriptor->m_Function == ActivationFunction::ReLu) || + (activationDescriptor->m_Function == ActivationFunction::BoundedReLu)); + } + + numInputs = 0; + numOutputs = 0; + + // Ensure that there are 6 input slots in the add/mul/add layers + // we are going to replace + unsigned int layerIdx = 0; + unsigned int inputSlotCount = 0; + for (layerIdx = 0; layerIdx < 3; ++layerIdx) + { + for (unsigned int slotIdx = 0; slotIdx < layerList[layerIdx]->GetNumInputSlots(); ++slotIdx) + 
{ + InputSlot* inputSlot = &layerList[layerIdx]->GetInputSlot(slotIdx); + OutputSlot* outputSlot = inputSlot->GetConnectedOutputSlot(); + if (outputSlot) + { + if (layerIdx == 0) + { + // Always count the input connections of the first add + inputInfos.push_back(inputSlot->GetTensorInfo()); + numInputs++; + } + else + { + // For subsequent layers, we skip connections to the previous layers in the counting + if (&outputSlot->GetOwningLayer() != layerList[layerIdx-1]) + { + TensorInfo inputSlotInfo = inputSlot->GetTensorInfo(); + if (numInputs == 2 || numInputs == 3) + { + // Workaround the broadcast optimization to collapse shapes such as + // [1, 1, 1, 2] to [2] as required by backend + if (CollapseLeadingUnitDimensions(inputSlot->GetTensorInfo(), inputSlotInfo)) + { + OutputSlot* previousLayerSlot = inputSlot->GetConnectedOutputSlot(); + if (previousLayerSlot) + { + if (previousLayerSlot->GetOwningLayer().GetType() == LayerType::Constant) + { + // First update the TensorInfo in the constant owning layer + previousLayerSlot->SetTensorInfo(inputSlotInfo); + // Then update the TensorInfo in the workload for the owning layer + ConstantLayer* layer = PolymorphicDowncast<ConstantLayer*>( + &previousLayerSlot->GetOwningLayer()); + layer->m_LayerOutput + = std::make_unique<ScopedTensorHandle>( + ConstTensor(inputSlotInfo, + layer->m_LayerOutput.get()->GetConstTensor<void>())); + } + } + } + } + inputInfos.push_back(inputSlotInfo); + numInputs++; + } + } + inputSlotCount++; + } + } + } + + // Check the input counts + bool validInputCount = (inputSlotCount == 6) && (inputInfos.size() == 4); + if (! validInputCount) + { + return false; + } + + const unsigned int maxIdx = (fuseReLu) ? 
4 : 3; + for (layerIdx = 0; layerIdx < maxIdx; ++layerIdx) + { + for (unsigned int slotIdx = 0; slotIdx < layerList[layerIdx]->GetNumOutputSlots(); ++slotIdx) + { + OutputSlot* outputSlot = &layerList[layerIdx]->GetOutputSlot(slotIdx); + + for (unsigned int connectionIdx = 0; connectionIdx < outputSlot->GetNumConnections(); ++connectionIdx) + { + InputSlot* inputSlot = outputSlot->GetConnection(connectionIdx); + if (layerIdx < (maxIdx-1)) + { + if (&inputSlot->GetOwningLayer() != layerList[layerIdx+1]) + { + outputInfos.push_back(outputSlot->GetTensorInfo()); + numOutputs++; + } + } + else if (layerList[layerIdx] != nullptr) + { + outputInfos.push_back(outputSlot->GetTensorInfo()); + numOutputs++; + } + } + } + } + + // Check the output count + bool validOutputCount = (outputInfos.size() > 0); + if (! validOutputCount) + { + return false; + } + + return true; +} + +} diff --git a/src/backends/neon/test/NeonLayerTests.cpp b/src/backends/neon/test/NeonLayerTests.cpp index c9dd1ff507..658d718b19 100644 --- a/src/backends/neon/test/NeonLayerTests.cpp +++ b/src/backends/neon/test/NeonLayerTests.cpp @@ -1726,9 +1726,11 @@ ARMNN_AUTO_TEST_CASE_WITH_THF(SimpleConvertFp32ToFp16, SimpleConvertFp32ToFp16Te // AddMulAdd ARMNN_AUTO_TEST_CASE_WITH_THF(AddMulAdd2OutputsFloat32, AddMulAddTest<DataType::Float32>, true) +ARMNN_AUTO_TEST_CASE_WITH_THF(AddMulAdd2OutputsInt8, AddMulAddTest<DataType::QAsymmS8>, true) ARMNN_AUTO_TEST_CASE_WITH_THF(AddMulAdd2OutputsUint8, AddMulAddTest<DataType::QAsymmU8>, true) ARMNN_AUTO_TEST_CASE_WITH_THF(AddMulAdd1OutputFloat32, AddMulAddTest<DataType::Float32>, false) +ARMNN_AUTO_TEST_CASE_WITH_THF(AddMulAdd1OutputInt8, AddMulAddTest<DataType::QAsymmS8>, false) ARMNN_AUTO_TEST_CASE_WITH_THF(AddMulAdd1OutputUint8, AddMulAddTest<DataType::QAsymmU8>, false) #if defined(ARMNNREF_ENABLED) |