author | Tracy Narine <tracy.narine@arm.com> | 2023-09-20 14:19:07 +0100 |
---|---|---|
committer | Tracy Narine <tracy.narine@arm.com> | 2023-09-28 14:25:16 +0100 |
commit | 6440ce89abb06e090d2b3cf91bafc14277072475 (patch) | |
tree | c55682891a0f01f3edbf5dad58720ded7af3fc64 /src/backends/neon/NeonBackend.cpp | |
parent | 9a418d850333119e219fb05addc57b56cdc60a7e (diff) | |
download | armnn-6440ce89abb06e090d2b3cf91bafc14277072475.tar.gz | |
IVGCVSW-7504 Create a backend specific optimization to fuse ADD+MUL+Add+(Activation) in CpuAcc
* Added a CpuAcc backend optimization to fuse Add+Mul+Add into one layer
* Added and enhanced tests
* Also added an optional extended parameter to Graph::Print() and throw macros that can be used in place of assert
Signed-off-by: Tracy Narine <tracy.narine@arm.com>
Signed-off-by: Teresa Charlin <teresa.charlinreyes@arm.com>
Change-Id: I5f8d094b969a130d8c2c7b4da07426313a9fea76
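For context, the point of the fusion is to replace three elementwise layers (plus an optional activation) with a single pass over the data. The scalar loop below is a minimal illustrative sketch, not the actual NeonFusedWorkload implementation: the helper name `AddMulAddRelu`, the flat float pointers, and the single-output case are simplifying assumptions, and broadcasting and the two-output variant (where the first Add's result is also consumed elsewhere) are omitted.

```cpp
#include <algorithm>
#include <cstddef>

// Illustrative scalar loop for the Add+Mul+Add(+ReLU) pattern that the
// CpuAcc optimization fuses: one traversal of the data instead of three
// or four. Hypothetical helper for explanation only, not Arm NN code.
void AddMulAddRelu(const float* in0, const float* in1,  // operands of the first Add
                   const float* mul, const float* add,  // second operands of Mul and the second Add
                   float* out, std::size_t count, bool fuseReLu)
{
    for (std::size_t i = 0; i < count; ++i)
    {
        float v = (in0[i] + in1[i]) * mul[i] + add[i];
        out[i] = fuseReLu ? std::max(v, 0.0f) : v;
    }
}
```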
Diffstat (limited to 'src/backends/neon/NeonBackend.cpp')
-rw-r--r-- | src/backends/neon/NeonBackend.cpp | 82 |
1 file changed, 82 insertions, 0 deletions
diff --git a/src/backends/neon/NeonBackend.cpp b/src/backends/neon/NeonBackend.cpp
index b5719db007..7311098631 100644
--- a/src/backends/neon/NeonBackend.cpp
+++ b/src/backends/neon/NeonBackend.cpp
@@ -9,6 +9,7 @@
 #include "NeonWorkloadFactory.hpp"
 #include "NeonLayerSupport.hpp"
 #include "NeonTensorHandleFactory.hpp"
+#include "NeonBackendOptimizationUtils.hpp"
 
 #include <armnn/BackendRegistry.hpp>
 #include <armnn/Descriptors.hpp>
@@ -28,6 +29,7 @@
 #include <neon/workloads/NeonDepthwiseConvolutionWorkload.hpp>
 #include <neon/workloads/NeonDivisionWorkload.hpp>
 #include <neon/workloads/NeonFullyConnectedWorkload.hpp>
+#include <neon/workloads/NeonFusedWorkload.hpp>
 #include <neon/workloads/NeonMultiplicationWorkload.hpp>
 #include <neon/workloads/NeonReduceWorkload.hpp>
 #include <neon/workloads/NeonSubtractionWorkload.hpp>
@@ -524,6 +526,86 @@ OptimizationViews NeonBackend::OptimizeSubgraphView(const SubgraphView& subgraph
             }
             RemoveReshapeLayer(baseLayer, untouched, optimizationViews);
         }
+
+        // Replace Add/Mul/Add where possible
+        Layer* layerList[4] = {nullptr, nullptr, nullptr, nullptr};
+        const std::vector<ActivationFunction> validActivates = { ActivationFunction::ReLu,
+                                                                 ActivationFunction::BoundedReLu };
+        if (IsLayerSequence<BinaryOperation>(base,
+                                             BinaryOperation::Add, BinaryOperation::Mul, BinaryOperation::Add,
+                                             layerList,
+                                             true,  // handleValidActivates
+                                             validActivates))
+        {
+            bool fuseReLu = false;
+            unsigned int numInputs = 0;
+            unsigned int numOutputs = 0;
+            std::vector<TensorInfo> inputInfos;
+            std::vector<TensorInfo> outputInfos;
+            const ActivationDescriptor* activationDescriptor = nullptr;
+
+            if (BuildAddMulAddTensorInfoLists<Layer>(layerList,
+                                                     numInputs,
+                                                     numOutputs,
+                                                     inputInfos,
+                                                     outputInfos,
+                                                     activationDescriptor,
+                                                     fuseReLu))
+            {
+                // Create the new Add/Mul/Add layer and set the Relu activation function
+                FusedDescriptor fusedDescriptor(numInputs, numOutputs, FusedKernelType::AddMulAdd);
+                arm_compute::Status status = NeonFusedWorkloadValidate({inputInfos.begin(), inputInfos.end()},
+                                                                       {outputInfos.begin(), outputInfos.end()},
+                                                                       fusedDescriptor,
+                                                                       activationDescriptor);
+                if (status)
+                {
+                    std::string fusedName;
+                    GetFusedName(layerList, fusedName);
+
+                    IConnectableLayer* addMulAddLayer =
+                        optimizationViews.GetINetwork()->AddFusedLayer(fusedDescriptor, fusedName.c_str());
+
+                    if (fuseReLu)
+                    {
+                        FusedLayer* addMulAddFusedLayer = PolymorphicDowncast<FusedLayer*>(addMulAddLayer);
+                        addMulAddFusedLayer->SetAdditionalInfoForObject(
+                            std::make_shared<ActivationDescriptor>(*activationDescriptor));
+                    }
+
+                    // Update the graph
+                    std::vector<IConnectableLayer*> originalLayers;
+                    for (unsigned int layerIdx = 0; layerIdx < 4; ++layerIdx)
+                    {
+                        if (layerList[layerIdx])
+                        {
+                            originalLayers.push_back(layerList[layerIdx]);
+                        }
+                    }
+
+                    std::vector<SlotList> inputLayersSlotLists, outputLayersSlotLists;
+                    BuildAddMulAddSlotLists<SlotList>(fuseReLu,
+                                                      outputInfos.size() > 1,
+                                                      inputLayersSlotLists,
+                                                      outputLayersSlotLists);
+
+                    ReplaceMultipleLayers<FusedLayer>(optimizationViews,
+                                                      originalLayers,
+                                                      PolymorphicDowncast<FusedLayer*>(addMulAddLayer),
+                                                      inputLayersSlotLists,
+                                                      outputLayersSlotLists);
+
+                    // Remove unused layers
+                    for (unsigned int layerIdx = 0; layerIdx < 4; ++layerIdx)
+                    {
+                        if (layerList[layerIdx])
+                        {
+                            untouched.erase(layerList[layerIdx]->GetGuid());
+                        }
+                    }
+                }
+            }
+        }
     }
 
     if (optimizationViews.GetSubstitutions().empty() && optimizationViews.GetDeletedSubgraphs().empty())
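As a usage illustration, the following is a minimal sketch of a network containing the exact Add -> Mul -> Add -> ReLU chain this pass looks for. It assumes the public Arm NN INetwork API of this release (AddElementwiseBinaryLayer, AddActivationLayer, AddInputLayer/AddOutputLayer); the function name BuildAddMulAddNetwork, the tensor shape, and the layer names are invented for the example.

```cpp
#include <armnn/INetwork.hpp>
#include <armnn/Descriptors.hpp>
#include <armnn/Types.hpp>

// Sketch: build an Add -> Mul -> Add -> ReLU chain that the CpuAcc
// subgraph optimization above could collapse into a single FusedLayer.
armnn::INetworkPtr BuildAddMulAddNetwork()
{
    using namespace armnn;
    INetworkPtr network = INetwork::Create();
    TensorInfo info({ 1, 2, 2, 3 }, DataType::Float32);  // illustrative shape

    // Four inputs feed the three elementwise binary layers.
    IConnectableLayer* in0 = network->AddInputLayer(0);
    IConnectableLayer* in1 = network->AddInputLayer(1);
    IConnectableLayer* in2 = network->AddInputLayer(2);
    IConnectableLayer* in3 = network->AddInputLayer(3);

    IConnectableLayer* add0 = network->AddElementwiseBinaryLayer(
        ElementwiseBinaryDescriptor(BinaryOperation::Add), "add0");
    IConnectableLayer* mul = network->AddElementwiseBinaryLayer(
        ElementwiseBinaryDescriptor(BinaryOperation::Mul), "mul");
    IConnectableLayer* add1 = network->AddElementwiseBinaryLayer(
        ElementwiseBinaryDescriptor(BinaryOperation::Add), "add1");

    ActivationDescriptor reluDesc;
    reluDesc.m_Function = ActivationFunction::ReLu;  // one of the validActivates
    IConnectableLayer* relu = network->AddActivationLayer(reluDesc, "relu");
    IConnectableLayer* out  = network->AddOutputLayer(0);

    // add0 = in0 + in1; mul = add0 * in2; add1 = mul + in3; out = ReLU(add1)
    in0->GetOutputSlot(0).Connect(add0->GetInputSlot(0));
    in1->GetOutputSlot(0).Connect(add0->GetInputSlot(1));
    add0->GetOutputSlot(0).Connect(mul->GetInputSlot(0));
    in2->GetOutputSlot(0).Connect(mul->GetInputSlot(1));
    mul->GetOutputSlot(0).Connect(add1->GetInputSlot(0));
    in3->GetOutputSlot(0).Connect(add1->GetInputSlot(1));
    add1->GetOutputSlot(0).Connect(relu->GetInputSlot(0));
    relu->GetOutputSlot(0).Connect(out->GetInputSlot(0));

    for (IConnectableLayer* layer : { in0, in1, in2, in3, add0, mul, add1, relu })
    {
        layer->GetOutputSlot(0).SetTensorInfo(info);
    }
    return network;
}
```

Running such a network through Optimize() with the CpuAcc backend hands the chain to OptimizeSubgraphView above, which validates the candidate with NeonFusedWorkloadValidate and, on success, substitutes a single FusedLayer (FusedKernelType::AddMulAdd) with the ReLU folded in via SetAdditionalInfoForObject.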