aboutsummaryrefslogtreecommitdiff
path: root/src/backends/neon/NeonBackend.cpp
diff options
context:
space:
mode:
authorTracy Narine <tracy.narine@arm.com>2023-09-20 14:19:07 +0100
committerTracy Narine <tracy.narine@arm.com>2023-09-28 14:25:16 +0100
commit6440ce89abb06e090d2b3cf91bafc14277072475 (patch)
treec55682891a0f01f3edbf5dad58720ded7af3fc64 /src/backends/neon/NeonBackend.cpp
parent9a418d850333119e219fb05addc57b56cdc60a7e (diff)
downloadarmnn-6440ce89abb06e090d2b3cf91bafc14277072475.tar.gz
IVGCVSW-7504 Create a backend specific optimization to fuse ADD+MUL+Add+(Activation) in CpuAcc
* Adding CpuAcc backend optimization to fuse add+mul+add into one layer * Tests added/enhanced * Also added optional extended parameter to Graph::Print() and throw macros that could be used in place of assert Signed-off-by: Tracy Narine <tracy.narine@arm.com> Signed-off-by: Teresa Charlin <teresa.charlinreyes@arm.com> Change-Id: I5f8d094b969a130d8c2c7b4da07426313a9fea76
Diffstat (limited to 'src/backends/neon/NeonBackend.cpp')
-rw-r--r--src/backends/neon/NeonBackend.cpp82
1 file changed, 82 insertions, 0 deletions
diff --git a/src/backends/neon/NeonBackend.cpp b/src/backends/neon/NeonBackend.cpp
index b5719db007..7311098631 100644
--- a/src/backends/neon/NeonBackend.cpp
+++ b/src/backends/neon/NeonBackend.cpp
@@ -9,6 +9,7 @@
#include "NeonWorkloadFactory.hpp"
#include "NeonLayerSupport.hpp"
#include "NeonTensorHandleFactory.hpp"
+#include "NeonBackendOptimizationUtils.hpp"
#include <armnn/BackendRegistry.hpp>
#include <armnn/Descriptors.hpp>
@@ -28,6 +29,7 @@
#include <neon/workloads/NeonDepthwiseConvolutionWorkload.hpp>
#include <neon/workloads/NeonDivisionWorkload.hpp>
#include <neon/workloads/NeonFullyConnectedWorkload.hpp>
+#include <neon/workloads/NeonFusedWorkload.hpp>
#include <neon/workloads/NeonMultiplicationWorkload.hpp>
#include <neon/workloads/NeonReduceWorkload.hpp>
#include <neon/workloads/NeonSubtractionWorkload.hpp>
@@ -524,6 +526,86 @@ OptimizationViews NeonBackend::OptimizeSubgraphView(const SubgraphView& subgraph
}
RemoveReshapeLayer(baseLayer, untouched, optimizationViews);
}
+
+ // Replace Add/Mul/Add where possible
+ Layer* layerList[4] = {nullptr, nullptr, nullptr, nullptr};
+ const std::vector<ActivationFunction> validActivates = { ActivationFunction::ReLu,
+ ActivationFunction::BoundedReLu };
+ if (IsLayerSequence<BinaryOperation>(base,
+ BinaryOperation::Add, BinaryOperation::Mul, BinaryOperation::Add,
+ layerList,
+ true, // handleValidActivates
+ validActivates))
+ {
+ bool fuseReLu = false;
+ unsigned int numInputs = 0;
+ unsigned int numOutputs = 0;
+ std::vector<TensorInfo> inputInfos;
+ std::vector<TensorInfo> outputInfos;
+ const ActivationDescriptor* activationDescriptor = nullptr;
+
+ if (BuildAddMulAddTensorInfoLists<Layer>(layerList,
+ numInputs,
+ numOutputs,
+ inputInfos,
+ outputInfos,
+ activationDescriptor,
+ fuseReLu))
+ {
+ // Create the new Add/Mul/Add layer and set the Relu activation function
+ FusedDescriptor fusedDescriptor(numInputs, numOutputs, FusedKernelType::AddMulAdd);
+ arm_compute::Status status = NeonFusedWorkloadValidate({inputInfos.begin(), inputInfos.end()},
+ {outputInfos.begin(), outputInfos.end()},
+ fusedDescriptor,
+ activationDescriptor);
+ if (status)
+ {
+ std::string fusedName;
+ GetFusedName(layerList, fusedName);
+
+ IConnectableLayer* addMulAddLayer =
+ optimizationViews.GetINetwork()->AddFusedLayer(fusedDescriptor, fusedName.c_str());
+
+ if (fuseReLu)
+ {
+ FusedLayer* addMulAddFusedLayer = PolymorphicDowncast<FusedLayer*>(addMulAddLayer);
+ addMulAddFusedLayer->SetAdditionalInfoForObject(
+ std::make_shared<ActivationDescriptor>(*activationDescriptor));
+ }
+
+ // Update the graph
+ std::vector<IConnectableLayer*> originalLayers;
+ for (unsigned int layerIdx = 0; layerIdx < 4; ++layerIdx)
+ {
+ if (layerList[layerIdx])
+ {
+ originalLayers.push_back(layerList[layerIdx]);
+ }
+ }
+
+ std::vector<SlotList> inputLayersSlotLists, outputLayersSlotLists;
+ BuildAddMulAddSlotLists<SlotList>(fuseReLu,
+ outputInfos.size() > 1,
+ inputLayersSlotLists,
+ outputLayersSlotLists);
+
+ ReplaceMultipleLayers<FusedLayer>(optimizationViews,
+ originalLayers,
+ PolymorphicDowncast<FusedLayer*>(addMulAddLayer),
+ inputLayersSlotLists,
+ outputLayersSlotLists);
+
+ // Remove unused layers
+ for (unsigned int layerIdx = 0; layerIdx < 4; ++layerIdx)
+ {
+ if (layerList[layerIdx])
+ {
+ untouched.erase(layerList[layerIdx]->GetGuid());
+ }
+ }
+ }
+ }
+ }
}
if (optimizationViews.GetSubstitutions().empty() && optimizationViews.GetDeletedSubgraphs().empty())