author     Teresa Charlin <teresa.charlinreyes@arm.com>  2023-08-17 18:44:58 +0100
committer  Teresa Charlin <teresa.charlinreyes@arm.com>  2023-08-28 12:37:25 +0100
commit     9145e38edf49fa4862008c163c34590141eecb14 (patch)
tree       64706ef579f548b804d5b674b33f6b239c638d0f
parent     e40cc8359b02a7786908294300c45b672cf6b0e4 (diff)
download   armnn-9145e38edf49fa4862008c163c34590141eecb14.tar.gz
IVGCVSW-7505 Create FusedLayer and NeonFusedWorkload for AddMulAdd Neon kernel
Signed-off-by: Teresa Charlin <teresa.charlinreyes@arm.com>
Change-Id: Ic778d35b001474b44fb1e433a6fe276e4ec9f565
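
Not part of the patch: a minimal sketch of how a backend-oriented user might drive the new API, using only the declarations introduced in this change (FusedDescriptor, FusedKernelType::AddMulAdd, INetwork::AddFusedLayer). The helper function, layer names, and wiring are hypothetical; output tensor infos and the rest of the graph setup are omitted.

    // Sketch only - assumes the producer layers already exist in the network.
    #include <armnn/INetwork.hpp>
    #include <armnn/Descriptors.hpp>

    void AddAddMulAddToNetwork(armnn::INetwork& network,
                               armnn::IConnectableLayer* input0,
                               armnn::IConnectableLayer* input1,
                               armnn::IConnectableLayer* bnMul,
                               armnn::IConnectableLayer* bnAdd)
    {
        using namespace armnn;
        // 4 inputs (input0, input1, bn_mul, bn_add) and 2 outputs (add_output, final_output).
        FusedDescriptor descriptor(4u, 2u, FusedKernelType::AddMulAdd);
        IConnectableLayer* fused = network.AddFusedLayer(descriptor, "AddMulAdd");

        input0->GetOutputSlot(0).Connect(fused->GetInputSlot(0));
        input1->GetOutputSlot(0).Connect(fused->GetInputSlot(1));
        bnMul->GetOutputSlot(0).Connect(fused->GetInputSlot(2));
        bnAdd->GetOutputSlot(0).Connect(fused->GetInputSlot(3));
        // fused->GetOutputSlot(0) carries the intermediate add result,
        // fused->GetOutputSlot(1) the final output; connect them as needed.
    }
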
-rw-r--r--  Android.mk | 1
-rw-r--r--  CMakeLists.txt | 2
-rw-r--r--  include/armnn/BackendHelper.hpp | 5
-rw-r--r--  include/armnn/Descriptors.hpp | 21
-rw-r--r--  include/armnn/DescriptorsFwd.hpp | 1
-rw-r--r--  include/armnn/INetwork.hpp | 8
-rw-r--r--  include/armnn/Types.hpp | 8
-rw-r--r--  include/armnn/TypesUtils.hpp | 9
-rw-r--r--  include/armnn/backends/WorkloadData.hpp | 5
-rw-r--r--  src/armnn/BackendHelper.cpp | 24
-rw-r--r--  src/armnn/LayersFwd.hpp | 2
-rw-r--r--  src/armnn/Network.cpp | 12
-rw-r--r--  src/armnn/Network.hpp | 3
-rw-r--r--  src/armnn/SerializeLayerParameters.cpp | 9
-rw-r--r--  src/armnn/SerializeLayerParameters.hpp | 5
-rw-r--r--  src/armnn/layers/FusedLayer.cpp | 48
-rw-r--r--  src/armnn/layers/FusedLayer.hpp | 38
-rw-r--r--  src/backends/backendsCommon/WorkloadData.cpp | 7
-rw-r--r--  src/backends/backendsCommon/WorkloadFactory.cpp | 30
-rw-r--r--  src/backends/backendsCommon/test/CMakeLists.txt | 1
-rw-r--r--  src/backends/backendsCommon/test/IsLayerSupportedTestImpl.hpp | 2
-rw-r--r--  src/backends/backendsCommon/test/LayerTests.hpp | 1
-rw-r--r--  src/backends/backendsCommon/test/layerTests/AddMulAddTestImpl.hpp | 182
-rw-r--r--  src/backends/cl/ClLayerSupport.cpp | 2
-rw-r--r--  src/backends/neon/NeonLayerSupport.cpp | 54
-rw-r--r--  src/backends/neon/NeonLayerSupport.hpp | 5
-rw-r--r--  src/backends/neon/NeonWorkloadFactory.cpp | 5
-rw-r--r--  src/backends/neon/backend.mk | 1
-rw-r--r--  src/backends/neon/test/NeonLayerTests.cpp | 7
-rw-r--r--  src/backends/neon/workloads/CMakeLists.txt | 2
-rw-r--r--  src/backends/neon/workloads/NeonFusedWorkload.cpp | 115
-rw-r--r--  src/backends/neon/workloads/NeonFusedWorkload.hpp | 35
-rw-r--r--  src/backends/neon/workloads/NeonWorkloads.hpp | 1
-rw-r--r--  src/backends/reference/RefLayerSupport.cpp | 4
34 files changed, 637 insertions, 18 deletions
diff --git a/Android.mk b/Android.mk
index c32afbeb34..e4cb59c2f4 100644
--- a/Android.mk
+++ b/Android.mk
@@ -233,6 +233,7 @@ LOCAL_SRC_FILES := \
src/armnn/layers/FillLayer.cpp \
src/armnn/layers/FloorLayer.cpp \
src/armnn/layers/FullyConnectedLayer.cpp \
+ src/armnn/layers/FusedLayer.cpp \
src/armnn/layers/GatherLayer.cpp \
src/armnn/layers/GatherNdLayer.cpp \
src/armnn/layers/InputLayer.cpp \
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 42da39b8a8..91561b77d9 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -240,6 +240,8 @@ list(APPEND armnn_sources
src/armnn/layers/FloorLayer.cpp
src/armnn/layers/FullyConnectedLayer.hpp
src/armnn/layers/FullyConnectedLayer.cpp
+ src/armnn/layers/FusedLayer.hpp
+ src/armnn/layers/FusedLayer.cpp
src/armnn/layers/GatherLayer.cpp
src/armnn/layers/GatherLayer.hpp
src/armnn/layers/GatherNdLayer.cpp
diff --git a/include/armnn/BackendHelper.hpp b/include/armnn/BackendHelper.hpp
index 59cbbfced3..986f854636 100644
--- a/include/armnn/BackendHelper.hpp
+++ b/include/armnn/BackendHelper.hpp
@@ -194,6 +194,11 @@ public:
const FullyConnectedDescriptor& descriptor,
Optional<std::string&> reasonIfUnsupported = EmptyOptional());
+ bool IsFusedSupported(const std::vector<std::reference_wrapper<TensorInfo>>& inputs,
+ const std::vector<std::reference_wrapper<TensorInfo>>& outputs,
+ const FusedDescriptor& descriptor,
+ Optional<std::string&> reasonIfUnsupported = EmptyOptional());
+
bool IsGatherSupported(const TensorInfo& input0,
const TensorInfo& input1,
const TensorInfo& output,
diff --git a/include/armnn/Descriptors.hpp b/include/armnn/Descriptors.hpp
index f60e8f3bea..30eaefd83b 100644
--- a/include/armnn/Descriptors.hpp
+++ b/include/armnn/Descriptors.hpp
@@ -940,6 +940,27 @@ struct FillDescriptor : BaseDescriptor
float m_Value;
};
+/// A FusedDescriptor for the FusedLayer.
+struct FusedDescriptor : BaseDescriptor
+{
+ FusedDescriptor(unsigned int numInputSlots = 4u,
+ unsigned int numOutputSlots = 2u,
+ FusedKernelType fusedType = FusedKernelType::AddMulAdd)
+ : m_NumInputSlots(numInputSlots), m_NumOutputSlots(numOutputSlots), m_FusedKernelType(fusedType)
+ {}
+
+ bool operator ==(const FusedDescriptor& rhs) const
+ {
+ return m_NumInputSlots == rhs.m_NumInputSlots &&
+ m_NumOutputSlots == rhs.m_NumOutputSlots &&
+ m_FusedKernelType == rhs.m_FusedKernelType;
+ }
+
+ unsigned int m_NumInputSlots;
+ unsigned int m_NumOutputSlots;
+ FusedKernelType m_FusedKernelType;
+};
+
/// A GatherDescriptor for the GatherLayer.
struct GatherDescriptor : BaseDescriptor
{
diff --git a/include/armnn/DescriptorsFwd.hpp b/include/armnn/DescriptorsFwd.hpp
index be1a3f6782..4b9a3e5060 100644
--- a/include/armnn/DescriptorsFwd.hpp
+++ b/include/armnn/DescriptorsFwd.hpp
@@ -25,6 +25,7 @@ struct ElementwiseUnaryDescriptor;
struct FakeQuantizationDescriptor;
struct FillDescriptor;
struct FullyConnectedDescriptor;
+struct FusedDescriptor;
struct GatherDescriptor;
struct InstanceNormalizationDescriptor;
struct L2NormalizationDescriptor;
diff --git a/include/armnn/INetwork.hpp b/include/armnn/INetwork.hpp
index 1640d7c37d..c2c76e3d97 100644
--- a/include/armnn/INetwork.hpp
+++ b/include/armnn/INetwork.hpp
@@ -477,6 +477,14 @@ public:
IConnectableLayer* AddFullyConnectedLayer(const FullyConnectedDescriptor& fullyConnectedDescriptor,
const char* name = nullptr);
+ /// Adds a Fused layer to the network.
+ /// This method is intended for use by backend users.
+ /// @param fusedDescriptor - FusedDescriptor contains parameters for the Fused layer.
+ /// @param name - Optional name for the layer.
+ /// @return - Interface for configuring the layer.
+ IConnectableLayer* AddFusedLayer(const FusedDescriptor& fusedDescriptor,
+ const char* name = nullptr);
+
/// Adds a permute layer to the network.
/// @param permuteDescriptor - PermuteDescriptor to configure the permute.
/// @param name - Optional name for the layer.
diff --git a/include/armnn/Types.hpp b/include/armnn/Types.hpp
index bf4458ee7f..7cb3a859c7 100644
--- a/include/armnn/Types.hpp
+++ b/include/armnn/Types.hpp
@@ -262,6 +262,11 @@ enum class MemBlockStrategyType
MultiAxisPacking = 1
};
+enum class FusedKernelType
+{
+ AddMulAdd = 0
+};
+
/// Each backend should implement an IBackend.
class IBackend
{
@@ -475,6 +480,7 @@ using InferenceTimingPair = std::pair<HighResolutionClock, HighResolutionClock>;
X(ElementwiseBinary) \
X(ReverseV2) \
X(Tile) \
+ X(Fused) \
// New layers should be added at last position to minimize instability.
@@ -486,7 +492,7 @@ enum class LayerType
LIST_OF_LAYER_TYPE
#undef X
FirstLayer = Activation,
- LastLayer = Tile
+ LastLayer = Fused
};
const char* GetLayerTypeAsCString(LayerType type);
diff --git a/include/armnn/TypesUtils.hpp b/include/armnn/TypesUtils.hpp
index eeb5c9e614..ca098f60fb 100644
--- a/include/armnn/TypesUtils.hpp
+++ b/include/armnn/TypesUtils.hpp
@@ -115,6 +115,15 @@ constexpr char const* GetLogicalBinaryOperationAsCString(LogicalBinaryOperation
}
}
+constexpr char const* GetFusedTypeAsCString(FusedKernelType type)
+{
+ switch (type)
+ {
+ case FusedKernelType::AddMulAdd: return "AddMulAdd";
+ default: return "Unknown";
+ }
+}
+
constexpr char const* GetPoolingAlgorithmAsCString(PoolingAlgorithm pooling)
{
switch (pooling)
diff --git a/include/armnn/backends/WorkloadData.hpp b/include/armnn/backends/WorkloadData.hpp
index 21a597df8a..86796cbcc0 100644
--- a/include/armnn/backends/WorkloadData.hpp
+++ b/include/armnn/backends/WorkloadData.hpp
@@ -182,6 +182,11 @@ struct FullyConnectedQueueDescriptor : QueueDescriptorWithParameters<FullyConnec
void Validate(const WorkloadInfo& workloadInfo) const;
};
+struct FusedQueueDescriptor : QueueDescriptorWithParameters<FusedDescriptor>
+{
+ void Validate(const WorkloadInfo& workloadInfo) const;
+};
+
// Permute layer workload data.
struct PermuteQueueDescriptor : QueueDescriptorWithParameters<PermuteDescriptor>
{
diff --git a/src/armnn/BackendHelper.cpp b/src/armnn/BackendHelper.cpp
index f025193006..fc7a2fab83 100644
--- a/src/armnn/BackendHelper.cpp
+++ b/src/armnn/BackendHelper.cpp
@@ -748,6 +748,30 @@ bool LayerSupportHandle::IsFullyConnectedSupported(const TensorInfo& input,
reasonIfUnsupported);
}
+bool LayerSupportHandle::IsFusedSupported(const std::vector<std::reference_wrapper<TensorInfo>>& inputs,
+ const std::vector<std::reference_wrapper<TensorInfo>>& outputs,
+ const FusedDescriptor& descriptor,
+ Optional<std::string&> reasonIfUnsupported)
+{
+ TensorInfos infos;
+ infos.reserve(inputs.size() + outputs.size());
+ for (TensorInfo inInfo : inputs)
+ {
+ infos.emplace_back(inInfo);
+ }
+ for (TensorInfo outInfo : outputs)
+ {
+ infos.emplace_back(outInfo);
+ }
+
+ return m_LayerSupport->IsLayerSupported(LayerType::Fused,
+ infos,
+ descriptor,
+ EmptyOptional(),
+ EmptyOptional(),
+ reasonIfUnsupported);
+}
+
bool LayerSupportHandle::IsGatherSupported(const TensorInfo& input0,
const TensorInfo& input1,
const TensorInfo& output,
diff --git a/src/armnn/LayersFwd.hpp b/src/armnn/LayersFwd.hpp
index 743b8d7205..f83b710134 100644
--- a/src/armnn/LayersFwd.hpp
+++ b/src/armnn/LayersFwd.hpp
@@ -33,6 +33,7 @@
#include "layers/FillLayer.hpp"
#include "layers/FloorLayer.hpp"
#include "layers/FullyConnectedLayer.hpp"
+#include "layers/FusedLayer.hpp"
#include "layers/GatherLayer.hpp"
#include "layers/GatherNdLayer.hpp"
#include "layers/InputLayer.hpp"
@@ -136,6 +137,7 @@ DECLARE_LAYER(FakeQuantization)
DECLARE_LAYER(Fill)
DECLARE_LAYER(Floor)
DECLARE_LAYER(FullyConnected)
+DECLARE_LAYER(Fused)
DECLARE_LAYER(Gather)
DECLARE_LAYER(GatherNd)
DECLARE_LAYER(Input)
diff --git a/src/armnn/Network.cpp b/src/armnn/Network.cpp
index 41111476da..7f4ef6b1b6 100644
--- a/src/armnn/Network.cpp
+++ b/src/armnn/Network.cpp
@@ -335,6 +335,12 @@ IConnectableLayer* INetwork::AddFullyConnectedLayer(const FullyConnectedDescript
return pNetworkImpl->AddFullyConnectedLayer(fullyConnectedDescriptor, name);
}
+IConnectableLayer* INetwork::AddFusedLayer(const FusedDescriptor& fusedDescriptor,
+ const char* name)
+{
+ return pNetworkImpl->AddFusedLayer(fusedDescriptor, name);
+}
+
IConnectableLayer* INetwork::AddPermuteLayer(const PermuteDescriptor& permuteDescriptor,
const char* name)
{
@@ -2195,6 +2201,12 @@ IConnectableLayer* NetworkImpl::AddFullyConnectedLayer(const FullyConnectedDescr
return m_Graph->AddLayer<FullyConnectedLayer>(fullyConnectedDescriptor, name);
}
+IConnectableLayer* NetworkImpl::AddFusedLayer(const FusedDescriptor& fusedDescriptor,
+ const char* name)
+{
+ return m_Graph->AddLayer<FusedLayer>(fusedDescriptor, name);
+}
+
IConnectableLayer* NetworkImpl::AddConcatLayer(const ConcatDescriptor& concatDescriptor,
const char* name)
{
diff --git a/src/armnn/Network.hpp b/src/armnn/Network.hpp
index 34549248bc..5a3570d825 100644
--- a/src/armnn/Network.hpp
+++ b/src/armnn/Network.hpp
@@ -113,6 +113,9 @@ public:
IConnectableLayer* AddFullyConnectedLayer(const FullyConnectedDescriptor& fullyConnectedDescriptor,
const char* name = nullptr);
+ IConnectableLayer* AddFusedLayer(const FusedDescriptor& fusedDescriptor,
+ const char* name = nullptr);
+
IConnectableLayer* AddGatherLayer(const GatherDescriptor& gatherDescriptor,
const char* name = nullptr);
diff --git a/src/armnn/SerializeLayerParameters.cpp b/src/armnn/SerializeLayerParameters.cpp
index d65a7d55fa..cc59e1fad3 100644
--- a/src/armnn/SerializeLayerParameters.cpp
+++ b/src/armnn/SerializeLayerParameters.cpp
@@ -325,6 +325,15 @@ void StringifyLayerParameters<PreCompiledDescriptor>::Serialize(ParameterStringi
fn("NumOutputSlots", std::to_string(desc.m_NumOutputSlots));
}
+void StringifyLayerParameters<FusedDescriptor>::Serialize(ParameterStringifyFunction& fn,
+ const FusedDescriptor& desc)
+{
+ fn("NumInputSlots", std::to_string(desc.m_NumInputSlots));
+ fn("NumOutputSlots", std::to_string(desc.m_NumOutputSlots));
+ fn("PaddingMode", GetFusedTypeAsCString(desc.m_FusedKernelType));
+
+}
+
void StringifyLayerParameters<Pooling2dDescriptor>::Serialize(ParameterStringifyFunction& fn,
const Pooling2dDescriptor& desc)
{
diff --git a/src/armnn/SerializeLayerParameters.hpp b/src/armnn/SerializeLayerParameters.hpp
index 5b0378eab7..34a2986534 100644
--- a/src/armnn/SerializeLayerParameters.hpp
+++ b/src/armnn/SerializeLayerParameters.hpp
@@ -149,6 +149,11 @@ template <> struct StringifyLayerParameters<PreCompiledDescriptor>
static void Serialize(ParameterStringifyFunction& fn, const PreCompiledDescriptor& desc);
};
+template <> struct StringifyLayerParameters<FusedDescriptor>
+{
+ static void Serialize(ParameterStringifyFunction& fn, const FusedDescriptor& desc);
+};
+
template <> struct StringifyLayerParameters<ReduceDescriptor>
{
static void Serialize(ParameterStringifyFunction& fn, const ReduceDescriptor& desc);
diff --git a/src/armnn/layers/FusedLayer.cpp b/src/armnn/layers/FusedLayer.cpp
new file mode 100644
index 0000000000..37b1835450
--- /dev/null
+++ b/src/armnn/layers/FusedLayer.cpp
@@ -0,0 +1,48 @@
+//
+// Copyright © 2023 Arm Ltd and Contributors. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#include "FusedLayer.hpp"
+#include "LayerCloneBase.hpp"
+
+#include <armnn/backends/Workload.hpp>
+#include <armnn/TypesUtils.hpp>
+
+namespace armnn
+{
+
+FusedLayer::FusedLayer(const FusedDescriptor& param, const char* name)
+ : LayerWithParameters(param.m_NumInputSlots, param.m_NumOutputSlots, LayerType::Fused, param, name)
+{}
+
+FusedLayer::~FusedLayer()
+{}
+
+FusedLayer* FusedLayer::Clone(Graph& graph) const
+{
+ FusedLayer* clonedLayer = CloneBase<FusedLayer>(graph, m_Param, GetName());
+ clonedLayer->m_AdditionalInfoObject = const_cast<FusedLayer*>(this)->m_AdditionalInfoObject;
+ return clonedLayer;
+}
+
+std::unique_ptr<IWorkload> FusedLayer::CreateWorkload(const armnn::IWorkloadFactory& factory) const
+{
+ FusedQueueDescriptor descriptor;
+ SetAdditionalInfo(descriptor);
+
+ return factory.CreateWorkload(LayerType::Fused, descriptor, PrepInfoAndDesc(descriptor));
+}
+
+void FusedLayer::ValidateTensorShapesFromInputs()
+{
+ // NOTE: since the FusedLayer is an internal layer created from a valid SubgraphView,
+ // we do not need to validate its input shapes
+}
+
+void FusedLayer::ExecuteStrategy(IStrategy& strategy) const
+{
+ strategy.ExecuteStrategy(this, GetParameters(), {}, GetName());
+}
+
+} // namespace armnn
diff --git a/src/armnn/layers/FusedLayer.hpp b/src/armnn/layers/FusedLayer.hpp
new file mode 100644
index 0000000000..e26a379707
--- /dev/null
+++ b/src/armnn/layers/FusedLayer.hpp
@@ -0,0 +1,38 @@
+//
+// Copyright © 2023 Arm Ltd and Contributors. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#pragma once
+
+#include "LayerWithParameters.hpp"
+#include <armnn/backends/WorkloadFactory.hpp>
+
+#include <armnn/Descriptors.hpp>
+
+#include <memory>
+#include <functional>
+
+namespace armnn
+{
+
+class FusedLayer : public LayerWithParameters<FusedDescriptor>
+{
+public:
+ FusedLayer(const FusedDescriptor& param, const char* name);
+ ~FusedLayer();
+
+ virtual std::unique_ptr<IWorkload> CreateWorkload(const IWorkloadFactory& factory) const override;
+
+ FusedLayer* Clone(Graph &graph) const override;
+
+ void ValidateTensorShapesFromInputs() override;
+
+ void ExecuteStrategy(IStrategy& strategy) const override;
+
+private:
+ FusedLayer(const FusedLayer& other) = delete;
+ FusedLayer& operator=(const FusedLayer& other) = delete;
+};
+
+} // namespace armnn
diff --git a/src/backends/backendsCommon/WorkloadData.cpp b/src/backends/backendsCommon/WorkloadData.cpp
index aa6bb848e5..d0f6eea3d4 100644
--- a/src/backends/backendsCommon/WorkloadData.cpp
+++ b/src/backends/backendsCommon/WorkloadData.cpp
@@ -1093,6 +1093,11 @@ void FullyConnectedQueueDescriptor::Validate(const WorkloadInfo& workloadInfo) c
}
}
+void FusedQueueDescriptor::Validate(const WorkloadInfo& /*workloadInfo*/) const
+{
+ // This is internally generated, so it should not need validation.
+}
+
void NormalizationQueueDescriptor::Validate(const WorkloadInfo& workloadInfo) const
{
const std::string descriptorName{"NormalizationQueueDescriptor"};
@@ -3003,7 +3008,7 @@ void SwitchQueueDescriptor::Validate(const WorkloadInfo& workloadInfo) const
void PreCompiledQueueDescriptor::Validate(const WorkloadInfo& /*workloadInfo*/) const
{
- // This is internally generated so it should not need validation.
+ // This is internally generated, so it should not need validation.
}
void PreluQueueDescriptor::Validate(const WorkloadInfo& workloadInfo) const
diff --git a/src/backends/backendsCommon/WorkloadFactory.cpp b/src/backends/backendsCommon/WorkloadFactory.cpp
index f067056ce1..6ff237bc12 100644
--- a/src/backends/backendsCommon/WorkloadFactory.cpp
+++ b/src/backends/backendsCommon/WorkloadFactory.cpp
@@ -525,6 +525,36 @@ bool IWorkloadFactory::IsLayerConfigurationSupported(const BackendId& backendId,
reason);
break;
}
+ case LayerType::Fused:
+ {
+ auto cLayer = PolymorphicDowncast<const FusedLayer*>(&layer);
+
+ // Get vector of all outputs.
+ auto getOutTensorInfo = [&dataType](const OutputSlot& slot)
+ {
+ return OverrideDataType(slot.GetTensorInfo(), dataType);
+ };
+ auto beginOutputs = MakeTransformIterator(layer.GetOutputSlots().begin(), getOutTensorInfo);
+ auto endOutputs = MakeTransformIterator(layer.GetOutputSlots().end(), getOutTensorInfo);
+ std::vector<TensorInfo> outputs(beginOutputs, endOutputs);
+ const std::vector<std::reference_wrapper<TensorInfo>> outputPtrs(outputs.begin(), outputs.end());
+
+ // Get vector of all inputs.
+ auto getInputTensorInfo = [&dataType](const InputSlot& slot)
+ {
+ return OverrideDataType(slot.GetTensorInfo(), dataType);
+ };
+ auto beginInputs = MakeTransformIterator(layer.GetInputSlots().begin(), getInputTensorInfo);
+ auto endInputs = MakeTransformIterator(layer.GetInputSlots().end(), getInputTensorInfo);
+ std::vector<TensorInfo> inputs(beginInputs, endInputs);
+ const std::vector<std::reference_wrapper<TensorInfo>> inputPtrs(inputs.begin(), inputs.end());
+
+ result = layerSupportObject.IsFusedSupported(inputPtrs,
+ outputPtrs,
+ cLayer->GetParameters(),
+ reason);
+ break;
+ }
case LayerType::Gather:
{
const TensorInfo& input0 = layer.GetInputSlot(0).GetTensorInfo();
diff --git a/src/backends/backendsCommon/test/CMakeLists.txt b/src/backends/backendsCommon/test/CMakeLists.txt
index 5d8fb1a953..8f3a22d53b 100644
--- a/src/backends/backendsCommon/test/CMakeLists.txt
+++ b/src/backends/backendsCommon/test/CMakeLists.txt
@@ -71,6 +71,7 @@ list(APPEND armnnBackendsCommonUnitTests_sources
layerTests/ActivationTestImpl.hpp
layerTests/AdditionTestImpl.cpp
layerTests/AdditionTestImpl.hpp
+ layerTests/AddMulAddTestImpl.hpp
layerTests/ArgMinMaxTestImpl.cpp
layerTests/ArgMinMaxTestImpl.hpp
layerTests/BatchMatMulTestImpl.cpp
diff --git a/src/backends/backendsCommon/test/IsLayerSupportedTestImpl.hpp b/src/backends/backendsCommon/test/IsLayerSupportedTestImpl.hpp
index ff02e06859..e8a2ec6931 100644
--- a/src/backends/backendsCommon/test/IsLayerSupportedTestImpl.hpp
+++ b/src/backends/backendsCommon/test/IsLayerSupportedTestImpl.hpp
@@ -678,6 +678,8 @@ DECLARE_LAYER_POLICY_1_PARAM(Floor)
DECLARE_LAYER_POLICY_2_PARAM(FullyConnected)
+DECLARE_LAYER_POLICY_2_PARAM(Fused)
+
DECLARE_LAYER_POLICY_2_PARAM(Gather)
DECLARE_LAYER_POLICY_1_PARAM(GatherNd)
diff --git a/src/backends/backendsCommon/test/LayerTests.hpp b/src/backends/backendsCommon/test/LayerTests.hpp
index 7182cb2d47..3f8d045c06 100644
--- a/src/backends/backendsCommon/test/LayerTests.hpp
+++ b/src/backends/backendsCommon/test/LayerTests.hpp
@@ -8,6 +8,7 @@
#include <backendsCommon/test/layerTests/AbsTestImpl.hpp>
#include <backendsCommon/test/layerTests/ActivationTestImpl.hpp>
#include <backendsCommon/test/layerTests/AdditionTestImpl.hpp>
+#include <backendsCommon/test/layerTests/AddMulAddTestImpl.hpp>
#include <backendsCommon/test/layerTests/ArgMinMaxTestImpl.hpp>
#include <backendsCommon/test/layerTests/BatchMatMulTestImpl.hpp>
#include <backendsCommon/test/layerTests/BatchNormalizationTestImpl.hpp>
diff --git a/src/backends/backendsCommon/test/layerTests/AddMulAddTestImpl.hpp b/src/backends/backendsCommon/test/layerTests/AddMulAddTestImpl.hpp
new file mode 100644
index 0000000000..9dece9be3b
--- /dev/null
+++ b/src/backends/backendsCommon/test/layerTests/AddMulAddTestImpl.hpp
@@ -0,0 +1,182 @@
+//
+// Copyright © 2023 Arm Ltd and Contributors. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#pragma once
+
+#include <armnnTestUtils/LayerTestResult.hpp>
+
+#include <armnnUtils/QuantizeHelper.hpp>
+#include <ResolveType.hpp>
+
+#include <armnn/backends/IBackendInternal.hpp>
+#include <armnn/backends/WorkloadFactory.hpp>
+
+#include <armnnTestUtils/TensorCopyUtils.hpp>
+#include <backendsCommon/test/WorkloadFactoryHelper.hpp>
+#include <armnnTestUtils/WorkloadTestUtils.hpp>
+
+#include <armnnTestUtils/TensorHelpers.hpp>
+
+template<armnn::DataType ArmnnType, typename T = armnn::ResolveType<ArmnnType>>
+std::vector<LayerTestResult<T,4>> AddMulAddTest(armnn::IWorkloadFactory& workloadFactory,
+ const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+ const armnn::ITensorHandleFactory& tensorHandleFactory,
+ bool addOutput)
+{
+ using namespace armnn;
+ IgnoreUnused(memoryManager);
+
+ TensorInfo input0TensorInfo({ 1, 2, 2, 3 }, ArmnnType);
+ TensorInfo input1TensorInfo({ 1, 2, 2, 3 }, ArmnnType);
+ TensorInfo mulInput1TensorInfo({ 3 }, ArmnnType);
+ TensorInfo addInput1TensorInfo({ 3 }, ArmnnType);
+
+ TensorInfo output0TensorInfo({ 1, 2, 2, 3 }, ArmnnType);
+ TensorInfo output1TensorInfo({ 1, 2, 2, 3 }, ArmnnType);
+
+ if (IsQuantizedType<T>())
+ {
+ input0TensorInfo.SetQuantizationScale(0.25f);
+ input0TensorInfo.SetQuantizationOffset(128);
+ input1TensorInfo.SetQuantizationScale(0.25f);
+ input1TensorInfo.SetQuantizationOffset(128);
+ mulInput1TensorInfo.SetQuantizationScale(0.25f);
+ mulInput1TensorInfo.SetQuantizationOffset(128);
+ addInput1TensorInfo.SetQuantizationScale(0.25f);
+ addInput1TensorInfo.SetQuantizationOffset(128);
+
+ output0TensorInfo.SetQuantizationScale(0.5f);
+ output0TensorInfo.SetQuantizationOffset(120);
+ output1TensorInfo.SetQuantizationScale(0.5f);
+ output1TensorInfo.SetQuantizationOffset(120);
+ }
+
+ std::vector<float> input0Data
+ {
+ 0.0f, 0.0f, 0.0f,
+ 1.0f, 1.0f, 1.0f,
+ -1.0f, -1.0f, -1.0f,
+ -2.0f, -2.0f, -2.0f
+ };
+ std::vector<float> input1Data
+ {
+ 0.0f, 0.0f, 0.0f,
+ 1.0f, 1.0f, 1.0f,
+ -1.0f, -1.0f, -1.0f,
+ -2.0f, -2.0f, -2.0f
+ };
+ std::vector<float> mulInput1Data
+ {
+ 2.0f, 1.0f, 1.0f
+ };
+ std::vector<float> addInput1Data
+ {
+ 3.0f, 0.0f, 0.0f
+ };
+ std::vector<float> output0ExpectedData =
+ {
+ 0.0f, 0.0f, 0.0f,
+ 2.0f, 2.0f, 2.0f,
+ -2.0f, -2.0f, -2.0f,
+ -4.0f, -4.0f, -4.0f
+ };
+
+ std::vector<float> output1ExpectedData =
+ {
+ 3.0f, 0.0f, 0.0f,
+ 7.0f, 2.0f, 2.0f,
+ -1.0f, -2.0f, -2.0f,
+ -5.0f, -4.0f, -4.0f
+ };
+
+ std::vector<T> input0 = armnnUtils::QuantizedVector<T>(input0Data,
+ input0TensorInfo.GetQuantizationScale(),
+ input0TensorInfo.GetQuantizationOffset());
+
+ std::vector<T> input1 = armnnUtils::QuantizedVector<T>(input1Data,
+ input1TensorInfo.GetQuantizationScale(),
+ input1TensorInfo.GetQuantizationOffset());
+
+ std::vector<T> mulInput1 = armnnUtils::QuantizedVector<T>(mulInput1Data,
+ mulInput1TensorInfo.GetQuantizationScale(),
+ mulInput1TensorInfo.GetQuantizationOffset());
+
+ std::vector<T> addInput1 = armnnUtils::QuantizedVector<T>(addInput1Data,
+ addInput1TensorInfo.GetQuantizationScale(),
+ addInput1TensorInfo.GetQuantizationOffset());
+
+ std::vector<T> output0Expected = armnnUtils::QuantizedVector<T>(output0ExpectedData,
+ output0TensorInfo.GetQuantizationScale(),
+ output0TensorInfo.GetQuantizationOffset());
+
+ std::vector<T> output1Expected = armnnUtils::QuantizedVector<T>(output1ExpectedData,
+ output1TensorInfo.GetQuantizationScale(),
+ output1TensorInfo.GetQuantizationOffset());
+
+ std::vector<T> output0Actual(output0TensorInfo.GetNumElements());
+ std::vector<T> output1Actual(output1TensorInfo.GetNumElements());
+
+ std::unique_ptr<ITensorHandle> input0Handle = tensorHandleFactory.CreateTensorHandle(input0TensorInfo);
+ std::unique_ptr<ITensorHandle> input1Handle = tensorHandleFactory.CreateTensorHandle(input1TensorInfo);
+ std::unique_ptr<ITensorHandle> mulInput1Handle = tensorHandleFactory.CreateTensorHandle(mulInput1TensorInfo);
+ std::unique_ptr<ITensorHandle> addInput1Handle = tensorHandleFactory.CreateTensorHandle(addInput1TensorInfo);
+ std::unique_ptr<ITensorHandle> output0Handle = tensorHandleFactory.CreateTensorHandle(output0TensorInfo);
+ std::unique_ptr<ITensorHandle> output1Handle = tensorHandleFactory.CreateTensorHandle(output1TensorInfo);
+
+ uint32_t numOutputs = addOutput ? 2 : 1;
+ FusedDescriptor descriptor(4, numOutputs, FusedKernelType::AddMulAdd);
+ FusedQueueDescriptor fusedQueueDescriptor;
+ fusedQueueDescriptor.m_Parameters = descriptor;
+ WorkloadInfo info;
+ AddInputToWorkload (fusedQueueDescriptor, info, input0TensorInfo, input0Handle.get());
+ AddInputToWorkload (fusedQueueDescriptor, info, input1TensorInfo, input1Handle.get());
+ AddInputToWorkload (fusedQueueDescriptor, info, mulInput1TensorInfo, mulInput1Handle.get());
+ AddInputToWorkload (fusedQueueDescriptor, info, addInput1TensorInfo, addInput1Handle.get());
+ if (addOutput)
+ {
+ AddOutputToWorkload(fusedQueueDescriptor, info, output0TensorInfo, output0Handle.get());
+ }
+ AddOutputToWorkload(fusedQueueDescriptor, info, output1TensorInfo, output1Handle.get());
+
+ std::unique_ptr<IWorkload> workload = workloadFactory.CreateWorkload(LayerType::Fused,
+ fusedQueueDescriptor,
+ info);
+
+ input0Handle->Allocate();
+ input1Handle->Allocate();
+ mulInput1Handle->Allocate();
+ addInput1Handle->Allocate();
+ if (addOutput)
+ {
+ output0Handle->Allocate();
+ }
+ output1Handle->Allocate();
+
+ CopyDataToITensorHandle(input0Handle.get(), input0.data());
+ CopyDataToITensorHandle(input1Handle.get(), input1.data());
+ CopyDataToITensorHandle(mulInput1Handle.get(), mulInput1.data());
+ CopyDataToITensorHandle(addInput1Handle.get(), addInput1.data());
+
+ workload->Execute();
+
+ CopyDataFromITensorHandle(output1Actual.data(), output1Handle.get());
+ LayerTestResult<T,4> ret1(output1Actual,
+ output1Expected,
+ output1Handle->GetShape(),
+ output1TensorInfo.GetShape());
+
+ std::vector<LayerTestResult<T,4>> ret = {ret1};
+
+ if (addOutput)
+ {
+ CopyDataFromITensorHandle(output0Actual.data(), output0Handle.get());
+ LayerTestResult<T,4> ret0(output0Actual,
+ output0Expected,
+ output0Handle->GetShape(),
+ output0TensorInfo.GetShape());
+ ret = {ret0, ret1};
+ }
+ return ret;
+}
diff --git a/src/backends/cl/ClLayerSupport.cpp b/src/backends/cl/ClLayerSupport.cpp
index 60dab0538a..1acaba0384 100644
--- a/src/backends/cl/ClLayerSupport.cpp
+++ b/src/backends/cl/ClLayerSupport.cpp
@@ -682,7 +682,7 @@ bool ClLayerSupport::IsLayerSupported(const LayerType& type,
default:
// layers not supported in cl by default:
// debug, detectionpostprocess, fakequantization,
- // precompiled, standin, switch, pooling3d
+ // precompiled, standin, switch, pooling3d, fused
return false;
}
}
diff --git a/src/backends/neon/NeonLayerSupport.cpp b/src/backends/neon/NeonLayerSupport.cpp
index b491ba8493..ef1d21835a 100644
--- a/src/backends/neon/NeonLayerSupport.cpp
+++ b/src/backends/neon/NeonLayerSupport.cpp
@@ -4,7 +4,6 @@
//
#include "NeonLayerSupport.hpp"
-#include "NeonBackendId.hpp"
#include "NeonBackendModelContext.hpp"
#include <armnn/Exceptions.hpp>
@@ -12,7 +11,6 @@
#include <armnn/Types.hpp>
#include <armnn/BackendRegistry.hpp>
-#include <InternalTypes.hpp>
#include <LayerSupportCommon.hpp>
#include <armnn/utility/IgnoreUnused.hpp>
#include <armnn/utility/PolymorphicDowncast.hpp>
@@ -39,8 +37,13 @@
#include "workloads/NeonDepthToSpaceWorkload.hpp"
#include "workloads/NeonDepthwiseConvolutionWorkload.hpp"
#include "workloads/NeonDequantizeWorkload.hpp"
+#include "workloads/NeonDivisionWorkload.hpp"
#include "workloads/NeonElementwiseBinaryWorkload.hpp"
#include "workloads/NeonExpWorkload.hpp"
+#include "workloads/NeonFullyConnectedWorkload.hpp"
+#include "workloads/NeonFusedWorkload.hpp"
+#include "workloads/NeonGatherWorkload.hpp"
+#include "workloads/NeonGatherNdWorkload.hpp"
#include "workloads/NeonInstanceNormalizationWorkload.hpp"
#include "workloads/NeonL2NormalizationFloatWorkload.hpp"
#include "workloads/NeonLogWorkload.hpp"
@@ -53,12 +56,8 @@
#include "workloads/NeonMeanWorkload.hpp"
#include "workloads/NeonMinimumWorkload.hpp"
#include "workloads/NeonMultiplicationWorkload.hpp"
-#include "workloads/NeonDivisionWorkload.hpp"
#include "workloads/NeonNegWorkload.hpp"
#include "workloads/NeonNormalizationFloatWorkload.hpp"
-#include "workloads/NeonFullyConnectedWorkload.hpp"
-#include "workloads/NeonGatherWorkload.hpp"
-#include "workloads/NeonGatherNdWorkload.hpp"
#include "workloads/NeonPadWorkload.hpp"
#include "workloads/NeonPermuteWorkload.hpp"
#include "workloads/NeonPooling2dWorkload.hpp"
@@ -128,13 +127,13 @@ bool IsSupportedForDataTypeNeon(Optional<std::string&> reasonIfUnsupported,
{
return IsNeonBackendSupported(reasonIfUnsupported) &&
IsSupportedForDataTypeGeneric(reasonIfUnsupported,
- dataType,
- floatFuncPtr,
- floatFuncPtr,
- uint8FuncPtr,
- &FalseFunc<>,
- &FalseFunc<>,
- std::forward<Params>(params)...);
+ dataType,
+ floatFuncPtr,
+ floatFuncPtr,
+ uint8FuncPtr,
+ &FalseFunc<>,
+ &FalseFunc<>,
+ std::forward<Params>(params)...);
}
#if defined(ARMCOMPUTENEON_ENABLED)
@@ -430,6 +429,22 @@ bool IsLayerTypeSupported(const LayerType& type,
*(PolymorphicDowncast<const
FullyConnectedDescriptor*>(&descriptor)),
reasonIfUnsupported);
+ case LayerType::Fused:
+ {
+ auto fusedDescriptor = *(PolymorphicDowncast<const FusedDescriptor*>(&descriptor));
+ if (fusedDescriptor.m_NumInputSlots + fusedDescriptor.m_NumOutputSlots != infos.size())
+ {
+ throw InvalidArgumentException("Invalid number of FusedLayer TensorInfos.");
+ }
+
+ std::vector<TensorInfo> inputInfos(infos.begin(), infos.begin() + fusedDescriptor.m_NumInputSlots);
+ std::vector<TensorInfo> outputInfos(infos.begin() + fusedDescriptor.m_NumInputSlots, infos.end());
+
+ return support.IsFusedSupported({inputInfos.begin(), inputInfos.end()},
+ {outputInfos.begin(), outputInfos.end()},
+ fusedDescriptor,
+ reasonIfUnsupported);
+ }
case LayerType::Gather:
return support.IsGatherSupported(infos[0],
infos[1],
@@ -1155,6 +1170,19 @@ bool NeonLayerSupport::IsFullyConnectedSupported(const TensorInfo& input,
nullptr);
}
+bool NeonLayerSupport::IsFusedSupported(const std::vector<std::reference_wrapper<TensorInfo>>& inputs,
+ const std::vector<std::reference_wrapper<TensorInfo>>& outputs,
+ const FusedDescriptor& descriptor,
+ Optional<std::string&> reasonIfUnsupported) const
+{
+ FORWARD_WORKLOAD_VALIDATE_FUNC(NeonFusedWorkloadValidate,
+ reasonIfUnsupported,
+ inputs,
+ outputs,
+ descriptor,
+ nullptr);
+}
+
bool NeonLayerSupport::IsGatherSupported(const TensorInfo& input0,
const TensorInfo& input1,
const TensorInfo& output,
diff --git a/src/backends/neon/NeonLayerSupport.hpp b/src/backends/neon/NeonLayerSupport.hpp
index 4bc96acd30..0295c2b3e2 100644
--- a/src/backends/neon/NeonLayerSupport.hpp
+++ b/src/backends/neon/NeonLayerSupport.hpp
@@ -151,6 +151,11 @@ public:
const FullyConnectedDescriptor& descriptor,
Optional<std::string&> reasonIfUnsupported = EmptyOptional()) const;
+ bool IsFusedSupported(const std::vector<std::reference_wrapper<TensorInfo>>& inputs,
+ const std::vector<std::reference_wrapper<TensorInfo>>& outputs,
+ const FusedDescriptor& descriptor,
+ Optional<std::string&> reasonIfUnsupported = EmptyOptional()) const;
+
bool IsGatherNdSupported(const TensorInfo& input0,
const TensorInfo& input1,
const TensorInfo& output,
diff --git a/src/backends/neon/NeonWorkloadFactory.cpp b/src/backends/neon/NeonWorkloadFactory.cpp
index e3411de254..4f131ac575 100644
--- a/src/backends/neon/NeonWorkloadFactory.cpp
+++ b/src/backends/neon/NeonWorkloadFactory.cpp
@@ -400,6 +400,11 @@ std::unique_ptr<IWorkload> NeonWorkloadFactory::CreateWorkload(LayerType type,
info,
m_MemoryManager->GetIntraLayerManager());
}
+ case LayerType::Fused :
+ {
+ auto fusedQueueDescriptor = PolymorphicDowncast<const FusedQueueDescriptor*>(&descriptor);
+ return std::make_unique<NeonFusedWorkload>(*fusedQueueDescriptor, info);
+ }
case LayerType::Gather :
{
auto gatherQueueDescriptor = PolymorphicDowncast<const GatherQueueDescriptor*>(&descriptor);
diff --git a/src/backends/neon/backend.mk b/src/backends/neon/backend.mk
index 2c91d1491d..3961ed1e34 100644
--- a/src/backends/neon/backend.mk
+++ b/src/backends/neon/backend.mk
@@ -48,6 +48,7 @@ BACKEND_SOURCES := \
workloads/NeonFillWorkload.cpp \
workloads/NeonFloorFloatWorkload.cpp \
workloads/NeonFullyConnectedWorkload.cpp \
+ workloads/NeonFusedWorkload.cpp \
workloads/NeonGatherWorkload.cpp \
workloads/NeonGatherNdWorkload.cpp \
workloads/NeonInstanceNormalizationWorkload.cpp \
diff --git a/src/backends/neon/test/NeonLayerTests.cpp b/src/backends/neon/test/NeonLayerTests.cpp
index 588c90be6d..c9dd1ff507 100644
--- a/src/backends/neon/test/NeonLayerTests.cpp
+++ b/src/backends/neon/test/NeonLayerTests.cpp
@@ -1724,6 +1724,13 @@ ARMNN_AUTO_TEST_CASE_WITH_THF(SimpleConvertFp16ToFp32, SimpleConvertFp16ToFp32Te
// Convert from Float32 to Float16
ARMNN_AUTO_TEST_CASE_WITH_THF(SimpleConvertFp32ToFp16, SimpleConvertFp32ToFp16Test)
+// AddMulAdd
+ARMNN_AUTO_TEST_CASE_WITH_THF(AddMulAdd2OutputsFloat32, AddMulAddTest<DataType::Float32>, true)
+ARMNN_AUTO_TEST_CASE_WITH_THF(AddMulAdd2OutputsUint8, AddMulAddTest<DataType::QAsymmU8>, true)
+
+ARMNN_AUTO_TEST_CASE_WITH_THF(AddMulAdd1OutputFloat32, AddMulAddTest<DataType::Float32>, false)
+ARMNN_AUTO_TEST_CASE_WITH_THF(AddMulAdd1OutputUint8, AddMulAddTest<DataType::QAsymmU8>, false)
+
#if defined(ARMNNREF_ENABLED)
// The ARMNN_COMPARE_REF_AUTO_TEST_CASE and the ARMNN_COMPARE_REF_FIXTURE_TEST_CASE test units are not available
diff --git a/src/backends/neon/workloads/CMakeLists.txt b/src/backends/neon/workloads/CMakeLists.txt
index 2cb2ccf385..f4438e4baa 100644
--- a/src/backends/neon/workloads/CMakeLists.txt
+++ b/src/backends/neon/workloads/CMakeLists.txt
@@ -54,6 +54,8 @@ list(APPEND armnnNeonBackendWorkloads_sources
NeonFloorFloatWorkload.hpp
NeonFullyConnectedWorkload.cpp
NeonFullyConnectedWorkload.hpp
+ NeonFusedWorkload.cpp
+ NeonFusedWorkload.hpp
NeonGatherWorkload.cpp
NeonGatherWorkload.hpp
NeonGatherNdWorkload.cpp
diff --git a/src/backends/neon/workloads/NeonFusedWorkload.cpp b/src/backends/neon/workloads/NeonFusedWorkload.cpp
new file mode 100644
index 0000000000..f770f46c81
--- /dev/null
+++ b/src/backends/neon/workloads/NeonFusedWorkload.cpp
@@ -0,0 +1,115 @@
+//
+// Copyright © 2023 Arm Ltd and Contributors. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#include "NeonFusedWorkload.hpp"
+#include "NeonWorkloadUtils.hpp"
+
+#include <aclCommon/ArmComputeTensorUtils.hpp>
+#include <aclCommon/ArmComputeUtils.hpp>
+
+#include <armnn/utility/PolymorphicDowncast.hpp>
+#include <armnn/backends/TensorHandle.hpp>
+
+#include <arm_compute/runtime/NEON/functions/NEAddMulAdd.h>
+
+namespace armnn
+{
+
+using namespace armcomputetensorutils;
+
+arm_compute::Status NeonFusedWorkloadValidate(const std::vector<std::reference_wrapper<TensorInfo>>& inputInfos,
+ const std::vector<std::reference_wrapper<TensorInfo>>& outputInfos,
+ const FusedDescriptor& fusedDescriptor,
+ const ActivationDescriptor* activationDescriptor)
+{
+ std::vector<arm_compute::TensorInfo> actInputInfos;
+ actInputInfos.reserve(inputInfos.size());
+ for (size_t i = 0u; i < inputInfos.size(); ++i)
+ {
+ actInputInfos.emplace_back(BuildArmComputeTensorInfo(inputInfos[i]));
+ }
+
+ std::vector<arm_compute::TensorInfo> actOutputInfos;
+ actOutputInfos.reserve(outputInfos.size());
+ for (size_t i = 0u; i < outputInfos.size(); ++i)
+ {
+ actOutputInfos.emplace_back(BuildArmComputeTensorInfo(outputInfos[i]));
+ }
+
+ const arm_compute::ActivationLayerInfo activationInfo =
+ ConvertActivationDescriptorToAclActivationLayerInfo(activationDescriptor);
+
+ switch (fusedDescriptor.m_FusedKernelType)
+ {
+ case FusedKernelType::AddMulAdd:
+ return arm_compute::NEAddMulAdd::validate(
+ &actInputInfos[0],
+ &actInputInfos[1],
+ &actInputInfos[2], // bn_mul
+ &actInputInfos[3], // bn_add
+ actOutputInfos.size() == 1 ? nullptr : &actOutputInfos[0], // add_output
+ actOutputInfos.size() == 1 ? &actOutputInfos[0] : &actOutputInfos[1], // final_output
+ arm_compute::ConvertPolicy::SATURATE,
+ activationInfo);
+ default:
+ return arm_compute::Status{arm_compute::ErrorCode::RUNTIME_ERROR,
+ "NeonFusedWorkloadValidate: no valid kernel type"};
+ }
+}
+
+
+NeonFusedWorkload::NeonFusedWorkload(const FusedQueueDescriptor& descriptor, const WorkloadInfo& info)
+ : NeonBaseWorkload<FusedQueueDescriptor>(descriptor, info)
+{
+ m_Data.ValidateInputsOutputs("NeonFusedWorkload",
+ static_cast<unsigned int>(info.m_InputTensorInfos.size()),
+ static_cast<unsigned int>(info.m_OutputTensorInfos.size()));
+
+ std::vector<arm_compute::ITensor*> inputs;
+ inputs.reserve(info.m_InputTensorInfos.size());
+ for (auto input : m_Data.m_Inputs)
+ {
+ inputs.emplace_back(&PolymorphicDowncast<IAclTensorHandle*>(input)->GetTensor());
+ }
+
+ std::vector<arm_compute::ITensor*> outputs;
+ outputs.reserve(info.m_OutputTensorInfos.size());
+ for (auto output : m_Data.m_Outputs)
+ {
+ outputs.emplace_back(&PolymorphicDowncast<IAclTensorHandle*>(output)->GetTensor());
+ }
+
+ const arm_compute::ActivationLayerInfo activationInfo =
+ ConvertAdditionalInfoToAclActivationLayerInfo(descriptor);
+
+ switch (descriptor.m_Parameters.m_FusedKernelType)
+ {
+ case FusedKernelType::AddMulAdd:
+ {
+ auto layer = std::make_unique<arm_compute::NEAddMulAdd>();
+ layer->configure(inputs[0],
+ inputs[1],
+ inputs[2], // bn_mul
+ inputs[3], // bn_add
+ outputs.size() == 1 ? nullptr : outputs[0], // add_output
+ outputs.size() == 1 ? outputs[0] : outputs[1], // final_output
+ arm_compute::ConvertPolicy::SATURATE,
+ activationInfo);
+ m_FusedLayer.reset(layer.release());
+ break;
+ }
+ default:
+ throw Exception("NeonFusedWorkload: no valid kernel type.");
+ }
+}
+
+void NeonFusedWorkload::Execute() const
+{
+ ARMNN_SCOPED_PROFILING_EVENT_NEON_GUID("NeonFusedWorkload_Execute", this->GetGuid());
+ m_FusedLayer->run();
+}
+
+} //namespace armnn
+
diff --git a/src/backends/neon/workloads/NeonFusedWorkload.hpp b/src/backends/neon/workloads/NeonFusedWorkload.hpp
new file mode 100644
index 0000000000..aaabf61560
--- /dev/null
+++ b/src/backends/neon/workloads/NeonFusedWorkload.hpp
@@ -0,0 +1,35 @@
+//
+// Copyright © 2023 Arm Ltd and Contributors. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#pragma once
+
+#include "NeonBaseWorkload.hpp"
+
+#include <arm_compute/core/Error.h>
+#include <arm_compute/core/Types.h>
+#include <arm_compute/runtime/IFunction.h>
+
+namespace armnn
+{
+
+arm_compute::Status NeonFusedWorkloadValidate(const std::vector<std::reference_wrapper<TensorInfo>>& inputInfos,
+ const std::vector<std::reference_wrapper<TensorInfo>>& outputInfos,
+ const FusedDescriptor& fusedDescriptor,
+ const ActivationDescriptor* activationDescriptor = nullptr);
+
+class NeonFusedWorkload : public NeonBaseWorkload<FusedQueueDescriptor>
+{
+public:
+ NeonFusedWorkload(const FusedQueueDescriptor& descriptor, const WorkloadInfo& info);
+ virtual void Execute() const override;
+
+private:
+ std::unique_ptr<arm_compute::IFunction> m_FusedLayer;
+};
+
+} //namespace armnn
+
+
+
diff --git a/src/backends/neon/workloads/NeonWorkloads.hpp b/src/backends/neon/workloads/NeonWorkloads.hpp
index b72f3bb703..615e5d87c8 100644
--- a/src/backends/neon/workloads/NeonWorkloads.hpp
+++ b/src/backends/neon/workloads/NeonWorkloads.hpp
@@ -30,6 +30,7 @@
#include "NeonFillWorkload.hpp"
#include "NeonFloorFloatWorkload.hpp"
#include "NeonFullyConnectedWorkload.hpp"
+#include "NeonFusedWorkload.hpp"
#include "NeonGatherWorkload.hpp"
#include "NeonGatherNdWorkload.hpp"
#include "NeonInstanceNormalizationWorkload.hpp"
diff --git a/src/backends/reference/RefLayerSupport.cpp b/src/backends/reference/RefLayerSupport.cpp
index 6323db1c03..0b1b9c7824 100644
--- a/src/backends/reference/RefLayerSupport.cpp
+++ b/src/backends/reference/RefLayerSupport.cpp
@@ -539,8 +539,8 @@ bool RefLayerSupport::IsLayerSupported(const LayerType& type,
quantizedLstmInputParamsInfo.value(),
reasonIfUnsupported);
default:
- // layers not supported in neon by default:
- // precompiled, standin, switch
+ // layers not supported in reference by default:
+ // precompiled, standin, switch, fused
return false;
}
}
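
For context, and not part of the change itself: the expected data in AddMulAddTestImpl.hpp implies the per-element semantics below for the AddMulAdd kernel (first output is the intermediate sum, second output applies the per-channel multiply and add). The reference function is an illustrative sketch with hypothetical names, assuming a channels-last layout where bn_mul and bn_add hold one value per channel.

    #include <cstddef>
    #include <vector>

    // addOut[i]   = input0[i] + input1[i]
    // finalOut[i] = addOut[i] * bnMul[c] + bnAdd[c]   (c = channel index of element i)
    void AddMulAddReference(const std::vector<float>& input0,
                            const std::vector<float>& input1,
                            const std::vector<float>& bnMul,   // one value per channel
                            const std::vector<float>& bnAdd,   // one value per channel
                            std::vector<float>& addOut,        // optional first output
                            std::vector<float>& finalOut)
    {
        const size_t channels = bnMul.size();
        for (size_t i = 0; i < input0.size(); ++i)
        {
            const size_t c = i % channels;     // channels-last, as in the test tensors
            const float sum = input0[i] + input1[i];
            addOut[i]   = sum;
            finalOut[i] = sum * bnMul[c] + bnAdd[c];
        }
    }
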